import io
import requests
import datetime as dt

# PANDAS
import pandas as pd
from pandas import DataFrame as df

# MATPLOTLIB
import matplotlib as mpl
from matplotlib import pyplot as plt
mpl.rc_file_defaults()
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# SEABORN
import seaborn as sns
sns.reset_orig()


# Pass in a URL for the csv file you want to retrieve.
def get_data(url):
    r = requests.get(url)
    s = requests.get(url).content
    try:
        # Make sure response="ok" (200-300) and length > 0
        if (r.ok) and (len(s) > 0):
            print(f"Got URL: {r.status_code}")
            return make_df(s)
        else:
            print(
                f"Could not get URL, or data is empty. Error is: {r.status_code}"
            )
    except:
        print(
                f"Exception in try block. Could not get URL. Error is: {r.status_code}"
            )


# Create our DataFrame from the returned csv file
def make_df(s):
    # AB Health returns 403 forbidden error if I call the CSV directly with read_csv("..."), 
    # so using requests library
    data=pd.read_csv(io.StringIO(s.decode('utf-8')))
    pd.set_option("display.max_rows", 15)
    pd.set_option("display.max_columns", 100)
    pd.set_option('display.width', 1000)
    df = pd.DataFrame(data).rename(columns=str.lower) # Lower case all column names, common in python
    df.columns = df.columns.str.replace(' ', '_') # Replace column name spaces with underscore
    return df


# Alberta Health has a csv containing daily confirmed covid cases. We will download that file.
# See https://www.alberta.ca/stats/covid-19-alberta-statistics.htm#data-export
# If using url:
url="https://www.alberta.ca/data/stats/covid-19-alberta-statistics-data.csv"
df = get_data(url)

# If using local file...good for testing, so you don't keep hitting their server:
# df = pd.read_csv("covid-19-alberta-statistics-data-20210910.csv")

# Make sure df is not empty:
try:
    if (len(df) > 0):
        print(df)
    elif df is None:
        print("Error: df is empty") 
    else:
        print("No df to display")
except:
    print("Exception in try block: df is empty")

Got URL: 200
        unnamed:_0 date_reported alberta_health_services_zone  gender    age_group case_status  case_type
0                1    2020-11-13                 Calgary Zone  Female    1-4 years   Recovered  Confirmed
1                2    2021-04-21                Edmonton Zone    Male  30-39 years   Recovered  Confirmed
2                3    2021-05-17                   North Zone    Male  10-19 years   Recovered  Confirmed
3                4    2020-12-13                Edmonton Zone    Male    5-9 years   Recovered  Confirmed
4                5    2021-01-05                 Central Zone    Male  50-59 years   Recovered  Confirmed
...            ...           ...                          ...     ...          ...         ...        ...
277553      277554    2021-08-04                   North Zone    Male  20-29 years   Recovered  Confirmed
277554      277555    2021-03-02                 Central Zone  Female  50-59 years   Recovered  Confirmed
277555      277556    2021-09-02                   North Zone  Female  50-59 years   Recovered  Confirmed
277556      277557    2021-06-02                   North Zone    Male  20-29 years   Recovered  Confirmed
277557      277558    2021-01-01                 Central Zone    Male  30-39 years   Recovered  Confirmed

[277558 rows x 7 columns]


# Let's get column names as list
list(df.columns)

['unnamed:_0',
 'date_reported',
 'alberta_health_services_zone',
 'gender',
 'age_group',
 'case_status',
 'case_type']


# Rename some columns to be a bit shorter whilst still descriptive
rename_cols = {"unnamed:_0":"line_no","date_reported":"reported", "alberta_health_services_zone":"zone"}
df.rename(columns=rename_cols,inplace = True)
print(df)

        line_no    reported           zone  gender    age_group case_status  case_type
0             1  2020-11-13   Calgary Zone  Female    1-4 years   Recovered  Confirmed
1             2  2021-04-21  Edmonton Zone    Male  30-39 years   Recovered  Confirmed
2             3  2021-05-17     North Zone    Male  10-19 years   Recovered  Confirmed
3             4  2020-12-13  Edmonton Zone    Male    5-9 years   Recovered  Confirmed
4             5  2021-01-05   Central Zone    Male  50-59 years   Recovered  Confirmed
...         ...         ...            ...     ...          ...         ...        ...
277553   277554  2021-08-04     North Zone    Male  20-29 years   Recovered  Confirmed
277554   277555  2021-03-02   Central Zone  Female  50-59 years   Recovered  Confirmed
277555   277556  2021-09-02     North Zone  Female  50-59 years   Recovered  Confirmed
277556   277557  2021-06-02     North Zone    Male  20-29 years   Recovered  Confirmed
277557   277558  2021-01-01   Central Zone    Male  30-39 years   Recovered  Confirmed

[277558 rows x 7 columns]


# "line_no" can be deleted (we'll use index number instead)
del df['line_no']


if len(df) > 0:
    df.info() #Get summary df info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277558 entries, 0 to 277557
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   reported     277558 non-null  object
 1   zone         277558 non-null  object
 2   gender       277558 non-null  object
 3   age_group    277558 non-null  object
 4   case_status  277558 non-null  object
 5   case_type    277558 non-null  object
dtypes: object(6)
memory usage: 12.7+ MB


# Convert "reported" column values from string datatype to datetime datatype
df['reported'] = df['reported'].astype('datetime64[ns]')
df['reported'] = pd.to_datetime(df['reported'], format='%y%m%d')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277558 entries, 0 to 277557
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   reported     277558 non-null  datetime64[ns]
 1   zone         277558 non-null  object        
 2   gender       277558 non-null  object        
 3   age_group    277558 non-null  object        
 4   case_status  277558 non-null  object        
 5   case_type    277558 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 12.7+ MB


# Strip the "years" string from "age_group", as we might want to use the numbers later on
df['age_group'] = df['age_group'].replace('\s?years', '', regex=True)
df


# Strip the "Zone" string from ahs_zone column values
df['zone'] = df['zone'].replace('\s?Zone', '', regex=True)
df


# Finally, let's sort index records by reported date, ascending (oldest-to-newest):
df.sort_values(by='reported',ascending=True)


# Okay, data is in good format, now let's make sure there are no empty (NaN or Null) values
df.isna().values.any() #Can also use .isnull. "False" means no null values (that's good!)

False


# Not getting any null values, but if you do, you can count them with:
# df.isna().sum()
# and then replace any null values with N/A or 0, depending on field datatype. 
# Use the ".fillna('N/A') or .fillna(0)" method


# Lets start some analysis
# Group each reported incident by date and return a count (reported cases/day):
covid_by_freq = df.reported.value_counts(normalize=False)  # Highest Covid days to lowest
print(covid_by_freq)

2021-04-30    2406
2021-05-04    2279
2021-05-05    2176
2021-04-28    2056
2021-05-07    2052
              ... 
2020-03-13       8
2020-03-11       7
2020-03-09       6
2020-03-12       3
2020-03-06       1
Name: reported, Length: 558, dtype: int64


# To sort by date:
covid_by_date = covid_by_freq.sort_index()
print(covid_by_date)
print(type(covid_by_date))

2020-03-06       1
2020-03-09       6
2020-03-10       9
2020-03-11       7
2020-03-12       3
              ... 
2021-09-12    1580
2021-09-13    1436
2021-09-14    1655
2021-09-15    1727
2021-09-16    2039
Name: reported, Length: 558, dtype: int64
<class 'pandas.core.series.Series'>


# Note: We can normalize this to show "the relative frequencies of the unique values."
# Src: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html
df.reported.value_counts(normalize=True).sort_index()

2020-03-06    0.000004
2020-03-09    0.000022
2020-03-10    0.000032
2020-03-11    0.000025
2020-03-12    0.000011
                ...   
2021-09-12    0.005693
2021-09-13    0.005174
2021-09-14    0.005963
2021-09-15    0.006222
2021-09-16    0.007346
Name: reported, Length: 558, dtype: float64


# Let's make the Series data result into a DataFrame
# Src: https://stackoverflow.com/questions/47136436/python-pandas-convert-value-counts-output-to-dataframe
df_covid_by_date = df["reported"].value_counts(normalize=False).reset_index()
df_covid_by_date.columns = ['reported', 'total_cases']
df_covid_by_date = df_covid_by_date.sort_values(by='reported', ascending=True)
print (df_covid_by_date)

      reported  total_cases
557 2020-03-06            1
555 2020-03-09            6
552 2020-03-10            9
554 2020-03-11            7
556 2020-03-12            3
..         ...          ...
39  2021-09-12         1580
58  2021-09-13         1436
29  2021-09-14         1655
21  2021-09-15         1727
5   2021-09-16         2039

[558 rows x 2 columns]


# Create a rolling 7-day simple moving-average and add it as a column. 
# The first few entries will not have a value, until we get to day 7.
# The NaN will need to be converted to integer 0.
# Since will have added six (0) integer values at the start, 
# any statistical operations would need to account for this.

df_covid_by_date['sma_7d'] = df_covid_by_date['total_cases'].rolling(7).mean()
print(df_covid_by_date.isna().sum())
# If there are NaN (Null) values, we will fill them with integer 0
if len(df_covid_by_date.isna().sum()) > 0:
    df_covid_by_date['sma_7d'] = df_covid_by_date['sma_7d'].fillna(0)

# Print sum again to make sure there are no NaN
print()
print(df_covid_by_date.isna().sum())

# Convert sma_7d values to integers
df_covid_by_date['sma_7d'] = df_covid_by_date['sma_7d'].astype(int)
print(df_covid_by_date.iloc[0:10])  # Example of getting 7day moving average
df_covid_by_date.describe()  # Get some statistics about dataset

reported       0
total_cases    0
sma_7d         6
dtype: int64

reported       0
total_cases    0
sma_7d         0
dtype: int64
      reported  total_cases  sma_7d
557 2020-03-06            1       0
555 2020-03-09            6       0
552 2020-03-10            9       0
554 2020-03-11            7       0
556 2020-03-12            3       0
553 2020-03-13            8       0
542 2020-03-14           22       8
551 2020-03-15           11       9
539 2020-03-16           24      12
550 2020-03-17           11      12


# Draw chart
import datetime as dt
from matplotlib.lines import Line2D
%config InlineBackend.figure_format = 'png' # 'retina' setting distorts image on mobile
dt_now = dt.datetime.now()
df_cbd = df_covid_by_date

# Figure styles
sns.set_style("whitegrid", {'grid.color': '.7', 'xtick.bottom': True})
sns.set_context("paper", font_scale=1.35, rc={"axes.titlesize":24,"axes.labelsize":16,'dpi': 100})

# Plot Size
plt.figure(figsize=(15, 10))

# The 7-day simple moving average as shaded line plot:
# Shading to bottom x-axis for better U/X
ax = sns.lineplot(data=df_cbd, x='reported', y='sma_7d', linewidth=2, color="red");
ax.fill_between(df_cbd.reported, df_cbd.sma_7d, color="#e0e0e0", alpha=0.3)
# You can restrict output to only a certain date range, pass in your start yyyy, m, d:
# ax.set_xlim([dt.date(2020, 8, 1), dt.datetime.now()])  
ax.set_xlim([df_cbd.reported.iloc[0], dt.datetime.now()]) 

# The daily Covid-19 case counts as a scatterplot
sns.scatterplot(data=df_cbd, x='reported', y='total_cases', marker='o', s=40);

# Legend and Titles
legend_elements = [Line2D([0], [0], marker='o', lw=0.5, label='Covid Cases/Day'),
                   Line2D([0], [0], lw=3, color='r', label='7-Day Moving Average')]
plt.legend(fontsize = 15, \
    bbox_to_anchor= (0.01, 0.97), \
    loc='upper left', \
    title="Legend", \
    title_fontsize = 22, \
    shadow = True, \
    facecolor = 'white', \
    handles=legend_elements
    )
plt.title('New Daily Covid-19 Cases in Alberta', fontsize=24)
plt.xlabel('Chart: Dan Hertz / Data: Alberta Health', labelpad=15, fontsize=14)
plt.ylabel('Total Cases per Day', labelpad=5, fontsize=20)
plt.tight_layout()

# If you want to remove borders, use:
# sns.despine()

# Store fig in variable or you get a blank image when saved 'gcf()' = get current figure
f = plt.gcf()
# Save .png chart to local folder
f.savefig(f'df_covid_by_date_{dt_now}.png', bbox_inches='tight', dpi=100)


# Finally, let's export the covid_by_date DataFrame to csv, excel and json (in case we want to use it in the future)
dt_now = dt.datetime.now()
df_covid_by_date.to_csv(f'ab_covid_by_date_{dt_now}.csv', index=True) #CSV
df_covid_by_date.to_excel(f'ab_covid_by_date_{dt_now}.xlsx', index=True) #Excel
df_covid_by_date.to_json(f'ab_covid_by_date_columns_{dt_now}.json', index=True, orient='columns', date_format="iso") #JSON Column format
df_covid_by_date.to_json(f'ab_covid_by_date_records_{dt_now}.json', index=True, orient='records', date_format="iso") #JSON Records format
df_covid_by_date.to_json(f'ab_covid_by_date_index_{dt_now}.json', index=True, orient='index', date_format="iso") #JSON Index format

	total_cases	sma_7d
count	558.000000	558.000000
mean	497.415771	487.657706
std	549.806879	531.045268
min	1.000000	0.000000
25%	81.500000	84.250000
50%	243.500000	248.000000
75%	751.750000	755.750000
max	2406.000000	2044.000000

Alberta Health Covid-19 Data Analysis¶

Goal is to visualize Alberta's new daily Covid-19 case counts -- sorted by date. I will also create and overlay a 7-day simple moving average trendline.¶

Step I: Import classes¶

Custom Functions¶

STEP II: Get CSV File and create Pandas DataFrame¶

Step III: Clean up data¶

Step III (b): Make sure data has no missing values¶

Step IV: Analysis¶

	reported	zone	gender	age_group	case_status	case_type
0	2020-11-13	Calgary Zone	Female	1-4	Recovered	Confirmed
1	2021-04-21	Edmonton Zone	Male	30-39	Recovered	Confirmed
2	2021-05-17	North Zone	Male	10-19	Recovered	Confirmed
3	2020-12-13	Edmonton Zone	Male	5-9	Recovered	Confirmed
4	2021-01-05	Central Zone	Male	50-59	Recovered	Confirmed
...	...	...	...	...	...	...
277553	2021-08-04	North Zone	Male	20-29	Recovered	Confirmed
277554	2021-03-02	Central Zone	Female	50-59	Recovered	Confirmed
277555	2021-09-02	North Zone	Female	50-59	Recovered	Confirmed
277556	2021-06-02	North Zone	Male	20-29	Recovered	Confirmed
277557	2021-01-01	Central Zone	Male	30-39	Recovered	Confirmed

	reported	zone	gender	age_group	case_status	case_type
0	2020-11-13	Calgary	Female	1-4	Recovered	Confirmed
1	2021-04-21	Edmonton	Male	30-39	Recovered	Confirmed
2	2021-05-17	North	Male	10-19	Recovered	Confirmed
3	2020-12-13	Edmonton	Male	5-9	Recovered	Confirmed
4	2021-01-05	Central	Male	50-59	Recovered	Confirmed
...	...	...	...	...	...	...
277553	2021-08-04	North	Male	20-29	Recovered	Confirmed
277554	2021-03-02	Central	Female	50-59	Recovered	Confirmed
277555	2021-09-02	North	Female	50-59	Recovered	Confirmed
277556	2021-06-02	North	Male	20-29	Recovered	Confirmed
277557	2021-01-01	Central	Male	30-39	Recovered	Confirmed

	reported	zone	gender	age_group	case_status	case_type
259303	2020-03-06	Calgary	Female	50-59	Recovered	Confirmed
227899	2020-03-09	Calgary	Male	30-39	Recovered	Confirmed
257745	2020-03-09	Edmonton	Male	60-69	Recovered	Confirmed
270881	2020-03-09	Calgary	Female	30-39	Recovered	Confirmed
245662	2020-03-09	Edmonton	Female	70-79	Recovered	Confirmed
...	...	...	...	...	...	...
215938	2021-09-16	Edmonton	Female	40-49	Active	Confirmed
119979	2021-09-16	Central	Female	5-9	Active	Confirmed
119976	2021-09-16	Edmonton	Male	1-4	Active	Confirmed
28404	2021-09-16	Edmonton	Female	20-29	Active	Confirmed
161231	2021-09-16	Calgary	Female	5-9	Active	Confirmed