AB Health CSV Data Licence: https://open.alberta.ca/licence.
This file was last modified by Dan Hertz on 2021-09-17. (c) 2021 Dan Hertz.
import io
import requests
import datetime as dt
# PANDAS
import pandas as pd
from pandas import DataFrame as df
# MATPLOTLIB
import matplotlib as mpl
from matplotlib import pyplot as plt
mpl.rc_file_defaults()
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# SEABORN
import seaborn as sns
sns.reset_orig()
# Pass in a URL for the csv file you want to retrieve.
def get_data(url):
r = requests.get(url)
s = requests.get(url).content
try:
# Make sure response="ok" (200-300) and length > 0
if (r.ok) and (len(s) > 0):
print(f"Got URL: {r.status_code}")
return make_df(s)
else:
print(
f"Could not get URL, or data is empty. Error is: {r.status_code}"
)
except:
print(
f"Exception in try block. Could not get URL. Error is: {r.status_code}"
)
# Create our DataFrame from the returned csv file
def make_df(s):
# AB Health returns 403 forbidden error if I call the CSV directly with read_csv("..."),
# so using requests library
data=pd.read_csv(io.StringIO(s.decode('utf-8')))
pd.set_option("display.max_rows", 15)
pd.set_option("display.max_columns", 100)
pd.set_option('display.width', 1000)
df = pd.DataFrame(data).rename(columns=str.lower) # Lower case all column names, common in python
df.columns = df.columns.str.replace(' ', '_') # Replace column name spaces with underscore
return df
# Alberta Health has a csv containing daily confirmed covid cases. We will download that file.
# See https://www.alberta.ca/stats/covid-19-alberta-statistics.htm#data-export
# If using url:
url="https://www.alberta.ca/data/stats/covid-19-alberta-statistics-data.csv"
df = get_data(url)
# If using local file...good for testing, so you don't keep hitting their server:
# df = pd.read_csv("covid-19-alberta-statistics-data-20210910.csv")
# Make sure df is not empty:
try:
if (len(df) > 0):
print(df)
elif df is None:
print("Error: df is empty")
else:
print("No df to display")
except:
print("Exception in try block: df is empty")
Got URL: 200 unnamed:_0 date_reported alberta_health_services_zone gender age_group case_status case_type 0 1 2020-11-13 Calgary Zone Female 1-4 years Recovered Confirmed 1 2 2021-04-21 Edmonton Zone Male 30-39 years Recovered Confirmed 2 3 2021-05-17 North Zone Male 10-19 years Recovered Confirmed 3 4 2020-12-13 Edmonton Zone Male 5-9 years Recovered Confirmed 4 5 2021-01-05 Central Zone Male 50-59 years Recovered Confirmed ... ... ... ... ... ... ... ... 277553 277554 2021-08-04 North Zone Male 20-29 years Recovered Confirmed 277554 277555 2021-03-02 Central Zone Female 50-59 years Recovered Confirmed 277555 277556 2021-09-02 North Zone Female 50-59 years Recovered Confirmed 277556 277557 2021-06-02 North Zone Male 20-29 years Recovered Confirmed 277557 277558 2021-01-01 Central Zone Male 30-39 years Recovered Confirmed [277558 rows x 7 columns]
# Let's get column names as list
list(df.columns)
['unnamed:_0', 'date_reported', 'alberta_health_services_zone', 'gender', 'age_group', 'case_status', 'case_type']
# Rename some columns to be a bit shorter whilst still descriptive
rename_cols = {"unnamed:_0":"line_no","date_reported":"reported", "alberta_health_services_zone":"zone"}
df.rename(columns=rename_cols,inplace = True)
print(df)
line_no reported zone gender age_group case_status case_type 0 1 2020-11-13 Calgary Zone Female 1-4 years Recovered Confirmed 1 2 2021-04-21 Edmonton Zone Male 30-39 years Recovered Confirmed 2 3 2021-05-17 North Zone Male 10-19 years Recovered Confirmed 3 4 2020-12-13 Edmonton Zone Male 5-9 years Recovered Confirmed 4 5 2021-01-05 Central Zone Male 50-59 years Recovered Confirmed ... ... ... ... ... ... ... ... 277553 277554 2021-08-04 North Zone Male 20-29 years Recovered Confirmed 277554 277555 2021-03-02 Central Zone Female 50-59 years Recovered Confirmed 277555 277556 2021-09-02 North Zone Female 50-59 years Recovered Confirmed 277556 277557 2021-06-02 North Zone Male 20-29 years Recovered Confirmed 277557 277558 2021-01-01 Central Zone Male 30-39 years Recovered Confirmed [277558 rows x 7 columns]
# "line_no" can be deleted (we'll use index number instead)
del df['line_no']
if len(df) > 0:
df.info() #Get summary df info
<class 'pandas.core.frame.DataFrame'> RangeIndex: 277558 entries, 0 to 277557 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 reported 277558 non-null object 1 zone 277558 non-null object 2 gender 277558 non-null object 3 age_group 277558 non-null object 4 case_status 277558 non-null object 5 case_type 277558 non-null object dtypes: object(6) memory usage: 12.7+ MB
# Convert "reported" column values from string datatype to datetime datatype
df['reported'] = df['reported'].astype('datetime64[ns]')
df['reported'] = pd.to_datetime(df['reported'], format='%y%m%d')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 277558 entries, 0 to 277557 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 reported 277558 non-null datetime64[ns] 1 zone 277558 non-null object 2 gender 277558 non-null object 3 age_group 277558 non-null object 4 case_status 277558 non-null object 5 case_type 277558 non-null object dtypes: datetime64[ns](1), object(5) memory usage: 12.7+ MB
# Strip the "years" string from "age_group", as we might want to use the numbers later on
df['age_group'] = df['age_group'].replace('\s?years', '', regex=True)
df
reported | zone | gender | age_group | case_status | case_type | |
---|---|---|---|---|---|---|
0 | 2020-11-13 | Calgary Zone | Female | 1-4 | Recovered | Confirmed |
1 | 2021-04-21 | Edmonton Zone | Male | 30-39 | Recovered | Confirmed |
2 | 2021-05-17 | North Zone | Male | 10-19 | Recovered | Confirmed |
3 | 2020-12-13 | Edmonton Zone | Male | 5-9 | Recovered | Confirmed |
4 | 2021-01-05 | Central Zone | Male | 50-59 | Recovered | Confirmed |
... | ... | ... | ... | ... | ... | ... |
277553 | 2021-08-04 | North Zone | Male | 20-29 | Recovered | Confirmed |
277554 | 2021-03-02 | Central Zone | Female | 50-59 | Recovered | Confirmed |
277555 | 2021-09-02 | North Zone | Female | 50-59 | Recovered | Confirmed |
277556 | 2021-06-02 | North Zone | Male | 20-29 | Recovered | Confirmed |
277557 | 2021-01-01 | Central Zone | Male | 30-39 | Recovered | Confirmed |
277558 rows × 6 columns
# Strip the "Zone" string from ahs_zone column values
df['zone'] = df['zone'].replace('\s?Zone', '', regex=True)
df
reported | zone | gender | age_group | case_status | case_type | |
---|---|---|---|---|---|---|
0 | 2020-11-13 | Calgary | Female | 1-4 | Recovered | Confirmed |
1 | 2021-04-21 | Edmonton | Male | 30-39 | Recovered | Confirmed |
2 | 2021-05-17 | North | Male | 10-19 | Recovered | Confirmed |
3 | 2020-12-13 | Edmonton | Male | 5-9 | Recovered | Confirmed |
4 | 2021-01-05 | Central | Male | 50-59 | Recovered | Confirmed |
... | ... | ... | ... | ... | ... | ... |
277553 | 2021-08-04 | North | Male | 20-29 | Recovered | Confirmed |
277554 | 2021-03-02 | Central | Female | 50-59 | Recovered | Confirmed |
277555 | 2021-09-02 | North | Female | 50-59 | Recovered | Confirmed |
277556 | 2021-06-02 | North | Male | 20-29 | Recovered | Confirmed |
277557 | 2021-01-01 | Central | Male | 30-39 | Recovered | Confirmed |
277558 rows × 6 columns
# Finally, let's sort index records by reported date, ascending (oldest-to-newest):
df.sort_values(by='reported',ascending=True)
reported | zone | gender | age_group | case_status | case_type | |
---|---|---|---|---|---|---|
259303 | 2020-03-06 | Calgary | Female | 50-59 | Recovered | Confirmed |
227899 | 2020-03-09 | Calgary | Male | 30-39 | Recovered | Confirmed |
257745 | 2020-03-09 | Edmonton | Male | 60-69 | Recovered | Confirmed |
270881 | 2020-03-09 | Calgary | Female | 30-39 | Recovered | Confirmed |
245662 | 2020-03-09 | Edmonton | Female | 70-79 | Recovered | Confirmed |
... | ... | ... | ... | ... | ... | ... |
215938 | 2021-09-16 | Edmonton | Female | 40-49 | Active | Confirmed |
119979 | 2021-09-16 | Central | Female | 5-9 | Active | Confirmed |
119976 | 2021-09-16 | Edmonton | Male | 1-4 | Active | Confirmed |
28404 | 2021-09-16 | Edmonton | Female | 20-29 | Active | Confirmed |
161231 | 2021-09-16 | Calgary | Female | 5-9 | Active | Confirmed |
277558 rows × 6 columns
# Okay, data is in good format, now let's make sure there are no empty (NaN or Null) values
df.isna().values.any() #Can also use .isnull. "False" means no null values (that's good!)
False
# Not getting any null values, but if you do, you can count them with:
# df.isna().sum()
# and then replace any null values with N/A or 0, depending on field datatype.
# Use the ".fillna('N/A') or .fillna(0)" method
Goal is to visualize Alberta's new daily Covid-19 case counts -- sorted by date. I will also create and overlay a 7-day simple moving average trendline.
# Lets start some analysis
# Group each reported incident by date and return a count (reported cases/day):
covid_by_freq = df.reported.value_counts(normalize=False) # Highest Covid days to lowest
print(covid_by_freq)
2021-04-30 2406 2021-05-04 2279 2021-05-05 2176 2021-04-28 2056 2021-05-07 2052 ... 2020-03-13 8 2020-03-11 7 2020-03-09 6 2020-03-12 3 2020-03-06 1 Name: reported, Length: 558, dtype: int64
# To sort by date:
covid_by_date = covid_by_freq.sort_index()
print(covid_by_date)
print(type(covid_by_date))
2020-03-06 1 2020-03-09 6 2020-03-10 9 2020-03-11 7 2020-03-12 3 ... 2021-09-12 1580 2021-09-13 1436 2021-09-14 1655 2021-09-15 1727 2021-09-16 2039 Name: reported, Length: 558, dtype: int64 <class 'pandas.core.series.Series'>
# Note: We can normalize this to show "the relative frequencies of the unique values."
# Src: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html
df.reported.value_counts(normalize=True).sort_index()
2020-03-06 0.000004 2020-03-09 0.000022 2020-03-10 0.000032 2020-03-11 0.000025 2020-03-12 0.000011 ... 2021-09-12 0.005693 2021-09-13 0.005174 2021-09-14 0.005963 2021-09-15 0.006222 2021-09-16 0.007346 Name: reported, Length: 558, dtype: float64
# Let's make the Series data result into a DataFrame
# Src: https://stackoverflow.com/questions/47136436/python-pandas-convert-value-counts-output-to-dataframe
df_covid_by_date = df["reported"].value_counts(normalize=False).reset_index()
df_covid_by_date.columns = ['reported', 'total_cases']
df_covid_by_date = df_covid_by_date.sort_values(by='reported', ascending=True)
print (df_covid_by_date)
reported total_cases 557 2020-03-06 1 555 2020-03-09 6 552 2020-03-10 9 554 2020-03-11 7 556 2020-03-12 3 .. ... ... 39 2021-09-12 1580 58 2021-09-13 1436 29 2021-09-14 1655 21 2021-09-15 1727 5 2021-09-16 2039 [558 rows x 2 columns]
# Create a rolling 7-day simple moving-average and add it as a column.
# The first few entries will not have a value, until we get to day 7.
# The NaN will need to be converted to integer 0.
# Since will have added six (0) integer values at the start,
# any statistical operations would need to account for this.
df_covid_by_date['sma_7d'] = df_covid_by_date['total_cases'].rolling(7).mean()
print(df_covid_by_date.isna().sum())
# If there are NaN (Null) values, we will fill them with integer 0
if len(df_covid_by_date.isna().sum()) > 0:
df_covid_by_date['sma_7d'] = df_covid_by_date['sma_7d'].fillna(0)
# Print sum again to make sure there are no NaN
print()
print(df_covid_by_date.isna().sum())
# Convert sma_7d values to integers
df_covid_by_date['sma_7d'] = df_covid_by_date['sma_7d'].astype(int)
print(df_covid_by_date.iloc[0:10]) # Example of getting 7day moving average
df_covid_by_date.describe() # Get some statistics about dataset
reported 0 total_cases 0 sma_7d 6 dtype: int64 reported 0 total_cases 0 sma_7d 0 dtype: int64 reported total_cases sma_7d 557 2020-03-06 1 0 555 2020-03-09 6 0 552 2020-03-10 9 0 554 2020-03-11 7 0 556 2020-03-12 3 0 553 2020-03-13 8 0 542 2020-03-14 22 8 551 2020-03-15 11 9 539 2020-03-16 24 12 550 2020-03-17 11 12
total_cases | sma_7d | |
---|---|---|
count | 558.000000 | 558.000000 |
mean | 497.415771 | 487.657706 |
std | 549.806879 | 531.045268 |
min | 1.000000 | 0.000000 |
25% | 81.500000 | 84.250000 |
50% | 243.500000 | 248.000000 |
75% | 751.750000 | 755.750000 |
max | 2406.000000 | 2044.000000 |
# Draw chart
import datetime as dt
from matplotlib.lines import Line2D
%config InlineBackend.figure_format = 'png' # 'retina' setting distorts image on mobile
dt_now = dt.datetime.now()
df_cbd = df_covid_by_date
# Figure styles
sns.set_style("whitegrid", {'grid.color': '.7', 'xtick.bottom': True})
sns.set_context("paper", font_scale=1.35, rc={"axes.titlesize":24,"axes.labelsize":16,'dpi': 100})
# Plot Size
plt.figure(figsize=(15, 10))
# The 7-day simple moving average as shaded line plot:
# Shading to bottom x-axis for better U/X
ax = sns.lineplot(data=df_cbd, x='reported', y='sma_7d', linewidth=2, color="red");
ax.fill_between(df_cbd.reported, df_cbd.sma_7d, color="#e0e0e0", alpha=0.3)
# You can restrict output to only a certain date range, pass in your start yyyy, m, d:
# ax.set_xlim([dt.date(2020, 8, 1), dt.datetime.now()])
ax.set_xlim([df_cbd.reported.iloc[0], dt.datetime.now()])
# The daily Covid-19 case counts as a scatterplot
sns.scatterplot(data=df_cbd, x='reported', y='total_cases', marker='o', s=40);
# Legend and Titles
legend_elements = [Line2D([0], [0], marker='o', lw=0.5, label='Covid Cases/Day'),
Line2D([0], [0], lw=3, color='r', label='7-Day Moving Average')]
plt.legend(fontsize = 15, \
bbox_to_anchor= (0.01, 0.97), \
loc='upper left', \
title="Legend", \
title_fontsize = 22, \
shadow = True, \
facecolor = 'white', \
handles=legend_elements
)
plt.title('New Daily Covid-19 Cases in Alberta', fontsize=24)
plt.xlabel('Chart: Dan Hertz / Data: Alberta Health', labelpad=15, fontsize=14)
plt.ylabel('Total Cases per Day', labelpad=5, fontsize=20)
plt.tight_layout()
# If you want to remove borders, use:
# sns.despine()
# Store fig in variable or you get a blank image when saved 'gcf()' = get current figure
f = plt.gcf()
# Save .png chart to local folder
f.savefig(f'df_covid_by_date_{dt_now}.png', bbox_inches='tight', dpi=100)
# Finally, let's export the covid_by_date DataFrame to csv, excel and json (in case we want to use it in the future)
dt_now = dt.datetime.now()
df_covid_by_date.to_csv(f'ab_covid_by_date_{dt_now}.csv', index=True) #CSV
df_covid_by_date.to_excel(f'ab_covid_by_date_{dt_now}.xlsx', index=True) #Excel
df_covid_by_date.to_json(f'ab_covid_by_date_columns_{dt_now}.json', index=True, orient='columns', date_format="iso") #JSON Column format
df_covid_by_date.to_json(f'ab_covid_by_date_records_{dt_now}.json', index=True, orient='records', date_format="iso") #JSON Records format
df_covid_by_date.to_json(f'ab_covid_by_date_index_{dt_now}.json', index=True, orient='index', date_format="iso") #JSON Index format