Severity is computed as $Severity = \text{No. Confirmed} / \text{Population}$
# Render the `temp` summary table (built further down in this file) with a 'Pastel1_r' background gradient.
temp.style.background_gradient(cmap='Pastel1_r')
# This is the graphical visualization of severity
## Hover your mouse over cities to display details
# Bare expression: a notebook displays the folium map `m`; in a plain script it does nothing.
m

# Animated bubble map of the per-province rows in `china_map`, one frame per 'Date' value.
fig = px.scatter_geo(
    china_map,
    lat="Lat",
    lon="Long",
    scope="asia",
    color="size",
    size="size",
    hover_name="Province/State",
    hover_data=["Confirmed", "Deaths", "Severity"],
    animation_frame="Date",
    title="Spread in China over time",
)
fig.update(layout_coloraxis_showscale=False)
fig.show()

# Treemap of confirmed cases per Chinese province, largest first.
fig = px.treemap(
    china_latest.sort_values(by="Confirmed", ascending=False).reset_index(drop=True),
    path=["Province/State"],
    values="Confirmed",
    title="Number of Confirmed Cases in Chinese Provinces",
)
fig.show()

# Treemap of confirmed cases per country for every row outside Mainland China.
fig = px.treemap(
    row_latest,
    path=["Country/Region"],
    values="Confirmed",
    title="Number of Confirmed Cases outside china",
)
fig.show()
We will consider Chinese cities excluding Hubei, which is a better reference for Singapore.
# Show whatever figure is currently bound to `fig` -- presumably the China vs. China-without-Hubei
# chart built later in this file; verify against the notebook's cell order.
fig.show()
This makes sense because Singapore's epidemic started later than China's. Despite the government taking measures to control the epidemic, it has yet to start plateauing (as compared to China). We should hope to see SG's curve reach a plateau in the next 8 days.
# Show whatever figure is currently bound to `fig` -- presumably the Singapore confirmed-cases
# chart built later in this file; verify against the notebook's cell order.
fig.show()
The blue band indicates our confidence region for future confirmed cases. In the best-case scenario, Chinese cities (without considering Hubei) should have a good chance of a declining number of confirmed cases starting next week. Whereas for Singapore, it's likely that the growing trend will last for 8-10 days before starting to decline.
# Draw the forecast charts and print the in-sample MAPE for China excluding Hubei and for
# Singapore. Both functions are defined later in this file and read module-level
# forecast frames, so those must already exist when these calls run.
plot_China_without_Hubei()
plot_Singapore()
import numpy as np
import pandas as pd
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation, performance_metrics
from fbprophet.plot import plot_cross_validation_metric, add_changepoints_to_plot, plot_plotly
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
%matplotlib inline
plt.style.use('ggplot')
# Per-report data set; used further down for the per-country daily totals and forecasts.
df = pd.read_csv('2019_nCoV_data.csv')

# Wide time-series tables (one column per date). The commented-out names are an
# alternative file naming that was left in place by the author.
# conf_df = pd.read_csv('time_series_2019_ncov_confirmed.csv')
# deaths_df = pd.read_csv('time_series_2019_ncov_deaths.csv')
# recv_df = pd.read_csv('time_series_2019_ncov_recovered.csv')
conf_df = pd.read_csv('time_series_2019-ncov-Confirmed.csv')
deaths_df = pd.read_csv('time_series_2019-ncov-Deaths.csv')
recv_df = pd.read_csv('time_series_2019-ncov-Recovered.csv')
df.head(10)

# Everything after the four identifier columns is treated as a date column.
dates = list(conf_df.columns[4:])
dates1 = list(recv_df.columns[4:])

# Reshape each wide table to long form: one row per (location, date).
# NOTE(review): all three melts use the *confirmed* table's date columns, and the
# concat below pairs rows purely by position -- this assumes the three CSVs share
# the same dates and row order; confirm against the data source.
conf_df_long = conf_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
                            value_vars=dates, var_name='Date', value_name='Confirmed')
deaths_df_long = deaths_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
                                value_vars=dates, var_name='Date', value_name='Deaths')
recv_df_long = recv_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
                            value_vars=dates, var_name='Date', value_name='Recovered')

full_table = pd.concat([conf_df_long, deaths_df_long['Deaths'], recv_df_long['Recovered']],
                       axis=1, sort=False)

full_table['Date'] = pd.to_datetime(full_table['Date'])
# Fill missing counts with 0 *before* the integer cast: astype('int') raises on NaN,
# and the later cleanup step fills these same columns with 0 anyway.
full_table['Recovered'] = full_table['Recovered'].fillna(0).astype('int')
full_table.dtypes
full_table.head()
full_table['Country/Region'].unique()
full_table['Province/State'].unique()
# Population figures keyed by the 'Province/State' value.
# NOTE(review): the magnitudes look like populations in millions -- confirm the source.
_PROVINCE_POPULATION = {
    'Anhui': 62.0,
    'Beijing': 21.54,
    'Chongqing': 30.48,
    'Fujian': 38.56,
    'Gansu': 25.58,
    'Guangdong': 113.46,
    'Guangxi': 48.38,
    'Guizhou': 34.75,
    'Hainan': 9.3,
    'Hebei': 74.7,
    'Heilongjiang': 38.31,
    'Henan': 94,
    'Hubei': 58.5,
    'Hunan': 67.37,
    'Inner Mongolia': 24.71,
    'Jiangsu': 80.4,
    'Jiangxi': 45.2,
    'Jilin': 27.46,
    'Liaoning': 43.9,
    'Ningxia': 6.3,
    'Qinghai': 5.6,
    'Shaanxi': 37.33,
    'Shandong': 90.0,
    'Shanghai': 24.24,
    'Shanxi': 36.5,
    'Sichuan': 81.1,
    'Tianjin': 15.0,
    'Tibet': 3.2,
    'Xinjiang': 22.0,
    'Yunnan': 46.0,
    'Zhejiang': 57.4,
    'Taiwan': 23.78,
    'Seattle, WA': 0.72,
    'Chicago, IL': 2.7,
    'Tempe, AZ': 0.185,
    'Macau': 0.62,
    'Hong Kong': 7.4,
    'Toronto, ON': 2.93,
    'British Columbia': 5.07,
    'Orange, CA': 0.14,
    'Los Angeles, CA': 4.0,
    'New South Wales': 7.54,
    'Victoria': 6.4,
    'Queensland': 5.07,
    'London, ON': 0.4,
    'Santa Clara, CA': 0.127,
    'South Australia': 1.677,
    'Boston, MA': 0.685,
    'San Benito, CA': 0.06,
    'Madison, WI': 0.255,
    'Diamond Princess cruise ship': 4/1000,
}

# Population figures keyed by 'Country/Region', used when the province is not listed above.
_COUNTRY_POPULATION = {
    'Thailand': 69.04,
    'Japan': 126.8,
    'South Korea': 51.47,
    'Singapore': 5.6,
    'Vietnam': 95.54,
    'France': 67.0,
    'Nepal': 29.3,
    'Malaysia': 31.62,
    'Cambodia': 16.01,
    'Sri Lanka': 21.44,
    'Germany': 82.79,
    'Finland': 5.5,
    'United Arab Emirates': 9.4,
    'Philippines': 104.9,
    'India': 1339,
    'Italy': 60.48,
    'UK': 66.44,
    'Russia': 144.5,
    'Sweden': 10.12,
    'Spain': 46.66,
    'Belgium': 11.4,
}

# Value returned when neither the province nor the country is listed.
_DEFAULT_POPULATION = 10


def label_race(row):
    """Return the population figure associated with a data row.

    Despite its name, this function maps a row to a population number: it is
    used to fill the ``population`` column from which ``Severity`` is derived.

    The province table is consulted first; only when the row's
    ``'Province/State'`` is not listed (for example because it is NaN) is
    ``'Country/Region'`` looked up. Rows matching neither table get the
    fallback value 10.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) supporting
            ``row['Province/State']`` and, when the province is unknown,
            ``row['Country/Region']``.

    Returns:
        The population figure from the matching table entry, or 10.
    """
    province = row['Province/State']
    # The isinstance guard keeps NaN / non-string cells from ever matching,
    # exactly as an equality test against a string literal would.
    if isinstance(province, str) and province in _PROVINCE_POPULATION:
        return _PROVINCE_POPULATION[province]

    # Below are countries without going to specific cities.
    country = row['Country/Region']
    if isinstance(country, str) and country in _COUNTRY_POPULATION:
        return _COUNTRY_POPULATION[country]

    return _DEFAULT_POPULATION
# Countries whose rows carry no province (these fall through to the country lookup in label_race).
full_table.loc[full_table['Province/State'].isna(), 'Country/Region'].unique()

# Attach a population figure to every row (province first, then country, else a default).
full_table['population'] = full_table.apply(label_race, axis=1)

# filling missing values with 0 in columns ('Confirmed', 'Deaths', 'Recovered')
full_table[['Confirmed', 'Deaths', 'Recovered']] = full_table[['Confirmed', 'Deaths', 'Recovered']].fillna(0)
full_table[['Province/State']] = full_table[['Province/State']].fillna('NA')

# cases in the Diamond Princess cruise ship
ship = full_table[full_table['Province/State'] == 'Diamond Princess cruise ship']

# full table, without the cruise ship; .copy() so the column added below is written
# to an independent frame rather than to a slice of the original (SettingWithCopy).
full_table = full_table[full_table['Province/State'] != 'Diamond Princess cruise ship'].copy()
full_table.head()

full_table['Severity'] = full_table['Confirmed'] / full_table['population']

# derived dataframes
china = full_table[full_table['Country/Region'] == 'Mainland China']
row = full_table[full_table['Country/Region'] != 'Mainland China']

# Rows for the most recent date only.
full_latest = full_table[full_table['Date'] == full_table['Date'].max()].reset_index()
china_latest = full_latest[full_latest['Country/Region'] == 'Mainland China']
row_latest = full_latest[full_latest['Country/Region'] != 'Mainland China']

# Column selection on a GroupBy must be a list: selecting with a bare tuple of
# names was deprecated and later removed from pandas.
_summary_columns = ['Confirmed', 'Deaths', 'Recovered', 'population']
full_latest_grouped = full_latest.groupby('Country/Region')[_summary_columns].sum().reset_index()
china_latest_grouped = china_latest.groupby('Province/State')[_summary_columns].sum().reset_index()
row_latest_grouped = row_latest.groupby('Country/Region')[_summary_columns].sum().reset_index()

# Per-(country, province) summary table with severity = confirmed / population.
temp = full_latest.groupby(['Country/Region', 'Province/State'])[_summary_columns].max()
temp['severity'] = temp.Confirmed / temp.population
temp.drop(columns='population', inplace=True)
temp.style.background_gradient(cmap='Pastel1_r')
import folium

# World map with one circle per row of the latest-date table.
m = folium.Map(location=[0, 0], tiles='cartodbpositron',
               min_zoom=1, max_zoom=4, zoom_start=1)

# iterrows() yields each row once, instead of re-materialising the same row with
# .iloc[i] for every field that is read.
for _, record in full_latest.iterrows():
    folium.Circle(
        location=[record['Lat'], record['Long']],
        color='crimson',
        tooltip='<li><bold>Country : ' + str(record['Country/Region']) +
                '<li><bold>Province : ' + str(record['Province/State']) +
                '<li><bold>Confirmed : ' + str(record['Confirmed']) +
                '<li><bold>Deaths : ' + str(record['Deaths']) +
                '<li><bold>Severity : ' + str(record['Severity']),
        # Circle radius is proportional to severity (truncated to an int).
        radius=int(record['Severity'] * 200)
    ).add_to(m)

# Bare expression: displayed by a notebook, a no-op in a plain script.
m
# Per-(date, province) maxima for the Mainland China rows. The column selection
# must be a list: selecting GroupBy columns with a bare tuple of names was
# deprecated and later removed from pandas.
china_map = china.groupby(['Date', 'Province/State'])[
    ['Confirmed', 'Deaths', 'Severity', 'Lat', 'Long']
].max()
china_map = china_map.reset_index()

# Bubble size is the square root of severity.
china_map['size'] = china_map['Severity'].pow(0.5)

# The animation frame is keyed by a formatted date string.
china_map['Date'] = pd.to_datetime(china_map['Date'])
china_map['Date'] = china_map['Date'].dt.strftime('%m/%d/%Y')

fig = px.scatter_geo(china_map, lat='Lat', lon='Long', scope='asia',
                     color="size", size='size', hover_name='Province/State',
                     hover_data=['Confirmed', 'Deaths', 'Severity'],
                     animation_frame="Date",
                     title='Spread in China over time')
fig.update(layout_coloraxis_showscale=False)
fig.show()

# Treemap of confirmed cases per Chinese province, largest first.
fig = px.treemap(china_latest.sort_values(by='Confirmed', ascending=False).reset_index(drop=True),
                 path=["Province/State"], values="Confirmed",
                 title='Number of Confirmed Cases in Chinese Provinces')
fig.show()

# Treemap of confirmed cases per country for rows outside Mainland China.
fig = px.treemap(row_latest, path=["Country/Region"],
                 values="Confirmed", title='Number of Confirmed Cases outside china')
fig.show()
df.dtypes

# Parse the dates, then round-trip them through a day-only '%m/%d/%Y' string so
# that every value ends up as a datetime carrying just month, day and year.
df['Date'] = pd.to_datetime(pd.to_datetime(df['Date']).dt.strftime('%m/%d/%Y'))

# The three count columns are stored as integers.
for _count_column in ('Confirmed', 'Deaths', 'Recovered'):
    df[_count_column] = df[_count_column].astype('int')

# some Chinese data are labelled as China and some are labelled as 'Mainland China'
df['Country'] = df['Country'].replace('Mainland China', 'China')
df['Country'].unique()
df.dtypes
len(df)

# Row subsets: all of China, China excluding Hubei province, and Singapore.
df_China = df[df['Country'] == 'China']
df_China_without_Hubei = df_China[df_China['Province/State'] != 'Hubei']
df_Singapore = df[df['Country'] == 'Singapore']


def _confirmed_per_date(frame):
    """Sum 'Confirmed' per 'Date' and return a two-column frame (Date, Confirmed)."""
    return frame.groupby('Date')['Confirmed'].sum().reset_index()


df_China_grouped = _confirmed_per_date(df_China)
df_China_without_Hubei_grouped = _confirmed_per_date(df_China_without_Hubei)
df_Singapore_grouped = _confirmed_per_date(df_Singapore)
# Line chart comparing daily confirmed totals for all of China against China
# excluding Hubei.
fig = go.Figure()

_china_series = (
    (df_China_grouped, 'China Confirmed cases', 'rgb(55, 83, 109)'),
    (df_China_without_Hubei_grouped, 'China Confirmed cases without Hubei', 'rgb(26, 118, 255)'),
)
for _frame, _label, _colour in _china_series:
    fig.add_trace(
        go.Scatter(
            x=_frame['Date'],
            y=_frame['Confirmed'],
            name=_label,
            mode='lines+markers',
            marker_color=_colour,
            hovertemplate=(
                '<br><b>Date</b>: %{x} <br>'
                '<b>Confirmed Cases:</b> %{y}<br>'
            ),
        )
    )

# Shared marker styling for both traces.
fig.update_traces(mode='lines+markers', marker_line_width=2, marker_size=5)

# Centred title near the top; zero lines hidden on both axes.
_title = dict(
    text='Confermed case in China and China without Hubei',
    y=0.95,
    x=0.5,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(
    title=_title,
    yaxis_zeroline=False,
    xaxis_zeroline=False,
    hoverlabel_align='left',
)

fig.show()
# Line chart of Singapore's daily confirmed totals.
_sg_trace = go.Scatter(
    x=df_Singapore_grouped['Date'],
    y=df_Singapore_grouped['Confirmed'],
    name='China Confirmed cases in SG',
    mode='lines+markers',
    marker_color='rgb(55, 83, 109)',
    hovertemplate=(
        '<br><b>Date</b>: %{x} <br>'
        '<b>Confirmed Cases:</b> %{y}<br>'
    ),
)
fig = go.Figure()
fig.add_trace(_sg_trace)

# Marker styling.
fig.update_traces(mode='lines+markers', marker_line_width=2, marker_size=5)

# Centred title near the top; zero lines hidden on both axes.
fig.update_layout(
    title=dict(
        text='Confermed case in Singapore',
        y=0.95,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    yaxis_zeroline=False,
    xaxis_zeroline=False,
    hoverlabel_align='left',
)

fig.show()
# Baseline Prophet model for China excluding Hubei: additive mode with yearly,
# weekly and daily seasonality all switched off.
m_d = Prophet(
    seasonality_mode='additive',
    yearly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=False,
)

# The frame is handed to Prophet with its two columns renamed to 'ds' and 'y';
# this renames the columns of the shared frame in place.
df_China_without_Hubei_grouped.columns = ['ds', 'y']
m_d.fit(df_China_without_Hubei_grouped)

# Extend the history by 7 periods and predict over history + future.
future_China_without_Hubei = m_d.make_future_dataframe(periods=7)
fcst_daily_China_without_Hubei = m_d.predict(future_China_without_Hubei)
# to quantify our prediction performance
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    MAPE is the mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100.
    The percentage error is undefined where the actual value is 0, so those
    observations are left out of the mean instead of turning the whole
    result into ``inf``/``nan``.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        The MAPE as a float percentage, or ``nan`` when there is no
        observation with a non-zero actual value (including empty input).
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Zero actuals are masked out below; silence the warnings they would raise here.
    with np.errstate(divide='ignore', invalid='ignore'):
        pct_error = np.abs((actual - predicted) / actual)

    usable = np.broadcast_to(actual != 0, pct_error.shape)
    if not usable.any():
        return float('nan')
    return np.mean(pct_error[usable]) * 100
def plot_China_without_Hubei():
    """Plot actual vs. forecast confirmed cases for China excluding Hubei.

    Reads the module-level frames ``df_China_without_Hubei_grouped`` (columns
    ``ds``/``y``) and ``fcst_daily_China_without_Hubei`` at call time, draws
    the actual points, the upper/lower forecast band and the prediction line,
    then prints the MAPE of the forecast over the observed date range.
    """
    observed = df_China_without_Hubei_grouped
    forecast = fcst_daily_China_without_Hubei

    def band_trace(label, values):
        # One edge of the forecast band.
        return {
            "fill": "tonexty",
            "line": {"color": "#57b8ff"},
            "mode": "lines",
            "name": label,
            "type": "scatter",
            "x": forecast.ds,
            "y": values,
        }

    def axis_style(title):
        # Axis settings shared by x and y; only the title differs.
        return {
            "title": title,
            "ticklen": 5,
            "gridcolor": "rgb(255, 255, 255)",
            "gridwidth": 2,
            "zerolinewidth": 1,
        }

    actual_points = {
        "fill": None,
        "mode": "markers",
        "name": "actual no. of Confirmed",
        "type": "scatter",
        "x": observed.ds,
        "y": observed.y,
    }
    prediction_line = {
        "line": {"color": "#eb0e0e"},
        "mode": "lines+markers",
        "name": "prediction",
        "type": "scatter",
        "x": forecast.ds,
        "y": forecast.yhat,
    }
    traces = [
        actual_points,
        band_trace("upper_band", forecast.yhat_upper),
        band_trace("lower_band", forecast.yhat_lower),
        prediction_line,
    ]
    chart_layout = {
        "title": "Confirmed - Time Series Forecast - Daily Trend",
        "xaxis": axis_style(""),
        "yaxis": axis_style("Confirmed nCov - Hubei"),
    }
    iplot(go.Figure(data=traces, layout=chart_layout))

    # Score only the part of the forecast that overlaps the observed dates.
    last_observed = observed.ds.max()
    in_sample = forecast.loc[forecast['ds'] <= last_observed].yhat.values
    print('MAPE : {}'.format(mean_absolute_percentage_error(observed.y.values, in_sample)))
# Chart and score the baseline China-excluding-Hubei forecast.
plot_China_without_Hubei()

# Baseline Prophet model for Singapore: additive mode with yearly, weekly and
# daily seasonality all switched off.
m_d_SG = Prophet(
    seasonality_mode='additive',
    yearly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=False,
)

# The frame is handed to Prophet with its two columns renamed to 'ds' and 'y';
# this renames the columns of the shared frame in place.
df_Singapore_grouped.columns = ['ds', 'y']
m_d_SG.fit(df_Singapore_grouped)

# Extend the history by 7 periods and predict over history + future.
future_Singapore_grouped = m_d_SG.make_future_dataframe(periods=7)
fcst_daily_Singapore_grouped = m_d_SG.predict(future_Singapore_grouped)
def plot_Singapore():
    """Plot actual vs. forecast confirmed cases for Singapore.

    Reads the module-level frames ``df_Singapore_grouped`` (columns
    ``ds``/``y``) and ``fcst_daily_Singapore_grouped`` at call time, draws the
    actual points, the upper/lower forecast band and the prediction line,
    then prints the MAPE of the forecast over the observed date range.
    """
    # Observed daily totals.
    trace1 = {
        "fill": None,
        "mode": "markers",
        "name": "actual no. of Confirmed",
        "type": "scatter",
        "x": df_Singapore_grouped.ds,
        "y": df_Singapore_grouped.y
    }
    # Upper edge of the forecast band.
    trace2 = {
        "fill": "tonexty",
        "line": {"color": "#57b8ff"},
        "mode": "lines",
        "name": "upper_band",
        "type": "scatter",
        "x": fcst_daily_Singapore_grouped.ds,
        "y": fcst_daily_Singapore_grouped.yhat_upper
    }
    # Lower edge of the forecast band.
    trace3 = {
        "fill": "tonexty",
        "line": {"color": "#57b8ff"},
        "mode": "lines",
        "name": "lower_band",
        "type": "scatter",
        "x": fcst_daily_Singapore_grouped.ds,
        "y": fcst_daily_Singapore_grouped.yhat_lower
    }
    # Point forecast.
    trace4 = {
        "line": {"color": "#eb0e0e"},
        "mode": "lines+markers",
        "name": "prediction",
        "type": "scatter",
        "x": fcst_daily_Singapore_grouped.ds,
        "y": fcst_daily_Singapore_grouped.yhat
    }
    data = [trace1, trace2, trace3, trace4]
    layout = {
        "title": "Confirmed - Time Series Forecast - Daily Trend",
        "xaxis": {
            "title": "",
            "ticklen": 5,
            "gridcolor": "rgb(255, 255, 255)",
            "gridwidth": 2,
            "zerolinewidth": 1
        },
        "yaxis": {
            # This chart shows Singapore data; the previous "- Hubei" label was
            # carried over from the China plot.
            "title": "Confirmed nCov - Singapore",
            "ticklen": 5,
            "gridcolor": "rgb(255, 255, 255)",
            "gridwidth": 2,
            "zerolinewidth": 1
        },
    }
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

    # Score only the part of the forecast that overlaps the observed dates.
    max_date = df_Singapore_grouped.ds.max()
    y_true = df_Singapore_grouped.y.values
    y_pred_daily = fcst_daily_Singapore_grouped.loc[fcst_daily_Singapore_grouped['ds'] <= max_date].yhat.values
    print('MAPE : {}'.format(mean_absolute_percentage_error(y_true, y_pred_daily)))
    return
# Chart the baseline Singapore forecast and print its MAPE over the observed dates.
plot_Singapore()
This doesn't look like a really good prediction. For the Chinese cities, the prediction doesn't capture the downward trend of the change.
Now let's try adding 'changepoint' features for more accurate modeling.
# Refit the China-excluding-Hubei model, this time with explicit changepoint
# settings; the seasonality options are the same as in the baseline fit.
m_d = Prophet(
    seasonality_mode='additive',
    yearly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_range=0.85,
    changepoint_prior_scale=20,
    n_changepoints=19,
)

# Prophet input columns are named 'ds' and 'y'.
df_China_without_Hubei_grouped.columns = ['ds', 'y']
m_d.fit(df_China_without_Hubei_grouped)

# Rebind the module-level forecast frame that plot_China_without_Hubei() reads.
future_China_without_Hubei = m_d.make_future_dataframe(periods=7)
fcst_daily_China_without_Hubei = m_d.predict(future_China_without_Hubei)

plot_China_without_Hubei()
# Refit the Singapore model, this time with explicit changepoint settings; the
# seasonality options are the same as in the baseline fit.
m_d_SG = Prophet(
    seasonality_mode='additive',
    yearly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_range=0.8,
    changepoint_prior_scale=19,
    n_changepoints=20,
)

# Prophet input columns are named 'ds' and 'y'.
df_Singapore_grouped.columns = ['ds', 'y']
m_d_SG.fit(df_Singapore_grouped)

# Rebind the module-level forecast frame that plot_Singapore() reads.
future_Singapore_grouped = m_d_SG.make_future_dataframe(periods=7)
fcst_daily_Singapore_grouped = m_d_SG.predict(future_Singapore_grouped)

plot_Singapore()
Note that we did not change the seasonality settings here.