This time, I will use matplotlib to visualize data on people infected with coronavirus in Tokyo.
The data on the number of people infected with coronavirus in Tokyo can be downloaded in CSV format from ["Details of announcement of new coronavirus positive patients in Tokyo"](https://catalog.data.metro.tokyo.lg.jp/dataset/t000010d0000000068/resource/c2d997db-1450-43fa-8037-ebb11ec28d4c).
import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib
import os
import numpy as np
from matplotlib import dates as mdates
from matplotlib.ticker import MultipleLocator
from matplotlib.dates import DateFormatter
import seaborn as sns
First, check the data.
# Load the patient list, then print the column names followed by the first
# and last rows as raw arrays.
df = pd.read_csv('130001_tokyo_covid19_patients.csv')
for banner, payload in (
    ('------column-------', df.columns.values),
    ('----head values----', df.head().values),
    ('----tail values----', df.tail().values),
):
    print(banner)
    print(payload)
#output
------column-------
['No' 'National local government code' 'Name of prefectures' 'City name' 'Published_date' 'Day of the week' 'Onset_date' 'patient_residence'
'patient_Age' 'patient_sex' 'patient_attribute' 'patient_Status' 'patient_Symptoms' 'patient_Travel history flag' 'Remarks' 'Discharged flag']
----head values----
[[1 130001 'Tokyo' nan '2020-01-24' 'Money' nan 'Wuhan City, Hubei Province' 'Forties' 'male' nan nan nan
nan nan 1.0]
[2 130001 'Tokyo' nan '2020-01-25' 'soil' nan 'Wuhan City, Hubei Province' '30s' 'Female' nan nan nan
nan nan 1.0]
[3 130001 'Tokyo' nan '2020-01-30' 'wood' nan 'Changsha City, Hunan Province' '30s' 'Female' nan nan nan
nan nan 1.0]
[4 130001 'Tokyo' nan '2020-02-13' 'wood' nan 'In Tokyo' '70s' 'male' nan nan nan nan
nan 1.0]
[5 130001 'Tokyo' nan '2020-02-14' 'Money' nan 'In Tokyo' '50s' 'Female' nan nan nan nan
nan 1.0]]
----tail values----
[[26064 130001 'Tokyo' nan '2020-10-02' 'Money' nan nan '50s' 'male' nan nan nan
nan nan nan]
[26065 130001 'Tokyo' nan '2020-10-02' 'Money' nan nan '50s' 'male' nan nan nan
nan nan nan]
[26066 130001 'Tokyo' nan '2020-10-02' 'Money' nan nan '70s' 'Female' nan nan nan
nan nan nan]
[26067 130001 'Tokyo' nan '2020-10-02' 'Money' nan nan '50s' 'male' nan nan nan
nan nan nan]
[26068 130001 'Tokyo' nan '2020-10-02' 'Money' nan nan '60s' 'male' nan nan nan
nan nan nan]]
Each row appears to hold the date, age, gender, and so on for a single person, so I will add a count column to make later aggregation easy. The date column is also converted to datetime.
# Add a constant count column (summed by the plots below) and parse the
# publication date strings into datetimes.
df = df.assign(qty=1, Published_date=pd.to_datetime(df['Published_date']))
Plot the number of infected people by date.
def plot_bar(df):
    """Show a bar chart of the summed 'qty' per publication date.

    Args:
        df: DataFrame with a 'Published_date' column (used as the x axis)
            and a numeric 'qty' column that is summed per date.
    """
    # Aggregate by date. The aggregation is named by string because recent
    # pandas versions deprecate passing the ``np.sum`` callable here.
    daily = pd.pivot_table(df, index='Published_date', values='qty', aggfunc='sum')
    labels = daily.index.values
    vals = daily['qty'].to_numpy()
    # Figure generation
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(labels, height=vals, linewidth=1, color='orangered', alpha=0.8)
    plt.show()
The chart looks a little plain as it is, so I will polish it up.
# Styling for the daily bar chart. `ax` is not defined in this fragment.
# NOTE(review): presumably these lines belong inside plot_bar, before its
# plt.show() - confirm the intended placement.
# Colour the ticks and tick labels on each axis
ax.tick_params(axis='y', colors='gray')
ax.tick_params(axis='x', colors='dimgray')
# Show grid lines for the y axis only
ax.grid(axis='y')
# Set the y label, fix the y range to 0-500, and colour the label
ax.set(ylabel='Number of infected people', ylim=(0, 500))
ax.yaxis.label.set_color('gray')
# Hide the tick marks on the bottom and left sides
ax.tick_params(bottom=False, left=False)
# Place one major x tick per month
ax.xaxis.set_major_locator(mdates.MonthLocator())
# Format the x tick labels as month-day
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
# Hide every spine except the bottom one
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
# Set the title
ax.set_title('tokyo covid19 patients-bar', color='gray')
plt.tight_layout()
plt.show()
As reported in the news, you can see that the numbers calmed down for a while from May and have since flared up again.
Let's plot the number of infected people by day of the week.
def plot_barh(df):
    """Show a horizontal bar chart of the summed 'qty' per day of the week.

    Bars are drawn in the order the pivot table yields the weekday labels,
    which is not necessarily calendar order.

    Args:
        df: DataFrame with a 'Day of the week' column and a numeric 'qty'
            column that is summed per weekday.
    """
    # Aggregate by weekday. The aggregation is named by string because recent
    # pandas versions deprecate passing the ``np.sum`` callable here.
    per_weekday = pd.pivot_table(df, index='Day of the week', values='qty', aggfunc='sum')
    week_days = per_weekday.index.values
    # Number of infected people for each weekday, aligned with week_days
    vals = per_weekday['qty'].to_numpy()
    # Graph generation
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(week_days, vals, color='tomato')
    plt.show()
The plot worked, but the days of the week are not in calendar order, so I will fix that as well.
def plot_barh(df):
    """Show a styled horizontal bar chart of summed 'qty' per day of the week.

    Weekdays are listed top-down in the fixed order given below, and each
    bar is annotated with its total.

    Args:
        df: DataFrame with a 'Day of the week' column and a numeric 'qty'
            column that is summed per weekday.
    """
    # These labels must match the values stored in 'Day of the week' exactly.
    weekday_order = ['Moon', 'fire', 'water', 'wood', 'Money', 'soil', 'Day']

    # The aggregation is named by string because recent pandas versions
    # deprecate passing the ``np.sum`` callable here.
    df_pivot = pd.pivot_table(df, index='Day of the week', values='qty', aggfunc='sum')
    # Sort the days of the week. A weekday missing from the data becomes 0
    # instead of NaN, so the bar labels and text offsets below stay numeric.
    df_pivot = df_pivot.reindex(index=weekday_order, fill_value=0)
    week_days = df_pivot.index.values
    # List the number of infected people by day of the week
    vals = df_pivot['qty'].tolist()

    # Graph generation
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(week_days, vals, color='tomato')
    # Change the y tick label color
    ax.tick_params(axis='y', colors='dimgray')
    # barh puts the first category at the bottom; invert the axis so the
    # list reads from top to bottom
    ax.invert_yaxis()
    # Erase the tick lines
    ax.tick_params(bottom=False, left=False)
    # Erase the border
    for side in ('bottom', 'top', 'left', 'right'):
        ax.spines[side].set_visible(False)
    # Remove the x tick labels; the totals are written next to each bar
    ax.set_xticklabels([])
    # Display each total just to the right of its bar, offset by 2% of the
    # longest bar
    vmax = max(vals)
    for i, val in enumerate(vals):
        ax.text(val + vmax * 0.02, i, f'{val:,}', fontsize='x-large', va='center', color='darkred')
    # Give a title
    ax.set_title('tokyo covid19 patients-barh(day of week count)', color='dimgray')
    plt.show()
The plot now works and looks neat. More cases seem to be announced from Thursday to Saturday; maybe the aim is to discourage people from going out over the weekend. Well, I suspect that the number of people taking PCR tests simply varies by day of the week.
Plot the number of infected people by age group and gender. The gender and age columns contain placeholder values such as 'unknown', '―', and '-', so these should be cleaned out in advance.
def plot_stacked_bar(df):
    """Show a stacked bar chart of summed 'qty' per age group, split by sex.

    Rows whose sex or age is a placeholder ('unknown', '―', '-') are dropped
    before aggregating.

    Args:
        df: DataFrame with 'patient_Age', 'patient_sex' and a numeric 'qty'
            column.
    """
    # Double quotes are required for the labels that contain an apostrophe.
    age_order = ['Under 10 years old', "10's", "20's", '30s', 'Forties', '50s',
                 '60s', '70s', '80s', '90s', '100 years and over']
    labels = ['male', 'Female']

    # cleaning
    # Unify variant spellings of the male label.
    # NOTE(review): the replacement text 'Man性' looks garbled and matches
    # neither entry of `labels`; confirm the intended mapping against the data.
    df = df.replace({'patient_sex': {'Man': 'Man性'}})
    # Delete records of unknown gender and age
    df = df[~df['patient_sex'].isin(['unknown', '―', '-'])]
    df = df[~df['patient_Age'].isin(['-', 'unknown'])]

    # Aggregated by gender and age. The aggregation is named by string
    # because recent pandas versions deprecate passing ``np.sum`` here.
    df_pivot = pd.pivot_table(df, index='patient_Age', columns='patient_sex', values='qty', aggfunc='sum')
    # Rearrange the rows into age order and select the sex columns by label.
    # Pivot columns are sorted by label, so picking them by position would
    # attach the wrong legend entry to each series. Missing combinations
    # become 0 so a NaN base cannot hide the stacked segment.
    df_pivot = df_pivot.reindex(index=age_order, columns=labels).fillna(0)

    men_qty = df_pivot[labels[0]].to_numpy()
    women_qty = df_pivot[labels[1]].to_numpy()
    ages = df_pivot.index.values

    # figure, ax generation
    fig, ax = plt.subplots(figsize=(10, 6))
    # Plot of stacked bar: the second series sits on top of the first
    width = 0.6
    ax.bar(ages, men_qty, width, label=labels[0], color='skyblue')
    ax.bar(ages, women_qty, width, label=labels[1], color='pink', bottom=men_qty)
    plt.show()
This will also be modified.
# Styling for the stacked bar chart. `ax` is not defined in this fragment.
# NOTE(review): presumably these lines belong inside plot_stacked_bar, before
# its plt.show() - confirm the intended placement.
# Hide the tick marks on the bottom and left sides
ax.tick_params(bottom=False, left=False)
# Hide every spine except the bottom one
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
# Set the y and x axis labels and colour them
ax.set(ylabel='Number of infected people')
ax.yaxis.label.set_color('gray')
ax.set(xlabel='Age')
ax.xaxis.label.set_color('gray')
# Colour the ticks and tick labels on both axes
ax.tick_params(axis='y', colors='dimgray')
ax.tick_params(axis='x', colors='dimgray')
# Show the legend anchored just outside the right edge of the axes, unframed
ax.legend(loc="upper left", bbox_to_anchor=(1.02, 1.0,), borderaxespad=0, frameon=False)
# Show grid lines for the y axis only
ax.grid(axis='y')
# Place a major y tick every 2000
ax.yaxis.set_major_locator(MultipleLocator(2000))
# Format y tick values with thousands separators and no decimals
# NOTE(review): passing a format string directly needs a matplotlib version
# that accepts str formatters - confirm the minimum supported version.
ax.yaxis.set_major_formatter('{x:,.0f}')
# Give a title
ax.set_title('tokyo covid19 patients-stacked bar(age,sex,count)', color='dimgray')
plt.show()
I am struck by the large difference between teens and people in their 20s. Teenagers may not have been tested in the first place because their physical condition does not deteriorate. Also, because many of them are students, it may indicate that outbreaks can be prevented by closing schools.
Create a heat map by age and month. This is easy to write with seaborn, so I will use it for the plot.
def plot_heatmap(df):
    """Show a heat map of summed 'qty' by age group (rows) and month (columns).

    Rows whose sex or age is a placeholder ('unknown', '―', '-') are dropped
    before aggregating. Months are taken from the calendar month of
    'Published_date', so the same month of different years shares a column.

    Args:
        df: DataFrame with a datetime 'Published_date' column plus
            'patient_Age', 'patient_sex' and a numeric 'qty' column.
    """
    month_names = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                   'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    # Double quotes are required for the labels that contain an apostrophe.
    age_order = ['Under 10 years old', "10's", "20's", '30s', 'Forties', '50s',
                 '60s', '70s', '80s', '90s', '100 years and over']

    df = df.set_index('Published_date')
    df['month'] = df.index.month
    # Delete records of unknown gender and age
    df = df[~df['patient_sex'].isin(['unknown', '―', '-'])]
    df = df[~df['patient_Age'].isin(['-', 'unknown'])]

    # Aggregated by month and age. The aggregation is named by string
    # because recent pandas versions deprecate passing ``np.sum`` here.
    df_pivot = pd.pivot_table(df, index='patient_Age', columns='month', values='qty', aggfunc='sum')
    # Rearrange the rows into age order
    df_pivot = df_pivot.reindex(index=age_order)

    fig, ax = plt.subplots(figsize=(10, 6))
    # Plot with seaborn, drawing explicitly on the axes created above
    sns.heatmap(df_pivot, annot=True, fmt="1.0f", cmap='YlGnBu', ax=ax)
    ax.tick_params(bottom=False, left=False)
    # Label each column from the month number actually present in the data,
    # so the label count always matches the number of heat map columns.
    ax.set_xticklabels([month_names[month - 1] for month in df_pivot.columns])
    ax.set_title('tokyo covid19 heatmap(month,age count)', color='gray')
    # Change the color of the y / x tick labels
    ax.tick_params(axis='y', colors='dimgray')
    ax.tick_params(axis='x', colors='dimgray')
    # Erase the y and x axis labels
    ax.set(ylabel='', xlabel='')
    plt.show()
Visualizing the data makes its trends easier to read. The default matplotlib graphs look a bit plain, so I'd like to keep studying techniques for making clean graphs.
Recommended Posts