Daily-updated information on the novel coronavirus is posted on the website of the Ministry of Health, Labor and Welfare, under its press releases on novel coronavirus infection (outbreak situation, domestic cases, overseas situation, etc.). From there, I created a program that collects the detailed per-prefecture data (though only the data from 5/10 onward, which shares the same PDF format) and records it in Google Spread Sheet or Excel, and I would like to introduce it here.
Since the results are recorded in spreadsheet software, you can later graph them to get a bird's-eye view of changes in the number of infected people, the rate of severe cases, and so on.
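For example, here is a minimal sketch of graphing the Excel output with pandas and matplotlib. It assumes the covid19.xlsx layout the program below produces (prefecture names in column A, one column per report date), and the row label "東京都" is only an illustrative example.

import pandas as pd
import matplotlib.pyplot as plt

# Prefecture names are in the first column, report dates in the first row.
df = pd.read_excel("covid19.xlsx", sheet_name="Number of positives", index_col=0)

# Plot the time series for one prefecture (the row label is an example).
df.loc["東京都"].plot(title="Number of positives")
plt.show()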
If you use the Excel output, the program works as is, but accessing Google Spread Sheet from a Python program takes a little preparation. Please refer to sites such as "[Don't hesitate anymore] Summary of initial settings for reading and writing spreadsheets with Python" and "Edit Google Sheets in Python", complete the setup, and then enter the name of your json file and your spreadsheet key in the commented parts of the program.
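If you want to check that the setup works before running the full program, a minimal sketch like the following (using the same service-account authorization as the program itself) should print the title of the first worksheet:

import gspread
from oauth2client.service_account import ServiceAccountCredentials

scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    'json file name.json', scope)  # your json file name
gc = gspread.authorize(credentials)
workbook = gc.open_by_key('Spreadsheet key')  # your spreadsheet key
print(workbook.sheet1.title)  # fails unless the sheet is shared with the service account

Here is the whole program: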
import requests
import re
import regex
from bs4 import BeautifulSoup
from tika import parser
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import openpyxl
import pickle
# 1 writes to Google Spread Sheet, 2 writes to Excel.
isGss = int(input("1. GoogleSpreadSheet 2. Excel ")) == 1
req = requests.get(
    "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000121431_00086.html")
req.encoding = 'utf-8'
soup = BeautifulSoup(req.text, 'html.parser')
urls = {}
# Load the dates that have already been recorded, if any.
try:
    with open("dates.pickle", "rb") as f:
        dates = pickle.load(f)
except FileNotFoundError:
    dates = []
column_cnt = len(dates) + 2
for i in soup.select("div[class='m-grid__col1']")[1].find_all("a"):
    url = i["href"]
    if "Current status of new coronavirus infection and response by the Ministry of Health, Labor and Welfare" not in i.text:
        continue
    # The date is usually in the link text; otherwise fetch the linked page.
    b = re.findall(r"Reiwa\d{1,2}Year\d{1,2}Month\d{1,2}Day", i.text)
    if b:
        date = b[0]
    else:
        temp_r = requests.get(url)
        temp_r.encoding = 'utf-8'
        date = re.findall(r"Reiwa\d{1,2}Year\d{1,2}Month\d{1,2}Day", temp_r.text)[0]
    # Convert full-width digits in the date string to half-width digits.
    date = date.translate(str.maketrans("０１２３４５６７８９", "0123456789"))
    # Zero-pad the year, month, and day to two digits each.
    date = "".join([(part.zfill(2) if part.isdigit() else part) for part in re.findall(
        r"(Reiwa)(\d{1,2})(Year)(\d{1,2})(Month)(\d{1,2})(Day)", date)[0]])
    # Skip reports before Reiwa 2 (2020) 5/10; older PDFs use a different format.
    if re.findall(r"Reiwa02Year05Month0[1-9]Day|Reiwa02Year0[1-4]Month\d{1,2}Day", date):
        continue
    elif date not in dates:
        urls[date] = i["href"]
        dates.append(date)
with open("dates.pickle", "wb") as f:
    pickle.dump(dates, f)
pdfs = {}
# From each day's press-release page, find the per-prefecture PDF link.
for date, url in urls.items():
    temp_r = requests.get(url)
    temp_r.encoding = 'utf-8'
    soup = BeautifulSoup(temp_r.text, 'html.parser')
    for j in soup.find_all('a', href=True):
        if 'Status of prefectural test positives' in j.text:
            pdfs[date] = "https://www.mhlw.go.jp" + j['href']
def makeDict(text):
    # Parse the PDF text into {prefecture: {series name: value}}.
    l = {}
    for i in regex.findall(
            r"(\p{Han}(?:\s+\p{Han}|Total){1,2}|Other)([※\d\s]+?[\nG])",
            re.sub(r"※\d{1,2}", "", text).translate(
                str.maketrans({
                    "\u3000": " ",  # full-width space -> half-width
                    ",": "",        # strip thousands separators
                    "-": "0"        # treat "-" as zero
                }))):
        a = list(map(int, i[1].replace("G", "").split()))
        b = "".join(i[0].split())  # prefecture name without spaces
        l[b] = {}
        l[b]["Number of positives"] = a[0]
        l[b]["Number of people performing PCR tests"] = a[1]
        l[b]["Those who need hospital treatment, etc."] = a[2]
        l[b]["Severe"] = a[3]
        l[b]["Number of people discharged or canceled"] = a[4]
        l[b]["Death (cumulative)"] = a[5]
        l[b]["Checking"] = a[6]
    return l
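# Illustration (made-up values, not real data): for a PDF row such as
# "東京都 100 2000 300 40 600 50 10", makeDict returns an entry like
# {"東京都": {"Number of positives": 100, "Number of people performing PCR tests": 2000, ...}},
# i.e. one dict per prefecture keyed by the seven series names above.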
# Worksheet names, one per data series in the PDF table.
sheet_names = [
    'Number of positives', 'Number of people performing PCR tests',
    'Those who need hospital treatment, etc.', 'Severe',
    'Number of people discharged or canceled', 'Death (cumulative)',
    'Checking'
]
cnt = 0
if len(pdfs) == 0:
    pass  # no new reports to record
elif isGss:
    scope = [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive'
    ]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        'json file name.json', scope)  # your json file name
    gc = gspread.authorize(credentials)
    SPREADSHEET_KEY = 'Spreadsheet key'  # your spreadsheet key
    workbook = gc.open_by_key(SPREADSHEET_KEY)
    try:
        worksheets = [workbook.worksheet(name) for name in sheet_names]
        isFirst = False
    except gspread.exceptions.WorksheetNotFound:
        # First run: create one worksheet per data series.
        for name in sheet_names:
            workbook.add_worksheet(title=name, rows=100, cols=500)
        workbook.del_worksheet(workbook.sheet1)
        worksheets = [workbook.worksheet(name) for name in sheet_names]
        isFirst = True
    # One cell range per worksheet, spanning the new date columns.
    cas = [
        worksheet.range(1, column_cnt, 50, column_cnt + len(pdfs) - 1)
        for worksheet in worksheets
    ]
    for date, url in sorted(pdfs.items()):
        print(date)
        file_data = parser.from_buffer(requests.get(url).content)
        text = file_data["content"]
        l = makeDict(text)
        if isFirst:
            # Write the prefecture names into column A once.
            for worksheet in worksheets:
                cells = worksheet.range(2, 1, 50, 1)
                for i, key in enumerate(l.keys()):
                    cells[i].value = key
                worksheet.update_cells(cells)
            isFirst = False
        for i, worksheet in enumerate(worksheets):
            # Every len(pdfs)-th cell of the range belongs to this date's column.
            cells = cas[i][cnt::len(pdfs)]
            cells[0].value = date
            for j, a in enumerate(l.keys()):
                key = list(l[a].keys())[i]
                cells[j + 1].value = l[a][key]
        cnt += 1
    # Send all updates in one batch per worksheet.
    for i, worksheet in enumerate(worksheets):
        worksheet.update_cells(cas[i])
else:
    filename = "covid19.xlsx"
    try:
        wb = openpyxl.load_workbook(filename)
        wss = [wb[name] for name in sheet_names]
        isFirst = False
    except (FileNotFoundError, KeyError):
        # First run: create the workbook and one sheet per data series.
        wb = openpyxl.Workbook()
        wss = [wb.create_sheet(name) for name in sheet_names]
        wb.remove(wb['Sheet'])
        isFirst = True
    for date, url in sorted(pdfs.items()):
        print(date)
        file_data = parser.from_buffer(requests.get(url).content)
        text = file_data["content"]
        l = makeDict(text)
        if isFirst:
            # Write the prefecture names into column A once.
            for ws in wss:
                for i, key in enumerate(l.keys()):
                    ws.cell(i + 2, 1, key)
            isFirst = False
        for i, ws in enumerate(wss):
            ws.cell(1, column_cnt + cnt, date)
            for j, a in enumerate(l.keys()):
                key = list(l[a].keys())[i]
                ws.cell(j + 2, column_cnt + cnt, l[a][key])
        cnt += 1
    wb.save(filename)
Just save the program and run it with Python. A dates.pickle file, which stores the dates that have already been recorded, will be generated; please keep it in the same directory as the script.
You can easily visualize changes in the number of infected people. Looking at the nationwide trend, I wonder if the second wave is settling down...? If you want to observe the course of the infection in detail, or if you want to do some statistical processing, please give it a try. If you run the program on a schedule, for example with the task scheduler or Google Cloud Functions (GCF), the sheets can be kept up to date automatically.
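For reference, here is a minimal sketch of a scheduled run without an external scheduler. The script file name is an assumption, and for unattended runs you would want to hard-code isGss instead of prompting with input():

import subprocess
import time

while True:
    # Run the collector once, then sleep for a day.
    subprocess.run(["python", "covid19_collector.py"])  # hypothetical file name
    time.sleep(24 * 60 * 60)

In practice cron, the Windows task scheduler, or GCF with Cloud Scheduler is more robust, since a plain loop stops when the machine restarts.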