Scraping the number of downloads and the number of positive registrations shown in the upper left of the graph for the Ministry of Health, Labour and Welfare's COVID-19 Contact-Confirming Application (COCOA)
import base64
import requests
from bs4 import BeautifulSoup
# Page to scrape; the target image is embedded in the HTML as a base64
# data URI rather than linked as a separate file.
url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/cocoa_00138.html"
# Browser-like User-Agent (an IE11 string).
# NOTE(review): presumably the site rejects the default requests
# User-Agent -- confirm before removing.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
# requests has no default timeout; without one a stalled server would
# block this script forever.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "lxml")
# Take the second inline PNG matched by the selector (index 1).
# NOTE(review): assumed to be the graph image -- verify against the page;
# raises IndexError if fewer than two images match.
img_tag = soup.select('div.m-grid > div.m-grid__col1 > img[src^="data:image/png;base64"]')[1]
# Drop the data-URI prefix so only the base64 payload remains.
img_b64 = img_tag.get("src").replace("data:image/png;base64,", "")
# Raw PNG bytes of the image.
png = base64.b64decode(img_b64)
import pathlib
import datetime
# Today's date (local time); also reused later as the fallback update date.
dt_today = datetime.date.today()
# Date rendered as YYYYMMDD for the file name.
s_today = dt_today.strftime("%Y%m%d")
# Output path. The file name must not end in whitespace: a trailing space
# after ".png" would become part of the name on disk.
p = pathlib.Path(f"../getIMG_pool/cocoa_info{s_today}.png")
# Create the output directory (and any missing parents) if needed.
p.parent.mkdir(parents=True, exist_ok=True)
# Write the decoded PNG bytes; write_bytes opens and closes the file itself.
p.write_bytes(png)
# String form of the path for APIs that take a plain file name.
png_path = str(p)
OCR
# Notebook shell commands ("!" prefix) that install Tesseract OCR and the
# Python wrapper into the runtime.
# Register the alex-p Tesseract PPA and refresh the package index.
!add-apt-repository ppa:alex-p/tesseract-ocr -y
!apt update
# Install the Tesseract engine and its development library.
!apt install tesseract-ocr
!apt install libtesseract-dev
# Print the installed Tesseract version as a sanity check.
!tesseract -v
# Install Japanese language and script data (horizontal and vertical).
!apt install tesseract-ocr-jpn tesseract-ocr-jpn-vert
!apt install tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert
# List the languages Tesseract can now use.
!tesseract --list-langs
# Python bindings used by the OCR step.
!pip install pytesseract
import pytesseract
import cv2
import numpy as np
from google.colab.patches import cv2_imshow
# Load the saved PNG as a single-channel grayscale image.
img = cv2.imread(png_path, cv2.IMREAD_GRAYSCALE)
# cv2.imread signals failure by returning None instead of raising; fail
# here with a clear message rather than with a TypeError on the slice below.
if img is None:
    raise RuntimeError(f"could not read image: {png_path!r}")
# Crop rows 55-149 and columns 55-299.
# NOTE(review): assumed to be the text box in the upper left of the
# graph; the offsets are hard-coded for the current image layout.
img_crop = img[55:150, 55:300]
# Enlarge the crop 1.2x before OCR (cv2.resize takes (width, height)).
height, width = img_crop.shape
imgx12 = cv2.resize(img_crop, (int(width * 1.2), int(height * 1.2)))
# Show the image that is handed to the OCR engine.
cv2_imshow(imgx12)
# Run Japanese OCR with page segmentation mode 6, then remove "," and "."
# so digit runs are contiguous, and split the result into lines.
txt = (
    pytesseract.image_to_string(imgx12, lang="jpn", config="--psm 6")
    .strip()
    .replace(",", "")
    .replace(".", "")
).splitlines()
print(txt)
# ['As of December 28', '-Number of downloads:Approximately 22.45 million', '-Number of positive registrations:5566 cases']
import re
# Update date: parse "<month>月<day>日" from the first OCR line.
# OCR runs with lang="jpn", so the markers arrive as 月 / 日; the literal
# "Moon" / "Day" tokens are still accepted for backward compatibility.
# An empty OCR result is treated as "no match" instead of raising.
m = re.search(r"(\d{1,2})(?:Moon|月)(\d{1,2})(?:Day|日)", txt[0] if txt else "")
# Fall back to today's date when no usable date is found.
dt_update = dt_today
if m:
    year = dt_today.year
    month, day = map(int, m.group(1, 2))
    # The image carries no year: try the current year first and, if that
    # date is invalid (e.g. Feb 29) or would lie in the future, the
    # previous year.
    for candidate_year in (year, year - 1):
        try:
            candidate = datetime.date(candidate_year, month, day)
        except ValueError:
            # Not a real calendar date in this year (or an OCR misread).
            continue
        if candidate_year != year or candidate <= dt_today:
            dt_update = candidate
            break
def str2num(s: str) -> int:
    """Extract the count from one OCR line such as "約2245万件".

    Finds the first run of digits that is followed by an optional
    ten-thousand marker and then the counter suffix. OCR runs with
    lang="jpn", so the markers arrive as 万 / 件; the literal
    "Ten thousand" / "Case" tokens are also accepted for backward
    compatibility.

    Args:
        s: One line of OCR text.

    Returns:
        The parsed count, multiplied by 10000 when the ten-thousand marker
        is present, or 0 when the line contains no matching number.
    """
    m = re.search(r"(\d+)(Ten thousand|万)?(?:Case|件)", s)
    if not m:
        return 0
    n = int(m.group(1))
    # group(2) is None when the ten-thousand marker is absent.
    if m.group(2):
        n *= 10000
    return n
# OCR line 1 holds the number of downloads and line 2 the number of
# positive registrations; convert both to integers in one pass.
download_cnt, patients_cnt = map(str2num, (txt[1], txt[2]))
# Report the update date followed by the two counts.
print(dt_update, download_cnt, patients_cnt)