Scraping the download count and the positive-registration count shown in the upper left of the graph on the Ministry of Health, Labour and Welfare's page for COCOA (the COVID-19 Contact-Confirming Application)

program

Scraping

import base64

import requests
from bs4 import BeautifulSoup

# Page that embeds the COCOA status graph as an inline base64 PNG
url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/cocoa_00138.html"

# Send a browser-like User-Agent with the request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# requests has no default timeout; without one a stalled server would block
# this cell forever, so give up after 30 seconds instead
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "lxml")

# Select the inline PNG <img> tags inside the grid column and take the
# second match (index 1)
img_tag = soup.select('div.m-grid > div.m-grid__col1 > img[src^="data:image/png;base64"]')[1]

# Drop the data-URI header so only the base64 payload remains
img_b64 = img_tag.get("src").replace("data:image/png;base64,", "")

# Raw PNG bytes of the graph image
png = base64.b64decode(img_b64)

Save image

import pathlib
import datetime

# Today's date; used for the file name here and as the reference date later
dt_today = datetime.date.today()

# Date as a compact YYYYMMDD string
s_today = dt_today.strftime("%Y%m%d")

# Destination path. The name must end exactly in ".png": a trailing space
# after the extension produces a file that tools do not recognise as a PNG.
p = pathlib.Path(f"../getIMG_pool/cocoa_info{s_today}.png")
p.parent.mkdir(parents=True, exist_ok=True)

# Write the decoded PNG bytes to disk
p.write_bytes(png)

# String form of the path, for APIs that expect str rather than Path
png_path = str(p)

OCR

Install tesseract-ocr

# Register the alex-p/tesseract-ocr PPA and refresh the package index
!add-apt-repository ppa:alex-p/tesseract-ocr -y
!apt update
# Install the Tesseract engine and its development library
!apt install tesseract-ocr
!apt install libtesseract-dev
# Print the installed Tesseract version
!tesseract -v

# Install the Japanese language and script data (horizontal and vertical)
!apt install tesseract-ocr-jpn  tesseract-ocr-jpn-vert
!apt install tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert
# List the languages Tesseract can now use
!tesseract --list-langs
# Python wrapper used below to call Tesseract
!pip install pytesseract

Extract text from images

import pytesseract

import cv2
import numpy as np

from google.colab.patches import cv2_imshow

# Load the saved PNG as a single-channel grayscale image
img = cv2.imread(png_path, cv2.IMREAD_GRAYSCALE)

# Keep only rows 55-149 and columns 55-299 of the image
img_crop = img[55:150, 55:300]

# Enlarge the cropped region by a factor of 1.2 (cv2.resize takes width first)
height, width = img_crop.shape
scaled_size = (int(width * 1.2), int(height * 1.2))
imgx12 = cv2.resize(img_crop, scaled_size)

# Display the image that will be passed to the OCR engine
cv2_imshow(imgx12)

# Run Japanese OCR with page segmentation mode 6, then strip surrounding
# whitespace, remove commas and periods, and split into lines
raw_text = pytesseract.image_to_string(imgx12, lang="jpn", config="--psm 6")
cleaned_text = raw_text.strip().replace(",", "").replace(".", "")
txt = cleaned_text.splitlines()

print(txt)
# ['As of December 28', '-Number of downloads:Approximately 22.45 million', '-Number of positive registrations:5566 cases']

Extracting the values from the text

import re

# Update date
#
# OCR is run with lang="jpn", so the date appears as <month>月<day>日
# (月 = month, 日 = day). The pattern is a raw string so that \d reaches the
# regex engine instead of being treated as a string escape.
m = re.search(r"(\d{1,2})月(\d{1,2})日", txt[0] if txt else "")

# Fall back to today's date when no usable date can be read from the OCR text
dt_update = dt_today

if m:
    month, day = map(int, m.group(1, 2))

    # The text carries no year. Try the current year first and, if that date
    # would lie in the future, the previous year.
    for year in (dt_today.year, dt_today.year - 1):
        try:
            candidate = datetime.date(year, month, day)
        except ValueError:
            # Not a real calendar date in this year (an OCR misread such as
            # month 13, or Feb 29 in a non-leap year): try the next candidate
            continue

        if candidate <= dt_today:
            dt_update = candidate
            break


def str2num(s: str) -> int:
    """Extract a count from one line of the Japanese OCR text.

    The line is searched for a run of digits, optionally followed by 万
    (ten thousand), followed by the counter 件 (cases). Whitespace that the
    OCR engine may insert between these tokens is tolerated.

    Args:
        s: One line of OCR output, e.g. "約2245万件" or "5566件".

    Returns:
        The count as an integer, multiplied by 10000 when 万 is present.
        Returns 0 when the line contains no such pattern.
    """
    # Raw string so that \d and \s reach the regex engine unchanged
    m = re.search(r"(\d+)\s*(万)?\s*件", s)

    if not m:
        return 0

    n = int(m.group(1))

    # 万 scales the number by ten thousand
    if m.group(2):
        n *= 10000

    return n


# The second OCR line holds the number of downloads and the third holds the
# number of positive registrations; convert both in one step
download_cnt, patients_cnt = map(str2num, (txt[1], txt[2]))

# Show the update date together with both counts
print(dt_update, download_cnt, patients_cnt)