Scraping the number of downloads and positive registrations of the new coronavirus contact confirmation app

Scrape the number of downloads and the number of positive registrations shown in the upper-left corner of the graph on the Ministry of Health, Labour and Welfare's page for the COVID-19 Contact-Confirming Application (COCOA).




import base64

import requests
from bs4 import BeautifulSoup

# Page to scrape.
# NOTE(review): the URL literal is empty in this copy of the script, and
# requests.get("") raises MissingSchema -- fill in the page URL before running.
url = ""

# Send a browser-like User-Agent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

r = requests.get(url, headers=headers)
# Fail on an HTTP error status instead of parsing an error page.
r.raise_for_status()

soup = BeautifulSoup(r.content, "lxml")

# Select the <img> tags in the first grid column whose src is an inline
# base64 PNG; the second match ([1]) is the one that gets decoded.
img_tag = soup.select(
    'div.m-grid > div.m-grid__col1 > img[src^="data:image/png;base64"]'
)[1]

# Drop the data-URI prefix so only the base64 payload remains.
img_b64 = img_tag.get("src").replace("data:image/png;base64,", "")

# Raw PNG bytes of the embedded image.
png = base64.b64decode(img_b64)

Save image

import pathlib
import datetime

# Today's date
dt_today = datetime.date.today()

# Convert date to string (YYYYMMDD), used in the output file name
s_today = dt_today.strftime("%Y%m%d")

# PATH specification (no trailing space, so the name really ends in ".png")
p = pathlib.Path(f"../getIMG_pool/cocoa_info{s_today}.png")
# Create the output directory if it does not exist yet.
p.parent.mkdir(parents=True, exist_ok=True)

# Write the decoded PNG bytes to disk.
with p.open("wb") as fw:
    fw.write(png)

# Path as a plain string for later use.
png_path = str(p)


Install tesseract-ocr

# Notebook shell escapes: install the tesseract OCR engine and its Python wrapper.
# Register the alex-p tesseract-ocr PPA, then refresh the package index.
!add-apt-repository ppa:alex-p/tesseract-ocr -y
!apt update
# Install the tesseract engine and its development package.
!apt install tesseract-ocr
!apt install libtesseract-dev
# Print the installed tesseract version.
!tesseract -v

# Install the Japanese language / script data packages (including the vertical variants).
!apt install tesseract-ocr-jpn  tesseract-ocr-jpn-vert
!apt install tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert
# List the languages tesseract can now use.
!tesseract --list-langs
# Install the pytesseract package.
!pip install pytesseract


Extract text from images

import pytesseract

import cv2
import numpy as np

from google.colab.patches import cv2_imshow

# Read in grayscale (flag 0)
img = cv2.imread(png_path, 0)
# cv2.imread returns None instead of raising when the image cannot be read.
if img is None:
    raise FileNotFoundError(f"Could not read image: {png_path}")

# Image crop: rows 55-149, columns 55-299
img_crop = img[55:150, 55:300]

# Resize 1.2 times
height, width = img_crop.shape
imgx12 = cv2.resize(img_crop, (int(width * 1.2), int(height * 1.2)))

# Check the image (displayed inline in the notebook)
cv2_imshow(imgx12)

# Run OCR with the Japanese model, strip "," and "." so the digits of a
# number are contiguous, and split the result into a list of lines; the
# steps below index this list as txt[0], txt[1], txt[2].
txt = (
    pytesseract.image_to_string(imgx12, lang="jpn", config="--psm 6")
    .replace(",", "")
    .replace(".", "")
    .splitlines()
)

# Sample result, as recorded in the original comment (appears to be an English rendering):
# ['As of December 28', '-Number of downloads:Approximately 22.45 million', '-Number of positive registrations:5566 cases']

Character extraction

import re

# Update date
# Look for "<month>月<day>日" in the first OCR line.
# NOTE(review): the literals "Moon" / "Day" in the previous pattern look like
# machine-translated forms of 月 / 日; both spellings are accepted -- confirm.
m = re.search(r"(\d{1,2})(?:月|Moon)(\d{1,2})(?:日|Day)", txt[0])

# Fall back to today's date when no date is found in the text.
dt_update = dt_today

if m:
    year = dt_today.year
    month, day = map(int, m.group(1, 2))

    dt_update = datetime.date(year, month, day)

    # The text carries no year: a date later than today is taken to be
    # from the previous year.
    if dt_today < dt_update:
        dt_update = datetime.date(year - 1, month, day)

def str2num(s: str) -> int:
    """Extract the count that precedes the counter suffix in *s*.

    Searches *s* for a run of digits, optionally followed by a
    "ten thousand" marker, followed by the counter suffix. When the marker
    is present the number is multiplied by 10000.

    NOTE(review): the literals "Ten thousand" / "Case" in the previous
    pattern look like machine-translated forms of 万 / 件. Both spellings
    are accepted here -- confirm against real OCR output.

    Args:
        s: One line of text to search.

    Returns:
        The extracted count, or 0 when the pattern is not found.
    """
    m = re.search(r"(\d+)(万|Ten thousand)?(?:件|Case)", s)
    n = 0

    if m:
        n = int(m.group(1))

        # Group 2 is None unless the "ten thousand" marker was matched.
        if m.group(2):
            n = n * 10000

    return n

# Second OCR line -> number of downloads; third OCR line -> number of
# positive registrations.
download_cnt, patients_cnt = str2num(txt[1]), str2num(txt[2])

# Report the update date together with both counts.
print(dt_update, download_cnt, patients_cnt)

