Convert the PDF of new coronavirus (COVID-19) cases in Aichi Prefecture to CSV

import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup


def fetch_file(url, dir="."):
    """Download ``url`` and store it under ``dir`` using the URL's last path part.

    Args:
        url: Address of the file to download.
        dir: Directory the file is written into; created if it is missing.

    Returns:
        ``pathlib.Path`` of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url)
    response.raise_for_status()

    # The final component of the URL doubles as the local file name.
    filename = pathlib.PurePath(url).name
    destination = pathlib.Path(dir, filename)
    destination.parent.mkdir(parents=True, exist_ok=True)

    destination.write_bytes(response.content)
    return destination


def days2date(s):
    """Convert a month/day cell into a ``pd.Timestamp`` in the current year.

    Args:
        s: Cell text expected to contain exactly two 1-2 digit numbers,
            month first and day second. Non-string values such as ``None``
            or NaN are tolerated.

    Returns:
        ``pd.Timestamp`` for that month and day in ``dt_now.year``, or
        ``pd.NaT`` when ``s`` is not a string or does not contain exactly
        two numbers.

    Raises:
        ValueError: If the two numbers do not form a valid calendar date.
    """
    # Empty table cells arrive as None/NaN; treat them like any other
    # non-date row instead of letting re.findall raise TypeError.
    if not isinstance(s, str):
        return pd.NaT

    days = re.findall(r"[0-9]{1,2}", s)
    if len(days) != 2:
        return pd.NaT

    month, day = map(int, days)

    # NOTE(review): the year always comes from the module-level ``dt_now``,
    # so rows belonging to a previous calendar year would be mis-dated --
    # confirm whether the source PDFs can span a year boundary.
    return pd.Timestamp(year=dt_now.year, month=month, day=day)


# Gregorian year immediately preceding year 1 of each supported era, keyed by
# both the Japanese and the romanized era name.
_WAREKI_ERA_BASE = {
    "昭和": 1925,
    "Showa": 1925,
    "平成": 1988,
    "Heisei": 1988,
    "令和": 2018,
    "Reiwa": 2018,
}

# Era name, era year (digits or the first-year marker 元), month and day.
# The romanized markers accepted previously are kept so existing inputs still
# match; the Japanese markers are presumably what the PDF header contains.
_WAREKI_RE = re.compile(
    r"(昭和|平成|令和|Showa|Heisei|Reiwa)"
    r"\s*(元|[0-9]{1,2})\s*(?:年|Year)"
    r"(\d{1,2})(?:月|Moon)"
    r"(\d{1,2})(?:日|Day)"
)


def wareki2date(s):
    """Convert a Japanese-era (wareki) date found in ``s`` to ``datetime.date``.

    Args:
        s: Text that may contain an era date such as ``令和2年5月3日`` or
            ``Reiwa2Year5Moon3Day``. ``None`` or an empty string is accepted.

    Returns:
        The Gregorian ``datetime.date`` of the first era date found in ``s``,
        or today's date taken from the module-level ``dt_now`` when no era
        date is present.

    Raises:
        ValueError: If the matched numbers do not form a valid calendar date.
    """
    m = _WAREKI_RE.search(s or "")

    if m is None:
        # Call date(): returning the bound method itself would break callers
        # that go on to use the result as a date (e.g. ``strftime``).
        return dt_now.date()

    era, year_text, month, day = m.groups()

    # 元 denotes the first year of an era.
    era_year = 1 if year_text == "元" else int(year_text)

    return datetime.date(_WAREKI_ERA_BASE[era] + era_year, int(month), int(day))


# Landing page that links to the case-list PDFs.
url = "https://www.pref.aichi.jp/site/covid19-aichi/"

# Browser-like User-Agent sent with the landing-page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Current time in Japan Standard Time; days2date() and wareki2date() read it.
JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
dt_now = datetime.datetime.now(JST)

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

dfs = []
dt_text = ""

heading = soup.find("span", text="▶ Cases of occurrence in Aichi Prefecture")

if heading is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError("Section heading for the case-list PDFs was not found on " + url)

# The dot is escaped so only hrefs that really end in ".pdf" are selected.
pdf_links = heading.parent.find_all("a", href=re.compile(r"\.pdf$"))

# Links are processed in reverse page order, so ``dt_text`` ends up holding the
# header text of the PDF that is linked first on the page.
for tag in reversed(pdf_links):

    link = urljoin(url, tag.get("href"))

    path_pdf = fetch_file(link)

    with pdfplumber.open(path_pdf) as pdf:

        for page in pdf.pages:

            if page.page_number == 1:

                # Thin strip near the top of the first page; presumably this
                # is where the "as of" date is printed -- confirm against the PDF.
                dt_text = page.within_bbox((0, 80, page.width, 90)).extract_text() or ""

            table = page.extract_table()

            # extract_table() yields None when the page holds no table.
            if not table:
                continue

            # First row of every page's table is used as its header.
            df_tmp = pd.DataFrame(table[1:], columns=table[0])

            dfs.append(df_tmp)

df = pd.concat(dfs).set_index("No")

df["Announcement date"] = df["Announcement date"].apply(days2date)

# Rows whose announcement date could not be parsed are discarded.
df.dropna(subset=["Announcement date"], inplace=True)

# Split the combined column into age and sex. The age part is matched lazily:
# a greedy ".+" swallows "Fe" and always reports "male" for "...Female".
df_ages = df["Age / Gender"].str.extract("(.+?)(male|Female)").rename(columns={0: "Age", 1: "sex"})

df = df.join(df_ages)


# The output file is named after the date printed in the PDF header.
dt_update = wareki2date(dt_text)

path_csv = pathlib.Path(dt_update.strftime("%Y%m%d") + ".csv")

# utf_8_sig writes a BOM at the start of the file.
df.to_csv(path_csv, encoding="utf_8_sig")

Recommended Posts

Convert the PDF of new coronavirus (COVID-19) cases in Aichi Prefecture to CSV
Convert PDF of available stores of Go To EAT in Kagoshima prefecture to CSV
Convert PDF of Go To EAT member stores in Ishikawa prefecture to CSV
Convert PDF of Kumamoto Prefecture Go To EAT member store list to CSV
Convert PDF of Chiba Prefecture Go To EAT member store list to CSV (command)
Convert PDF of product list containing effective surfactants for new coronavirus to CSV
Convert markdown to PDF in Python
Convert PDF of the situation of people infected in Tokyo with the new coronavirus infection of the Tokyo Metropolitan Health and Welfare Bureau to CSV
Convert PDF of Go To Eat Hokkaido campaign dealer list to CSV
Convert PDF of Sagamihara City presentation materials (occurrence status, etc.) regarding new coronavirus infection to CSV
Convert PDF of the progress of the division of labor (trends in insurance dispensing) of the Japan Pharmaceutical Association to CSV
How to convert csv to tsv in CLI
Convert from PDF to CSV with pdfplumber
I want to convert a table converted to PDF in Python back to CSV
Convert UTF-8 CSV files to read in Excel
Batch convert PSD files in directory to PDF
[Python] Convert PDF text to CSV page by page (2/24 postscript)
Convert the image in .zip to PDF with Python
Scraping the member stores of Go To EAT in Osaka Prefecture and converting them to CSV
Batch convert all xlsx files in the folder to CSV files
Scraping the list of Go To EAT member stores in Fukuoka prefecture and converting it to CSV
Partial in case of trouble
Convert SDF to CSV quickly
Sphinx extension to arbitrarily convert text in pre-processing of document generation
Convert a large number of PDF files to text files using pdfminer