# Convert the PDF of Sagamihara City's announcement materials on the new coronavirus infection (occurrence status, etc.) to CSV

import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup


def fetch_file(url, dir=".", timeout=30):
    """Download *url* and save it inside *dir*.

    The file is named after the last path component of *url*. Missing parent
    directories are created.

    Args:
        url: Address of the file to download.
        dir: Directory to save into (defaults to the current directory).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        pathlib.Path of the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Last path component of the URL becomes the local file name
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


# Page that carries the link to the PDF
url = "https://www.city.sagamihara.kanagawa.jp/shisei/koho/1019191.html"

# The page is requested with a browser-like User-Agent header
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# A timeout keeps a stalled server from hanging the script
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# First <a> whose href ends in ".pdf" (dot escaped so it is matched literally)
# and whose onclick attribute contains the title text
tag = soup.find(
    "a",
    href=re.compile(r"\.pdf$"),
    onclick=re.compile("Confirmation of new patients due to new coronavirus infection"),
)

# Fail with a clear message instead of an AttributeError on None
if tag is None:
    raise RuntimeError(f"No matching PDF link found on {url}")

# The href may be relative, so resolve it against the page URL
link = urljoin(url, tag.get("href"))

path_pdf = fetch_file(link)

with pdfplumber.open(path_pdf) as pdf:

    # One DataFrame per accepted table, in page order
    dfs = []

    for page in pdf.pages:

        if page.page_number == 1:

            # Text inside the box (x0=400, top=44, x1=page width, bottom=60)
            # of the first page; it is parsed later as the update date
            crop = page.within_bbox((400, 44, page.width, 60))
            update = crop.extract_text()

        for table in page.extract_tables():

            df_tmp = pd.DataFrame(table)

            # Keep only tables with 11 columns
            if df_tmp.shape[1] != 11:
                continue

            # Skip tables whose top-left cell contains "Less than".
            # Guard against a missing (None) cell, which would otherwise
            # raise TypeError on the "in" test.
            top_left = table[0][0] or ""
            if "Less than" in top_left:
                continue

            dfs.append(df_tmp)

# Labels for the 11 table columns, in order
column_names = [
    "Case No..",
    "Age",
    "sex",
    "Occupation, etc.",
    "place",
    "residence",
    "Symptoms",
    "Date of onset",
    "Positive finding date",
    "Infection route, etc.",
    "Remarks",
]

# Stack all collected tables, drop the first row of the stacked result,
# then label the columns
combined = pd.concat(dfs)
df = combined.iloc[1:].set_axis(column_names, axis=1)

# Bare expression: displays the frame in an interactive session, otherwise no effect
df

# Remove whitespace characters from every object-dtype column and apply NFKC
# normalization. regex=True is passed explicitly: the pattern is a regular
# expression, and the default for `regex` differs between pandas versions.
for col in df.select_dtypes(include=object).columns:
    df[col] = df[col].str.replace(r"\s", "", regex=True).str.normalize("NFKC")

# Timestamp taken once per run; str2date uses it to infer the year of each date
dt_now = datetime.datetime.now()


def str2date(s: pd.Series, now: "datetime.datetime | None" = None) -> pd.Series:
    """Convert "<month>Moon<day>Day" strings to timestamps, inferring the year.

    The text carries no year, so each date is first placed in the year of
    *now*. If that date does not exist (e.g. Feb 29 in a non-leap year) or
    lies after *now*, the same month/day in the previous year is used.
    Values that do not match the pattern, or that are invalid in both years,
    become NaT.

    Args:
        s: Series of strings to parse.
        now: Reference time. Defaults to the module-level ``dt_now``.

    Returns:
        Series of datetime64 values with the same index as *s*.
    """
    if now is None:
        now = dt_now

    # NOTE(review): the literal tokens "Moon"/"Day" look like a translation of
    # the Japanese month/day markers - confirm against the actual PDF text.
    parts = (
        s.str.extract(r"(\d{1,2})Moon(\d{1,2})Day")
        .rename(columns={0: "month", 1: "day"})
        # Convert to numbers first so unmatched rows can be filled with 0
        .apply(pd.to_numeric, errors="coerce")
        .fillna(0)
        .astype(int)
    )

    # Month/day 0 (no match) is not a valid date and is coerced to NaT below
    parts["year"] = now.year
    this_year = pd.to_datetime(parts, errors="coerce")

    parts["year"] = now.year - 1
    last_year = pd.to_datetime(parts, errors="coerce")

    # Fall back to the previous year for dates that are invalid in the
    # current year or that would lie in the future
    return this_year.mask(this_year.isna() | (this_year > now), last_year)


# Add parsed date columns next to the original text columns
df["Date of onset YMD"] = str2date(df["Date of onset"])

df["Positive finding date YMD"] = str2date(df["Positive finding date"])

# The text cropped from page 1 must contain exactly three numbers:
# year, month, day. Fail with a readable message otherwise.
numbers = re.findall(r"\d+", update or "")
if len(numbers) != 3:
    raise ValueError(f"Expected year, month and day in update text, got: {update!r}")
y, m, d = map(int, numbers)

# NOTE(review): 2018 + y presumably converts a Japanese era year
# (year 1 = 2019) to a Gregorian year - confirm against the PDF.
dt_update = datetime.datetime(2018 + y, m, d)

# File name carries the update date as YYYYMMDD
df.to_csv(f'sagamihara{dt_update.strftime("%Y%m%d")}.csv', encoding="utf_8_sig")