Convert the Tokyo Metropolitan Health and Welfare Bureau's PDF on the status of people in Tokyo infected with the novel coronavirus to CSV

import pathlib
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import pandas as pd
import pdfplumber
import requests
from tqdm.notebook import tqdm

def fetch_file(url, dir="."):
    """Download ``url`` and save it under ``dir``.

    The local file name is the last path component of ``url``. The target
    directory is created (including parents) when it does not exist.

    Args:
        url: Address of the file to download.
        dir: Directory the file is saved into. Defaults to the current
            directory.

    Returns:
        ``pathlib.Path`` of the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.get(url)
    # Fail early instead of saving an error page under the target file name.
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Binary mode: the payload is written exactly as received.
    with p.open("wb") as fw:
        fw.write(r.content)
    return p

# Page that carries the link to the PDF.
# NOTE(review): the URL is empty in this file; it has to be filled in with
# the address of the page that links to the PDF before the script can run.
url = ""

# User-Agent header sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

r = requests.get(url, headers=headers)
# Stop here on an HTTP error instead of parsing an error page.
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# First element matching the PDF link selector.
tag = soup.select_one("div#main p.filelink > a.pdf")
if tag is None:
    raise RuntimeError(
        "No element matching 'div#main p.filelink > a.pdf' was found on the page"
    )

# The href may be relative, so resolve it against the page URL.
link = urljoin(url, tag.get("href"))

path_pdf = fetch_file(link)

# One DataFrame per PDF page that contains a table.
dfs = []

# Convert PDF
with pdfplumber.open(path_pdf) as pdf:

    for page in tqdm(pdf.pages):

        table = page.extract_table()

        # Skip pages where no table was extracted.
        if not table:
            continue

        # The first extracted row supplies the column names.
        df_tmp = pd.DataFrame(table[1:], columns=table[0])
        dfs.append(df_tmp)

# Combine all pages
df = pd.concat(dfs)


# Trim surrounding whitespace and apply NFKC normalization to every
# object-dtype column.
text_columns = df.select_dtypes(include=object).columns
for name in text_columns:
    trimmed = df[name].str.strip()
    df[name] = trimmed.str.normalize("NFKC")

# The CSV is written next to the PDF, with the extension swapped to .csv.
path_csv = path_pdf.with_suffix(".csv")

df.to_csv(path_csv, index=False, encoding="utf_8_sig")

# Work on a copy so the table saved above stays untouched.
df1 = df.copy()

#Data wrangling

import datetime

# Reference "now" (naive local time) that str2date compares parsed dates against.
dt_now = datetime.datetime.now()

# Fill in the missing year with the current year and parse the date; if the result lies in the future, move it back one year
def str2date(s: pd.Series, now=None) -> pd.Series:
    """Convert month/day strings without a year into datetimes.

    Each value is searched for a month number and a day number. The year is
    taken from ``now``; when that date would lie after ``now`` (or does not
    exist in that year, e.g. Feb 29 in a non-leap year) the previous year is
    used instead. Values that yield no valid date become ``NaT``.

    NOTE(review): the original pattern used the literal tokens ``Moon`` and
    ``Day``, which look like a machine translation of the date markers
    月 / 日. Both spellings are accepted here; confirm against the text that
    is actually extracted from the PDF.

    Args:
        s: Series of strings holding a month and a day.
        now: Naive ``datetime`` used as the reference point. Defaults to the
            module-level ``dt_now``.

    Returns:
        Series of datetimes with the same index as ``s``.
    """
    if now is None:
        now = dt_now

    pattern = r"(\d{1,2})(?:月|Moon)(\d{1,2})(?:日|Day)"
    extracted = s.str.extract(pattern).rename(columns={0: "month", 1: "day"})

    # Unmatched rows become month 0 / day 0, which can never form a valid
    # date and therefore end up as NaT below.
    parts = (
        extracted.apply(lambda col: pd.to_numeric(col, errors="coerce"))
        .fillna(0)
        .astype(int)
    )

    this_year = pd.to_datetime(parts.assign(year=now.year), errors="coerce")
    last_year = pd.to_datetime(parts.assign(year=now.year - 1), errors="coerce")

    # Keep the current-year date only when it exists and is not in the
    # future; otherwise fall back to the same month/day one year earlier.
    usable = this_year.notna() & (this_year <= now)

    return this_year.where(usable, last_year)

# Add parsed datetime columns alongside the original text columns.
# NOTE(review): the source column names must match the header row extracted
# from the PDF; confirm them against the actual table.
df1["Release date YMD"] = str2date(df1["Release date"])
df1["Date of onset YMD"] = str2date(df1["Date of onset"])
df1["Confirmed date YMD"] = str2date(df1["Fixed date"])

# Same location as the first CSV, with "_c" appended to the file stem.
p = path_csv.with_name(f"{path_csv.stem}_c{path_csv.suffix}")

df1.to_csv(p, index=False, encoding="utf_8_sig")


from google.colab import files

