import datetime
import pathlib
import re
from urllib.parse import urljoin
import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup
def fetch_file(url, dir=".", *, headers=None, timeout=30):
    """Download ``url`` and save it inside ``dir``.

    The local file name is the last segment of the URL *path*; the query
    string and fragment are ignored so they never end up in the file name.

    Args:
        url: Address of the resource to download.
        dir: Directory the file is written into; it is created when missing.
        headers: Optional HTTP headers passed through to ``requests.get``.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the script forever.

    Returns:
        The ``pathlib.Path`` of the file that was written.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` on an error status.
    """
    from urllib.parse import urlsplit

    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Prefer the last segment of the URL path; fall back to the name of the
    # whole URL when the path has no final segment (e.g. "https://host/").
    name = pathlib.PurePosixPath(urlsplit(url).path).name or pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)
    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p
# Page that links to the PDF to download.
url = "https://www.city.sagamihara.kanagawa.jp/shisei/koho/1019191.html"

# Browser-like User-Agent sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# First anchor whose href ends in ".pdf" (dot escaped so it is matched
# literally) and whose onclick attribute contains the announcement title.
# NOTE(review): the onclick text is English; confirm it matches the attribute
# text actually served by the page.
tag = soup.find(
    "a",
    href=re.compile(r"\.pdf$"),
    onclick=re.compile("Confirmation of new patients due to new coronavirus infection"),
)
if tag is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(f"no matching PDF link found on {url}")

# Resolve a possibly relative href against the page URL, then download it.
link = urljoin(url, tag.get("href"))
path_pdf = fetch_file(link)
# Collect the 11-column tables of every page into ``dfs`` and read the text
# of a fixed strip on the first page into ``update``.
with pdfplumber.open(path_pdf) as pdf:
    dfs = []
    for page in pdf.pages:
        if page.page_number == 1:
            # Text inside the bounding box (400, 44, page.width, 60) of the
            # first page; the numbers in it are parsed later in the script.
            crop = page.within_bbox((400, 44, page.width, 60))
            update = crop.extract_text()

        for table in page.extract_tables():
            df_tmp = pd.DataFrame(table)

            # Only tables with exactly 11 columns are kept.
            if df_tmp.shape[1] != 11:
                continue

            # Skip tables whose first cell contains "Less than". The cell is
            # treated as empty text when it is None so the membership test
            # cannot raise TypeError.
            first_cell = table[0][0] or ""
            if "Less than" not in first_cell:
                dfs.append(df_tmp)
# Labels assigned to the 11 table columns, in order.
column_labels = [
    "Case No..",
    "Age",
    "sex",
    "Occupation, etc.",
    "place",
    "residence",
    "Symptoms",
    "Date of onset",
    "Positive finding date",
    "Infection route, etc.",
    "Remarks",
]

# Stack the collected tables, drop the very first row (presumably the header
# row of the first table) and attach the column labels.
stacked = pd.concat(dfs)
data_rows = stacked.iloc[1:]
df = data_rows.set_axis(column_labels, axis=1)
# Remove every whitespace character and apply NFKC normalization to each
# object-dtype column.
for col in df.select_dtypes(include=object).columns:
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave all whitespace in
    # place. The raw string avoids the invalid "\s" escape sequence.
    df[col] = df[col].str.replace(r"\s", "", regex=True).str.normalize("NFKC")
# Reference "now" used by str2date to choose the year of a month/day pair.
dt_now = datetime.datetime.now()


def str2date(s: pd.Series) -> pd.Series:
    """Convert month/day strings into timestamps, inferring the year.

    Each value is searched for ``<month>月<day>日``; the literal English
    tokens ``Moon`` / ``Day`` are accepted as well, so strings matched by the
    earlier pattern still match. (Assumes the source text uses the Japanese
    markers — confirm against the PDF.)

    The year is taken from ``dt_now``. When the resulting date would lie
    after ``dt_now`` it is moved to the previous year.

    Args:
        s: String Series holding the date texts.

    Returns:
        A datetime Series aligned with ``s``; values without a month/day
        match, or with an impossible date, become ``NaT``.
    """
    parts = s.str.extract(r"(\d{1,2})(?:月|Moon)(\d{1,2})(?:日|Day)")

    # Unmatched rows become 0, which to_datetime later coerces to NaT.
    ymd = pd.DataFrame(
        {
            "month": pd.to_numeric(parts[0], errors="coerce").fillna(0).astype(int),
            "day": pd.to_numeric(parts[1], errors="coerce").fillna(0).astype(int),
        }
    )
    ymd["year"] = dt_now.year

    # First pass with the current year, only to detect dates in the future.
    in_current_year = pd.to_datetime(ymd, errors="coerce")
    ymd["year"] = ymd["year"].mask(in_current_year > dt_now, ymd["year"] - 1)

    return pd.to_datetime(ymd, errors="coerce")
# Parsed timestamp versions of the two date columns.
df["Date of onset YMD"] = str2date(df["Date of onset"])
df["Positive finding date YMD"] = str2date(df["Positive finding date"])

# The text cropped from the first page must contain exactly three numbers:
# year, month and day. Fail with a message that shows the offending text
# instead of an opaque unpacking error (or a TypeError when it is None).
numbers = re.findall(r"\d+", update or "")
if len(numbers) != 3:
    raise ValueError(
        f"expected three numbers (year, month, day) in the update text, got {update!r}"
    )
y, m, d = map(int, numbers)

# The year is offset by 2018 — presumably a Japanese Reiwa era year
# (Reiwa 1 = 2019); confirm against the PDF header.
dt_update = datetime.datetime(2018 + y, m, d)

# utf_8_sig writes a BOM at the start of the file.
df.to_csv(f'sagamihara{dt_update.strftime("%Y%m%d")}.csv', encoding="utf_8_sig")
# Recommended Posts