I wrote a PDF data-wrangling script with pdfplumber for the influenza outbreak reports published by the Ministry of Health, Labour and Welfare.
It is easy because you can check the position of each character with `chars` and then specify the region to extract by cropping the page.
import pdfplumber  # imported here so this exploratory snippet runs on its own

# Exploratory snippet: open a sample report and read the text found inside a
# fixed horizontal band of one page.
with pdfplumber.open("data.pdf") as pdf:
    # `pages` is a list, so index 1 selects the second page of the document.
    p1 = pdf.pages[1]

    # Check the position of the text. In an interactive session this displays
    # the page's characters so a suitable crop box can be chosen; as a plain
    # statement in a script it has no effect.
    p1.chars

    # Get the text with a crop: keep only the band between y=90 and y=105,
    # spanning the full page width.
    week_crop = p1.within_bbox((0, 90, p1.width, 105))
    s = week_crop.extract_text()
import csv
import datetime
import pathlib
import re
from urllib.parse import urljoin
import pdfplumber
import pandas as pd
import requests
from bs4 import BeautifulSoup
def fetch_file(url, dir=".", headers=None, timeout=30):
    """Download *url* into directory *dir* and return the path of the saved file.

    Args:
        url: Address of the file to download.
        dir: Target directory; it is created (with parents) when missing.
            The name shadows the builtin but is kept for caller compatibility.
        headers: Optional mapping of HTTP request headers passed to
            ``requests.get``. ``None`` sends the library defaults.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block the script forever.

    Returns:
        ``pathlib.Path`` of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    from urllib.parse import urlsplit  # local import: only this function needs it

    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Name the file after the last segment of the URL *path* so that a query
    # string or fragment never ends up in the file name. URLs without a path
    # fall back to the previous behaviour (last component of the whole URL).
    name = pathlib.PurePosixPath(urlsplit(url).path).name or pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)
    p.write_bytes(r.content)
    return p
# Page whose PDF links are collected below; relative links are resolved
# against this address.
url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/kenkou_iryou/kenkou/kekkaku-kansenshou01/houdou_00008.html"

# Browser-like User-Agent sent with the page request.
# NOTE(review): presumably the site rejects the default client User-Agent -
# confirm before removing.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# requests applies no timeout by default, so an explicit one keeps a stalled
# connection from hanging the script indefinitely.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")
# One pandas Series per processed PDF, named after the press-release date:
# d1 collects the "Number of reports" column, d2 the "Per fixed point" column.
d1 = []
d2 = []

# Compiled once, outside the loop, and written as raw strings so that `\d`
# is not an invalid string escape.
# NOTE(review): the English tokens of the original patterns ("Year", "Moon",
# "Day", "week", "Sun ~") look like a machine translation of Japanese date
# markers. The alternations below still match everything the original
# patterns matched and additionally accept what are presumably the original
# characters - confirm against the live page and PDF text.
release_date_re = re.compile(
    r"(\d{4})(?:Year|年)(\d{1,2})(?:Moon|月)(\d{1,2})(?:Day|日)"
)
week_range_re = re.compile(
    r"(\d{4})(?:Year|年)(\d{1,2})(?:week|週)[(（]"
    r"(\d{1,2})(?:Moon|月)(\d{1,2})(?:Sun|Day|日)\s*[~～〜]\s*"
    r"(\d{1,2})(?:Moon|月)(\d{1,2})(?:Day|日)[)）]"
)

# The matched links are visited in reverse document order.
for i in soup.select('ul.m-listLink > li > a[href$=".pdf"]')[::-1]:
    text = i.get_text(strip=True)

    # Press release date: taken from the start of the link text, falling back
    # to today's date when the text does not begin with a date.
    t = release_date_re.match(text)
    if t:
        year, month, day = map(int, t.groups())
        dt_date = datetime.date(year, month, day)
    else:
        dt_date = datetime.date.today()

    # PDF file: resolve the (possibly relative) link and download it.
    link = urljoin(url, i.get("href"))
    p = fetch_file(link)

    # Only plain Python data leaves the `with` block, so the PDF is closed
    # before the pandas processing starts.
    with pdfplumber.open(p) as pdf:
        p1 = pdf.pages[1]

        # Get the text with a crop: the band between y=90 and y=105 is
        # searched for the reporting week. `or ""` covers an empty crop.
        week_crop = p1.within_bbox((0, 90, p1.width, 105))
        s = week_crop.extract_text() or ""

        table = p1.extract_table()

    m = week_range_re.search(s)
    if not m:
        # Without a match the week fields would be undefined on the first
        # iteration, or silently reused from the previous PDF on later ones.
        print(f"skipped {link}: reporting week not found in {s!r}")
        continue

    s_year, s_week, s_month, s_day, e_month, e_day = map(int, m.groups())

    dt_start = datetime.date(s_year, s_month, s_day)
    dt_end = datetime.date(s_year, e_month, e_day)

    # The text carries a single year, so a week that crosses New Year first
    # comes out with start > end. Week 1 begins in the previous December;
    # any other week number ends in the following January.
    if dt_start > dt_end:
        if s_week == 1:
            dt_start = datetime.date(s_year - 1, s_month, s_day)
        else:
            dt_end = datetime.date(s_year + 1, e_month, e_day)

    # The first two extracted rows are dropped.
    # NOTE(review): presumably these are header rows - confirm.
    df_tmp = pd.DataFrame(
        table[2:], columns=["Prefectures", "Number of reports", "Per fixed point"]
    ).set_index("Prefectures")

    # Remove every whitespace character from the row labels.
    df_tmp.index = df_tmp.index.map(lambda label: "".join(label.split()))

    # "-" marks a missing value; turn it into NaN before the numeric casts.
    df_tmp = df_tmp.mask(df_tmp == "-")

    # Drop thousands separators, then go through float so that NaN survives
    # the conversion to the nullable integer dtype.
    df_tmp["Number of reports"] = (
        df_tmp["Number of reports"].str.replace(",", "").astype(float).astype("Int64")
    )
    df_tmp["Per fixed point"] = df_tmp["Per fixed point"].astype(float)

    # Append the week metadata as extra rows so that it travels with each
    # column when the Series are collected.
    df_tmp.loc["Year"] = s_year
    df_tmp.loc["week"] = s_week
    df_tmp.loc["start date"] = dt_start
    df_tmp.loc["End date"] = dt_end

    s1 = df_tmp["Number of reports"]
    s1.name = dt_date
    d1.append(s1)

    s2 = df_tmp["Per fixed point"]
    s2.name = dt_date
    d2.append(s2)
# Labels of the metadata rows carried by every collected Series; after the
# transpose below they are ordinary columns.
meta_columns = ["Year", "week", "start date", "End date"]

# Suffix that the join appends to the columns coming from df2.
per_point_suffix = "(Per fixed point)"

# One row per collected Series (row label = Series name), one column per
# Series label. The transpose leaves generic dtypes, so the two integer
# metadata columns are cast back explicitly.
df1 = pd.concat(d1, axis=1, sort=False).T.astype({"Year": int, "week": int})
df2 = pd.concat(d2, axis=1, sort=False).T.astype({"Year": int, "week": int})

# Both frames share their column labels, so each column of df2 shows up in
# df3 as "<label>" + per_point_suffix.
df3 = df1.join(df2, rsuffix=per_point_suffix)

# Column order for the combined table: metadata first, then every value
# column immediately followed by its per-fixed-point counterpart.
# The order is derived from the data rather than from a hand-written label
# list: labels arrive here with all whitespace removed, and the suffixed
# names must match the join suffix exactly. reindex() silently fills any
# label it cannot find with NaN, so a list that drifts out of sync produces
# empty columns without an error.
value_columns = [c for c in df1.columns if c not in meta_columns]

ordered_columns = list(meta_columns)
for c in value_columns:
    ordered_columns += [c, c + per_point_suffix]

# Also drops the suffixed duplicates of the metadata columns.
df = df3.reindex(columns=ordered_columns)
# Options shared by all three exports: no index column, quotes around every
# non-numeric field, and UTF-8 with a byte-order mark.
common_csv_options = {
    "index": False,
    "quoting": csv.QUOTE_NONNUMERIC,
    "encoding": "utf_8_sig",
}

# Report counts and per-fixed-point values go to separate files.
for frame, filename in ((df1, "influ_count.csv"), (df2, "influ_point.csv")):
    frame.to_csv(filename, **common_csv_options)

# Only the combined table writes missing values as "-".
df.to_csv("influ_all.csv", na_rep="-", **common_csv_options)