# Convert the Japan Pharmaceutical Association's "Pharmaceutical Division Progress (Trends in Insurance Dispensing)" PDFs to CSV
import pathlib
import time
from urllib.parse import urljoin
import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
# Request headers sent with the index-page request; a browser-like
# User-Agent string is supplied instead of the library default.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) "
        "like Gecko"
    ),
}
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download *url* and return its content parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        parser: Parser name handed to ``BeautifulSoup``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``BeautifulSoup`` object built from the raw response bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server, which would hang the whole script.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    # The raw bytes (not r.text) are handed to BeautifulSoup.
    return BeautifulSoup(r.content, parser)
def fetch_file(url, dir=".", timeout=30):
    """Download *url* into *dir* unless a file of that name already exists.

    Args:
        url: Address of the file to download.
        dir: Destination directory; created (with parents) when missing.
            The name shadows the ``dir`` builtin but is kept because callers
            may pass it by keyword.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``pathlib.Path`` of the local file, whether freshly downloaded or
        already present.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    # Local import: the file header only imports ``urljoin``.
    from urllib.parse import urlparse

    # Take the name from the URL's path component so a query string or
    # fragment never ends up in the local file name. Fall back to the plain
    # last path segment of the whole URL when the path component is empty.
    name = pathlib.PurePosixPath(urlparse(url).path).name
    if not name:
        name = pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Do not download if a file with the same name is already there
    if p.exists():
        return p

    # Wait 3 seconds to reduce server load
    time.sleep(3)

    r = requests.get(url, timeout=timeout)
    # Fail before writing: otherwise an HTTP error page would be saved under
    # the PDF's name and be treated as a valid cached file on later runs.
    r.raise_for_status()

    with p.open(mode="wb") as fw:
        fw.write(r.content)

    return p
# Scraping: fetch the index page that lists the PDF reports
url = "https://www.nichiyaku.or.jp/activities/division/faqShinchoku.html"
soup = fetch_soup(url)

# Extract PDF URLs, resolved against the index page URL.
# Anchors without an href are skipped: urljoin() would otherwise return the
# index page URL itself, which would then be downloaded as if it were a PDF.
links = [
    urljoin(url, a["href"])
    for a in soup.select("section.section a.btn-pdf")
    if a.get("href")
]

# PDF conversion: table-detection settings passed to pdfplumber
table_settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "text",
    "intersection_tolerance": 5,
}

for link in tqdm(links):
    # Download PDF file (skipped when it is already in ./pdf)
    path_pdf = fetch_file(link, "pdf")

    with pdfplumber.open(path_pdf) as pdf:
        # Only the first page is converted
        page = pdf.pages[0]

        # Convert PDF table to a list of rows of cell text
        table = page.extract_table(table_settings)

    # extract_table() yields nothing when no table is detected; indexing the
    # first row of an empty frame below would raise IndexError.
    if not table:
        print(f"No table found on the first page of {path_pdf}; skipped")
        continue

    # Temporarily read with pandas for CSV processing
    df = pd.DataFrame(table)

    # Complement the merged cells in the first header row by carrying the
    # previous value forward. ``fillna(method="ffill")`` is deprecated in
    # pandas; ``ffill()`` is the supported equivalent.
    df.iloc[0] = df.iloc[0].ffill()

    # Save under ./csv with the PDF's file name and a .csv extension
    path_csv = pathlib.Path("csv", path_pdf.with_suffix(".csv").name)
    path_csv.parent.mkdir(parents=True, exist_ok=True)

    # utf_8_sig writes a BOM at the start of the file
    df.to_csv(path_csv, encoding="utf_8_sig", index=False, header=False)
Recommended Posts