Convert the Japan Pharmaceutical Association's PDF on the progress of the separation of prescribing and dispensing (trends in insurance dispensing) to CSV

Convert the PDF of the Pharmaceutical Division Progress (Trends in Insurance Dispensing) of the Japan Pharmaceutical Association to CSV

import pathlib
import time
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

#Request headers passed to requests.get in fetch_soup (browser-style User-Agent)
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) "
        "like Gecko"
    ),
}

def fetch_soup(url, parser="html.parser", timeout=30):
    """Download a web page and parse it with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        parser: Parser name handed to BeautifulSoup.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may wait on an unresponsive server forever.

    Returns:
        The BeautifulSoup document built from the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    return BeautifulSoup(response.content, parser)

def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` into ``dir`` unless a file of that name already exists.

    The local file name is the last path component of ``url``. The target
    directory is created when missing.

    Args:
        url: Address of the file to download.
        dir: Directory that receives the file.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pathlib.Path of the local file (freshly downloaded or already present).

    Raises:
        requests.HTTPError: If the server answers with an error status. Nothing
            is written in that case, so an error page is never stored under
            the file's name and mistaken for a cached download on later runs.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    #Do not download if a file with the same name is already present
    if p.exists():
        return p

    #Wait 3 seconds to reduce server load
    time.sleep(3)

    #Send the same headers as fetch_soup so both requests look alike to the server
    r = requests.get(url, headers=headers, timeout=timeout)
    #Fail before writing so that an error response is not saved as the file
    r.raise_for_status()

    p.write_bytes(r.content)

    return p

#Scraping

url = "https://www.nichiyaku.or.jp/activities/division/faqShinchoku.html"

soup = fetch_soup(url)

#Extract PDF URL
#Anchors without an href are skipped: urljoin would resolve them to the page
#URL itself, which would then be downloaded as if it were a PDF
links = [
    urljoin(url, i.get("href"))
    for i in soup.select("section.section a.btn-pdf")
    if i.get("href")
]

#PDF conversion

#Column borders come from ruled lines, row borders from text alignment
table_settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "text",
    "intersection_tolerance": 5,
}

for link in tqdm(links):

    #Download PDF file
    path_pdf = fetch_file(link, "pdf")

    with pdfplumber.open(path_pdf) as pdf:

        #Only the first page is converted
        page = pdf.pages[0]

        #Convert PDF table to text
        table = page.extract_table(table_settings)

    #extract_table returns None when no table is detected; skip the file
    #instead of failing on the header row below
    if not table:
        print(f"No table found in {path_pdf}; skipped")
        continue

    #Temporarily read with pandas for CSV processing
    df = pd.DataFrame(table)

    #Complement the merged cell in the first row of the header
    #(Series.ffill replaces the deprecated fillna(method="ffill"))
    df.iloc[0] = df.iloc[0].ffill()

    #Change PDF file name extension to CSV for saving
    path_csv = pathlib.Path("csv", path_pdf.with_suffix(".csv").name)
    path_csv.parent.mkdir(parents=True, exist_ok=True)

    #utf_8_sig writes a BOM at the start of the file
    df.to_csv(path_csv, encoding="utf_8_sig", index=False, header=False)

Recommended Posts

Convert PDF of the progress of the division of labor (trends in insurance dispensing) of the Japan Pharmaceutical Association to CSV
Convert PDF of available stores of Go To EAT in Kagoshima prefecture to CSV
Convert PDF of Go To EAT member stores in Ishikawa prefecture to CSV
Convert PDF of new corona outbreak case in Aichi prefecture to CSV
Convert PDF of list of Go To EAT member stores in Niigata prefecture to CSV
Convert the image in .zip to PDF with Python
Batch convert all xlsx files in the folder to CSV files
Convert markdown to PDF in Python
How to change multiple columns of csv in Pandas (Unixtime-> Japan Time)
Hit the Rakuten Ranking API to save the ranking of any category in CSV
I want to convert a table converted to PDF in Python back to CSV
I want to batch convert the result of "string".split() in Python
Convert PDF of Kumamoto Prefecture Go To EAT member store list to CSV
How to convert csv to tsv in CLI
Convert from PDF to CSV with pdfplumber
Various ways to read the last line of a csv file in Python
Convert PDF of Chiba Prefecture Go To EAT member store list to CSV (command)
Convert PDF of product list containing effective surfactants for new coronavirus to CSV