Convert the Kagoshima Chamber of Commerce and Industry's list of available stores from PDF to CSV
The PDF files are published separately for each area, so this script combines them into a single CSV file
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Page whose anchors link to the store-list PDFs (one link per area).
url = "http://www.kagoshima-cci.or.jp/?p=20375"

# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# One {"area": <label>, "link": <PDF URL>} record per accepted anchor.
result = []

for a in soup.select("#contents_layer > span > p > a"):
    href = a.get("href")
    # An anchor without an href has nothing to download, so skip it.
    if not href:
        continue

    # Derive the area name from the link label: drop the "Whole area" marker
    # and any leading "〇" bullet characters.
    # NOTE(review): "Whole area" and "district" look like translated literals;
    # confirm they match the actual text of the page's link labels.
    s = a.get_text(strip=True).replace("Whole area", "").lstrip("〇")

    # Labels ending in "district" are excluded.
    if not s.endswith("district"):
        # Resolve relative hrefs against the page URL so the link is always
        # absolute; an already-absolute href is returned unchanged.
        result.append({"area": s, "link": urljoin(url, href)})
import camelot
import pandas as pd
# Collects one DataFrame per table extracted from the area PDFs.
dfs = []

for data in result:
    # Parse every page of this area's PDF. strip_text=" \n" removes spaces
    # and newlines from the extracted cell text.
    tables = camelot.read_pdf(
        data["link"], pages="all", flavor="lattice", split_text=True, strip_text=" \n"
    )

    for table in tables:
        # Skip the first row of each table (presumably the header row that is
        # repeated in the PDF -- TODO confirm) and label the three columns.
        df_tmp = table.df.iloc[1:].set_axis(
            ["Japanese syllabary", "Store name", "location"], axis=1
        )
        # Tag every row with the area its PDF belongs to.
        df_tmp["area"] = data["area"]
        dfs.append(df_tmp)

# pd.concat raises a generic ValueError on an empty list; fail with a message
# that says what actually went wrong instead.
if not dfs:
    raise ValueError("No tables were extracted from the area PDFs; nothing to write.")

# NOTE(review): each table keeps its own row index, so the index column in the
# CSV repeats across tables; pass ignore_index=True if unique ids are wanted.
df = pd.concat(dfs)

# utf_8_sig writes a UTF-8 byte-order mark at the start of the file.
df.to_csv("kagoshima.csv", encoding="utf_8_sig")
Recommended Posts