Converting the National Institute of Technology and Evaluation (NITE) PDF list of products containing surfactants effective against the new coronavirus to CSV
apt install python3-tk ghostscript
pip install camelot-py[cv]
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
# Listing page that links to the product-list PDF.
url = "https://www.nite.go.jp/information/osirasedetergentlist.html"

# requests waits forever by default; bound the wait so a stalled server
# cannot hang the script.
r = requests.get(url, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# First anchor matched by this selector is taken to be the PDF link.
tag = soup.select_one("div.main div.cf ul > li > a")

# select_one returns None when nothing matches, and urljoin(url, None) would
# silently yield the HTML page itself; fail loudly instead of passing a
# non-PDF URL on to the PDF parser.
if tag is None or not tag.get("href"):
    raise RuntimeError(
        f"Could not find the PDF link on {url}; the page layout may have changed"
    )

# The href may be relative, so resolve it against the page URL.
link = urljoin(url, tag.get("href"))
import camelot
import pandas as pd
# Pull every table out of the PDF behind `link`, scanning all pages.
tables = camelot.read_pdf(
    link,
    pages="all",
    split_text=True,
    line_scale=40,
    copy_text=["v"],
)

# Detergent for home furniture, etc.
# All tables except the last are stacked into a single frame; its first row
# supplies the column names and the remaining rows are the data.
housing_frames = [table.df for table in tables[:-1]]
df_tmp = pd.concat(housing_frames)
housing_header = df_tmp.iloc[0].to_list()
df1 = df_tmp.iloc[1:].set_axis(housing_header, axis=1)
# Renumber the rows so the CSV index column starts at 1.
df1 = df1.reset_index(drop=True)
df1.index = df1.index + 1
# utf_8_sig prepends a BOM (presumably so spreadsheet tools detect UTF-8).
df1.to_csv("housing.csv", encoding="utf_8_sig")

# Synthetic detergent for kitchen, etc.
# The last table is handled on its own: first row becomes the header, and the
# row labels of the remaining rows are left as they were.
kitchen_raw = tables[-1].df
kitchen_header = kitchen_raw.iloc[0].to_list()
df2 = kitchen_raw.iloc[1:].set_axis(kitchen_header, axis=1)
df2.to_csv("kitchen.csv", encoding="utf_8_sig")
Recommended Posts