Last time, the list was published as a PDF:
Convert PDF of list of Go To EAT member stores in Niigata prefecture to CSV https://qiita.com/barobaro/items/74fb5bdedbf1ae7267a0
This time the PDF can't be found, so the member-store list is created by scraping the site instead.
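For reference, a minimal sketch of the page structure the scraper assumes, inferred from the CSS selectors used below. The sample_html and its values are made up; the real markup on https://niigata-gte.com/shop/ may differ.

# Made-up sample matching the selectors used in the scraper below (not real data)
from bs4 import BeautifulSoup

sample_html = """
<div id="result">
  <div class="cont">
    <div class="no">No:00000</div>
    <div class="tag"><span>Sample area</span><span>Sample genre</span></div>
    <h4><a href="https://example.com/">Sample store</a></h4>
    <p class="add">000-0000 Sample address
      <a href="https://www.google.com/maps/@37.9,139.0,17z">map</a></p>
    <p class="tel">000-000-0000</p>
  </div>
</div>
"""

soup = BeautifulSoup(sample_html, "html.parser")
shop = soup.select_one("div#result > div.cont")
print(shop.select_one("div.no").get_text(strip=True).split(":", 1)[-1])  # 00000
print(shop.select_one("h4").get_text(strip=True))                        # Sample store
print(shop.select_one("p.tel").get_text(strip=True))                     # 000-000-0000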
import re
import time

import requests
from bs4 import BeautifulSoup

url = "https://niigata-gte.com/shop/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

result = []

while True:
    r = requests.get(url, headers=headers)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "html.parser")

    # One shop per div.cont inside div#result
    for shop in soup.select("div#result > div.cont"):
        data = {}

        # "No:XXXXX" -> keep only the part after the colon
        data["Dealer code"] = (
            shop.select_one("div.no").get_text(strip=True).split(":", 1)[-1]
        )

        span = shop.select("div.tag > span")

        data["area"] = span[0].get_text(strip=True)
        data["Genre"] = span[1].get_text(strip=True)

        # Optional icons: use the alt text as the column name, mark with "○"
        if len(span) > 2:
            temp = {i.get("alt"): "○" for i in span[2].select("img")}
            data.update(temp)

        h4 = shop.select_one("h4")
        data["Store name"] = h4.get_text(strip=True)

        # Store website, if the store name is linked
        if h4.select_one("a"):
            link = h4.a.get("href")
            if link:
                data["home page"] = link

        p_add = shop.select_one("p.add").contents

        # First text node: "postal-code address"
        postcode, address = p_add[0].split(sep=None, maxsplit=1)

        # Extract latitude / longitude from the Google Maps link ("@lat,lng,NNz")
        gps = re.search(r"(?<=@)(.+?),(.+?)(?=,\d{1,2}z)", p_add[1].a.get("href"))
        if gps:
            data["latitude"] = float(gps.group(1))
            data["longitude"] = float(gps.group(2))

        data["Postal code"] = postcode.strip()
        data["location"] = address.strip()
        data["phone number"] = shop.select_one("p.tel").get_text(strip=True)

        result.append(data)

    # Pagination: the next page URL is embedded in the onclick of li.next
    tag = soup.select_one("li.next")
    if tag:
        m = re.search(r"https://niigata-gte\.com/shop/page/\d+/", tag.a.get("onclick"))
        if m:
            url = m.group(0)
        else:
            # Stop if the expected URL pattern is not found
            break
    else:
        break

    time.sleep(3)

result  # display the collected records (when run in a notebook)
import pandas as pd

df = pd.DataFrame(result)
df.index += 1  # start the index at 1

# utf_8_sig adds a BOM so the CSV opens cleanly in Excel
df.to_csv("niigata.csv", encoding="utf_8_sig")
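To check the output, the CSV can be read back (an optional check, not part of the original script; file name and encoding as written above):

import pandas as pd

# Reload the CSV written above and confirm the row count and columns
check = pd.read_csv("niigata.csv", index_col=0, encoding="utf_8_sig")
print(len(check))              # number of member stores scraped
print(check.columns.tolist())  # e.g. Dealer code, area, Genre, Store name, ...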