Scraping and tabulating nationwide warnings and advisories from the Japan Meteorological Agency's weather warnings/advisories page
import pathlib
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
# Prefecture codes (the values are used later to order the aggregated results)
pref_code = {
    "01": "Hokkaido",
    "02": "Aomori Prefecture",
    "03": "Iwate Prefecture",
    "04": "Miyagi Prefecture",
    "05": "Akita Prefecture",
    "06": "Yamagata Prefecture",
    "07": "Fukushima Prefecture",
    "08": "Ibaraki Prefecture",
    "09": "Tochigi Prefecture",
    "10": "Gunma Prefecture",
    "11": "Saitama Prefecture",
    "12": "Chiba Prefecture",
    "13": "Tokyo",
    "14": "Kanagawa Prefecture",
    "15": "Niigata Prefecture",
    "16": "Toyama Prefecture",
    "17": "Ishikawa Prefecture",
    "18": "Fukui Prefecture",
    "19": "Yamanashi Prefecture",
    "20": "Nagano Prefecture",
    "21": "Gifu Prefecture",
    "22": "Shizuoka Prefecture",
    "23": "Aichi Prefecture",
    "24": "Mie Prefecture",
    "25": "Shiga Prefecture",
    "26": "Kyoto",
    "27": "Osaka",
    "28": "Hyogo Prefecture",
    "29": "Nara Prefecture",
    "30": "Wakayama Prefecture",
    "31": "Tottori Prefecture",
    "32": "Shimane Prefecture",
    "33": "Okayama Prefecture",
    "34": "Hiroshima Prefecture",
    "35": "Yamaguchi Prefecture",
    "36": "Tokushima Prefecture",
    "37": "Kagawa Prefecture",
    "38": "Ehime Prefecture",
    "39": "Kochi Prefecture",
    "40": "Fukuoka Prefecture",
    "41": "Saga Prefecture",
    "42": "Nagasaki Prefecture",
    "43": "Kumamoto Prefecture",
    "44": "Oita Prefecture",
    "45": "Miyazaki Prefecture",
    "46": "Kagoshima Prefecture",
    "47": "Okinawa Prefecture",
}
# List of prefecture names, in prefecture-code order
pref = list(pref_code.values())
url = "https://www.jma.go.jp/jp/warn/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
# Reuse a single HTTP session for all requests
with requests.Session() as s:
    r = s.get(url, headers=headers)
    r.raise_for_status()
    base = BeautifulSoup(r.content, "html5lib")

    htmls = []

    # Follow the link to each regional warning page and save a copy of the HTML
    for tag in tqdm(base.select("div#title > noscript > table > tbody > tr > td > a")):
        area = tag.get_text(strip=True)
        link = urljoin(url, tag.get("href"))

        r = s.get(link, headers=headers)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "html5lib")

        # Save under ./html/ using the linked page's file name
        p = pathlib.Path("html", pathlib.PurePath(link).name)
        p.parent.mkdir(parents=True, exist_ok=True)
        with p.open(mode="w", encoding="utf-8") as fw:
            fw.write(soup.prettify())

        htmls.append({"area": area, "url": link, "path": p})

        # Be polite to the server between requests
        time.sleep(3)
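# A quick sanity check of what was collected (illustrative only; the actual
# region names depend on the JMA page at the time of scraping):
print(len(htmls))                          # number of regional pages fetched
print(htmls[0]["area"], htmls[0]["path"])  # first region name and where its HTML was saved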
import pandas as pd
def fetch_warn(p, area):
    # Read the warnings table (id="WarnTableTable") from the saved HTML
    tmp = pd.read_html(p.open(mode="r", encoding="utf-8"), attrs={"id": "WarnTableTable"})[0]

    # The table has a two-level column header; melt it into long format,
    # keeping the first three (unnamed) columns as identifiers, and drop
    # mostly-empty rows
    df = tmp.melt(
        id_vars=[
            ("Unnamed: 0_level_0", "Unnamed: 0_level_1"),
            ("Unnamed: 1_level_0", "Unnamed: 1_level_1"),
            ("Unnamed: 2_level_0", "Unnamed: 2_level_1"),
        ]
    ).dropna(thresh=5)

    # Rename the columns (set_axis's inplace= argument is deprecated in recent pandas)
    df = df.set_axis(["area1", "area2", "city", "level", "alert", "value"], axis=1)
    df["pref"] = area
    return df
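# Why fetch_warn melts with tuple id_vars: pandas.read_html returns the warning
# table with a two-level column header, so each column is addressed by a
# (top, sub) tuple, and melting yields one row per (city, category, type).
# A minimal sketch with made-up labels (not the actual JMA column names):
toy_cols = pd.MultiIndex.from_tuples(
    [("city", ""), ("Warning", "Flood"), ("Advisory", "Strong wind")]
)
toy = pd.DataFrame([["A-town", "●", None], ["B-city", None, "●"]], columns=toy_cols)
print(toy.melt(id_vars=[("city", "")]))  # long format: one row per city and warning type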
dfs = [fetch_warn(html["path"], html["area"]) for html in htmls]
df = pd.concat(dfs).reset_index(drop=True)
# Normalize all string columns to NFKC and strip whitespace
for col in df.select_dtypes(include=object).columns:
    df[col] = df[col].str.normalize("NFKC").str.replace(r"\s", "", regex=True)
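# NFKC normalization folds full-width characters into their half-width ASCII
# equivalents, so later comparisons are not tripped up by full-width digits or
# spaces. A quick illustration with the standard library directly:
import unicodedata
print(unicodedata.normalize("NFKC", "ＡＢＣ１２３"))  # -> ABC123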
# Hokkaido and Okinawa are split into regions on the JMA page;
# map those regional names back to their prefecture
df["pref"] = df["pref"].replace(
    {
        "Soya region": "Hokkaido",
        "Kamikawa / Rumoi region": "Hokkaido",
        "Abashiri / Kitami / Monbetsu region": "Hokkaido",
        "Kushiro / Nemuro / Tokachi region": "Hokkaido",
        "Iburi / Hidaka region": "Hokkaido",
        "Ishikari / Sorachi / Shiribeshi region": "Hokkaido",
        "Oshima / Hiyama region": "Hokkaido",
        "Okinawa main island region": "Okinawa Prefecture",
        "Daito Islands region": "Okinawa Prefecture",
        "Miyakojima region": "Okinawa Prefecture",
        "Yaeyama region": "Okinawa Prefecture",
    }
)
# "●"Converted to 0 and 1
df["value"] = (df["value"] == "●").astype(int)
# Aggregate: count the number of items in effect per prefecture and category
df_alert = df.pivot_table(
    index="pref", columns="level", values="value", aggfunc="sum"
).reindex(index=pref, columns=["alarm", "Warning"])
df_alert
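# To see what the pivot is doing (toy data; the real "level" labels come from
# the scraped table headers): counting the flags per prefecture and category
# is simply a grouped sum of the 0/1 "value" column.
toy_long = pd.DataFrame(
    {
        "pref": ["Tokyo", "Tokyo", "Chiba"],
        "level": ["alarm", "Warning", "Warning"],
        "value": [1, 0, 1],
    }
)
print(toy_long.pivot_table(index="pref", columns="level", values="value", aggfunc="sum"))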
# Save the long-format table to CSV (BOM-prefixed UTF-8 so Excel detects the encoding)
df.to_csv("alert.csv", encoding="utf_8_sig")