Aggregating the Bellmark Education Grant Foundation's weekly Bellmark reception status from PDF, by prefecture / city / ward / town / village
Web Bellmark has also launched: if you go through the Web Bellmark site to your favorite shops, you collect Bellmark points on your purchases.
Jalan and Rakuten Travel are supported too, so simply going through the site before booking a Go To Travel trip lets you contribute without paying anything yourself.
This function snaps together X or Y coordinates that fall within `limit` of each other, merging each cluster onto its most frequent value. It corrects cells whose long text wraps onto two lines, and rows or columns whose coordinates are slightly offset; a short demo with made-up values follows the function.
def snap_adjustment(s, limit=5):
    """Snap coordinates within `limit` of each other to a single value."""
    count = s.value_counts().sort_index()
    index = 0
    value = 0
    for i, v in count.items():
        if (i - index) < limit:
            # Close to the previous coordinate: merge onto whichever
            # of the two occurs more often
            if v > value:
                s = s.replace(index, i)
                index = i
                value = v
            else:
                s = s.replace(i, index)
        else:
            # Far enough away: start a new cluster at this coordinate
            index = i
            value = v
    return s
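To illustrate, here is a minimal sketch with made-up coordinates: 72.5 sits within the default limit of 5 from 70.0 and occurs less often, so it snaps onto 70.0, while 150.0 is too far away and is left untouched.

import pandas as pd

s = pd.Series([70.0, 70.0, 72.5, 150.0, 150.0])

# 72.5 merges into the more frequent 70.0; 150.0 starts its own cluster
print(snap_adjustment(s).tolist())
# [70.0, 70.0, 70.0, 150.0, 150.0]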
import pathlib
import time

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup


def fetch_file(url, dir="."):
    """Download `url` and save it under `dir`, returning the local path."""
    r = requests.get(url)
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


def snap_adjustment(s, limit=5):
    """Snap coordinates within `limit` of each other to a single value."""
    count = s.value_counts().sort_index()
    index = 0
    value = 0
    for i, v in count.items():
        if (i - index) < limit:
            # Merge onto whichever of the two coordinates occurs more often
            if v > value:
                s = s.replace(index, i)
                index = i
                value = v
            else:
                s = s.replace(i, index)
        else:
            # Far enough away: start a new cluster
            index = i
            value = v
    return s
url = "https://www.bellmark.or.jp/collect/accept.htm"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
r = requests.get(url, headers=headers)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
links = {
href.get("href")
for href in soup.select("div.cal-process > div.cal-row-date > div > a")
}
dfs = []
for link in links:
p = fetch_file(link)
with pdfplumber.open(p) as pdf:
for page in pdf.pages:
crop = page.within_bbox((0, 65, page.width, page.height - 40))
df_tmp = (
pd.DataFrame(crop.extract_words(keep_blank_chars=True))
.astype({"x0": float, "x1": float, "top": float, "bottom": float})
.sort_values(["top", "x0"])
)
df_tmp["top"] = snap_adjustment(df_tmp["top"], 6)
df_tmp["x0"] = snap_adjustment(df_tmp["x0"])
table = (
df_tmp.pivot_table(
index=["top"],
columns="x0",
values="text",
aggfunc=lambda x: "".join(str(v) for v in x),
)
).values
df = pd.DataFrame(table, columns=["Prefectures", "city", "Municipalities", "Participating groups", "Reception date"])
dfs.append(df)
time.sleep(3)
df = pd.concat(dfs)
df
df["Municipality"] = df["city"].fillna("") + df["Municipalities"].fillna("")
df1 = df.reindex(columns=["Prefectures", "Municipality", "Participating groups", "Reception date"])
df1.to_csv("data.csv")
df1
import japanize_matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams["figure.dpi"] = 200
df1["Prefectures"].value_counts(ascending=True).plot.barh(figsize=(5, 10))
#Save graph
plt.savefig("01.png ", dpi=200, bbox_inches="tight")
plt.show()
# Number of accepted shipments by municipality (top 50)
s = df1.groupby(["Prefectures", "Municipality"])["Municipality"].count().sort_values(ascending=True)

s.tail(50).plot.barh(figsize=(5, 10))

# Save the graph
plt.savefig("02.png", dpi=200, bbox_inches="tight")
plt.show()