import time
import unicodedata
from urllib.parse import urljoin
import re
import requests
from bs4 import BeautifulSoup
def cleaning(info, team, html):
    """Normalize one team's scoring-table rows into flat records.

    Parameters
    ----------
    info : list
        Match-level fields (match no., section, date, ...) prepended to
        every output row.
    team : str
        Team name inserted right after ``info``.
    html : iterable
        BeautifulSoup ``tr`` elements; each row's cells are read via
        ``select("th, td")`` as [time, uniform number, player name].

    Returns
    -------
    list of list
        One flattened record per scoring row.
    """
    result = []
    for trs in html:
        data = [i.get_text(strip=True) for i in trs.select("th, td")]
        # Goal time such as "45+2" (stoppage time) -> total minutes.
        # Summing the digit groups replaces the original eval(), which
        # must never be run on scraped (untrusted) text.
        data[0] = sum(int(num) for num in re.findall(r"\d+", data[0]))
        # Drop the "(PK)"-style annotation from the normalized player name.
        data[2] = re.sub(r"\(.+\)", "", unicodedata.normalize("NFKC", data[2])).strip()
        result.append(info + [team] + data)
    return result
def scraping(n, url):
    """Scrape one match-report page and return its scoring records.

    Parameters
    ----------
    n : int
        Sequential match number, stored in every output row.
    url : str
        URL of the match detail page.

    Returns
    -------
    list of list or None
        Flattened scoring rows for both teams (via ``cleaning``), or
        ``None`` when the page has no score table (e.g. a 0-0 game).
    """
    r = requests.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html5lib")
    # Section number, e.g. "Section5" -> 5.
    score_season = soup.select_one(
        "div.score-header > h2.score-meta > span.score-season"
    ).get_text(strip=True)
    score_season = int(score_season.strip("Section"))
    # Date and kickoff time come as one whitespace-separated string.
    score_date = (
        soup.select_one("div.score-header > h2.score-meta > span.score-date")
        .get_text(strip=True)
        .split()
    )
    # Home / away team names from the score table header.
    score_table = soup.select_one("table.score-table")
    home_team = score_table.select_one("th.score-team1").get_text(strip=True)
    away_team = score_table.select_one("th.score-team2").get_text(strip=True)
    # Match-level fields shared by every scoring row.
    game_info = [n, score_season] + score_date + [home_team, away_team]
    # The "score" heading marks the goal table; absent when nobody scored.
    tag = soup.find("h3", string="score")
    if tag is None:
        return None
    # select() already returns a list; no wrapper comprehension needed.
    table_home = tag.parent.select(
        "div.score-frame > div.score-left > table > tbody > tr"
    )
    home_data = cleaning(game_info, home_team, table_home)
    table_away = tag.parent.select(
        "div.score-frame > div.score-right > table > tbody > tr"
    )
    away_data = cleaning(game_info, away_team, table_away)
    return home_data + away_data
# Schedule page: gather the "Details" link of every match of the season.
url = "http://www.jfl.or.jp/jfl-pc/view/s.php?a=1542&f=2020A001_spc.html"
response = requests.get(url)
response.raise_for_status()
schedule = BeautifulSoup(response.content, "html5lib")
links = []
for anchor in schedule.select("td.detail-link > a"):
    if anchor.text == "Details":
        links.append(urljoin(url, anchor.get("href")))
# Scrape every match page; skip matches without a score table.
result = []
for match_no, detail_url in enumerate(links):
    score_data = scraping(match_no, detail_url)
    if score_data:
        result.extend(score_data)
    time.sleep(1)  # throttle requests to be polite to the server
import pandas as pd

# One row per goal: match info + scorer details.
df = pd.DataFrame(result, columns=["match", "section", "date", "Times of Day", "home", "Away", "Team name", "time", "Uniform number", "Player name"])
# Every row counts as exactly one goal.
df["score"] = 1
# Goal ranking: total goals per player, own goals excluded.
# aggfunc="sum" (string) avoids the pandas deprecation warning for
# passing the builtin sum and uses the optimized cython path.
pv_goal = df.pivot_table(
    values="score", index=["Player name", "Team name", "Uniform number"], aggfunc="sum", fill_value=0
).drop(["Own goal"]).reset_index()
pv_goal["Uniform number"] = pv_goal["Uniform number"].astype(int)
# Competition ranking: tied totals share the lowest rank ("min").
pv_goal["Ranking"] = pv_goal["score"].rank(ascending=False, method="min").astype(int)
#team
# 2020 JFL teams in official order; index gives a stable team sort key.
jfl_2020 = [
    "Honda FC",
    "Sony Sendai FC",
    "Tokyo Musashino City FC",
    "Tegevajaro Miyazaki",
    "Honda Lock SC",
    "Verspah Oita",
    "FC Osaka",
    "MIO Biwako Shiga",
    "Veertien Mie",
    "FC Maruyasu Okazaki",
    "Suzuka Point Getters",
    "Line mail Aomori",
    "Nara club",
    "Matsue City FC",
    "Iwaki FC",
    "Kochi United SC",
]
team = dict(zip(jfl_2020, range(1, len(jfl_2020) + 1)))
pv_goal["Team ID"] = pv_goal["Team name"].map(team)
# Order rows by ranking, then official team order, then uniform number.
pv_goal = pv_goal.sort_values(["Ranking", "Team ID", "Uniform number"])
# The helper columns were only needed for sorting.
pv_goal = pv_goal.drop(columns=["Team ID", "Uniform number"])
pv_goal = pv_goal.set_index("Ranking")
pv_goal.to_csv("goal.csv")
# Official scorer-ranking page -> CSV, with NFKC-normalized player names.
tables = pd.read_html("http://www.jfl.or.jp/jfl-pc/view/s.php?a=1544", index_col=0, header=0)
df_rank = tables[0]
df_rank["Player name"] = df_rank["Player name"].str.normalize("NFKC")
df_rank.to_csv("ranking.csv")
# Recommended Posts