import time
import unicodedata
from urllib.parse import urljoin
import re
import requests
from bs4 import BeautifulSoup
def cleaning(info, team, html):
result = []
for trs in html:
data = [i.get_text(strip=True) for i in trs.select("th, td")]
#Calculer le temps d'extension après avoir supprimé les minutes
data[0] = eval(data[0].rstrip("Minutes"))
#PK supprimé du nom du joueur
data[2] = re.sub("\(.+\)", "", unicodedata.normalize("NFKC", data[2])).strip()
result.append(info + [team] + data)
return result
def scraping(n, url):
r = requests.get(url)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html5lib")
#section
score_season = soup.select_one(
"div.score-header > h2.score-meta > span.score-season"
).get_text(strip=True)
score_season = int(score_season.strip("Section"))
# print(score_season)
#Date et l'heure
score_date = (
soup.select_one("div.score-header > h2.score-meta > span.score-date")
.get_text(strip=True)
.split()
)
# print(score_date)
#Nom de l'équipe
score_table = soup.select_one("table.score-table")
home_team = score_table.select_one("th.score-team1").get_text(strip=True)
away_team = score_table.select_one("th.score-team2").get_text(strip=True)
# print(home_team, away_team)
#Informations sur le match
game_info = [n, score_season] + score_date + [home_team, away_team]
#But
tag = soup.find("h3", text="But")
#Vérifiez s'il s'agit d'une table de notation
if tag:
table_home = [
trs
for trs in tag.parent.select(
"div.score-frame > div.score-left > table > tbody > tr"
)
]
home_data = cleaning(game_info, home_team, table_home)
table_away = [
trs
for trs in tag.parent.select(
"div.score-frame > div.score-right > table > tbody > tr"
)
]
away_data = cleaning(game_info, away_team, table_away)
score_data = home_data + away_data
return score_data
return None
url = "http://www.jfl.or.jp/jfl-pc/view/s.php?a=1542&f=2020A001_spc.html"
r = requests.get(url)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html5lib")
links = [urljoin(url, link.get("href")) for link in soup.select("td.detail-link > a") if link.text == "Détails"]
result = []
for i, link in enumerate(links):
score_data = scraping(i, link)
if score_data:
result.extend(score_data)
time.sleep(1)
import pandas as pd
df = pd.DataFrame(result, columns=["rencontre", "section", "Date", "Heures du jour", "domicile", "Une façon", "Nom de l'équipe", "temps", "Numéro uniforme", "Nom de joueur"])
df
df["But"] = 1
#Classement par numéro d'objectif
pv_goal = df.pivot_table(
values="But", index=["Nom de joueur", "Nom de l'équipe", "Numéro uniforme"], aggfunc=sum, fill_value=0
).drop(["But contre son camp"]).reset_index()
pv_goal["Numéro uniforme"] = pv_goal["Numéro uniforme"].astype(int)
#Classement
pv_goal["Classement"] = pv_goal["But"].rank(ascending=False, method="min").astype(int)
#équipe
jfl_2020 = [
"Honda FC",
"Sony Sendai FC",
"Tokyo Musashino City FC",
"Tegevajaro Miyazaki",
"Honda Lock SC",
"Verspa Oita",
"FC Osaka",
"MIO Biwako Shiga",
"Viatin Mie",
"FC Maruyasu Okazaki",
"Suzuka Point Getters",
"Courrier en ligne Aomori",
"Nara Club",
"Matsue City FC",
"Iwaki FC",
"Kochi United SC",
]
team = {name: i for i, name in enumerate(jfl_2020, 1)}
pv_goal["ID de l'équipe"] = pv_goal["Nom de l'équipe"].map(team)
#Ordre croissant par classement, nom d'équipe, nom de joueur
pv_goal.sort_values(["Classement", "ID de l'équipe", "Numéro uniforme"], ascending=[True, True, True], inplace=True)
pv_goal.drop(["ID de l'équipe", "Numéro uniforme"], axis=1, inplace=True)
pv_goal.set_index("Classement", inplace=True)
pv_goal.to_csv("goal.csv")
df_rank = pd.read_html("http://www.jfl.or.jp/jfl-pc/view/s.php?a=1544", index_col=0, header=0)[0]
df_rank["Nom de joueur"] = df_rank["Nom de joueur"].str.normalize("NFKC")
df_rank.to_csv("ranking.csv")
Recommended Posts