import time
import unicodedata
from urllib.parse import urljoin
import re
import requests
from bs4 import BeautifulSoup
def cleaning(info, team, html):
    """Normalize one team's scoring-table rows into flat records.

    Parameters
    ----------
    info : list
        Match-level fields (match no., section, date, ...) prepended to
        every output row.
    team : str
        Team name inserted right after ``info``.
    html : iterable
        BeautifulSoup ``tr`` elements; each row's cells are read via
        ``select("th, td")`` as [time, uniform number, player name].

    Returns
    -------
    list of list
        One flattened record per scoring row.
    """
    result = []
    for trs in html:
        data = [i.get_text(strip=True) for i in trs.select("th, td")]
        # Goal time such as "45+2" (stoppage time) -> total minutes.
        # Summing the digit groups replaces the original eval(), which
        # must never be run on scraped (untrusted) text.
        data[0] = sum(int(num) for num in re.findall(r"\d+", data[0]))
        # Drop the "(PK)"-style annotation from the normalized player name.
        data[2] = re.sub(r"\(.+\)", "", unicodedata.normalize("NFKC", data[2])).strip()
        result.append(info + [team] + data)
    return result
def scraping(n, url):
    """Scrape one match-report page and return its scoring records.

    Parameters
    ----------
    n : int
        Sequential match number, stored in every output row.
    url : str
        URL of the match detail page.

    Returns
    -------
    list of list or None
        Flattened scoring rows for both teams (via ``cleaning``), or
        ``None`` when the page has no score table (e.g. a 0-0 game).
    """
    r = requests.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html5lib")
    # Section number, e.g. "Section5" -> 5.
    score_season = soup.select_one(
        "div.score-header > h2.score-meta > span.score-season"
    ).get_text(strip=True)
    score_season = int(score_season.strip("Section"))
    # Date and kickoff time come as one whitespace-separated string.
    score_date = (
        soup.select_one("div.score-header > h2.score-meta > span.score-date")
        .get_text(strip=True)
        .split()
    )
    # Home / away team names from the score table header.
    score_table = soup.select_one("table.score-table")
    home_team = score_table.select_one("th.score-team1").get_text(strip=True)
    away_team = score_table.select_one("th.score-team2").get_text(strip=True)
    # Match-level fields shared by every scoring row.
    game_info = [n, score_season] + score_date + [home_team, away_team]
    # The "score" heading marks the goal table; absent when nobody scored.
    tag = soup.find("h3", string="score")
    if tag is None:
        return None
    # select() already returns a list; no wrapper comprehension needed.
    table_home = tag.parent.select(
        "div.score-frame > div.score-left > table > tbody > tr"
    )
    home_data = cleaning(game_info, home_team, table_home)
    table_away = tag.parent.select(
        "div.score-frame > div.score-right > table > tbody > tr"
    )
    away_data = cleaning(game_info, away_team, table_away)
    return home_data + away_data
# Schedule page: gather the "Details" link of every match of the season.
url = "http://www.jfl.or.jp/jfl-pc/view/s.php?a=1542&f=2020A001_spc.html"
response = requests.get(url)
response.raise_for_status()
schedule = BeautifulSoup(response.content, "html5lib")
links = []
for anchor in schedule.select("td.detail-link > a"):
    if anchor.text == "Details":
        links.append(urljoin(url, anchor.get("href")))
# Scrape every match page; skip matches without a score table.
result = []
for match_no, detail_url in enumerate(links):
    score_data = scraping(match_no, detail_url)
    if score_data:
        result.extend(score_data)
    time.sleep(1)  # throttle requests to be polite to the server
import pandas as pd

# One row per goal: match info + scorer details.
df = pd.DataFrame(result, columns=["match", "section", "date", "Times of Day", "home", "Away", "Team name", "time", "Uniform number", "Player name"])
# Every row counts as exactly one goal.
df["score"] = 1
# Goal ranking: total goals per player, own goals excluded.
# aggfunc="sum" (string) avoids the pandas deprecation warning for
# passing the builtin sum and uses the optimized cython path.
pv_goal = df.pivot_table(
    values="score", index=["Player name", "Team name", "Uniform number"], aggfunc="sum", fill_value=0
).drop(["Own goal"]).reset_index()
pv_goal["Uniform number"] = pv_goal["Uniform number"].astype(int)
# Competition ranking: tied totals share the lowest rank ("min").
pv_goal["Ranking"] = pv_goal["score"].rank(ascending=False, method="min").astype(int)
#team
# 2020 JFL teams in official order; index gives a stable team sort key.
jfl_2020 = [
    "Honda FC",
    "Sony Sendai FC",
    "Tokyo Musashino City FC",
    "Tegevajaro Miyazaki",
    "Honda Lock SC",
    "Verspah Oita",
    "FC Osaka",
    "MIO Biwako Shiga",
    "Veertien Mie",
    "FC Maruyasu Okazaki",
    "Suzuka Point Getters",
    "Line mail Aomori",
    "Nara club",
    "Matsue City FC",
    "Iwaki FC",
    "Kochi United SC",
]
team = dict(zip(jfl_2020, range(1, len(jfl_2020) + 1)))
pv_goal["Team ID"] = pv_goal["Team name"].map(team)
# Order rows by ranking, then official team order, then uniform number.
pv_goal = pv_goal.sort_values(["Ranking", "Team ID", "Uniform number"])
# The helper columns were only needed for sorting.
pv_goal = pv_goal.drop(columns=["Team ID", "Uniform number"])
pv_goal = pv_goal.set_index("Ranking")
pv_goal.to_csv("goal.csv")
# Official scorer-ranking page -> CSV, with NFKC-normalized player names.
tables = pd.read_html("http://www.jfl.or.jp/jfl-pc/view/s.php?a=1544", index_col=0, header=0)
df_rank = tables[0]
df_rank["Player name"] = df_rank["Player name"].str.normalize("NFKC")
df_rank.to_csv("ranking.csv")
# Recommended Posts