Predict horse racing with machine learning and aim for a recovery rate of 100%.
Last time, I built a machine learning model with LightGBM that predicts which horses will finish in the top 3. This time, I would like to add each horse's past performance as a feature, but the scraping and data processing turn out to be quite difficult in practice. So in this article I summarize what code needs to be written and how to implement it.
First, scrape the past results of every horse that ran in 2019 from netkeiba.com. netkeiba.com assigns a horse_id to each horse, and the page with that horse's past results has the URL "https://db.netkeiba.com/horse/(horse_id)". To collect the required horse_id (and jockey_id) values, extend the scrape_race_results function created in the previous article.
import time
from tqdm.notebook import tqdm
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
def scrape_race_results(race_id_list, pre_race_results=None):
    """Scrape race result tables from db.netkeiba.com, keyed by race id.

    For every race id that is not already in the result dict, the first
    table of the race page is read with ``pd.read_html`` and two columns,
    ``horse_id`` and ``jockey_id``, are added from the link targets found
    in the page's "Race result" table.

    Args:
        race_id_list: Iterable of race id strings; each one is appended to
            the race URL.
        pre_race_results: Optional dict (race id -> DataFrame) of results
            scraped earlier. It is updated in place and returned, and the
            ids it already contains are skipped. A new dict is created when
            it is omitted.

    Returns:
        dict mapping race id to its result DataFrame. If an unexpected
        error occurs, it is printed and the results collected so far are
        returned.
    """
    # A ``{}`` default would be created once and shared by every call, so
    # results from one call would leak into the next one.
    race_results = {} if pre_race_results is None else pre_race_results

    # Loop-invariant patterns for the link targets we are interested in.
    horse_href = re.compile("^/horse")
    jockey_href = re.compile("^/jockey")

    for race_id in tqdm(race_id_list):
        if race_id in race_results:
            continue
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            df = pd.read_html(url)[0]

            # The ids only appear in the link hrefs, so parse the raw HTML.
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")
            result_table = soup.find("table", attrs={"summary": "Race result"})

            # The first run of digits in each href is the id.
            horse_id_list = [
                re.findall(r"\d+", a["href"])[0]
                for a in result_table.find_all("a", attrs={"href": horse_href})
            ]
            jockey_id_list = [
                re.findall(r"\d+", a["href"])[0]
                for a in result_table.find_all("a", attrs={"href": jockey_href})
            ]

            df["horse_id"] = horse_id_list
            df["jockey_id"] = jockey_id_list
            race_results[race_id] = df
            time.sleep(1)
        except IndexError:
            # No table / no id on this page: skip this race id.
            continue
        except Exception as e:
            # Anything else: report it and stop, keeping what we have.
            print(e)
            break
    return race_results
Convert the result to a DataFrame as in the previous article. This gives you the list of horse_ids you need.
# Scrape every race, stack the per-race tables into one DataFrame, and
# collect the distinct horse ids that appear in it.
results = pd.concat(list(scrape_race_results(race_id_list).values()))
horse_id_list = results['horse_id'].unique()
This list of horse_ids is then used to scrape the past performance data.
def scrape_horse_results(horse_id_list, pre_horse_id=None):
    """Scrape the past-results table of each horse from db.netkeiba.com.

    Args:
        horse_id_list: Iterable of horse id strings; each one is appended
            to the horse URL.
        pre_horse_id: Optional iterable of horse ids to skip (for example
            ids that were scraped earlier).

    Returns:
        dict mapping horse id to the DataFrame read from that horse's page.
        If an unexpected error or a manual interrupt occurs, the results
        collected so far are returned.
    """
    import traceback

    # Avoid a mutable default and make the membership test O(1).
    skip_ids = set() if pre_horse_id is None else set(pre_horse_id)

    horse_results = {}
    for horse_id in tqdm(horse_id_list):
        if horse_id in skip_ids:
            continue
        try:
            url = 'https://db.netkeiba.com/horse/' + horse_id
            # Download and parse the page once, then pick the table.
            tables = pd.read_html(url)
            df = tables[3]
            # When table 3 is the 'Award history' table, use the next one.
            if df.columns[0] == 'Award history':
                df = tables[4]
            horse_results[horse_id] = df
            time.sleep(1)
        except IndexError:
            # The expected table is missing on this page: skip this horse.
            continue
        except Exception as e:
            # Report the failure and stop, keeping what we have.
            traceback.print_exc()
            print(e)
            break
        except KeyboardInterrupt:
            # A manual interrupt stops the loop but still returns the
            # results collected so far.
            break
    return horse_results
Scraping takes a long time. Once it finishes, convert the result to a DataFrame again and save it to a pickle file.
# Scrape the past results of every horse, label each horse's rows with its
# horse_id, stack everything into one DataFrame and persist it.
horse_results = scrape_horse_results(horse_id_list)
for key, frame in horse_results.items():
    # Every row of a horse's table gets that horse's id as its index label.
    frame.index = [key] * len(frame)
df = pd.concat(list(horse_results.values()))
df.to_pickle('horse_results.pickle')
Next, create a class called HorseResults and implement methods that merge each horse's average order of arrival and average prize money over its past races into the race results.
class HorseResults:
    """Past race results per horse, used to build averaged features.

    ``horse_results`` is expected to be a DataFrame whose index holds the
    horse id of each row and which has the columns ``'date'``,
    ``'Order of arrival'`` and ``'Prize money'``.
    """

    def __init__(self, horse_results):
        """Keep the three columns that are used and preprocess them."""
        self.horse_results = horse_results[['date', 'Order of arrival', 'Prize money']]
        self.preprocessing()

    def preprocessing(self):
        """Clean ``self.horse_results`` and store the cleaned copy."""
        df = self.horse_results.copy()

        # Drop rows whose order of arrival is not a number.
        df['Order of arrival'] = pd.to_numeric(df['Order of arrival'], errors='coerce')
        df.dropna(subset=['Order of arrival'], inplace=True)
        df['Order of arrival'] = df['Order of arrival'].astype(int)

        # The 'date' column must be kept: average() filters and sorts on it.
        df['date'] = pd.to_datetime(df['date'])

        # Treat a missing prize as 0. Assign the result back instead of a
        # chained in-place fillna, which does not update df under pandas
        # copy-on-write.
        df['Prize money'] = df['Prize money'].fillna(0)

        self.horse_results = df

    def average(self, horse_id_list, date, n_samples='all'):
        """Average order of arrival and prize money before ``date``.

        Args:
            horse_id_list: Horse ids to compute the averages for. Ids with
                no stored results are simply absent from the output.
            date: Only races strictly before this date are used.
            n_samples: ``'all'`` to use every earlier race, or a positive
                number to use only that many most recent races per horse.

        Returns:
            DataFrame indexed by horse id with the columns
            ``'Order of arrival_{n_samples}R'`` and
            ``'Prize money_{n_samples}R'``.

        Raises:
            ValueError: If ``n_samples`` is a number that is not positive.
        """
        # Select by membership so that a horse without any history does not
        # raise KeyError (merge() is a left join and yields NaN for it).
        in_list = self.horse_results.index.isin(horse_id_list)
        target_df = self.horse_results[in_list]
        past_df = target_df[target_df['date'] < date]

        if n_samples == 'all':
            filtered_df = past_df
        elif n_samples > 0:
            # Most recent n_samples races of each horse.
            filtered_df = (
                past_df.sort_values('date', ascending=False)
                .groupby(level=0)
                .head(n_samples)
            )
        else:
            raise ValueError('n_samples must be >0')

        average = filtered_df.groupby(level=0)[['Order of arrival', 'Prize money']].mean()
        return average.rename(columns={
            'Order of arrival': 'Order of arrival_{}R'.format(n_samples),
            'Prize money': 'Prize money_{}R'.format(n_samples),
        })

    def merge(self, results, date, n_samples='all'):
        """Left-join the averages onto the rows of ``results`` for ``date``.

        ``results`` must have the columns ``'date'`` and ``'horse_id'``.
        """
        df = results[results['date'] == date]
        horse_id_list = df['horse_id']
        merged_df = df.merge(
            self.average(horse_id_list, date, n_samples),
            left_on='horse_id',
            right_index=True,
            how='left',
        )
        return merged_df

    def merge_all(self, results, n_samples='all'):
        """Apply ``merge`` for every distinct date in ``results`` and stack."""
        date_list = results['date'].unique()
        merged_df = pd.concat(
            [self.merge(results, date, n_samples) for date in tqdm(date_list)]
        )
        return merged_df
With this, if you want to add, for example, the results of the past 5 races as features, you can do it as follows.
# HorseResults selects columns from a DataFrame indexed by horse_id, but
# `horse_results` is still the dict returned by scrape_horse_results, so
# load the concatenated frame that was saved to the pickle file instead.
hr = HorseResults(pd.read_pickle('horse_results.pickle'))
# Add the averages over each horse's last 5 races to the race results.
results_5R = hr.merge_all(results_p, n_samples=5)
You can now see that the two rightmost columns hold each horse's average finishing order and average prize money over its last five races.
Details are explained in the video ↓ Data analysis / machine learning starting with horse racing prediction
Recommended Posts