I heard that web scraping can be done for free without a server, so I tried it. Without a concrete subject I tend to lose motivation, so this time I collected Lotto 6 winning-number data to keep things at least a little interesting. Is the lottery really random? I put it to the test.
Colab
First, sign in to Google Colab and create a new notebook via File → New Python 3 Notebook.
Optionally, change the hardware accelerator to GPU in the runtime settings.
Note that Colab discards everything installed on its VM whenever the session ends, so the packages below must be reinstalled every time you start a new session.
# Colab resets its VM between sessions, so these installs must be rerun each time.
!apt-get update
# Install Chromium's WebDriver and copy it onto the PATH so Selenium can find it.
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# Python libraries: Selenium drives the headless browser, BeautifulSoup parses the HTML.
!pip install selenium
!pip install beautifulsoup4
I did a lot of research, but my first attempts didn't work, so I corrected the code by referring to an existing example and decided to fetch the pages by driving Chrome automatically.
import time
import random
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup

# URLs of the Mizuho Bank pages that list the Lotto 6 winning numbers.
loto_url1 = 'https://www.mizuhobank.co.jp/retail/takarakuji/loto/backnumber/loto6'  # draws 1-460
loto_url2 = 'https://www.mizuhobank.co.jp/retail/takarakuji/loto/backnumber/detail.html?fromto='  # draw 461 onward

num = 1
main_num_list = []   # six main winning numbers of each draw
bonus_num_list = []  # bonus number of each draw

# Launch Chrome in headless mode (runs in the background), load each results
# page, take the JavaScript-rendered HTML, and parse it with BeautifulSoup.
options = webdriver.ChromeOptions()
# Required for headless operation in Colab
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
# Error tolerance
options.add_argument('--ignore-certificate-errors')
options.add_argument('--allow-running-insecure-content')
options.add_argument('--disable-web-security')
# Features that are unnecessary in headless mode
options.add_argument('--disable-desktop-notifications')
options.add_argument('--disable-extensions')
# Language
options.add_argument('--lang=ja')
# Skip image loading to lighten each page load
options.add_argument('--blink-settings=imagesEnabled=false')

driver = webdriver.Chrome('chromedriver', options=options)
try:
    while num <= 1341:
        if num < 461:
            # Draws 1-460: per-page URL with the draw number zero-padded to 4 digits.
            url = loto_url1 + str(num).zfill(4) + '.html'
        else:
            # Draw 461 onward: one page covers a range of 20 draws.
            url = loto_url2 + str(num) + '_' + str(num + 19) + '&type=loto6'
        driver.get(url)
        # The target site renders asynchronously, so give its JavaScript time to finish.
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source.encode('utf-8'), 'html.parser')
        print(soup.title)
        # Each draw's winning numbers sit in their own <table>; the first table
        # on the page is not a draw, so discard it.
        tables = soup.find_all('table')
        del tables[0]
        for tbl in tables:
            rows = tbl.find_all('tr')
            # Main numbers: 3rd row, first cell — six space-separated values.
            main_num_list.append(rows[2].find('td').string.split(' '))
            # Bonus number: 4th row, first cell.
            bonus_num_list.append(rows[3].find('td').string)
        num += 20  # advance to the next page (next 20 draws)
        time.sleep(random.uniform(1, 3))  # polite 1-3 s pause so we don't hammer the server
finally:
    driver.quit()  # always release the browser, even if scraping fails midway

# Write the results to CSV, numbering the draws from 1.
df = pd.DataFrame(main_num_list, columns=['main1', 'main2', 'main3', 'main4', 'main5', 'main6'])
df['bonus'] = bonus_num_list
df.index = df.index + 1
df.to_csv('loto6.csv')
When this process finishes, loto6.csv should be in the Files tab; click Refresh if it does not appear immediately.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the scraped results; the first (unnamed) column is the saved index, so drop it.
df = pd.read_csv('loto6.csv')
df = df.drop('Unnamed: 0', axis=1)
nums_all = df[['main1', 'main2', 'main3', 'main4', 'main5', 'main6', 'bonus']]

plt.rcParams['figure.figsize'] = (8, 6)  # default size for subsequent figures
plt.rcParams['font.size'] = 15

# Mean over every drawn number (main and bonus together).
mean_value = nums_all.sum().sum() / nums_all.size
# Lotto 6 numbers run from 1 to 43, so use 43 bins.
# NOTE: `normed` was removed in matplotlib 3.x — `density` is its replacement.
plt.hist(nums_all.values.flatten(), bins=43, density=True)
plt.show()
mean_value
The average came out to about 22, and the histogram is quite uneven. I concluded that trying anything further, such as machine learning, would be pointless on this data.
Recommended Posts