I created a tool that takes a text file listing the URLs I want to browse and automatically opens each of them in a browser. The point I paid particular attention to is that **it also handles pages such as YouTube and Yahoo, where the content keeps growing as you scroll**. I hope you can use it as a reference.
AutoBrowsing.py
import os
import re
import sys
import time
import chromedriver_binary
import requests
from selenium import webdriver
# *** main function (it is invoked at the bottom of the file) ***
def main():
    """Run the whole flow: read the URL list, validate it, confirm, browse.

    Each step is delegated to a helper defined in this file. The helpers
    that validate or ask for confirmation terminate the program themselves
    (via sys.exit) when the run should not continue.
    """
    # Ask for the URL list file, then load the URLs it contains
    url_list = get_url_list(input_urls_file())
    # Check every URL before the user is asked anything else
    validate_url(url_list)
    # Wait for the user's go-ahead
    confirm_browsing()
    # Visit every URL in the browser
    browsing_urls(url_list)
# *** Function that accepts input of the URL list file ***
def input_urls_file():
    """Prompt for the path of the URL list file and return it.

    The path is read from standard input (a full path is expected). If it
    does not refer to an existing regular file, an error message is printed
    and the program exits.

    Returns:
        str: The path exactly as entered by the user.

    Raises:
        SystemExit: If the entered path is not an existing file.
    """
    print("\n########## Start processing ##########")
    print("Input filepath of urls : ")
    # Accept the file path (full path)
    input_path = input()
    print("\nCheck input file ...\n")
    # isfile() rather than exists(): a directory "exists" too, but it cannot
    # be read as a URL list and would only fail later inside open().
    if not os.path.isfile(input_path):
        print(" [ERROR]: File doesn't exist! : " + input_path)
        print("\nSystem Exit.\n")
        sys.exit()
    print(' [OK]: File exists. : ' + input_path)
    return input_path
# *** Function to get the URL list from a file ***
def get_url_list(input_path):
    """Read the URL list file and return one URL per non-blank line.

    Surrounding whitespace (including the line feed that comes with every
    line read from a file) is removed from each entry, and lines that are
    empty after stripping are skipped, so a trailing blank line in the file
    does not become a bogus URL.

    Args:
        input_path: Path of the text file that holds one URL per line.

    Returns:
        list[str]: The URLs in file order, without surrounding whitespace.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The with-statement closes the file even if reading fails midway
    with open(input_path) as url_file:
        stripped_lines = [line.strip() for line in url_file]
    return [url for url in stripped_lines if url]
# *** Function that validates URL schemes and status codes ***
def validate_url(url_list):
    """Check that every URL uses http(s) and responds with status 200.

    Every URL is checked (the loop does not stop at the first problem) so
    that all errors are reported in one run. If at least one URL has a wrong
    scheme, cannot be requested, or answers with a status other than 200,
    the program exits after the loop.

    Args:
        url_list: Iterable of URL strings; trailing whitespace such as the
            line feed left by readlines() is ignored.

    Raises:
        SystemExit: If any URL failed one of the checks.
    """
    # Upper bound (seconds) for each request so one dead host cannot hang the run
    timeout_seconds = 10
    print("\nCheck url scheme and status code ...\n")
    has_error = False
    for url in url_list:
        # Lines read with readlines() still carry their line feed; drop it
        unsafe_url = url.rstrip()
        # Only http:// and https:// are accepted
        if not unsafe_url.startswith(("http://", "https://")):
            print(" [ERROR]: Url isn't valid! : " + unsafe_url)
            has_error = True
            # Do not request a URL whose scheme is wrong
            continue
        try:
            response = requests.get(unsafe_url, timeout=timeout_seconds)
        except requests.RequestException as err:
            # Connection failures, timeouts, etc. are reported like any
            # other invalid URL instead of aborting with a traceback
            print(" [ERROR]: Request failed! : [" +
                  str(err) + "]:" + unsafe_url)
            has_error = True
            continue
        # Anything other than 200 is an error (redirects are followed, so a
        # redirected page still ends up as 200)
        if response.status_code != 200:
            # status_code is an int and must be converted before concatenation
            print(" [ERROR]: Status code isn't 200! : [" +
                  str(response.status_code) + "]:" + unsafe_url)
            has_error = True
    # Exit if any scheme was wrong or any status code was not 200
    if has_error:
        print("\nSystem Exit.\n")
        sys.exit()
    print(" [OK]: All urls are valid and 200.")
    print(" [OK]: Number of urls : " + str(len(url_list)))
# *** Function that asks whether browsing should start ***
def confirm_browsing():
    """Ask the user to confirm and return once the answer is yes.

    The prompt is repeated until the answer is "y", "n" or an empty line
    (Enter alone counts as yes). The comparison is case-insensitive because
    the input is lowercased first.

    Raises:
        SystemExit: If the user answers "n".
    """
    while True:
        print("\nStart browsing? y/n (default:y)")
        # Lowercase the answer so that "Y" and "N" are accepted as well
        answer = input().lower()
        # Enter alone is treated the same as "y"
        if answer in ("", "y"):
            return
        if answer == "n":
            print("\nSystem Exit.\n")
            sys.exit()
        # Any other answer: fall through and ask again
# *** Function to perform browsing ***
def browsing_urls(url_list):
    """Open every URL in Chrome and scroll each page to the bottom.

    One Chrome window is started, reused for all URLs and closed again at
    the end, even when visiting a page raises an exception.

    Args:
        url_list: Sequence of URL strings; surrounding whitespace (such as
            the line feed left by readlines()) is removed before use.
    """
    options = webdriver.ChromeOptions()
    # Maximize the browser window
    options.add_argument("--start-maximized")
    # Options that turn off the
    # "Chrome is being controlled by automated test software" banner
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)
    try:
        print("\n===== start =====")
        total = len(url_list)
        # Open the URLs one by one
        for i, url in enumerate(url_list):
            # Show progress as "current/total"
            print(" " + str(i+1) + "/" + str(total))
            # Access the URL without the line feed it may still carry
            driver.get(url.strip())
            # If you want a different per-URL action, change the call below.
            # Current action: scroll the page to the bottom.
            scrolle_to_end(driver)
        print("===== end =====\n")
    finally:
        # Always close the browser so no Chrome process is left behind
        driver.quit()
    print("Complete.\n")
# *** A function that scrolls to the bottom of the page ***
def scrolle_to_end(driver, scroll_speed=3, interval=1):
    """Scroll the current page step by step until the bottom is reached.

    The end-of-page check is repeated before every step, so pages whose
    content grows while scrolling keep being scrolled until no more content
    is added.

    Args:
        driver: WebDriver whose current page is scrolled.
        scroll_speed: Amount passed to window.scrollBy for each step
            (1 or more). Defaults to 3.
        interval: Seconds to sleep after each step. Defaults to 1; a larger
            value can help when the page loads slowly.

    Raises:
        ValueError: If scroll_speed is smaller than 1, because the page
            would never move and the loop would never end.
    """
    if scroll_speed < 1:
        raise ValueError("scroll_speed must be 1 or more")
    # The script does not change between steps, so build it once
    scroll_script = "window.scrollBy(0, " + str(scroll_speed) + ");"
    while not is_scrolle_end(driver):
        # Scroll by a relative amount
        driver.execute_script(scroll_script)
        # Pause before the next step
        time.sleep(interval)
# *** Function to determine whether the page is scrolled to the bottom ***
def is_scrolle_end(driver):
    """Return True when the current page is scrolled to its bottom.

    Args:
        driver: WebDriver whose current page is inspected.

    Returns:
        bool: True if the scroll offset has reached the largest possible
        offset (page height minus the visible window height), within a
        tolerance of one pixel.
    """
    # Largest possible scroll offset: the visible area (window.innerHeight)
    # is subtracted because that part of the page is not scrolled
    script = "return " + str(get_page_height(driver)) + \
        " - window.innerHeight;"
    page_most_bottom = driver.execute_script(script)
    # Current scroll offset (which property holds it differs between
    # browser types and versions)
    script = "return window.pageYOffset || document.documentElement.scrollTop;"
    scroll_top = driver.execute_script(script)
    # The scroll offset can be fractional while the height values are
    # rounded, so an exact comparison may never become true; accept a
    # difference of up to one pixel.
    return scroll_top >= page_most_bottom - 1
# *** Function to get the height of the page ***
def get_page_height(driver):
    """Return the height of the current page as reported by the browser.

    Several height properties are queried and the largest one is used,
    because which property holds the full height depends on the browser
    version and the site.
    See https://ja.javascript.info/size-and-scroll-window#ref-633

    Args:
        driver: WebDriver whose current page is measured.

    Returns:
        The value produced by Math.max over the queried properties.
    """
    # One entry per property family, each for both body and documentElement
    height_candidates = (
        "document.body.scrollHeight, document.documentElement.scrollHeight",
        "document.body.offsetHeight, document.documentElement.offsetHeight",
        "document.body.clientHeight, document.documentElement.clientHeight",
    )
    max_expression = "Math.max(" + ",".join(height_candidates) + ")"
    return driver.execute_script("return " + max_expression + ";")
# Execution of the main function.
# NOTE: this call sits at module top level without any guard, so the whole
# interactive flow starts as soon as this file is executed or imported.
main()
↓ Sample input file
test_url_list.txt
https://www.google.com/
https://qiita.com/
https://www.youtube.com/
https://www.yahoo.co.jp/
↓ Run-time console
########## Start processing ##########
Input filepath of urls :
c:\Users\hoge\Desktop\work\python\AutoBrowsing\test_url_list.txt
Check input file ...
[OK]: File exists. : c:\Users\hoge\Desktop\work\python\AutoBrowsing\test_url_list.txt
Check url scheme and status code ...
[OK]: All urls are valid and 200.
[OK]: Number of urls : 4
Start browsing? y/n (default:y)
===== start =====
1/4
2/4
3/4
4/4
===== end =====
Complete.
A note on installing the driver with pip when operating Chrome through Selenium: simply running `pip install chromedriver-binary` without specifying a version
**can cause a version mismatch and result in a run-time error.**
Please specify the version explicitly at installation time (presumably one that matches your installed Chrome).
If you have already installed it without specifying a version, you do not need to uninstall it first: installing again with a version specified automatically uninstalls the previous one.
This time, the tool launches the browser automatically and scrolls each page to the bottom. The flow of accepting a list of URLs and performing some processing on each URL seems reusable, so I'll try building other tools on top of it.
Recommended Posts