When I search Google for something I don't understand, I don't rely on a single site; I check multiple sites to improve the reliability of the information. Opening the search result pages one by one was tedious, so I wrote a program that opens the top 10 result pages at once when I look up a word. I also thought it would be convenient to keep a search history, so the title and URL of each result page are automatically written to an Excel file.
CPU: Intel Core i5-7200U (dual core) OS: Windows 10 Home Python 3.8.2 (Anaconda) ChromeDriver 81.0.4044.92 Selenium Openpyxl
This article assumes that the execution environment above is already set up.
Enter the search word and narrowing conditions in the GUI text boxes. Press the search button. The top 10 sites in the search results open in separate tabs at once. The URLs and titles of the searched sites are written to Excel.
First, create a text box with PySimpleGUI and display the entered characters. I made it with reference to this site.
import PySimpleGUI as sg

sg.theme('Dark Blue 3')

# Two labelled text boxes (search word / narrowing conditions) and a button.
layout = [
    [sg.Text('Search word', size=(15, 1)),
     sg.InputText(default_text='', key='-SEARCHWORD-')],
    [sg.Text('Narrowing down conditions', size=(15, 1)),
     sg.InputText(default_text='', key='-CONDITION-')],
    [sg.Submit('Search')],
]

window = sg.Window('Enter a search word', layout)

# Event loop: keep reading the window until it reports a None event.
while True:
    event, values = window.read()

    if event is None:  # treated as "window closed": leave the loop
        print('exit')
        break

    if event == 'Search':
        # Build ONE message from both fields. The second line must be
        # appended (+=); plain assignment would discard the search word.
        show_message = "Search word:" + values['-SEARCHWORD-'] + 'Has been entered.\n'
        show_message += "Narrowing conditions:" + values['-CONDITION-'] + 'Has been entered.\n'
        sg.popup(show_message)

window.close()
We will do web scraping using selenium. For how to use it, I referred to the quick reference. Display the "Title", "URL", and "Summary" for convenience when putting them together in Excel later, and then open them in a new tab.
open_newtab.py
from urllib.parse import quote_plus

from selenium import webdriver
import chromedriver_binary  # imported only for its side effect (presumably puts chromedriver on PATH)
from selenium.webdriver.common.by import By

MAX_TABS = 10  # how many of the top results to open

search_word = 'python usage'
const = 'https://www.google.com/search?q='
# Encode the query so spaces and characters such as '&' or '#' cannot
# break the URL.
getword = const + quote_plus(search_word)

driver = webdriver.Chrome()
driver.get(getword)

# Print title / URL / summary of every result block and remember the URLs.
url_list = []
for i, g in enumerate(driver.find_elements(By.CLASS_NAME, "g")):
    print("------ " + str(i+1) + " ------")
    r = g.find_element(By.CLASS_NAME, "r")
    print(r.find_element(By.TAG_NAME, "h3").text)
    url = r.find_element(By.TAG_NAME, "a").get_attribute("href")  # Extract URL
    url_list.append(url)
    print("\t" + url)
    s = g.find_element(By.CLASS_NAME, "s")
    print("\t" + s.find_element(By.CLASS_NAME, "st").text)

# Open each collected URL in its own tab. Slicing (instead of range(10))
# avoids an IndexError when fewer than MAX_TABS results were found.
for num, url in enumerate(url_list[:MAX_TABS], start=1):
    driver.execute_script("window.open()")  # Open a new tab
    driver.switch_to.window(driver.window_handles[num])  # Switch to new tab
    driver.get(url)
However, as written, each tab waits for its page to finish loading before the next tab is opened, which takes a long time. At that rate it would be faster to open the pages by hand. I want to move on to the next result without waiting for the page to be displayed, so instead I use Ctrl + click to open each link in a new tab.
Since xpath is convenient for clicking buttons and links in a fixed place, I will actively use it.
Add the code as shown in the Quick Reference (https://www.seleniumqref.com/api/python/actions/Python_key_down_click.html).
If you check the XPath of the titles in the web search results, you get `//*[@id="rso"]/div[1]/div/div[1]/a/h3`, `//*[@id="rso"]/div[2]/div/div[1]/a/h3`, and so on. You can see that only the index of the first `div[1]` from the left differs. Replace that index with a variable and iterate over it with a for statement.
open_newtab.py
from urllib.parse import quote_plus

from selenium import webdriver
import chromedriver_binary  # imported only for its side effect (presumably puts chromedriver on PATH)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

search_word = 'python usage'
const = 'https://www.google.com/search?q='
# Encode the query so spaces and characters such as '&' or '#' cannot
# break the URL.
getword = const + quote_plus(search_word)

driver = webdriver.Chrome()
driver.get(getword)

# Print title / URL / summary of every result block and remember the URLs.
url_list = []
for i, g in enumerate(driver.find_elements(By.CLASS_NAME, "g")):
    print("------ " + str(i+1) + " ------")
    r = g.find_element(By.CLASS_NAME, "r")
    print(r.find_element(By.TAG_NAME, "h3").text)
    url = r.find_element(By.TAG_NAME, "a").get_attribute("href")  # Extract URL
    url_list.append(url)
    print("\t" + url)
    s = g.find_element(By.CLASS_NAME, "s")
    print("\t" + s.find_element(By.CLASS_NAME, "st").text)

xpath = '//*[@id = "rso"]/div[{}]/div/div[1]/a/h3'  # Title click xpath
for num in range(10):
    # find_elements returns an empty list instead of raising, so a result
    # position without a matching title is skipped rather than aborting.
    matches = driver.find_elements(By.XPATH, xpath.format(num+1))
    if not matches:
        continue
    # Ctrl + click opens the link in a new tab without waiting for it to
    # load. CONTROL is released again so the modifier does not stay held.
    actions = ActionChains(driver)
    actions.key_down(Keys.CONTROL)
    actions.click(matches[0])
    actions.key_up(Keys.CONTROL)
    actions.perform()
When you save the Excel file, it will be saved in the current directory.
write_to_excel.py
import datetime
from urllib.parse import quote_plus

import openpyxl
from selenium import webdriver
import chromedriver_binary  # imported only for its side effect (presumably puts chromedriver on PATH)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

search_word = 'python openpyxl'
const = 'https://www.google.com/search?q='
# Encode the query so spaces and characters such as '&' or '#' cannot
# break the URL.
getword = const + quote_plus(search_word)

driver = webdriver.Chrome()
driver.get(getword)

# Parallel lists: index i of each list describes the same search result.
url_list = []
title_list = []
Overview_list = []
for g in driver.find_elements(By.CLASS_NAME, "g"):
    r = g.find_element(By.CLASS_NAME, "r")
    title_list.append(r.find_element(By.TAG_NAME, "h3").text)  # Extract the title
    url_list.append(r.find_element(
        By.TAG_NAME, "a").get_attribute("href"))  # Extract URL
    s = g.find_element(By.CLASS_NAME, "s")
    Overview_list.append(s.find_element(By.CLASS_NAME, "st").text)  # Extraction of outline

xpath = '//*[@id = "rso"]/div[{}]/div/div[1]/a/h3'  # Title click xpath
for num in range(10):
    # find_elements returns an empty list instead of raising, so a result
    # position without a matching title is skipped rather than aborting.
    matches = driver.find_elements(By.XPATH, xpath.format(num+1))
    if not matches:
        continue
    # Ctrl + click opens the link in a new tab without waiting for it to
    # load. CONTROL is released again so the modifier does not stay held.
    actions = ActionChains(driver)
    actions.key_down(Keys.CONTROL)
    actions.click(matches[0])
    actions.key_up(Keys.CONTROL)
    actions.perform()
# Start from a brand-new workbook and use its active sheet for the results.
wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'sheet1'

# Header row: one caption per output column (A-D).
for cell_ref, heading in (('A1', 'date'),
                          ('B1', 'title'),
                          ('C1', 'Overview'),
                          ('D1', 'URL')):
    ws[cell_ref] = heading
def write_in_xlsx(column_num, list):
    """Write up to ten values into one worksheet column, starting at row 2.

    Row 1 is left alone (it holds the header). Values are written to the
    module-level worksheet ``ws``.

    Args:
        column_num: 1-based index of the column to fill.
        list: Sequence of values to write, one per row. The parameter name
            shadows the builtin but is kept so existing callers still work.

    Only as many rows as there are values are written (at most ten), so a
    short result list no longer raises IndexError and the tenth result is
    no longer dropped.
    """
    for row, value in enumerate(list[:10], start=2):
        ws.cell(row=row, column=column_num).value = value
# Stamp one date per collected result (at most ten) instead of a fixed
# nine rows, so the date column lines up with the data actually written.
timestamp = datetime.datetime.today()
row_count = min(10, len(title_list))
for row in range(2, 2 + row_count):
    ws.cell(row=row, column=1).value = timestamp

# Columns B-D: title, overview and URL of each result.
write_in_xlsx(2, title_list)
write_in_xlsx(3, Overview_list)
write_in_xlsx(4, url_list)

wb.save('test.xlsx')  # Saved as test.xlsx
This still has a problem. For example, when a Google search shows an extra result panel on the right side of the page (as in this photo), the script fails with the exception `selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".r"}`.
The code that implements all the functions is as follows. `class WebSc`, `class Excel`, and `class GUIWindow` are the blueprints for the individual functions, and the `Application` class combines their objects to realize the feature built this time. Run the file and type in the text box to open the tabs at once. (Narrowing conditions are not implemented.)
By the way, why does the search only start after I press the button twice? If anyone knows the cause, please comment.
web_search_Efficiency.py
import PySimpleGUI as sg
import time
import openpyxl
from selenium import webdriver
import chromedriver_binary
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import datetime
def main():
    """Create the application object and hand control to its event loop."""
    Application().RUN()
class WebSc:
    """Scrape a Google result page with Selenium and open hits in new tabs.

    Attributes:
        const: Base URL that the search word is appended to.
        search_word: The most recent search word ('' before any search).
        driver: The Chrome WebDriver of the latest search (None before one).
        title_list / url_list / Overview_list: Parallel lists filled by
            ExtractElement; index i of each describes the same result.
    """

    def __init__(self,):
        self.const = 'https://www.google.com/search?q='
        self.url_list = []
        self.title_list = []
        self.Overview_list = []
        self.search_word = ''
        self.driver = None  # created by SetSearchWord

    def _build_url(self, search_word):
        """Return the search URL for *search_word* with the query encoded."""
        # Local import keeps this class usable without touching the
        # file-level import block.
        from urllib.parse import quote_plus
        return self.const + quote_plus(search_word)

    def SetSearchWord(self, search_word):
        """Open a new Chrome window showing the results for *search_word*."""
        self.search_word = search_word
        self.driver = webdriver.Chrome()
        self.driver.get(self._build_url(search_word))

    def ExtractElement(self):
        """Collect title, URL and overview of every result on the page.

        The lists are rebuilt on each call so results from an earlier
        search do not accumulate. A result block that lacks any of the
        three parts is skipped as a whole, which keeps the three lists the
        same length.
        """
        from selenium.common.exceptions import WebDriverException

        self.title_list = []
        self.url_list = []
        self.Overview_list = []
        for g in self.driver.find_elements(By.CLASS_NAME, "g"):
            try:
                r = g.find_element(By.CLASS_NAME, "r")
                title = r.find_element(By.TAG_NAME, "h3").text  # Extract the title
                url = r.find_element(
                    By.TAG_NAME, "a").get_attribute("href")  # Extract URL
                s = g.find_element(By.CLASS_NAME, "s")
                overview = s.find_element(
                    By.CLASS_NAME, "st").text  # Extraction of outline
            except WebDriverException:
                # Best effort: blocks that do not have the expected
                # structure are ignored rather than aborting the search.
                continue
            self.title_list.append(title)
            self.url_list.append(url)
            self.Overview_list.append(overview)

    def ClickElementAsNewTab(self, click_num):  # click_num: how many top results to open
        """Ctrl + click the titles of the first *click_num* results.

        Positions that have no matching title element are skipped.
        """
        xpath = '//*[@id = "rso"]/div[{}]/div/div[1]/a/h3'  # Title click xpath
        for position in range(1, click_num + 1):
            matches = self.driver.find_elements(By.XPATH, xpath.format(position))
            if not matches:
                continue
            # Release CONTROL after the click so the modifier is not left
            # held down for later actions.
            actions = ActionChains(self.driver)
            actions.key_down(Keys.CONTROL)
            actions.click(matches[0])
            actions.key_up(Keys.CONTROL)
            actions.perform()
class Excel:
    """Write collected search results into a new Excel workbook.

    Layout: row 1 holds the headers from ``name_list``; from row 2 on,
    column 1 is the date and columns 2-4 are title, overview and URL.
    """

    def __init__(self, websc):
        # ``websc`` is not used here; the parameter is kept so existing
        # callers that pass it continue to work.
        self.wb = openpyxl.Workbook()  # Create a new empty Workbook object
        self.ws = self.wb.active
        self.ws.title = 'sheet1'
        self.cell_list = ['A1', 'B1', 'C1', 'D1']
        self.name_list = ['date', 'title', 'Overview', 'URL']
        self.url_list = []
        self.title_list = []
        self.Overview_list = []

    def SetGotList(self, title_list, Overview_list, url_list):
        """Store the lists obtained from outside for the next MakeFile call."""
        self.url_list = url_list
        self.title_list = title_list
        self.Overview_list = Overview_list

    def __write_in_column(self, column_num, values, first_row=2, end_row=12):
        """Write *values* down one column, one value per row.

        Rows ``first_row`` .. ``end_row - 1`` are filled; writing stops
        early when *values* runs out, so a short list does not raise.
        """
        for row, value in zip(range(first_row, end_row), values):
            self.ws.cell(row=row, column=column_num).value = value

    def SetCellname(self, cell_list, name_list):
        """Write each name in *name_list* into the matching cell of *cell_list*."""
        for cell, name in zip(cell_list, name_list):
            self.ws[cell] = name

    def MakeFile(self, file_name):
        """Fill the sheet with the stored results and save it as *file_name*."""
        self.SetCellname(self.cell_list, self.name_list)  # Give the top line a name

        # Date only the rows that will actually receive data (at most ten).
        row_count = min(10, max(len(self.title_list),
                                len(self.Overview_list),
                                len(self.url_list)))
        stamp = datetime.datetime.today()
        for row in range(2, 2 + row_count):
            self.ws.cell(row=row, column=1).value = stamp  # The first column is the date

        self.__write_in_column(2, self.title_list)  # Write the acquired title
        self.__write_in_column(3, self.Overview_list)
        self.__write_in_column(4, self.url_list)
        self.wb.save(file_name)
class GUIWindow:
    """PySimpleGUI window with three text inputs and two buttons.

    Inputs are keyed '-SEARCHWORD-', '-CONDITION-' and '-EXCELFILE-'.
    After ReadWindow, ``event`` and ``values`` hold what the window's
    ``read()`` returned.
    """

    def __init__(self,):
        sg.theme('Dark Blue 3')
        self.layout = [
            [sg.Text('Search word', size=(15, 1)),
             sg.InputText(default_text='', key='-SEARCHWORD-')],
            [sg.Text('Narrowing down conditions', size=(15, 1)),
             sg.InputText(default_text='', key='-CONDITION-')],
            [sg.Text('Excel file name', size=(15, 1)),
             sg.InputText(default_text='', key='-EXCELFILE-')],
            # The second label must be exactly 'Search and save file': the
            # application compares the event against that string, so any
            # other label leaves the button without effect.
            [sg.Submit('Search'), sg.Submit('Search and save file')],
        ]
        self.window = sg.Window('Enter a search word', self.layout)
        # Placeholders until the first ReadWindow call.
        self.event = 0
        self.values = 0

    def CloseWindow(self):
        """Close the underlying window."""
        self.window.close()

    def ReadWindow(self):
        """Read the window once and store the result in event / values."""
        self.event, self.values = self.window.read()
class Application:  # This application
    """Wire the GUI, the scraper and the Excel writer together.

    The three collaborators are class attributes, created once when the
    class body is executed, and are shared by every instance.
    """

    window = GUIWindow()
    websc = WebSc()
    excel = Excel(websc)

    def __init__(self):
        pass

    def ButtonAction(self, button_name):
        """Run the action for the pressed button.

        Uses the values most recently read by RUN. The window is not read
        again here: a second read would block until the user pressed a
        button a second time.

        Args:
            button_name: 'Search' or 'Search and save file'; any other
                value does nothing.
        """
        if button_name not in ('Search', 'Search and save file'):
            return

        # Both buttons start with the same search.
        Application.websc.SetSearchWord(
            Application.window.values['-SEARCHWORD-'])
        Application.websc.ExtractElement()  # Element extraction
        Application.websc.ClickElementAsNewTab(10)  # Specify the number of acquisition sites

        if button_name == 'Search and save file':
            Application.excel.SetGotList(
                Application.websc.title_list,
                Application.websc.Overview_list,
                Application.websc.url_list)  # Set what you got
            Application.excel.MakeFile(
                Application.window.values['-EXCELFILE-'])  # Create excel file

    def RUN(self):
        """Event loop: read the window and dispatch until the event is None."""
        while True:
            Application.window.ReadWindow()
            event = Application.window.event
            if event is None:
                print('exit')
                break
            if event in ('Search', 'Search and save file'):
                self.ButtonAction(event)
        # The window object owns CloseWindow; Excel has no such method.
        Application.window.CloseWindow()
# Start the application only when this file is executed as a script.
if __name__ == "__main__":
    main()
This time it was a program that automatically opens multiple tabs, but it seems that various functions can be realized by adding functions to each class or changing the combination of methods in Application. For example, a bot that searches for registered words and automatically searches at a specified time to collect information is likely to expand.
https://qiita.com/memakura/items/20a02161fa7e18d8a693#%E5%BE%85%E3%81%A4
Recommended Posts