If you run a Google search and any of the title, `og:description`, or `h1` to `h4` elements of a hit page contains a specific keyword, the title and URL of that page are output to a text file, separated by a tab. If the URL is already in the text file, the page is skipped.
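As a sketch of the resulting files (the title and URL below are made-up placeholders, not real search results), each line of output.txt is a tab-separated title/URL pair, and output_readable.txt rewrites the same entries with the title and URL on separate lines followed by a dashed separator:

Example Medical News	https://example.com/medical-news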
Python 3.8.2

After building a virtual environment with venv etc.:

pip install -r requirements.txt
python main.py
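For reference, one common way to create and activate the virtual environment with the standard venv module (the directory name .venv is just a convention, and the activation command differs on Windows):

python -m venv .venv
source .venv/bin/activate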
requirements.txt
beautifulsoup4==4.9.1
requests==2.24.0
settings.py
settings = {
    # Keywords used for the Google search
    'google_search_keywords': ['Medical', 'corona'],
    # Number of search results to request
    'google_search_num': 10,
    # Keywords to look for within each hit page
    'search_keywords_in_page': ['Medical'],
}
main.py
import urllib.parse
import re
from typing import List

import requests
import bs4

from settings import settings
from output import OutputText


def get_ogdesc_from_soup(soup: bs4.BeautifulSoup) -> str:
    """
    Find <meta property="og:description" content="..."> in a
    BeautifulSoup instance and return the contents of its content
    attribute. If not found, return an empty string.
    """
    og_desc = soup.find('meta', attrs={'property': 'og:description', 'content': True})
    if og_desc:
        return og_desc['content']
    return ''


def get_href_from_soup(soup: bs4.element.Tag) -> str:
    href = soup.get('href')
    href = re.search('(http)(.+)(&sa)', href).group()[0:-3]  # Extract the real URL and trim the trailing '&sa'
    href = urllib.parse.unquote(href)  # Decode percent-encoded characters
    return href
def do_google_search(keywords: List[str], search_num: int) -> List[str]:
    """
    Perform a Google search with the given keywords and
    return a list of hit URLs.
    """
    # Perform the Google search
    url = 'https://www.google.co.jp/search'
    params = {
        'hl': 'ja',
        'num': search_num,
        'q': ' '.join(keywords),
    }
    response = requests.get(url, params=params)

    # Return a list of hit URLs
    # `.kCrYT` may need updating when Google changes its markup
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    soups = soup.select('.kCrYT > a')
    return [get_href_from_soup(soup) for soup in soups]
def main():
    output_text = OutputText('output.txt')
    urls = do_google_search(settings['google_search_keywords'], settings['google_search_num'])
    for url in urls:
        # Skip processing if the text file already contains the URL
        if url in output_text.get_urls():
            continue

        try:
            response = requests.get(url)
            response.encoding = 'utf-8'
            response.raise_for_status()
        except requests.RequestException:
            # Skip processing if a connection error occurs
            continue

        soup = bs4.BeautifulSoup(response.content, 'html.parser')
        titles = [a.text for a in soup.select('title')]
        desc = get_ogdesc_from_soup(soup)
        h1s = [a.text for a in soup.select('h1')]
        h2s = [a.text for a in soup.select('h2')]
        h3s = [a.text for a in soup.select('h3')]
        h4s = [a.text for a in soup.select('h4')]

        # Skip processing if none of the keywords appear in the page
        no_keyword = True
        for keyword in settings['search_keywords_in_page']:
            for text in titles + [desc] + h1s + h2s + h3s + h4s:
                if keyword in text:
                    no_keyword = False
        if no_keyword:
            continue

        # Write to the text file
        title = '**No title**' if len(titles) <= 0 else titles[0].strip().replace('\n', '')
        output_text.write(title, url)

    # Output a text file in an easy-to-read format
    output_text.output_readable_file()


if __name__ == '__main__':
    main()
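As a quick sanity check of get_ogdesc_from_soup, here is a minimal sketch that needs no network access (the HTML fragment is invented for illustration):

import bs4

from main import get_ogdesc_from_soup

# Invented HTML fragment for illustration only
html = '<html><head><meta property="og:description" content="A medical article"></head></html>'
soup = bs4.BeautifulSoup(html, 'html.parser')
print(get_ogdesc_from_soup(soup))  # -> A medical article

# A page without the meta tag yields an empty string
print(repr(get_ogdesc_from_soup(bs4.BeautifulSoup('<html></html>', 'html.parser'))))  # -> ''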
output.py
import os

import myutil as u


class OutputText:
    def __init__(self, file_path):
        self.file_path = file_path
        if not os.path.isfile(file_path):
            with open(self.file_path, 'w', encoding='utf-8'):
                pass

    def write(self, title, url):
        with open(self.file_path, mode='a', encoding='utf-8') as f:
            u.write_with_tab(f, title, url)
            f.write('\n')

    def get_urls(self):
        lines = self.get_lines()
        return [self.get_url(line) for line in lines]

    def output_readable_file(self):
        file = self.file_path.replace('.txt', '_readable.txt')
        with open(file, mode='w', encoding='utf-8') as f:
            lines = self.get_lines()
            for line in lines:
                f.write(self.get_title(line) + '\n' + self.get_url(line) + '\n')
                f.write('\n------------------------------\n\n')

    def get_lines(self):
        with open(self.file_path, mode='r', encoding='utf-8') as f:
            text = f.read()
        lines = text.strip().split('\n')
        return lines

    def get_title(self, line):
        texts_in_line = line.split('\t')
        return texts_in_line[0] if len(texts_in_line) >= 1 else ''

    def get_url(self, line):
        texts_in_line = line.split('\t')
        return texts_in_line[1] if len(texts_in_line) >= 2 else ''
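A minimal round-trip sketch of OutputText (the file name and entry below are hypothetical, chosen only for illustration):

from output import OutputText

ot = OutputText('demo.txt')  # Hypothetical file name; created empty if missing
ot.write('Example title', 'https://example.com/page')
print(ot.get_urls())          # -> ['https://example.com/page']
ot.output_readable_file()     # Writes demo_readable.txt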
myutil.py
def write_with_tab(file, *strings):
    """
    Write the given strings to the file, separated by tabs.
    """
    for i, string in enumerate(strings):
        file.write(string)
        if i != len(strings) - 1:  # Not the last string
            file.write('\t')
    return file
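Because write_with_tab accepts any file-like object, it can be checked with io.StringIO without touching the disk (a minimal sketch):

import io

from myutil import write_with_tab

buf = io.StringIO()
write_with_tab(buf, 'title', 'https://example.com')
print(repr(buf.getvalue()))  # -> 'title\thttps://example.com'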