If any of the title, og:description, or h1–h4 elements of a page returned by a Google search contains a specific keyword, the title and URL of that page are written to a text file.
If the URL is already in the text file, skip processing.
Python 3.8.2
After building a virtual environment with venv etc., install the dependencies and run the script:
pip install -r requirements.txt
python main.py
requirements.txt
beautifulsoup4 == 4.9.1
requests == 2.24.0
settings.py
settings = {
    # Keywords joined with spaces to form the Google search query
    'google_search_keywords': ['Medical', 'corona'],
    # Number of search results to request (passed as Google's `num` parameter)
    'google_search_num': 10,
    # A hit page is kept only if its title / og:description / h1-h4
    # contains at least one of these keywords
    'search_keywords_in_page': ['Medical']
}
main.py
import urllib.parse
import re
import requests
import bs4
from settings import settings
from output import OutputText
def get_ogdesc_from_soup(soup: bs4.BeautifulSoup) -> str:
    """Return the content of <meta property="og:description" content="...">.

    The tag is looked up in the given BeautifulSoup document; an empty
    string is returned when no such tag (with a content attribute) exists.
    """
    tag = soup.find('meta', attrs={'property': 'og:description', 'content': True})
    return tag['content'] if tag else ''
def get_href_from_soup(soup):
    """Extract the destination URL from a Google search-result anchor.

    Google wraps result links as ``...q=<real url>&sa=...``; this pulls out
    the ``http...`` part up to the trailing ``&sa`` marker and decodes any
    percent-escapes.

    Accepts anything exposing ``.get('href')`` (a bs4 Tag in practice).

    Bug fix: the original called ``.group()`` on the result of
    ``re.search`` unconditionally, raising AttributeError whenever the
    href did not match (and TypeError when href was missing). Now the
    raw href is decoded and returned unchanged in that case.
    """
    href = soup.get('href') or ''
    # group(1) is everything from "http" up to (excluding) the last "&sa",
    # equivalent to the original full-match-minus-3-chars slice.
    match = re.search(r'(http.+)&sa', href)
    if match:
        href = match.group(1)
    return urllib.parse.unquote(href)  # decode percent-escapes
def do_google_search(keywords, search_num, timeout=10):
    """Perform a Google search and return the list of hit URLs.

    Args:
        keywords: list of keyword strings, joined with spaces into one query.
        search_num: number of results to request (Google ``num`` parameter).
        timeout: seconds to wait for the HTTP response (new keyword,
            default 10; the original could hang forever).

    Returns:
        A list of result-page URLs.

    Raises:
        requests.HTTPError: if Google answers with an error status
            (the original silently parsed error pages).
    """
    url = 'https://www.google.co.jp/search'
    params = {
        'hl': 'ja',
        'num': search_num,
        'q': ' '.join(keywords),
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    # NOTE: `.kCrYT` is a Google-markup class name and may need updating
    # when Google changes its result-page HTML.
    page = bs4.BeautifulSoup(response.content, 'html.parser')
    anchors = page.select('.kCrYT > a')
    return [get_href_from_soup(anchor) for anchor in anchors]
def main():
    """Search Google, keep pages whose text contains a keyword, record title/URL."""
    output_text = OutputText('output.txt')
    urls = do_google_search(settings['google_search_keywords'], settings['google_search_num'])
    # Read the already-recorded URLs once and keep the set in sync as we
    # write (the original re-read the whole file on every iteration).
    seen_urls = set(output_text.get_urls())
    for url in urls:
        # Skip URLs that are already in the text file
        if url in seen_urls:
            continue
        try:
            response = requests.get(url, timeout=10)
            response.encoding = 'utf-8'
            response.raise_for_status()
        except requests.RequestException:
            # Skip pages that cannot be fetched. The original bare
            # ``except:`` also swallowed KeyboardInterrupt/SystemExit.
            continue
        soup = bs4.BeautifulSoup(response.content, 'html.parser')
        titles = [tag.text for tag in soup.select('title')]
        # Texts searched for keywords: titles, og:description, h1-h4.
        candidates = titles + [get_ogdesc_from_soup(soup)]
        for level in ('h1', 'h2', 'h3', 'h4'):
            candidates += [tag.text for tag in soup.select(level)]
        # Skip the page unless some candidate text contains a keyword
        # (any() short-circuits; the original always scanned everything).
        if not any(keyword in text
                   for keyword in settings['search_keywords_in_page']
                   for text in candidates):
            continue
        # Write to the text file
        title = '**No title**' if not titles else titles[0].strip().replace('\n', '')
        output_text.write(title, url)
        seen_urls.add(url)
    # Output a text file in an easy-to-read format
    output_text.output_readable_file()
output.py
import myutil as u
import os
class OutputText:
    """A tab-separated record file: one "title<TAB>url" record per line.

    Also produces a human-readable companion file via output_readable_file().
    """

    def __init__(self, file_path):
        # Path of the backing text file; created empty if it does not exist.
        self.file_path = file_path
        if not os.path.isfile(file_path):
            # ``with`` guarantees the handle is closed even on error
            # (the original used an explicit open()/close() pair).
            with open(self.file_path, 'w', encoding='utf-8'):
                pass

    def write(self, title, url):
        """Append one "title<TAB>url" record followed by a newline."""
        with open(self.file_path, mode='a', encoding='utf-8') as f:
            u.write_with_tab(f, title, url)
            f.write('\n')

    def get_urls(self):
        """Return every URL currently recorded in the file."""
        return [self.get_url(line) for line in self.get_lines()]

    def output_readable_file(self):
        """Write a readable companion file (*_readable.txt) with separators."""
        readable_path = self.file_path.replace('.txt', '_readable.txt')
        with open(readable_path, mode='w', encoding='utf-8') as f:
            for line in self.get_lines():
                f.write(self.get_title(line) + '\n' + self.get_url(line) + '\n')
                f.write('\n------------------------------\n\n')

    def get_lines(self):
        """Return the file's record lines (the whole file is read at once)."""
        with open(self.file_path, mode='r', encoding='utf-8') as f:
            return f.read().strip().split('\n')

    def get_title(self, line):
        """Return the first tab-separated field of *line* ('' if absent)."""
        fields = line.split('\t')
        return fields[0] if len(fields) >= 1 else ''

    def get_url(self, line):
        """Return the second tab-separated field of *line* ('' if absent)."""
        fields = line.split('\t')
        return fields[1] if len(fields) >= 2 else ''
myutil.py
def write_with_tab(file, *strings):
    """Write *strings* to *file* separated by single tab characters.

    No trailing tab or newline is added; the file object is returned
    so calls can be chained.
    """
    file.write('\t'.join(strings))
    return file
        Recommended Posts