Following up on web scraping with BeautifulSoup4 (sequentially numbered pages), I wrote code for a layered page structure, so I am leaving a note here.
If you build lists in the order category → page → target file and work through them one entry at a time, it is easy to resume even if the run is interrupted partway through.
scraper.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
try:
    # Python 3
    from urllib import request
    from urllib.error import URLError
except ImportError:
    # Python 2
    import urllib2 as request
    from urllib2 import URLError
from bs4 import BeautifulSoup
import time, os, codecs, string, json

domain = 'http://hoge.com'
wait_sec = 3
headers = {'User-Agent': 'Mozilla/5.0'}

cwd = os.getcwd()
result_file = cwd + '/result_url.txt'
category_file = cwd + '/category.txt'
page_file = cwd + '/page.txt'
def fetchSoup(url):
    # Wait between requests so as not to hammer the server
    time.sleep(wait_sec)
    req = request.Request(url, headers=headers)
    try:
        print('open {url}'.format(url=url))
        response = request.urlopen(req)
        print('ok')
        body = response.read()
        return BeautifulSoup(body, 'lxml')
    except URLError as e:
        print('error: {reason}'.format(reason=e.reason))
        return None
def getUrl(src):
    return '{domain}{src}'.format(domain=domain, src=src)

def extractUrlFromTags(tags):
    result = []
    for tag in tags:
        if tag.name == 'a':
            result.append(getUrl(tag['href']))
        elif tag.name == 'img':
            result.append(getUrl(tag['src']))
    return result

def saveUrl(file_name, url_list):
    with codecs.open(file_name, 'a', 'utf-8') as f:
        f.write('{list}\n'.format(list='\n'.join(url_list)))

def deleteFirstLine(file_name):
    with codecs.open(file_name, 'r', 'utf-8') as f:
        content = f.read()
    content = content[content.find('\n') + 1:]
    with codecs.open(file_name, 'w', 'utf-8') as f:
        f.write(content)
def fetchAllCategories():
    page = 1
    while True:
        url = '{domain}/category_{page}/'.format(domain=domain, page=page)
        soup = fetchSoup(url)
        categories = soup.find('div', id='list').find_all('a')
        url_list = extractUrlFromTags(categories)
        if len(url_list):
            saveUrl(category_file, url_list)
        # 'class' is a reserved word in Python, so BeautifulSoup takes 'class_'
        page_list_last = soup.find('div', class_='pagenation').find_all('a')[-1].string
        if page_list_last not in ['>', '>>']:
            break
        page += 1
def fetchCategory():
    if not os.path.exists(category_file):
        fetchAllCategories()
    with codecs.open(category_file, 'r', 'utf-8') as f:
        result = f.readline().rstrip('\n')
    return result
def fetchAllPages():
    category = fetchCategory()
    while category != '':
        soup = fetchSoup(category)
        pages = soup.find_all('a', class_='page')
        url_list = extractUrlFromTags(pages)
        if len(url_list):
            saveUrl(page_file, url_list)
        # Remove the processed category before moving on to the next one
        deleteFirstLine(category_file)
        category = fetchCategory()
def fetchPage():
    # Rebuild the page list if it does not exist yet or categories remain unprocessed
    if not os.path.exists(page_file) or fetchCategory() != '':
        fetchAllPages()
    with codecs.open(page_file, 'r', 'utf-8') as f:
        result = f.readline().rstrip('\n')
    return result
def fetchTargets():
    page = fetchPage()
    while page != '':
        soup = fetchSoup(page)
        targets = soup.find_all('img', class_='target')
        url_list = extractUrlFromTags(targets)
        if len(url_list):
            saveUrl(result_file, url_list)
        deleteFirstLine(page_file)
        page = fetchPage()

fetchTargets()
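Because each URL is deleted from category.txt and page.txt as soon as it has been processed, simply rerunning the script after an interruption resumes from where the previous run stopped.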
When names and the like are categorized by initial letter
alphabet_l = list(string.ascii_lowercase)
alphabet_u = list(string.ascii_uppercase)
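As a rough sketch (the per-letter URL pattern /category_a/ below is an assumption, not the real layout of the site), these lists can be used to generate one listing URL per initial letter:
# Hypothetical: one category listing page per initial letter
for letter in alphabet_l:
    url = '{domain}/category_{letter}/'.format(domain=domain, letter=letter)
    soup = fetchSoup(url)
    # ...then collect links just as in fetchAllCategories()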
To process variables and the like extracted from a script tag
data = json.loads(json_string)
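For example, when a page embeds its data as a JSON literal inside a script tag (the variable name data_json below is hypothetical, and the exact splitting depends on how the site writes the tag), it can be pulled out roughly like this:
# Hypothetical: the page contains 'var data_json = {...};' in a script tag
script = soup.find('script', string=lambda s: s and 'data_json' in s)
json_string = script.string.split('=', 1)[1].strip().rstrip(';')
data = json.loads(json_string)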
To run it in the background on a VPS or the like so that it keeps going even after you log off
$ nohup python scraper.py < /dev/null &
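Since standard output is not redirected here, whatever the script prints is appended to nohup.out, so progress can be followed with
$ tail -f nohup.out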
To check that the process is still running
$ ps x