J'ai écrit un script qui parcourt des pages dont les URL sont numérotées de façon séquentielle et qui dresse la liste des URL à télécharger en une seule fois ; je le note ici pour mémoire.
$ apt-get install python-lxml
$ pip install beautifulsoup4
scraper.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
try:
# Python 3
from urllib import request
except ImportError:
# Python 2
import urllib2 as request
from bs4 import BeautifulSoup
import codecs
import time
def getSoup(url):
    """Fetch *url* and return its HTML parsed as a BeautifulSoup tree.

    The page is downloaded with ``request.urlopen`` and parsed with the
    ``lxml`` parser.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response body.

    Raises:
        Any exception raised by ``request.urlopen`` or ``read`` is
        propagated unchanged to the caller.
    """
    response = request.urlopen(url)
    try:
        body = response.read()
    finally:
        # Explicit close instead of ``with``: the Python 2 ``urllib2``
        # response object is not a context manager.
        response.close()
    # Parse HTML
    return BeautifulSoup(body, 'lxml')
# Seconds to pause between two page requests.
wait_sec = 3
# Base URL; pages are requested as <domain>/01/, <domain>/02/, ...
domain = 'http://hoge.com'
# Output file; image URLs are appended to it, one per line.
result_file = 'list.txt'

# Walk the numbered pages until one can no longer be fetched.
i = 1
while True:
    # Page number is left-padded with zeros to at least two digits.
    url = '{domain}/{index:0>2}/'.format(domain=domain, index=i)
    try:
        soup = getSoup(url)
    except IOError:
        # A failed fetch is treated as the end of the numbered series.
        break

    div = soup.find('div', attrs={'id': 'div_id'})
    if div is None:
        # Without the expected container there is nothing to extract;
        # stop cleanly instead of failing with AttributeError on None.
        print('container div not found, stopping at: {0}'.format(url))
        break

    src_list = []
    for a in div.find_all('a', attrs={'class': 'a_class'}):
        img = a.img
        # Skip links that wrap no <img>, or an <img> without a src.
        src = img.get('src') if img is not None else None
        if src:
            src_list.append(src)

    with codecs.open(result_file, 'a', 'utf-8') as f:
        # Terminate every entry with a newline: the file is opened in
        # append mode, so a join without a trailing newline would glue
        # the last URL of this page to the first URL of the next one.
        f.write(''.join(src + '\n' for src in src_list))

    print(i)
    i += 1
    time.sleep(wait_sec)
[Python : scraping de sites Web avec BeautifulSoup4](http://momijiame.tumblr.com/post/114227737756/python-beautifulsoup4-%E3%82%92%E4%BD%BF%E3%81%A3%E3%81%A6-web-%E3%82%B5%E3%82%A4%E3%83%88%E3%82%92%E3%82%B9%E3%82%AF%E3%83%AC%E3%82%A4%E3%83%94%E3%83%B3%E3%82%B0%E3%81%99%E3%82%8B)
Scraping avec Python et Beautiful Soup
Recommended Posts