Scrapy : 1.3.3 lxml : 3.7.3.0 libxml2 : 2.9.3 cssselect : 1.0.1 parsel : 1.1.0 w3lib : 1.17.0 Twisted : 17.1.0 Python : 2.7.9 (default, Mar 6 2017, 10:54:15) - [GCC 4.4.7 20120313 (Red Hat 4.4.7-16)] pyOpenSSL : 16.2.0 (OpenSSL 1.0.1e-fips 11 Feb 2013) Platform : Linux-2.6.32-431.el6.x86_64-x86_64-with-centos-6.5-Final
spider_movices.py
python:scrapy_movices.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from ygdy8_20170508.items import moviceItem
#from scrapy.linkextractors import LinkExtractor
class MySpider(CrawlSpider):
    """Crawl pages reachable from ``start_urls`` and yield their download links.

    Every extracted link whose URL matches ``\\d+\\.html`` is followed and
    handed to :meth:`parse_item`, which yields one ``moviceItem`` per page.
    """

    name = 'Myspider'
    allowed_domains = ['ygdy8.com']
    start_urls = ['http://www.ygdy8.com/']

    # NOTE(review): SgmlLinkExtractor lives in the deprecated ``scrapy.contrib``
    # package; ``scrapy.linkextractors.LinkExtractor`` is the documented
    # replacement -- confirm and switch the import when convenient.
    rules = [
        Rule(SgmlLinkExtractor(allow=(r'\d+\.html',)),
             callback='parse_item',
             follow=True),
    ]

    @staticmethod
    def _link_hrefs(response):
        """Return the ``href`` of every link inside the WORD-WRAP table cells."""
        return response.css('td[style*=WORD-WRAP] a::attr(href)').extract()

    def parse_item(self, response):
        """Yield one ``moviceItem`` holding the page title and its links.

        For URLs containing ``game`` the link ``href`` values are used
        directly.  For every other page the visible text of the anchors that
        are direct children of a WORD-WRAP cell and whose ``href`` contains
        ``ftp`` is preferred; if there is no such text, or any of it does not
        start with ``ftp``, the ``href`` values are used instead.

        Both fields are stored as UTF-8 encoded byte strings.
        """
        self.log(response.url)

        if 'game' in response.url:
            links = self._link_hrefs(response)
        else:
            # ``@href`` and the quoted literal are required: a bare
            # ``contains(href, ftp)`` compares two (empty) child-element
            # node-sets and is therefore always true.
            links = response.css('td[style*=WORD-WRAP]').xpath(
                'a[contains(@href, "ftp")]/text()').extract()
            if not links or not all(text.startswith(u'ftp') for text in links):
                links = self._link_hrefs(response)

        item = moviceItem()
        # A list comprehension (rather than ``map``) guarantees a real list,
        # which the pipeline relies on when it checks for emptiness.
        item['ftp'] = [link.encode('utf-8') for link in links]
        item['name'] = ''.join(
            response.xpath('//title/text()').extract()).encode('utf-8')
        yield item
items.py
items.py
# -*- coding: utf-8 -*-
import scrapy
class moviceItem(scrapy.Item):
    """Scraped record with two fields: ``name`` and ``ftp``."""

    # Filled by the spider from the page's <title> text.
    name = scrapy.Field()

    # Filled by the spider with the list of link strings found on the page.
    ftp = scrapy.Field()
pipelines.py
pipelines.py
# -*- coding: utf-8 -*-
import codecs
import json

from scrapy.exceptions import DropItem
def _to_text(value):
    """Return *value* with UTF-8 byte strings decoded, recursing into lists/tuples."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    if isinstance(value, (list, tuple)):
        return [_to_text(element) for element in value]
    return value


class Ygdy820170508Pipeline(object):
    """Write each item that has links to ``movice.json``, one JSON object per line."""

    def __init__(self):
        # codecs.open returns a writer that accepts text and encodes it as
        # UTF-8 on the way out.
        self.file = codecs.open('movice.json', 'wb', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file so buffered lines are flushed to disk."""
        self.file.close()

    def process_item(self, item, spider):
        """Serialise *item* as one JSON line and return it unchanged.

        Raises:
            DropItem: if ``item['ftp']`` is empty.
        """
        if not item['ftp']:
            raise DropItem('Drop %s the item' % item)

        # Decode byte strings first so that text and bytes are never mixed
        # inside the serialiser.
        record = {key: _to_text(value) for key, value in dict(item).items()}

        # ensure_ascii=False keeps non-ASCII characters readable while leaving
        # JSON's own escapes (\" \\ \n) intact; decoding the dump with
        # 'unicode_escape' would undo those escapes and emit invalid JSON.
        line = json.dumps(record, ensure_ascii=False) + '\n'
        if isinstance(line, bytes):
            # Python 2 returns a byte string when the record holds no text.
            line = line.decode('utf-8')
        self.file.write(line)
        return item