Crawling a movie website with Scrapy

Crawling a movie website with Scrapy

Environment

Scrapy : 1.3.3 lxml : 3.7.3.0 libxml2 : 2.9.3 cssselect : 1.0.1 parsel : 1.1.0 w3lib : 1.17.0 Twisted : 17.1.0 Python : 2.7.9 (default, Mar 6 2017, 10:54:15) - [GCC 4.4.7 20120313 (Red Hat 4.4.7-16)] pyOpenSSL : 16.2.0 (OpenSSL 1.0.1e-fips 11 Feb 2013) Platform : Linux-2.6.32-431.el6.x86_64-x86_64-with-centos-6.5-Final

spider_movices.py

python:scrapy_movices.py



# -*- coding: utf-8 -*-

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from ygdy8_20170508.items import moviceItem
#from  scrapy.linkextractors import LinkExtractor

class MySpider(CrawlSpider):
    """Crawl ygdy8.com and yield one ``moviceItem`` per numbered page.

    Every link whose URL contains digits followed by ``.html`` is followed
    and passed to :meth:`parse_item`.
    """

    name = 'Myspider'
    allowed_domains = ['ygdy8.com']
    start_urls = ['http://www.ygdy8.com/']

    # NOTE(review): SgmlLinkExtractor lives in the deprecated scrapy.contrib
    # namespace; scrapy.linkextractors.LinkExtractor looks like the intended
    # replacement -- confirm before upgrading Scrapy.
    rules = [
        Rule(SgmlLinkExtractor(allow=(r'\d+\.html',)),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        """Extract the download links and the page title from ``response``.

        Pages whose URL contains ``game`` use the ``href`` of every anchor
        inside the WORD-WRAP styled table cells.  Other pages prefer the
        visible text of anchors whose ``href`` contains ``ftp`` and fall
        back to the ``href`` values when no such text is found or when any
        of the texts does not start with ``ftp``.

        Yields:
            moviceItem: ``ftp`` is a list of UTF-8 encoded links, ``name``
            is the UTF-8 encoded text of the page's ``<title>``.
        """
        self.log(response.url)
        href_css = 'td[style*=WORD-WRAP] a::attr(href)'

        if 'game' in response.url:
            ftps = response.css(href_css).extract()
        else:
            # ``@href`` and the quoted "ftp" are required: a bare
            # ``contains(href, ftp)`` compares two (empty) child-element
            # node-sets and is therefore always true.
            ftps = response.css('td[style*=WORD-WRAP]').xpath(
                'a[contains(@href, "ftp")]/text()').extract()
            # The anchor text is only usable when every entry is an ftp URL.
            if not ftps or not all(text.startswith('ftp') for text in ftps):
                ftps = response.css(href_css).extract()

        item = moviceItem()
        item['ftp'] = [link.encode('utf-8') for link in ftps]
        item['name'] = ''.join(
            response.xpath('//title/text()').extract()).encode('utf-8')
        yield item

items.py

items.py


# -*- coding: utf-8 -*-

import scrapy

class moviceItem(scrapy.Item):
    """Container for the data scraped from a single page."""

    # Download links collected from the page.
    ftp = scrapy.Field()
    # Text of the page's <title> element.
    name = scrapy.Field()


pipelines.py

pipelines.py


# -*- coding: utf-8 -*-
import codecs
import json

class Ygdy820170508Pipeline(object):
    """Append every scraped item to ``movice.json``, one JSON object per line."""

    def __init__(self):
        # The codecs writer encodes the text passed to write() as UTF-8.
        self.file = codecs.open('movice.json', 'wb', encoding='utf-8')

    def process_item(self, item, spider):
        """Write ``item`` to the output file as a single JSON line.

        Args:
            item: mapping-like item; its ``ftp`` entry holds the links.
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged ``item``.

        Raises:
            DropItem: if ``item['ftp']`` is empty.
        """
        if not item['ftp']:
            # Imported locally so the name is in scope without touching the
            # module's import block.
            from scrapy.exceptions import DropItem
            raise DropItem('Drop %s the item' % item)

        # ensure_ascii=False keeps non-ASCII characters readable while leaving
        # the JSON escapes (\" \\ \n) intact; decoding the dump with
        # 'unicode_escape' would undo those escapes and corrupt the line.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        if isinstance(line, bytes):
            # Python 2 returns a byte string when every value is a byte string.
            line = line.decode('utf-8')
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file so buffered lines are flushed to disk."""
        self.file.close()

Download results

Recommended Posts

Use scrapy movie network
network