beijing.py
beijing.py
# -*- coding: utf-8 -*-
import scrapy
from zufang.items import ZufangItem
class BeijingSpider(scrapy.Spider):
    """Spider that walks the 58.com Beijing rental listing pages.

    Starting from ``start_urls``, every ``.des`` node on a page is turned
    into one ``ZufangItem``; the spider then follows the ``.next`` link, if
    present, and parses that page the same way.
    """

    name = "beijing"
    allowed_domains = ["58.com"]
    start_urls = ['http://bj.58.com/chuzu/']

    def parse(self, response):
        """Yield one ``ZufangItem`` per listing, then a request for the next page."""
        for listing in response.css('.des'):
            # Build a fresh item per listing: reusing a single instance would
            # make every yielded item alias the same object and let fields
            # from one listing leak into the next.
            item = ZufangItem()

            # Keep the title as text; encoding belongs at the output boundary.
            item['title'] = ''.join(listing.css('h2>a::text').extract()).strip()
            item['weburl'] = listing.css('h2>a::attr(href)').extract_first()
            item['addres'] = listing.css('.add>a:first-child::text').extract_first()

            # The ``.room`` text is expected to hold "<roomtype> <size>".
            # If it is missing or has fewer than two tokens, both fields
            # are left empty.
            room_parts = (listing.css('.room::text').extract_first() or '').split()
            if len(room_parts) >= 2:
                item['roomtype'] = room_parts[0]
                item['size'] = room_parts[1]
            else:
                item['roomtype'] = ''
                item['size'] = ''

            agent = listing.css('.jjr')
            if not agent:
                item['contacts'] = ''
                # Always populate ``company`` so every item has the same keys.
                item['company'] = ''
            else:
                # ``extract_first()`` returns None when nothing matches.
                shop_name = (agent.css('span>span::text').extract_first() or '').strip()
                item['contacts'] = agent.css('.listjjr>a::text').extract_first()
                item['company'] = shop_name

            item['price'] = listing.xpath(
                'following-sibling::div[@class="listliright"]'
                '/div[@class="money"]/b/text()'
            ).extract_first()
            yield item

        next_page = response.css('.next::attr(href)').extract_first()
        if next_page:
            # Resolve relative hrefs against the current page URL;
            # scrapy.Request rejects URLs without a scheme.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
items.py
items.py
import scrapy
class ZufangItem(scrapy.Item):
    """Container for the fields scraped from a single rental listing."""

    # Listing headline and the link it points to.
    title = scrapy.Field()
    weburl = scrapy.Field()

    # Location text. NOTE: the key is spelled ``addres`` (sic); the spider
    # assigns to this exact name, so it must not be "corrected" here alone.
    addres = scrapy.Field()

    # First and second whitespace-separated tokens of the listing's room text.
    roomtype = scrapy.Field()
    size = scrapy.Field()

    # Price text of the listing.
    price = scrapy.Field()

    # Contact person and the shop/company name shown for the listing.
    contacts = scrapy.Field()
    company = scrapy.Field()
pipelines.py
pipelines.py
import codecs
import json
class ZufangPipeline(object):
    """Item pipeline that appends each item to ``zufang.json`` as one JSON line.

    The file is opened when the pipeline is constructed and closed in
    ``close_spider``.
    """

    def __init__(self):
        # codecs.open returns a writer that encodes written text as UTF-8.
        self.file = codecs.open('zufang.json', 'wb', encoding='utf-8')

    def process_item(self, item, spider):
        """Write *item* as a single JSON line and return it unchanged.

        Args:
            item: The scraped item (anything ``dict()`` accepts).
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines can keep processing it.
        """
        # Normalise UTF-8 byte strings to text so they serialise cleanly
        # alongside the text values.
        record = {
            key: value.decode('utf-8') if isinstance(value, bytes) else value
            for key, value in dict(item).items()
        }
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        # Decoding the dump with 'unicode_escape' instead would also undo the
        # JSON escaping of quotes, backslashes and newlines and corrupt the
        # output.
        line = json.dumps(record, ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file so buffered data is flushed to disk."""
        self.file.close()
Recommended Posts