from scrapy.spiders import CrawlSpider,Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from teizi.items import TeiziItem
from scrapy import log
class XunduSpider(CrawlSpider):
    """Crawl xunread.com chapter pages and yield title/content items.

    Starts at one article index page and follows every chapter link
    matching ``<digits>.shtml``, extracting the chapter title and body
    into a TeiziItem.
    """
    name = "teizi"
    download_delay = 1
    # BUG FIX: allowed_domains takes bare domain names, not URLs.
    # With 'http://www.xunread.com/' the offsite middleware rejected
    # every followed link, so parse_item never ran on chapter pages.
    allowed_domains = ['xunread.com']
    start_urls = ["http://www.xunread.com/article/8c39f5a0-ca54-44d7-86cc-148eee4d6615/index.shtml"]
    # Raw string so \d is a regex digit class (non-raw '\d' is a
    # deprecated/invalid escape on Python 3); matches chapter URLs
    # ending in "<digits>.shtml".
    rules = [Rule(LinkExtractor(allow=(r'\d+\.shtml',)), callback='parse_item', follow=True)]

    def parse_item(self, response):
        """Extract chapter title and content from a chapter page.

        Yields:
            TeiziItem with 'title' and 'content' set to lists of
            UTF-8-encoded byte strings taken from the page's
            script-embedded divs.
        """
        # scrapy.log is deprecated; Spider.logger is the supported API.
        self.logger.info("parse_item: %s", response.url)
        # BUG FIX: instantiate the item — the original assigned the
        # class object itself, so item['content'] raised TypeError.
        item = TeiziItem()
        sel = Selector(response)
        script_content = sel.xpath('//div[@id="content"]/script/div/text()').extract()
        script_title = sel.xpath('//div[@id="title"]/script/div/text()').extract()
        item['content'] = [n.encode('utf-8') for n in script_content]
        item['title'] = [n.encode('utf-8') for n in script_title]
        yield item