使用scrapy框架写的爬虫项目代码

源代码文件:

import scrapy

from ..items import Db250Item


class W666Spider(scrapy.Spider):
    """Crawl Douban Movie Top250 and yield one Db250Item per movie.

    Paginates by incrementing ``page_num`` and requesting
    ``?start=page_num*25`` until a page with no ``div.info`` nodes is hit.
    """

    name = "w666"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["http://movie.douban.com/top250"]
    page_num = 0

    def parse(self, response):
        node_list = response.xpath('//div[@class="info"]')
        if not node_list:
            # An empty page means we are past the last page — stop paginating.
            return

        for node in node_list:
            item = Db250Item()
            item["movies_name"] = node.xpath('.//div/a/span/text()').get()

            director = node.xpath('./div/p/text()').get()
            # BUG FIX: .get() may return None; calling .strip() on it crashed.
            item["director"] = director.strip() if director else director

            # BUG FIX: the original used the absolute path '//span[...]', which
            # searches the whole document and returned the FIRST movie's score
            # for every item; './/' makes the query relative to this node.
            item["score"] = node.xpath('.//span[@class="rating_num"]/text()').get()
            yield item

        # Request the next page (25 movies per page).
        self.page_num += 1
        new_url = "https://movie.douban.com/top250?start={}&filter=".format(self.page_num * 25)
        yield scrapy.Request(new_url, callback=self.parse)

items.py:

import scrapy


class Db250Item(scrapy.Item):
    """Container for one scraped Douban Top250 entry."""

    # Movie title as shown on the list page.
    movies_name = scrapy.Field()
    # Raw director/cast line (first <p> text of the info block).
    director = scrapy.Field()
    # Rating score text, e.g. "9.7".
    score = scrapy.Field()

管道pipelines.py:

import json


class Db250Pipeline:
    """Persist scraped items as one JSON object per line in ``wxin.txt``."""

    def open_spider(self, spider):
        # Opened once when the spider starts; closed in close_spider.
        self.f = open("wxin.txt", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese titles human-readable in the file;
        # one JSON object per line (JSON Lines format).
        json_str = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.f.write(json_str)
        return item

    def close_spider(self, spider):
        # BUG FIX: Scrapy calls close_spider(spider); the original signature
        # `def close_spider(self)` raised TypeError and leaked the file handle.
        self.f.close()

settings.py需要修改以及激活的内容:

# Douban returns no data when robots.txt is obeyed; the default is True and
# must be changed to False or the spider scrapes nothing.
ROBOTSTXT_OBEY = False

# A real browser User-Agent (copied from the browser's Network tab) is
# required — without it the requests are most likely blocked.
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"
    ),
}

# The item pipeline must be activated here for items to be persisted.
ITEM_PIPELINES = {
    "db250.pipelines.Db250Pipeline": 300,
}
经验分享 程序员 微信小程序 职场和发展