Scrapy --- Spider Middleware and Downloader Middleware
Spider Middleware
# Spider middleware (for awareness only) -- middlewares.py
from scrapy import signals


class MysfirstscrapySpiderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
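Middleware does nothing until it is enabled in settings.py. A minimal sketch, assuming the project is named mysfirstscrapy (the integer controls ordering relative to the built-in middlewares):

# settings.py (sketch -- the project name mysfirstscrapy is an assumption)
SPIDER_MIDDLEWARES = {
    'mysfirstscrapy.middlewares.MysfirstscrapySpiderMiddleware': 543,
}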
Downloader Middleware
- On the way out it sees the Request object:
    - add a proxy
    - add cookies
    - add request headers
- On the way back it sees the Response object:
    - modify the response object; what finally reaches the spider's parse method is the modified response
# Downloader middleware
class MysfirstscrapyDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Called for every request on its way out
    def process_request(self, request, spider):
        # Possible return values:
        # return None: keep handling this request and move on to the next middleware's process_request
        # return a Response: skip the download, run back through process_response, into the engine,
        #                    get scheduled, and end up in the spider's parse method (step 6)
        # return a Request: handed straight back to the engine and put into the scheduler,
        #                   waiting to be crawled on a later pass (step 2)
        # raise IgnoreRequest: process_exception is called
        return None

    # Called for every response on its way back
    def process_response(self, request, response, spider):
        # Possible return values:
        # return a Response: keep going, back to the engine, scheduled into the spider for parsing
        # return a Request: back to the engine, scheduled into the scheduler
        # raise IgnoreRequest: process_exception is called
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

# Enable it in the settings file (see the sketch below)
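The comment at the end refers to settings.py. A minimal sketch, again assuming the project is named mysfirstscrapy:

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'mysfirstscrapy.middlewares.MysfirstscrapyDownloaderMiddleware': 543,
}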
1. Adding a proxy
# Write the code in the downloader middleware's process_request(self, request, spider)

# Step 1: add a get_proxy helper and use it in process_request
def get_proxy(self):
    import requests
    res = requests.get('http://127.0.0.1:5010/get/').json()
    if res.get('https'):
        return 'https://' + res.get('proxy')
    else:
        return 'http://' + res.get('proxy')

def process_request(self, request, spider):
    request.meta['proxy'] = self.get_proxy()
    return None

# Step 2: the proxy may be dead, which triggers process_exception; handle it there
def process_exception(self, request, exception, spider):
    print('-----', request.url)  # this URL was not crawled
    return request               # return the request so it is rescheduled and retried
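The code above assumes a local proxy-pool service listening on 127.0.0.1:5010 that returns JSON with proxy and https fields. If no such service is available, one hedged alternative is to rotate through a hardcoded list; the addresses below are placeholders, not real proxies:

# Sketch: rotate proxies from a hardcoded list (inside the downloader middleware class)
import random

PROXIES = [
    'http://111.111.111.111:8888',  # placeholder
    'http://222.222.222.222:8888',  # placeholder
]

def process_request(self, request, spider):
    request.meta['proxy'] = random.choice(PROXIES)
    return None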
2. Adding cookies, modifying request headers, and generating a random User-Agent
2.1 Adding cookies
def process_request(self, request, spider):
    print(request.cookies)
    request.cookies['name'] = 'lqz'
    return None
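Cookies can also be attached per request from the spider itself, through the cookies argument of scrapy.Request; a minimal sketch (the URL is a placeholder):

# Sketch: setting cookies from the spider instead of the middleware
import scrapy

def start_requests(self):  # inside the spider class
    yield scrapy.Request(
        url='https://example.com',   # placeholder URL
        cookies={'name': 'lqz'},     # merged into this request's cookies
        callback=self.parse,
    )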
2.2 Modifying request headers
def process_request(self, request, spider):
    print(request.headers)
    request.headers['referer'] = 'http://www.lagou.com'
    return None
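If the same headers should go out on every request, Scrapy's DEFAULT_REQUEST_HEADERS setting is an alternative to doing it in middleware; a sketch in settings.py:

# settings.py (sketch)
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Referer': 'http://www.lagou.com',
}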
2.3 Generating a random User-Agent
Install the module first:
pip install fake_useragent
def process_request(self, request, spider):
    # fake_useragent module
    from fake_useragent import UserAgent
    ua = UserAgent()
    request.headers['User-Agent'] = str(ua.random)
    print(request.headers)
    return None
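A quick way to see what fake_useragent produces outside of Scrapy: ua.random picks a random browser User-Agent string, and attributes such as ua.chrome or ua.firefox pin the browser family:

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)   # a random User-Agent string
print(ua.chrome)   # a random Chrome User-Agent
print(ua.firefox)  # a random Firefox User-Agent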