A simple example of downloading a comic with the Scrapy framework

Defining the items
import scrapy


class CartoonItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    chapter = scrapy.Field()   # chapter title
    url = scrapy.Field()       # chapter detail-page URL
    img_url = scrapy.Field()   # list of image URLs in the chapter
    img_path = scrapy.Field()  # local save path (declared but not used below)
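A CartoonItem is used like a dict, except that only the fields declared above may be set. A minimal sketch of how the spider below fills one in (the values here are made-up placeholders):

item = CartoonItem()
item["chapter"] = "第1话"                                       # hypothetical chapter title
item["url"] = "https://www.dmzj.com/view/yaoshenji/xxxx.html"   # hypothetical detail-page url
item["img_url"] = []                                            # filled in later by parse_jpg
print(item["chapter"], item["url"])
# item["author"] = "..." would raise KeyError, because "author" is not declared in CartoonItem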
Defining the pipeline

The pipeline receives the items and downloads the images with urllib.
import os
from urllib.request import urlretrieve


class CartoonPipeline:
    def open_spider(self, spider):
        # create the comic's root directory once, when the spider starts
        if not os.path.exists("./妖神记"):
            os.mkdir("./妖神记")

    def process_item(self, item, spider):
        # one sub-directory per chapter
        chapter_dir = f"./妖神记/{item['chapter']}"
        if not os.path.exists(chapter_dir):
            os.mkdir(chapter_dir)
        # record the chapter's detail-page url
        with open(f"{chapter_dir}/url.txt", "w", encoding="utf-8") as f:
            f.write(f"url: {item['url']}")
        # download every image of the chapter
        for i, img_url in enumerate(item["img_url"]):
            urlretrieve(url=img_url, filename=f"{chapter_dir}/{i}.jpg")
        print(item)
        return item
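Scrapy only calls this pipeline if it is enabled in the project's settings.py. A minimal sketch, assuming the default project layout where the class above lives in cartoon/pipelines.py:

# settings.py
ITEM_PIPELINES = {
    "cartoon.pipelines.CartoonPipeline": 300,   # the number is the priority; lower values run first
}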
The spider class
import scrapy
import re

from cartoon.items import CartoonItem


class YsjSpider(scrapy.Spider):
    name = "ysj"
    allowed_domains = ["dmzj.com"]
    start_urls = ["https://www.dmzj.com/info/yaoshenji.html"]
Crawling the chapter list page
    def parse(self, response):
        # the // expression needs parentheses before [2] can select the 2nd <ul>
        # list [1] is in descending order, list [2] is ascending
        tr_list = response.xpath('(//ul[@class="list_con_li autoHeight"])[2]//li')
        print(len(tr_list))
        for tr in tr_list:
            item = CartoonItem()
            # chapter title
            item["chapter"] = tr.xpath("./a/@title").extract_first()
            # detail-page address
            item["url"] = tr.xpath("./a/@href").extract_first()
            yield scrapy.Request(
                url=item["url"],
                callback=self.parse_jpg,
                meta={"item": item}
            )
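The XPath can be verified interactively with scrapy shell before running the spider; a quick session might look like the following (the actual count and titles depend on the live page):

scrapy shell "https://www.dmzj.com/info/yaoshenji.html"
>>> len(response.xpath('(//ul[@class="list_con_li autoHeight"])[2]//li'))
>>> response.xpath('(//ul[@class="list_con_li autoHeight"])[2]//li/a/@title').extract_first()
>>> response.xpath('(//ul[@class="list_con_li autoHeight"])[2]//li/a/@href').extract_first()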
Extracting the jpg URLs of the comic pages

The image addresses are extracted from the page source with regular expressions, then sorted and stored in the item.
    def parse_jpg(self, response):
        item = response.meta["item"]
        item["img_url"] = []
        html_data = response.body.decode("utf-8")  # decode the raw bytes as utf-8
        # extract the image ids from the page source with a regex
        pics = re.findall(r"\d{13,14}", html_data)  # every image id of the chapter is in here
        # or: collect the unwanted ids first, then remove them
        del_lists = []
        for pic in pics:  # deleting while iterating tends to leave entries behind
            if pic[:3] == "100":
                del_lists.append(pic)
        for del_list in del_lists:
            pics.remove(del_list)
        # sort the ids
        pics = self.sort_pics(pics)
        chapterpic_hou = re.findall(r"\|(\d{5})\|", html_data)[0]   # findall returns a list; take the first hit
        chapterpic_qian = re.findall(r"\|(\d{4})\|", html_data)[0]
        for pic in pics:
            img_url = f"https://images.dmzj.com/img/chapterpic/{chapterpic_qian}/{chapterpic_hou}/{pic}.jpg"
            item["img_url"].append(img_url)
            print(img_url)
        yield item

    def sort_pics(self, pics):
        # pad with 0, sort, then remove the 0
        # for pic in pics:
        #     if len(pic) == 13:
        #         pic += "0"  # rebinding the loop variable does not change the list
        # modify the list entries through the index instead
        for i, pic in enumerate(pics):
            if len(pic) == 13:
                pics[i] += "0"
        pics = sorted(pics, key=lambda x: int(x))  # ascending
        for i, pic in enumerate(pics):
            if pic[-1] == "0":
                pics[i] = pic[:-1]  # remove the 0 that was just padded on
        return pics
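To see what sort_pics does, here is a small standalone check with made-up 13/14-digit ids (the real ids come from the regex above): the 13-digit id is padded to 14 digits so the numeric sort keeps it in page order, and the padding is stripped again afterwards.

pics = ["15498257594302", "1549825759430", "15498257594301"]   # hypothetical ids
print(YsjSpider().sort_pics(pics))
# ['1549825759430', '15498257594301', '15498257594302']

Note that the final loop strips a trailing "0" from any id, so this relies on genuine 14-digit ids not ending in 0. Once everything is in place, the crawl is started from the project root with:

scrapy crawl ysj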