A simple comic downloader built on the Scrapy framework

Define the items

import scrapy


class CartoonItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    chapter = scrapy.Field()   # chapter title
    url = scrapy.Field()       # chapter detail-page URL
    img_url = scrapy.Field()   # list of image URLs within the chapter
    img_path = scrapy.Field()  # local save path (declared but not used below)
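
A scrapy.Item behaves like a dict that only accepts the declared fields, so a misspelled key fails immediately. A quick illustration with made-up values:

item = CartoonItem()
item["chapter"] = "第1话"   # fine: "chapter" is declared above
# item["chaptre"] = "..."  # would raise KeyError -- not a declared field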

Define the pipeline

The pipeline receives the items and downloads the images with urllib.

import os
from urllib.request import urlretrieve


class CartoonPipeline:

    def open_spider(self, spider):
        # create the root output directory once, when the spider starts
        if not os.path.exists("./妖神记"):
            os.mkdir("./妖神记")

    def process_item(self, item, spider):
        # one sub-directory per chapter
        if not os.path.exists(f"./妖神记/{item['chapter']}"):
            os.mkdir(f"./妖神记/{item['chapter']}")
        # save the chapter URL next to its images
        with open(f"./妖神记/{item['chapter']}/url.txt", "w", encoding="utf-8") as f:
            f.write(f"url:\n{item['url']}")

        # download every page image of the chapter
        for i, img_url in enumerate(item["img_url"]):
            urlretrieve(url=img_url,
                        filename=f"./妖神记/{item['chapter']}/{i}.jpg")

        print(item)
        return item
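
For Scrapy to route items through this pipeline it has to be enabled in settings.py. A minimal sketch, assuming the project is named cartoon (the name implied by the from cartoon.items import in the spider below):

# settings.py
ITEM_PIPELINES = {
    "cartoon.pipelines.CartoonPipeline": 300,
}

Scrapy also ships a built-in ImagesPipeline for downloading images; plain urlretrieve is used here to keep the example small.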

The spider class

import scrapy
import re
from cartoon.items import CartoonItem


class YsjSpider(scrapy.Spider):
    name = "ysj"
    allowed_domains = ["dmzj.com"]

    start_urls = ["https://www.dmzj.com/info/yaoshenji.html"]

Crawl the chapter list page

    def parse(self, response):
        # a // node set needs parentheses around it before [2] can pick the 2nd match
        tr_list = response.xpath("(//ul[@class='list_con_li autoHeight'])[2]//li")  # [1] is newest-first, [2] is oldest-first

        print(len(tr_list))
        for tr in tr_list:
            item = CartoonItem()
            # chapter title
            item["chapter"] = tr.xpath("./a/@title").extract_first()
            # detail-page URL
            item["url"] = tr.xpath("./a/@href").extract_first()

            yield scrapy.Request(
                url=item["url"],
                callback=self.parse_jpg,
                meta={"item": item}  # pass the item along to the callback
            )
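
The parentheses in that XPath matter: without them, the positional predicate [2] applies within each parent instead of to the whole result set. A minimal standalone sketch with made-up markup:

from scrapy import Selector

html = "<div><ul class='c'><li>a</li></ul></div><div><ul class='c'><li>b</li></ul></div>"
sel = Selector(text=html)
print(sel.xpath("//ul[@class='c'][2]").getall())           # [] -- "2nd ul among its siblings" never matches here
print(sel.xpath("(//ul[@class='c'])[2]/li/text()").get())  # 'b' -- 2nd ul of the collected node set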

Extract the comic jpg URLs

The image addresses are extracted from the page source with regular expressions, then sorted and stored in the item.

    def parse_jpg(self, response):
        item = response.meta["item"]
        item["img_url"] = []
        html_data = response.body.decode("utf-8")  # decode the raw bytes as utf-8

        # pull every 13-14 digit image id out of the page source with a regex
        pics = re.findall(r"\d{13,14}", html_data)  # every page of the chapter is in here

        # drop the unwanted ids that start with "100"; collect them first,
        # because removing while iterating can skip elements
        del_lists = []
        for pic in pics:
            if pic[:3] == "100":
                del_lists.append(pic)
        for del_list in del_lists:
            pics.remove(del_list)

        # sort the ids into page order
        pics = self.sort_pics(pics)

        chapterpic_hou = re.findall(r"\|(\d{5})\|", html_data)[0]   # findall returns a list; take the first match
        chapterpic_qian = re.findall(r"\|(\d{4})\|", html_data)[0]

        for pic in pics:
            item["img_url"].append(f"https://images.dmzj.com/img/chapterpic/{chapterpic_qian}/{chapterpic_hou}/{pic}.jpg")
            print(f"https://images.dmzj.com/img/chapterpic/{chapterpic_qian}/{chapterpic_hou}/{pic}.jpg")

        yield item
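
The collect-then-remove dance above can be avoided entirely by building a new list with a comprehension. A standalone sketch against an invented page fragment (the real dmzj source may differ):

import re

# hypothetical fragment, purely for illustration
html_data = "...|1502|...|14237|...15022819347710,15022819347854,10022819347800..."

pics = re.findall(r"\d{13,14}", html_data)
pics = [pic for pic in pics if not pic.startswith("100")]  # filter without mutating mid-iteration

chapterpic_hou = re.findall(r"\|(\d{5})\|", html_data)[0]   # '14237'
chapterpic_qian = re.findall(r"\|(\d{4})\|", html_data)[0]  # '1502'
print(pics)  # ['15022819347710', '15022819347854']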

    def sort_pics(self, pics):  # pad with a 0, sort, then strip the padding
        # for pic in pics:
        #     if len(pic) == 13:
        #         pic += "0"    # rebinds the loop variable; the list is not changed

        # mutate the list through its index instead
        for i, pic in enumerate(pics):
            if len(pic) == 13:
                pics[i] += "0"

        pics = sorted(pics, key=lambda x: int(x))  # ascending

        for i, pic in enumerate(pics):
            if pic[-1] == "0":
                pics[i] = pic[:-1]  # strip the 0 padded on above (assumes no genuine 14-digit id ends in 0)

        return pics
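
The commented-out first attempt in sort_pics is the classic trap: += on the loop variable rebinds a local name and never touches the list. A two-loop demonstration:

pics = ["123", "456"]
for pic in pics:
    pic += "0"        # rebinds pic only; the list is unchanged
print(pics)           # ['123', '456']

for i in range(len(pics)):
    pics[i] += "0"    # writes back through the index
print(pics)           # ['1230', '4560']

With the items, pipeline and spider in place, the crawl starts with scrapy crawl ysj from the project root.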