Simple comic downloading with the Scrapy framework
Defining the items
import scrapy


class CartoonItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    chapter = scrapy.Field()   # chapter name
    url = scrapy.Field()       # chapter detail-page URL
    img_url = scrapy.Field()   # list of image URLs for the chapter
    img_path = scrapy.Field()  # local save path (not used in the code below)
Defining the pipeline
The pipeline receives each item and downloads its images with urllib.
import os
from urllib.request import urlretrieve


class CartoonPipeline:
    def open_spider(self, spider):
        # create the root folder once, when the spider starts
        if not os.path.exists("./妖神记"):
            os.mkdir("./妖神记")

    def process_item(self, item, spider):
        # one sub-folder per chapter
        chapter_dir = f"./妖神记/{item['chapter']}"
        if not os.path.exists(chapter_dir):
            os.mkdir(chapter_dir)
        # record the chapter's detail-page URL
        with open(f"{chapter_dir}/url.txt", "w", encoding="utf-8") as f:
            f.write(f"url:\n{item['url']}")
        # download every page image with urllib
        for i, img_url in enumerate(item["img_url"]):
            urlretrieve(url=img_url,
                        filename=f"{chapter_dir}/{i}.jpg")
        print(item)
        return item
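The pipeline only runs if it is enabled in the project's settings.py. A minimal sketch, assuming the project module is named cartoon (matching the from cartoon.items import used by the spider below) and that the class sits in the default pipelines.py, with the conventional priority value 300:

# settings.py (excerpt)
ITEM_PIPELINES = {
    "cartoon.pipelines.CartoonPipeline": 300,  # lower numbers run earlier
}
# depending on the site's robots.txt you may also need:
# ROBOTSTXT_OBEY = False

With that in place, the whole project can be started from the project root with scrapy crawl ysj.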
The spider class
import scrapy
import re

from cartoon.items import CartoonItem


class YsjSpider(scrapy.Spider):
    name = "ysj"
    allowed_domains = ["dmzj.com"]
    start_urls = ["https://www.dmzj.com/info/yaoshenji.html"]
Crawling the chapter list page
    def parse(self, response):
        # wrap the // expression in parentheses so [2] picks the 2nd ul in the whole document (see the check after this method)
        tr_list = response.xpath('(//ul[@class="list_con_li autoHeight"])[2]//li')  # [1] is sorted descending, [2] ascending
        print(len(tr_list))
        for tr in tr_list:
            item = CartoonItem()
            # chapter name
            item["chapter"] = tr.xpath("./a/@title").extract_first()
            # chapter detail-page URL
            item["url"] = tr.xpath("./a/@href").extract_first()
            yield scrapy.Request(
                url=item["url"],
                callback=self.parse_jpg,
                meta={"item": item},  # pass the item along to the detail-page callback
            )
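The parentheses in that XPath matter: //ul[...][2] means "every matching ul that is the second such ul under its own parent", while (//ul[...])[2] first collects all matching ul nodes in document order and then takes the second one. A small standalone check with Scrapy's Selector and made-up HTML:

from scrapy.selector import Selector

# toy document: two sibling divs, each containing one matching ul
html = """
<div><ul class="list_con_li autoHeight"><li>A</li></ul></div>
<div><ul class="list_con_li autoHeight"><li>B</li></ul></div>
"""
sel = Selector(text=html)

# each ul is the only ul inside its parent, so [2] on the node test matches nothing
print(sel.xpath('//ul[@class="list_con_li autoHeight"][2]//li/text()').extract())    # []
# wrapping in () indexes into the full result set, so this picks the second ul overall
print(sel.xpath('(//ul[@class="list_con_li autoHeight"])[2]//li/text()').extract())  # ['B']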
Crawling the comic image URLs
The image addresses are pulled out of the page source with a regular expression, then sorted and stored in the item.
    def parse_jpg(self, response):
        item = response.meta["item"]
        item["img_url"] = []
        html_data = response.body.decode("utf-8")  # decode the raw bytes as utf-8
        # pull the image ids out of the page source with a regex
        pics = re.findall(r"\d{13,14}", html_data)  # matches the image ids of every chapter
        # filter out the matches that do not belong here (ids starting with "100")
        del_lists = []
        for pic in pics:  # removing while iterating skips elements, so collect first
            if pic[:3] == "100":
                del_lists.append(pic)
        for del_list in del_lists:
            pics.remove(del_list)
        # sort the ids into page order
        pics = self.sort_pics(pics)
        chapterpic_hou = re.findall(r"\|(\d{5})\|", html_data)[0]   # findall returns a list; take the first match
        chapterpic_qian = re.findall(r"\|(\d{4})\|", html_data)[0]
        for pic in pics:
            item["img_url"].append(f"https://images.dmzj.com/img/chapterpic/{chapterpic_qian}/{chapterpic_hou}/{pic}.jpg")
            print(f"https://images.dmzj.com/img/chapterpic/{chapterpic_qian}/{chapterpic_hou}/{pic}.jpg")
        yield item
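To make the regex step concrete, here is what it does on an invented stand-in for the page source; only the pattern shapes (\d{13,14} image ids and the |4-digit| / |5-digit| folder ids) come from the spider above, all the numbers are made up:

import re

# invented stand-in for the chapter page source
html_data = 'var info = "yaoshenji|1234|56789|15615936060912|1561593606400|10012345678901|end"'

pics = re.findall(r"\d{13,14}", html_data)
print(pics)  # ['15615936060912', '1561593606400', '10012345678901']

pics = [p for p in pics if not p.startswith("100")]          # same effect as the del_lists loop
chapterpic_qian = re.findall(r"\|(\d{4})\|", html_data)[0]   # '1234'
chapterpic_hou = re.findall(r"\|(\d{5})\|", html_data)[0]    # '56789'
print(f"https://images.dmzj.com/img/chapterpic/{chapterpic_qian}/{chapterpic_hou}/{pics[0]}.jpg")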
    def sort_pics(self, pics):  # pad with a trailing 0, sort, then strip the 0
        # for pic in pics:
        #     if len(pic) == 13:
        #         pic += "0"  # this only rebinds the loop variable; the list element is unchanged
        # modify the list elements through their index instead
        for i, pic in enumerate(pics):
            if len(pic) == 13:
                pics[i] += "0"
        pics = sorted(pics, key=lambda x: int(x))  # ascending
        for i, pic in enumerate(pics):
            if pic[-1] == "0":
                pics[i] = pic[:-1]  # strip the 0 that was padded above
        return pics
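Why the padding: sorting by int alone always puts every 13-digit id before every 14-digit one, while padding first compares them at the same length (the strip step assumes a 13-digit id is a 14-digit one with a dropped trailing 0). A quick check with two invented ids, assuming the YsjSpider class above is in scope:

pics = ["1561593606400", "15615936060912"]  # invented 13- and 14-digit ids

print(sorted(pics, key=int))
# ['1561593606400', '15615936060912']  -- 13 digits always sorts before 14 digits

print(YsjSpider().sort_pics(pics))
# ['15615936060912', '1561593606400'] -- compared at the same length, the 13-digit id comes second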
