Scrapy + bs4 爬取京东商品对应的评论信息 (crawl the comments of JD.com products with Scrapy, scrapy-redis and BeautifulSoup)
spiders/comm.py
# -*- coding: utf-8 -*-
import json
import jsonpath
import scrapy
from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from commit.items import CommitItem
from scrapy_redis.spiders import RedisSpider
# Originally a plain scrapy.Spider; switched to RedisSpider for distributed crawling.
class CommSpider(RedisSpider):
    """Distributed spider that crawls comments of JD.com products.

    Start URLs are pushed into redis instead of being hard-coded, e.g.:
        lpush comm:start_urls https://search.jd.com/Search?keyword=ipad&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&ev=exbrand_Apple%5E&page=1&s=1&click=0
    """

    name = 'comm'
    # allowed_domains = ['search.jd.com', 'sclub.jd.com']
    redis_key = 'comm:start_urls'

    def parse(self, response):
        """Parse one search-result page and schedule comment-API requests.

        Yields one scrapy.Request per (product, comment page) pair; the
        partially filled item travels along in ``request.meta``.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        # Each product card on the search page has class "gl-item".
        for one_product in soup.find_all(class_='gl-item'):
            # The product id is carried in the data-sku attribute.
            product_id = one_product.get('data-sku')
            # Fetch the first two pages of comments for each product.
            for page in range(0, 2):
                src = (
                    'https://sclub.jd.com/comment/productPageComments.action'
                    '?callback=fetchJSON_comment98vv3067'
                    f'&productId={product_id}&score=0&sortType=5'
                    f'&page={page}&pageSize=10&isShadowSku=0&fold=1'
                )
                item = CommitItem()
                item['productid'] = product_id
                # NOTE(review): 'pase_comment' is a typo for 'parse_comment',
                # kept to preserve the class's public interface.
                request = scrapy.Request(src, callback=self.pase_comment)
                request.meta['item'] = item
                yield request

    def pase_comment(self, response):
        """Parse a JSONP comment response and yield one item per comment."""
        data = response.text
        # The endpoint returns JSONP: fetchJSON_comment98vv3067({...});
        # strip the callback wrapper to recover plain JSON.
        data = data.split('(')[1]
        data = data.split(')')[0]
        data = json.loads(data)
        data_list = jsonpath.jsonpath(data, '$..comments')[0]
        for one in data_list:
            # BUG FIX: copy the carried item for every comment. The original
            # mutated and yielded the *same* object on each iteration, so all
            # yielded items aliased the last comment's data.
            item = response.meta['item'].copy()
            item['userid'] = one['id']                # commenter's user id
            item['content'] = one['content']          # comment text
            item['datatime'] = one['creationTime']    # comment timestamp
            yield item
import scrapy
class CommitItem(scrapy.Item):
    """Container for one JD.com product comment."""

    content = scrapy.Field()    # comment text
    userid = scrapy.Field()     # commenter's user id
    productid = scrapy.Field()  # product id (the card's data-sku attribute)
    datatime = scrapy.Field()   # comment creation time reported by the API
# Route scraped items through scrapy-redis so they end up in redis.
ITEM_PIPELINES = {
    # 'commit.pipelines.CommitPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 300,
}

# MySQL connection settings consumed by CommitPipeline.open_spider().
DB_HOST = '127.0.0.1'
DB_PORT = 3306
DB_USER = 'root'
DB_PASSWORD = '密码'  # placeholder — fill in the real MySQL password
DB_DATABASE = 'jd'
DB_CHARSET = 'utf8'

# scrapy-redis scheduler and duplicate filter for distributed crawling.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
import pymysql
from scrapy.utils.project import get_project_settings
class CommitPipeline(object):
    """Pipeline that persists comment items into the MySQL table ``nr``."""

    def open_spider(self, spider):
        """Open the MySQL connection using the project settings."""
        settings = get_project_settings()
        self.conn = pymysql.connect(
            host=settings['DB_HOST'],
            port=settings['DB_PORT'],
            user=settings['DB_USER'],
            password=settings['DB_PASSWORD'],
            database=settings['DB_DATABASE'],
            charset=settings['DB_CHARSET'],
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one comment row; roll back on database errors.

        Uses a parameterized query so the driver escapes the values —
        the original string-formatted SQL was open to SQL injection and
        broke whenever the comment text contained a quote.
        """
        sql = 'INSERT INTO nr(productid, userid, datatime, content) VALUES (%s, %s, %s, %s)'
        try:
            self.cursor.execute(
                sql,
                (item['productid'], item['userid'], item['datatime'], item['content']),
            )
            self.conn.commit()
        except pymysql.MySQLError:
            # Keep best-effort semantics: undo the failed statement and
            # let the item continue down the pipeline.
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()