python爬虫之获取京东商城手机信息
code
"""Scrape JD.com mobile-phone listings (name, price, comment count) to a CSV."""
import requests
import re
import random
from fake_useragent import UserAgent
from pandas import DataFrame
from lxml import etree
import pandas as pd
import time


class GetJob():
    """Crawler for JD.com mobile-phone list pages."""

    def __init__(self):
        # Pool of random User-Agent strings, rotated per request.
        self.uas = self.create_ua()
        # Accumulates results across every scraped page.
        self.pInfoAll = DataFrame()

    def create_ua(self):
        """Return a pool of 5 random User-Agent strings."""
        ua = UserAgent()
        return [ua.random for _ in range(5)]

    def get_telephone(self, page):
        """Fetch one list page of the phone category and return its HTML.

        page: 1-based JD list page number.
        """
        url = "https://list.jd.com/list.html"
        # cat=9987,653,655 is JD's mobile-phone category.
        data = {"cat": "9987,653,655", "page": page, "s": 1, "click": 0}
        headers = {"user-agent": random.choice(self.uas)}
        # timeout so a stalled connection cannot hang the whole crawl
        res = requests.get(url, params=data, headers=headers, timeout=10)
        # raise_for_status() instead of assert: asserts vanish under `python -O`
        res.raise_for_status()
        return res.text

    def get_comment_count(self, goods_sku):
        """Look up comment counts for a batch of SKUs via JD's comment API."""
        headers = {"user-agent": random.choice(self.uas)}
        reference_ids = ",".join(goods_sku)
        url = ("https://club.jd.com/comment/productCommentSummaries.action"
               f"?referenceIds={reference_ids}")
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        # BUG FIX: the pattern must be a (raw) string literal — the source had
        # the quotes stripped, which is a SyntaxError.
        return re.findall(r'"CommentCount":(.*?),', res.text)

    def get_info(self, text):
        """Parse one list page's HTML into a DataFrame [name, price, comments]."""
        root = etree.HTML(text)
        # BUG FIX: XPath expressions must be quoted string literals.
        goods_name = root.xpath(
            '//div[@id="J_goodsList"]//*[contains(@class,"p-name")]//em/text()')
        # Product SKU ids, used to query the comment-count API.
        goods_sku = root.xpath('//*[@id="J_goodsList"]//li[@data-sku]/@data-sku')
        goods_price = root.xpath(
            '//div[@id="J_goodsList"]//*[@class="p-price"]//i/text()')
        goods_comment = self.get_comment_count(goods_sku)
        return DataFrame([goods_name, goods_price, goods_comment]).T

    def get_all_info(self, pageCount):
        """Scrape `pageCount` pages and write the combined result to goods_info.csv."""
        # BUG FIX: JD list pages are numbered from 1; range(pageCount) started
        # at page 0, requesting a nonexistent page and dropping one real page.
        for page in range(1, pageCount + 1):
            text = self.get_telephone(page)
            self.pInfoAll = pd.concat([self.pInfoAll, self.get_info(text)])
            time.sleep(3)  # be polite: throttle between page fetches
        # Column headers: product name / price / comment count.
        self.pInfoAll.columns = ["商品名称", "价格", "评论数"]
        self.pInfoAll.to_csv("goods_info.csv", encoding="utf8")


if __name__ == "__main__":
    m = GetJob()
    m.get_all_info(5)