python爬虫之获取京东商城手机信息

完整代码如下：

import requests
import re
import random
from fake_useragent import UserAgent
import json
from pandas import DataFrame
from lxml import etree
from idlelib.iomenu import encoding
import pandas as pd
import time


class GetJob:
    """Scrape phone listings (name, price, comment count) from JD.com's
    list pages and save the accumulated rows to ``goods_info.csv``."""

    def __init__(self):
        # Pool of random User-Agent strings rotated between requests.
        self.uas = self.create_ua()
        # Accumulates one row per product across all scraped pages.
        self.pInfoAll = DataFrame()

    def create_ua(self):
        """Return a list of 5 random User-Agent strings."""
        ua = UserAgent()
        return [ua.random for _ in range(5)]

    def get_telephone(self, page):
        """Fetch one page of the JD phone list and return its HTML text.

        Raises requests.HTTPError on a non-2xx response.
        """
        url = "https://list.jd.com/list.html"
        params = {
            "cat": "9987,653,655",  # category path for mobile phones
            "page": page,
            "s": 1,
            "click": 0,
        }
        headers = {"user-agent": random.choice(self.uas)}
        res = requests.get(url, params=params, headers=headers)
        # raise_for_status() instead of `assert`: asserts are stripped
        # when Python runs with -O, silently skipping the check.
        res.raise_for_status()
        return res.text

    def get_comment_count(self, goods_sku):
        """Return the comment counts (as strings) for the given SKU ids."""
        headers = {"user-agent": random.choice(self.uas)}
        reference_ids = ",".join(goods_sku)
        url = ("https://club.jd.com/comment/productCommentSummaries.action"
               f"?referenceIds={reference_ids}")
        res = requests.get(url, headers=headers)
        res.raise_for_status()
        # BUG FIX: the regex must be a (raw) string literal; the original
        # bare `"CommentCount":(.*?),` was a syntax error.
        return re.findall(r'"CommentCount":(.*?),', res.text)

    def get_info(self, text):
        """Parse one list page into a DataFrame with one product per row:
        [name, price, comment count]."""
        root = etree.HTML(text)
        # BUG FIX: XPath expressions must be quoted string literals; the
        # originals were bare and did not parse.
        goods_name = root.xpath(
            '//div[@id="J_goodsList"]//*[contains(@class,"p-name")]//em/text()')
        # Product SKU ids, needed to look up comment counts.
        goods_sku = root.xpath('//*[@id="J_goodsList"]//li[@data-sku]/@data-sku')
        goods_price = root.xpath(
            '//div[@id="J_goodsList"]//*[@class="p-price"]//i/text()')
        goods_comment = self.get_comment_count(goods_sku)
        # Transpose so the three parallel lists become columns.
        return DataFrame([goods_name, goods_price, goods_comment]).T

    def get_all_info(self, pageCount):
        """Scrape ``pageCount`` list pages and write goods_info.csv."""
        # BUG FIX: JD list pages are 1-based; the original `range(pageCount)`
        # started by requesting the nonexistent page 0.
        for page in range(1, pageCount + 1):
            text = self.get_telephone(page)
            pInfo = self.get_info(text)
            self.pInfoAll = pd.concat([self.pInfoAll, pInfo])
            time.sleep(3)  # be polite: throttle requests
        # Column names: product name, price, comment count.
        self.pInfoAll.columns = ["商品名称", "价格", "评论数"]
        # Save the collected data to a CSV file.
        self.pInfoAll.to_csv("goods_info.csv", encoding="utf8")

if __name__ == "__main__":
    # Scrape the first five list pages and write goods_info.csv.
    GetJob().get_all_info(5)
标签：经验分享 · 程序员 · 微信小程序 · 职场和发展