python爬虫之获取京东商城手机信息
示例代码如下:
import requests
import re
import random
from fake_useragent import UserAgent
import json
from pandas import DataFrame
from lxml import etree
from idlelib.iomenu import encoding
import pandas as pd
import time
class GetJob():
    """Scrape phone listings (name, price, comment count) from JD.com list pages
    and save the accumulated results to ``goods_info.csv``.
    """

    def __init__(self):
        # Pool of random User-Agent strings to rotate between requests.
        self.uas = self.create_ua()
        # Accumulates one row per product across all scraped pages.
        self.pInfoAll = DataFrame()

    def create_ua(self):
        """Return a list of 5 random User-Agent strings for header rotation."""
        ua = UserAgent()
        return [ua.random for _ in range(5)]

    def get_telephone(self, page):
        """Fetch one JD list page of phones and return the raw HTML text.

        :param page: page number passed to JD's list endpoint
        :raises requests.HTTPError: on a non-2xx response
        """
        url = "https://list.jd.com/list.html"
        params = {
            "cat": "9987,653,655",  # JD category path for mobile phones
            "page": page,
            "s": 1,
            "click": 0,
        }
        headers = {"user-agent": random.choice(self.uas)}
        res = requests.get(url, params=params, headers=headers)
        # Fail loudly on HTTP errors; `assert` is stripped under `python -O`.
        res.raise_for_status()
        return res.text

    def get_comment_count(self, goods_sku):
        """Return the comment counts (as strings) for the given SKU ids.

        :param goods_sku: iterable of SKU id strings from the list page
        :raises requests.HTTPError: on a non-2xx response
        """
        headers = {"user-agent": random.choice(self.uas)}
        reference_ids = ",".join(goods_sku)
        url = ("https://club.jd.com/comment/productCommentSummaries.action"
               f"?referenceIds={reference_ids}")
        res = requests.get(url, headers=headers)
        res.raise_for_status()
        # The endpoint returns JSON-like text; pull every "CommentCount" value.
        # (The original paste lost the quotes around this pattern.)
        return re.findall(r'"CommentCount":(.*?),', res.text)

    def get_info(self, text):
        """Parse one list page's HTML into a DataFrame of [name, price, comments].

        :param text: raw HTML of a JD list page
        """
        root = etree.HTML(text)
        goods_name = root.xpath(
            '//div[@id="J_goodsList"]//*[contains(@class,"p-name")]//em/text()')
        # Product SKU ids, needed to query the comment-count endpoint.
        goods_sku = root.xpath('//*[@id="J_goodsList"]//li[@data-sku]/@data-sku')
        goods_price = root.xpath(
            '//div[@id="J_goodsList"]//*[@class="p-price"]//i/text()')
        goods_comment = self.get_comment_count(goods_sku)
        # Transpose so each product becomes one row.
        return DataFrame([goods_name, goods_price, goods_comment]).T

    # 翻页 (page through the listing)
    def get_all_info(self, pageCount):
        """Scrape ``pageCount`` pages, accumulate rows, and write goods_info.csv."""
        for i in range(pageCount):
            # NOTE(review): JD list pages look 1-based; original starts at 0 — confirm.
            text = self.get_telephone(i)
            pInfo = self.get_info(text)
            self.pInfoAll = pd.concat([self.pInfoAll, pInfo])
            time.sleep(3)  # polite delay between page requests
        # Column labels: product name, price, comment count.
        self.pInfoAll.columns = ["商品名称", "价格", "评论数"]
        self.pInfoAll.to_csv("goods_info.csv", encoding="utf8")
if __name__ == "__main__":
    # Scrape the first 5 list pages and write goods_info.csv.
    # (Restored the indentation lost in the original paste — the flush-left
    # body after `if` was a SyntaxError.)
    m = GetJob()
    m.get_all_info(5)
