Crawler Series, Part 1: Scraping Phone Model Specs from ZOL
Goal
Target URL: https://detail.zol.com.cn/cell_phone_advSearch/subcate57_1_s8975_1_1__2.html
Data requirement: scrape every phone model from the mainstream brands, together with its spec parameters. This was the first proper crawler I ever wrote.
Methodology
No login is required and there are no encrypted parameters; a valid cookie plus a User-Agent header is enough. The flow is: fetch cookies with Selenium, walk the paginated search results to collect each phone's URL (note: the detail-page URL has to be assembled by hand from the product id), then parse the parameters off each detail page.

Prepare the list of brands to crawl in advance as a CSV with three columns: brand, search-URL template, and page count. For example:

brand,url,pagenum
华为,https://detail.zol.com.cn/cell_phone_advSearch/subcate57_1_m613_1_1__{}.html,24
vivo,https://detail.zol.com.cn/cell_phone_advSearch/subcate57_1_m1795_1_1__{}.html,15
oppo,https://detail.zol.com.cn/cell_phone_advSearch/subcate57_1_m1673_1_1__{}.html,13
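To make the "assembled by hand" step concrete, here is a minimal sketch of the rule the full script below uses: the product id is taken from the proName_... element id on the list page, and the series segment of the detail URL is the first four digits of that id plus one. The sample id here is made up purely for illustration.

def build_detail_url(proname):
    # proname is the element id "proName_<digits>" with the prefix stripped
    series = int(proname[0:4]) + 1
    return "https://detail.zol.com.cn/{}/{}/param.shtml".format(series, proname)

# hypothetical product id, for illustration only
print(build_detail_url("1304566"))  # -> https://detail.zol.com.cn/1305/1304566/param.shtml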
Code
import json
import time

import pandas as pd
import requests
from lxml import etree
from selenium import webdriver


def down_cookie():
    """Open the site in Chrome via Selenium and save its cookies to a local file."""
    url = "https://detail.zol.com.cn/"
    driver = webdriver.Chrome(executable_path="/Users/fangli/Downloads/chromedriver")
    driver.get(url)
    dict_cookies = driver.get_cookies()  # core step: export the cookies from the browser session
    json_cookies = json.dumps(dict_cookies)
    print(json_cookies)
    # save the cookies to a local file once the page has loaded
    with open("cookies.json", "w") as f:
        f.write(json_cookies)
    time.sleep(3)
    driver.close()


def get_cookie():
    """Read the saved cookies and join them into a Cookie request-header string."""
    with open("cookies.json", "r", encoding="utf-8") as f:
        list_cookies = json.loads(f.read())
    cookies = [item["name"] + "=" + item["value"] for item in list_cookies]
    return "; ".join(cookies)


def get_response(pageurl):
    """GET a page with the saved cookie; on failure refresh the cookie and retry."""
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/99.0.4844.84 Safari/537.36",
        "cookie": get_cookie(),
    }
    try:
        return requests.get(url=pageurl, headers=headers).text
    except requests.RequestException:
        print("refreshing cookie")
        down_cookie()
        return get_response(pageurl)


def get_detail(detail_url):
    """Parse one phone's spec page and append (name, parameter, value) rows to the CSV."""
    response = get_response(detail_url)
    html = etree.HTML(response)
    # the h1 text ends with the literal suffix "参数" ("specs"); strip it off
    phone_name = str(html.xpath('//h1[@class="product-model__name"]/text()')[0]).replace("参数", "")
    with open(file="手机参数数据采集全量.csv", encoding="utf-8", mode="a") as files:
        for i in range(1, 11):  # the spec sheet is split across up to ten tables
            trs = html.xpath('//div[@class="detailed-parameters"]/table[{}]/tr'.format(i))
            for tr in trs:
                try:
                    k = tr.xpath("./th/span/text() | ./th/a/text()")[0]
                    # drop ">" markers and replace commas so values stay CSV-safe
                    v = str(tr.xpath("./td/span/text() | ./td/span/a/text()")[0]).replace(">", "").replace(",", ";")
                    files.write("{},{},{}\n".format(phone_name, k, v))
                    print(phone_name, k, v)
                except IndexError:
                    pass  # row without a th/td pair, skip it


def get_pagelist(pageurl):
    """Collect the product ids from one result page and crawl each detail page."""
    response = get_response(pageurl)
    html = etree.HTML(response)
    result = html.xpath('//*[@id="result_box"]/div[2]/ul/li')
    for i in result:
        proname = str(i.xpath("./dl/dt/a/@id")[0]).replace("proName_", "")
        prename = int(proname[0:4]) + 1
        # the detail-page URL is assembled by hand from the product id
        detail_url = "https://detail.zol.com.cn/{}/{}/param.shtml".format(prename, proname)
        get_detail(detail_url)


def get_data():
    """Walk every result page of every brand listed in brandlist.csv."""
    df = pd.read_csv("brandlist.csv")
    for i in range(df.shape[0]):
        brand = df.iat[i, 0]
        url = df.iat[i, 1]
        pagenum = int(df.iat[i, 2])
        for j in range(1, pagenum + 1):
            pageurl = str(url).format(j)
            print(brand, pageurl, j)
            get_pagelist(pageurl)


if __name__ == "__main__":
    get_data()
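The script appends long-format rows of (phone name, parameter name, parameter value) with no header row. As a follow-up sketch, the output can be loaded and pivoted into one row per phone with pandas; the column names here are my own choice for illustration, not part of the original script.

import pandas as pd

# the crawler writes headerless rows, so supply column names on load
df = pd.read_csv("手机参数数据采集全量.csv", names=["phone", "param", "value"])

# one row per phone, one column per parameter; duplicates keep the first value
wide = df.pivot_table(index="phone", columns="param", values="value", aggfunc="first")
print(wide.head())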
Next post:
Finding homologous gene families with PfamScan