Selenium自动化测试工具——以爬取京东商品信息为例

需要安装的包

import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import csv
import time

完整代码

# Launch a Chrome session; this module-level driver is used by search() and get_products().
browser = webdriver.Chrome()
# Explicit-wait helper bound to that driver with a 10-second timeout; used by
# search(), next_page() and get_products() to block until page elements appear.
wait = WebDriverWait(browser, 10)
def search():
    """Run the keyword search on the JD home page and return the page-count text.

    Reads the module-level name ``keywords`` (the search phrase) and the
    module-level ``browser`` / ``wait`` objects, so all three must exist
    before this function is called.

    Returns:
        str: The text of the element that shows the total number of result
        pages in the bottom pager.

    Whenever a wait times out the whole sequence (page load, typing,
    clicking) is started again. The retry is a loop rather than
    self-recursion so that repeated timeouts cannot end in RecursionError;
    it is still unbounded, as in the original.
    """
    while True:
        print("正在搜索")
        try:
            browser.get("https://www.jd.com/")
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
            )
            search_box.send_keys(keywords)
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#search > div > div.form > button")
                )
            )
            # Fixed pause between locating the button and clicking it.
            time.sleep(3)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > em:nth-child(1)")
                )
            )
            return total.text
        except TimeoutException:
            # Start over from the home page.
            continue


def next_page(page_number):
    """Jump to result page ``page_number`` via the pager and scrape it.

    Args:
        page_number: Page to jump to. It is typed into the pager's input box
            and its ``str()`` form is expected to appear in the
            "current page" marker afterwards.

    After the pager confirms the new page, ``get_products()`` is called to
    write that page's rows. If any wait times out (including the one inside
    ``get_products()``), the whole jump is retried. The retry is a loop
    rather than self-recursion so repeated timeouts cannot end in
    RecursionError; it is still unbounded, as in the original.
    """
    while True:
        print(f"正在翻第{page_number}页")
        try:
            print("定位到跳转页数")
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")
                )
            )
            print("定位到跳转按钮,确保可点击")
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > a")
                )
            )
            page_box.clear()
            page_box.send_keys(page_number)
            submit.click()
            # Wait until the highlighted pager entry shows the requested
            # page number before reading the product list.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr"),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            # Retry the same page from the top.
            continue

def get_products():
    """Parse the currently loaded result page and write one CSV row per product.

    Waits for the product list to be present, parses ``browser.page_source``
    with pyquery, and for every ``.gl-item`` under ``#J_goodsList`` prints and
    writes a row ``[image, price, title, shop, comment]`` through the
    module-level ``writer`` (which must already be set up by the caller).

    A TimeoutException from the initial wait is not caught here; it
    propagates to the caller.
    """
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#J_goodsList > ul"))
    )
    doc = pq(browser.page_source, parser="html")
    for item in doc("#J_goodsList .gl-item").items():
        # The src attribute is prefixed with "http:"; str() keeps the
        # concatenation from failing when the attribute is missing (the
        # value then reads "http:None").
        image = "http:" + str(item(".gl-i-wrap .p-img a img").attr("src"))
        price = item.find(".p-price").text()
        title = item.find(".p-name").text().strip()
        shop = item.find("div span a").text()
        comment = item.find(".p-commit a").text()
        product = [image, price, title, shop, comment]
        print(product)
        writer.writerow(product)


def main():
    """Prompt for a keyword, run the search, and dump result pages to result.csv.

    Steps:
      1. Read the search phrase from stdin and publish it as the module-level
         ``keywords``, which ``search()`` reads.
      2. Call ``search()`` and extract the total page count from its text.
      3. Open ``result.csv`` (GBK-encoded), publish the file and csv writer
         as the module-level ``f`` and ``writer`` (``get_products()`` uses
         ``writer``), and write the header row.
      4. Visit pages 2..total with ``next_page()``.

    Raises:
        ValueError: If the page-count text returned by ``search()`` contains
            no digits.

    NOTE(review): the first result page is never scraped -- the loop starts
    at page 2 and no ``get_products()`` call precedes it. Kept as in the
    original; confirm whether this is intended.
    """
    # search() looks these names up at module level, so they must be global.
    global keywords, f, writer

    keywords = input("请输入关键字:")
    total_text = search()
    match = re.search(r"(\d+)", total_text)
    if match is None:
        raise ValueError(f"no page count found in pager text: {total_text!r}")
    total = int(match.group(1))

    # newline="" lets the csv module control line endings itself; the with
    # block closes the file even when scraping raises part-way through.
    with open("result.csv", mode="w", encoding="gbk", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["image", "price", "title", "shop", "comment"])
        for page in range(2, total + 1):
            next_page(page)

# Run the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
经验分享 程序员 微信小程序 职场和发展