# BeautifulSoup对网页进行解析
from bs4 import BeautifulSoup
# Scrape article entries from a locally saved HTML page and print the
# title and tags of every entry whose rating is above 3.

# Location of the saved page, relative to the current working directory.
path = "./web/new_index.html"
data = []

# The file is decoded as GBK and handed to BeautifulSoup with the lxml parser.
with open(path, "r", encoding="gbk") as f:
    Soup = BeautifulSoup(f.read(), "lxml")

# One CSS selector per field; each returns a list of matching tags.
titles = Soup.select("body > div.main-content > ul > li > div.article-info > h3 > a")  # title
pics = Soup.select("body > div.main-content > ul > li > img")  # image
descs = Soup.select("body > div.main-content > ul > li > div.article-info > p.description")  # description
rates = Soup.select("body > div.main-content > ul > li > div.rate > span")  # rating
cates = Soup.select("body > div.main-content > ul > li > div.article-info > p.meta-info")  # tags

# Combine the parallel lists into one dictionary per entry.
# NOTE: zip() stops at the shortest list, so an entry missing any field
# shifts or drops later entries silently.
for title, pic, desc, rate, cate in zip(titles, pics, descs, rates, cates):
    info = {
        "title": title.get_text(),
        "pic": pic.get("src"),
        "descs": desc.get_text(),
        "rate": rate.get_text(),
        "cate": list(cate.stripped_strings),
    }
    data.append(info)

# Print entries rated above 3. The rating is scraped as text, so it is
# converted to a number before comparing (comparing its string length
# would not reflect the score).
for item in data:
    try:
        score = float(item["rate"])
    except ValueError:
        # Rating text is not numeric; skip the entry rather than abort.
        continue
    if score > 3:
        print(item["title"], item["cate"])
# 真实世界中的网页解析