Scraping Xinwen Lianbo (text version) with Python
Environment setup
First install Python 3 and pip3, then install the following libraries:
- pip install beautifulsoup4
- pip install requests
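If you want to double-check that both libraries installed correctly, a quick optional sanity check (not part of the original script) is:

# Optional sanity check: both imports should succeed and print version numbers.
import requests
import bs4
print(requests.__version__)
print(bs4.__version__)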
Writing the script
Running the code below will scrape the text version of Xinwen Lianbo and save it to news.txt:
import datetime
import requests
from bs4 import BeautifulSoup

# Pretend to be a regular browser so the site does not reject the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
}

# List page that links to the latest Xinwen Lianbo transcripts.
url = "http://www.sdpp.com.cn/list/list_98.html"
s = requests.get(url, headers=headers)
s.encoding = "utf-8"
print(s.status_code)

# Grab the link of the newest item in the news list.
bs = BeautifulSoup(s.text, "html.parser")
news = bs.find("aside", class_="news_list").find("a")["href"]

# Dates for today, yesterday, and the day before in YYYYMMDD form,
# used to check whether the newest item is recent.
today = (datetime.date.today() + datetime.timedelta(days=0)).strftime("%Y%m%d")
preday = (datetime.date.today() + datetime.timedelta(days=-1)).strftime("%Y%m%d")
pre2day = (datetime.date.today() + datetime.timedelta(days=-2)).strftime("%Y%m%d")

if pre2day in news or preday in news or today in news:
    print("have new news")
    r = requests.get(news, headers=headers)
    r.encoding = "utf-8"
    bs = BeautifulSoup(r.text, "html.parser")

    # Total number of pages for this broadcast, plus its title.
    allpagecount = int(bs.find("span", {"id": "allpagecount"}).get_text())
    title = bs.find("div", {"class": "keys3"}).get_text()
    temp = title + "\n"

    # Follow the "next page" link and collect the text of every page.
    for i in range(1, allpagecount):
        link = bs.find("a", {"id": "nextpageurl"})["href"]
        r = requests.get(link, headers=headers)
        r.encoding = "utf-8"
        bs = BeautifulSoup(r.text, "html.parser")
        maintext = bs.find("div", {"class": "textCon"}).get_text()
        temp = temp + maintext + "\n"
        print(maintext)

    # Save everything to news.txt.
    with open("news.txt", "w", encoding="utf8") as f:
        f.write(temp)

    # email_data = {
    #     "title": title,
    #     "body": temp,
    #     "sender": "your_email@xx.com",
    #     "password": "password",
    #     "receiver": "your_email@xx.com",
    #     "smtpserver": "smtp.163.com",
    #     "is_send_email": True
    # }
    # send_email(**email_data)
Sending the email
See my earlier post on sending email with Python; a rough sketch of such a function is also given after the steps below.
- Import the send_email function
- In the commented-out block above, change sender, password, and receiver to your own email address, password, and recipient address
- Uncomment that block
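For reference, a minimal send_email built on the standard-library smtplib could look like the sketch below. This is only illustrative and assumes the real function from the referenced post takes the same keyword arguments as the email_data dictionary above:

import smtplib
from email.mime.text import MIMEText
from email.header import Header

def send_email(title, body, sender, password, receiver, smtpserver, is_send_email=True):
    # Hypothetical sketch: the signature is inferred from the email_data
    # dictionary above, not taken from the original post.
    if not is_send_email:
        return
    msg = MIMEText(body, "plain", "utf-8")
    msg["Subject"] = Header(title, "utf-8")
    msg["From"] = sender
    msg["To"] = receiver
    # Port 465 (SSL) is the usual setting for smtp.163.com; adjust if your
    # provider expects a different port or STARTTLS instead.
    with smtplib.SMTP_SSL(smtpserver, 465) as server:
        server.login(sender, password)
        server.sendmail(sender, [receiver], msg.as_string())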
