爬取B站弹幕生成词云
一、爬取弹幕
import requests
import json
import re
# Download a page while pretending to be a desktop Chrome browser.
def download_page(url):
    """Fetch *url* and return the raw ``requests`` response.

    :param url: URL to fetch
    :return: ``requests.Response`` object
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }
    # BUG fix: the second positional argument of requests.get() is `params`,
    # not `headers` — the header dict must be passed as a keyword argument.
    res = requests.get(url, headers=headers)
    return res
# Resolve a video's cid (danmaku id) from its av number.
def get_cid(av):
    """
    :param av: Bilibili video av number, e.g. "av95811021"
               (as in https://www.bilibili.com/video/av95811021)
    :return: cid of the first page, used by the danmaku API
    """
    # BUG fix: av.strip(av) strips every character of `av` from itself and
    # always returns "".  The intent is to drop the literal "av" prefix so
    # only the numeric id is sent to the API.
    av = av.strip().lstrip("av")
    url = f"https://api.bilibili.com/x/player/pagelist?aid={av}&jsonp=jsonp"
    res = download_page(url)
    res_dict = json.loads(res.text)
    # "data" is a list of pages; take the first page's cid.
    cid = res_dict["data"][0]["cid"]
    return cid
# Fetch the danmaku XML for a cid and extract the comment texts.
def get_danmu(cid):
    """
    :param cid: id required by the danmaku list API
    :return: list of danmaku (bullet-comment) strings
    """
    url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
    res = download_page(url)
    res_xml = res.content.decode("utf-8")
    # Each comment is wrapped in a <d p="...">text</d> element; grab the text.
    pattern = re.compile(r"<d.*?>(.*?)</d>")
    danmu_list = pattern.findall(res_xml)
    return danmu_list
# Save the danmaku list to a UTF-8 text file, one comment per line.
def save_to_file(danmu_list, filename):
    """
    :param danmu_list: list of danmaku strings
    :param filename: output file name
    """
    with open(filename, mode="w", encoding="utf-8") as f:
        for one_danmu in danmu_list:
            f.write(one_danmu)
            f.write("\n")
def main(av):
    """Download all danmaku for video *av* and save them to "<av>.txt"."""
    cid = get_cid(av)
    danmu_list = get_danmu(cid)
    save_to_file(danmu_list, f"{av}.txt")


if __name__ == "__main__":
    av = "av95811021"
    main(av)
二、生成词云
import jieba
from wordcloud import WordCloud
# Read the saved danmaku file back into a single string.
def read_file(filename):
    """
    :param filename: danmaku file name
    :return: whole file content as one string
    """
    with open(filename, mode="r", encoding="utf-8") as f:
        danmu = f.read()
    return danmu
# Segment the joined danmaku text into words with jieba.
def jieba_cut(str):
    """
    :param str: full danmaku string to be segmented
                (NOTE(review): parameter name shadows the builtin ``str``;
                kept for interface compatibility with existing callers)
    :return: list of word tokens
    """
    # Tell jieba to keep these phrases as single tokens instead of
    # splitting them apart.
    jieba.suggest_freq("原声大碟", tune=True)
    jieba.suggest_freq("前方高能", tune=True)
    cut_list = jieba.lcut(str)
    return cut_list
# Render the word list as a word-cloud image and save it to disk.
def gen_word_cloud(cut_list):
    """
    :param cut_list: list of word tokens produced by jieba
    """
    # WordCloud.generate() expects whitespace-separated text.
    word_str = " ".join(cut_list)
    wc_settings = {
        "font_path": "msyh.ttc",      # font file (needed for CJK glyphs)
        "width": 800,                 # image width
        "height": 600,                # image height
        "max_words": 200,             # maximum number of words shown
        "background_color": "white",  # background colour
    }
    # Build the word-cloud object.
    wc = WordCloud(**wc_settings).generate(word_str)
    # Save the rendered image.
    wc.to_file("经典对线.png")
def main(av):
    """Read the saved danmaku file for *av* and render a word cloud."""
    # Renamed local (was `str`) to avoid shadowing the builtin.
    danmu_str = read_file(f"{av}.txt")
    cut_list = jieba_cut(danmu_str)
    gen_word_cloud(cut_list)


if __name__ == "__main__":
    av = "av95811021"
    main(av)
改进: 备注:需要一张背景为白色的图片
import jieba
from wordcloud import WordCloud
import numpy as np
from PIL import Image
# Read the saved danmaku file back into a single string.
def read_file(filename):
    """
    :param filename: danmaku file name
    :return: whole file content as one string
    """
    with open(filename, mode="r", encoding="utf-8") as f:
        danmu = f.read()
    return danmu
# Segment the joined danmaku text into words with jieba.
def jieba_cut(str):
    """
    :param str: full danmaku string to be segmented
                (NOTE(review): parameter name shadows the builtin ``str``;
                kept for interface compatibility with existing callers)
    :return: list of word tokens
    """
    # Keep these phrases as single tokens instead of splitting them.
    jieba.suggest_freq("原声大碟", tune=True)
    jieba.suggest_freq("前方高能", tune=True)
    cut_list = jieba.lcut(str)
    return cut_list
# Render the word list as a shaped word-cloud image and save it to disk.
def gen_word_cloud(cut_list):
    """
    :param cut_list: list of word tokens produced by jieba
    """
    # WordCloud.generate() expects whitespace-separated text.
    word_str = " ".join(cut_list)
    # The mask image must have a white background; non-white pixels define
    # the silhouette the cloud is drawn into.
    mask = np.array(Image.open("ysg.jpg"))
    wc_settings = {
        "font_path": "msyh.ttc",      # font file (needed for CJK glyphs)
        "width": 800,                 # image width
        "height": 600,                # image height
        "max_words": 600,             # maximum number of words shown
        "background_color": "white",  # background colour
        "mask": mask,                 # shape mask
        "colormap": "Reds",           # matplotlib colormap for word colours
        "contour_width": 1,           # outline width of the mask shape
        "contour_color": "red",       # outline colour
        "collocations": False,        # avoid duplicated two-word phrases
    }
    # Build the word-cloud object.
    wc = WordCloud(**wc_settings).generate(word_str)
    # Save the rendered image.
    wc.to_file("经典对线.png")
def main(av):
    """Read the saved danmaku file for *av* and render a masked word cloud."""
    # Renamed local (was `str`) to avoid shadowing the builtin.
    danmu_str = read_file(f"{av}.txt")
    cut_list = jieba_cut(danmu_str)
    gen_word_cloud(cut_list)


if __name__ == "__main__":
    av = "av95811021"
    main(av)
上一篇:
IDEA上Java项目控制台中文乱码
