爬取B站弹幕生成词云
一、爬取弹幕
import requests
import json
import re


def download_page(url):
    """Fetch *url* pretending to be a desktop Chrome browser.

    :param url: URL to request.
    :return: the ``requests.Response`` object.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/63.0.3239.132 Safari/537.36"
        ),
    }
    # BUG FIX: the second positional argument of requests.get() is
    # `params`, not `headers` — it must be passed by keyword, otherwise
    # the User-Agent header is silently never sent.
    res = requests.get(url, headers=headers)
    return res


def get_cid(av):
    """Resolve a Bilibili av number to the cid used by the danmaku API.

    :param av: av id of the video, e.g. "av95811021"
               (from https://www.bilibili.com/video/av95811021).
    :return: cid of the first page of the video.
    """
    # BUG FIX: the original `av.strip(av)` strips every character of the
    # string from itself, always yielding "".  Strip the literal "av"
    # prefix characters instead, leaving only the numeric id.
    av = av.strip("av")
    url = f"https://api.bilibili.com/x/player/pagelist?aid={av}&jsonp=jsonp"
    res = download_page(url)
    res_dict = json.loads(res.text)
    # The pagelist API returns {"data": [{"cid": ...}, ...]}; take page 1.
    cid = res_dict["data"][0]["cid"]
    return cid


def get_danmu(cid):
    """Download the danmaku XML for *cid* and extract the comment texts.

    :param cid: id required by the danmaku list API.
    :return: list of danmaku strings.
    """
    url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
    res = download_page(url)
    res_xml = res.content.decode("utf-8")
    # Each danmaku lives in a <d p="..."> text </d> element.
    pattern = re.compile(r"<d.*?>(.*?)</d>")
    danmu_list = pattern.findall(res_xml)
    return danmu_list


def save_to_file(danmu_list, filename):
    """Write one danmaku per line to *filename* (UTF-8).

    :param danmu_list: list of danmaku strings.
    :param filename: output file name.
    """
    with open(filename, mode="w", encoding="utf-8") as f:
        for one_danmu in danmu_list:
            f.write(one_danmu)
            # BUG FIX: the separator written after each danmaku was lost;
            # write a newline so each comment occupies its own line.
            f.write("\n")


def main(av):
    """Crawl all danmaku of video *av* and save them to ``<av>.txt``."""
    cid = get_cid(av)
    danmu_list = get_danmu(cid)
    save_to_file(danmu_list, f"{av}.txt")


if __name__ == "__main__":
    av = "av95811021"
    main(av)
二、生成词云
import jieba
from wordcloud import WordCloud


def read_file(filename):
    """Read the danmaku file produced by the crawler.

    :param filename: file name, one danmaku per line.
    :return: the whole file content as one string.
    """
    with open(filename, mode="r", encoding="utf-8") as f:
        danmu = f.read()
    return danmu


def jieba_cut(text):
    """Segment the danmaku text into words with jieba.

    :param text: all danmaku as one string.
    :return: list of segmented words.
    """
    # Teach jieba to keep these phrases as single tokens instead of
    # splitting them apart.
    jieba.suggest_freq("原声大碟", tune=True)
    jieba.suggest_freq("前方高能", tune=True)
    cut_list = jieba.lcut(text)
    return cut_list


def gen_word_cloud(cut_list):
    """Render the word list as a word-cloud image and save it to disk.

    :param cut_list: list of segmented words.
    """
    # WordCloud expects a single whitespace-separated string.
    word_str = " ".join(cut_list)
    wc_settings = {
        "font_path": "msyh.ttc",          # font (needed for CJK glyphs)
        "width": 800,                     # image width in px
        "height": 600,                    # image height in px
        "max_words": 200,                 # cap on rendered words
        "background_color": "white",      # background colour
    }
    # Build the word-cloud object from the settings and the text.
    wc = WordCloud(**wc_settings).generate(word_str)
    # Persist the rendered image.
    wc.to_file("经典对线.png")


def main(av):
    """Read ``<av>.txt``, segment it, and generate the word cloud."""
    # NOTE: renamed local from `str` to `text` — shadowing the builtin
    # `str` is an easy source of bugs.
    text = read_file(f"{av}.txt")
    cut_list = jieba_cut(text)
    gen_word_cloud(cut_list)


if __name__ == "__main__":
    av = "av95811021"
    main(av)
三、改进：按图片形状生成词云（备注：需要准备一张背景为白色的蒙版图片，如 ysg.jpg）
import jieba
from wordcloud import WordCloud
import numpy as np
from PIL import Image


def read_file(filename):
    """Read the danmaku file produced by the crawler.

    :param filename: file name, one danmaku per line.
    :return: the whole file content as one string.
    """
    with open(filename, mode="r", encoding="utf-8") as f:
        danmu = f.read()
    return danmu


def jieba_cut(text):
    """Segment the danmaku text into words with jieba.

    :param text: all danmaku as one string.
    :return: list of segmented words.
    """
    # Keep these phrases as single tokens instead of splitting them.
    jieba.suggest_freq("原声大碟", tune=True)
    jieba.suggest_freq("前方高能", tune=True)
    cut_list = jieba.lcut(text)
    return cut_list


def gen_word_cloud(cut_list):
    """Render the word list as a shaped word cloud and save it to disk.

    Requires a mask image with a white background ("ysg.jpg"); words are
    drawn only inside the non-white region of the mask.

    :param cut_list: list of segmented words.
    """
    # WordCloud expects a single whitespace-separated string.
    word_str = " ".join(cut_list)
    # The mask must be a numpy array; white (255) pixels are treated as
    # "outside the shape" by WordCloud.
    mask = np.array(Image.open("ysg.jpg"))
    wc_settings = {
        "font_path": "msyh.ttc",          # font (needed for CJK glyphs)
        "width": 800,                     # image width in px
        "height": 600,                    # image height in px
        "max_words": 600,                 # cap on rendered words
        "background_color": "white",      # background colour
        "mask": mask,                     # shape to fill
        "colormap": "Reds",               # matplotlib colormap name
        "contour_width": 1,               # outline the mask shape
        "contour_color": "red",           # outline colour
        "collocations": False,            # avoid duplicated bigrams
    }
    # Build the word-cloud object from the settings and the text.
    wc = WordCloud(**wc_settings).generate(word_str)
    # Persist the rendered image.
    wc.to_file("经典对线.png")


def main(av):
    """Read ``<av>.txt``, segment it, and generate the masked word cloud."""
    # NOTE: renamed local from `str` to `text` — shadowing the builtin
    # `str` is an easy source of bugs.
    text = read_file(f"{av}.txt")
    cut_list = jieba_cut(text)
    gen_word_cloud(cut_list)


if __name__ == "__main__":
    av = "av95811021"
    main(av)
上一篇:
IDEA上Java项目控制台中文乱码