爬取B站弹幕生成词云

一、爬取弹幕

import requests
import json
import re

#下载页面
#Download a page
def download_page(url):
    """GET the given URL with a browser-like User-Agent and return the response.

    :param url: the URL to request
    :return: requests.Response object
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    # The original passed `headers` positionally, which requests.get() treats
    # as query params; it must go in as the `headers=` keyword. A timeout
    # prevents the crawler from hanging forever on a stalled connection.
    res = requests.get(url, headers=headers, timeout=10)
    return res

#根据av号获取cid
#Get the cid from an av number
def get_cid(av):
    """Resolve a video's cid from its av number via the pagelist API.

    :param av: B站 video av id, e.g. from https://www.bilibili.com/video/av95811021
    :return: cid of the first page of the video
    """
    # The API wants the numeric aid only, so drop a leading "av"/"AV" prefix.
    # (The original `av.strip(av)` stripped every character of the string
    # from itself, always yielding "".)
    av = av.strip()
    if av.lower().startswith("av"):
        av = av[2:]
    url = f"https://api.bilibili.com/x/player/pagelist?aid={av}&jsonp=jsonp"
    res = download_page(url)
    res_text = res.text
    res_dict = json.loads(res_text)
    cid = res_dict["data"][0]["cid"]
    return cid


#根据cid请求弹幕
#Fetch danmaku by cid
def get_danmu(cid):
    """Download the danmaku XML for a cid and extract the danmaku texts.

    :param cid: the id required by the danmaku API
    :return: list of danmaku strings
    """
    url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
    res = download_page(url)
    res_xml = res.content.decode("utf-8")
    # Each danmaku is the text content of a <d ...>...</d> element.
    pattern = re.compile(r"<d.*?>(.*?)</d>")
    danmu_list = pattern.findall(res_xml)
    return danmu_list

#保存弹幕文件
#Save danmaku to a file
def save_to_file(danmu_list, filename):
    """Write danmaku strings to a text file, one per line.

    :param danmu_list: list of danmaku strings
    :param filename: output file path
    """
    with open(filename, mode="w", encoding="utf-8") as f:
        for one_danmu in danmu_list:
            f.write(one_danmu)
            f.write("\n")


def main(av):
    """Fetch all danmaku for a video and save them to "<av>.txt".

    :param av: B站 video av id, e.g. "av95811021"
    """
    cid = get_cid(av)
    danmu_list = get_danmu(cid)
    save_to_file(danmu_list, f"{av}.txt")

if __name__ == "__main__":
    av = "av95811021"
    main(av)

二、生成词云

import jieba
from wordcloud import WordCloud

#读取弹幕文件
#Read the danmaku file
def read_file(filename):
    """Read the whole danmaku file as one string.

    :param filename: file path
    :return: the full danmaku text
    """
    with open(filename, mode="r", encoding="utf-8") as f:
        danmu = f.read()
        return danmu


#jieba分词
#jieba word segmentation
def jieba_cut(text):
    """Segment the raw danmaku string into a list of words with jieba.

    :param text: concatenated danmaku text to segment
    :return: list of segmented words
    """
    # Keep these phrases as single tokens instead of splitting them.
    jieba.suggest_freq("原声大碟", tune=True)
    jieba.suggest_freq("前方高能", tune=True)
    cut_list = jieba.lcut(text)
    return cut_list


#生成词云图
#Generate the word-cloud image
def gen_word_cloud(cut_list):
    """Render a word cloud from a list of words and save it as a PNG.

    :param cut_list: list of segmented words
    """
    # Join with spaces so WordCloud can tokenize the words.
    word_str = " ".join(cut_list)
    wc_settings = {
        "font_path": "msyh.ttc",      # font file (needed for CJK glyphs)
        "width": 800,                 # image width
        "height": 600,                # image height
        "max_words": 200,             # maximum number of words shown
        "background_color": "white",  # background color
    }
    # Build the word-cloud object.
    wc = WordCloud(**wc_settings).generate(word_str)
    # Save the rendered image.
    wc.to_file("经典对线.png")

def main(av):
    """Read the saved danmaku file for `av` and render it as a word cloud.

    :param av: B站 video av id, e.g. "av95811021"
    """
    text = read_file(f"{av}.txt")
    cut_list = jieba_cut(text)
    gen_word_cloud(cut_list)


if __name__ == "__main__":
    av = "av95811021"
    main(av)

三、改进(备注:需要一张背景为白色的遮罩图片)

import jieba
from wordcloud import WordCloud
import numpy as np
from PIL import Image

#读取弹幕文件
#Read the danmaku file
def read_file(filename):
    """Read the whole danmaku file as one string.

    :param filename: file path
    :return: the full danmaku text
    """
    with open(filename, mode="r", encoding="utf-8") as f:
        danmu = f.read()
        return danmu


#jieba分词
#jieba word segmentation
def jieba_cut(text):
    """Segment the raw danmaku string into a list of words with jieba.

    :param text: concatenated danmaku text to segment
    :return: list of segmented words
    """
    # Keep these phrases as single tokens instead of splitting them.
    jieba.suggest_freq("原声大碟", tune=True)
    jieba.suggest_freq("前方高能", tune=True)
    cut_list = jieba.lcut(text)
    return cut_list


#生成词云图
#Generate the word-cloud image (masked variant)
def gen_word_cloud(cut_list):
    """Render a shaped word cloud using a mask image and save it as a PNG.

    Requires an image file "ysg.jpg" with a white background; the cloud
    fills the non-white region of the mask.

    :param cut_list: list of segmented words
    """
    # Join with spaces so WordCloud can tokenize the words.
    word_str = " ".join(cut_list)
    # White pixels in the mask are ignored; the rest defines the shape.
    mask = np.array(Image.open("ysg.jpg"))
    wc_settings = {
        "font_path": "msyh.ttc",      # font file (needed for CJK glyphs)
        "width": 800,
        "height": 600,
        "max_words": 600,
        "background_color": "white",
        "mask": mask,
        "colormap": "Reds",           # red color palette
        "contour_width": 1,           # draw an outline around the mask shape
        "contour_color": "red",
        "collocations": False,        # suppress duplicated bigrams
    }
    # Build the word-cloud object.
    wc = WordCloud(**wc_settings).generate(word_str)
    # Save the rendered image.
    wc.to_file("经典对线.png")

def main(av):
    """Read the saved danmaku file for `av` and render it as a word cloud.

    :param av: B站 video av id, e.g. "av95811021"
    """
    text = read_file(f"{av}.txt")
    cut_list = jieba_cut(text)
    gen_word_cloud(cut_list)


if __name__ == "__main__":
    av = "av95811021"
    main(av)
经验分享 程序员 微信小程序 职场和发展