# One-time environment setup: shell out to pip3 to install the third-party
# packages the rest of this script needs.
# NOTE(review): the original mirror URL was garbled ("https://pypi./simple/");
# restored to the official PyPI index — swap in a local mirror if needed.
import os
import time

# Set literal de-duplicates the package names ('pandas' was listed twice).
LIBS = {
    'lxml', 'requests', 'pandas', 'numpy', 'you-get',
    'opencv-python', 'fake_useragent', 'matplotlib', 'moviepy',
}

for lib in LIBS:
    # os.system returns the command's exit status; 0 means pip succeeded.
    # The original printed "下载成功" unconditionally and wrapped everything
    # in a bare except, which hid per-package failures.
    status = os.system(f'pip3 install -i https://pypi.org/simple/ {lib}')
    if status == 0:
        print(lib + '下载成功')
    else:
        print(lib + '下载失败')
# 导入模块 (import modules)
# 在这里统一先导入所需的模块 (import all required modules up front, in one place)
# Standard library
import csv
import datetime
import os
import re
from concurrent.futures import ThreadPoolExecutor

# Third-party
import cv2
import jieba
import matplotlib.pyplot as plt
import moviepy
import numpy as np
import pandas as pd
import requests
from fake_useragent import UserAgent
from lxml import etree
from PIL import Image
from wordcloud import WordCloud
# Shared crawler state: a User-Agent factory for request headers and the
# wall-clock start time used later to report the total run time.
ua = UserAgent()
start_time = datetime.datetime.now()
def Grab_barrage(date):
    """Fetch one day's historical danmaku (bullet comments) for the target
    video and write the accumulated list to danmu.xlsx.

    Parameters
    ----------
    date : str
        Day to fetch, formatted '%Y-%m-%d' (the API's required format).
    """
    headers = {
        'origin': 'https://www.bilibili.com',
        'referer': ('https://www.bilibili.com/video/BV1jZ4y1K78N'
                    '?from=search&seid=1084505810439035065'),
        'cookie': '',  # NOTE(review): this API needs a logged-in SESSDATA cookie
        # fake_useragent exposes `random` as a property, not a method;
        # the original wrote ua.random(), which raises TypeError.
        'user-agent': ua.random,
    }
    params = {
        'type': 1,
        'oid': '222413092',  # the target video's oid
        'date': date,
    }
    r = requests.get(url, params=params, headers=headers)
    r.encoding = 'utf-8'
    # The response is XML; each danmaku looks like <d p="...">text</d>.
    # The original pattern mixed single quotes ('<d p='.*?'>') and could not
    # even parse; it also bound the result to `comment` but looped `comments`.
    comments = re.findall(r'<d p=".*?">(.*?)</d>', r.text)
    df.extend(comments)
    # NOTE(review): each worker rewrites the whole file (kept from the
    # original); writing once after the pool drains would be safer.
    pd.DataFrame(df).to_excel('danmu.xlsx')


def main():
    """Fetch every date in date_list with a small thread pool, then report
    the elapsed wall-clock time."""
    with ThreadPoolExecutor(max_workers=4) as executor:
        executor.map(Grab_barrage, date_list)
    # ThreadPoolExecutor's context manager waits for all tasks to finish.
    delta = (datetime.datetime.now() - start_time).total_seconds()
    print(f'用时:{delta}s')


if __name__ == '__main__':
    # 目标url — historical-danmaku endpoint (read as a global by Grab_barrage).
    url = 'https://api.bilibili.com/x/v2/dm/history'
    start, end = '20200808', '20200908'
    date_list = [x for x in pd.date_range(start, end).strftime('%Y-%m-%d')]
    # Shared accumulator: the original appended to `df` without ever
    # defining it, which raised NameError on the first request.
    df = []
    count = 0
    main()
def func(s):
    """Collapse immediately-repeated substrings in *s*.

    Short comments often repeat a phrase back-to-back ("哈哈哈哈",
    "好棒好棒好棒").  For each candidate chunk length ``i`` (up to half the
    string), any run of adjacent identical length-``i`` chunks is reduced
    to a single copy.  The original fused the keyword to the name
    (``deffunc``), which is a syntax error.

    Parameters
    ----------
    s : str
        The raw comment text.

    Returns
    -------
    str
        The text with adjacent repetitions removed.
    """
    # len(s) // 2 is equivalent to the original int(len(s) / 2) for a
    # non-negative length.  j may exceed the shrunken string's length after
    # s is rebuilt — out-of-range slices are simply empty, so that's safe.
    for i in range(1, len(s) // 2 + 1):
        for j in range(len(s)):
            if s[j:j + i] == s[j + i:j + 2 * i]:
                k = j + i
                # Walk forward over every further repetition of the chunk.
                while s[k:k + i] == s[k + i:k + 2 * i] and k < len(s):
                    k = k + i
                # Keep one copy (s[:j] ends just before the first repeat).
                s = s[:j] + s[k:]
    return s


# Apply to the short-comment column.  NOTE(review): `data` is a DataFrame
# loaded in an earlier, unseen part of this script — confirm it is defined
# before this line runs.
data['短评'] = data['短评'].apply(func)
# 添加停用词和自定义词组 (add stop words and custom phrases)
# Prerequisites for the word-cloud section.
import pandas as pd
import jieba
import matplotlib.pyplot as plt
from tkinter import _flatten
from wordcloud import WordCloud
# Load the user dictionary so jieba keeps domain phrases together, then
# read the stop-word list into one string for later filtering.
jieba.load_userdict('./词云图//add.txt')
with open('./词云图//stoplist.txt', 'r', encoding='utf-8') as f:
    stopWords = f.read()
# 生成词云图 (generate the word cloud)
import collections
import re

import jieba
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from wordcloud import WordCloud

# Read the raw danmaku corpus.  encoding='utf-8' added: the original relied
# on the platform default encoding, which breaks on Chinese text under
# non-UTF-8 locales (e.g. GBK on Chinese Windows).
with open('barrages.txt', encoding='utf-8') as f:
    data = f.read()
jieba.load_userdict('./词云图//add.txt')
# 读取数据 — re-read the danmaku corpus and custom dictionary.
# encoding='utf-8' added: the original used the platform default encoding,
# which is unsafe for Chinese text on non-UTF-8 systems.
with open('barrages.txt', encoding='utf-8') as f:
    data = f.read()
jieba.load_userdict('./词云图//add.txt')

# 文本预处理: strip everything but runs of Chinese characters
# (U+4E00–U+9FA5), then join the runs with '/' so jieba can re-segment.
new_data = re.findall('[\u4e00-\u9fa5]+', data, re.S)
new_data = '/'.join(new_data)