# fund-crawler-and-data-analysis/wordcloud.py (branch: master)

import pandas as  pd
import re
import collections
import jieba
from pyecharts.charts import WordCloud
from pyecharts import options as opts

# Characters removed from the comment text before segmentation: tab, newline,
# space, digits, and a set of ASCII / full-width punctuation marks.
# Written as a raw string so that ``\.``, ``\d``, ``\(`` ... are passed to the
# regex engine verbatim instead of being (invalid) string-literal escapes.
_NOISE_PATTERN = re.compile(r'\t|\n| |；|\.|。|：|：\.|-|:|\d|;|、|，|\)|\(|\?|"')


def wordc(facode: str) -> None:
    """Render a word-cloud HTML page from the comment titles of one fund.

    Reads ``./file/基金<facode>的前100页评论.xlsx``, concatenates the values of
    its ``title`` column, strips whitespace/digits/punctuation, segments the
    text with jieba (precise mode), drops every token listed in
    ``./stopword.txt``, and renders the 100 most frequent remaining tokens
    to ``./file/基金<facode>词云图.html`` with pyecharts.

    Args:
        facode: Fund code used to build both the input and output file names.

    Raises:
        FileNotFoundError: If the Excel file or ``./stopword.txt`` is missing.
        KeyError: If the Excel sheet has no ``title`` column.
    """
    # 1. Load the data.
    data = pd.read_excel('./file/基金' + facode + '的前100页评论.xlsx')

    # Empty cells are read as NaN; drop them so the literal text "nan" does
    # not leak into the corpus.  ``join`` avoids quadratic ``+=`` building.
    string_data = ''.join(str(title) for title in data['title'].dropna())

    # 2. Strip noise characters, then segment.
    string_data = _NOISE_PATTERN.sub('', string_data)
    seg_list_exact = jieba.cut(string_data, cut_all=False)  # precise mode

    # 3. Load the stop-word list as a set for O(1) membership tests.
    with open('./stopword.txt', 'r', encoding="utf-8") as fp:
        remove_words = set(fp.read().split())
    # A bare space and the non-breaking space (\xa0) are never real words.
    excluded = remove_words | {' ', '\xa0'}

    # 4. Count token frequencies and keep the 100 most common.
    word_counts = collections.Counter(
        word for word in seg_list_exact if word not in excluded
    )
    word_counts_top = word_counts.most_common(100)

    # 5. Render the word cloud page.
    # See https://gallery.pyecharts.org/#/WordCloud/wordcloud_custom_mask_image
    WordCloud().add("", word_counts_top).render("./file/基金" + facode + "词云图.html")