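# NOTE: hosting-page notice captured with this file (not part of the program): "Code pull complete; the page will refresh automatically."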
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from pprint import pprint
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
### Train an LDA topic model with sklearn
###
###
def _read_joined(path):
    """Return the text of the UTF-8 file at ``path`` with newlines replaced by spaces."""
    # The ``with`` statement closes the file on exit; no explicit close() is needed.
    with open(path, 'r', encoding='utf-8') as f:
        return ' '.join(f.read().split('\n'))


# Stop-word list; each entry is stripped of surrounding whitespace.
# NOTE(review): the file content is split on the three-character sequence `","`.
# If the stop-word file actually holds one word per line, this yields a single
# giant entry -- confirm the file format before relying on the filter.
with open('./dataset/hit_stopwords.txt', 'r', encoding="utf-8") as f:
    stopwords = [entry.strip() for entry in f.read().split('","')]

# Three pre-segmented documents, each flattened onto a single line.
# They are not referenced by the rest of the visible script.
res1 = _read_joined('./segment/segment_CN102804030A.txt')
res2 = _read_joined('./segment/segment_CN106607772A.txt')
res3 = _read_joined('./segment/segment_CN111024533A.txt')
def load_data(path='./segment/segment_03_26.txt'):
    """Read the corpus file and return its lines.

    Args:
        path: Location of a UTF-8 text file. Defaults to the corpus path this
            script originally hard-coded, so ``load_data()`` behaves as before.

    Returns:
        A list with one string per line of the file, as produced by
        ``readlines()`` (trailing newline characters are kept).

    Raises:
        OSError: If the file cannot be opened.
    """
    # The ``with`` statement closes the file on exit; no explicit close() is needed.
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()
def top_words_per_topic(components, vocab, n_top_words):
    """Map each topic index to its highest-weighted vocabulary entries.

    Args:
        components: 2-D array-like; row ``k`` holds the per-word weights of
            topic ``k`` (e.g. ``lda.components_``).
        vocab: Sequence whose ``i``-th item is the word for column ``i``.
        n_top_words: Maximum number of words to keep per topic.

    Returns:
        dict mapping topic index -> list of words, heaviest weight first.
    """
    topic_words = {}
    for topic, comp in enumerate(components):
        # np.argsort returns the indices that order ``comp`` ascending,
        # e.g. np.argsort([3, 7, 1, 0, 3, 6]) -> [3, 2, 0, 4, 5, 1].
        # Reversing gives descending weight order; the slice keeps the
        # first ``n_top_words`` indices.
        word_idx = np.argsort(comp)[::-1][:n_top_words]
        # Store the words most relevant to the topic.
        topic_words[topic] = [vocab[i] for i in word_idx]
    return topic_words


if __name__ == '__main__':
    corpus = load_data()
    cntVector = CountVectorizer(stop_words=stopwords)
    cntTf = cntVector.fit_transform(corpus)
    # ``n_topics`` was renamed to ``n_components`` and later removed from
    # scikit-learn; passing the old keyword raises a TypeError on current releases.
    lda = LatentDirichletAllocation(n_components=4, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50,
                                    random_state=100)
    docres = lda.fit_transform(cntTf)
    # ``get_feature_names`` was removed in newer scikit-learn in favour of
    # ``get_feature_names_out``; fall back to the old name on older releases.
    if hasattr(cntVector, 'get_feature_names_out'):
        # Convert numpy string scalars to plain ``str`` so the printed dict
        # looks the same as with the old list-returning API.
        vocab = [str(word) for word in cntVector.get_feature_names_out()]
    else:
        vocab = cntVector.get_feature_names()
    n_top_words = 5
    pprint(docres)
    pprint(lda.components_)
    topic_words = top_words_per_topic(lda.components_, vocab, n_top_words)
    print(topic_words)
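# NOTE: hosting-page moderation notice captured with this file (not part of the program): "This section may contain content unsuitable for display, so the page does not show it. You can review and modify it with the editing tools."
# "If you confirm the content involves no inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, or anything that violates national laws and regulations, you can click Submit to appeal and we will handle it as soon as possible."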