# patent_system/patent_LDA.py

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from pprint import pprint
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

### Train an LDA topic model with sklearn
###
###

# Load the stop-word list. The raw file text is split on the literal
# separator `","` and every resulting entry is stripped of surrounding
# whitespace before being used as a stop word.
with open('./dataset/hit_stopwords.txt', 'r', encoding="utf-8") as f:
    line = f.read().split('","')

stopwords = [entry.strip() for entry in line]


# Read three individual documents (presumably word-segmented patent texts,
# judging by the file names) and flatten each onto a single line by turning
# every newline into one space.
with open('./segment/segment_CN102804030A.txt', 'r', encoding='utf-8') as f:
    res1 = f.read().replace('\n', ' ')

with open('./segment/segment_CN106607772A.txt', 'r', encoding='utf-8') as f:
    res2 = f.read().replace('\n', ' ')

with open('./segment/segment_CN111024533A.txt', 'r', encoding='utf-8') as f:
    res3 = f.read().replace('\n', ' ')

def load_data(path='./segment/segment_03_26.txt'):
    """Read the training corpus, one document per line.

    Args:
        path: UTF-8 text file to read. Defaults to the corpus file this
            script was originally hard-wired to.

    Returns:
        The lines of the file as a list of strings. Each line keeps its
        trailing newline, exactly as ``readlines`` returns it.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # The `with` block closes the file; no explicit close() is needed.
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()


if __name__ == '__main__':
    # Every line of the corpus file is treated as one document.
    corpus = load_data()

    # Bag-of-words term counts, with the loaded stop words excluded.
    cntVector = CountVectorizer(stop_words=stopwords)
    cntTf = cntVector.fit_transform(corpus)

    # The topic count is passed as `n_components`; the former `n_topics`
    # keyword was deprecated and then removed from scikit-learn, so using it
    # raises a TypeError on current releases.
    lda = LatentDirichletAllocation(n_components=4, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50,
                                    random_state=100)
    # docres: one row per document, one column per topic.
    docres = lda.fit_transform(cntTf)

    # `get_feature_names` was replaced by `get_feature_names_out` in newer
    # scikit-learn; prefer the new name but keep working on older installs.
    if hasattr(cntVector, 'get_feature_names_out'):
        vocab = cntVector.get_feature_names_out()
    else:
        vocab = cntVector.get_feature_names()

    n_top_words = 5
    topic_words = {}
    pprint(docres)
    pprint(lda.components_)
    for topic, comp in enumerate(lda.components_):
        # np.argsort returns the indices that would sort `comp` in ASCENDING
        # order, e.g. np.argsort([3, 7, 1, 0, 3, 6]) -> [3, 2, 0, 4, 5, 1].
        # Reversing that gives indices ordered from the largest weight to the
        # smallest; the first n_top_words of them are the vocabulary indices
        # of the words weighted most heavily in this topic.
        word_idx = np.argsort(comp)[::-1][:n_top_words]

        # Store the words most relevant to the topic.
        topic_words[topic] = [vocab[i] for i in word_idx]

    print(topic_words)