# %%
# coding:utf-8
import os
import csv
import jieba
import jieba.analyse
import pandas as pd
import warnings
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
import preparing
warnings.filterwarnings("ignore")
# LDA via the gensim package
from gensim import corpora, models
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
# plt.rcParams["font.sans-serif"] = ["KaiTi"]  # set a Chinese-capable font
plt.rcParams["axes.unicode_minus"] = False  # render minus signs correctly in plots
# %% [markdown]
# ## Load the segmentation dictionary and the stop-word list
# %%
# Read the stop-word list and de-duplicate it
def get_stop_words_set(file_name):
    with open(file_name, "r", encoding="utf-8") as file:
        return set([line.strip() for line in file])
# %% [markdown]
# ## Load the text data and segment it
# %%
from LAC import LAC
# Load the LAC segmentation model
lac = LAC(mode="seg")
"""
# Single-sample input: a Unicode string
# text = u"LAC是个优秀的分词工具"
seg_result = lac.run(text)
# Batch input: a list of sentences (higher average throughput)
texts = [u"LAC是个优秀的分词工具", u"百度是一家高科技公司"]
seg_result = lac.run(texts)
print(seg_result)
"""
sample_path = "./dataset/handled.txt"
f = open(sample_path, mode="r", encoding="utf-8", errors="ignore")
lines = f.readlines()
f.close()
lines = filter(lambda x: x != "\n" and x != " ", lines)
lines = np.array(list(lines))[0:200]
print(len(lines), "\n", lines[0:10])
# Data to analyse: segment every line with LAC
def get_words_list():
    word_matrix = lac.run(list(lines))
    return word_matrix
# Remove stop words
def clean_stopword(word_matrix):
    cleaned = []
    stop_words_set = get_stop_words_set("./model_data/stop_words.txt")
    print("Loaded %d stop words" % len(stop_words_set))
    for l in word_matrix:
        l = list(
            set(l)
            .difference(stop_words_set)
            .difference({"\n"})
            .difference({" "})
        )
        if len(l) == 0:
            continue
        cleaned.append(l)
    return cleaned
# %%
# A list of documents: each element is the list of tokens from one line of text
word_list = get_words_list()
print("Before stop-word removal:", len(word_list), word_list[0:10])
word_list = clean_stopword(word_list)
print("After stop-word removal:", len(word_list), word_list[0:10])
# # Save the segmentation result
# test = pd.DataFrame(data = [word_list])
# test.to_csv('./testcsv.csv', encoding = 'utf_8_sig')
# %% [markdown]
# ## Choosing the number of topics: perplexity and coherence
# %%
# Build a dictionary from the token lists: each word is mapped to an integer id
word_dict = corpora.Dictionary(word_list)
# Count word frequencies and convert each document to a bag-of-words vector
corpus_list = [word_dict.doc2bow(text) for text in word_list]
print(len(corpus_list))
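# %%
# Quick illustrative check (not in the original script): each BoW document is a
# list of (token_id, count) pairs, and token ids map back to words via word_dict.
first_doc = corpus_list[0]
print("First document (BoW):", first_doc[:5])
print("Decoded:", [(word_dict[token_id], count) for token_id, count in first_doc[:5]])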
# %%
from gensim.models.coherencemodel import CoherenceModel
model_list = []
perplexity = []  # perplexity (lower is better)
coherence_values = []  # coherence (higher is better)
topics = range(2, 20, 1)
for num_topics in topics:
    lda_model = models.LdaModel(
        corpus=corpus_list,
        id2word=word_dict,
        random_state=1,
        num_topics=num_topics,
        passes=20,
        alpha="auto",
    )
    model_list.append(lda_model)  # LDA models for different topic counts
    # Perplexity of the model (lower is better)
    perplexity_values = lda_model.log_perplexity(corpus_list)
    perplexity.append(round(perplexity_values, 3))
    # print("Perplexity with %d topics: " % num_topics, round(perplexity_values, 3))
    # Coherence of the model (higher is better)
    coherencemodel = CoherenceModel(
        model=lda_model, corpus=corpus_list, coherence="u_mass"
    )
    coherence_values.append(round(coherencemodel.get_coherence(), 3))
    # print("Coherence with %d topics: " % num_topics, round(coherencemodel.get_coherence(), 3))
plt.title("topics and perplexity (lower is better)")
plt.plot(topics, perplexity)
plt.show()
plt.title("topics and coherence (higher is better)")
plt.plot(topics, coherence_values)
plt.show()
# %%
# The highest coherence value and the corresponding number of topics
print("Best coherence:", max(coherence_values))
for i in range(len(coherence_values)):
    if coherence_values[i] == max(coherence_values):
        print("Corresponding number of topics:", i + 2)
# %% [markdown]
# ## Visualising the LDA result
# %%
import pyLDAvis.gensim_models
# %%
# Final model
## corpus: the bag-of-words document matrix
## num_topics: number of topics
## passes: number of training passes
lda = models.LdaModel(
    corpus=corpus_list,
    id2word=word_dict,
    random_state=1,
    num_topics=5,
    passes=50,
    alpha="auto",
)
# %%
# Display the results
## lda: the trained model
d = pyLDAvis.gensim_models.prepare(
    lda, corpus_list, word_dict, mds="pcoa", sort_topics=True
)
# Topic distribution of each document
doc_topic_distributions = []
for doc_bow in corpus_list:
    doc_topics = lda.get_document_topics(doc_bow)
    doc_topic_distributions.append(doc_topics)
# Print the topic distribution of each document
for doc_idx, doc_topics in enumerate(doc_topic_distributions):
    for topic, probability in doc_topics:
        pass
        # print(f"Document {doc_idx}:", f" Topic {topic}: {probability:.4f}")
# Print the word distribution of each topic
num_top_words = 20  # number of keywords to show per topic
topics = lda.show_topics(num_topics=lda.num_topics, num_words=num_top_words, formatted=False)
for topic_idx, topic_words in topics:
    print(f"Topic {topic_idx}:", topic_words)
# Print the word distribution of each document
doc_word_distributions = []
for doc_bow in corpus_list:
    doc_word_distribution = {}
    for word_id, count in doc_bow:
        word = word_dict[word_id]
        doc_word_distribution[word] = count
    doc_word_distributions.append(doc_word_distribution)
for doc_idx, doc_word_distribution in enumerate(doc_word_distributions):
    print(f"Document {doc_idx}:", doc_word_distribution)
# Visualisation
pyLDAvis.save_html(d, "lda_show.html")  # save the result as an HTML file
# Show it in the notebook output cell
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus_list, word_dict)
vis
# %% [markdown]
# # Vector concatenation
# %%
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model
from sentence_transformers import SentenceTransformer, util
# SBERT sentence embeddings
model = SentenceTransformer(
    "DMetaSoul/sbert-chinese-general-v2", cache_folder="./model/"
)
sentences = lines
# Encode all sentences
embeddings = model.encode(sentences)
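# %%
# Illustrative sanity check (not in the original script): util.cos_sim gives
# pairwise cosine similarities between the SBERT embeddings, a quick way to
# confirm that semantically similar emails land close to each other.
print("Cosine similarity of the first five documents:\n", util.cos_sim(embeddings[:5], embeddings[:5]))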
topic_embeddings = []
for doc_bow in corpus_list:
    # Fixed-length topic-distribution vector for each document
    doc_vec = np.zeros(lda.num_topics)
    for topic_id, prob in lda.get_document_topics(doc_bow, minimum_probability=0.0):
        doc_vec[topic_id] = prob
    topic_embeddings.append(doc_vec)
topic_embeddings = np.array(topic_embeddings)
print("Topic vectors:", topic_embeddings.shape)
print("Document vectors:", embeddings.shape)
# Stacked autoencoder
input_topics_embeddings = Input(shape=(topic_embeddings.shape[1],))
input_embeddings = Input(shape=(embeddings.shape[1],))
# Stack (concatenate) the two input vectors
stacked_vectors = Concatenate()([input_topics_embeddings, input_embeddings])
# Encoder part; the original 16-dimensional variant is commented out
# encoded = Dense(16, activation="relu")(stacked_vectors)
encoded = Dense(1024, activation="relu")(stacked_vectors)
# Encoder model
encoder = Model(
    inputs=[input_topics_embeddings, input_embeddings], outputs=encoded, name="encoder"
)
# Print the model structure
encoder.summary()
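# %%
# Minimal sketch (assumption, not from the original script): the encoder above is
# never trained, so its weights stay at their random initialisation. A stacked
# autoencoder would add a decoder that reconstructs the concatenated input and
# train end to end; the layer size, loss and epochs below are illustrative only.
stacked_dim = topic_embeddings.shape[1] + embeddings.shape[1]
decoded = Dense(stacked_dim, activation="linear", name="decoder")(encoded)
autoencoder = Model(
    inputs=[input_topics_embeddings, input_embeddings], outputs=decoded, name="autoencoder"
)
autoencoder.compile(optimizer="adam", loss="mse")
autoencoder.fit(
    [topic_embeddings, embeddings],
    np.concatenate([topic_embeddings, embeddings], axis=1),  # reconstruction target
    epochs=20,
    batch_size=16,
    verbose=0,
)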
# %% [markdown]
# # Clustering the stacked vectors with k-means
# %%
from sklearn.cluster import KMeans
from sklearn import preprocessing
stack_encoded_vector = encoder.predict([topic_embeddings, embeddings])
print(stack_encoded_vector.shape, stack_encoded_vector)
train_x = stack_encoded_vector
df = pd.DataFrame(train_x)
kmeans = KMeans(n_clusters=12, init="k-means++", random_state=42, n_init=10)
# k-means clustering
kmeans.fit(train_x)
predict_y = kmeans.predict(train_x)
# Merge the clustering result back into the original data
label = pd.DataFrame(lines)[0].map(lambda x: x.split("##")[0])
title = pd.DataFrame(lines)[0].map(lambda x: x.split("##")[1])
content = pd.DataFrame(lines)[0].map(lambda x: x.split("##")[2])
keyword_dis = pd.DataFrame(topics)  # keyword distribution of each topic (output of show_topics above)
y = pd.DataFrame(predict_y)
result = pd.concat((label, title, content, y, keyword_dis), axis=1)
result.rename({0: "聚类"}, axis=1, inplace=True)
result.to_csv("./result/result.csv", encoding="gb2312", errors="ignore")
print("============", result.head)