1 Star 0 Fork 2

ZSW/patent_system

forked from 魏泽桦/patent_system 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
cut_data.py 2.49 KB
一键复制 编辑 原始数据 按行查看 历史
aimerin 提交于 2021-04-26 16:47 . 2021-04-26 简单实现了LDA模型
# -*- coding=utf8 -*-
# 使用jieba/hanlp进行分词
import jieba
import re
from zhon.hanzi import punctuation
from pprint import pprint
import string
from string import digits
import pandas as pd
import jieba.posseg
jieba.load_userdict("./dataset/dict.txt")
from pyhanlp import *
# Default input patent text file and the derived output path.
patent_path = 'dataset/CN102804030A.txt'
# NOTE(review): concatenating the full relative path yields
# "./segment/segment_dataset/CN102804030A.txt"; probably only the file's
# basename was intended — confirm. Neither constant is used in __main__ below.
save_path = "./segment/segment_" + patent_path
def clean_data(text):
    """Normalize a list of raw text strings before word segmentation.

    Strips newlines and surrounding whitespace, drops empty entries,
    removes parenthesized content (ASCII and fullwidth), strips Chinese
    and English punctuation, and deletes all digits.

    Args:
        text: iterable of raw strings (e.g. patent summaries).

    Returns:
        list[str]: cleaned strings with empty entries removed.
    """
    text = [i.strip('\n') for i in text]
    text = [x.strip() for x in text if x.strip() != '']
    for i in range(len(text)):
        # Remove ASCII parentheses and their contents.
        text[i] = re.sub(r'\(.*?\)', ' ', text[i])
        # Remove fullwidth (Chinese) parentheses and their contents.
        # BUGFIX: the original pattern '(.*?\)' left '(' unescaped, so it
        # deleted everything from the start of the string to the first ')'.
        text[i] = re.sub(r'（.*?）', ' ', text[i])
        # Remove Chinese punctuation (zhon.hanzi.punctuation).
        text[i] = re.sub("[{}]+".format(punctuation), " ", text[i])
        # Remove English punctuation (string.punctuation).
        text[i] = re.sub("[{}]+".format(string.punctuation), " ", text[i])
        # Delete all digits.
        text[i] = text[i].translate(str.maketrans('', '', digits))
    return text
def cut_data(text, choose=1):
    """Tokenize each string in *text* into a list of words.

    Args:
        text: list of cleaned strings.
        choose: 1 -> segment with HanLP; any other value -> segment with
            jieba and filter against the HIT stopword list.

    Returns:
        list[list[str]]: one token list per input string; tokens of
        length <= 1 are dropped in both modes.
    """
    if choose == 1:
        # HanLP segmentation: keep only multi-character terms.
        sentence_list = []
        for line in text:
            terms = HanLP.segment(line)
            sentence_list.append([t.word for t in terms if len(t.word) > 1])
    else:
        # jieba segmentation (non-string entries are skipped).
        documents = [' '.join(jieba.lcut(line)) for line in text if isinstance(line, str)]
        # Load the stopword list.  'with' closes the handle (the original
        # leaked it), and a set gives O(1) membership tests instead of a
        # linear scan per word.
        with open('dataset/hit_stopwords.txt', encoding='utf-8') as f:
            stoplist = {line.strip() for line in f}
        sentence_list = [
            [w for w in doc.lower().split() if w not in stoplist and len(w) > 1]
            for doc in documents
        ]
    return sentence_list
if __name__ == "__main__":
    # 1. Load the patent summaries from the CSV export.
    patent_data = pd.read_csv('./dataset/patents03-26.csv')
    summary = patent_data['summary'].tolist()
    # 2. Clean: drop blanks/newlines, parenthesized text, punctuation, digits.
    summary = clean_data(summary)
    pprint(summary[13])
    # 3. Segment each summary into a word list.
    sentence_list = cut_data(summary)
    # 4. Persist one comma-separated sentence per line.
    # Dropped the original's no-op `word.encode('utf-8')` (its return
    # value was discarded) and the redundant fw.close() — the `with`
    # statement already closes the file.
    with open('segment/segment_03_26.txt', "w", encoding='utf-8') as fw:
        for sentence in sentence_list:
            for word in sentence:
                fw.write(word.strip())
                fw.write(",")
            fw.write("\n")
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/zhushangwu/patent_system.git
git@gitee.com:zhushangwu/patent_system.git
zhushangwu
patent_system
patent_system
master

搜索帮助