代码拉取完成,页面将自动刷新
同步操作将从 魏泽桦/patent_system 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
# -*- coding=utf8 -*-
# Word segmentation with jieba / HanLP.
import jieba
import re
from zhon.hanzi import punctuation
from pprint import pprint
import string
from string import digits
import pandas as pd
import jieba.posseg
# Load the user dictionary into jieba at import time; the path is relative
# to the current working directory.
jieba.load_userdict("./dataset/dict.txt")
# Star import; presumably the source of the `HanLP` name used by cut_data
# (no other import in the visible file provides it) -- confirm.
from pyhanlp import *
# Save paths
patent_path = 'dataset/CN102804030A.txt'
# NOTE(review): patent_path already starts with 'dataset/', so this evaluates
# to './segment/segment_dataset/CN102804030A.txt'. Nothing in the visible part
# of the file reads save_path; confirm whether only the base filename was
# intended before relying on it.
save_path = "./segment/segment_" + patent_path
def clean_data(text):
    """Normalize raw text entries ahead of word segmentation.

    Each entry is stripped of surrounding whitespace; entries that are empty
    afterwards are dropped. Parenthesised asides (ASCII ``()`` and full-width
    ``()``) are replaced by a space, runs of Chinese punctuation and runs of
    ASCII punctuation are each replaced by a single space, and ASCII digits
    are deleted.

    Args:
        text: Iterable of raw strings. Entries that are not ``str`` (for
            example a float NaN standing in for a missing value) are skipped
            instead of raising ``AttributeError``.

    Returns:
        A new list of cleaned strings. It may be shorter than the input
        because blank and non-string entries are dropped.
    """
    # All patterns are loop-invariant, so build them once up front.
    ascii_parens = re.compile(r'\(.*?\)')
    # The previous second pattern, '(.*?\)', had an unterminated group and
    # made re.sub raise re.error. The intent (a second "remove brackets and
    # their content" pass) is the full-width parenthesis pair.
    fullwidth_parens = re.compile(r'(.*?)')
    # re.escape keeps ']', '\\', '^' and '-' literal inside the character
    # class; unescaped, the backslash in string.punctuation was swallowed as
    # an escape for ']' and never removed.
    chinese_punct = re.compile("[{}]+".format(re.escape(punctuation)))
    ascii_punct = re.compile("[{}]+".format(re.escape(string.punctuation)))
    delete_digits = str.maketrans('', '', digits)

    cleaned = []
    for entry in text:
        if not isinstance(entry, str):
            continue
        entry = entry.strip()
        if not entry:
            continue
        # Brackets must be handled before punctuation removal, otherwise the
        # bracket characters would already be gone and their content kept.
        entry = ascii_parens.sub(' ', entry)
        entry = fullwidth_parens.sub(' ', entry)
        entry = chinese_punct.sub(' ', entry)  # Chinese punctuation
        entry = ascii_punct.sub(' ', entry)  # ASCII punctuation
        cleaned.append(entry.translate(delete_digits))
    return cleaned
def cut_data(text, choose=1):
    """Segment each text entry into a list of words.

    Args:
        text: Iterable of strings to segment.
        choose: Segmenter selector. ``1`` (the default) uses
            ``HanLP.segment``. Any other value uses ``jieba.lcut``, then
            lower-cases the tokens and removes stop words read from
            ``dataset/hit_stopwords.txt``.

    Returns:
        A list holding one list of words per segmented entry. Words of
        length 1 are discarded in both modes. In jieba mode, entries that
        are not ``str`` are skipped, so the result can be shorter than the
        input.

    Raises:
        OSError: In jieba mode, if the stop-word file cannot be opened.
    """
    if choose == 1:
        # HanLP segmentation
        return [
            [term.word for term in HanLP.segment(entry) if len(term.word) > 1]
            for entry in text
        ]

    # jieba segmentation
    # Load the stop-word dictionary. The context manager closes the file
    # (it used to be left open), and a set makes each membership test O(1)
    # without changing which words are filtered.
    with open('dataset/hit_stopwords.txt', encoding='utf-8') as stop_file:
        stopwords = {line.strip() for line in stop_file}

    sentence_list = []
    for entry in text:
        if not isinstance(entry, str):
            continue
        # Join and re-split so that whitespace-only tokens emitted by jieba
        # disappear and tokens containing spaces are broken apart.
        tokens = ' '.join(jieba.lcut(entry)).lower().split()
        sentence_list.append(
            [word for word in tokens if word not in stopwords and len(word) > 1]
        )
    return sentence_list
if __name__ == "__main__":
    # Script entry point: read the patent summaries, clean them, segment them
    # and write one comma-separated line of words per summary.
    patent_data = pd.read_csv('./dataset/patents03-26.csv')
    # Empty CSV cells are loaded as float NaN, which has no .strip(); drop
    # them before cleaning.
    summary = patent_data['summary'].dropna().tolist()

    # 1. Clean the data: strip whitespace, drop blank entries, remove
    #    bracketed text, punctuation and digits.
    summary = clean_data(summary)
    # Debug preview of a single cleaned entry; guarded so that a dataset with
    # fewer than 14 usable summaries does not abort the run with IndexError.
    if len(summary) > 13:
        pprint(summary[13])

    # 2. Segment into words.
    sentence_list = cut_data(summary)

    # 3. Save to disk. Every word is followed by a comma (including the last
    #    one on a line), matching the existing output format. The `with`
    #    block closes the file, so no explicit close() is needed.
    with open('segment/segment_03_26.txt', "w", encoding='utf-8') as fw:
        for sentence in sentence_list:
            fw.write("".join(word.strip() + "," for word in sentence))
            fw.write("\n")
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违反国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。