1 Star 1 Fork 2

魏泽桦/patent_system

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
patentprocess.py 2.88 KB
一键复制 编辑 原始数据 按行查看 历史
aimerin 提交于 2021-03-24 23:25 . 2021-03-24
# encoding: utf-8
# Deprecated (弃用): all of the code below is commented out.
# import jieba
# import jieba.posseg as pseg
# import pandas as pd
# from pprint import pprint
# import re
# from jpype import *
# import gensim
#
# startJVM(getDefaultJVMPath(), "-Djava.class.path=E:\code\python\patent\hanlp\hanlp-1.5.0.jar;E:\code\python\patent\hanlp",
# "-Xms1g",
# "-Xmx1g") # 启动JVM,Linux需替换分号;为冒号:
# path="../patents03_22.csv"
#
# ##使用jieba分词
# ##分词并去除停用词
# def segment(docs):
# jieba.add_word("刀剪")
# jieba.add_word("左视图")
# jieba.add_word("右视图")
# documents=[' '.join(jieba.lcut(i)) for i in docs if isinstance(i, str)]
# stoplist = [i.strip() for i in open('./dataset/hit_stopwords.txt', encoding='utf-8').readlines()]
# texts = [[word for word in document.lower().split() if word not in stoplist]for document in documents]
# return texts
#
# def clear():
# patent_data = pd.read_csv(path)
# summary_list = patent_data['summary'].tolist()
# reg1 = "[^0-9A-Za-z\u4e00-\u9fa5\,\、\。]"
# punctuation = """0-9A-Za-z!' .?。"#$%&'。' , ( )()*+-//:;<=>@[\]^_`{|}~⦅⦆「」、、〃》',「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏<br> """
# re_punctuation = "[{}]+".format(punctuation)
#
# for i in range(len(summary_list)):
# summary_list[i] = re.sub(re_punctuation, " ", str(summary_list[i]))
#
#
# def csv2list(path):
# """
# :param path:csv文件路径
# :return: list built from the 'summary' column (not a DataFrame)
# """
# patent_data = pd.read_csv(path)
# summary_list = patent_data['summary'].tolist()
# for i in summary_list:
# if isinstance(i, float):
# summary_list.remove(i)
# return summary_list
#
# # def filter(sentences):
# # reg = "[^0-9A-Za-z\u4e00-\u9fa5\,]"
# # for sentence in sentences:
# # # s=re.sub(reg,'',text)
# # return s.replace(","," ").lstrip()
# if __name__=='__main__':
# clear()
# docs=csv2list(path)
# sentences=segment(docs)
# pprint(sentences)
# file=open('segment.txt','w', encoding='utf-8')
#
# with open("segment.txt", "w", encoding='utf-8') as fw:
# for sentence in sentences:
# for word in sentence:
# word.encode('utf-8')
# data=word.strip()
# if len(data)!=0:
# print(data)
# fw.write(data)
# fw.write("\n")
#
#
# model = gensim.models.Word2Vec(sentences, min_count=1)
# pprint(model.wv.most_similar('激光'))
# result=''
# patent_data=get_data(path)
# summary_list=patent_data['summary'].tolist()
# for summary in summary_list:
# if isinstance(summary, str):
# result=result+segment(summary).toString()
# ts=filter(result)
# print(len(set(ts))/len(ts))
#
#
#
#
#
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/aimerin/patent_system.git
git@gitee.com:aimerin/patent_system.git
aimerin
patent_system
patent_system
master

搜索帮助