# NOTE(review): Gitee page artifact, not source code — commented out so the file parses:
# 代码拉取完成,页面将自动刷新
# -*- coding: utf-8 -*-
# 弃用
# import jieba
# import jieba.posseg as pseg
# import pandas as pd
# from pprint import pprint
# import re
# from jpype import *
# import gensim
#
# startJVM(getDefaultJVMPath(), "-Djava.class.path=E:\code\python\patent\hanlp\hanlp-1.5.0.jar;E:\code\python\patent\hanlp",
# "-Xms1g",
# "-Xmx1g") # 启动JVM,Linux需替换分号;为冒号:
# path="../patents03_22.csv"
#
# ##使用jieba分词
# ##分词并去除停用词
# def segment(docs):
# jieba.add_word("刀剪")
# jieba.add_word("左视图")
# jieba.add_word("右视图")
# documents=[' '.join(jieba.lcut(i)) for i in docs if isinstance(i, str)]
# stoplist = [i.strip() for i in open('./dataset/hit_stopwords.txt', encoding='utf-8').readlines()]
# texts = [[word for word in document.lower().split() if word not in stoplist]for document in documents]
# return texts
#
# def clear():
# patent_data = pd.read_csv(path)
# summary_list = patent_data['summary'].tolist()
# reg1 = "[^0-9A-Za-z\u4e00-\u9fa5\,\、\。]"
# punctuation = """0-9A-Za-z!' .?。"#$%&'。' , ( )()*+-//:;<=>@[\]^_`{|}~⦅⦆「」、、〃》',「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏<br> """
# re_punctuation = "[{}]+".format(punctuation)
#
# for i in range(len(summary_list)):
# summary_list[i] = re.sub(re_punctuation, " ", str(summary_list[i]))
#
#
# def csv2list(path):
# """
# :param path:csv文件路径
# :return: dataframe
# """
# patent_data = pd.read_csv(path)
# summary_list = patent_data['summary'].tolist()
# for i in summary_list:
# if isinstance(i, float):
# summary_list.remove(i)
# return summary_list
#
# # def filter(sentences):
# # reg = "[^0-9A-Za-z\u4e00-\u9fa5\,]"
# # for sentence in sentences:
# # # s=re.sub(reg,'',text)
# # return s.replace(","," ").lstrip()
# if __name__=='__main__':
# clear()
# docs=csv2list(path)
# sentences=segment(docs)
# pprint(sentences)
# file=open('segment.txt','w', encoding='utf-8')
#
# with open("segment.txt", "w", encoding='utf-8') as fw:
# for sentence in sentences:
# for word in sentence:
# word.encode('utf-8')
# data=word.strip()
# if len(data)!=0:
# print(data)
# fw.write(data)
# fw.write("\n")
#
#
# model = gensim.models.Word2Vec(sentences, min_count=1)
# pprint(model.wv.most_similar('激光'))
# result=''
# patent_data=get_data(path)
# summary_list=patent_data['summary'].tolist()
# for summary in summary_list:
# if isinstance(summary, str):
# result=result+segment(summary).toString()
# ts=filter(result)
# print(len(set(ts))/len(ts))
#
#
#
#
#
# NOTE(review): Gitee moderation-notice artifacts, not source code — commented out so the file parses:
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。