代码拉取完成,页面将自动刷新
# -*- coding: utf-8 -*-
import os
import pathlib
import re
import shutil
import time
import auto_ke
import pandas as pd
from tqdm import tqdm
from pke.unsupervised import FirstPhrases, TfIdf, KPMiner, YAKE
from pke.unsupervised import TextRank, SingleRank, TopicRank, TopicalPageRank, PositionRank, MultipartiteRank
r""" Evaluate this algorithm and the other PKE algorithms on the SemEval dataset.
https://github.com/ScienceIE/scienceie.github.io/blob/master/resources/semeval_articles_test.zip
python .\test\eval_semeval.py .\data\semeval_truth\ .\data\semeval_pred\ types
python .\test\eval_semeval.py .\data\phrase_semeval_truth\ .\data\semeval_pred\ types
"""
# Directory whose .txt files ake_run reads; its predicted .ann files are written alongside them.
pred_path = "../data/semeval_pred"
# Directory holding the ground truth: .txt documents (read by run_pke_models)
# and .ann annotations (read by make_phrase_truth).
truth_path = "../data/semeval_truth"
# Directory where make_phrase_truth writes the annotations filtered down to
# entries whose text contains a space.
phrase_truth_path = "../data/phrase_semeval_truth"
def ake_run():
    """Run the AutomationKeywordExtractor on every ``.txt`` file in ``pred_path``.

    For each text file a tab-separated ``.ann`` file with the same base name is
    written next to it.  Each row holds three columns: the record's original
    position in the extractor output, ``"Material <start> <end>"`` and the
    keyword text.  Rows are ordered by ascending ``start``.
    """
    txt_names = [f for f in os.listdir(pred_path) if f.endswith(".txt")]
    ake = auto_ke.AutomationKeywordExtractor()
    for txt_name in txt_names:
        txt_file = os.path.join(pred_path, txt_name)
        # Swap only the extension; a plain str.replace("txt", "ann") would also
        # rewrite any "txt" occurring elsewhere in the file name or directory.
        ann_file = os.path.splitext(txt_file)[0] + ".ann"

        # Read the text verbatim.  Joining the lines with " " would insert an
        # extra character after every newline and shift the start/end values
        # away from the on-disk text.
        # NOTE(review): assumes start/end are character offsets into the text
        # passed to extract() -- confirm against auto_ke.
        with open(txt_file, "r", encoding="utf-8") as txt_f:
            txt_content = txt_f.read()

        kw_records = ake.extract(txt_content, mode="ann")
        kw_df = pd.DataFrame.from_records(kw_records)
        if kw_df.empty:
            # Nothing extracted: there is no "start" column to sort by, so emit
            # an empty annotation file instead of failing on sort_values.
            with open(ann_file, "w", encoding="utf-8"):
                pass
            continue

        # reset_index() keeps the pre-sort position as the "index" column,
        # which becomes the first field of every output row.
        kw_df = kw_df.sort_values(by="start", ascending=True).reset_index()
        kw_df["combined"] = (
            "Material " + kw_df["start"].astype(str) + " " + kw_df["end"].astype(str)
        )
        kw_df[["index", "combined", "keyword"]].to_csv(
            ann_file, sep="\t", header=False, index=False
        )
def make_phrase_truth():
    """Derive the phrase-only ground truth from the annotations in ``truth_path``.

    Every ``.ann`` file is read as a header-less, tab-separated table.  Rows
    with any missing field are dropped, and only rows whose third column
    contains a space are kept.  The result is written under the same file name
    into ``phrase_truth_path``.  Files without usable rows produce an empty
    output file.
    """
    # to_csv does not create missing directories.
    os.makedirs(phrase_truth_path, exist_ok=True)

    ann_names = [f for f in os.listdir(truth_path) if f.endswith(".ann")]
    for ann_name in ann_names:
        print(ann_name)
        phrase_file = os.path.join(phrase_truth_path, ann_name)
        try:
            ann_df = pd.read_csv(os.path.join(truth_path, ann_name), sep="\t", header=None)
        except pd.errors.EmptyDataError:
            # A completely empty annotation file has no phrases; mirror it as
            # an empty file so every truth file still has a counterpart.
            with open(phrase_file, "w", encoding="utf-8"):
                pass
            continue

        ann_df = ann_df.dropna()
        if 2 in ann_df.columns:
            # astype(str) keeps the .str accessor usable when the column was
            # parsed as non-string (e.g. all-NaN -> float, or purely numeric).
            has_space = ann_df[2].astype(str).str.contains(" ", regex=False).astype(bool)
            ann_df = ann_df[has_space]
        else:
            # Fewer than three columns: no text column, hence no phrases.
            ann_df = ann_df.iloc[0:0]
        ann_df.to_csv(phrase_file, sep="\t", header=False, index=False)
def run_pke_models():
    """Run every supported PKE keyphrase extraction algorithm on the corpus.

    For each model and each cut-off ``n`` in (5, 10, 15), every ``.txt`` file
    in ``truth_path`` is processed: the ``n`` best keyphrases are extracted,
    all of their case-insensitive occurrences in the text are located, and the
    matches are written as a tab-separated ``.ann`` file (row number,
    ``"Material <start> <end>"``, matched text) under
    ``../data/<model>-<n>-pred/``.  The wall-clock time of each (model, n)
    pass is appended to ``latency.log`` as ``model,n,seconds``.
    """
    models = [
        ("FirstPhrases", FirstPhrases),
        ("TfIdf", TfIdf),
        ("KPMiner", KPMiner),
        ("YAKE", YAKE),
        ("TextRank", TextRank),
        ("SingleRank", SingleRank),
        ("TopicRank", TopicRank),
        ("TopicalPageRank", TopicalPageRank),
        ("PositionRank", PositionRank),
        ("MultipartiteRank", MultipartiteRank),
    ]
    # The corpus listing depends on neither the model nor n, so build it once.
    txt_names = [f for f in os.listdir(truth_path) if f.endswith(".txt")]

    for model_name, model in models:  # e.g. TF-IDF, TextRank
        print(str(model_name))
        for n in [5, 10, 15]:
            print(f"==Top{n}==")
            output_dir = f"../data/{model_name}-{n}-pred/"
            pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

            start_time = time.time()
            for txt_name in tqdm(txt_names):
                # Read the text verbatim so the match offsets computed below
                # refer to positions in the file as stored on disk; joining
                # lines with " " would shift them after every newline.
                with open(os.path.join(truth_path, txt_name), "r", encoding="utf-8") as txt_f:
                    txt_content = txt_f.read()

                # Use a fresh extractor for each document so candidates and
                # weights of a previous document cannot carry over.
                # NOTE(review): some pke releases may not reset state inside
                # load_document -- confirm against the installed version.
                extractor = model()
                extractor.load_document(input=txt_content)
                extractor.candidate_selection()
                extractor.candidate_weighting()
                # get_n_best yields (keyphrase, score) pairs; only the text is needed.
                keyphrases = [w for (w, _score) in extractor.get_n_best(n=n)]
                print(keyphrases)

                result_records = []
                for kp in keyphrases:
                    kp = re.sub(r"[\[\]()]", "", kp)
                    if not kp.strip():
                        # An empty/blank pattern would match at every position.
                        continue
                    # Escape the phrase: it is literal text, not a regex, and
                    # characters such as "+", "*" or "?" would otherwise raise
                    # re.error or match the wrong spans.
                    for kp_item in re.finditer(re.escape(kp), txt_content, flags=re.IGNORECASE):
                        start, end = kp_item.span()
                        result_records.append({
                            "span": "Material" + " " + str(start) + " " + str(end),
                            "text": kp_item.group(),
                        })

                # reset_index() adds the running row number as the first column.
                result_df = pd.DataFrame.from_records(result_records).reset_index()
                # Swap only the extension; str.replace("txt", "ann") would also
                # rewrite a "txt" occurring inside the base name.
                ann_name = os.path.splitext(txt_name)[0] + ".ann"
                result_df.to_csv(os.path.join(output_dir, ann_name),
                                 sep="\t", header=False, index=False)
            end_time = time.time()

            latency = end_time - start_time
            with open("latency.log", "a", encoding="utf-8") as log_f:
                log_f.write(f"{str(model_name)},{n},{latency}\n")
if __name__ == "__main__":
    # Script entry point: benchmark all PKE models when the file is executed directly.
    run_pke_models()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。