# Source: ake/run_semeval.py (branch: master)

# -*- coding: utf-8 -*-

import os
import pathlib
import re
import shutil
import time

import auto_ke
import pandas as pd

from tqdm import tqdm
from pke.unsupervised import FirstPhrases, TfIdf, KPMiner, YAKE
from pke.unsupervised import TextRank, SingleRank, TopicRank, TopicalPageRank, PositionRank, MultipartiteRank

r""" Evaluate this algorithm and the other PKE algorithms on the SemEval dataset.

Test articles:
https://github.com/ScienceIE/scienceie.github.io/blob/master/resources/semeval_articles_test.zip

Example evaluation commands:

python .\test\eval_semeval.py .\data\semeval_truth\ .\data\semeval_pred\ types

python .\test\eval_semeval.py .\data\phrase_semeval_truth\ .\data\semeval_pred\ types

"""

# All three paths are relative, so they resolve against the current working
# directory of the process, not against this file's location.

# Directory whose .txt files ake_run() reads; it writes the .ann predictions
# next to them.
pred_path = "../data/semeval_pred"
# Directory holding the reference .ann files read by make_phrase_truth() and
# the .txt files read by run_pke_models().
truth_path = "../data/semeval_truth"
# Directory that make_phrase_truth() writes its filtered .ann files into.
phrase_truth_path = "../data/phrase_semeval_truth"


def ake_run():
    """Run the auto_ke extractor on every ``.txt`` file in ``pred_path``.

    For each text file an ``.ann`` file with the same base name is written to
    the same directory. It is tab-separated, has no header, and holds three
    columns per keyword: an integer id (the record's position before
    sorting), the string ``"Material <start> <end>"``, and the keyword text.
    Rows are ordered by ascending ``start``. When the extractor returns no
    records an empty ``.ann`` file is written.
    """
    txt_names = [f for f in os.listdir(pred_path) if f.endswith(".txt")]

    ake = auto_ke.AutomationKeywordExtractor()

    for txt_name in txt_names:
        txt_path = os.path.join(pred_path, txt_name)
        # Swap only the extension: str.replace("txt", "ann") on the full path
        # would also rewrite "txt" appearing in a directory or file name.
        ann_path = os.path.splitext(txt_path)[0] + ".ann"

        with open(txt_path, "r", encoding="utf-8") as txt_f:
            # Read the text verbatim. Joining readlines() with " " inserts an
            # extra character after every newline, which shifts the start/end
            # offsets away from the positions in the file itself.
            txt_content = txt_f.read()

        kw_df = pd.DataFrame.from_records(ake.extract(txt_content, mode="ann"))
        if kw_df.empty:
            # An empty frame has no "start" column, so sort_values() would
            # raise KeyError; emit an empty prediction file instead.
            with open(ann_path, "w", encoding="utf-8"):
                pass
            continue

        # reset_index() keeps the pre-sort row label in an "index" column,
        # which serves as the row id in the output.
        kw_df = kw_df.sort_values(by="start", ascending=True).reset_index()
        kw_df["combined"] = "Material " + kw_df["start"].astype(str) + " " + kw_df["end"].astype(str)
        kw_df[["index", "combined", "keyword"]].to_csv(ann_path, sep="\t", header=False, index=False)


def make_phrase_truth(src_dir=None, dst_dir=None):
    """Write a multi-word-only copy of each ``.ann`` annotation file.

    Every ``.ann`` file in the source directory is read as a headerless,
    tab-separated table. Rows with any missing field are dropped, and only
    rows whose third column contains a space are kept. The result is written
    under the same file name into the destination directory.

    Args:
        src_dir: Directory containing the ``.ann`` files. Defaults to the
            module-level ``truth_path``.
        dst_dir: Directory to write into; created if it does not exist.
            Defaults to the module-level ``phrase_truth_path``.
    """
    if src_dir is None:
        src_dir = truth_path
    if dst_dir is None:
        dst_dir = phrase_truth_path

    # to_csv() does not create missing directories.
    pathlib.Path(dst_dir).mkdir(parents=True, exist_ok=True)

    ann_names = [f for f in os.listdir(src_dir) if f.endswith(".ann")]

    for ann_name in ann_names:
        print(ann_name)
        out_path = os.path.join(dst_dir, ann_name)
        try:
            # dtype=str keeps every field as text: the .str accessor below
            # fails on a column inferred as numeric, and ids would otherwise
            # be re-rendered (e.g. as floats) on output.
            ann_df = pd.read_csv(os.path.join(src_dir, ann_name), sep="\t", header=None, dtype=str)
        except pd.errors.EmptyDataError:
            # A file without any rows yields an empty counterpart rather than
            # aborting the whole run.
            pathlib.Path(out_path).write_text("", encoding="utf-8")
            continue

        ann_df = ann_df.dropna()
        # Plain substring test; no regular expression is needed for a space.
        ann_df = ann_df[ann_df[2].str.contains(" ", regex=False)]
        ann_df.to_csv(out_path, sep="\t", header=False, index=False)


def _find_keyphrase_spans(keyphrases, text):
    """Return one record per case-insensitive occurrence of each keyphrase in *text*.

    Each record is a dict with ``"span"`` (``"Material <start> <end>"``, the
    match's character offsets in *text*) and ``"text"`` (the matched
    substring as it appears in *text*).
    """
    records = []
    for kp in keyphrases:
        kp = re.sub(r"[\[\]()]", "", kp)
        if not kp.strip():
            # An empty pattern matches at every position of the text.
            continue
        # Escape the phrase so characters such as + * ? . are matched
        # literally instead of being interpreted as regex syntax.
        for match in re.finditer(re.escape(kp), text, flags=re.IGNORECASE):
            start, end = match.span()
            records.append({
                "span": "Material" + " " + str(start) + " " + str(end),
                "text": match.group()
            })
    return records


def run_pke_models():
    """Run every supported PKE keyphrase extraction algorithm on the corpus.

    For each model and each ``n`` in 5, 10, 15, every ``.txt`` file in
    ``truth_path`` is processed and the occurrences of its top-``n``
    keyphrases are written to ``../data/<model>-<n>-pred/<name>.ann`` as
    tab-separated rows (row number, ``"Material <start> <end>"``, matched
    text) without header. After each (model, n) pass a line
    ``<model>,<n>,<elapsed seconds>`` is appended to ``latency.log`` in the
    current working directory.
    """
    models = [
        ("FirstPhrases", FirstPhrases),
        ("TfIdf", TfIdf),
        ("KPMiner", KPMiner),
        ("YAKE", YAKE),
        ("TextRank", TextRank),
        ("SingleRank", SingleRank),
        ("TopicRank", TopicRank),
        ("TopicalPageRank", TopicalPageRank),
        ("PositionRank", PositionRank),
        ("MultipartiteRank", MultipartiteRank),
    ]

    # The corpus listing does not depend on the model or on n.
    txt_names = [f for f in os.listdir(truth_path) if f.endswith(".txt")]

    for model_name, model in models:  # eg TF-IDF, TextRank
        extractor = model()
        print(str(model_name))
        for n in [5, 10, 15]:

            print(f"==Top{n}==")

            output_dir = f"../data/{model_name}-{n}-pred/"
            pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

            # Time one full pass over the corpus; perf_counter() is monotonic,
            # so the interval is unaffected by system clock adjustments.
            start_time = time.perf_counter()
            for txt_name in tqdm(txt_names):

                with open(os.path.join(truth_path, txt_name), "r", encoding="utf-8") as txt_f:
                    # Read verbatim: joining readlines() with " " adds a
                    # character after every newline and shifts match offsets.
                    txt_content = txt_f.read()

                # Load the raw text, then select and weight candidates with
                # the model's default settings.
                extractor.load_document(input=txt_content)
                extractor.candidate_selection()
                extractor.candidate_weighting()

                # get_n_best yields (keyphrase, score) pairs; only the
                # keyphrase strings are needed.
                keyphrases = [w for (w, score) in extractor.get_n_best(n=n)]
                print(keyphrases)

                result_records = _find_keyphrase_spans(keyphrases, txt_content)
                # reset_index() adds the running row number as first column.
                result_df = pd.DataFrame.from_records(result_records).reset_index()

                # Swap only the extension; replace("txt", "ann") would also
                # rewrite "txt" occurring elsewhere in the file name.
                ann_name = os.path.splitext(txt_name)[0] + ".ann"
                result_df.to_csv(os.path.join(output_dir, ann_name),
                                 sep="\t", header=False, index=False)

            latency = time.perf_counter() - start_time
            with open("latency.log", "a", encoding="utf-8") as log_f:
                log_f.write(f"{str(model_name)},{n},{latency}\n")


# Script entry point: only the PKE baseline run is started from here;
# ake_run() and make_phrase_truth() are not invoked by this guard.
if __name__ == '__main__':
    run_pke_models()