# -*- coding: utf-8 -*-
import re
import string
from typing import List, Dict, Tuple, Union
from nltk import word_tokenize, pos_tag
# PoS tag set supported by NLTK
pos_tags = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT',
'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
'VBZ', 'WDT', 'WP', 'WP$', 'WRB', ".", "$", "#", ",", "(", ")", ":", "``", "''"]
# Keyphrase PoS patterns currently supported by this algorithm
keyword_patterns = ['JJ+ NN+ NNS', 'NN+ NNS', 'JJ+ NN+', 'JJ+ NNS', 'NN IN NN', 'NNP NN+', 'VBD NN+', 'NN NN+']
def build_tag_mapping(tags: List[str]) -> Dict[str, str]:
    """ Encode each NLTK PoS tag as a single ASCII letter
    :param tags: List[str], the PoS tags supported by NLTK
    :return: Dict[str, str], mapping from NLTK PoS tag to its ASCII encoding
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    tag_to_ascii = dict(zip(tags, alphabet))  # pairs each tag with one letter, in list order
    return tag_to_ascii
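# A quick sketch of what the mapping yields for the pos_tags list above
# (illustrative only; the letter assigned to a tag follows from its position in the list):
#     mapping = build_tag_mapping(pos_tags)
#     mapping["JJ"]  -> "G"
#     mapping["NN"]  -> "L"
#     mapping["NNS"] -> "M"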
def build_pattern_mapping(patterns: List[str], tag_to_ascii: Dict[str, str]) -> Dict[str, str]:
    """ Rewrite the NLTK PoS tags inside each keyphrase pattern in their ASCII-encoded form
    :param patterns: List[str], keyphrase PoS patterns
    :param tag_to_ascii: Dict[str, str], mapping from NLTK PoS tag to its ASCII encoding
    :return: Dict[str, str], mapping from each keyphrase PoS pattern to its ASCII-abbreviated form, e.g. NN NN+ maps to LL+
    """
    pattern_mapping_dict = {}
    for pattern in patterns:
        # replace every run of capital letters (one PoS tag) with its single-letter encoding
        encoded_pattern = re.sub("[A-Z]+", lambda x: tag_to_ascii[x.group()], pattern)
        # drop the spaces so the pattern can match the encoded tag string directly
        encoded_pattern = encoded_pattern.replace(" ", "")
        pattern_mapping_dict[pattern] = encoded_pattern
    return pattern_mapping_dict
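# With the two mappings above, the human-readable patterns compress to one-letter-per-tag
# regexes (illustrative; the letters follow from the tag order in pos_tags):
#     build_pattern_mapping(keyword_patterns, build_tag_mapping(pos_tags))
#     'JJ+ NN+ NNS' -> 'G+L+M'
#     'NN IN NN'    -> 'LFL'
#     'NN NN+'      -> 'LL+'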
def compute_keywords_span(raw_text: str, keywords: List[str]) -> List[Dict]:
    """ Locate each keyword in the raw text and record its character-level span
    :param raw_text: str, the source text
    :param keywords: List[str], keywords in their order of appearance in the text
    :return: List[Dict], one record per located keyword with "start", "end" and "keyword" fields
    """
    keyword_span_records = []
    offset = 0
    for keyword in keywords:
        # re.escape keeps regex metacharacters inside the keyword from corrupting the search
        match = re.search(re.escape(keyword), raw_text[offset:])
        if match is None:  # keyword not found after the current offset, skip it
            continue
        start, end = match.span()
        keyword_span_records.append({
            "start": offset + start,
            "end": offset + end,
            "keyword": keyword
        })
        offset += end  # continue searching after this keyword to keep spans in order
    return keyword_span_records
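# Usage sketch (hypothetical input; the function scans left to right, so the keywords
# are expected in their order of appearance):
#     compute_keywords_span("big cats chase small cats", ["big cats", "small cats"])
#     -> [{"start": 0, "end": 8, "keyword": "big cats"},
#         {"start": 15, "end": 25, "keyword": "small cats"}]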
class Keyword:
    """ Container for an extracted keyphrase """
    def __init__(self, text: str, pattern: str, span: Tuple[int, int]):
        """ Build a Keyword instance
        :param text: str, text content of the keyphrase
        :param pattern: str, the encoded PoS-tag instance the keyphrase matched, e.g. GGLM
        :param span: [start, end), token-level span of the keyphrase in the tokenized source text
        """
        self.text = text
        self.pattern = pattern
        self.span = span
class AutomationKeywordExtractor:
    def __init__(self):
        # forward and reverse mappings between PoS tags / patterns and their ASCII encodings
        self.tag_to_encoding = build_tag_mapping(pos_tags)
        self.encoding_to_tag = {v: k for k, v in self.tag_to_encoding.items()}
        self.pattern_to_encoding = build_pattern_mapping(keyword_patterns, self.tag_to_encoding)
        self.encoding_to_pattern = {v: k for k, v in self.pattern_to_encoding.items()}
    def extract(self, text: str, mode="simple") -> Union[List[Keyword], List[str], List[Dict]]:
        """ Extract keyphrases from the given text fragment
        :param text: str, text fragment
        :param mode: str, output form of the keyphrase sequence: "simple", "records", "ann", "full"
        :return: List, the keyphrase sequence in the requested form
        """
        """ Step 1: tokenize and PoS-tag """
        tagged_tokens = pos_tag(word_tokenize(text))  # PoS tagging
        tags = "".join([self.tag_to_encoding[t] for _, t in tagged_tokens])  # one ASCII letter per token
        tokens = [w for w, _ in tagged_tokens]
        """ Step 2: extract keyphrases with the pattern automaton """
        keywords = []
        for item in re.finditer("|".join(self.pattern_to_encoding.values()), tags):  # encoded keyphrase patterns
            # because every tag encodes to exactly one character, the match span over
            # the tag string doubles as a token-index span over the token list
            start, end = item.span()
            keyword = " ".join(tokens[start: end]).strip(string.punctuation).strip()
            keywords.append(Keyword(text=keyword, pattern=item.group(), span=item.span()))
        """ Step 3: emit the keyphrase sequence in the requested form """
        if mode == "simple":
            return [kw.text for kw in keywords]
        elif mode == "records":
            return [{"text": kw.text, "pattern": kw.pattern, "begin": kw.span[0], "end": kw.span[1]} for kw in keywords]
        elif mode == "ann":
            return compute_keywords_span(text, [kw.text for kw in keywords])
        elif mode == "full":
            return keywords
        else:
            raise ValueError(f"mode {mode} is undefined.")
    def mark(self, text: str, style="latex") -> str:
        """ Highlight the extracted keyphrases inside the original text
        :param text: str, source text
        :param style: str, highlight format: "latex", "markdown" or "html"
        """
        def html(origin: str) -> str: return f'<b style="color:red;">{origin}</b>'
        def markdown(origin: str) -> str: return f'<span style="color:red">**{origin}**</span>'
        def latex(origin: str) -> str: return fr'\textbf{{\textcolor{{red}}{{{origin}}}}}'
        renderers = {"latex": latex, "markdown": markdown, "html": html}
        if style not in renderers:
            raise ValueError(f"style {style} is undefined.")
        """ Step 1: tokenize the source text and extract the keyphrases """
        tokens = word_tokenize(text)  # tokenize the source text
        keywords = self.extract(text, mode="full")  # extract the keyphrases
        """ Step 2: insert the highlight marks """
        marked_tokens = []
        text_i = 0
        for kw in keywords:
            start, end = kw.span
            while text_i < start:  # copy the non-keyphrase tokens unchanged
                marked_tokens.append(tokens[text_i])
                text_i += 1
            marked_tokens.append(renderers[style](kw.text))  # highlight with the chosen style
            text_i = end  # advance the offset past the current keyphrase
        marked_tokens.extend(tokens[text_i:])  # keep any trailing tokens after the last keyphrase
        return " ".join(marked_tokens)  # reassemble the highlighted string
def convert_instance_to_pattern(instance: str, patterns: List[str]) -> Union[str, None]:
    """ Map an extracted keyphrase PoS instance back to the pattern that produced it
    :param instance: str, encoded PoS-tag instance of an extracted keyphrase, e.g. GGGGLLLLM
    :param patterns: List[str], candidate encoded patterns, e.g. ["G+L+M", "L+M", "G+L+", "G+M", "LFL", "NL+", "bL+", "LL+"]
    :return: str, the pattern the instance belongs to, e.g. G+L+M; None if no pattern matches
    """
    for pattern in patterns:
        # re.fullmatch anchors at both ends, so no explicit ^...$ wrapping is needed
        if re.fullmatch(pattern, instance):
            return pattern
    return None
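if __name__ == "__main__":
    # Minimal smoke test, reusing the hypothetical example from the docstring above
    candidates = ["G+L+M", "L+M", "G+L+", "G+M", "LFL", "NL+", "bL+", "LL+"]
    print(convert_instance_to_pattern("GGGGLLLLM", candidates))  # expected: G+L+M
    print(convert_instance_to_pattern("LFL", candidates))        # expected: LFL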