3 Star 5 Fork 2

新媒体网络营销/针对cosyvoice开发的大文本转语音处理工具_听书狂人处理机

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
text_fc.py 17.19 KB
一键复制 编辑 原始数据 按行查看 历史
import re
import sqlite3
from database_utils import get_valid_speakers
# Filter out a leading table-of-contents block.
def remove_table_of_contents(text):
    """Strip a table-of-contents block from the head of *text*, if any.

    Only the first 4% of the text is scanned. A TOC is recognized either by
    an explicit "目录" heading or by TOC-shaped lines (chapter/section
    numbering, or a strictly incrementing sequence of leading numbers).
    Returns the text with the TOC removed, or the original text unchanged.
    """
    toc_threshold = int(len(text) * 0.04)  # search window: first 4% of text
    potential_toc_part = text[:toc_threshold]
    # Explicit TOC heading ("目录"), possibly with whitespace inside.
    toc_marker_pattern = re.compile(r'目录|目\s*录', re.IGNORECASE)
    # Lines carrying chapter/section numbering (Arabic or Chinese numerals).
    toc_entry_pattern = re.compile(r'^(.*?)(\d+|第[一二三四五六七八九十百千]+)\s*(章|节|录)', re.MULTILINE)
    toc_marker_match = toc_marker_pattern.search(potential_toc_part)
    # Without an explicit marker, look for TOC-like lines from the very top.
    start_position = toc_marker_match.start() if toc_marker_match else 0
    leading_number_pattern = re.compile(r'^\d+')  # hoisted: matched once per line
    toc_lines = []
    temp_line = ""
    previous_number = None
    for line in potential_toc_part[start_position:].splitlines():
        stripped_line = line.strip()
        if len(stripped_line) < 50 and toc_entry_pattern.search(stripped_line):
            # Accumulate entry fragments; only commit once the buffer is long
            # enough (or entries were already seen), so a lone matching line
            # in running text is not mistaken for a TOC.
            temp_line += " " + stripped_line
            if len(temp_line) > 50 or len(toc_lines) > 0:
                toc_lines.append(temp_line.strip())
                temp_line = ""
        else:
            number_match = leading_number_pattern.match(stripped_line)
            if number_match:
                current_number = int(number_match.group())
                if previous_number is None or current_number == previous_number + 1:
                    toc_lines.append(stripped_line)
                    previous_number = current_number
                else:
                    break  # numbering stopped incrementing: TOC is over
            elif toc_lines:
                break  # already inside the TOC and hit a non-TOC line: stop
    if toc_lines:
        toc_end_index = text.find(toc_lines[-1])
        # Bug fix: joined multi-line entries (temp_line fragments glued with
        # spaces) may not occur verbatim in the original text, making find()
        # return -1 and the old code slice at a bogus position. Bail out
        # instead of corrupting the text.
        if toc_end_index == -1:
            return text
        return text[toc_end_index + len(toc_lines[-1]):]
    return text
# Text preprocessing before slicing.
def preprocess_text(text):
    """Normalize punctuation and whitespace so the text reads well in TTS.

    Percentages like "12.5%" become "百分之12.5"; other decimal points are
    preserved while plain ASCII punctuation is converted to its Chinese
    counterpart; whitespace runs are collapsed to single spaces.
    """
    # Percentages first: 12.5% -> 百分之12.5 (decimal point kept).
    text = re.sub(r'(\d+\.\d+)%', lambda m: f'百分之{m.group(1)}', text)
    # Shield remaining decimal points from the full-stop conversion below.
    text = re.sub(r'(\d+\.\d+)(?!%)', lambda m: m.group(0).replace('.', 'DOT'), text)
    # Ordered ASCII -> Chinese punctuation swaps (order matters: keep it).
    punctuation_swaps = [
        (r',', ','),
        (r'\.', '。'),
        (r';', ';'),
        (r':', ':'),
        (r'\!', '!'),
        (r'\?', '?'),
        (r'\+', '加'),
        (r'=', '等于'),
        (r'#', ''),
        (r'\*\s*(.*?)\s*\*', r' \1 '),
    ]
    for pattern, replacement in punctuation_swaps:
        text = re.sub(pattern, replacement, text)
    # Restore the shielded decimal points.
    text = re.sub(r'DOT', '.', text)
    # Collapse blank lines, glue short trailing words onto the next line,
    # then squash every remaining whitespace run to a single space.
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'(\w{1,3}\s*){1,3}\n', lambda m: m.group(0).replace('\n', ' '), text)
    text = re.sub(r'\s+', ' ', text)
    return text
# Reinforced chapter/section capture and slicing logic.
def split_text(text: str) -> list:
    """Split a book's raw text into TTS-ready sentence records.

    Each record is a 5-tuple ``(sentence, is_chapter, chapter, section,
    speaker)``: ``is_chapter`` is 1 for chapter/section heading rows and 0
    for body text; ``chapter`` is a synthesized label such as "第1章";
    ``section`` is an Arabic section number (0 = none); ``speaker`` is the
    narrator currently in effect (or None).
    """
    # Strip any leading table of contents, then normalize punctuation.
    text = remove_table_of_contents(text)
    text = preprocess_text(text)
    chapter_pattern = re.compile(r'(第[一二三四五六七八九十百千]+章)')
    section_patterns = [
        re.compile(r'(第[一二三四五六七八九十百千]+节)'),
        re.compile(r'(第[一二三四五六七八九十百千]+段)'),
        re.compile(r'(第[一二三四五六七八九十百千]+篇)')
    ]
    # Whitelist of narrator names (imported from database_utils).
    valid_speakers = get_valid_speakers()
    # Narrator markup: "@name@" opens a narrator span, "@@" closes it.
    speaker_pattern = re.compile(r'@(\S+?)@')
    end_speaker_pattern = re.compile(r'@@')
    current_speaker = None  # narrator currently in effect
    # Pre-split into sentence-ish chunks so headings fused with running
    # text can still be recognized before slicing.
    sentences = re.split(r'(?<=[。!?])|(?<=[,,])\s*|\n+', text)
    new_sentences = []
    temp_sentence = ""
    current_chapter = ""
    current_section = 0
    chapter_count = 1
    section_count = 1
    chapter_titles = set()
    section_titles = {}
    found_first_section = False
    for sentence in sentences:
        sentence = sentence.strip()
        if len(sentence) == 0 or sentence in [",", ",", ".", "。"]:
            continue  # skip empty fragments and lone punctuation
        # Narrator markers.
        speaker_match = speaker_pattern.search(sentence)
        end_speaker_match = end_speaker_pattern.search(sentence)
        if speaker_match:
            speaker_name = speaker_match.group(1)
            if speaker_name in valid_speakers:  # must exist in speaker table
                if current_speaker and temp_sentence:
                    # No explicit close before the next narrator tag:
                    # flush the buffered sentence under the old narrator.
                    new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                    temp_sentence = ""
                current_speaker = speaker_name  # adopt the valid narrator
                sentence = speaker_pattern.sub('', sentence)  # drop the tag
            else:
                current_speaker = None  # unknown narrator: ignore it
                continue  # drop the invalid tag and its content entirely
        elif end_speaker_match:
            if current_speaker and temp_sentence:
                # Close tag found: flush buffered text under this narrator.
                new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                temp_sentence = ""
            current_speaker = None  # reset the narrator
            sentence = end_speaker_pattern.sub('', sentence)  # drop the tag
            continue  # the close-tag chunk itself is never stored
        # Heading detection, even when fused with body text.
        chapter_match = chapter_pattern.search(sentence)
        section_match = None
        for pattern in section_patterns:
            if pattern.search(sentence):
                section_match = pattern.search(sentence)
                break
        # Chapter heading handling.
        if chapter_match:
            chapter_title = chapter_match.group(0)
            if chapter_title not in chapter_titles:
                if temp_sentence:
                    new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                    temp_sentence = ""
                current_chapter = f"第{chapter_count}章"
                # NOTE(review): label below seems to lack a closing "】"
                # (compare the section label) — confirm against the
                # original repository before changing it.
                chapter_label = f"【{current_chapter}{sentence}"
                new_sentences.append((chapter_label.strip(), 1, current_chapter, 0, current_speaker))
                chapter_titles.add(chapter_title)
                chapter_count += 1
                section_count = 1
                current_section = 0
                section_titles[current_chapter] = set()
                found_first_section = False
        # Section heading handling (only inside a known chapter).
        elif section_match and current_chapter:
            section_title = section_match.group(0)
            if section_title not in section_titles[current_chapter]:
                section_num_match = re.search(r'第([一二三四五六七八九十百千]+)节', section_title)
                if section_num_match:
                    section_num = section_num_match.group(1)
                    section_num_arabic = convert_chinese_numerals_to_arabic(section_num)
                    if temp_sentence:
                        new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                        temp_sentence = ""
                    section_label = f"【{current_chapter}{section_num_arabic}节】{sentence}"
                    new_sentences.append((section_label.strip(), 1, current_chapter, section_num_arabic, current_speaker))
                    section_count = section_num_arabic + 1
                    current_section = section_count
                    section_titles[current_chapter].add(section_title)
                    found_first_section = True
        # Ordinary body text: accumulate while under the 45-char cap.
        elif len(temp_sentence) + len(sentence) <= 45:
            temp_sentence += sentence
        else:
            if 35 <= len(temp_sentence) <= 45:
                # Buffer is already a well-sized chunk: emit it as-is.
                new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                temp_sentence = sentence
            else:
                # Otherwise re-pack the buffer on Chinese-comma boundaries
                # into <=35-char chunks.
                sub_sentences = re.split(r'(?<=,)', temp_sentence)
                combined_sentence = ""
                for sub_sentence in sub_sentences:
                    if len(combined_sentence) + len(sub_sentence) <= 35:
                        combined_sentence += sub_sentence
                    else:
                        new_sentences.append((combined_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                        combined_sentence = sub_sentence
                if combined_sentence:
                    new_sentences.append((combined_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                temp_sentence = sentence
    # Flush whatever remains in the buffer.
    if temp_sentence and len(temp_sentence.strip()) > 0:
        new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
    new_sentences = [(s.strip(), is_chapter, chapter, section, speaker) for s, is_chapter, chapter, section, speaker in new_sentences if len(s.strip()) > 0]
    # --- Section-number repair pass ---------------------------------
    adjusted_sentences = []  # NOTE(review): never used below — dead variable
    chapter_sections = {}  # chapter label -> section numbers seen, in order
    for i, (sentence, is_chapter, chapter, section, speaker) in enumerate(new_sentences):
        if is_chapter == 1:
            current_chapter = chapter
            current_section = 0
            chapter_sections[current_chapter] = []  # always (re)initialize
        elif chapter == current_chapter:
            # Guard: chapter may appear without a heading row first.
            if current_chapter not in chapter_sections:
                chapter_sections[current_chapter] = []
            chapter_sections[current_chapter].append(section)
    for chapter, sections in chapter_sections.items():
        # Sections missing from the contiguous 1..max run of each chapter.
        if sections:
            missing_sections = set(range(1, max(sections) + 1)) - set(sections)
        else:
            missing_sections = set()  # no sections seen -> nothing missing
        if missing_sections:
            for i, (sentence, is_chapter, chap, sec, speaker) in enumerate(new_sentences):
                if chap == chapter and is_chapter == 0:
                    if sec in missing_sections:
                        missing_sections.remove(sec)
                    else:
                        if not found_first_section:
                            # No real section heading ever seen: park body
                            # text under section 1.
                            new_sentences[i] = (sentence, is_chapter, chap, 1, speaker)
                        else:
                            new_sentences[i] = (sentence, is_chapter, chap, sec, speaker)
    # Insert empty placeholder rows so section numbering has no gaps.
    # NOTE(review): inserts into new_sentences while enumerating it —
    # appears intentional but verify the shifted indices are acceptable.
    current_section = 0
    for i, (sentence, is_chapter, chapter, section, speaker) in enumerate(new_sentences):
        if section > current_section + 1 and is_chapter == 0:
            for j in range(current_section + 1, section):
                new_sentences.insert(i, ("", 0, chapter, j, speaker))
        current_section = section
    # Final monotonicity check: force body sections to advance by one.
    corrected_sentences = []
    for i, (sentence, is_chapter, chapter, section, speaker) in enumerate(new_sentences):
        if is_chapter == 1:
            corrected_sentences.append((sentence, is_chapter, chapter, section, speaker))
        else:
            if corrected_sentences and corrected_sentences[-1][2] == chapter and corrected_sentences[-1][3] == section - 1:
                corrected_sentences.append((sentence, is_chapter, chapter, section, speaker))
            else:
                if corrected_sentences and corrected_sentences[-1][3] + 1 != section:
                    section = corrected_sentences[-1][3] + 1
                corrected_sentences.append((sentence, is_chapter, chapter, section, speaker))
    # Length-normalize body rows via split_sentences(); headings pass through.
    final_sentences = []
    for sentence, is_chapter, chapter, section, speaker in corrected_sentences:
        if is_chapter == 0:
            split_sents = split_sentences(sentence)
            final_sentences.extend([(s, is_chapter, chapter, section, speaker) for s in split_sents])
        else:
            final_sentences.append((sentence, is_chapter, chapter, section, speaker))
    return final_sentences
# Split over-long sentences into TTS-sized pieces.
def split_sentences(text):
    """Break *text* into chunks of at most ~50 characters.

    Sentences are first split on terminal punctuation; over-long pieces are
    re-split on weaker delimiters (commas, quotes, whitespace, ...) or hard
    cut at 45 characters. Finally, very short fragments (< 15 chars) are
    merged forward into the following chunk.
    """
    hard_breaks = re.compile(r'(?<=[。!?……])')
    soft_breaks = re.compile(r'(?<=[,、;:\/“”『』"\'()–—~\s])')

    def chop(segment, breakers):
        # Peel off <=50-char pieces, preferring a delimiter landing in the
        # 45-50 window; otherwise cut hard at 45 characters.
        pieces = []
        while len(segment) > 50:
            hit = breakers.search(segment[:50])
            cut = hit.end() if hit else 0
            if hit and 45 <= cut <= 50 and cut == len(segment[:cut].rstrip()):
                pieces.append(segment[:cut].strip())
                segment = segment[cut:].strip()
            else:
                pieces.append(segment[:45].strip())
                segment = segment[45:].strip()
        if segment:
            pieces.append(segment.strip())
        return pieces

    # Pass 1: split on sentence-ending punctuation, then shorten long runs.
    flat = []
    for chunk in hard_breaks.split(text):
        chunk = chunk.strip()
        if len(chunk) > 50:
            for piece in chop(chunk, soft_breaks):
                if len(piece.strip()) > 50:
                    flat.extend(piece[k:k + 45].strip() for k in range(0, len(piece), 45))
                else:
                    flat.append(piece.strip())
        else:
            flat.append(chunk)

    # Pass 2: merge fragments shorter than 15 chars into the next chunk.
    merged = []
    carry = ""
    for piece in flat:
        if len(piece) < 15:
            carry += piece  # too short to stand alone: buffer it
            continue
        if carry:
            piece = carry + piece
            carry = ""
        if 45 < len(piece) <= 50:
            hit = soft_breaks.search(piece[:50])
            if hit and hit.end() <= 50:
                merged.append(piece[:hit.end()].strip())
                carry = piece[hit.end():].strip()
            else:
                merged.append(piece[:45].strip())
                carry = piece[45:].strip()
        elif len(piece) > 50:
            merged.extend(piece[k:k + 45].strip() for k in range(0, len(piece), 45))
        else:
            merged.append(piece.strip())
    if carry:
        # Attach the leftover to the last chunk if it still fits.
        if merged and len(merged[-1]) + len(carry) <= 50:
            merged[-1] += carry
        else:
            merged.append(carry.strip())
    return [p.strip() for p in merged if p.strip()]
# Convert Chinese numerals to Arabic numbers.
def convert_chinese_numerals_to_arabic(chinese_numeral):
    """Convert a Chinese numeral string (digits 一..九, units 十/百/千) to int.

    Returns 0 for an empty string or any unrecognized character, matching
    the original error convention.

    Bug fix: the previous reversed-scan algorithm dropped a unit whose
    digit had already been consumed, so "十" -> 1 (should be 10) and
    "十一" -> 1 (should be 11). This left-to-right version is correct:
    十 -> 10, 十一 -> 11, 二十一 -> 21, 一百二十三 -> 123.
    """
    digit_values = {
        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
        '六': 6, '七': 7, '八': 8, '九': 9,
    }
    unit_values = {'十': 10, '百': 100, '千': 1000}
    total = 0
    pending = 0  # digit waiting for its unit (e.g. the 二 in 二十)
    for char in chinese_numeral:
        if char in digit_values:
            pending = digit_values[char]
        elif char in unit_values:
            # A bare unit implies a leading 一 (十一 == 一十一 == 11).
            total += (pending or 1) * unit_values[char]
            pending = 0
        else:
            return 0  # unrecognized character: signal error with 0
    # A trailing digit is the ones place.
    return total + pending
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/xinmeitiyingxiao/lingting.git
git@gitee.com:xinmeitiyingxiao/lingting.git
xinmeitiyingxiao
lingting
针对cosyvoice开发的大文本转语音处理工具_听书狂人处理机
master

搜索帮助