3 Star 5 Fork 2

新媒体网络营销/针对cosyvoice开发的大文本转语音处理工具_听书狂人处理机

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
text_fc.py 17.19 KB
一键复制 编辑 原始数据 按行查看 历史
import re
import sqlite3
from database_utils import get_valid_speakers
# Filter out a leading table-of-contents block.
def remove_table_of_contents(text):
    """Strip a table-of-contents block from the head of *text*, if any.

    Only the first 4% of the text is scanned. A TOC is recognized either by
    an explicit "目录" heading or by TOC-shaped lines (chapter/section
    numbering, or a strictly incrementing sequence of leading numbers).
    Returns the text with the TOC removed, or the original text unchanged.
    """
    toc_threshold = int(len(text) * 0.04)  # search window: first 4% of text
    potential_toc_part = text[:toc_threshold]
    # Explicit TOC heading ("目录"), possibly with whitespace inside.
    toc_marker_pattern = re.compile(r'目录|目\s*录', re.IGNORECASE)
    # Lines carrying chapter/section numbering (Arabic or Chinese numerals).
    toc_entry_pattern = re.compile(r'^(.*?)(\d+|第[一二三四五六七八九十百千]+)\s*(章|节|录)', re.MULTILINE)
    toc_marker_match = toc_marker_pattern.search(potential_toc_part)
    # Without an explicit marker, look for TOC-like lines from the very top.
    start_position = toc_marker_match.start() if toc_marker_match else 0
    leading_number_pattern = re.compile(r'^\d+')  # hoisted: matched once per line
    toc_lines = []
    temp_line = ""
    previous_number = None
    for line in potential_toc_part[start_position:].splitlines():
        stripped_line = line.strip()
        if len(stripped_line) < 50 and toc_entry_pattern.search(stripped_line):
            # Accumulate entry fragments; only commit once the buffer is long
            # enough (or entries were already seen), so a lone matching line
            # in running text is not mistaken for a TOC.
            temp_line += " " + stripped_line
            if len(temp_line) > 50 or len(toc_lines) > 0:
                toc_lines.append(temp_line.strip())
                temp_line = ""
        else:
            number_match = leading_number_pattern.match(stripped_line)
            if number_match:
                current_number = int(number_match.group())
                if previous_number is None or current_number == previous_number + 1:
                    toc_lines.append(stripped_line)
                    previous_number = current_number
                else:
                    break  # numbering stopped incrementing: TOC is over
            elif toc_lines:
                break  # already inside the TOC and hit a non-TOC line: stop
    if toc_lines:
        toc_end_index = text.find(toc_lines[-1])
        # Bug fix: joined multi-line entries (temp_line fragments glued with
        # spaces) may not occur verbatim in the original text, making find()
        # return -1 and the old code slice at a bogus position. Bail out
        # instead of corrupting the text.
        if toc_end_index == -1:
            return text
        return text[toc_end_index + len(toc_lines[-1]):]
    return text
# Text preprocessing before slicing.
def preprocess_text(text):
    """Normalize punctuation and whitespace so the text reads well in TTS.

    Percentages like "12.5%" become "百分之12.5"; other decimal points are
    preserved while plain ASCII punctuation is converted to its Chinese
    counterpart; whitespace runs are collapsed to single spaces.
    """
    # Percentages first: 12.5% -> 百分之12.5 (decimal point kept).
    text = re.sub(r'(\d+\.\d+)%', lambda m: f'百分之{m.group(1)}', text)
    # Shield remaining decimal points from the full-stop conversion below.
    text = re.sub(r'(\d+\.\d+)(?!%)', lambda m: m.group(0).replace('.', 'DOT'), text)
    # Ordered ASCII -> Chinese punctuation swaps (order matters: keep it).
    punctuation_swaps = [
        (r',', ','),
        (r'\.', '。'),
        (r';', ';'),
        (r':', ':'),
        (r'\!', '!'),
        (r'\?', '?'),
        (r'\+', '加'),
        (r'=', '等于'),
        (r'#', ''),
        (r'\*\s*(.*?)\s*\*', r' \1 '),
    ]
    for pattern, replacement in punctuation_swaps:
        text = re.sub(pattern, replacement, text)
    # Restore the shielded decimal points.
    text = re.sub(r'DOT', '.', text)
    # Collapse blank lines, glue short trailing words onto the next line,
    # then squash every remaining whitespace run to a single space.
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'(\w{1,3}\s*){1,3}\n', lambda m: m.group(0).replace('\n', ' '), text)
    text = re.sub(r'\s+', ' ', text)
    return text
# Reinforced chapter/section capture and slicing logic.
def split_text(text: str) -> list:
    """Split a book's raw text into TTS-ready sentence records.

    Each record is a 5-tuple ``(sentence, is_chapter, chapter, section,
    speaker)``: ``is_chapter`` is 1 for chapter/section heading rows and 0
    for body text; ``chapter`` is a synthesized label such as "第1章";
    ``section`` is an Arabic section number (0 = none); ``speaker`` is the
    narrator currently in effect (or None).
    """
    # Strip any leading table of contents, then normalize punctuation.
    text = remove_table_of_contents(text)
    text = preprocess_text(text)
    chapter_pattern = re.compile(r'(第[一二三四五六七八九十百千]+章)')
    section_patterns = [
        re.compile(r'(第[一二三四五六七八九十百千]+节)'),
        re.compile(r'(第[一二三四五六七八九十百千]+段)'),
        re.compile(r'(第[一二三四五六七八九十百千]+篇)')
    ]
    # Whitelist of narrator names (imported from database_utils).
    valid_speakers = get_valid_speakers()
    # Narrator markup: "@name@" opens a narrator span, "@@" closes it.
    speaker_pattern = re.compile(r'@(\S+?)@')
    end_speaker_pattern = re.compile(r'@@')
    current_speaker = None  # narrator currently in effect
    # Pre-split into sentence-ish chunks so headings fused with running
    # text can still be recognized before slicing.
    sentences = re.split(r'(?<=[。!?])|(?<=[,,])\s*|\n+', text)
    new_sentences = []
    temp_sentence = ""
    current_chapter = ""
    current_section = 0
    chapter_count = 1
    section_count = 1
    chapter_titles = set()
    section_titles = {}
    found_first_section = False
    for sentence in sentences:
        sentence = sentence.strip()
        if len(sentence) == 0 or sentence in [",", ",", ".", "。"]:
            continue  # skip empty fragments and lone punctuation
        # Narrator markers.
        speaker_match = speaker_pattern.search(sentence)
        end_speaker_match = end_speaker_pattern.search(sentence)
        if speaker_match:
            speaker_name = speaker_match.group(1)
            if speaker_name in valid_speakers:  # must exist in speaker table
                if current_speaker and temp_sentence:
                    # No explicit close before the next narrator tag:
                    # flush the buffered sentence under the old narrator.
                    new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                    temp_sentence = ""
                current_speaker = speaker_name  # adopt the valid narrator
                sentence = speaker_pattern.sub('', sentence)  # drop the tag
            else:
                current_speaker = None  # unknown narrator: ignore it
                continue  # drop the invalid tag and its content entirely
        elif end_speaker_match:
            if current_speaker and temp_sentence:
                # Close tag found: flush buffered text under this narrator.
                new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                temp_sentence = ""
            current_speaker = None  # reset the narrator
            sentence = end_speaker_pattern.sub('', sentence)  # drop the tag
            continue  # the close-tag chunk itself is never stored
        # Heading detection, even when fused with body text.
        chapter_match = chapter_pattern.search(sentence)
        section_match = None
        for pattern in section_patterns:
            if pattern.search(sentence):
                section_match = pattern.search(sentence)
                break
        # Chapter heading handling.
        if chapter_match:
            chapter_title = chapter_match.group(0)
            if chapter_title not in chapter_titles:
                if temp_sentence:
                    new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                    temp_sentence = ""
                current_chapter = f"第{chapter_count}章"
                # NOTE(review): label below seems to lack a closing "】"
                # (compare the section label) — confirm against the
                # original repository before changing it.
                chapter_label = f"【{current_chapter}{sentence}"
                new_sentences.append((chapter_label.strip(), 1, current_chapter, 0, current_speaker))
                chapter_titles.add(chapter_title)
                chapter_count += 1
                section_count = 1
                current_section = 0
                section_titles[current_chapter] = set()
                found_first_section = False
        # Section heading handling (only inside a known chapter).
        elif section_match and current_chapter:
            section_title = section_match.group(0)
            if section_title not in section_titles[current_chapter]:
                section_num_match = re.search(r'第([一二三四五六七八九十百千]+)节', section_title)
                if section_num_match:
                    section_num = section_num_match.group(1)
                    section_num_arabic = convert_chinese_numerals_to_arabic(section_num)
                    if temp_sentence:
                        new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                        temp_sentence = ""
                    section_label = f"【{current_chapter}{section_num_arabic}节】{sentence}"
                    new_sentences.append((section_label.strip(), 1, current_chapter, section_num_arabic, current_speaker))
                    section_count = section_num_arabic + 1
                    current_section = section_count
                    section_titles[current_chapter].add(section_title)
                    found_first_section = True
        # Ordinary body text: accumulate while under the 45-char cap.
        elif len(temp_sentence) + len(sentence) <= 45:
            temp_sentence += sentence
        else:
            if 35 <= len(temp_sentence) <= 45:
                # Buffer is already a well-sized chunk: emit it as-is.
                new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                temp_sentence = sentence
            else:
                # Otherwise re-pack the buffer on Chinese-comma boundaries
                # into <=35-char chunks.
                sub_sentences = re.split(r'(?<=,)', temp_sentence)
                combined_sentence = ""
                for sub_sentence in sub_sentences:
                    if len(combined_sentence) + len(sub_sentence) <= 35:
                        combined_sentence += sub_sentence
                    else:
                        new_sentences.append((combined_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                        combined_sentence = sub_sentence
                if combined_sentence:
                    new_sentences.append((combined_sentence.strip(), 0, current_chapter, current_section, current_speaker))
                temp_sentence = sentence
    # Flush whatever remains in the buffer.
    if temp_sentence and len(temp_sentence.strip()) > 0:
        new_sentences.append((temp_sentence.strip(), 0, current_chapter, current_section, current_speaker))
    new_sentences = [(s.strip(), is_chapter, chapter, section, speaker) for s, is_chapter, chapter, section, speaker in new_sentences if len(s.strip()) > 0]
    # --- Section-number repair pass ---------------------------------
    adjusted_sentences = []  # NOTE(review): never used below — dead variable
    chapter_sections = {}  # chapter label -> section numbers seen, in order
    for i, (sentence, is_chapter, chapter, section, speaker) in enumerate(new_sentences):
        if is_chapter == 1:
            current_chapter = chapter
            current_section = 0
            chapter_sections[current_chapter] = []  # always (re)initialize
        elif chapter == current_chapter:
            # Guard: chapter may appear without a heading row first.
            if current_chapter not in chapter_sections:
                chapter_sections[current_chapter] = []
            chapter_sections[current_chapter].append(section)
    for chapter, sections in chapter_sections.items():
        # Sections missing from the contiguous 1..max run of each chapter.
        if sections:
            missing_sections = set(range(1, max(sections) + 1)) - set(sections)
        else:
            missing_sections = set()  # no sections seen -> nothing missing
        if missing_sections:
            for i, (sentence, is_chapter, chap, sec, speaker) in enumerate(new_sentences):
                if chap == chapter and is_chapter == 0:
                    if sec in missing_sections:
                        missing_sections.remove(sec)
                    else:
                        if not found_first_section:
                            # No real section heading ever seen: park body
                            # text under section 1.
                            new_sentences[i] = (sentence, is_chapter, chap, 1, speaker)
                        else:
                            new_sentences[i] = (sentence, is_chapter, chap, sec, speaker)
    # Insert empty placeholder rows so section numbering has no gaps.
    # NOTE(review): inserts into new_sentences while enumerating it —
    # appears intentional but verify the shifted indices are acceptable.
    current_section = 0
    for i, (sentence, is_chapter, chapter, section, speaker) in enumerate(new_sentences):
        if section > current_section + 1 and is_chapter == 0:
            for j in range(current_section + 1, section):
                new_sentences.insert(i, ("", 0, chapter, j, speaker))
        current_section = section
    # Final monotonicity check: force body sections to advance by one.
    corrected_sentences = []
    for i, (sentence, is_chapter, chapter, section, speaker) in enumerate(new_sentences):
        if is_chapter == 1:
            corrected_sentences.append((sentence, is_chapter, chapter, section, speaker))
        else:
            if corrected_sentences and corrected_sentences[-1][2] == chapter and corrected_sentences[-1][3] == section - 1:
                corrected_sentences.append((sentence, is_chapter, chapter, section, speaker))
            else:
                if corrected_sentences and corrected_sentences[-1][3] + 1 != section:
                    section = corrected_sentences[-1][3] + 1
                corrected_sentences.append((sentence, is_chapter, chapter, section, speaker))
    # Length-normalize body rows via split_sentences(); headings pass through.
    final_sentences = []
    for sentence, is_chapter, chapter, section, speaker in corrected_sentences:
        if is_chapter == 0:
            split_sents = split_sentences(sentence)
            final_sentences.extend([(s, is_chapter, chapter, section, speaker) for s in split_sents])
        else:
            final_sentences.append((sentence, is_chapter, chapter, section, speaker))
    return final_sentences
# Split over-long sentences into TTS-sized pieces.
def split_sentences(text):
    """Break *text* into chunks of at most ~50 characters.

    Sentences are first split on terminal punctuation; over-long pieces are
    re-split on weaker delimiters (commas, quotes, whitespace, ...) or hard
    cut at 45 characters. Finally, very short fragments (< 15 chars) are
    merged forward into the following chunk.
    """
    hard_breaks = re.compile(r'(?<=[。!?……])')
    soft_breaks = re.compile(r'(?<=[,、;:\/“”『』"\'()–—~\s])')

    def chop(segment, breakers):
        # Peel off <=50-char pieces, preferring a delimiter landing in the
        # 45-50 window; otherwise cut hard at 45 characters.
        pieces = []
        while len(segment) > 50:
            hit = breakers.search(segment[:50])
            cut = hit.end() if hit else 0
            if hit and 45 <= cut <= 50 and cut == len(segment[:cut].rstrip()):
                pieces.append(segment[:cut].strip())
                segment = segment[cut:].strip()
            else:
                pieces.append(segment[:45].strip())
                segment = segment[45:].strip()
        if segment:
            pieces.append(segment.strip())
        return pieces

    # Pass 1: split on sentence-ending punctuation, then shorten long runs.
    flat = []
    for chunk in hard_breaks.split(text):
        chunk = chunk.strip()
        if len(chunk) > 50:
            for piece in chop(chunk, soft_breaks):
                if len(piece.strip()) > 50:
                    flat.extend(piece[k:k + 45].strip() for k in range(0, len(piece), 45))
                else:
                    flat.append(piece.strip())
        else:
            flat.append(chunk)

    # Pass 2: merge fragments shorter than 15 chars into the next chunk.
    merged = []
    carry = ""
    for piece in flat:
        if len(piece) < 15:
            carry += piece  # too short to stand alone: buffer it
            continue
        if carry:
            piece = carry + piece
            carry = ""
        if 45 < len(piece) <= 50:
            hit = soft_breaks.search(piece[:50])
            if hit and hit.end() <= 50:
                merged.append(piece[:hit.end()].strip())
                carry = piece[hit.end():].strip()
            else:
                merged.append(piece[:45].strip())
                carry = piece[45:].strip()
        elif len(piece) > 50:
            merged.extend(piece[k:k + 45].strip() for k in range(0, len(piece), 45))
        else:
            merged.append(piece.strip())
    if carry:
        # Attach the leftover to the last chunk if it still fits.
        if merged and len(merged[-1]) + len(carry) <= 50:
            merged[-1] += carry
        else:
            merged.append(carry.strip())
    return [p.strip() for p in merged if p.strip()]
# Convert Chinese numerals to Arabic numbers.
def convert_chinese_numerals_to_arabic(chinese_numeral):
    """Convert a Chinese numeral string (digits 一..九, units 十/百/千) to int.

    Returns 0 for an empty string or any unrecognized character, matching
    the original error convention.

    Bug fix: the previous reversed-scan algorithm dropped a unit whose
    digit had already been consumed, so "十" -> 1 (should be 10) and
    "十一" -> 1 (should be 11). This left-to-right version is correct:
    十 -> 10, 十一 -> 11, 二十一 -> 21, 一百二十三 -> 123.
    """
    digit_values = {
        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
        '六': 6, '七': 7, '八': 8, '九': 9,
    }
    unit_values = {'十': 10, '百': 100, '千': 1000}
    total = 0
    pending = 0  # digit waiting for its unit (e.g. the 二 in 二十)
    for char in chinese_numeral:
        if char in digit_values:
            pending = digit_values[char]
        elif char in unit_values:
            # A bare unit implies a leading 一 (十一 == 一十一 == 11).
            total += (pending or 1) * unit_values[char]
            pending = 0
        else:
            return 0  # unrecognized character: signal error with 0
    # A trailing digit is the ones place.
    return total + pending
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/xinmeitiyingxiao/lingting.git
git@gitee.com:xinmeitiyingxiao/lingting.git
xinmeitiyingxiao
lingting
针对cosyvoice开发的大文本转语音处理工具_听书狂人处理机
master

搜索帮助