Fetching the repository succeeded.
This action will force synchronization from 东方佑/chinese_chatbot_corpus, which will overwrite any changes that you have made since you forked the repository; the overwritten changes cannot be recovered!
The synchronization will run in the background, and the page will refresh when it finishes. Please be patient.
import codecs
import chardet
from config import Config
# Code-point table for full-width -> half-width conversion:
#   * 12288 (U+3000, ideographic space) maps to 32 (ASCII space);
#   * 65281..65374 (U+FF01..U+FF5E) are shifted down by 65248, which lands
#     them on the ASCII range 33..126 ('!'..'~').
_FULL_TO_HALF = {12288: 32}
_FULL_TO_HALF.update((code, code - 65248) for code in range(65281, 65375))


def str_q2b(s):
    """Convert full-width characters in *s* to their half-width form.

    Characters whose code point is 12288, or lies in 65281..65374, are
    replaced as described by ``_FULL_TO_HALF``; every other character is
    copied through unchanged.

    Args:
        s: An iterable of single characters (normally a ``str``).

    Returns:
        A new ``str`` with the same number of characters as *s*.
    """
    # Build the result with a single join instead of repeated ``+=`` so the
    # cost stays linear in the length of the input.
    return "".join(
        chr(_FULL_TO_HALF.get(code, code)) for code in map(ord, s)
    )
def check_file_encoding(file_name):
    """Detect a file's encoding with chardet, print the result and return it.

    Args:
        file_name: Path of the file to inspect. The whole file is read into
            memory as raw bytes.

    Returns:
        The value returned by ``chardet.detect`` for the file's contents.
    """
    # The context manager closes the handle even if the read raises; the
    # original left the file open.
    with open(file_name, 'rb') as f:
        data = f.read()
    detect_result = chardet.detect(data)
    print(detect_result)
    return detect_result
def generate_single_pairs_from_multi_turn(utterances):
    """Split a multi-turn dialogue into consecutive (previous, next) pairs.

    Args:
        utterances: A sequence of utterances in dialogue order.

    Returns:
        A list of 2-tuples ``(utterances[i], utterances[i + 1])`` for every
        adjacent position. The list is empty when *utterances* has fewer
        than two items.
    """
    # Zipping the sequence with itself shifted by one yields each adjacent
    # pair; zip stops at the shorter (shifted) sequence.
    return list(zip(utterances, utterances[1:]))
def check_format(file_name):
    """Report the first line of a file that is not a two-column TSV record.

    Each line is split on tab characters. At the first line that does not
    produce exactly two fields, the file name, the offending line, its
    zero-based index and its field count are printed, and scanning stops.
    Nothing is printed when every line has two fields.

    Args:
        file_name: Path of the file to check; it is decoded using
            ``Config.encoding``.
    """
    # ``with`` guarantees the stream is closed even if decoding raises
    # part-way through the file.
    with codecs.open(file_name, encoding=Config.encoding) as file:
        for index, line in enumerate(file):
            pair = line.split("\t")
            if len(pair) != 2:
                print("error", file_name)
                print(line, index, len(pair))
                break
def format_refine(file_name):
    """Rewrite a file in place, keeping only two-column TSV lines.

    A line is kept when splitting it on tab characters yields exactly two
    fields; all other lines are dropped. Kept lines are written back
    unchanged and in their original order.

    Args:
        file_name: Path of the file to filter; it is read and rewritten
            using ``Config.encoding``.
    """
    # Read everything first: the same path is reopened for writing below,
    # which truncates it.
    with codecs.open(file_name, encoding=Config.encoding) as file:
        valid_lines = [line for line in file if len(line.split("\t")) == 2]

    # NOTE(review): the file is truncated before the new content is written,
    # so a failure during the write loses data; consider writing to a
    # temporary file and renaming if that matters for callers.
    with codecs.open(file_name, "w", encoding=Config.encoding) as file:
        file.writelines(valid_lines)
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。