# [Hosting-page residue, not part of the script] Code pull complete; the page will refresh automatically.
from io import open
import pickle
# Relative paths to two pickled dictionaries. Judging by the file names these
# presumably map Wubi codes to Chinese characters and back -- TODO confirm.
# Neither constant is referenced in the visible part of this script.
wubi2ch = "data/wubi_to_chinese.pkl"
ch2wubi = "data/chinese_to_wubi.pkl"
def load_dict(dict_path):
    """Load and return the object pickled in the file at *dict_path*.

    The file is opened in binary mode and closed as soon as unpickling
    finishes (or fails), so no file handle is left for the garbage
    collector to clean up.

    NOTE: ``pickle.load`` can execute arbitrary code while unpickling;
    only call this on files from a trusted source.

    Args:
        dict_path: Path of the pickle file to read.

    Returns:
        Whatever object was stored in the pickle file.
    """
    with open(dict_path, "rb") as pickle_file:
        return pickle.load(pickle_file)
# Unpickle the lookup table used further down in this script, where it is
# indexed with individual UTF-8 byte values to obtain the replacement symbol.
with open("byte_char_map.pkl", "rb") as map_file:
    byte_char_map = pickle.load(map_file)

# Separator written after the symbols of each source character: the code
# point 50000 positions above the underscore.
SEP = chr(50000 + ord('_'))
def _encode_line(text, table, sep):
    """Return *text* re-encoded symbol by symbol, terminated by a newline.

    Leading and trailing whitespace is stripped from *text*. Every remaining
    character is expanded into its UTF-8 bytes, each byte is replaced by
    ``table[byte]``, and *sep* is appended after the symbols of each
    character so that character boundaries stay recoverable.

    Args:
        text: One line of the source corpus.
        table: Lookup indexable by an integer byte value, yielding a string.
        sep: Separator string appended after every source character.

    Returns:
        The encoded line, always ending in ``'\n'``.
    """
    pieces = []
    for char in text.strip():
        pieces.extend(table[byte] for byte in char.encode('utf-8'))
        pieces.append(sep)
    pieces.append('\n')
    # Join once at the end instead of growing a string with repeated "+=".
    return ''.join(pieces)


# Convert the corpus line by line. Both files are opened with an explicit
# UTF-8 encoding: the conversion depends on UTF-8 bytes and the output
# contains non-ASCII symbols, so the platform default must not be relied on.
with open('/data2/private/clsi/wubi_corpus_orig/formatted/baidubaike_corpus.txt', 'r', encoding='utf-8') as f:
    with open('/data2/private/clsi/wubi_corpus_byte/formatted/baidubaike_corpus.txt', 'w', encoding='utf-8') as fw:
        for idx, line in enumerate(f, 1):
            newline = _encode_line(line, byte_char_map, SEP)
            fw.write(newline)
            # Progress report every 400000 lines, with a sample of the output.
            # NOTE(review): the original indentation was lost; the sample
            # print is assumed to belong to this branch -- confirm.
            if idx % 400000 == 0:
                print(idx)
                print(newline)
# with open('/data2/private/clsi/wubi_corpus_byte/formatted/baidubaike_corpus.txt', 'r') as f:
# print (f.readline())
## tmux 12
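# [Hosting-page residue, not part of the script] This section may contain content unsuitable for display, so the page does not show it. You can review and modify it using the relevant editing features.
# [Hosting-page residue, not part of the script] If you confirm that the content does not involve inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false information / worthless content, or content that violates national laws and regulations, you can click Submit to appeal, and we will handle it as soon as possible.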