# Community-document/translate.py

import requests
import random
from hashlib import md5
import time
import os
import sys
import logging

# Logging setup: records at INFO and above go to a per-run file under ./logs/
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Create a handler that writes log records to a file named after the start
# time of this run (minute resolution).
rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
log_path = os.path.dirname(os.path.abspath(__file__)) + '/logs/'
# FileHandler does not create missing directories, so make sure it exists.
os.makedirs(log_path, exist_ok=True)
log_name = log_path + rq + '.log'
logfile = log_name
# Log messages contain Chinese text; pin UTF-8 instead of relying on the
# platform default encoding.
fh = logging.FileHandler(logfile, mode='w', encoding='utf-8')
formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
fh.setFormatter(formatter)
logger.addHandler(fh)


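# TODO: "QuecPython" was translated as "Quecpthon" (resolved)
# TODO: Image file paths must not be translated (resolved)
# TODO: The first letter of code was capitalised (resolved)
# TODO: Brackets were converted to Chinese full-width brackets (resolved)
# TODO: An extra space appeared between "* *", breaking bold text (resolved)
# TODO: Table handling - "|" is missing after translation
# TODO: YAML files also need the same fix-ups (resolved)
# TODO: Comments gained an extra space: <!-- * --> (resolved)
# TODO: Only the first half of a table-of-contents entry is translated (resolved)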

# Set your own appid/appkey.
# The BAIDU_TRANSLATE_APPID / BAIDU_TRANSLATE_APPKEY environment variables take
# precedence; the literals below are only fallbacks kept for backward
# compatibility.
# NOTE(review): credentials committed to source control should be rotated and
# removed from the file.
appid = os.environ.get('BAIDU_TRANSLATE_APPID', '20210722000894813')
appkey = os.environ.get('BAIDU_TRANSLATE_APPKEY', 'QBqv9vS1CIgKSj_foFed')
# For list of language codes, please refer to `https://api.fanyi.baidu.com/doc/21`
# from_lang = 'auto'
from_lang = 'zh'
to_lang = 'en'
endpoint = 'http://api.fanyi.baidu.com'
path = '/api/trans/vip/translate'
# Full request URL used by get_english().
url = endpoint + path


# Delete a file
def delete_txt(file_path):
    """Remove ``file_path`` if it exists.

    Returns True when the file was removed or did not exist in the first
    place, and False when the removal raised (the error is printed).
    """
    if not os.path.exists(file_path):
        return True
    try:
        os.remove(file_path)
    except Exception as e:
        exc_type, exc_value = sys.exc_info()[:2]
        print("remove file error.", e)
        print(exc_type, exc_value)
        return False
    return True


# Append text to a file
def write_txt(file_path, content):
    """Append ``content`` to ``file_path`` as UTF-8 text.

    The file is created when missing. Missing parent directories are created
    as well, so writing into an output tree that does not exist yet works.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'a+', encoding='utf-8') as f:
        f.write(content)


# Check whether the text contains Chinese characters
def check_contain_chinese(check_str):
    """Return True if ``check_str`` has a character in U+4E00..U+9FFF.

    ``check_str`` may be UTF-8 encoded bytes (the historical calling
    convention) or an already-decoded ``str``.
    """
    if isinstance(check_str, (bytes, bytearray)):
        text = check_str.decode('utf-8')
    else:
        text = check_str
    return any(u'\u4e00' <= ch <= u'\u9fff' for ch in text)


# Keeps the original layout of code lines (code blocks only)
def space_str_handle(space_str):
    """Split ``space_str`` at its first ``#``.

    Returns ``[before, after]`` (the ``#`` itself is dropped) when the line
    contains a ``#``, otherwise a one-element list holding the whole line.
    """
    before, marker, after = space_str.partition('#')
    if marker:
        return [before, after]
    return [before]


# Keeps the original indentation (for YAML)
def space_yaml_handle(space_str):
    """Return ``[indent, space_str]`` for a line.

    ``indent`` is the run of leading space characters; the second element is
    the full, unmodified input line. Empty or all-space input also yields a
    two-element list instead of falling off the end and returning None.
    """
    indent_len = len(space_str) - len(space_str.lstrip(' '))
    return [space_str[:indent_len], space_str]


# Splits off the link target (paths must not be translated; translating them
# introduces many extra spaces)
def space_path_handle(space_str: str) -> list:
    """Split a Markdown link line into translatable text and link target.

    When the line contains ``](``, it is printed and split right after the
    ``]`` of that ``](`` pair: ``[text_up_to_and_including_bracket, rest]``.
    Splitting at the ``](`` position (not at the first ``]`` anywhere in the
    line) keeps link text that itself contains ``]`` intact.
    Lines without ``](`` are returned as ``[space_str, '']``.
    """
    index = space_str.find('](')
    if index == -1:
        return [space_str, '']
    print(space_str)
    return [space_str[:index + 1], space_str[index + 1:]]


# Get the English translation via Baidu Translate
def get_english(words: str, timeout: float = 10) -> str:
    """Translate ``words`` with the Baidu Translate API and return the result.

    The source and target languages come from the module-level ``from_lang``
    and ``to_lang`` settings. The ``dst`` fields of all returned segments are
    concatenated into one string, and the known mistranslation 'Quecpthon' is
    corrected to 'QuecPython'.

    :param words: text to translate.
    :param timeout: seconds to wait for the HTTP request before giving up.
    :return: the translated text, or ``words`` unchanged if anything fails
        (the failure is printed and written to the log).
    """
    query = words

    # Generate salt and sign
    def make_md5(s, encoding='utf-8'):
        return md5(s.encode(encoding)).hexdigest()

    salt = random.randint(32768, 65536)
    sign = make_md5(appid + query + str(salt) + appkey)
    # Build request
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    payload = {'appid': appid, 'q': query, 'from': from_lang, 'to': to_lang, 'salt': salt, 'sign': sign}
    # Send request
    # Pause before every call (presumably to respect the API rate limit).
    time.sleep(2)
    try:
        # Without a timeout a stalled connection would block the run forever.
        r = requests.post(url, params=payload, headers=headers, timeout=timeout)
        result = r.json()
        print(result)

        # Without this check a failed call would only surface as an opaque
        # KeyError on 'trans_result'; report the API's own error fields instead.
        if 'trans_result' not in result:
            raise ValueError('Baidu API error %s: %s'
                             % (result.get('error_code'), result.get('error_msg')))
        ret = ''.join(item['dst'] for item in result['trans_result'])
        # Fix the mistranslated product name
        ret = ret.replace('Quecpthon', 'QuecPython')
    except Exception as e:
        # Best effort: fall back to the untranslated text so the output file
        # stays complete, and record what failed.
        ret = query
        print(e)
        logger.info('翻译失败的大哥：' + ret)
        logger.info('翻译失败的原因：' + str(e))
    return ret


# Translate one file line by line
def main(src_path):
    """Translate the file at ``src_path`` and append the result to its output file.

    The output path is ``src_path`` with every occurrence of 'zh' replaced by
    'en'. Each line is handled as follows:

    * empty lines and lines starting with ``![`` (images) are copied as-is;
    * lines without Chinese characters are copied as-is;
    * inside a ```` ```python ```` fence, only the text after the first ``#``
      is translated (the whole line if there is no ``#``);
    * other lines starting with a space keep their indentation and the
      translation goes through ``tran_handler``;
    * remaining lines are split at a Markdown link so the link target is kept
      untranslated.
    """
    # NOTE: replaces every 'zh' substring in the path, not only a 'zh' folder.
    out_path = src_path.replace('zh', 'en')
    code_flag = False
    # The context manager closes the source file even if a line fails.
    with open(src_path, 'r', encoding='utf-8') as src_file:
        for line in src_file:
            line = line.strip('\n')
            if len(line) < 1:  # the line is only a line break
                write_txt(out_path, "\n")
                continue
            # Image paths are not translated (the file would not be found afterwards)
            if line.startswith('!['):
                write_txt(out_path, line + "\n")
                continue
            if line == '```python':
                code_flag = True
            if line == '```':
                code_flag = False

            if not check_contain_chinese(line.encode('utf-8')):
                write_txt(out_path, line + "\n")
                continue

            if code_flag:
                parts = space_str_handle(line)
                if len(parts) == 2:
                    # Code followed by a comment: translate only the comment.
                    print('待翻译内容：', parts[1])
                    data = get_english(parts[1])
                    write_txt(out_path, parts[0] + '#' + data + "\n")  # Baidu API
                else:
                    print('待翻译内容：', parts[0])
                    data = get_english(parts[0])
                    write_txt(out_path, data + "\n")  # Baidu API
            elif line[0] == " ":    # not a table-of-contents entry
                parts = space_yaml_handle(line)
                print('待翻译内容：', parts[1])
                data = get_english(parts[1])
                write_txt(out_path, parts[0] + tran_handler(data) + "\n")
            else:
                parts = space_path_handle(line)
                print('待翻译内容：', parts[0])
                data = get_english(parts[0])
                write_txt(out_path,  tran_handler(data) + parts[1] + "\n")


# Repairs Markdown formatting damaged by translation (e.g. space after ####)
def tran_handler(hanle_str):
    """Fix up Markdown/YAML markers in a translated line.

    * ``* * *`` and ``* *`` are collapsed to ``**`` (the translation adds
      spaces that break bold markers);
    * ``- >`` becomes ``-->`` and ``-Label`` becomes ``- label``;
    * if the line starts with ``#``, exactly one space is guaranteed after the
      leading run of ``#`` characters. A heading that already has a space
      there is left alone instead of receiving a second one.
    """
    hanle_str = hanle_str.replace('* * *', '**')  # sometimes one extra space is produced
    hanle_str = hanle_str.replace('* *', '**')  # drop the space added during translation
    hanle_str = hanle_str.replace('- >', '-->')  # restore content lost in translation
    hanle_str = hanle_str.replace('-Label', '- label')  # restore content lost in translation

    if not hanle_str.startswith('#'):
        return hanle_str

    # Length of the leading run of '#' characters.
    count = len(hanle_str) - len(hanle_str.lstrip('#'))
    if hanle_str[count:count + 1] == ' ':
        return hanle_str
    return hanle_str[:count] + ' ' + hanle_str[count:]


if __name__ == '__main__':

    # Single file: the first command-line argument if given, otherwise the
    # historical hard-coded default.
    default_src = 'E:\\teedoc_wiki\\新版文档中心\\Community-document\\docs\\sbs\\zh\\README.md'
    src_file = sys.argv[1] if len(sys.argv) > 1 else default_src
    # Must mirror the output-path rule used inside main().
    dst_file = src_file.replace('zh', 'en')
    if dst_file == src_file:
        # Without a 'zh' in the path the "output" would be the source itself,
        # and deleting it below would destroy the input.
        sys.exit("source path must contain 'zh': " + src_file)
    delete_txt(dst_file)  # remove the previous translation output
    main(src_file)
    # Single directory tree (kept for reference)
    # root_list = 'E:\\teedoc_wiki\\新版文档中心\\Community-document\\docs\\FAQ\\zh'
    # for root, dirs, files in os.walk(root_list, topdown=False):
    #     for name in files:
    #         if name.split(".")[-1] in ('md', 'yaml'):
    #             # print(os.path.join(root, name))
    #             delete_txt(os.path.join(root, name).replace('zh', 'en'))  # remove old output
    #             main(os.path.join(root, name))
    #             time.sleep(5)
    # Whole project (kept for reference)
    # root_path = 'E:\\teedoc_wiki\\新版文档中心\\Community-document\\docs\\'
    # root_list = os.listdir(root_path)
    # for i in root_list:
    #     print(os.path.join(root_path, i))
    #     for root, dirs, files in os.walk(os.path.join(root_path, i+'\\zh'), topdown=False):
    #         for name in files:
    #             if name.split(".")[-1] in ('md', 'yaml'):
    #                 delete_txt(os.path.join(root, name).replace('zh', 'en'))  # remove old output
    #                 main(os.path.join(root, name))
    #                 time.sleep(5)