# Source: translate/fanyi.py (branch: master)

import json
import random
import hashlib

from googletrans import Translator
import collections
import os
import requests
import re
from lxml import etree
import pandas as pd
# Directory where get_html() stores downloaded pages and get_str() reads them back.
h_dir = 'html'
# Directory holding the per-page spreadsheets written by get_str() and read by excel_all().
excel = 'excel'

# Translation cache file used by get_tran() and json_excel(). Its stem is split on '_'
# to obtain the source and target language codes (here 'zh' and 'en').
zh_json = 'zh_en.json'
# Alternative cache file name for the zh -> jp language pair.
# zh_json = 'zh_jp.json'


def bd_api(q='机器人', fromLang='zh', toLang='en'):
    """
    Translate a piece of text through the Baidu translation HTTP API.

    The request is signed with md5(appid + q + salt + secretKey), where the
    salt is a fresh random integer for every call.

    :param q: {str} text to translate
    :param fromLang: {str} source language code
    :param toLang: {str} target language code
    :return: the decoded JSON body of the API response
    :raises requests.exceptions.RequestException: on network failure or when
        the server does not answer within the timeout
    """
    # SECURITY: these credentials are committed in source. They can be
    # overridden through the environment; the literals remain only as a
    # backward-compatible fallback and should be rotated and removed.
    appid = os.environ.get('BAIDU_FANYI_APPID', '20200324000404031')
    secretKey = os.environ.get('BAIDU_FANYI_SECRET', '3E3ilPZAIudqQboGY4Jt')
    url_api = 'https://fanyi-api.baidu.com/api/trans/vip/translate'
    salt = random.randint(32768, 65536)
    # str.encode() defaults to UTF-8, so the digest is taken over UTF-8 bytes.
    raw_sign = appid + q + str(salt) + secretKey
    sign = hashlib.md5(raw_sign.encode()).hexdigest()
    params = {
        'q': q,
        'from': fromLang,
        'to': toLang,
        'appid': appid,
        'salt': salt,
        'sign': sign,
    }
    # Without a timeout a stalled connection would block the caller forever.
    r = requests.get(url_api, params=params, timeout=10)
    return r.json()


def is_contain_chinese(check_str):
    """
    Tell whether a string contains at least one Chinese character.

    A character counts as Chinese when its code point lies in the inclusive
    range U+4E00..U+9FFF.

    :param check_str: {str} the string to inspect
    :return: {bool} True if such a character is present, False otherwise
    """
    return any('\u4e00' <= char <= '\u9fff' for char in check_str)


def sitemap1():
    """
    De-duplicate the URL list stored in ./sitemap1/sitemap.txt.

    Reads whitespace-separated URLs, prints the raw list and every URL that
    occurs more than once, keeps only URLs containing
    'https://www.encootech.com/', and writes the unique URLs in sorted order
    (one per line) to './sitemap1/sitemap去重后.txt'.
    """
    with open('./sitemap1/sitemap.txt', 'r', encoding='utf8') as f:
        url_li = f.read().split()
    print(url_li, len(url_li))
    counts = collections.Counter(url_li)
    # Show the duplicated entries together with how often each one occurs.
    print({key: value for key, value in counts.items() if value > 1})
    # A set removes duplicates in linear time; the result is sorted anyway,
    # so the original encounter order does not need to be preserved.
    new_url_li = sorted(
        {url for url in url_li if 'https://www.encootech.com/' in url}
    )
    print(new_url_li, len(new_url_li))
    with open('./sitemap1/sitemap去重后.txt', 'w', encoding='utf8') as f:
        f.writelines(f'{url}\n' for url in new_url_li)


def sitemap2():
    """
    De-duplicate the URL list stored in ./sitemap2/www.encootech.com.txt.

    Reads whitespace-separated URLs, prints the raw list and every URL that
    occurs more than once, keeps only URLs containing
    'https://www.encootech.com/', and writes the unique URLs in sorted order
    (one per line) to './sitemap1/sitemap去重后.txt', which is the file
    get_html() reads.
    """
    with open('./sitemap2/www.encootech.com.txt', 'r', encoding='utf8') as f:
        url_li = f.read().split()
    print(url_li, len(url_li))
    counts = collections.Counter(url_li)
    # Show the duplicated entries together with how often each one occurs.
    print({key: value for key, value in counts.items() if value > 1})
    # A set removes duplicates in linear time; the result is sorted anyway,
    # so the original encounter order does not need to be preserved.
    new_url_li = sorted(
        {url for url in url_li if 'https://www.encootech.com/' in url}
    )
    print(new_url_li, len(new_url_li))
    # NOTE(review): the output goes to the sitemap1 directory even though the
    # input comes from sitemap2 - kept as-is because get_html() reads from
    # there; confirm this is intended.
    out_path = './sitemap1/sitemap去重后.txt'
    # The sitemap1 directory may not exist when only sitemap2 input is used.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w', encoding='utf8') as f:
        f.writelines(f'{url}\n' for url in new_url_li)


def get_html():
    """
    Download every URL of the de-duplicated sitemap into the ``h_dir`` folder.

    The URL list must be prepared beforehand in
    './sitemap1/sitemap去重后.txt' (it can be produced with a sitemap
    generator such as https://sitemap.webkk.net/). Each page is saved as
    '<title>——<url without scheme, "/" replaced by "$">.html'; get_str()
    later rebuilds the URL from the part after '——'.

    :raises requests.exceptions.RequestException: on network failure or when
        a server does not answer within the timeout
    """
    os.makedirs(h_dir, exist_ok=True)
    with open('./sitemap1/sitemap去重后.txt', 'r', encoding='utf8') as f:
        url_li = f.read().split()
    for url in url_li:
        print('网站url:', url)
        # Without a timeout one stalled server would block the whole crawl.
        r = requests.get(url, timeout=30)
        html = etree.HTML(r.text)
        title = html.xpath('string(//title)')
        title = title.replace('|', '$').replace(' ', '')
        # A path separator inside the title would point the file name at a
        # non-existent sub-directory, so neutralise it like '|' above.
        for sep in (os.sep, os.altsep):
            if sep:
                title = title.replace(sep, '$')
        # Drops the first 8 characters, i.e. the 'https://' prefix; get_str()
        # re-adds 'https://' when it reconstructs the URL.
        n = url[8:].replace('/', '$')
        file_name = f'{h_dir}/{title}——{n}.html'
        print('文件名：', file_name)
        with open(file_name, 'wb') as out:
            out.write(r.content)

# get_html()


def get_str():
    """
    Extract the text between tags from every saved page in ``h_dir``.

    For each file the page URL is rebuilt from the file name, the text
    fragments found between '>' and '<' are collected (blank fragments,
    '-->' and fragments containing 'function' or 'var ' are dropped), and
    the result - URL first - is written both to 'txt/<name>.txt' (one
    fragment per line) and to '<excel>/<name>.xlsx' in a single column
    named '中文'.
    """
    txt = 'txt'
    for folder in (txt, excel):
        if not os.path.exists(folder):
            os.mkdir(folder)
    # Matches the content sitting between a closing '>' and the next '<'.
    between_tags = re.compile('>(.*?)<', re.S)
    for page_file in os.listdir(h_dir):
        # The file name ends with the URL (scheme removed, '/' stored as '$').
        url_part = page_file.split('——')[-1]
        page_url = 'https://' + url_part.replace('$', '/').replace('.html', '')
        with open(f'{h_dir}/{page_file}', 'r', encoding='utf8') as src:
            markup = src.read()
        stripped = (piece.strip() for piece in between_tags.findall(markup))
        fragments = [
            piece for piece in stripped
            if piece and piece != '-->'
            and 'function' not in piece and 'var ' not in piece
        ]
        fragments.insert(0, page_url)
        base = page_file.replace('.html', '')
        with open(f'{txt}/{base}.txt', 'w', encoding='utf8') as dst:
            dst.writelines(f'{piece}\n' for piece in fragments)

        # Store the same fragments as a one-column spreadsheet via pandas.
        frame = pd.DataFrame(fragments, columns=['中文'])
        frame.to_excel(f'{excel}/{base}.xlsx', sheet_name=base[-30:], index=False)


# get_str()


def get_tran():
    """
    Translate every not-yet-translated Chinese line found in the 'txt' folder.

    The language pair comes from the stem of ``zh_json`` ('<from>_<to>').
    Existing translations are loaded from ``zh_json`` and act as a cache, so
    only lines that are missing from it and contain Chinese characters are
    sent to bd_api(). Processing stops at the first failure, the error is
    printed, and everything translated so far is written back to ``zh_json``.
    """
    lang_pair = zh_json.split('.')[0].split('_')
    fromLang, toLang = lang_pair[0], lang_pair[1]
    txt = 'txt'
    t_li = os.listdir(txt)
    if not os.path.exists(zh_json):
        # Create an empty cache so the read below always succeeds.
        with open(zh_json, 'w', encoding='utf8') as f:
            f.write('')
    with open(zh_json, 'r', encoding='utf8') as f:
        done = f.read()
    done_di = json.loads(done) if done else {}
    try:
        for file_name in t_li:
            with open(f'{txt}/{file_name}', 'r', encoding='utf8') as f:
                text_li = f.read().split('\n')
            for line in text_li:
                # Dict membership is O(1); skip cached or non-Chinese lines.
                if line in done_di or not is_contain_chinese(line):
                    continue
                r = bd_api(line, fromLang, toLang)
                print(r)
                results = r.get('trans_result')
                if not results:
                    # An error response carries no 'trans_result'; report it
                    # explicitly instead of failing on None[0].
                    raise RuntimeError(
                        f'translation failed for {line!r}: {r}')
                done_di[line] = results[0].get('dst')
    except Exception as e:
        # Deliberate best-effort boundary: stop translating on the first
        # error but still persist the translations gathered so far.
        print(e)
    if done_di:
        with open(zh_json, 'w', encoding='utf8') as f:
            f.write(json.dumps(done_di, indent=4, ensure_ascii=False))

# get_tran()


def json_excel():
    """
    Convert the translation cache ``zh_json`` into a two-column spreadsheet.

    The JSON keys go into the '中文' column and the values into a column
    named after the second '_'-separated part of the file stem. The frame is
    printed and saved as '<stem>.xlsx' without the index.
    """
    stem = zh_json.split('.')[0]
    with open(zh_json, 'r', encoding='utf8') as fh:
        translations = json.loads(fh.read())
    sources = list(translations.keys())
    targets = list(translations.values())
    target_lang = stem.split('_')[1]
    table = pd.DataFrame({'中文': sources, target_lang: targets})
    print(table)
    table.to_excel(f'{stem}.xlsx', index=False)


# json_excel()


def excel_all():
    """
    Attach the English and Japanese translations to every per-page workbook.

    Each workbook in the ``excel`` folder is left-merged with 'zh_en.xlsx'
    and then with 'zh_jp.xlsx' (no explicit key is passed, so pandas joins
    on the columns the frames have in common), the 'en' / 'jp' columns are
    renamed to '英文' / '日文', and the result is printed and saved under the
    same file name in the '翻译结果' folder.
    """
    workbook_names = os.listdir(excel)
    res = '翻译结果'
    if not os.path.exists(res):
        os.mkdir(res)

    english = pd.read_excel('zh_en.xlsx')
    japanese = pd.read_excel('zh_jp.xlsx')
    column_labels = {'en': '英文', 'jp': '日文'}
    for workbook in workbook_names:
        page = pd.read_excel(f'{excel}/{workbook}')
        with_english = pd.merge(page, english, how='left')
        merged = pd.merge(with_english, japanese, how='left').rename(
            columns=column_labels)
        print(merged)
        merged.to_excel(f'{res}/{workbook}', index=False)
    print('完成翻译')

# Runs the final merge step whenever this module is executed or imported;
# the calls to the other pipeline steps above are commented out.
excel_all()