import json
import random
import hashlib
from googletrans import Translator
import collections
import os
import requests
import re
from lxml import etree
import pandas as pd
h_dir = 'html'  # directory for the downloaded pages
excel = 'excel'  # directory for the per-page spreadsheets
zh_json = 'zh_en.json'  # translation cache, {chinese: translation}
# zh_json = 'zh_jp.json'
def bd_api(q='机器人', fromLang='zh', toLang='en'):
    appid = '20200324000404031'  # fill in your appid
    secretKey = '3E3ilPZAIudqQboGY4Jt'  # fill in your secret key
url_api = 'https://fanyi-api.baidu.com/api/trans/vip/translate'
    # fromLang = 'zh'  # source language
    # toLang = 'en'  # target language
    # toLang = 'jp'  # target language
# q = '机器人'
salt = random.randint(32768, 65536)
sign = appid+q+str(salt)+secretKey
sign = hashlib.md5(sign.encode()).hexdigest()
params = {
'q': q,
'from': fromLang,
'to': toLang,
'appid': appid,
'salt': salt,
'sign': sign,
}
r = requests.get(url_api, params=params)
# print(r.status_code, r.json())
return r.json()
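# A minimal usage sketch for bd_api(); the response shape below follows the
# Baidu Translate API documentation and assumes the appid/secretKey above are
# valid:
#     bd_api('机器人', 'zh', 'en')
#     # -> {'from': 'zh', 'to': 'en', 'trans_result': [{'src': '机器人', 'dst': 'robot'}]}
# The translation itself is therefore read as r.get('trans_result')[0].get('dst'),
# which is exactly what get_tran() does further down.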
def is_contain_chinese(check_str):
"""
判断字符串中是否包含中文
:param check_str: {str} 需要检测的字符串
:return: {bool} 包含返回True, 不包含返回False
"""
for ch in check_str:
if u'\u4e00' <= ch <= u'\u9fff':
return True
return False
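# Quick self-check for is_contain_chinese(); \u4e00-\u9fff covers the common
# CJK Unified Ideographs block, so pure ASCII strings return False:
#     is_contain_chinese('机器人')        # True
#     is_contain_chinese('robot')         # False
#     is_contain_chinese('robot 机器人')  # True, a single Chinese character is enough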
def sitemap1():
with open('./sitemap1/sitemap.txt', 'r', encoding='utf8') as f:
urls = f.read()
url_li = urls.split()
print(url_li, len(url_li))
b = dict(collections.Counter(url_li))
    print({key: value for key, value in b.items() if value > 1})  # show duplicated URLs and how often each repeats
new_url_li = []
for url in url_li:
if 'https://www.encootech.com/' in url and url not in new_url_li:
new_url_li.append(url)
new_url_li.sort()
print(new_url_li, len(new_url_li))
with open('./sitemap1/sitemap去重后.txt', 'w', encoding='utf8') as f:
for url in new_url_li:
f.write(f'{url}\n')
def sitemap2():
with open('./sitemap2/www.encootech.com.txt', 'r', encoding='utf8') as f:
urls = f.read()
url_li = urls.split()
print(url_li, len(url_li))
b = dict(collections.Counter(url_li))
    print({key: value for key, value in b.items() if value > 1})  # show duplicated URLs and how often each repeats
new_url_li = []
for url in url_li:
if 'https://www.encootech.com/' in url and url not in new_url_li:
new_url_li.append(url)
new_url_li.sort()
print(new_url_li, len(new_url_li))
with open('./sitemap1/sitemap去重后.txt', 'w', encoding='utf8') as f:
for url in new_url_li:
f.write(f'{url}\n')
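# sitemap1() and sitemap2() only differ in their input file; a shared helper
# such as the hypothetical dedup_sitemap() below is one way to express the
# common logic (a sketch only, nothing in this script calls it):
def dedup_sitemap(src_path, dst_path, prefix='https://www.encootech.com/'):
    """Read a whitespace-separated URL list, keep unique URLs under prefix, sort them and write one per line."""
    with open(src_path, 'r', encoding='utf8') as f:
        url_li = f.read().split()
    # show duplicated URLs and their counts, like the two functions above do
    print({u: n for u, n in collections.Counter(url_li).items() if n > 1})
    new_url_li = sorted({u for u in url_li if prefix in u})
    with open(dst_path, 'w', encoding='utf8') as f:
        for url in new_url_li:
            f.write(f'{url}\n')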
def get_html():
    # A deduplicated URL list must be prepared in advance; it can be generated with a sitemap service such as https://sitemap.webkk.net/
if not os.path.exists(h_dir):
os.mkdir(h_dir)
with open('./sitemap1/sitemap去重后.txt', 'r', encoding='utf8') as f:
urls = f.read()
url_li = urls.split()
for url in url_li:
print('网站url:', url)
r = requests.get(url)
html = etree.HTML(r.text)
title = html.xpath('string(//title)')
        title = title.replace('|', '$')  # '|' is not allowed in Windows file names
title = title.replace(' ', '')
        n = url[8:].replace('/', '$')  # drop 'https://' and make the rest of the URL file-name safe
print('文件名:', f'{h_dir}/{title}——{n}.html')
with open(f'{h_dir}/{title}——{n}.html', 'wb') as f:
f.write(r.content)
# get_html()
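# get_html() stops at the first network error; a more forgiving variant could
# wrap the request like this (a sketch, not what the script currently does):
#     try:
#         r = requests.get(url, timeout=30)
#         r.raise_for_status()
#     except requests.RequestException as e:
#         print('skip', url, e)
#         continue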
def get_str():
txt = 'txt'
if not os.path.exists(txt):
os.mkdir(txt)
if not os.path.exists(excel):
os.mkdir(excel)
h_li = os.listdir(h_dir)
for h in h_li:
        url = h.split('——')[-1].replace('$', '/').replace('.html', '')  # recover the original URL from the file name
url = 'https://'+url
with open(f'{h_dir}/{h}', 'r', encoding='utf8') as f:
text = f.read()
        regex = re.compile('>(.*?)<', re.S)  # match the text between tags
con = regex.findall(text)
con = [c.strip() for c in con]
        con = [c for c in con if c != '' and c != '-->' and 'function' not in c and 'var ' not in c]
con.insert(0, url)
# print(con, len(con))
n = h.replace('.html', '')
with open(f'{txt}/{n}.txt', 'w', encoding='utf8') as f:
for c in con:
f.write(f'{c}\n')
        # save the matched strings as a spreadsheet with pandas
df = pd.DataFrame(con, columns=['中文'])
# print(df)
df.to_excel(f'{excel}/{n}.xlsx', sheet_name=n[-30:], index=False)
# get_str()
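# What the '>(.*?)<' regex in get_str() captures on a tiny sample (the empty
# match between '</p>' and '<a>' is removed by the filter above):
#     re.compile('>(.*?)<', re.S).findall('<p>你好</p><a>World</a>')
#     # -> ['你好', '', 'World']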
def get_tran():
    # the language pair comes from the json file name, e.g. 'zh_en.json' -> 'zh', 'en'
    fromLang = zh_json.split('.')[0].split('_')[0]
    toLang = zh_json.split('.')[0].split('_')[1]
txt = 'txt'
t_li = os.listdir(txt)
if not os.path.exists(zh_json):
with open(zh_json, 'w') as f:
f.write('')
with open(zh_json, 'r', encoding='utf8') as f:
done = f.read()
if done:
done_di = json.loads(done)
else:
done_di = {}
# print(done_di)
try:
for t in t_li:
with open(f'{txt}/{t}', 'r', encoding='utf8') as f:
text = f.read()
text_li = text.split('\n')
            for line in text_li:
                if line not in done_di:
                    if is_contain_chinese(line):
                        r = bd_api(line, fromLang, toLang)
                        print(r)
                        done_di[line] = r.get('trans_result')[0].get('dst')
except Exception as e:
print(e)
if done_di:
with open(zh_json, 'w', encoding='utf8') as f:
f.write(json.dumps(done_di, indent=4, ensure_ascii=False))
# get_tran()
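# The free tier of the Baidu Translate API is rate limited (reportedly about
# one request per second on the standard plan), so a more cautious variant of
# the loop above could add a pause after every bd_api() call, e.g.:
#     import time
#     time.sleep(1)
# Because get_tran() caches finished strings in zh_en.json, re-running it after
# an error or quota limit only translates the strings that are still missing.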
def json_excel():
name = zh_json.split('.')
with open(zh_json, 'r', encoding='utf8') as f:
s = f.read()
done_di = json.loads(s)
a = []
b = []
for key, value in done_di.items():
a.append(key)
b.append(value)
col = name[0].split('_')
c = {'中文': a, col[1]: b}
df = pd.DataFrame(c)
print(df)
df.to_excel(f'{name[0]}.xlsx', index=False)
# json_excel()
def excel_all():
exc_li = os.listdir(excel)
    res = '翻译结果'  # output directory for the merged spreadsheets
if not os.path.exists(res):
os.mkdir(res)
    # both lookup tables come from json_excel(): run it once with zh_json = 'zh_en.json' and once with 'zh_jp.json'
    df_en = pd.read_excel('zh_en.xlsx')
    df_jp = pd.read_excel('zh_jp.xlsx')
for exc in exc_li:
df_0 = pd.read_excel(f'{excel}/{exc}')
# df_0 = pd.read_excel(r'D:\python\翻译\excel\新闻中心——www.encootech.com$news.xlsx')
        df_r = pd.merge(df_0, df_en, how='left')  # left joins on the shared '中文' column
        df_r = pd.merge(df_r, df_jp, how='left')
df_r = df_r.rename(columns={'en': '英文', 'jp': '日文'})
print(df_r)
df_r.to_excel(f'{res}/{exc}', index=False)
print('完成翻译')
excel_all()
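# Intended order of the full pipeline, based on the commented-out calls above
# (a sketch; uncomment only the steps you need, and note that get_tran() is
# the only step that spends Baidu API quota):
#     sitemap1()    # deduplicate the sitemap URL list
#     get_html()    # download every page into html/
#     get_str()     # extract visible text into txt/ and excel/
#     get_tran()    # translate new Chinese strings via the Baidu API
#     json_excel()  # dump the translation cache to zh_en.xlsx
#     excel_all()   # merge the translations into the per-page tables under 翻译结果/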