This repository is a mirror maintained to speed up downloads in mainland China; it is synced once a day. Original repository: https://github.com/kaixindelele/ChatPaper
chat_translate.py
import scipdf
import sys, os
import openai
import tenacity
import tiktoken
import re
from functools import lru_cache
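
# Note: this file targets the pre-1.0 openai Python SDK (module-level
# openai.api_key / openai.api_base and openai.ChatCompletion); openai>=1.0
# removed that interface, so pin an older release when reproducing this.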

class LazyloadTiktoken(object):
    def __init__(self, model):
        self.model = model

    @staticmethod
    @lru_cache(maxsize=128)
    def get_encoder(model):
        # Fetched lazily and cached, so the (potentially slow) download
        # only happens on first use and is shared across instances.
        print('Loading the tokenizer; the first run may take a moment to download its files')
        tmp = tiktoken.encoding_for_model(model)
        print('Tokenizer loaded')
        return tmp

    def encode(self, *args, **kwargs):
        encoder = self.get_encoder(self.model)
        return encoder.encode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        encoder = self.get_encoder(self.model)
        return encoder.decode(*args, **kwargs)
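
# A minimal usage sketch (hypothetical snippet, not part of the original file):
# the tokenizer download is deferred until the first encode()/decode() call.
#
#   tok = LazyloadTiktoken("gpt-3.5-turbo")
#   n_tokens = len(tok.encode("Attention is all you need."))
#   assert tok.decode(tok.encode("hello")) == "hello"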

def parse_pdf(path):
    pdf = None  # stays None if parsing fails outright
    try:
        pdf = scipdf.parse_pdf_to_dict(path, as_list=False)
        # The fields below are optional conveniences; keep or remove as needed.
        pdf['authors'] = pdf['authors'].split('; ')
        pdf['section_names'] = [it['heading'] for it in pdf['sections']]
        pdf['section_texts'] = [it['text'] for it in pdf['sections']]
    except Exception as e:
        print("parse_pdf_to_dict(path) failed:", e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
    return pdf
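
# For reference, the parsed dict has roughly this shape (only the keys this
# script relies on are shown; scipdf parses the PDF through a GROBID service,
# so one must be running for parse_pdf to succeed):
#
#   {
#       'title': 'Attention Is All You Need',
#       'abstract': '...',
#       'authors': ['Ashish Vaswani', '...'],                # after the split above
#       'sections': [{'heading': 'Introduction', 'text': '...'}, ...],
#       'section_names': ['Introduction', ...],              # added above
#       'section_texts': ['...', ...],                       # added above
#   }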

@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                stop=tenacity.stop_after_attempt(8),
                reraise=True)
def chat_translate_part(text, key, title=False, domain="", tokenizer_gpt35=None, task="翻译"):
    openai.api_key = key
    # Long inputs would need to be split and translated in several calls, or
    # sent straight to the 16k-context API. First measure the token length:
    token_size = len(tokenizer_gpt35.encode(text))
    if token_size > 1800:
        model = "gpt-3.5-turbo-16k"
    else:
        model = "gpt-3.5-turbo"
    if title:
        messages = [
            {"role": "system",
             "content": "You are now a professional Science and technology editor"},
            {"role": "assistant",
             "content": "Your task now is to translate the title of the paper, the paper is about " + domain},
            {"role": "user", "content": "Input Contents:" + text +
             """
你需要把输入的标题,翻译成中文,且加上原标题。
注意,一些专业的词汇,或者缩写,还是需要保留为英文。
输出中文翻译部分的时候,只保留翻译的标题,不要有任何其他的多余内容,不要重复,不要解释。
输出原标题的时候,完整输出即可,不要多也不要少。
你的输出格式如下:
Output format is (你需要根据上面的要求,xxx是中文翻译的占位符,yyy是英文原标题的占位符,你需要将内容填充进去):
\n
# xxx
## yyy
\n
"""},
        ]
    else:
        messages = [
            {"role": "system",
             "content": "You are a professional academic paper translator."},
            {"role": "assistant",
             "content": "Your task now is to {} the Input Contents, which is a section of a paper; the paper is about {}".format(task, domain)},
            {"role": "user", "content": f"""
你的任务是口语化{task}输入的论文章节,{task}的内容要遵循下面的要求:
1. 在保证术语严谨的同时,文字表述需要更加口语化。
2. 需要地道的中文{task},逻辑清晰且连贯,少用倒装句式。
3. 对于简短的Input Contents,不要画蛇添足,增加多余的解释和扩展。
4. 对于本领域的专业术语,需要标注英文,便于读者参考。这篇论文的领域是{domain}
5. 适当使用MarkDown语法,比如有序列表、加粗等。
你的输出内容格式需要遵循下面的要求:
1. ## 章节名称,中文{task}(Original English section name)
2. 章节内容的{task}
Output format is (你需要根据上面的要求,自动填充xxx和yyy的占位符):
\n
## xxx
yyy
\n
Input includes section name and section text, Input Contents: {text}
"""},
        ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0.3,
    )
    result = ''
    for choice in response.choices:
        result += choice.message.content
    print("translate_result:\n", result)
    print("prompt_token_used:", response.usage.prompt_tokens,
          "completion_token_used:", response.usage.completion_tokens,
          "total_token_used:", response.usage.total_tokens)
    print("response_time:", response.response_ms / 1000.0, 's')
    info = {}
    info['result'] = result
    info['token_used'] = response.usage.total_tokens
    info['response_time'] = response.response_ms / 1000.0
    return info
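
# A hypothetical call sketch (key and domain are placeholders, not from the file):
#
#   tok = LazyloadTiktoken("gpt-3.5-turbo")
#   info = chat_translate_part("Section Name:Abstract\n Section text:...",
#                              key="sk-...", domain="robotics",
#                              tokenizer_gpt35=tok)
#   print(info['result'], info['token_used'], info['response_time'])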

@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                stop=tenacity.stop_after_attempt(8),
                reraise=True)
def chat_check_domain(text, key):
    openai.api_key = key
    messages = [
        {"role": "system",
         "content": "You are now a professional Science and technology editor"},
        {"role": "assistant",
         "content": "Your task is to judge the subject and domain of the paper based on the title and abstract of the paper, and your output should not exceed five words!"},
        {"role": "user", "content": "Input Contents:" + text},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.3,
    )
    result = ''
    for choice in response.choices:
        result += choice.message.content
    print("domain_result:\n", result)
    print("prompt_token_used:", response.usage.prompt_tokens,
          "completion_token_used:", response.usage.completion_tokens,
          "total_token_used:", response.usage.total_tokens)
    print("response_time:", response.response_ms / 1000.0, 's')
    info = {}
    info['result'] = result
    info['token_used'] = response.usage.total_tokens
    info['response_time'] = response.response_ms / 1000.0
    return info

def main(root_path, pdf_path, base_url, key, task="翻译"):
    md_file = root_path + pdf_path.split("/")[-1].replace(".pdf", '.md')
    md_str = "\n"
    token_consumed = 0
    # Point the client at the configured API base before any request is made.
    openai.api_base = base_url
    paper_pdf = parse_pdf(pdf_path)
    tokenizer_gpt35 = LazyloadTiktoken("gpt-3.5-turbo")
    # First pin down the paper's subject from its title and abstract; this
    # constrains the prompts that follow and improves quality noticeably.
    if "title" in paper_pdf.keys() and "abstract" in paper_pdf.keys():
        text = "Title:" + paper_pdf['title'] + "Abstract:" + paper_pdf['abstract']
        return_dict = chat_check_domain(text, key)
        domains = return_dict['result']
        token_consumed += return_dict["token_used"]
    else:
        domains = ""
    print("Detected paper domain:", domains)
    # input("Continue?")
    # Translate the title first
    if "title" in paper_pdf.keys():
        text = paper_pdf['title']
        return_dict = chat_translate_part(text, key, title=True, domain=domains, tokenizer_gpt35=tokenizer_gpt35)
        result = return_dict['result']
        md_str += result
        md_str += "\n"
        md_str += "\n"
        token_consumed += return_dict["token_used"]
        with open(md_file, 'w', encoding="utf-8") as f:
            f.write(md_str)
    # Then translate the abstract
    if "abstract" in paper_pdf.keys():
        text = "Section Name:" + "Abstract" + "\n Section text:" + paper_pdf['abstract']
        return_dict = chat_translate_part(text, key, domain=domains, tokenizer_gpt35=tokenizer_gpt35)
        result = return_dict['result']
        cur_str = "\n"
        cur_str += result
        cur_str += "\n"
        token_consumed += return_dict["token_used"]
        with open(md_file, 'a', encoding="utf-8") as f:
            f.write(cur_str)
    for section_index, section_name in enumerate(paper_pdf['section_names']):
        print(section_index, section_name)
        # Skip empty sections:
        if len(paper_pdf['section_texts'][section_index]) > 0:
            text = "Section Name:" + section_name + "\n Section text:" + paper_pdf['section_texts'][section_index]
            return_dict = chat_translate_part(text, key, domain=domains, tokenizer_gpt35=tokenizer_gpt35, task=task)
            result = return_dict['result']
            cur_str = "\n"
            cur_str += result
            cur_str += "\n"
            token_consumed += return_dict["token_used"]
            # If a "##" heading is not preceded by a newline and is followed by
            # a short run (1-18 non-newline characters) ending in a non-word
            # character, insert a newline before it so the Markdown renders.
            pattern = r'([^\n])##([^\n]{1,18}\W+)'
            cur_str = re.sub(pattern, r'\1\n##\2', cur_str)
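            # Worked example (hypothetical string, not real model output):
            #   "……实验结束。## 结论(Conclusion)\n"
            # becomes
            #   "……实验结束。\n## 结论(Conclusion)\n"
            # so the glued-on heading starts on its own line.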
            with open(md_file, 'a', encoding="utf-8") as f:
                f.write(cur_str)
    print("The whole paper consumed {} tokens!".format(token_consumed))

if __name__ == "__main__":
    root_path = r'./'
    pdf_path = r'./demo.pdf'
    base_url = 'https://api.openai.com/v1'
    key = "sk-xxx"
    task = "翻译"
    main(root_path, pdf_path, base_url, key, task)
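
# Running the file end to end is roughly (the paths and key above are
# placeholders): start a GROBID service for scipdf to call (the upstream
# ChatPaper repository describes how), point pdf_path at a real paper,
# fill in a valid key, then:
#
#   python chat_translate.py
#
# The translated Markdown is written under root_path as demo.md.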