diff --git a/Dockerfile b/Dockerfile index 19d988f..97ad13d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,12 +10,16 @@ RUN echo '[global]' > /etc/pip.conf && \ WORKDIR /gpt -# 装载项目文件 -COPY . . + + # 安装依赖 +COPY requirements.txt ./ +COPY ./docs/gradio-3.32.2-py3-none-any.whl ./docs/gradio-3.32.2-py3-none-any.whl +RUN pip3 install -r requirements.txt +# 装载项目文件 +COPY . . RUN pip3 install -r requirements.txt - # 可选步骤,用于预热模块 RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()' diff --git a/README.md b/README.md index 02f047d..39b37ea 100644 --- a/README.md +++ b/README.md @@ -43,10 +43,11 @@ chat分析报告生成 | [函数插件] 运行后自动生成总结汇报 [Arxiv小助手](https://www.bilibili.com/video/BV1LM4y1279X) | [函数插件] 输入arxiv文章url即可一键翻译摘要+下载PDF [谷歌学术统合小助手](https://www.bilibili.com/video/BV19L411U7ia) | [函数插件] 给定任意谷歌学术搜索页面URL,让gpt帮你[写relatedworks](https://www.bilibili.com/video/BV1GP411U7Az/) 互联网信息聚合+GPT | [函数插件] 一键[让GPT先从互联网获取信息](https://www.bilibili.com/video/BV1om4y127ck),再回答问题,让信息永不过时 +⭐Arxiv论文精细翻译 | [函数插件] 一键[以超高质量翻译arxiv论文](https://www.bilibili.com/video/BV1dz4y1v77A/),迄今为止最好的论文翻译工具⭐ 公式/图片/表格显示 | 可以同时显示公式的[tex形式和渲染形式](https://user-images.githubusercontent.com/96192199/230598842-1d7fcddd-815d-40ee-af60-baf488a199df.png),支持公式、代码高亮 多线程函数插件支持 | 支持多线调用chatgpt,一键处理[海量文本](https://www.bilibili.com/video/BV1FT411H7c5/)或程序 启动暗色gradio[主题](https://github.com/binary-husky/chatgpt_academic/issues/173) | 在浏览器url后面添加```/?__theme=dark```可以切换dark主题 -[多LLM模型](https://www.bilibili.com/video/BV1wT411p7yf)支持,[API2D](https://api2d.com/)接口支持 | 同时被GPT3.5、GPT4、[清华ChatGLM](https://github.com/THUDM/ChatGLM-6B)、[复旦MOSS](https://github.com/OpenLMLab/MOSS)同时伺候的感觉一定会很不错吧? +[多LLM模型](https://www.bilibili.com/video/BV1wT411p7yf)支持 | 同时被GPT3.5、GPT4、[清华ChatGLM](https://github.com/THUDM/ChatGLM-6B)、[复旦MOSS](https://github.com/OpenLMLab/MOSS)同时伺候的感觉一定会很不错吧? 更多LLM模型接入,支持[huggingface部署](https://huggingface.co/spaces/qingxu98/gpt-academic) | 加入Newbing接口(新必应),引入清华[Jittorllms](https://github.com/Jittor/JittorLLMs)支持[LLaMA](https://github.com/facebookresearch/llama),[RWKV](https://github.com/BlinkDL/ChatRWKV)和[盘古α](https://openi.org.cn/pangu/) 更多新功能展示(图像生成等) …… | 见本文档结尾处 …… @@ -227,38 +228,33 @@ docker-compose up 1. 对话保存功能。在函数插件区调用 `保存当前的对话` 即可将当前对话保存为可读+可复原的html文件, 另外在函数插件区(下拉菜单)调用 `载入对话历史存档` ,即可还原之前的会话。 -Tip:不指定文件直接点击 `载入对话历史存档` 可以查看历史html存档缓存,点击 `删除所有本地对话历史记录` 可以删除所有html存档缓存。 +Tip:不指定文件直接点击 `载入对话历史存档` 可以查看历史html存档缓存。
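A side note on how one HTML file can be both human-readable and restorable: render the transcript as markup, then embed the raw history as JSON in a trailing comment. The sketch below only illustrates that round-trip idea, assuming `history` is a flat [question, answer, ...] list; `save_chat_as_html` and `load_chat_from_html` are hypothetical names, not the project's actual helpers.

```python
import json, html

def save_chat_as_html(history, path):
    # hypothetical sketch -- `history` is assumed to be a flat list:
    # [question, answer, question, answer, ...]
    rows = ''.join(
        f'<p><b>{"Q" if i % 2 == 0 else "A"}:</b> {html.escape(m)}</p>'
        for i, m in enumerate(history))
    payload = json.dumps(history, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as f:
        # readable body for humans + embedded JSON for a lossless restore
        # (assumes no message contains the literal string '-->')
        f.write(f'<html><body>{rows}</body></html>\n<!--history:{payload}-->')

def load_chat_from_html(path):
    with open(path, encoding='utf-8') as f:
        text = f.read()
    payload = text.rsplit('<!--history:', 1)[1].split('-->', 1)[0]
    return json.loads(payload)
```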
- - -2. 生成报告。大部分插件都会在执行结束后,生成工作报告 +2. ⭐Latex/Arxiv论文翻译功能⭐
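This plugin accepts either a full arxiv URL or a bare numeric ID; the `arxiv_download` hunk later in this diff normalizes bare IDs with an `is_float` check. Restated as a standalone sketch (`normalize_arxiv_input` is an illustrative wrapper name; the test inside is the diff's own logic):

```python
def is_float(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def normalize_arxiv_input(txt):
    # a bare ID such as "2211.16068" contains a dot, no slash, and parses
    # as a float; expand it to the canonical abstract URL
    if ('.' in txt) and ('/' not in txt) and is_float(txt):
        return 'https://arxiv.org/abs/' + txt
    return txt

assert normalize_arxiv_input('2211.16068') == 'https://arxiv.org/abs/2211.16068'
```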
- - - + ===> +
-3. 模块化功能设计,简单的接口却能支持强大的功能 +3. 生成报告。大部分插件都会在执行结束后,生成工作报告 +
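Reports land in the right-hand file upload area; the `on_report_generated` change in `toolbox.py` near the end of this diff additionally appends one link per generated file to the chat itself. A condensed sketch of that pattern — the separator markup before each path is an assumption here, not taken from the diff:

```python
def append_report_links(chatbot, report_files):
    # mirror the on_report_generated() change: list every generated report
    # in the chat window, since the upload area may be in a folded state
    if len(report_files) == 0:
        return chatbot
    file_links = ''
    for f in report_files:
        file_links += '<br/>' + f   # separator markup is illustrative
    chatbot.append(['报告如何远程获取?',
                    f'报告已经添加到右侧“文件上传区”(可能处于折叠状态),请查收。{file_links}'])
    return chatbot
```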
+ + +
+ +4. 模块化功能设计,简单的接口却能支持强大的功能
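The "simple interface" is concrete: as the signatures throughout this diff show (e.g. `读取知识库作答`), every function plugin is a generator taking the same seven arguments, decorated with `@CatchException`, that appends to `chatbot` and yields UI refreshes via `update_ui`. A minimal sketch of that contract — the plugin body here is illustrative, and the `CatchException` import location is assumed from the plugin files above:

```python
from toolbox import CatchException, update_ui  # import path assumed

@CatchException
def 示例插件(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port=-1):
    # every function plugin follows this signature; txt is the user input
    chatbot.append([f"收到输入: {txt}", "正在处理..."])
    yield from update_ui(chatbot=chatbot, history=history)  # 刷新界面
    result = txt.upper()  # stand-in for real work (LLM calls, file IO, ...)
    chatbot[-1] = [chatbot[-1][0], f"处理完成: {result}"]
    yield from update_ui(chatbot=chatbot, history=history)  # 刷新界面
```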
-4. 这是一个能够“自我译解”的开源项目 +5. 译解其他开源项目
- -
- -5. 译解其他开源项目,不在话下 -
- -
- -
- + +
6. 装饰[live2d](https://github.com/fghrsh/live2d_demo)的小功能(默认关闭,需要修改`config.py`) @@ -283,13 +279,15 @@ Tip:不指定文件直接点击 `载入对话历史存档` 可以查看历史h 10. Latex全文校对纠错
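Item 10 compiles both the original and the corrected tex and renders a comparison PDF (the function doing this is renamed from `编译Latex差别` to `编译Latex` in this diff). A rough sketch of one way to produce such a comparison, assuming TeX Live's `latexdiff` and `pdflatex` are on PATH — not necessarily the project's exact pipeline; the timeout guard only mirrors the spirit of `compile_latex_with_timeout` from this diff:

```python
import subprocess

def compile_with_timeout(cmd, cwd, timeout=60):
    # never let a hung (pdf)latex run block the plugin forever
    try:
        subprocess.run(cmd, cwd=cwd, timeout=timeout,
                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return True
    except subprocess.TimeoutExpired:
        return False

def render_diff_pdf(work_folder, original='merge.tex', proofread='merge_proofread.tex'):
    # latexdiff emits a tex file with insertions/deletions marked up,
    # which pdflatex then renders (twice, so cross-references settle)
    marked = subprocess.check_output(['latexdiff', original, proofread], cwd=work_folder)
    with open(f'{work_folder}/merge_diff.tex', 'wb') as f:
        f.write(marked)
    for _ in range(2):
        compile_with_timeout(['pdflatex', '-interaction=batchmode', 'merge_diff.tex'],
                             cwd=work_folder)
```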
- + ===> +
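Under the hood, the `latex_utils.py` rewrite later in this diff stops splicing a linked list in place; instead it keeps a numpy mask over the merged tex source, lets regex passes flip character spans between `PRESERVE` and `TRANSFORM`, and hands only contiguous `TRANSFORM` runs to GPT. A toy restatement of that idea (the real code additionally handles brace balancing, `\begin`/`\end` nesting, and runs in a subprocess):

```python
import re
import numpy as np

PRESERVE, TRANSFORM = 0, 1

def mark_preserve(text, mask, pattern, flags=0):
    # flip every span matching `pattern` back to PRESERVE so GPT skips it
    for m in re.finditer(pattern, text, flags):
        mask[m.start():m.end()] = PRESERVE

tex = r"intro text \begin{equation}E=mc^2\end{equation} more text"
mask = np.full(len(tex), TRANSFORM, dtype=np.uint8)  # translate by default
mark_preserve(tex, mask, r"\\begin\{equation\}.*?\\end\{equation\}", re.DOTALL)

# contiguous runs of equal mask values become the linked-list nodes
runs, start = [], 0
for i in range(1, len(tex) + 1):
    if i == len(tex) or mask[i] != mask[start]:
        runs.append((tex[start:i], bool(mask[start] == PRESERVE)))
        start = i
print(runs)  # intro/tail runs come out TRANSFORM, the equation run PRESERVE
```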
+ ## 版本: - version 3.5(Todo): 使用自然语言调用本项目的所有函数插件(高优先级) -- version 3.4(Todo): 完善chatglm本地大模型的多线支持 +- version 3.4: +arxiv论文翻译、latex论文批改功能 - version 3.3: +互联网信息综合功能 - version 3.2: 函数插件支持更多参数接口 (保存对话功能, 解读任意语言代码+同时询问任意的LLM组合) - version 3.1: 支持同时问询多个gpt模型!支持api2d,支持多个apikey负载均衡 @@ -307,30 +305,32 @@ gpt_academic开发者QQ群-2:610599535 - 已知问题 - 某些浏览器翻译插件干扰此软件前端的运行 - - 官方Gradio目前有很多兼容性Bug,请务必使用requirement.txt安装Gradio + - 官方Gradio目前有很多兼容性Bug,请务必使用`requirement.txt`安装Gradio ## 参考与学习 ``` -代码中参考了很多其他优秀项目中的设计,主要包括: +代码中参考了很多其他优秀项目中的设计,顺序不分先后: -# 项目1:清华ChatGLM-6B: +# 清华ChatGLM-6B: https://github.com/THUDM/ChatGLM-6B -# 项目2:清华JittorLLMs: +# 清华JittorLLMs: https://github.com/Jittor/JittorLLMs -# 项目3:Edge-GPT: -https://github.com/acheong08/EdgeGPT - -# 项目4:ChuanhuChatGPT: -https://github.com/GaiZhenbiao/ChuanhuChatGPT - -# 项目5:ChatPaper: +# ChatPaper: https://github.com/kaixindelele/ChatPaper -# 更多: +# Edge-GPT: +https://github.com/acheong08/EdgeGPT + +# ChuanhuChatGPT: +https://github.com/GaiZhenbiao/ChuanhuChatGPT + +# Oobabooga one-click installer: +https://github.com/oobabooga/one-click-installers + +# More: https://github.com/gradio-app/gradio https://github.com/fghrsh/live2d_demo -https://github.com/oobabooga/one-click-installers ``` diff --git a/config.py b/config.py index 09517c1..0383dce 100644 --- a/config.py +++ b/config.py @@ -56,7 +56,9 @@ MAX_RETRY = 2 # 模型选择是 (注意: LLM_MODEL是默认选中的模型, 同时它必须被包含在AVAIL_LLM_MODELS切换列表中 ) LLM_MODEL = "gpt-3.5-turbo" # 可选 ↓↓↓ -AVAIL_LLM_MODELS = ["gpt-3.5-turbo", "api2d-gpt-3.5-turbo", "gpt-4", 'proxy-gpt-4', "api2d-gpt-4", "chatglm", "moss", "newbing", "newbing-free", "stack-claude"] + +AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "api2d-gpt-3.5-turbo", "gpt-4", 'proxy-gpt-4', "api2d-gpt-4", "chatglm", "moss", "newbing", "newbing-free", "stack-claude"] + # P.S. 其他可用的模型还包括 ["newbing-free", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"] # 本地LLM模型如ChatGLM的执行方式 CPU/GPU diff --git a/crazy_functions/Langchain知识库.py b/crazy_functions/Langchain知识库.py index 36999d5..5b09d3b 100644 --- a/crazy_functions/Langchain知识库.py +++ b/crazy_functions/Langchain知识库.py @@ -75,9 +75,18 @@ def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_pro @CatchException def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port=-1): + # resolve deps + try: + from zh_langchain import construct_vector_store + from langchain.embeddings.huggingface import HuggingFaceEmbeddings + from .crazy_utils import knowledge_archive_interface + except Exception as e: + chatbot.append(["依赖不足", "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + from .crazy_utils import try_install_deps + try_install_deps(['zh_langchain==0.2.0']) # < ------------------- --------------- > - from .crazy_utils import knowledge_archive_interface kai = knowledge_archive_interface() if 'langchain_plugin_embedding' in chatbot._cookies: diff --git a/crazy_functions/Latex输出PDF结果.py b/crazy_functions/Latex输出PDF结果.py index daac763..2e9a30b 100644 --- a/crazy_functions/Latex输出PDF结果.py +++ b/crazy_functions/Latex输出PDF结果.py @@ -5,7 +5,7 @@ pj = os.path.join ARXIV_CACHE_DIR = os.path.expanduser(f"~/arxiv_cache/") # =================================== 工具函数 =============================================== -沙雕GPT啊别犯这些低级翻译错误 = 'You must to translate "agent" to "智能体". ' +专业词汇声明 = 'If the term "agent" is used in this section, it should be translated to "智能体". 
' def switch_prompt(pfg, mode): """ Generate prompts and system prompts based on the mode for proofreading or translating. @@ -25,7 +25,7 @@ def switch_prompt(pfg, mode): f"\n\n{frag}" for frag in pfg.sp_file_contents] sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)] elif mode == 'translate_zh': - inputs_array = [r"Below is a section from an English academic paper, translate it into Chinese." + 沙雕GPT啊别犯这些低级翻译错误 + + inputs_array = [r"Below is a section from an English academic paper, translate it into Chinese. " + 专业词汇声明 + r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + r"Answer me only with the translated text:" + f"\n\n{frag}" for frag in pfg.sp_file_contents] @@ -65,8 +65,10 @@ def move_project(project_folder, arxiv_id=None): new_workfolder = pj(ARXIV_CACHE_DIR, arxiv_id, 'workfolder') else: new_workfolder = f'gpt_log/{gen_time_str()}' - try: shutil.rmtree(new_workfolder) - except: pass + try: + shutil.rmtree(new_workfolder) + except: + pass shutil.copytree(src=project_folder, dst=new_workfolder) return new_workfolder @@ -80,7 +82,14 @@ def arxiv_download(chatbot, history, txt): promote_file_to_downloadzone(target_file) return target_file return False - + def is_float(s): + try: + float(s) + return True + except ValueError: + return False + if ('.' in txt) and ('/' not in txt) and is_float(txt): + txt = 'https://arxiv.org/abs/' + txt if not txt.startswith('https://arxiv.org'): return txt, None @@ -132,12 +141,12 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo # <-------------- check deps -------------> try: - import glob, os, time - os.system(f'pdflatex -version') - from .latex_utils import Latex精细分解与转化, 编译Latex差别 + import glob, os, time, subprocess + subprocess.Popen(['pdflatex', '-version']) + from .latex_utils import Latex精细分解与转化, 编译Latex except Exception as e: chatbot.append([ f"解析项目: {txt}", - f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) + f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return @@ -172,7 +181,7 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo # <-------------- compile PDF -------------> - success = yield from 编译Latex差别(chatbot, history, main_file_original='merge', main_file_modified='merge_proofread', + success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_proofread', work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder) @@ -196,18 +205,18 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, # <-------------- information about this plugin -------------> chatbot.append([ "函数插件功能?", - "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。仅在Windows系统进行了测试,其他操作系统表现未知。"]) + "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # <-------------- check deps -------------> try: - import glob, os, time - os.system(f'pdflatex -version') - from .latex_utils import Latex精细分解与转化, 编译Latex差别 + import glob, os, time, subprocess + subprocess.Popen(['pdflatex', '-version']) + from .latex_utils import Latex精细分解与转化, 编译Latex except Exception as e: chatbot.append([ f"解析项目: {txt}", - 
f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) + f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return @@ -219,6 +228,8 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"发现已经存在翻译好的PDF文档") yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return + + if os.path.exists(txt): project_folder = txt else: @@ -226,6 +237,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] if len(file_manifest) == 0: report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") @@ -247,7 +259,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, # <-------------- compile PDF -------------> - success = yield from 编译Latex差别(chatbot, history, main_file_original='merge', main_file_modified='merge_translate_zh', + success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_translate_zh', work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder) # <-------------- zip PDF -------------> @@ -259,5 +271,6 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...')) yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面 + # <-------------- we are done -------------> return success diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py index d4e3274..e743878 100644 --- a/crazy_functions/crazy_functions_test.py +++ b/crazy_functions/crazy_functions_test.py @@ -181,18 +181,14 @@ def test_Langchain知识库读取(): def test_Latex(): from crazy_functions.Latex输出PDF结果 import Latex英文纠错加PDF对比, Latex翻译中文并重新编译PDF - txt = "C:/Users/fuqingxu/Desktop/proofread" - txt = "C:/Users/fuqingxu/Desktop/旧文件/gpt/paperx" - txt = "C:/Users/fuqingxu/Desktop/旧文件/gpt/papery" - txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-14-57-06" - txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-15-40-20" - txt = r"https://arxiv.org/abs/1902.03185" - txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-17-14-40" - txt = r"https://arxiv.org/abs/2305.18290" - txt = r"https://arxiv.org/abs/2305.17608" - # txt = r"https://arxiv.org/abs/2306.00324" - txt = r"https://arxiv.org/abs/2211.16068" - + + # txt = r"https://arxiv.org/abs/1706.03762" + # txt = r"https://arxiv.org/abs/1902.03185" + # txt = r"https://arxiv.org/abs/2305.18290" + # txt = r"https://arxiv.org/abs/2305.17608" + # txt = r"https://arxiv.org/abs/2211.16068" # ACE + # txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder" # ACE + txt = r"https://arxiv.org/abs/2002.09253" for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): cli_printer.print(cb) # print(cb) diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py index 3e128eb..53894ca 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_utils.py @@ -2,8 +2,111 @@ from toolbox import update_ui, 
update_ui_lastest_msg # 刷新Gradio前端界 from toolbox import zip_folder, objdump, objload, promote_file_to_downloadzone import os, shutil import re +import numpy as np pj = os.path.join +""" +======================================================================== +Part One +Latex segmentation to a linklist +======================================================================== +""" +PRESERVE = 0 +TRANSFORM = 1 + +def split_worker(text, mask, pattern, flags=0): + """ + Add a preserve text area in this paper + """ + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + mask[res.span()[0]:res.span()[1]] = PRESERVE + return text, mask + +def split_worker_careful_brace(text, mask, pattern, flags=0): + """ + Move area into preserve area + """ + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + brace_level = -1 + p = begin = end = res.regs[0][0] + for _ in range(1024*16): + if text[p] == '}' and brace_level == 0: break + elif text[p] == '}': brace_level -= 1 + elif text[p] == '{': brace_level += 1 + p += 1 + end = p+1 + mask[begin:end] = PRESERVE + return text, mask + +def split_worker_reverse_careful_brace(text, mask, pattern, flags=0): + """ + Move area out of preserve area + """ + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + brace_level = 0 + p = begin = end = res.regs[1][0] + for _ in range(1024*16): + if text[p] == '}' and brace_level == 0: break + elif text[p] == '}': brace_level -= 1 + elif text[p] == '{': brace_level += 1 + p += 1 + end = p + mask[begin:end] = TRANSFORM + return text, mask + +def split_worker_begin_end(text, mask, pattern, flags=0, limit_n_lines=42): + """ + Find all \begin{} ... \end{} text block that with less than limit_n_lines lines. 
+ Add it to preserve area + """ + pattern_compile = re.compile(pattern, flags) + def search_with_line_limit(text, mask): + for res in pattern_compile.finditer(text): + cmd = res.group(1) # begin{what} + this = res.group(2) # content between begin and end + this_mask = mask[res.regs[2][0]:res.regs[2][1]] + white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof', + 'em', 'emph', 'textit', 'textbf', 'itemize', 'enumerate'] + if (cmd in white_list) or this.count('\n') >= limit_n_lines: # use a magical number 42 + this, this_mask = search_with_line_limit(this, this_mask) + mask[res.regs[2][0]:res.regs[2][1]] = this_mask + else: + mask[res.regs[0][0]:res.regs[0][1]] = PRESERVE + return text, mask + return search_with_line_limit(text, mask) + +class LinkedListNode(): + """ + Linked List Node + """ + def __init__(self, string, preserve=True) -> None: + self.string = string + self.preserve = preserve + self.next = None + # self.begin_line = 0 + # self.begin_char = 0 + +def convert_to_linklist(text, mask): + root = LinkedListNode("", preserve=True) + current_node = root + for c, m, i in zip(text, mask, range(len(text))): + if (m==PRESERVE and current_node.preserve) \ + or (m==TRANSFORM and not current_node.preserve): + # add + current_node.string += c + else: + current_node.next = LinkedListNode(c, preserve=(m==PRESERVE)) + current_node = current_node.next + return root +""" +======================================================================== +Latex Merge File +======================================================================== +""" + def 寻找Latex主文件(file_manifest, mode): """ 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。 @@ -20,10 +123,23 @@ def 寻找Latex主文件(file_manifest, mode): continue raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)') +def rm_comments(main_file): + new_file_remove_comment_lines = [] + for l in main_file.splitlines(): + # 删除整行的空注释 + if l.lstrip().startswith("%"): + pass + else: + new_file_remove_comment_lines.append(l) + main_file = '\n'.join(new_file_remove_comment_lines) + main_file = re.sub(r'(? None: - self.string = string - self.preserve = preserve - self.next = None - +""" +======================================================================== +Post process +======================================================================== +""" def mod_inbraket(match): """ - 为啥chatgpt会把cite里面的逗号换成中文逗号呀 艹 + 为啥chatgpt会把cite里面的逗号换成中文逗号呀 """ # get the matched string cmd = match.group(1) @@ -91,271 +202,251 @@ def fix_content(final_tex, node_string): """ Fix common GPT errors to increase success rate """ - final_tex = final_tex.replace('%', r'\%') - final_tex = final_tex.replace(r'\%', r'\\%') + final_tex = re.sub(r"(? 0 and node_string.count('\_') > final_tex.count('\_'): + # walk and replace any _ without \ + final_tex = re.sub(r"(?') + if not node.preserve: + segment_parts_for_gpt.append(node.string) + f.write(f'

#{show_html}#

') + else: + f.write(f'

{show_html}

') + node = node.next + if node is None: break + + for n in nodes: n.next = None # break + return_dict['nodes'] = nodes + return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt + return return_dict + + + class LatexPaperSplit(): """ - 将Latex文档分解到一个链表中,每个链表节点用preserve的标志位提示它是否应当被GPT处理 + break down latex file to a linked list, + each node use a preserve flag to indicate whether it should + be proccessed by GPT. """ def __init__(self) -> None: - """ - root是链表的根节点 - """ - self.root = None + self.nodes = None + self.msg = "{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成," + \ + "版权归原文作者所有。翻译内容可靠性无任何保障,请仔细鉴别并以原文为准。" + \ + "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。" + # 请您不要删除或修改这行警告,除非您是论文的原作者(如果您是论文原作者,欢迎加REAME中的QQ联系开发者) + self.msg_declare = "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\" def merge_result(self, arr, mode, msg): """ - 将GPT处理后的结果融合 + Merge the result after the GPT process completed """ result_string = "" - node = self.root p = 0 - while True: + for node in self.nodes: if node.preserve: result_string += node.string else: result_string += fix_content(arr[p], node.string) p += 1 - node = node.next - if node is None: break if mode == 'translate_zh': - try: - pattern = re.compile(r'\\begin\{abstract\}.*\n') - match = pattern.search(result_string) - position = match.end() - result_string = result_string[:position] + \ - "{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成,其内容可靠性没有任何保障,请仔细鉴别并以原文为准。" + \ - "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。" + \ - msg + \ - "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\" + \ - result_string[position:] - except: - pass + pattern = re.compile(r'\\begin\{abstract\}.*\n') + match = pattern.search(result_string) + position = match.end() + result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:] return result_string - def split(self, txt, project_folder): + def split(self, txt, project_folder, opts): """ - 将Latex文档分解到一个链表中,每个链表节点用preserve的标志位提示它是否应当被GPT处理 + break down latex file to a linked list, + each node use a preserve flag to indicate whether it should + be proccessed by GPT. + P.S. 
use multiprocessing to avoid timeout error """ - root = LinkedListNode(txt, False) - def split_worker(root, pattern, flags=0): - lt = root - cnt = 0 - pattern_compile = re.compile(pattern, flags) - while True: - if not lt.preserve: - while True: - res = pattern_compile.search(lt.string) - if not res: break - before = res.string[:res.span()[0]] - this = res.group(0) - after = res.string[res.span()[1]:] - # ====== - lt.string = before - tmp = lt.next - # ====== - mid = LinkedListNode(this, True) - lt.next = mid - # ====== - aft = LinkedListNode(after, False) - mid.next = aft - aft.next = tmp - # ====== - lt = aft - lt = lt.next - cnt += 1 - # print(cnt) - if lt is None: break - - def split_worker_begin_end(root, pattern, flags=0, limit_n_lines=25): - lt = root - cnt = 0 - pattern_compile = re.compile(pattern, flags) - while True: - if not lt.preserve: - while True: - target_string = lt.string - - def search_with_line_limit(target_string): - for res in pattern_compile.finditer(target_string): - cmd = res.group(1) # begin{what} - this = res.group(2) # content between begin and end - white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof', 'em', 'emph', 'textit', 'textbf'] - if cmd in white_list or this.count('\n') > 25: - sub_res = search_with_line_limit(this) - if not sub_res: continue - else: return sub_res - else: - return res.group(0) - return False - # ====== - # search for first encounter of \begin \end pair with less than 25 lines in the middle - ps = search_with_line_limit(target_string) - if not ps: break - res = re.search(re.escape(ps), target_string, flags) - if not res: assert False - before = res.string[:res.span()[0]] - this = res.group(0) - after = res.string[res.span()[1]:] - # ====== - lt.string = before - tmp = lt.next - # ====== - mid = LinkedListNode(this, True) - lt.next = mid - # ====== - aft = LinkedListNode(after, False) - mid.next = aft - aft.next = tmp - # ====== - lt = aft - lt = lt.next - cnt += 1 - # print(cnt) - if lt is None: break - - - # root 是链表的头 - print('正在分解Latex源文件,构建链表结构') - # 删除iffalse注释 - split_worker(root, r"\\iffalse(.*?)\\fi", re.DOTALL) - # 吸收在25行以内的begin-end组合 - split_worker_begin_end(root, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25) - # 吸收匿名公式 - split_worker(root, r"\$\$(.*?)\$\$", re.DOTALL) - # 吸收其他杂项 - split_worker(root, r"(.*?)\\maketitle", re.DOTALL) - split_worker(root, r"\\section\{(.*?)\}") - split_worker(root, r"\\section\*\{(.*?)\}") - split_worker(root, r"\\subsection\{(.*?)\}") - split_worker(root, r"\\subsubsection\{(.*?)\}") - split_worker(root, r"\\bibliography\{(.*?)\}") - split_worker(root, r"\\bibliographystyle\{(.*?)\}") - split_worker(root, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) - split_worker(root, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) - split_worker(root, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) - split_worker(root, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL) - split_worker(root, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL) - split_worker(root, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL) - split_worker(root, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL) - split_worker(root, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL) - split_worker(root, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL) - split_worker(root, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL) - split_worker(root, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL) - 
split_worker(root, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL) - split_worker(root, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL) - split_worker(root, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL) - split_worker(root, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL) - split_worker(root, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL) - split_worker(root, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) - split_worker(root, r"\\item ") - split_worker(root, r"\\label\{(.*?)\}") - split_worker(root, r"\\begin\{(.*?)\}") - split_worker(root, r"\\vspace\{(.*?)\}") - split_worker(root, r"\\hspace\{(.*?)\}") - split_worker(root, r"\\end\{(.*?)\}") - - node = root - while True: - if len(node.string.strip('\n').strip(''))==0: node.preserve = True - if len(node.string.strip('\n').strip(''))<50: node.preserve = True - node = node.next - if node is None: break - - # 修复括号 - node = root - while True: - string = node.string - if node.preserve: - node = node.next - if node is None: break - continue - def break_check(string): - str_stack = [""] # (lv, index) - for i, c in enumerate(string): - if c == '{': - str_stack.append('{') - elif c == '}': - if len(str_stack) == 1: - print('stack kill') - return i - str_stack.pop(-1) - else: - str_stack[-1] += c - return -1 - bp = break_check(string) - - if bp == -1: - pass - elif bp == 0: - node.string = string[:1] - q = LinkedListNode(string[1:], False) - q.next = node.next - node.next = q - else: - node.string = string[:bp] - q = LinkedListNode(string[bp:], False) - q.next = node.next - node.next = q - - node = node.next - if node is None: break - - node = root - while True: - if len(node.string.strip('\n').strip(''))==0: node.preserve = True - if len(node.string.strip('\n').strip(''))<50: node.preserve = True - node = node.next - if node is None: break - - # 将前后断行符脱离 - node = root - prev_node = None - while True: - if not node.preserve: - lstriped_ = node.string.lstrip().lstrip('\n') - if (prev_node is not None) and (prev_node.preserve) and (len(lstriped_)!=len(node.string)): - prev_node.string += node.string[:-len(lstriped_)] - node.string = lstriped_ - rstriped_ = node.string.rstrip().rstrip('\n') - if (node.next is not None) and (node.next.preserve) and (len(rstriped_)!=len(node.string)): - node.next.string = node.string[len(rstriped_):] + node.next.string - node.string = rstriped_ - # ===== - prev_node = node - node = node.next - if node is None: break - - # 将分解结果返回 res_to_t - with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: - res_to_t = [] - node = root - while True: - show_html = node.string.replace('\n','
') - if not node.preserve: - res_to_t.append(node.string) - f.write(f'

#{show_html}#

') - else: - f.write(f'

{show_html}

') - node = node.next - if node is None: break - - self.root = root - self.sp = res_to_t + import multiprocessing + manager = multiprocessing.Manager() + return_dict = manager.dict() + p = multiprocessing.Process( + target=split_subprocess, + args=(txt, project_folder, return_dict, opts)) + p.start() + p.join() + self.nodes = return_dict['nodes'] + self.sp = return_dict['segment_parts_for_gpt'] return self.sp + + class LatexPaperFileGroup(): + """ + use tokenizer to break down text according to max_token_limit + """ def __init__(self): self.file_paths = [] self.file_contents = [] @@ -371,7 +462,7 @@ class LatexPaperFileGroup(): def run_file_split(self, max_token_limit=1900): """ - 将长文本分离开来 + use tokenizer to break down text according to max_token_limit """ for index, file_content in enumerate(self.file_contents): if self.get_token_num(file_content) < max_token_limit: @@ -402,7 +493,7 @@ class LatexPaperFileGroup(): -def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None): +def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]): import time, os, re from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency from .latex_utils import LatexPaperFileGroup, merge_tex_files, LatexPaperSplit, 寻找Latex主文件 @@ -411,7 +502,7 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin maintex = 寻找Latex主文件(file_manifest, mode) chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果:该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。')) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - time.sleep(5) + time.sleep(3) # <-------- 读取Latex文件, 将多文件tex工程融合为一个巨型tex ----------> main_tex_basename = os.path.basename(maintex) @@ -431,8 +522,10 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin f.write(merged_content) # <-------- 精细切分latex文件 ----------> + chatbot.append((f"Latex文件融合完成", f'[Local Message] 正在精细切分latex文件,这需要一段时间计算,文档越长耗时越长,请耐心等待。')) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 lps = LatexPaperSplit() - res = lps.split(merged_content, project_folder) + res = lps.split(merged_content, project_folder, opts) # 消耗时间的函数 # <-------- 拆分过长的latex片段 ----------> pfg = LatexPaperFileGroup() @@ -480,7 +573,8 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}。" final_tex = lps.merge_result(pfg.file_result, mode, msg) with open(project_folder + f'/merge_{mode}.tex', 'w', encoding='utf-8', errors='replace') as f: - f.write(final_tex) + if mode != 'translate_zh' or "binary" in final_tex: f.write(final_tex) + # <-------- 整理结果, 退出 ----------> chatbot.append((f"完成了吗?", 'GPT结果已输出, 正在编译PDF')) @@ -507,7 +601,8 @@ def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work f.writelines(file_lines) return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines except: - return False, 0, [0] + print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.") + return False, -1, [-1] def compile_latex_with_timeout(command, timeout=60): @@ -522,12 +617,12 @@ def compile_latex_with_timeout(command, timeout=60): return False return True -def 编译Latex差别(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder): 
+def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder): import os, time current_dir = os.getcwd() n_fix = 1 max_try = 32 - chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,则大概率是卡死在Latex里面了。不幸卡死时请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) + chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面 yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面 diff --git a/crazy_functions/数学动画生成manim.py b/crazy_functions/数学动画生成manim.py index 5851b9c..26e61b1 100644 --- a/crazy_functions/数学动画生成manim.py +++ b/crazy_functions/数学动画生成manim.py @@ -8,7 +8,7 @@ def inspect_dependency(chatbot, history): import manim return True except: - chatbot.append(["导入依赖失败", "使用该模块需要额外依赖,安装方法:```pip install manimgl```"]) + chatbot.append(["导入依赖失败", "使用该模块需要额外依赖,安装方法:```pip install manim manimgl```"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return False diff --git a/docs/Dockerfile+NoLocal+Latex b/docs/Dockerfile+NoLocal+Latex new file mode 100644 index 0000000..0f9ac8a --- /dev/null +++ b/docs/Dockerfile+NoLocal+Latex @@ -0,0 +1,27 @@ +# 此Dockerfile适用于“无本地模型”的环境构建,如果需要使用chatglm等本地模型,请参考 docs/Dockerfile+ChatGLM +# - 1 修改 `config.py` +# - 2 构建 docker build -t gpt-academic-nolocal-latex -f docs/Dockerfile+NoLocal+Latex . +# - 3 运行 docker run -v /home/fuqingxu/arxiv_cache:/root/arxiv_cache --rm -it --net=host gpt-academic-nolocal-latex + +FROM fuqingxu/python311_texlive_ctex:latest + +# 指定路径 +WORKDIR /gpt + +ARG useProxyNetwork='' + +RUN $useProxyNetwork pip3 install gradio openai numpy arxiv rich -i https://pypi.douban.com/simple/ +RUN $useProxyNetwork pip3 install colorama Markdown pygments pymupdf -i https://pypi.douban.com/simple/ + +# 装载项目文件 +COPY . . 
+ + +# 安装依赖 +RUN $useProxyNetwork pip3 install -r requirements.txt -i https://pypi.douban.com/simple/ + +# 可选步骤,用于预热模块 +RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()' + +# 启动 +CMD ["python3", "-u", "main.py"] diff --git a/docs/translate_english.json b/docs/translate_english.json index b7a582d..13b0869 100644 --- a/docs/translate_english.json +++ b/docs/translate_english.json @@ -58,6 +58,8 @@ "连接网络回答问题": "ConnectToNetworkToAnswerQuestions", "联网的ChatGPT": "ChatGPTConnectedToNetwork", "解析任意code项目": "ParseAnyCodeProject", + "读取知识库作答": "ReadKnowledgeArchiveAnswerQuestions", + "知识库问答": "UpdateKnowledgeArchive", "同时问询_指定模型": "InquireSimultaneously_SpecifiedModel", "图片生成": "ImageGeneration", "test_解析ipynb文件": "Test_ParseIpynbFile", diff --git a/request_llm/bridge_all.py b/request_llm/bridge_all.py index efafa39..195c030 100644 --- a/request_llm/bridge_all.py +++ b/request_llm/bridge_all.py @@ -85,6 +85,15 @@ model_info = { "tokenizer": tokenizer_gpt35, "token_cnt": get_token_num_gpt35, }, + + "gpt-3.5-turbo-16k": { + "fn_with_ui": chatgpt_ui, + "fn_without_ui": chatgpt_noui, + "endpoint": openai_endpoint, + "max_token": 1024*16, + "tokenizer": tokenizer_gpt35, + "token_cnt": get_token_num_gpt35, + }, "gpt-4": { "fn_with_ui": chatgpt_ui, diff --git a/toolbox.py b/toolbox.py index 861051d..98a280e 100644 --- a/toolbox.py +++ b/toolbox.py @@ -562,7 +562,9 @@ def on_report_generated(files, chatbot): if len(report_files) == 0: return None, chatbot # files.extend(report_files) - chatbot.append(['报告如何远程获取?', '报告已经添加到右侧“文件上传区”(可能处于折叠状态),请查收。']) + file_links = '' + for f in report_files: file_links += f'
{f}' + chatbot.append(['报告如何远程获取?', f'报告已经添加到右侧“文件上传区”(可能处于折叠状态),请查收。{file_links}']) return report_files, chatbot def is_openai_api_key(key): diff --git a/version b/version index ad75b2c..ceb909a 100644 --- a/version +++ b/version @@ -1,5 +1,5 @@ { - "version": 3.37, + "version": 3.41, "show_feature": true, - "new_feature": "修复gradio复制按钮BUG <-> 修复PDF翻译的BUG, 新增HTML中英双栏对照 <-> 添加了OpenAI图片生成插件 <-> 添加了OpenAI音频转文本总结插件 <-> 通过Slack添加对Claude的支持 <-> 提供复旦MOSS模型适配(启用需额外依赖) <-> 提供docker-compose方案兼容LLAMA盘古RWKV等模型的后端 <-> 新增Live2D装饰 <-> 完善对话历史的保存/载入/删除 <-> 保存对话功能" + "new_feature": "增加gpt-3.5-16k的支持 <-> 新增最强Arxiv论文翻译插件 <-> 修复gradio复制按钮BUG <-> 修复PDF翻译的BUG, 新增HTML中英双栏对照 <-> 添加了OpenAI图片生成插件 <-> 添加了OpenAI音频转文本总结插件 <-> 通过Slack添加对Claude的支持" }
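For context on the `gpt-3.5-turbo-16k` entry added to `request_llm/bridge_all.py`: `model_info` maps each model name to its endpoint, tokenizer, and token ceiling, and callers size their requests from that ceiling. A minimal sketch of how such a registry is consumed — `budget_for` and the trimmed field set are illustrative; only the model names and `max_token` values come from this diff:

```python
# illustrative subset of the registry; the real entries also carry
# fn_with_ui / fn_without_ui handlers, an endpoint, and a tokenizer
model_info = {
    "gpt-3.5-turbo":     {"max_token": 4096},
    "gpt-3.5-turbo-16k": {"max_token": 1024 * 16},
}

def budget_for(llm_model, prompt_tokens):
    # leave room for the prompt; clip the completion to the model's ceiling
    info = model_info[llm_model]
    return max(0, info["max_token"] - prompt_tokens)

print(budget_for("gpt-3.5-turbo-16k", 6000))  # 10384
```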