diff --git a/README.md b/README.md
index 9a1e72d..02f047d 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
> **Note**
>
-> 5月27日对gradio依赖进行了较大的修复和调整,fork并解决了官方Gradio的一系列bug。但如果27日当天进行了更新,可能会导致代码报错(依赖缺失,卡在loading界面等),请及时更新到**最新版代码**并重新安装pip依赖即可。若给您带来困扰还请谅解。安装依赖时,请严格选择requirements.txt中**指定的版本**:
+> 2023.5.27 对Gradio依赖进行了调整,Fork并解决了官方Gradio的若干Bugs。请及时**更新代码**并重新更新pip依赖。安装依赖时,请严格选择`requirements.txt`中**指定的版本**:
>
-> `pip install -r requirements.txt -i https://pypi.org/simple`
+> `pip install -r requirements.txt`
>
#
GPT 学术优化 (GPT Academic)
@@ -175,21 +175,26 @@ docker-compose up
## 安装-方法3:其他部署姿势
+1. 一键运行脚本。
+完全不熟悉python环境的Windows用户可以下载[Release](https://github.com/binary-husky/gpt_academic/releases)中发布的一键运行脚本安装无本地模型的版本,
+不建议电脑上已有python的用户采用此方法(在此基础上安装插件的依赖很麻烦)。
+脚本的贡献来源是[oobabooga](https://github.com/oobabooga/one-click-installers)。
-1. 如何使用反代URL/微软云AzureAPI
+2. 使用docker-compose运行。
+请阅读docker-compose.yml后,按照其中的提示操作即可
+
+3. 如何使用反代URL/微软云AzureAPI。
按照`config.py`中的说明配置API_URL_REDIRECT即可。
-2. 远程云服务器部署(需要云服务器知识与经验)
+4. 远程云服务器部署(需要云服务器知识与经验)。
请访问[部署wiki-1](https://github.com/binary-husky/chatgpt_academic/wiki/%E4%BA%91%E6%9C%8D%E5%8A%A1%E5%99%A8%E8%BF%9C%E7%A8%8B%E9%83%A8%E7%BD%B2%E6%8C%87%E5%8D%97)
-3. 使用WSL2(Windows Subsystem for Linux 子系统)
+5. 使用WSL2(Windows Subsystem for Linux 子系统)。
请访问[部署wiki-2](https://github.com/binary-husky/chatgpt_academic/wiki/%E4%BD%BF%E7%94%A8WSL2%EF%BC%88Windows-Subsystem-for-Linux-%E5%AD%90%E7%B3%BB%E7%BB%9F%EF%BC%89%E9%83%A8%E7%BD%B2)
-4. 如何在二级网址(如`http://localhost/subpath`)下运行
+6. 如何在二级网址(如`http://localhost/subpath`)下运行。
请访问[FastAPI运行说明](docs/WithFastapi.md)
-5. 使用docker-compose运行
-请阅读docker-compose.yml后,按照其中的提示操作即可
---
# Advanced Usage
## 自定义新的便捷按钮 / 自定义函数插件
@@ -327,4 +332,5 @@ https://github.com/kaixindelele/ChatPaper
# 更多:
https://github.com/gradio-app/gradio
https://github.com/fghrsh/live2d_demo
+https://github.com/oobabooga/one-click-installers
```
diff --git a/colorful.py b/colorful.py
index d90972b..9749861 100644
--- a/colorful.py
+++ b/colorful.py
@@ -34,58 +34,28 @@ def print亮紫(*kw,**kargs):
def print亮靛(*kw,**kargs):
print("\033[1;36m",*kw,"\033[0m",**kargs)
-
-
-def print亮红(*kw,**kargs):
- print("\033[1;31m",*kw,"\033[0m",**kargs)
-def print亮绿(*kw,**kargs):
- print("\033[1;32m",*kw,"\033[0m",**kargs)
-def print亮黄(*kw,**kargs):
- print("\033[1;33m",*kw,"\033[0m",**kargs)
-def print亮蓝(*kw,**kargs):
- print("\033[1;34m",*kw,"\033[0m",**kargs)
-def print亮紫(*kw,**kargs):
- print("\033[1;35m",*kw,"\033[0m",**kargs)
-def print亮靛(*kw,**kargs):
- print("\033[1;36m",*kw,"\033[0m",**kargs)
-
-print_red = print红
-print_green = print绿
-print_yellow = print黄
-print_blue = print蓝
-print_purple = print紫
-print_indigo = print靛
-
-print_bold_red = print亮红
-print_bold_green = print亮绿
-print_bold_yellow = print亮黄
-print_bold_blue = print亮蓝
-print_bold_purple = print亮紫
-print_bold_indigo = print亮靛
-
-if not stdout.isatty():
- # redirection, avoid a fucked up log file
- print红 = print
- print绿 = print
- print黄 = print
- print蓝 = print
- print紫 = print
- print靛 = print
- print亮红 = print
- print亮绿 = print
- print亮黄 = print
- print亮蓝 = print
- print亮紫 = print
- print亮靛 = print
- print_red = print
- print_green = print
- print_yellow = print
- print_blue = print
- print_purple = print
- print_indigo = print
- print_bold_red = print
- print_bold_green = print
- print_bold_yellow = print
- print_bold_blue = print
- print_bold_purple = print
- print_bold_indigo = print
\ No newline at end of file
+# Do you like the elegance of Chinese characters?
+def sprint红(*kw):
+ return "\033[0;31m"+' '.join(kw)+"\033[0m"
+def sprint绿(*kw):
+ return "\033[0;32m"+' '.join(kw)+"\033[0m"
+def sprint黄(*kw):
+ return "\033[0;33m"+' '.join(kw)+"\033[0m"
+def sprint蓝(*kw):
+ return "\033[0;34m"+' '.join(kw)+"\033[0m"
+def sprint紫(*kw):
+ return "\033[0;35m"+' '.join(kw)+"\033[0m"
+def sprint靛(*kw):
+ return "\033[0;36m"+' '.join(kw)+"\033[0m"
+def sprint亮红(*kw):
+ return "\033[1;31m"+' '.join(kw)+"\033[0m"
+def sprint亮绿(*kw):
+ return "\033[1;32m"+' '.join(kw)+"\033[0m"
+def sprint亮黄(*kw):
+ return "\033[1;33m"+' '.join(kw)+"\033[0m"
+def sprint亮蓝(*kw):
+ return "\033[1;34m"+' '.join(kw)+"\033[0m"
+def sprint亮紫(*kw):
+ return "\033[1;35m"+' '.join(kw)+"\033[0m"
+def sprint亮靛(*kw):
+ return "\033[1;36m"+' '.join(kw)+"\033[0m"
diff --git a/crazy_functional.py b/crazy_functional.py
index f05c91a..7812e53 100644
--- a/crazy_functional.py
+++ b/crazy_functional.py
@@ -138,7 +138,7 @@ def get_crazy_functions():
###################### 第二组插件 ###########################
# [第二组插件]: 经过充分测试
from crazy_functions.批量总结PDF文档 import 批量总结PDF文档
- from crazy_functions.批量总结PDF文档pdfminer import 批量总结PDF文档pdfminer
+ # from crazy_functions.批量总结PDF文档pdfminer import 批量总结PDF文档pdfminer
from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
from crazy_functions.谷歌检索小助手 import 谷歌检索小助手
from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入
@@ -164,17 +164,16 @@ def get_crazy_functions():
# HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
"Function": HotReload(批量总结PDF文档)
},
- "[测试功能] 批量总结PDF文档pdfminer": {
- "Color": "stop",
- "AsButton": False, # 加入下拉菜单中
- "Function": HotReload(批量总结PDF文档pdfminer)
- },
+ # "[测试功能] 批量总结PDF文档pdfminer": {
+ # "Color": "stop",
+ # "AsButton": False, # 加入下拉菜单中
+ # "Function": HotReload(批量总结PDF文档pdfminer)
+ # },
"谷歌学术检索助手(输入谷歌学术搜索页url)": {
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Function": HotReload(谷歌检索小助手)
},
-
"理解PDF文档内容 (模仿ChatPDF)": {
# HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
"Color": "stop",
@@ -193,7 +192,7 @@ def get_crazy_functions():
"AsButton": False, # 加入下拉菜单中
"Function": HotReload(Latex英文纠错)
},
- "[测试功能] 中文Latex项目全文润色(输入路径或上传压缩包)": {
+ "中文Latex项目全文润色(输入路径或上传压缩包)": {
# HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
@@ -222,15 +221,7 @@ def get_crazy_functions():
})
###################### 第三组插件 ###########################
- # [第三组插件]: 尚未充分测试的函数插件,放在这里
- from crazy_functions.下载arxiv论文翻译摘要 import 下载arxiv论文并翻译摘要
- function_plugins.update({
- "一键下载arxiv论文并翻译摘要(先在input输入编号,如1812.10695)": {
- "Color": "stop",
- "AsButton": False, # 加入下拉菜单中
- "Function": HotReload(下载arxiv论文并翻译摘要)
- }
- })
+ # [第三组插件]: 尚未充分测试的函数插件
from crazy_functions.联网的ChatGPT import 连接网络回答问题
function_plugins.update({
@@ -307,5 +298,56 @@ def get_crazy_functions():
except:
print('Load function plugin failed')
+ try:
+ from crazy_functions.Langchain知识库 import 知识库问答
+ function_plugins.update({
+ "[功能尚不稳定] 构建知识库(请先上传文件素材)": {
+ "Color": "stop",
+ "AsButton": False,
+ "AdvancedArgs": True,
+ "ArgsReminder": "待注入的知识库名称id, 默认为default",
+ "Function": HotReload(知识库问答)
+ }
+ })
+ except:
+ print('Load function plugin failed')
+
+ try:
+ from crazy_functions.Langchain知识库 import 读取知识库作答
+ function_plugins.update({
+ "[功能尚不稳定] 知识库问答": {
+ "Color": "stop",
+ "AsButton": False,
+ "AdvancedArgs": True,
+ "ArgsReminder": "待提取的知识库名称id, 默认为default, 您需要首先调用构建知识库",
+ "Function": HotReload(读取知识库作答)
+ }
+ })
+ except:
+ print('Load function plugin failed')
+
+ try:
+ from crazy_functions.Latex输出PDF结果 import Latex英文纠错加PDF对比
+ function_plugins.update({
+ "[功能尚不稳定] Latex英文纠错+LatexDiff高亮修正位置": {
+ "Color": "stop",
+ "AsButton": False,
+ # "AdvancedArgs": True,
+ # "ArgsReminder": "",
+ "Function": HotReload(Latex英文纠错加PDF对比)
+ }
+ })
+ from crazy_functions.Latex输出PDF结果 import Latex翻译中文并重新编译PDF
+ function_plugins.update({
+ "[功能尚不稳定] Latex翻译/Arixv翻译+重构PDF": {
+ "Color": "stop",
+ "AsButton": False,
+ # "AdvancedArgs": True,
+ # "ArgsReminder": "",
+ "Function": HotReload(Latex翻译中文并重新编译PDF)
+ }
+ })
+ except:
+ print('Load function plugin failed')
###################### 第n组插件 ###########################
return function_plugins
diff --git a/crazy_functions/Langchain知识库.py b/crazy_functions/Langchain知识库.py
new file mode 100644
index 0000000..36999d5
--- /dev/null
+++ b/crazy_functions/Langchain知识库.py
@@ -0,0 +1,98 @@
+from toolbox import CatchException, update_ui, ProxyNetworkActivate
+from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_files_from_everything
+
+
+
+@CatchException
+def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+ """
+ txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径
+ llm_kwargs gpt模型参数, 如温度和top_p等, 一般原样传递下去就行
+ plugin_kwargs 插件模型的参数,暂时没有用武之地
+ chatbot 聊天显示框的句柄,用于显示给用户
+ history 聊天历史,前情提要
+ system_prompt 给gpt的静默提醒
+ web_port 当前软件运行的端口号
+ """
+ history = [] # 清空历史,以免输入溢出
+ chatbot.append(("这是什么功能?", "[Local Message] 从一批文件(txt, md, tex)中读取数据构建知识库, 然后进行问答。"))
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
+ # resolve deps
+ try:
+ from zh_langchain import construct_vector_store
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+ from .crazy_utils import knowledge_archive_interface
+ except Exception as e:
+ chatbot.append(
+ ["依赖不足",
+ "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."]
+ )
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ from .crazy_utils import try_install_deps
+ try_install_deps(['zh_langchain==0.2.0'])
+
+ # < --------------------读取参数--------------- >
+ if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
+ kai_id = plugin_kwargs.get("advanced_arg", 'default')
+
+ # < --------------------读取文件--------------- >
+ file_manifest = []
+ spl = ["txt", "doc", "docx", "email", "epub", "html", "json", "md", "msg", "pdf", "ppt", "pptx", "rtf"]
+ for sp in spl:
+ _, file_manifest_tmp, _ = get_files_from_everything(txt, type=f'.{sp}')
+ file_manifest += file_manifest_tmp
+
+ if len(file_manifest) == 0:
+ chatbot.append(["没有找到任何可读取文件", "当前支持的格式包括: txt, md, docx, pptx, pdf, json等"])
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ return
+
+ # < -------------------预热文本向量化模组--------------- >
+    chatbot.append(['<br/>'.join(file_manifest), "正在预热文本向量化模组, 如果是第一次运行, 将消耗较长时间下载中文向量化模型..."])
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ print('Checking Text2vec ...')
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+ with ProxyNetworkActivate(): # 临时地激活代理网络
+ HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
+
+ # < -------------------构建知识库--------------- >
+    chatbot.append(['<br/>'.join(file_manifest), "正在构建知识库..."])
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ print('Establishing knowledge archive ...')
+ with ProxyNetworkActivate(): # 临时地激活代理网络
+ kai = knowledge_archive_interface()
+ kai.feed_archive(file_manifest=file_manifest, id=kai_id)
+ kai_files = kai.get_loaded_file()
+    kai_files = '<br/>'.join(kai_files)
+ # chatbot.append(['知识库构建成功', "正在将知识库存储至cookie中"])
+ # yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ # chatbot._cookies['langchain_plugin_embedding'] = kai.get_current_archive_id()
+ # chatbot._cookies['lock_plugin'] = 'crazy_functions.Langchain知识库->读取知识库作答'
+ # chatbot.append(['完成', "“根据知识库作答”函数插件已经接管问答系统, 提问吧! 但注意, 您接下来不能再使用其他插件了,刷新页面即可以退出知识库问答模式。"])
+ chatbot.append(['构建完成', f"当前知识库内的有效文件:\n\n---\n\n{kai_files}\n\n---\n\n请切换至“知识库问答”插件进行知识库访问, 或者使用此插件继续上传更多文件。"])
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
+
+@CatchException
+def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port=-1):
+
+ # < ------------------- --------------- >
+ from .crazy_utils import knowledge_archive_interface
+ kai = knowledge_archive_interface()
+
+ if 'langchain_plugin_embedding' in chatbot._cookies:
+ resp, prompt = kai.answer_with_archive_by_id(txt, chatbot._cookies['langchain_plugin_embedding'])
+ else:
+ if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
+ kai_id = plugin_kwargs.get("advanced_arg", 'default')
+ resp, prompt = kai.answer_with_archive_by_id(txt, kai_id)
+
+ chatbot.append((txt, '[Local Message] ' + prompt))
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
+ gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
+ inputs=prompt, inputs_show_user=txt,
+ llm_kwargs=llm_kwargs, chatbot=chatbot, history=[],
+ sys_prompt=system_prompt
+ )
+ history.extend((prompt, gpt_say))
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
diff --git a/crazy_functions/Latex全文润色.py b/crazy_functions/Latex全文润色.py
index 8d3f97b..9e1d4b6 100644
--- a/crazy_functions/Latex全文润色.py
+++ b/crazy_functions/Latex全文润色.py
@@ -238,3 +238,6 @@ def Latex英文纠错(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_p
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en', mode='proofread')
+
+
+
diff --git a/crazy_functions/Latex输出PDF结果.py b/crazy_functions/Latex输出PDF结果.py
new file mode 100644
index 0000000..daac763
--- /dev/null
+++ b/crazy_functions/Latex输出PDF结果.py
@@ -0,0 +1,263 @@
+from toolbox import update_ui, trimmed_format_exc, get_conf, objdump, objload, promote_file_to_downloadzone
+from toolbox import CatchException, report_execption, update_ui_lastest_msg, zip_result, gen_time_str
+import glob, os, requests, time
+pj = os.path.join
+ARXIV_CACHE_DIR = os.path.expanduser(f"~/arxiv_cache/")
+
+# =================================== 工具函数 ===============================================
+沙雕GPT啊别犯这些低级翻译错误 = 'You must to translate "agent" to "智能体". '
+def switch_prompt(pfg, mode):
+ """
+ Generate prompts and system prompts based on the mode for proofreading or translating.
+ Args:
+ - pfg: Proofreader or Translator instance.
+ - mode: A string specifying the mode, either 'proofread' or 'translate_zh'.
+
+ Returns:
+ - inputs_array: A list of strings containing prompts for users to respond to.
+ - sys_prompt_array: A list of strings containing prompts for system prompts.
+ """
+ n_split = len(pfg.sp_file_contents)
+ if mode == 'proofread':
+ inputs_array = [r"Below is a section from an academic paper, proofread this section." +
+ r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " +
+ r"Answer me only with the revised text:" +
+ f"\n\n{frag}" for frag in pfg.sp_file_contents]
+ sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)]
+ elif mode == 'translate_zh':
+ inputs_array = [r"Below is a section from an English academic paper, translate it into Chinese." + 沙雕GPT啊别犯这些低级翻译错误 +
+ r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " +
+ r"Answer me only with the translated text:" +
+ f"\n\n{frag}" for frag in pfg.sp_file_contents]
+ sys_prompt_array = ["You are a professional translator." for _ in range(n_split)]
+ else:
+ assert False, "未知指令"
+ return inputs_array, sys_prompt_array
+
+def desend_to_extracted_folder_if_exist(project_folder):
+ """
+ Descend into the extracted folder if it exists, otherwise return the original folder.
+
+ Args:
+ - project_folder: A string specifying the folder path.
+
+ Returns:
+ - A string specifying the path to the extracted folder, or the original folder if there is no extracted folder.
+ """
+ maybe_dir = [f for f in glob.glob(f'{project_folder}/*') if os.path.isdir(f)]
+ if len(maybe_dir) == 0: return project_folder
+ if maybe_dir[0].endswith('.extract'): return maybe_dir[0]
+ return project_folder
+
+def move_project(project_folder, arxiv_id=None):
+ """
+ Create a new work folder and copy the project folder to it.
+
+ Args:
+ - project_folder: A string specifying the folder path of the project.
+
+ Returns:
+ - A string specifying the path to the new work folder.
+ """
+ import shutil, time
+ time.sleep(2) # avoid time string conflict
+ if arxiv_id is not None:
+ new_workfolder = pj(ARXIV_CACHE_DIR, arxiv_id, 'workfolder')
+ else:
+ new_workfolder = f'gpt_log/{gen_time_str()}'
+ try: shutil.rmtree(new_workfolder)
+ except: pass
+ shutil.copytree(src=project_folder, dst=new_workfolder)
+ return new_workfolder
+
+def arxiv_download(chatbot, history, txt):
+ def check_cached_translation_pdf(arxiv_id):
+ translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'translation')
+ if not os.path.exists(translation_dir):
+ os.makedirs(translation_dir)
+ target_file = pj(translation_dir, 'translate_zh.pdf')
+ if os.path.exists(target_file):
+ promote_file_to_downloadzone(target_file)
+ return target_file
+ return False
+
+ if not txt.startswith('https://arxiv.org'):
+ return txt, None
+
+ # <-------------- inspect format ------------->
+ chatbot.append([f"检测到arxiv文档连接", '尝试下载 ...'])
+ yield from update_ui(chatbot=chatbot, history=history)
+ time.sleep(1) # 刷新界面
+
+ url_ = txt # https://arxiv.org/abs/1707.06690
+ if not txt.startswith('https://arxiv.org/abs/'):
+ msg = f"解析arxiv网址失败, 期望格式例如: https://arxiv.org/abs/1707.06690。实际得到格式: {url_}"
+ yield from update_ui_lastest_msg(msg, chatbot=chatbot, history=history) # 刷新界面
+ return msg, None
+ # <-------------- set format ------------->
+ arxiv_id = url_.split('/abs/')[-1]
+ cached_translation_pdf = check_cached_translation_pdf(arxiv_id)
+ if cached_translation_pdf: return cached_translation_pdf, arxiv_id
+
+ url_tar = url_.replace('/abs/', '/e-print/')
+ translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'e-print')
+ extract_dst = pj(ARXIV_CACHE_DIR, arxiv_id, 'extract')
+ os.makedirs(translation_dir, exist_ok=True)
+
+ # <-------------- download arxiv source file ------------->
+ dst = pj(translation_dir, arxiv_id+'.tar')
+ if os.path.exists(dst):
+ yield from update_ui_lastest_msg("调用缓存", chatbot=chatbot, history=history) # 刷新界面
+ else:
+ yield from update_ui_lastest_msg("开始下载", chatbot=chatbot, history=history) # 刷新界面
+ proxies, = get_conf('proxies')
+ r = requests.get(url_tar, proxies=proxies)
+ with open(dst, 'wb+') as f:
+ f.write(r.content)
+ # <-------------- extract file ------------->
+ yield from update_ui_lastest_msg("下载完成", chatbot=chatbot, history=history) # 刷新界面
+ from toolbox import extract_archive
+ extract_archive(file_path=dst, dest_dir=extract_dst)
+ return extract_dst, arxiv_id
+# ========================================= 插件主程序1 =====================================================
+
+
+@CatchException
+def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+ # <-------------- information about this plugin ------------->
+ chatbot.append([ "函数插件功能?",
+ "对整个Latex项目进行纠错, 用latex编译为PDF对修正处做高亮。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。仅在Windows系统进行了测试,其他操作系统表现未知。"])
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
+
+ # <-------------- check deps ------------->
+ try:
+ import glob, os, time
+ os.system(f'pdflatex -version')
+ from .latex_utils import Latex精细分解与转化, 编译Latex差别
+ except Exception as e:
+ chatbot.append([ f"解析项目: {txt}",
+ f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"])
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ return
+
+
+ # <-------------- clear history and read input ------------->
+ history = []
+ if os.path.exists(txt):
+ project_folder = txt
+ else:
+ if txt == "": txt = '空空如也的输入栏'
+ report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ return
+ file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
+ if len(file_manifest) == 0:
+ report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ return
+
+
+ # <-------------- if is a zip/tar file ------------->
+ project_folder = desend_to_extracted_folder_if_exist(project_folder)
+
+
+ # <-------------- move latex project away from temp folder ------------->
+ project_folder = move_project(project_folder, arxiv_id=None)
+
+
+ # <-------------- if merge_translate_zh is already generated, skip gpt req ------------->
+ if not os.path.exists(project_folder + '/merge_proofread.tex'):
+ yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread_latex', switch_prompt=switch_prompt)
+
+
+ # <-------------- compile PDF ------------->
+ success = yield from 编译Latex差别(chatbot, history, main_file_original='merge', main_file_modified='merge_proofread',
+ work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder)
+
+
+ # <-------------- zip PDF ------------->
+ zip_result(project_folder)
+ if success:
+ chatbot.append((f"成功啦", '请查收结果(压缩包)...'))
+ yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面
+ else:
+ chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...'))
+ yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面
+
+ # <-------------- we are done ------------->
+ return success
+
+
+# ========================================= 插件主程序2 =====================================================
+
+@CatchException
+def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+ # <-------------- information about this plugin ------------->
+ chatbot.append([
+ "函数插件功能?",
+ "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。仅在Windows系统进行了测试,其他操作系统表现未知。"])
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
+
+ # <-------------- check deps ------------->
+ try:
+ import glob, os, time
+ os.system(f'pdflatex -version')
+ from .latex_utils import Latex精细分解与转化, 编译Latex差别
+ except Exception as e:
+ chatbot.append([ f"解析项目: {txt}",
+ f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"])
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ return
+
+
+ # <-------------- clear history and read input ------------->
+ history = []
+ txt, arxiv_id = yield from arxiv_download(chatbot, history, txt)
+ if txt.endswith('.pdf'):
+ report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"发现已经存在翻译好的PDF文档")
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ return
+ if os.path.exists(txt):
+ project_folder = txt
+ else:
+ if txt == "": txt = '空空如也的输入栏'
+ report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ return
+ file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
+ if len(file_manifest) == 0:
+ report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+ return
+
+
+ # <-------------- if is a zip/tar file ------------->
+ project_folder = desend_to_extracted_folder_if_exist(project_folder)
+
+
+ # <-------------- move latex project away from temp folder ------------->
+ project_folder = move_project(project_folder, arxiv_id)
+
+
+ # <-------------- if merge_translate_zh is already generated, skip gpt req ------------->
+ if not os.path.exists(project_folder + '/merge_translate_zh.tex'):
+ yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='translate_zh', switch_prompt=switch_prompt)
+
+
+ # <-------------- compile PDF ------------->
+ success = yield from 编译Latex差别(chatbot, history, main_file_original='merge', main_file_modified='merge_translate_zh',
+ work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder)
+
+ # <-------------- zip PDF ------------->
+ zip_result(project_folder)
+ if success:
+ chatbot.append((f"成功啦", '请查收结果(压缩包)...'))
+ yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面
+ else:
+ chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...'))
+ yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面
+
+ # <-------------- we are done ------------->
+ return success
diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py
index a9bfbf8..d4e3274 100644
--- a/crazy_functions/crazy_functions_test.py
+++ b/crazy_functions/crazy_functions_test.py
@@ -3,6 +3,8 @@
这个文件用于函数插件的单元测试
运行方法 python crazy_functions/crazy_functions_test.py
"""
+
+# ==============================================================================================================================
def validate_path():
import os, sys
@@ -10,10 +12,16 @@ def validate_path():
root_dir_assume = os.path.abspath(os.path.dirname(__file__) + '/..')
os.chdir(root_dir_assume)
sys.path.append(root_dir_assume)
-
validate_path() # validate path so you can run from base directory
+
+# ==============================================================================================================================
+
from colorful import *
from toolbox import get_conf, ChatBotWithCookies
+import contextlib
+import os
+import sys
+from functools import wraps
proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \
get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY')
@@ -30,7 +38,43 @@ history = []
system_prompt = "Serve me as a writing and programming assistant."
web_port = 1024
+# ==============================================================================================================================
+def silence_stdout(func):
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ _original_stdout = sys.stdout
+ sys.stdout = open(os.devnull, 'w')
+ for q in func(*args, **kwargs):
+ sys.stdout = _original_stdout
+ yield q
+ sys.stdout = open(os.devnull, 'w')
+ sys.stdout.close()
+ sys.stdout = _original_stdout
+ return wrapper
+
+class CLI_Printer():
+ def __init__(self) -> None:
+ self.pre_buf = ""
+
+ def print(self, buf):
+ bufp = ""
+ for index, chat in enumerate(buf):
+ a, b = chat
+ bufp += sprint亮靛('[Me]:' + a) + '\n'
+ bufp += '[GPT]:' + b
+ if index < len(buf)-1:
+ bufp += '\n'
+
+ if self.pre_buf!="" and bufp.startswith(self.pre_buf):
+ print(bufp[len(self.pre_buf):], end='')
+ else:
+ print('\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'+bufp, end='')
+ self.pre_buf = bufp
+ return
+
+cli_printer = CLI_Printer()
+# ==============================================================================================================================
def test_解析一个Python项目():
from crazy_functions.解析项目源代码 import 解析一个Python项目
txt = "crazy_functions/test_project/python/dqn"
@@ -116,6 +160,52 @@ def test_Markdown多语言():
for cookies, cb, hist, msg in Markdown翻译指定语言(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
print(cb)
+def test_Langchain知识库():
+ from crazy_functions.Langchain知识库 import 知识库问答
+ txt = "./"
+ chatbot = ChatBotWithCookies(llm_kwargs)
+ for cookies, cb, hist, msg in silence_stdout(知识库问答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+ cli_printer.print(cb) # print(cb)
+
+ chatbot = ChatBotWithCookies(cookies)
+ from crazy_functions.Langchain知识库 import 读取知识库作答
+ txt = "What is the installation method?"
+ for cookies, cb, hist, msg in silence_stdout(读取知识库作答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+ cli_printer.print(cb) # print(cb)
+
+def test_Langchain知识库读取():
+ from crazy_functions.Langchain知识库 import 读取知识库作答
+ txt = "远程云服务器部署?"
+ for cookies, cb, hist, msg in silence_stdout(读取知识库作答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+ cli_printer.print(cb) # print(cb)
+
+def test_Latex():
+ from crazy_functions.Latex输出PDF结果 import Latex英文纠错加PDF对比, Latex翻译中文并重新编译PDF
+ txt = "C:/Users/fuqingxu/Desktop/proofread"
+ txt = "C:/Users/fuqingxu/Desktop/旧文件/gpt/paperx"
+ txt = "C:/Users/fuqingxu/Desktop/旧文件/gpt/papery"
+ txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-14-57-06"
+ txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-15-40-20"
+ txt = r"https://arxiv.org/abs/1902.03185"
+ txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-17-14-40"
+ txt = r"https://arxiv.org/abs/2305.18290"
+ txt = r"https://arxiv.org/abs/2305.17608"
+ # txt = r"https://arxiv.org/abs/2306.00324"
+ txt = r"https://arxiv.org/abs/2211.16068"
+
+ for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+ cli_printer.print(cb) # print(cb)
+
+
+
+ # txt = "2302.02948.tar"
+ # print(txt)
+ # main_tex, work_folder = Latex预处理(txt)
+ # print('main tex:', main_tex)
+ # res = 编译Latex(main_tex, work_folder)
+ # # for cookies, cb, hist, msg in silence_stdout(编译Latex)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+ # cli_printer.print(cb) # print(cb)
+
# test_解析一个Python项目()
@@ -129,7 +219,8 @@ def test_Markdown多语言():
# test_联网回答问题()
# test_解析ipynb文件()
# test_数学动画生成manim()
-test_Markdown多语言()
-
+# test_Langchain知识库()
+# test_Langchain知识库读取()
+test_Latex()
input("程序完成,回车退出。")
print("退出。")
\ No newline at end of file
diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py
index de205d7..96301ff 100644
--- a/crazy_functions/crazy_utils.py
+++ b/crazy_functions/crazy_utils.py
@@ -1,4 +1,5 @@
from toolbox import update_ui, get_conf, trimmed_format_exc
+import threading
def input_clipping(inputs, history, max_token_limit):
import numpy as np
@@ -606,3 +607,94 @@ def get_files_from_everything(txt, type): # type='.md'
success = False
return success, file_manifest, project_folder
+
+
+
+
+def Singleton(cls):
+ _instance = {}
+
+ def _singleton(*args, **kargs):
+ if cls not in _instance:
+ _instance[cls] = cls(*args, **kargs)
+ return _instance[cls]
+
+ return _singleton
+
+
+@Singleton
+class knowledge_archive_interface():
+ def __init__(self) -> None:
+ self.threadLock = threading.Lock()
+ self.current_id = ""
+ self.kai_path = None
+ self.qa_handle = None
+ self.text2vec_large_chinese = None
+
+ def get_chinese_text2vec(self):
+ if self.text2vec_large_chinese is None:
+ # < -------------------预热文本向量化模组--------------- >
+ from toolbox import ProxyNetworkActivate
+ print('Checking Text2vec ...')
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+ with ProxyNetworkActivate(): # 临时地激活代理网络
+ self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
+
+ return self.text2vec_large_chinese
+
+
+ def feed_archive(self, file_manifest, id="default"):
+ self.threadLock.acquire()
+ # import uuid
+ self.current_id = id
+ from zh_langchain import construct_vector_store
+ self.qa_handle, self.kai_path = construct_vector_store(
+ vs_id=self.current_id,
+ files=file_manifest,
+ sentence_size=100,
+ history=[],
+ one_conent="",
+ one_content_segmentation="",
+ text2vec = self.get_chinese_text2vec(),
+ )
+ self.threadLock.release()
+
+ def get_current_archive_id(self):
+ return self.current_id
+
+ def get_loaded_file(self):
+ return self.qa_handle.get_loaded_file()
+
+ def answer_with_archive_by_id(self, txt, id):
+ self.threadLock.acquire()
+ if not self.current_id == id:
+ self.current_id = id
+ from zh_langchain import construct_vector_store
+ self.qa_handle, self.kai_path = construct_vector_store(
+ vs_id=self.current_id,
+ files=[],
+ sentence_size=100,
+ history=[],
+ one_conent="",
+ one_content_segmentation="",
+ text2vec = self.get_chinese_text2vec(),
+ )
+ VECTOR_SEARCH_SCORE_THRESHOLD = 0
+ VECTOR_SEARCH_TOP_K = 4
+ CHUNK_SIZE = 512
+ resp, prompt = self.qa_handle.get_knowledge_based_conent_test(
+ query = txt,
+ vs_path = self.kai_path,
+ score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
+ vector_search_top_k=VECTOR_SEARCH_TOP_K,
+ chunk_conent=True,
+ chunk_size=CHUNK_SIZE,
+ text2vec = self.get_chinese_text2vec(),
+ )
+ self.threadLock.release()
+ return resp, prompt
+
+def try_install_deps(deps):
+ for dep in deps:
+ import subprocess, sys
+ subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--user', dep])
diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py
new file mode 100644
index 0000000..3e128eb
--- /dev/null
+++ b/crazy_functions/latex_utils.py
@@ -0,0 +1,606 @@
+from toolbox import update_ui, update_ui_lastest_msg # 刷新Gradio前端界面
+from toolbox import zip_folder, objdump, objload, promote_file_to_downloadzone
+import os, shutil
+import re
+pj = os.path.join
+
def 寻找Latex主文件(file_manifest, mode):
    """
    Find the main file of a multi-file TeX project: the first file that
    contains \\documentclass. Files whose basename starts with 'merge'
    (our own intermediate outputs) are skipped.
    Raises RuntimeError when no candidate is found.
    P.S. 但愿没人把latex模板放在里面传进来
    """
    for texf in file_manifest:
        if os.path.basename(texf).startswith('merge'):
            continue
        # errors='replace' keeps a single undecodable byte from aborting the
        # whole scan — consistent with the other file reads in this module.
        with open(texf, 'r', encoding='utf8', errors='replace') as f:
            file_content = f.read()
        if r'\documentclass' in file_content:
            return texf
    raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)')
+
def merge_tex_files_(project_foler, main_file, mode):
    """
    Recursively merge a multi-file TeX project into one document by inlining
    every \\input{...} (given with or without the .tex extension).
    """
    # Splice matches in reverse order so earlier spans stay valid.
    for s in reversed([q for q in re.finditer(r"\\input\{(.*?)\}", main_file, re.M)]):
        f = s.group(1)
        fp = os.path.join(project_foler, f)
        if os.path.exists(fp):
            # e.g., \input{srcs/07_appendix.tex}
            with open(fp, 'r', encoding='utf-8', errors='replace') as fx:
                c = fx.read()
        else:
            # e.g., \input{srcs/07_appendix}
            with open(fp+'.tex', 'r', encoding='utf-8', errors='replace') as fx:
                c = fx.read()
        c = merge_tex_files_(project_foler, c, mode)
        main_file = main_file[:s.span()[0]] + c + main_file[s.span()[1]:]
    return main_file

def merge_tex_files(project_foler, main_file, mode):
    """
    Merge a multi-file TeX project into one document (outer entry point).
    Additionally injects CTEX for Chinese output (translate_zh mode) and
    strips LaTeX comments.
    """
    main_file = merge_tex_files_(project_foler, main_file, mode)
    if mode == 'translate_zh':
        # insert Chinese support right after \documentclass
        pattern = re.compile(r'\\documentclass.*\n')
        match = pattern.search(main_file)
        position = match.end()
        main_file = main_file[:position] + '\\usepackage{CTEX}\n\\usepackage{url}\n' + main_file[position:]

    # drop whole-line comments
    new_file_remove_comment_lines = []
    for l in main_file.splitlines():
        if l.startswith("%") or (l.startswith(" ") and l.lstrip().startswith("%")):
            pass
        else:
            new_file_remove_comment_lines.append(l)
    main_file = '\n'.join(new_file_remove_comment_lines)
    # Strip trailing (half-line) comments; the negative lookbehind keeps
    # escaped \% intact.
    # NOTE(review): this line was garbled in the patch text; reconstructed —
    # confirm against upstream.
    main_file = re.sub(r'(?<!\\)%.*', '', main_file)
    return main_file


class LinkedListNode():
    """
    Singly linked list node holding one fragment of the LaTeX source.
    `preserve` == True means the fragment must NOT be sent to GPT.
    """
    def __init__(self, string, preserve=True) -> None:
        self.string = string
        self.preserve = preserve
        self.next = None
+
def mod_inbraket(match):
    """
    Normalize fullwidth punctuation that GPT sometimes introduces inside
    \\cmd{...} arguments (e.g. Chinese commas inside \\cite{...}).
    """
    cmd = match.group(1)
    str_to_modify = match.group(2)
    str_to_modify = str_to_modify.replace(':', ':')  # fullwidth -> ASCII colon
    str_to_modify = str_to_modify.replace(',', ',')  # fullwidth -> ASCII comma
    return "\\" + cmd + "{" + str_to_modify + "}"

def fix_content(final_tex, node_string):
    """
    Fix common GPT errors to increase the LaTeX compile success rate.
    Falls back to the original `node_string` when the brace counts no longer
    match (GPT probably broke the structure).
    """
    # Escape bare '%' only. The negative lookbehind leaves already-escaped
    # '\%' untouched; the old replace-chain double-escaped every percent
    # sign into '\\%' (a LaTeX line break followed by '%').
    final_tex = re.sub(r"(?<!\\)%", r"\%", final_tex)
    final_tex = re.sub(r"\\([a-z]{2,10})\ \{", r"\\\1{", string=final_tex)
    final_tex = re.sub(r"\\\ ([a-z]{2,10})\{", r"\\\1{", string=final_tex)
    final_tex = re.sub(r"\\([a-z]{2,10})\{([^\}]*?)\}", mod_inbraket, string=final_tex)
    if node_string.count('{') != node_string.count('}'):
        if final_tex.count('{') != node_string.count('{'):
            final_tex = node_string  # brace count drifted — restore original
        if final_tex.count('}') != node_string.count('}'):
            final_tex = node_string  # brace count drifted — restore original
    return final_tex
+
class LatexPaperSplit():
    """
    Decompose a LaTeX document into a linked list of fragments; each node's
    `preserve` flag tells whether GPT must leave that fragment untouched.
    """
    def __init__(self) -> None:
        """Start with an empty list; `root` is the head node (set by split)."""
        self.root = None
+
+ def merge_result(self, arr, mode, msg):
+ """
+ 将GPT处理后的结果融合
+ """
+ result_string = ""
+ node = self.root
+ p = 0
+ while True:
+ if node.preserve:
+ result_string += node.string
+ else:
+ result_string += fix_content(arr[p], node.string)
+ p += 1
+ node = node.next
+ if node is None: break
+ if mode == 'translate_zh':
+ try:
+ pattern = re.compile(r'\\begin\{abstract\}.*\n')
+ match = pattern.search(result_string)
+ position = match.end()
+ result_string = result_string[:position] + \
+ "{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成,其内容可靠性没有任何保障,请仔细鉴别并以原文为准。" + \
+ "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。" + \
+ msg + \
+ "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\" + \
+ result_string[position:]
+ except:
+ pass
+ return result_string
+
+ def split(self, txt, project_folder):
+ """
+ 将Latex文档分解到一个链表中,每个链表节点用preserve的标志位提示它是否应当被GPT处理
+ """
+ root = LinkedListNode(txt, False)
+ def split_worker(root, pattern, flags=0):
+ lt = root
+ cnt = 0
+ pattern_compile = re.compile(pattern, flags)
+ while True:
+ if not lt.preserve:
+ while True:
+ res = pattern_compile.search(lt.string)
+ if not res: break
+ before = res.string[:res.span()[0]]
+ this = res.group(0)
+ after = res.string[res.span()[1]:]
+ # ======
+ lt.string = before
+ tmp = lt.next
+ # ======
+ mid = LinkedListNode(this, True)
+ lt.next = mid
+ # ======
+ aft = LinkedListNode(after, False)
+ mid.next = aft
+ aft.next = tmp
+ # ======
+ lt = aft
+ lt = lt.next
+ cnt += 1
+ # print(cnt)
+ if lt is None: break
+
+ def split_worker_begin_end(root, pattern, flags=0, limit_n_lines=25):
+ lt = root
+ cnt = 0
+ pattern_compile = re.compile(pattern, flags)
+ while True:
+ if not lt.preserve:
+ while True:
+ target_string = lt.string
+
+ def search_with_line_limit(target_string):
+ for res in pattern_compile.finditer(target_string):
+ cmd = res.group(1) # begin{what}
+ this = res.group(2) # content between begin and end
+ white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof', 'em', 'emph', 'textit', 'textbf']
+ if cmd in white_list or this.count('\n') > 25:
+ sub_res = search_with_line_limit(this)
+ if not sub_res: continue
+ else: return sub_res
+ else:
+ return res.group(0)
+ return False
+ # ======
+ # search for first encounter of \begin \end pair with less than 25 lines in the middle
+ ps = search_with_line_limit(target_string)
+ if not ps: break
+ res = re.search(re.escape(ps), target_string, flags)
+ if not res: assert False
+ before = res.string[:res.span()[0]]
+ this = res.group(0)
+ after = res.string[res.span()[1]:]
+ # ======
+ lt.string = before
+ tmp = lt.next
+ # ======
+ mid = LinkedListNode(this, True)
+ lt.next = mid
+ # ======
+ aft = LinkedListNode(after, False)
+ mid.next = aft
+ aft.next = tmp
+ # ======
+ lt = aft
+ lt = lt.next
+ cnt += 1
+ # print(cnt)
+ if lt is None: break
+
+
+ # root 是链表的头
+ print('正在分解Latex源文件,构建链表结构')
+ # 删除iffalse注释
+ split_worker(root, r"\\iffalse(.*?)\\fi", re.DOTALL)
+ # 吸收在25行以内的begin-end组合
+ split_worker_begin_end(root, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25)
+ # 吸收匿名公式
+ split_worker(root, r"\$\$(.*?)\$\$", re.DOTALL)
+ # 吸收其他杂项
+ split_worker(root, r"(.*?)\\maketitle", re.DOTALL)
+ split_worker(root, r"\\section\{(.*?)\}")
+ split_worker(root, r"\\section\*\{(.*?)\}")
+ split_worker(root, r"\\subsection\{(.*?)\}")
+ split_worker(root, r"\\subsubsection\{(.*?)\}")
+ split_worker(root, r"\\bibliography\{(.*?)\}")
+ split_worker(root, r"\\bibliographystyle\{(.*?)\}")
+ split_worker(root, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
+ split_worker(root, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL)
+ split_worker(root, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
+ split_worker(root, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL)
+ split_worker(root, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL)
+ split_worker(root, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL)
+ split_worker(root, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL)
+ split_worker(root, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL)
+ split_worker(root, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL)
+ split_worker(root, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL)
+ split_worker(root, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL)
+ split_worker(root, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL)
+ split_worker(root, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL)
+ split_worker(root, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL)
+ split_worker(root, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL)
+ split_worker(root, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL)
+ split_worker(root, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL)
+ split_worker(root, r"\\item ")
+ split_worker(root, r"\\label\{(.*?)\}")
+ split_worker(root, r"\\begin\{(.*?)\}")
+ split_worker(root, r"\\vspace\{(.*?)\}")
+ split_worker(root, r"\\hspace\{(.*?)\}")
+ split_worker(root, r"\\end\{(.*?)\}")
+
+ node = root
+ while True:
+ if len(node.string.strip('\n').strip(''))==0: node.preserve = True
+ if len(node.string.strip('\n').strip(''))<50: node.preserve = True
+ node = node.next
+ if node is None: break
+
+ # 修复括号
+ node = root
+ while True:
+ string = node.string
+ if node.preserve:
+ node = node.next
+ if node is None: break
+ continue
+ def break_check(string):
+ str_stack = [""] # (lv, index)
+ for i, c in enumerate(string):
+ if c == '{':
+ str_stack.append('{')
+ elif c == '}':
+ if len(str_stack) == 1:
+ print('stack kill')
+ return i
+ str_stack.pop(-1)
+ else:
+ str_stack[-1] += c
+ return -1
+ bp = break_check(string)
+
+ if bp == -1:
+ pass
+ elif bp == 0:
+ node.string = string[:1]
+ q = LinkedListNode(string[1:], False)
+ q.next = node.next
+ node.next = q
+ else:
+ node.string = string[:bp]
+ q = LinkedListNode(string[bp:], False)
+ q.next = node.next
+ node.next = q
+
+ node = node.next
+ if node is None: break
+
+ node = root
+ while True:
+ if len(node.string.strip('\n').strip(''))==0: node.preserve = True
+ if len(node.string.strip('\n').strip(''))<50: node.preserve = True
+ node = node.next
+ if node is None: break
+
+ # 将前后断行符脱离
+ node = root
+ prev_node = None
+ while True:
+ if not node.preserve:
+ lstriped_ = node.string.lstrip().lstrip('\n')
+ if (prev_node is not None) and (prev_node.preserve) and (len(lstriped_)!=len(node.string)):
+ prev_node.string += node.string[:-len(lstriped_)]
+ node.string = lstriped_
+ rstriped_ = node.string.rstrip().rstrip('\n')
+ if (node.next is not None) and (node.next.preserve) and (len(rstriped_)!=len(node.string)):
+ node.next.string = node.string[len(rstriped_):] + node.next.string
+ node.string = rstriped_
+ # =====
+ prev_node = node
+ node = node.next
+ if node is None: break
+
+ # 将分解结果返回 res_to_t
+ with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f:
+ res_to_t = []
+ node = root
+ while True:
+ show_html = node.string.replace('\n','
')
+ if not node.preserve:
+ res_to_t.append(node.string)
+ f.write(f'
#{show_html}#
') + else: + f.write(f'{show_html}
') + node = node.next + if node is None: break + + self.root = root + self.sp = res_to_t + return self.sp + +class LatexPaperFileGroup(): + def __init__(self): + self.file_paths = [] + self.file_contents = [] + self.sp_file_contents = [] + self.sp_file_index = [] + self.sp_file_tag = [] + + # count_token + from request_llm.bridge_all import model_info + enc = model_info["gpt-3.5-turbo"]['tokenizer'] + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) + self.get_token_num = get_token_num + + def run_file_split(self, max_token_limit=1900): + """ + 将长文本分离开来 + """ + for index, file_content in enumerate(self.file_contents): + if self.get_token_num(file_content) < max_token_limit: + self.sp_file_contents.append(file_content) + self.sp_file_index.append(index) + self.sp_file_tag.append(self.file_paths[index]) + else: + from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf + segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit) + for j, segment in enumerate(segments): + self.sp_file_contents.append(segment) + self.sp_file_index.append(index) + self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex") + print('Segmentation: done') + + def merge_result(self): + self.file_result = ["" for _ in range(len(self.file_paths))] + for r, k in zip(self.sp_file_result, self.sp_file_index): + self.file_result[k] += r + + def write_result(self): + manifest = [] + for path, res in zip(self.file_paths, self.file_result): + with open(path + '.polish.tex', 'w', encoding='utf8') as f: + manifest.append(path + '.polish.tex') + f.write(res) + return manifest + + + +def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None): + import time, os, re + from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency + from .latex_utils import LatexPaperFileGroup, merge_tex_files, 
LatexPaperSplit, 寻找Latex主文件 + + # <-------- 寻找主tex文件 ----------> + maintex = 寻找Latex主文件(file_manifest, mode) + chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果:该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。')) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + time.sleep(5) + + # <-------- 读取Latex文件, 将多文件tex工程融合为一个巨型tex ----------> + main_tex_basename = os.path.basename(maintex) + assert main_tex_basename.endswith('.tex') + main_tex_basename_bare = main_tex_basename[:-4] + may_exist_bbl = pj(project_folder, f'{main_tex_basename_bare}.bbl') + if os.path.exists(may_exist_bbl): + shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge.bbl')) + shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_{mode}.bbl')) + shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_diff.bbl')) + + with open(maintex, 'r', encoding='utf-8', errors='replace') as f: + content = f.read() + merged_content = merge_tex_files(project_folder, content, mode) + + with open(project_folder + '/merge.tex', 'w', encoding='utf-8', errors='replace') as f: + f.write(merged_content) + + # <-------- 精细切分latex文件 ----------> + lps = LatexPaperSplit() + res = lps.split(merged_content, project_folder) + + # <-------- 拆分过长的latex片段 ----------> + pfg = LatexPaperFileGroup() + for index, r in enumerate(res): + pfg.file_paths.append('segment-' + str(index)) + pfg.file_contents.append(r) + + pfg.run_file_split(max_token_limit=1024) + n_split = len(pfg.sp_file_contents) + + # <-------- 根据需要切换prompt ----------> + inputs_array, sys_prompt_array = switch_prompt(pfg, mode) + inputs_show_user_array = [f"{mode} {f}" for f in pfg.sp_file_tag] + + if os.path.exists(pj(project_folder,'temp.pkl')): + + # <-------- 【仅调试】如果存在调试缓存文件,则跳过GPT请求环节 ----------> + pfg = objload(file=pj(project_folder,'temp.pkl')) + + else: + # <-------- gpt 多线程请求 ----------> + gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + 
inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=[[""] for _ in range(n_split)], + sys_prompt_array=sys_prompt_array, + # max_workers=5, # 并行任务数量限制, 最多同时执行5个, 其他的排队等待 + scroller_max_len = 40 + ) + + # <-------- 文本碎片重组为完整的tex片段 ----------> + pfg.sp_file_result = [] + for i_say, gpt_say, orig_content in zip(gpt_response_collection[0::2], gpt_response_collection[1::2], pfg.sp_file_contents): + pfg.sp_file_result.append(gpt_say) + pfg.merge_result() + + # <-------- 临时存储用于调试 ----------> + pfg.get_token_num = None + objdump(pfg, file=pj(project_folder,'temp.pkl')) + + + # <-------- 写出文件 ----------> + msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}。" + final_tex = lps.merge_result(pfg.file_result, mode, msg) + with open(project_folder + f'/merge_{mode}.tex', 'w', encoding='utf-8', errors='replace') as f: + f.write(final_tex) + + # <-------- 整理结果, 退出 ----------> + chatbot.append((f"完成了吗?", 'GPT结果已输出, 正在编译PDF')) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # <-------- 返回 ----------> + return project_folder + f'/merge_{mode}.tex' + + + +def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work_folder_modified): + try: + with open(log_path, 'r', encoding='utf-8', errors='replace') as f: + log = f.read() + with open(file_path, 'r', encoding='utf-8', errors='replace') as f: + file_lines = f.readlines() + import re + buggy_lines = re.findall(tex_name+':([0-9]{1,5}):', log) + buggy_lines = [int(l) for l in buggy_lines] + buggy_lines = sorted(buggy_lines) + print("removing lines that has errors", buggy_lines) + file_lines.pop(buggy_lines[0]-1) + with open(pj(work_folder_modified, f"{tex_name_pure}_fix_{n_fix}.tex"), 'w', encoding='utf-8', errors='replace') as f: + f.writelines(file_lines) + return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines + except: + return False, 0, [0] + + +def 
compile_latex_with_timeout(command, timeout=60): + import subprocess + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + try: + stdout, stderr = process.communicate(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + stdout, stderr = process.communicate() + print("Process timed out!") + return False + return True + +def 编译Latex差别(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder): + import os, time + current_dir = os.getcwd() + n_fix = 1 + max_try = 32 + chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,则大概率是卡死在Latex里面了。不幸卡死时请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) + chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面 + yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面 + + while True: + import os + # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面 + os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex'); os.chdir(current_dir) + + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面 + os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex'); os.chdir(current_dir) + + if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')): + # 只有第二步成功,才能继续下面的步骤 + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译BibTex ...', chatbot, history) # 刷新Gradio前端界面 + if not os.path.exists(pj(work_folder_original, 
f'{main_file_original}.bbl')): + os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'bibtex {main_file_original}.aux'); os.chdir(current_dir) + if not os.path.exists(pj(work_folder_modified, f'{main_file_modified}.bbl')): + os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux'); os.chdir(current_dir) + + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面 + os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex'); os.chdir(current_dir) + os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex'); os.chdir(current_dir) + os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex'); os.chdir(current_dir) + os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex'); os.chdir(current_dir) + + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面 + print( f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex') + ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex') + + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面 + os.chdir(work_folder); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex'); os.chdir(current_dir) + os.chdir(work_folder); ok = 
compile_latex_with_timeout(f'bibtex merge_diff.aux'); os.chdir(current_dir) + os.chdir(work_folder); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex'); os.chdir(current_dir) + os.chdir(work_folder); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex'); os.chdir(current_dir) + + # <---------------------> + os.chdir(current_dir) + + # <---------- 检查结果 -----------> + results_ = "" + original_pdf_success = os.path.exists(pj(work_folder_original, f'{main_file_original}.pdf')) + modified_pdf_success = os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')) + diff_pdf_success = os.path.exists(pj(work_folder, f'merge_diff.pdf')) + results_ += f"原始PDF编译是否成功: {original_pdf_success};" + results_ += f"转化PDF编译是否成功: {modified_pdf_success};" + results_ += f"对比PDF编译是否成功: {diff_pdf_success};" + yield from update_ui_lastest_msg(f'第{n_fix}编译结束: