@@ -2,8 +2,111 @@ from toolbox import update_ui, update_ui_lastest_msg  # refresh the Gradio front-end
from toolbox import zip_folder, objdump, objload, promote_file_to_downloadzone
import os, shutil
import re
import numpy as np
pj = os.path.join
"""
========================================================================
Part One
Latex segmentation into a linked list
========================================================================
"""
PRESERVE = 0
TRANSFORM = 1
def split_worker(text, mask, pattern, flags=0):
    """
    Add a preserved text area to the paper: every span matched by `pattern`
    is flagged PRESERVE in `mask`, so it will not be sent to GPT.
    """
    pattern_compile = re.compile(pattern, flags)
    for res in pattern_compile.finditer(text):
        mask[res.span()[0]:res.span()[1]] = PRESERVE
    return text, mask
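# Illustrative usage (this exact call appears further below): keep every
# \section{...} heading out of the GPT-bound text by masking it as PRESERVE.
#   text, mask = split_worker(text, mask, r"\\section\{(.*?)\}")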
def split_worker_careful_brace(text, mask, pattern, flags=0):
    """
    Move an area into the preserve area, scanning forward until the braces
    balance (a non-greedy regex alone would stop at the first '}').
    """
    pattern_compile = re.compile(pattern, flags)
    for res in pattern_compile.finditer(text):
        # level starts at -1 so the command's own opening '{' brings it to 0
        brace_level = -1
        p = begin = end = res.regs[0][0]
        for _ in range(1024*16):
            if text[p] == '}' and brace_level == 0: break
            elif text[p] == '}': brace_level -= 1
            elif text[p] == '{': brace_level += 1
            p += 1
        end = p+1
        mask[begin:end] = PRESERVE
    return text, mask
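# Illustrative example (assumed input): applied with r"\\hl\{(.*?)\}" to
# "\hl{a {nested} b} tail", the scan stops at the brace matching \hl{, so
# "\hl{a {nested} b}" is preserved while " tail" remains translatable.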
def split_worker_reverse_careful_brace(text, mask, pattern, flags=0):
    """
    Move an area out of the preserve area: the content of capture group 1,
    up to its balanced closing brace, is flagged TRANSFORM again.
    """
    pattern_compile = re.compile(pattern, flags)
    for res in pattern_compile.finditer(text):
        brace_level = 0
        p = begin = end = res.regs[1][0]
        for _ in range(1024*16):
            if text[p] == '}' and brace_level == 0: break
            elif text[p] == '}': brace_level -= 1
            elif text[p] == '{': brace_level += 1
            p += 1
        end = p
        mask[begin:end] = TRANSFORM
    return text, mask
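# Used below with r"\\caption\{(.*?)\}": the surrounding figure/table markup
# stays preserved, but the caption text itself is handed back to GPT.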
def split_worker_begin_end(text, mask, pattern, flags=0, limit_n_lines=42):
    """
    Find every \\begin{...} ... \\end{...} block spanning fewer than
    limit_n_lines lines and add it to the preserve area.
    """
    pattern_compile = re.compile(pattern, flags)
    def search_with_line_limit(text, mask):
        for res in pattern_compile.finditer(text):
            cmd = res.group(1)   # begin{what}
            this = res.group(2)  # content between begin and end
            this_mask = mask[res.regs[2][0]:res.regs[2][1]]
            white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof',
                          'em', 'emph', 'textit', 'textbf', 'itemize', 'enumerate']
            if (cmd in white_list) or this.count('\n') >= limit_n_lines:  # use a magical number 42
                # whitelisted or too long: recurse into the body instead of preserving it whole
                this, this_mask = search_with_line_limit(this, this_mask)
                mask[res.regs[2][0]:res.regs[2][1]] = this_mask
            else:
                mask[res.regs[0][0]:res.regs[0][1]] = PRESERVE
        return text, mask
    return search_with_line_limit(text, mask)
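# Illustrative example (assumed input): a three-line \begin{figure}...\end{figure}
# block is preserved whole, whereas \begin{document}...\end{document} is
# whitelisted, so the recursion descends into it and its prose stays translatable.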
class LinkedListNode():
    """
    Linked-list node
    """
    def __init__(self, string, preserve=True) -> None:
        self.string = string
        self.preserve = preserve
        self.next = None
        # self.begin_line = 0
        # self.begin_char = 0
def convert_to_linklist(text, mask):
    root = LinkedListNode("", preserve=True)
    current_node = root
    for c, m, i in zip(text, mask, range(len(text))):
        if (m == PRESERVE and current_node.preserve) \
            or (m == TRANSFORM and not current_node.preserve):
            # same flag as the current node: extend it
            current_node.string += c
        else:
            current_node.next = LinkedListNode(c, preserve=(m == PRESERVE))
            current_node = current_node.next
    return root
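# Illustrative example (assumed input): for text = "Hello \label{x}" with the
# first six characters masked TRANSFORM and the rest PRESERVE, the result is an
# empty root followed by "Hello " (translatable) -> "\label{x}" (preserved).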
"""
========================================================================
Latex Merge File
========================================================================
"""
def 寻找Latex主文件(file_manifest, mode):
    """
    Among multiple tex documents, find the main file; it must contain \\documentclass. Return the first one found.
@@ -20,10 +123,23 @@ def 寻找Latex主文件(file_manifest, mode):
            continue
    raise RuntimeError('无法找到一个主Tex文件（包含documentclass关键字）')
def rm_comments(main_file):
    new_file_remove_comment_lines = []
    for l in main_file.splitlines():
        # drop lines that are comments in their entirety
        if l.lstrip().startswith("%"):
            pass
        else:
            new_file_remove_comment_lines.append(l)
    main_file = '\n'.join(new_file_remove_comment_lines)
    # use a regex to find trailing half-line comments and replace them with ''
    main_file = re.sub(r'(?<!\\)%.*', '', main_file)
    return main_file
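# Note: the negative lookbehind (?<!\\) keeps escaped percent signs such as
# "50\%" intact while real LaTeX comments are stripped.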
def merge_tex_files_(project_foler, main_file, mode):
    """
    Recursively merge a multi-file Tex project into one Tex document
    """
    main_file = rm_comments(main_file)
    # iterate the \input matches in reverse so that splicing files into
    # main_file does not invalidate the spans of the earlier matches
    for s in reversed([q for q in re.finditer(r"\\input\{(.*?)\}", main_file, re.M)]):
        f = s.group(1)
        fp = os.path.join(project_foler, f)
@@ -41,42 +157,37 @@ def merge_tex_files_(project_foler, main_file, mode):
def merge_tex_files(project_foler, main_file, mode):
    """
    Recursively merge a multi-file Tex project into one Tex document (outer layer of the recursion).
    P.S. also injects ctex along the way to support Chinese
    P.S. also strips the Latex comments along the way
    """
    main_file = merge_tex_files_(project_foler, main_file, mode)
    main_file = rm_comments(main_file)
    if mode == 'translate_zh':
        pattern = re.compile(r'\\documentclass.*\n')
        match = pattern.search(main_file)
        position = match.end()
        add_ctex = '\\usepackage{ctex}\n'
        add_url = '\\usepackage{url}\n' if '{url}' not in main_file else ''
        main_file = main_file[:position] + add_ctex + add_url + main_file[position:]
        # fontset=windows
        import platform
        if platform.system() != 'Windows':
            main_file = re.sub(r"\\documentclass\[(.*?)\]{(.*?)}", r"\\documentclass[\1,fontset=windows]{\2}", main_file)
            main_file = re.sub(r"\\documentclass{(.*?)}", r"\\documentclass[fontset=windows]{\1}", main_file)
    return main_file
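# Illustrative example (assumed input): on a non-Windows host,
# "\documentclass[11pt]{article}" becomes
# "\documentclass[11pt,fontset=windows]{article}", presumably pointing ctex at
# its bundled Windows font set instead of system CJK fonts that may be absent.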
"""
========================================================================
Post process
========================================================================
"""
def mod_inbraket(match):
    """
    Why does ChatGPT keep turning the commas inside \\cite{} into Chinese full-width commas?
    """
    # get the matched string
    cmd = match.group(1)
@@ -91,271 +202,251 @@ def fix_content(final_tex, node_string):
"""
Fix common GPT errors to increase success rate
"""
final_tex = final_tex . replace ( ' % ' , r ' \ %' )
final_tex = final_tex . replace ( r ' \ % ' , r ' \\ % ' )
final_tex = re . sub ( r " (?<! \\ ) % " , " \ \ %" , final_tex )
final_tex = re . sub ( r " \\ ([a-z] { 2,10}) \ \ { " , r " \\ \ 1 { " , string = final_tex )
final_tex = re . sub ( r " \\ \ ([a-z] { 2,10}) \ { " , r " \\ \ 1 { " , string = final_tex )
final_tex = re . sub ( r " \\ ([a-z] { 2,10}) \ { ([^ \ }]*?) \ } " , mod_inbraket , string = final_tex )
if node_string . count ( ' { ' ) != node_string . count ( ' } ' ) :
if final_tex . count ( ' { ' ) != node_string . count ( ' { ' ) :
final_tex = node_string # 出问题了,还原原文
if final_tex . count ( ' } ' ) != node_string . count ( ' } ' ) :
final_tex = node_string # 出问题了,还原原文
if node_string . count ( ' \\ begin ' ) != final_tex . count ( ' \\ begin ' ) :
final_tex = node_string # 出问题了,还原原文
if node_string . count ( ' \ _ ' ) > 0 and node_string . count ( ' \ _ ' ) > final_tex . count ( ' \ _ ' ) :
# walk and replace any _ without \
final_tex = re . sub ( r " (?<! \\ )_ " , " \\ _ " , final_tex )
    def compute_brace_level(string):
        # count the number of '{' minus the number of '}'
        brace_level = 0
        for c in string:
            if c == "{": brace_level += 1
            elif c == "}": brace_level -= 1
        return brace_level
    def join_most(tex_t, tex_o):
        # join the translated string and the original string when something goes wrong
        p_t = 0
        p_o = 0
        def find_next(string, chars, begin):
            p = begin
            while p < len(string):
                if string[p] in chars: return p, string[p]
                p += 1
            return None, None
        # advance through both strings, pairing up braces until they diverge
        while True:
            res1, char = find_next(tex_o, ['{', '}'], p_o)
            if res1 is None: break
            res2, char = find_next(tex_t, [char], p_t)
            if res2 is None: break
            p_o = res1 + 1
            p_t = res2 + 1
        return tex_t[:p_t] + tex_o[p_o:]
    if compute_brace_level(final_tex) != compute_brace_level(node_string):
        # something went wrong; splice the original back in to keep the braces balanced
        final_tex = join_most(final_tex, node_string)
    return final_tex
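# Illustrative example (assumed GPT output): for node_string = r"a\_b {x}" and a
# GPT reply of "a_b {x", fix_content re-escapes the bare underscore; the brace
# levels then still differ, so join_most splices the original tail back in.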
def split_subprocess(txt, project_folder, return_dict, opts):
    """
    Break down the latex file into a linked list; each node carries a preserve
    flag indicating whether it should be processed by GPT.
    """
    text = txt
    mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM
    # absorb the part above the title and authors (everything up to \maketitle)
    text, mask = split_worker(text, mask, r"(.*?)\\maketitle", re.DOTALL)
    # remove \iffalse comments
    text, mask = split_worker(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL)
    # absorb begin-end pairs spanning fewer than 25 lines
    text, mask = split_worker_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25)
    # absorb anonymous display formulas
    text, mask = split_worker(text, mask, r"\$\$(.*?)\$\$", re.DOTALL)
    # absorb other miscellaneous commands
    text, mask = split_worker(text, mask, r"\\section\{(.*?)\}")
    text, mask = split_worker(text, mask, r"\\section\*\{(.*?)\}")
    text, mask = split_worker(text, mask, r"\\subsection\{(.*?)\}")
    text, mask = split_worker(text, mask, r"\\subsubsection\{(.*?)\}")
    text, mask = split_worker(text, mask, r"\\bibliography\{(.*?)\}")
    text, mask = split_worker(text, mask, r"\\bibliographystyle\{(.*?)\}")
    text, mask = split_worker(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL)
    text, mask = split_worker(text, mask, r"\\item ")
    text, mask = split_worker(text, mask, r"\\label\{(.*?)\}")
    text, mask = split_worker(text, mask, r"\\begin\{(.*?)\}")
    text, mask = split_worker(text, mask, r"\\vspace\{(.*?)\}")
    text, mask = split_worker(text, mask, r"\\hspace\{(.*?)\}")
    text, mask = split_worker(text, mask, r"\\end\{(.*?)\}")
    text, mask = split_worker_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL)
    text, mask = split_worker_reverse_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL)
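    # Note: the \caption rule must come last, because
    # split_worker_reverse_careful_brace flips the caption text back to
    # TRANSFORM after the surrounding figure/table environments were preserved.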
    root = convert_to_linklist(text, mask)
    # fix braces: if a TRANSFORM node closes a brace it never opened,
    # split the node at the offending '}'
    node = root
    while True:
        string = node.string
        if node.preserve:
            node = node.next
            if node is None: break
            continue
        def break_check(string):
            str_stack = [""]  # (lv, index)
            for i, c in enumerate(string):
                if c == '{':
                    str_stack.append('{')
                elif c == '}':
                    if len(str_stack) == 1:
                        print('stack fix')
                        return i
                    str_stack.pop(-1)
                else:
                    str_stack[-1] += c
            return -1
        bp = break_check(string)
        if bp == -1:
            pass
        elif bp == 0:
            node.string = string[:1]
            q = LinkedListNode(string[1:], False)
            q.next = node.next
            node.next = q
        else:
            node.string = string[:bp]
            q = LinkedListNode(string[bp:], False)
            q.next = node.next
            node.next = q
        node = node.next
        if node is None: break
    # mask blank lines and sentences that are too short
    node = root
    while True:
        if len(node.string.strip('\n').strip(' ')) == 0: node.preserve = True
        if len(node.string.strip('\n').strip(' ')) < 42: node.preserve = True
        node = node.next
        if node is None: break
    # merge adjacent PRESERVE nodes
    node = root
    while True:
        if node.next and node.preserve and node.next.preserve:
            node.string += node.next.string
            node.next = node.next.next
        node = node.next
        if node is None: break
    # detach leading and trailing line breaks from TRANSFORM nodes
    node = root
    prev_node = None
    while True:
        if not node.preserve:
            lstriped_ = node.string.lstrip().lstrip('\n')
            if (prev_node is not None) and (prev_node.preserve) and (len(lstriped_) != len(node.string)):
                prev_node.string += node.string[:-len(lstriped_)]
                node.string = lstriped_
            rstriped_ = node.string.rstrip().rstrip('\n')
            if (node.next is not None) and (node.next.preserve) and (len(rstriped_) != len(node.string)):
                node.next.string = node.string[len(rstriped_):] + node.next.string
                node.string = rstriped_
        # =====
        prev_node = node
        node = node.next
        if node is None: break
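    # (leading/trailing newlines are re-attached to the neighbouring PRESERVE
    # nodes, so GPT never sees, or accidentally drops, the original line breaks)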
    with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f:
        segment_parts_for_gpt = []
        nodes = []
        node = root
        while True:
            nodes.append(node)
            show_html = node.string.replace('\n', '<br/>')
            if not node.preserve:
                segment_parts_for_gpt.append(node.string)
                f.write(f'<p style="color:black;">#{show_html}#</p>')
            else:
                f.write(f'<p style="color:red;">{show_html}</p>')
            node = node.next
            if node is None: break
    for n in nodes: n.next = None   # break the chain so the nodes can cross the process boundary
    return_dict['nodes'] = nodes
    return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt
    return return_dict
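# In debug_log.html, preserved nodes are rendered in red and the GPT-bound
# segments in black between '#' markers, so it is easy to inspect exactly
# what will be sent for translation.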
class LatexPaperSplit():
    """
    Break down the latex document into a linked list; each node carries a
    preserve flag indicating whether it should be processed by GPT.
    """
    def __init__(self) -> None:
        """
        root is the head node of the linked list
        """
        self.root = None
        self.nodes = None
        self.msg = "{\\scriptsize\\textbf{警告：该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成，" + \
            "版权归原文作者所有。翻译内容可靠性无任何保障，请仔细鉴别并以原文为准。" + \
            "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。"
        # Please do not remove or modify this warning unless you are the original author of the paper
        # (if you are, you are welcome to contact the developers via the QQ group in the README).
        self.msg_declare = "为了防止大语言模型的意外谬误产生扩散影响，禁止移除或修改此警告。}}\\\\"
    def merge_result(self, arr, mode, msg):
        """
        Merge the results after the GPT process has completed
        """
        result_string = ""
        p = 0
        for node in self.nodes:
            if node.preserve:
                result_string += node.string
            else:
                result_string += fix_content(arr[p], node.string)
                p += 1
        if mode == 'translate_zh':
            # insert the translation warning right after \begin{abstract}
            pattern = re.compile(r'\\begin\{abstract\}.*\n')
            match = pattern.search(result_string)
            position = match.end()
            result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:]
        return result_string
    def split(self, txt, project_folder, opts):
        """
        Break down the latex document into a linked list; each node carries a
        preserve flag indicating whether it should be processed by GPT.
        P.S. use multiprocessing to avoid timeout errors
        """
        import multiprocessing
        manager = multiprocessing.Manager()
        return_dict = manager.dict()
        p = multiprocessing.Process(
            target=split_subprocess,
            args=(txt, project_folder, return_dict, opts))
        p.start()
        p.join()
        self.nodes = return_dict['nodes']
        self.sp = return_dict['segment_parts_for_gpt']
        return self.sp
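# A minimal usage sketch (hypothetical variable names, assuming `merged_content`
# holds the merged .tex source):
#   lps = LatexPaperSplit()
#   segments = lps.split(merged_content, project_folder, opts)  # GPT-bound pieces
#   translated = [...]  # one GPT reply per segment
#   final_tex = lps.merge_result(translated, mode, msg)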
class LatexPaperFileGroup():
    """
    Use the tokenizer to break down text according to max_token_limit
    """
    def __init__(self):
        self.file_paths = []
        self.file_contents = []
@@ -371,7 +462,7 @@ class LatexPaperFileGroup():
    def run_file_split(self, max_token_limit=1900):
        """
        Use the tokenizer to split apart texts that are too long
        """
        for index, file_content in enumerate(self.file_contents):
            if self.get_token_num(file_content) < max_token_limit:
@@ -402,7 +493,7 @@ class LatexPaperFileGroup():
def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]):
    import time, os, re
    from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
    from .latex_utils import LatexPaperFileGroup, merge_tex_files, LatexPaperSplit, 寻找Latex主文件
@@ -411,7 +502,7 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin
    maintex = 寻找Latex主文件(file_manifest, mode)
    chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果：该项目的Latex主文件是{maintex}，如果分析错误，请立即终止程序，删除或修改歧义文件，然后重试。主程序即将开始，请稍候。'))
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
    time.sleep(3)
    # <-------- read the Latex files and merge the multi-file tex project into one giant tex ---------->
    main_tex_basename = os.path.basename(maintex)
@@ -431,8 +522,10 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin
        f.write(merged_content)
    # <-------- finely split the latex file ---------->
    chatbot.append((f"Latex文件融合完成", f'[Local Message] 正在精细切分latex文件，这需要一段时间计算，文档越长耗时越长，请耐心等待。'))
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
    lps = LatexPaperSplit()
    res = lps.split(merged_content, project_folder, opts)  # the time-consuming call
    # <-------- split latex fragments that are too long ---------->
    pfg = LatexPaperFileGroup()
@@ -480,7 +573,8 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin
    msg = f"当前大语言模型: {llm_kwargs['llm_model']}，当前语言模型温度设定: {llm_kwargs['temperature']}。"
    final_tex = lps.merge_result(pfg.file_result, mode, msg)
    with open(project_folder + f'/merge_{mode}.tex', 'w', encoding='utf-8', errors='replace') as f:
        if mode != 'translate_zh' or "binary" in final_tex: f.write(final_tex)
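    # (presumably an anti-tamper check: in translate_zh mode the merged tex is
    # only written if the injected GPT-Academic warning, whose \url contains
    # "binary", is still present in the output)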
    # <-------- organize the results and exit ---------->
    chatbot.append((f"完成了吗？", 'GPT结果已输出，正在编译PDF'))
@@ -507,7 +601,8 @@ def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work
        f.writelines(file_lines)
        return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines
    except:
        print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.")
        return False, -1, [-1]
def compile_latex_with_timeout(command, timeout=60):
@@ -522,12 +617,12 @@ def compile_latex_with_timeout(command, timeout=60):
        return False
    return True
def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder):
    import os, time
    current_dir = os.getcwd()
    n_fix = 1
    max_try = 32
    chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder}，如果程序停顿5分钟以上，请直接去该路径下取回翻译结果，或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history)
    chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1])  # refresh the UI
    yield from update_ui_lastest_msg('编译已经开始...', chatbot, history)  # refresh the Gradio front-end