@@ -2,10 +2,12 @@ from toolbox import CatchException, report_execption, write_results_to_file
from toolbox import update_ui
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from colorful import *
def read_and_clean_pdf_text(fp):
"""
这个函数用于分割pdf, 用了很多trick, 逻辑较乱, 效果奇好, 不建议任何人去读这个函数
**输入参数说明**
- `fp`: 需要读取和清理文本的pdf文件路径
@@ -22,17 +24,43 @@ def read_and_clean_pdf_text(fp):
    - Remove duplicated line breaks
    - Replace every newline with two newlines, so that paragraphs are separated by a blank line
    """
    import fitz, copy
    import re
    import numpy as np
    fc = 0  # index of the text content in a meta_line entry
    fs = 1  # index of the line's primary font size
    fb = 2  # index of the line's bounding box
    REMOVE_FOOT_NOTE = True  # discard content whose font size is well below the main text
    REMOVE_FOOT_FFSIZE_PERCENT = 0.95  # footnote threshold, relative to the main font size
    def primary_ffsize(l):
        """Return the dominant font size of a line, weighted by character count."""
        fsize_statiscs = {}
        for wtf in l['spans']:
            if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
            fsize_statiscs[wtf['size']] += len(wtf['text'])
        return max(fsize_statiscs, key=fsize_statiscs.get)

    def ffsize_same(a, b):
        """Return True if two font sizes differ by less than 2%."""
        return abs((a - b) / max(a, b)) < 0.02
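    # For example, a line whose spans are mostly 10.0 pt body text with a 7.0 pt
    # superscript has primary_ffsize == 10.0, and ffsize_same(10.0, 10.1) is True
    # since the relative difference is below 2%.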
# file_content = ""
with fitz . open ( fp ) as doc :
meta_txt = [ ]
meta_font = [ ]
meta_line = [ ]
meta_span = [ ]
for index , page in enumerate ( doc ) :
# file_content += page.get_text()
text_areas = page . get_text ( " dict " ) # 获取页面上的文本信息
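            # PyMuPDF's get_text("dict") returns nested blocks -> lines -> spans, roughly:
            # {'blocks': [{'lines': [{'spans': [{'text': ..., 'size': ...}, ...],
            #                         'bbox': (x0, y0, x1, y1)}, ...]}, ...]}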
            for t in text_areas['blocks']:
                if 'lines' in t:
                    pf = 998
                    for l in t['lines']:
                        txt_line = "".join([wtf['text'] for wtf in l['spans']])
                        pf = primary_ffsize(l)
                        meta_line.append([txt_line, pf, l['bbox'], l])
                        for wtf in l['spans']:  # for l in t['lines']:
                            meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
                    # meta_line.append(["NEW_BLOCK", pf])
            # Block extraction: join the spans within each line, join the lines within
            # each block, then undo cross-line hyphenation for each block
            meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
                '- ', '') for t in text_areas['blocks'] if 'lines' in t])
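            # e.g. a word hyphenated across a line break, "trans- former", becomes "transformer"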
@@ -41,6 +69,56 @@ def read_and_clean_pdf_text(fp):
            if index == 0:
                page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
                    '- ', '') for t in text_areas['blocks'] if 'lines' in t]
        # Find the main body font: the size that accounts for the most characters
        fsize_statiscs = {}
        for span in meta_span:
            if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
            fsize_statiscs[span[1]] += span[2]
        main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
        if REMOVE_FOOT_NOTE:
            give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
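            # e.g. with a 10.0 pt body font, lines at 9.5 pt or below are treated
            # as footnotes and skipped in the regrouping loop below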
        # Split lines and regroup them into sections
        mega_sec = []
        sec = []
        for index, line in enumerate(meta_line):
            if index == 0:
                sec.append(line[fc])
                continue
            if REMOVE_FOOT_NOTE:
                if meta_line[index][fs] <= give_up_fize_threshold:
                    continue
            if ffsize_same(meta_line[index][fs], meta_line[index - 1][fs]):
                # Try to detect a paragraph boundary: the line ends with '.' and is
                # noticeably shorter (< 70%) than the previous line
                if meta_line[index][fc].endswith('.') and \
                        (meta_line[index - 1][fc] != 'NEW_BLOCK') and \
                        (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index - 1][fb][2] - meta_line[index - 1][fb][0]) * 0.7:
                    sec[-1] += line[fc]
                    sec[-1] += "\n\n"
                else:
                    sec[-1] += " "
                    sec[-1] += line[fc]
            else:
                if (index + 1 < len(meta_line)) and \
                        meta_line[index][fs] > main_fsize:
                    # A single line in a larger font: treat it as a section title
                    mega_sec.append(copy.deepcopy(sec))
                    sec = []
                    sec.append("# " + line[fc])
                else:
                    # Try to detect a section boundary
                    if meta_line[index - 1][fs] > meta_line[index][fs]:
                        sec.append("\n" + line[fc])
                    else:
                        sec.append(line[fc])
        mega_sec.append(copy.deepcopy(sec))
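        # mega_sec now holds the sections: each is a list of merged lines, with
        # detected titles prefixed by '# '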
        finals = []
        for ms in mega_sec:
            final = " ".join(ms)
            final = final.replace('- ', '')
            finals.append(final)
        meta_txt = finals
        def 把字符太少的块清除为回车(meta_txt):  # replace blocks with too few characters by a newline
            for index, block_txt in enumerate(meta_txt):
@@ -85,6 +163,10 @@ def read_and_clean_pdf_text(fp):
        # newline -> double newline
        meta_txt = meta_txt.replace('\n', '\n\n')

        for f in finals:
            print亮黄(f)
            print亮绿('***************************')

    return meta_txt, page_one_meta
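# A minimal usage sketch (assuming a local file 'paper.pdf'):
#   meta_txt, page_one_meta = read_and_clean_pdf_text('paper.pdf')
# meta_txt is the cleaned full text, page_one_meta the first page's text blocks.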
@@ -145,21 +227,23 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
    TOKEN_LIMIT_PER_FRAGMENT = 1600
    generated_conclusion_files = []
    for index, fp in enumerate(file_manifest):
        # Read the PDF file
        file_content, page_one = read_and_clean_pdf_text(fp)
        # Recursively split the PDF text
        from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
        from toolbox import get_conf
        enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
        def get_token_num(txt): return len(enc.encode(txt))
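        # e.g. get_token_num("hello world") == 2 under typical GPT encodings; the
        # fragment splitter below uses this to respect TOKEN_LIMIT_PER_FRAGMENT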
        # Split the text into fragments within the token limit
        paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
            txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
        page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
            txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT // 4)
        # For better results, strip everything after the Introduction (if present)
        paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
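        # e.g. a first page reading "Title ... Abstract ... 1 Introduction ..." keeps
        # only the part before "Introduction" as the paper's meta information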
        # Single thread: extract the paper's meta information
        paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive(
            inputs=f"以下是一篇学术论文的基础信息,请从中提取出“标题”、“收录会议或期刊”、“作者”、“摘要”、“编号”、“作者邮箱”这六个部分。请用markdown格式输出,最后用中文翻译摘要部分。请提取:{paper_meta}",
@@ -168,23 +252,32 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
            chatbot=chatbot, history=[],
            sys_prompt="Your job is to collect information from materials.",
        )
        # Multiple threads: translate the fragments
        gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
            inputs_array=[
                f"以下是你需要翻译的论文片段:\n{frag}" for frag in paper_fragments],
            inputs_show_user_array=[f"\n---\n 原文: \n\n {frag.replace('#', '')} \n---\n 翻译:\n" for frag in paper_fragments],
            llm_kwargs=llm_kwargs,
            chatbot=chatbot,
            history_array=[[paper_meta] for _ in paper_fragments],
            sys_prompt_array=[
                "请你作为一个学术翻译,负责把学术论文的片段准确翻译成中文。" for _ in paper_fragments],
            max_workers=16  # the maximum parallel load OpenAI allows
        )
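        # gpt_response_collection interleaves inputs and replies:
        # [input_0, reply_0, input_1, reply_1, ...], hence the i % 2 test below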
final = [ " " , paper_meta_info + ' \n \n --- \n \n --- \n \n --- \n \n ' ]
# 整理报告的格式
for i , k in enumerate ( gpt_response_collection ) :
if i % 2 == 0 :
gpt_response_collection [ i ] = f " \n \n --- \n \n ## 原文[ { i / / 2 } / { len ( gpt_response_collection ) / / 2 } ]: \n \n { paper_fragments [ i / / 2 ] . replace ( ' # ' , ' ' ) } \n \n --- \n \n ## 翻译[ { i / / 2 } / { len ( gpt_response_collection ) / / 2 } ]: \n "
else :
gpt_response_collection [ i ] = gpt_response_collection [ i ]
final = [ " 一、论文概况 \n \n --- \n \n " , paper_meta_info . replace ( ' # ' , ' ### ' ) + ' \n \n --- \n \n ' , " 二、论文翻译 " , " " ]
        final.extend(gpt_response_collection)
        create_report_file_name = f"{os.path.basename(fp)}.trans.md"
        res = write_results_to_file(final, file_name=create_report_file_name)  # saved under ./gpt_log/

        # Update the UI
        generated_conclusion_files.append(f'./gpt_log/{create_report_file_name}')
        chatbot.append((f"{fp}完成了吗?", res))
        yield from update_ui(chatbot=chatbot, history=chatbot)  # refresh the page
@@ -200,4 +293,4 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
    chatbot.append(("给出输出文件清单", str(generated_conclusion_files)))
    yield from update_ui(chatbot=chatbot, history=chatbot)  # refresh the page