Add zh_langchain into dependency files
@@ -6,12 +6,14 @@ def input_clipping(inputs, history, max_token_limit):
     import numpy as np
     from request_llm.bridge_all import model_info
     enc = model_info["gpt-3.5-turbo"]['tokenizer']
-    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
+
+    def get_token_num(txt):
+        return len(enc.encode(txt, disallowed_special=()))
 
     mode = 'input-and-history'
     # 当 输入部分的token占比 小于 全文的一半时,只裁剪历史
     input_token_num = get_token_num(inputs)
-    if input_token_num < max_token_limit//2:
+    if input_token_num < max_token_limit // 2:
         mode = 'only-history'
         max_token_limit = max_token_limit - input_token_num
 
@@ -19,13 +21,13 @@ def input_clipping(inputs, history, max_token_limit):
     everything.extend(history)
     n_token = get_token_num('\n'.join(everything))
     everything_token = [get_token_num(e) for e in everything]
     delta = max(everything_token) // 16 # 截断时的颗粒度
 
     while n_token > max_token_limit:
         where = np.argmax(everything_token)
         encoded = enc.encode(everything[where], disallowed_special=())
-        clipped_encoded = encoded[:len(encoded)-delta]
+        clipped_encoded = encoded[:len(encoded) - delta]
         everything[where] = enc.decode(clipped_encoded)[:-1] # -1 to remove the may-be illegal char
         everything_token[where] = get_token_num(everything[where])
         n_token = get_token_num('\n'.join(everything))
 
@@ -38,11 +40,11 @@ def input_clipping(inputs, history, max_token_limit):
 
 
 def request_gpt_model_in_new_thread_with_ui_alive(
         inputs, inputs_show_user, llm_kwargs,
         chatbot, history, sys_prompt, refresh_interval=0.2,
         handle_token_exceed=True,
         retry_times_at_unknown_error=2,
 ):
     """
     Request GPT model,请求GPT模型同时维持用户界面活跃。
 
@@ -75,7 +77,7 @@ def request_gpt_model_in_new_thread_with_ui_alive(
         exceeded_cnt = 0
         while True:
             # watchdog error
-            if len(mutable) >= 2 and (time.time()-mutable[1]) > 5:
+            if len(mutable) >= 2 and (time.time() - mutable[1]) > 5:
                 raise RuntimeError("检测到程序终止。")
             try:
                 # 【第一种情况】:顺利完成
@@ -92,14 +94,14 @@ def request_gpt_model_in_new_thread_with_ui_alive(
                     p_ratio, n_exceed = get_reduce_token_percent(str(token_exceeded_error))
                     MAX_TOKEN = 4096
                     EXCEED_ALLO = 512 + 512 * exceeded_cnt
-                    inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN-EXCEED_ALLO)
+                    inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN - EXCEED_ALLO)
                     mutable[0] += f'[Local Message] 警告,文本过长将进行截断,Token溢出数:{n_exceed}。\n\n'
                     continue # 返回重试
                 else:
                     # 【选择放弃】
                     tb_str = '```\n' + trimmed_format_exc() + '```'
                     mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
                     return mutable[0] # 放弃
             except:
                 # 【第三种情况】:其他错误:重试几次
                 tb_str = '```\n' + trimmed_format_exc() + '```'
@@ -107,14 +109,15 @@ def request_gpt_model_in_new_thread_with_ui_alive(
                 mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
                 if retry_op > 0:
                     retry_op -= 1
-                    mutable[0] += f"[Local Message] 重试中,请稍等 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}:\n\n"
+                    mutable[
+                        0] += f"[Local Message] 重试中,请稍等 {retry_times_at_unknown_error - retry_op}/{retry_times_at_unknown_error}:\n\n"
                     if ("Rate limit reached" in tb_str) or ("Too Many Requests" in tb_str):
                         time.sleep(30)
                     time.sleep(5)
                     continue # 返回重试
                 else:
                     time.sleep(5)
                     return mutable[0] # 放弃
 
     # 提交任务
     future = executor.submit(_req_gpt, inputs, history, sys_prompt)
@@ -126,21 +129,21 @@ def request_gpt_model_in_new_thread_with_ui_alive(
         if future.done():
             break
         chatbot[-1] = [chatbot[-1][0], mutable[0]]
         yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
 
     final_result = future.result()
     chatbot[-1] = [chatbot[-1][0], final_result]
     yield from update_ui(chatbot=chatbot, history=[]) # 如果最后成功了,则删除报错信息
     return final_result
 
 
 def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
         inputs_array, inputs_show_user_array, llm_kwargs,
         chatbot, history_array, sys_prompt_array,
         refresh_interval=0.2, max_workers=-1, scroller_max_len=30,
         handle_token_exceed=True, show_user_at_complete=False,
         retry_times_at_unknown_error=2,
 ):
     """
     Request GPT model using multiple threads with UI and high efficiency
     请求GPT模型的[多线程]版。
@@ -173,19 +176,21 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
     from request_llm.bridge_all import predict_no_ui_long_connection
     assert len(inputs_array) == len(history_array)
     assert len(inputs_array) == len(sys_prompt_array)
     if max_workers == -1: # 读取配置文件
-        try: max_workers, = get_conf('DEFAULT_WORKER_NUM')
-        except: max_workers = 8
+        try:
+            max_workers, = get_conf('DEFAULT_WORKER_NUM')
+        except:
+            max_workers = 8
     if max_workers <= 0: max_workers = 3
     # 屏蔽掉 chatglm的多线程,可能会导致严重卡顿
     if not (llm_kwargs['llm_model'].startswith('gpt-') or llm_kwargs['llm_model'].startswith('api2d-')):
         max_workers = 1
 
     executor = ThreadPoolExecutor(max_workers=max_workers)
     n_frag = len(inputs_array)
     # 用户反馈
     chatbot.append(["请开始多线程操作。", ""])
     yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
     # 跨线程传递
     mutable = [["", time.time(), "等待中"] for _ in range(n_frag)]
 
@@ -197,13 +202,13 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
         mutable[index][2] = "执行中"
         while True:
             # watchdog error
-            if len(mutable[index]) >= 2 and (time.time()-mutable[index][1]) > 5:
+            if len(mutable[index]) >= 2 and (time.time() - mutable[index][1]) > 5:
                 raise RuntimeError("检测到程序终止。")
             try:
                 # 【第一种情况】:顺利完成
                 # time.sleep(10); raise RuntimeError("测试")
                 gpt_say = predict_no_ui_long_connection(
                     inputs=inputs, llm_kwargs=llm_kwargs, history=history,
                     sys_prompt=sys_prompt, observe_window=mutable[index], console_slience=True
                 )
                 mutable[index][2] = "已成功"
@@ -217,10 +222,10 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
                     p_ratio, n_exceed = get_reduce_token_percent(str(token_exceeded_error))
                     MAX_TOKEN = 4096
                     EXCEED_ALLO = 512 + 512 * exceeded_cnt
-                    inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN-EXCEED_ALLO)
+                    inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN - EXCEED_ALLO)
                     gpt_say += f'[Local Message] 警告,文本过长将进行截断,Token溢出数:{n_exceed}。\n\n'
                     mutable[index][2] = f"截断重试"
                     continue # 返回重试
                 else:
                     # 【选择放弃】
                     tb_str = '```\n' + trimmed_format_exc() + '```'
@@ -236,7 +241,7 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
                 gpt_say += f"[Local Message] 警告,线程{index}在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
                 if len(mutable[index][0]) > 0:
                     gpt_say += "此线程失败前收到的回答:\n\n" + mutable[index][0]
                 if retry_op > 0:
                     retry_op -= 1
                     wait = random.randint(5, 20)
                     if ("Rate limit reached" in tb_str) or ("Too Many Requests" in tb_str):
@@ -246,19 +251,22 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
                         fail_info = ""
                     # 也许等待十几秒后,情况会好转
                     for i in range(wait):
-                        mutable[index][2] = f"{fail_info}等待重试 {wait-i}"; time.sleep(1)
+                        mutable[index][2] = f"{fail_info}等待重试 {wait - i}";
+                        time.sleep(1)
                     # 开始重试
-                    mutable[index][2] = f"重试中 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}"
+                    mutable[index][
+                        2] = f"重试中 {retry_times_at_unknown_error - retry_op}/{retry_times_at_unknown_error}"
                     continue # 返回重试
                 else:
                     mutable[index][2] = "已失败"
                     wait = 5
                     time.sleep(5)
                     return gpt_say # 放弃
 
     # 异步任务开始
-    futures = [executor.submit(_req_gpt, index, inputs, history, sys_prompt) for index, inputs, history, sys_prompt in zip(
-        range(len(inputs_array)), inputs_array, history_array, sys_prompt_array)]
+    futures = [executor.submit(_req_gpt, index, inputs, history, sys_prompt) for index, inputs, history, sys_prompt in
+               zip(
+                   range(len(inputs_array)), inputs_array, history_array, sys_prompt_array)]
     cnt = 0
     while True:
         # yield一次以刷新前端页面
@@ -272,17 +280,17 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
             mutable[thread_index][1] = time.time()
         # 在前端打印些好玩的东西
         for thread_index, _ in enumerate(worker_done):
-            print_something_really_funny = "[ ...`"+mutable[thread_index][0][-scroller_max_len:].\
+            print_something_really_funny = "[ ...`" + mutable[thread_index][0][-scroller_max_len:]. \
                 replace('\n', '').replace('```', '...').replace(
-                    ' ', '.').replace('<br/>', '.....').replace('$', '.')+"`... ]"
+                ' ', '.').replace('<br/>', '.....').replace('$', '.') + "`... ]"
             observe_win.append(print_something_really_funny)
         # 在前端打印些好玩的东西
         stat_str = ''.join([f'`{mutable[thread_index][2]}`: {obs}\n\n'
                             if not done else f'`{mutable[thread_index][2]}`\n\n'
                             for thread_index, done, obs in zip(range(len(worker_done)), worker_done, observe_win)])
         # 在前端打印些好玩的东西
-        chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + ''.join(['.']*(cnt % 10+1))]
+        chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + ''.join(['.'] * (cnt % 10 + 1))]
         yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
         if all(worker_done):
             executor.shutdown()
             break
@@ -292,13 +300,13 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
     for inputs_show_user, f in zip(inputs_show_user_array, futures):
         gpt_res = f.result()
         gpt_response_collection.extend([inputs_show_user, gpt_res])
 
     # 是否在结束时,在界面上显示结果
     if show_user_at_complete:
         for inputs_show_user, f in zip(inputs_show_user_array, futures):
             gpt_res = f.result()
             chatbot.append([inputs_show_user, gpt_res])
             yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
             time.sleep(0.3)
     return gpt_response_collection
 
@@ -311,6 +319,7 @@ def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
             lines = txt_tocut.split('\n')
             estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
             estimated_line_cut = int(estimated_line_cut)
+            cnt = 0
             for cnt in reversed(range(estimated_line_cut)):
                 if must_break_at_empty_line:
                     if lines[cnt] != "":
@@ -327,6 +336,7 @@ def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
             result = [prev]
             result.extend(cut(post, must_break_at_empty_line))
             return result
+
     try:
         return cut(txt, must_break_at_empty_line=True)
     except RuntimeError:
@@ -342,9 +352,10 @@ def force_breakdown(txt, limit, get_token_fn):
             return txt[:i], txt[i:]
     return "Tiktoken未知错误", "Tiktoken未知错误"
 
+
 def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
     # 递归
     def cut(txt_tocut, must_break_at_empty_line, break_anyway=False):
         if get_token_fn(txt_tocut) <= limit:
             return [txt_tocut]
         else:
@@ -370,6 +381,7 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
             result = [prev]
             result.extend(cut(post, must_break_at_empty_line, break_anyway=break_anyway))
             return result
+
     try:
         # 第1次尝试,将双空行(\n\n)作为切分点
         return cut(txt, must_break_at_empty_line=True)
@@ -380,7 +392,7 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
         except RuntimeError:
             try:
                 # 第3次尝试,将英文句号(.)作为切分点
                 res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的,作为一个标识而存在
                 return [r.replace('。\n', '.') for r in res]
             except RuntimeError as e:
                 try:
@@ -392,7 +404,6 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
                     return cut(txt, must_break_at_empty_line=False, break_anyway=True)
 
-
 
 def read_and_clean_pdf_text(fp):
     """
     这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好
@@ -420,8 +431,9 @@ def read_and_clean_pdf_text(fp):
     fc = 0 # Index 0 文本
     fs = 1 # Index 1 字体
     fb = 2 # Index 2 框框
     REMOVE_FOOT_NOTE = True # 是否丢弃掉 不是正文的内容 (比正文字体小,如参考文献、脚注、图注等)
     REMOVE_FOOT_FFSIZE_PERCENT = 0.95 # 小于正文的?时,判定为不是正文(有些文章的正文部分字体大小不是100%统一的,有肉眼不可见的小变化)
+
     def primary_ffsize(l):
         """
         提取文本块主字体
@@ -431,12 +443,12 @@ def read_and_clean_pdf_text(fp):
             if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
             fsize_statiscs[wtf['size']] += len(wtf['text'])
         return max(fsize_statiscs, key=fsize_statiscs.get)
 
-    def ffsize_same(a,b):
+    def ffsize_same(a, b):
         """
         提取字体大小是否近似相等
         """
-        return abs((a-b)/max(a,b)) < 0.02
+        return abs((a - b) / max(a, b)) < 0.02
 
     with fitz.open(fp) as doc:
         meta_txt = []
@@ -456,18 +468,19 @@ def read_and_clean_pdf_text(fp):
                         if len(txt_line) == 0: continue
                         pf = primary_ffsize(l)
                         meta_line.append([txt_line, pf, l['bbox'], l])
                         for wtf in l['spans']: # for l in t['lines']:
                             meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
                     # meta_line.append(["NEW_BLOCK", pf])
-            # 块元提取 for each word segment with in line for each line cross-line words for each block
+            # 块元提取 for each word segment with in line for each line
+            # cross-line words for each block
             meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
                 '- ', '') for t in text_areas['blocks'] if 'lines' in t])
             meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']])
                               for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t])
             if index == 0:
                 page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
                     '- ', '') for t in text_areas['blocks'] if 'lines' in t]
 
         ############################## <第 2 步,获取正文主字体> ##################################
         fsize_statiscs = {}
         for span in meta_span:
@@ -481,32 +494,33 @@ def read_and_clean_pdf_text(fp):
         mega_sec = []
         sec = []
         for index, line in enumerate(meta_line):
             if index == 0:
                 sec.append(line[fc])
                 continue
             if REMOVE_FOOT_NOTE:
                 if meta_line[index][fs] <= give_up_fize_threshold:
                     continue
-            if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]):
+            if ffsize_same(meta_line[index][fs], meta_line[index - 1][fs]):
                 # 尝试识别段落
-                if meta_line[index][fc].endswith('.') and\
-                    (meta_line[index-1][fc] != 'NEW_BLOCK') and \
-                    (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7:
+                if meta_line[index][fc].endswith('.') and \
+                        (meta_line[index - 1][fc] != 'NEW_BLOCK') and \
+                        (meta_line[index][fb][2] - meta_line[index][fb][0]) < (
+                        meta_line[index - 1][fb][2] - meta_line[index - 1][fb][0]) * 0.7:
                     sec[-1] += line[fc]
                     sec[-1] += "\n\n"
                 else:
                     sec[-1] += " "
                     sec[-1] += line[fc]
             else:
-                if (index+1 < len(meta_line)) and \
+                if (index + 1 < len(meta_line)) and \
                     meta_line[index][fs] > main_fsize:
                     # 单行 + 字体大
                     mega_sec.append(copy.deepcopy(sec))
                     sec = []
                     sec.append("# " + line[fc])
                 else:
                     # 尝试识别section
-                    if meta_line[index-1][fs] > meta_line[index][fs]:
+                    if meta_line[index - 1][fs] > meta_line[index][fs]:
                         sec.append("\n" + line[fc])
                     else:
                         sec.append(line[fc])
@@ -525,13 +539,15 @@ def read_and_clean_pdf_text(fp):
                 if len(block_txt) < 100:
                     meta_txt[index] = '\n'
             return meta_txt
+
         meta_txt = 把字符太少的块清除为回车(meta_txt)
 
         def 清理多余的空行(meta_txt):
             for index in reversed(range(1, len(meta_txt))):
-                if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
+                if meta_txt[index] == '\n' and meta_txt[index - 1] == '\n':
                     meta_txt.pop(index)
             return meta_txt
+
         meta_txt = 清理多余的空行(meta_txt)
 
         def 合并小写开头的段落块(meta_txt):
@@ -542,16 +558,18 @@ def read_and_clean_pdf_text(fp):
                     return True
                 else:
                     return False
+
             for _ in range(100):
                 for index, block_txt in enumerate(meta_txt):
                     if starts_with_lowercase_word(block_txt):
-                        if meta_txt[index-1] != '\n':
-                            meta_txt[index-1] += ' '
+                        if meta_txt[index - 1] != '\n':
+                            meta_txt[index - 1] += ' '
                         else:
-                            meta_txt[index-1] = ''
-                        meta_txt[index-1] += meta_txt[index]
+                            meta_txt[index - 1] = ''
+                        meta_txt[index - 1] += meta_txt[index]
                         meta_txt[index] = '\n'
             return meta_txt
+
         meta_txt = 合并小写开头的段落块(meta_txt)
         meta_txt = 清理多余的空行(meta_txt)
 
@@ -571,7 +589,7 @@ def read_and_clean_pdf_text(fp):
     return meta_txt, page_one_meta
 
 
 def get_files_from_everything(txt, type): # type='.md'
     """
     这个函数是用来获取指定目录下所有指定类型(如.md)的文件,并且对于网络上的文件,也可以获取它。
     下面是对每个参数和返回值的说明:
@@ -593,9 +611,10 @@ def get_files_from_everything(txt, type): # type='.md'
         from toolbox import get_conf
         proxies, = get_conf('proxies')
         r = requests.get(txt, proxies=proxies)
-        with open('./gpt_log/temp'+type, 'wb+') as f: f.write(r.content)
+        with open('./gpt_log/temp' + type, 'wb+') as f:
+            f.write(r.content)
         project_folder = './gpt_log/'
-        file_manifest = ['./gpt_log/temp'+type]
+        file_manifest = ['./gpt_log/temp' + type]
     elif txt.endswith(type):
         # 直接给定文件
         file_manifest = [txt]
@@ -603,7 +622,7 @@ def get_files_from_everything(txt, type): # type='.md'
     elif os.path.exists(txt):
         # 本地路径,递归搜索
         project_folder = txt
-        file_manifest = [f for f in glob.glob(f'{project_folder}/**/*'+type, recursive=True)]
+        file_manifest = [f for f in glob.glob(f'{project_folder}/**/*' + type, recursive=True)]
         if len(file_manifest) == 0:
             success = False
     else:
@@ -614,16 +633,14 @@ def get_files_from_everything(txt, type): # type='.md'
     return success, file_manifest, project_folder
 
 
-
-
 def Singleton(cls):
     _instance = {}
 
     def _singleton(*args, **kargs):
         if cls not in _instance:
             _instance[cls] = cls(*args, **kargs)
         return _instance[cls]
 
     return _singleton
 
 
@@ -642,31 +659,30 @@ class knowledge_archive_interface():
             from toolbox import ProxyNetworkActivate
             print('Checking Text2vec ...')
             from langchain.embeddings.huggingface import HuggingFaceEmbeddings
             with ProxyNetworkActivate(): # 临时地激活代理网络
                 self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
 
         return self.text2vec_large_chinese
 
-
     def feed_archive(self, file_manifest, id="default"):
         self.threadLock.acquire()
         # import uuid
         self.current_id = id
         from zh_langchain import construct_vector_store
         self.qa_handle, self.kai_path = construct_vector_store(
             vs_id=self.current_id,
             files=file_manifest,
             sentence_size=100,
             history=[],
             one_conent="",
             one_content_segmentation="",
-            text2vec = self.get_chinese_text2vec(),
+            text2vec=self.get_chinese_text2vec(),
         )
         self.threadLock.release()
 
     def get_current_archive_id(self):
         return self.current_id
 
     def get_loaded_file(self):
         return self.qa_handle.get_loaded_file()
 
@@ -675,30 +691,31 @@ class knowledge_archive_interface():
         if not self.current_id == id:
             self.current_id = id
             from zh_langchain import construct_vector_store
             self.qa_handle, self.kai_path = construct_vector_store(
                 vs_id=self.current_id,
                 files=[],
                 sentence_size=100,
                 history=[],
                 one_conent="",
                 one_content_segmentation="",
-                text2vec = self.get_chinese_text2vec(),
+                text2vec=self.get_chinese_text2vec(),
             )
         VECTOR_SEARCH_SCORE_THRESHOLD = 0
         VECTOR_SEARCH_TOP_K = 4
         CHUNK_SIZE = 512
         resp, prompt = self.qa_handle.get_knowledge_based_conent_test(
-            query = txt,
-            vs_path = self.kai_path,
+            query=txt,
+            vs_path=self.kai_path,
             score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
             vector_search_top_k=VECTOR_SEARCH_TOP_K,
             chunk_conent=True,
             chunk_size=CHUNK_SIZE,
-            text2vec = self.get_chinese_text2vec(),
+            text2vec=self.get_chinese_text2vec(),
         )
         self.threadLock.release()
         return resp, prompt
 
+
 def try_install_deps(deps):
     for dep in deps:
         import subprocess, sys

@@ -16,4 +16,5 @@ openai
 numpy
 arxiv
 rich
 langchain
+zh_langchain
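
Below is a minimal usage sketch of the zh_langchain dependency declared above. It only restates the calls visible in this diff (construct_vector_store and get_knowledge_based_conent_test, as used by knowledge_archive_interface); the file list and query string are hypothetical, and it assumes zh_langchain and the text2vec model are installed.

# Minimal sketch, assuming zh_langchain exposes construct_vector_store with the
# signature used in knowledge_archive_interface above.
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from zh_langchain import construct_vector_store

text2vec = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")

# Build a vector store from a list of files; vs_id identifies the archive.
qa_handle, kai_path = construct_vector_store(
    vs_id="default",
    files=["./docs/example.md"],  # hypothetical file list
    sentence_size=100,
    history=[],
    one_conent="",
    one_content_segmentation="",
    text2vec=text2vec,
)

# Query the store, mirroring knowledge_archive_interface.answer_with_archive_by_id.
resp, prompt = qa_handle.get_knowledge_based_conent_test(
    query="What does the archive say about vector search?",  # hypothetical question
    vs_path=kai_path,
    score_threshold=0,
    vector_search_top_k=4,
    chunk_conent=True,
    chunk_size=512,
    text2vec=text2vec,
)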