qingxu99 committed on
Commit
cd6a1fd
1 Parent(s): f10ea20

当无法正常切割PDF文档时,强制切割

Browse files
crazy_functions/crazy_utils.py CHANGED
@@ -104,7 +104,7 @@ def request_gpt_model_in_new_thread_with_ui_alive(
104
  mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
105
  if retry_op > 0:
106
  retry_op -= 1
107
- mutable[0] += f"[Local Message] 重试中 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}:\n\n"
108
  if "Rate limit reached" in tb_str:
109
  time.sleep(30)
110
  time.sleep(5)
@@ -312,7 +312,6 @@ def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
312
  if get_token_fn(prev) < limit:
313
  break
314
  if cnt == 0:
315
- print('what the fuck ?')
316
  raise RuntimeError("存在一行极长的文本!")
317
  # print(len(post))
318
  # 列表递归接龙
@@ -325,8 +324,18 @@ def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
325
  return cut(txt, must_break_at_empty_line=False)
326
 
327
 
 
 
 
 
 
 
 
 
 
328
  def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
329
- def cut(txt_tocut, must_break_at_empty_line): # 递归
 
330
  if get_token_fn(txt_tocut) <= limit:
331
  return [txt_tocut]
332
  else:
@@ -338,28 +347,40 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
338
  if must_break_at_empty_line:
339
  if lines[cnt] != "":
340
  continue
341
- print(cnt)
342
  prev = "\n".join(lines[:cnt])
343
  post = "\n".join(lines[cnt:])
344
  if get_token_fn(prev) < limit:
345
  break
346
  if cnt == 0:
347
- # print('what the fuck ? 存在一行极长的文本!')
348
- raise RuntimeError("存在一行极长的文本!")
 
 
349
  # print(len(post))
350
  # 列表递归接龙
351
  result = [prev]
352
- result.extend(cut(post, must_break_at_empty_line))
353
  return result
354
  try:
 
355
  return cut(txt, must_break_at_empty_line=True)
356
  except RuntimeError:
357
  try:
 
358
  return cut(txt, must_break_at_empty_line=False)
359
  except RuntimeError:
360
- # 这个中文的句号是故意的,作为一个标识而存在
361
- res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False)
362
- return [r.replace('。\n', '.') for r in res]
 
 
 
 
 
 
 
 
 
363
 
364
 
365
 
 
104
  mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
105
  if retry_op > 0:
106
  retry_op -= 1
107
+ mutable[0] += f"[Local Message] 重试中,请稍等 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}:\n\n"
108
  if "Rate limit reached" in tb_str:
109
  time.sleep(30)
110
  time.sleep(5)
 
312
  if get_token_fn(prev) < limit:
313
  break
314
  if cnt == 0:
 
315
  raise RuntimeError("存在一行极长的文本!")
316
  # print(len(post))
317
  # 列表递归接龙
 
324
  return cut(txt, must_break_at_empty_line=False)
325
 
326
 
327
+ def force_breakdown(txt, limit, get_token_fn):
328
+ """
329
+ 当无法用标点、空行分割时,我们用最暴力的方法切割
330
+ """
331
+ for i in reversed(range(len(txt))):
332
+ if get_token_fn(txt[:i]) < limit:
333
+ return txt[:i], txt[i:]
334
+ return "Tiktoken未知错误", "Tiktoken未知错误"
335
+
336
  def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
337
+ # 递归
338
+ def cut(txt_tocut, must_break_at_empty_line, break_anyway=False):
339
  if get_token_fn(txt_tocut) <= limit:
340
  return [txt_tocut]
341
  else:
 
347
  if must_break_at_empty_line:
348
  if lines[cnt] != "":
349
  continue
 
350
  prev = "\n".join(lines[:cnt])
351
  post = "\n".join(lines[cnt:])
352
  if get_token_fn(prev) < limit:
353
  break
354
  if cnt == 0:
355
+ if break_anyway:
356
+ prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
357
+ else:
358
+ raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
359
  # print(len(post))
360
  # 列表递归接龙
361
  result = [prev]
362
+ result.extend(cut(post, must_break_at_empty_line, break_anyway=break_anyway))
363
  return result
364
  try:
365
+ # 第1次尝试,将双空行(\n\n)作为切分点
366
  return cut(txt, must_break_at_empty_line=True)
367
  except RuntimeError:
368
  try:
369
+ # 第2次尝试,将单空行(\n)作为切分点
370
  return cut(txt, must_break_at_empty_line=False)
371
  except RuntimeError:
372
+ try:
373
+ # 第3次尝试,将英文句号(.)作为切分点
374
+ res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的,作为一个标识而存在
375
+ return [r.replace('。\n', '.') for r in res]
376
+ except RuntimeError as e:
377
+ try:
378
+ # 第4次尝试,将中文句号(。)作为切分点
379
+ res = cut(txt.replace('。', '。。\n'), must_break_at_empty_line=False)
380
+ return [r.replace('。。\n', '。') for r in res]
381
+ except RuntimeError as e:
382
+ # 第5次尝试,没办法了,随便切一下敷衍吧
383
+ return cut(txt, must_break_at_empty_line=False, break_anyway=True)
384
 
385
 
386
 
request_llm/bridge_chatgpt.py CHANGED
@@ -96,7 +96,7 @@ def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="",
96
  # 看门狗,如果超过期限没有喂狗,则终止
97
  if len(observe_window) >= 2:
98
  if (time.time()-observe_window[1]) > watch_dog_patience:
99
- raise RuntimeError("程序终止。")
100
  else: raise RuntimeError("意外Json结构:"+delta)
101
  if json_data['finish_reason'] == 'length':
102
  raise ConnectionAbortedError("正常结束,但显示Token不足,导致输出不完整,请削减单次输入的文本量。")
 
96
  # 看门狗,如果超过期限没有喂狗,则终止
97
  if len(observe_window) >= 2:
98
  if (time.time()-observe_window[1]) > watch_dog_patience:
99
+ raise RuntimeError("用户取消了程序。")
100
  else: raise RuntimeError("意外Json结构:"+delta)
101
  if json_data['finish_reason'] == 'length':
102
  raise ConnectionAbortedError("正常结束,但显示Token不足,导致输出不完整,请削减单次输入的文本量。")