3v324v23 committed on
Commit
03ba072
1 Parent(s): 2472185

改善word总结功能

Browse files
crazy_functions/crazy_functions_test.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 这是什么?
3
+ 这个文件用于函数插件的单元测试
4
+ 运行方法 python crazy_functions/crazy_functions_test.py
5
+ """
6
+
7
def validate_path():
    """Switch to the assumed repository root and make it importable.

    The root is assumed to be the parent of the directory that contains this
    file. The process working directory is changed to that root and the root
    is appended to ``sys.path`` so that top-level project modules can be
    imported when the script is launched directly.
    """
    import os
    import sys

    # Join with os.pardir instead of concatenating '/..': when __file__ has no
    # directory component, dirname() is '' and '' + '/..' would resolve to the
    # filesystem root rather than the parent of the current directory.
    root_dir_assume = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir)
    )
    os.chdir(root_dir_assume)
    # Do not stack duplicate entries if the function is called more than once.
    if root_dir_assume not in sys.path:
        sys.path.append(root_dir_assume)
13
+
14
+ validate_path() # validate path so you can run from base directory
15
+
16
+ from toolbox import get_conf, ChatBotWithCookies
17
# Read the project configuration values in one call; the names on the left
# are bound in the same order as the keys passed to get_conf.
(
    proxies,
    WEB_PORT,
    LLM_MODEL,
    CONCURRENT_COUNT,
    AUTHENTICATION,
    CHATBOT_HEIGHT,
    LAYOUT,
    API_KEY,
) = get_conf(
    'proxies',
    'WEB_PORT',
    'LLM_MODEL',
    'CONCURRENT_COUNT',
    'AUTHENTICATION',
    'CHATBOT_HEIGHT',
    'LAYOUT',
    'API_KEY',
)

# Arguments handed to the plugin under test.
llm_kwargs = dict(
    api_key=API_KEY,
    llm_model=LLM_MODEL,
    top_p=1.0,
    max_length=None,
    temperature=1.0,
)
plugin_kwargs = dict()
chatbot = ChatBotWithCookies(llm_kwargs)
history = list()
system_prompt = "Serve me as a writing and programming assistant."
web_port = 1024
32
+
33
+
34
def test_总结word文档(txt="F:/AMD"):
    """Run the Word-document summary plugin once and drain its output.

    Args:
        txt: Value passed to the plugin as its first argument. Defaults to
            the location that was previously hard-coded, so existing calls
            without arguments behave as before.
    """
    from crazy_functions.总结word文档 import 总结word文档

    plugin_run = 总结word文档(
        txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port
    )
    # The plugin is consumed by iteration; the yielded values are not needed
    # here, only the side effects of running it to completion.
    for _ in plugin_run:
        pass
39
+
40
# Only run the plugin and wait for the user when executed as a script, so
# that importing this module does not block on input().
if __name__ == "__main__":
    test_总结word文档()

    input("程序完成,回车退出。")
    print("退出。")
crazy_functions/总结word文档.py CHANGED
@@ -8,8 +8,6 @@ def 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot
8
  import time, os
9
  # pip install python-docx 用于docx格式,跨平台
10
  # pip install pywin32 用于doc格式,仅支持Win平台
11
-
12
- print('begin analysis on:', file_manifest)
13
  for index, fp in enumerate(file_manifest):
14
  if fp.split(".")[-1] == "docx":
15
  from docx import Document
@@ -29,18 +27,20 @@ def 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot
29
  word.Quit()
30
 
31
  print(file_content)
32
-
33
- prefix = "接下来请你逐文件分析下面的论文文件," if index == 0 else ""
34
  # private_upload里面的文件名在解压zip后容易出现乱码(rar和7z格式正常),故可以只分析文章内容,不输入文件名
35
- i_say = prefix + f'请对下面的文章片段用中英文做概述,文件名是{os.path.relpath(fp, project_folder)},' \
36
- f'文章内容是 ```{file_content}```'
37
- i_say_show_user = prefix + f'[{index+1}/{len(file_manifest)}] 假设你是论文审稿专家,请对下面的文章片段做概述: {os.path.abspath(fp)}'
38
- chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
39
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
40
-
41
- if not fast_debug:
42
- msg = '正常'
43
- # ** gpt request **
 
 
 
 
44
  gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
45
  inputs=i_say,
46
  inputs_show_user=i_say_show_user,
@@ -48,46 +48,34 @@ def 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot
48
  chatbot=chatbot,
49
  history=[],
50
  sys_prompt="总结文章。"
51
- ) # 带超时倒计时
 
52
  chatbot[-1] = (i_say_show_user, gpt_say)
53
- history.append(i_say_show_user)
54
- history.append(gpt_say)
55
- yield from update_ui(chatbot=chatbot, history=history, msg=msg) # 刷新界面
56
- if not fast_debug: time.sleep(2)
57
-
58
- """
59
- # 可按需启用
60
- i_say = f'根据你上述的分析,对全文进行概括,用学术性语言写一段中文摘要,然后再写一篇英文的。'
61
- chatbot.append((i_say, "[Local Message] waiting gpt response."))
62
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- i_say = f'我想让你做一个论文写作导师。您的任务是使用人工智能工具(例如自然语言处理)提供有关如何改进其上述文章的反馈。' \
66
- f'您还应该利用您在有效写作技巧方面的修辞知识和经验来建议作者可以更好地以书面形式表达他们的想法和想法的方法。' \
67
- f'根据你之前的分析,提出建议'
68
- chatbot.append((i_say, "[Local Message] waiting gpt response."))
69
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
70
-
71
- """
72
-
73
- if not fast_debug:
74
- msg = '正常'
75
- # ** gpt request **
76
- gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
77
- inputs=i_say,
78
- inputs_show_user=i_say,
79
- llm_kwargs=llm_kwargs,
80
- chatbot=chatbot,
81
- history=history,
82
- sys_prompt="总结文章。"
83
- ) # 带超时倒计时
84
- chatbot[-1] = (i_say, gpt_say)
85
- history.append(i_say)
86
- history.append(gpt_say)
87
- yield from update_ui(chatbot=chatbot, history=history, msg=msg) # 刷新界面
88
  res = write_results_to_file(history)
89
  chatbot.append(("完成了吗?", res))
90
- yield from update_ui(chatbot=chatbot, history=history, msg=msg) # 刷新界面
 
 
 
 
91
 
92
 
93
  @CatchException
@@ -123,11 +111,11 @@ def 总结word文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_pr
123
  return
124
 
125
  # 搜索需要处理的文件清单
126
- file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.docx', recursive=True)] + \
127
- [f for f in glob.glob(f'{project_folder}/**/*.doc', recursive=True)]
128
- # [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + \
129
- # [f for f in glob.glob(f'{project_folder}/**/*.cpp', recursive=True)] + \
130
- # [f for f in glob.glob(f'{project_folder}/**/*.c', recursive=True)]
131
 
132
  # 如果没找到任何文件
133
  if len(file_manifest) == 0:
 
8
  import time, os
9
  # pip install python-docx 用于docx格式,跨平台
10
  # pip install pywin32 用于doc格式,仅支持Win平台
 
 
11
  for index, fp in enumerate(file_manifest):
12
  if fp.split(".")[-1] == "docx":
13
  from docx import Document
 
27
  word.Quit()
28
 
29
  print(file_content)
 
 
30
  # private_upload里面的文件名在解压zip后容易出现乱码(rar和7z格式正常),故可以只分析文章内容,不输入文件名
31
+ from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
32
+ from request_llm.bridge_all import model_info
33
+ max_token = model_info[llm_kwargs['llm_model']]['max_token']
34
+ TOKEN_LIMIT_PER_FRAGMENT = max_token * 3 // 4
35
+ paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
36
+ txt=file_content,
37
+ get_token_fn=model_info[llm_kwargs['llm_model']]['token_cnt'],
38
+ limit=TOKEN_LIMIT_PER_FRAGMENT
39
+ )
40
+ this_paper_history = []
41
+ for i, paper_frag in enumerate(paper_fragments):
42
+ i_say = f'请对下面的文章片段用中文做概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{paper_frag}```'
43
+ i_say_show_user = f'请对下面的文章片段做概述: {os.path.abspath(fp)}的第{i+1}/{len(paper_fragments)}个片段。'
44
  gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
45
  inputs=i_say,
46
  inputs_show_user=i_say_show_user,
 
48
  chatbot=chatbot,
49
  history=[],
50
  sys_prompt="总结文章。"
51
+ )
52
+
53
  chatbot[-1] = (i_say_show_user, gpt_say)
54
+ history.extend([i_say_show_user,gpt_say])
55
+ this_paper_history.extend([i_say_show_user,gpt_say])
 
 
 
 
 
 
 
 
56
 
57
+ # 已经对该文章的所有片段总结完毕,如果文章被切分了,
58
+ if len(paper_fragments) > 1:
59
+ i_say = f"根据以上的对话,总结文章{os.path.abspath(fp)}的主要内容。"
60
+ gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
61
+ inputs=i_say,
62
+ inputs_show_user=i_say,
63
+ llm_kwargs=llm_kwargs,
64
+ chatbot=chatbot,
65
+ history=this_paper_history,
66
+ sys_prompt="总结文章。"
67
+ )
68
+
69
+ history.extend([i_say,gpt_say])
70
+ this_paper_history.extend([i_say,gpt_say])
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  res = write_results_to_file(history)
73
  chatbot.append(("完成了吗?", res))
74
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
75
+
76
+ res = write_results_to_file(history)
77
+ chatbot.append(("所有文件都总结完成了吗?", res))
78
+ yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
79
 
80
 
81
  @CatchException
 
111
  return
112
 
113
  # 搜索需要处理的文件清单
114
+ if txt.endswith('.docx') or txt.endswith('.doc'):
115
+ file_manifest = [txt]
116
+ else:
117
+ file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.docx', recursive=True)] + \
118
+ [f for f in glob.glob(f'{project_folder}/**/*.doc', recursive=True)]
119
 
120
  # 如果没找到任何文件
121
  if len(file_manifest) == 0:
request_llm/bridge_all.py CHANGED
@@ -8,6 +8,7 @@
8
  具备多线程调用能力的函数
9
  2. predict_no_ui_long_connection:在实验过程中发现调用predict_no_ui处理长文档时,和openai的连接容易断掉,这个函数用stream的方式解决这个问题,同样支持多线程
10
  """
 
11
 
12
  from concurrent.futures import ThreadPoolExecutor
13
 
@@ -31,6 +32,43 @@ methods = {
31
  "tgui-ui": tgui_ui,
32
  }
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def LLM_CATCH_EXCEPTION(f):
35
  """
36
  装饰器函数,将错误显示出来
@@ -47,7 +85,7 @@ def LLM_CATCH_EXCEPTION(f):
47
  return tb_str
48
  return decorated
49
 
50
- colors = ['#FF00FF', '#00FFFF', '#FF0000''#990099', '#009999', '#990044']
51
 
52
  def predict_no_ui_long_connection(inputs, llm_kwargs, history, sys_prompt, observe_window, console_slience=False):
53
  """
 
8
  具备多线程调用能力的函数
9
  2. predict_no_ui_long_connection:在实验过程中发现调用predict_no_ui处理长文档时,和openai的连接容易断掉,这个函数用stream的方式解决这个问题,同样支持多线程
10
  """
11
+ import tiktoken
12
 
13
  from concurrent.futures import ThreadPoolExecutor
14
 
 
32
  "tgui-ui": tgui_ui,
33
  }
34
 
35
def _build_model_entry(max_token, tokenizer_model):
    """Build one ``model_info`` record.

    Args:
        max_token: Value stored under ``"max_token"`` for the model.
        tokenizer_model: Model name passed to ``tiktoken.encoding_for_model``
            to obtain the tokenizer used for this entry.

    Returns:
        A dict with the keys ``"max_token"``, ``"tokenizer"`` and
        ``"token_cnt"`` (a callable mapping a text to its token count).
    """
    # Resolve the encoding once and share it between "tokenizer" and
    # "token_cnt" instead of looking it up again on every count.
    tokenizer = tiktoken.encoding_for_model(tokenizer_model)
    return {
        "max_token": max_token,
        "tokenizer": tokenizer,
        # disallowed_special=() is kept from the original call; per the
        # tiktoken docs it turns off the error raised for special-token text.
        "token_cnt": lambda txt: len(tokenizer.encode(txt, disallowed_special=())),
    }


model_info = {
    # openai
    "gpt-3.5-turbo": _build_model_entry(4096, "gpt-3.5-turbo"),
    # NOTE(review): 4096 is kept unchanged; confirm whether gpt-4 should use a
    # larger limit here.
    "gpt-4": _build_model_entry(4096, "gpt-4"),

    # api_2d
    "gpt-3.5-turbo-api2d": _build_model_entry(4096, "gpt-3.5-turbo"),
    "gpt-4-api2d": _build_model_entry(4096, "gpt-4"),

    # chatglm (token counting reuses the gpt-3.5-turbo tokenizer)
    "chatglm": _build_model_entry(1024, "gpt-3.5-turbo"),
}
70
+
71
+
72
  def LLM_CATCH_EXCEPTION(f):
73
  """
74
  装饰器函数,将错误显示出来
 
85
  return tb_str
86
  return decorated
87
 
88
+ colors = ['#FF00FF', '#00FFFF', '#FF0000', '#990099', '#009999', '#990044']
89
 
90
  def predict_no_ui_long_connection(inputs, llm_kwargs, history, sys_prompt, observe_window, console_slience=False):
91
  """