qingxu99 committed on
Commit
3aa446c
·
1 Parent(s): 23c1b14

修复代码英文重构Bug

Browse files
.gitignore CHANGED
@@ -139,4 +139,5 @@ config_private.py
139
  gpt_log
140
  private.md
141
  private_upload
142
- other_llms
 
 
139
  gpt_log
140
  private.md
141
  private_upload
142
+ other_llms
143
+ cradle.py
crazy_functions/代码重写为全英文_多线程.py CHANGED
@@ -1,41 +1,126 @@
1
  import threading
2
  from predict import predict_no_ui_long_connection
3
- from toolbox import CatchException, write_results_to_file
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
 
7
  @CatchException
8
  def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt, WEB_PORT):
9
- history = [] # 清空历史,以免输入溢出
10
- # 集合文件
11
- import time, glob, os
 
 
 
 
 
 
 
 
 
 
 
 
12
  os.makedirs('gpt_log/generated_english_version', exist_ok=True)
13
  os.makedirs('gpt_log/generated_english_version/crazy_functions', exist_ok=True)
14
  file_manifest = [f for f in glob.glob('./*.py') if ('test_project' not in f) and ('gpt_log' not in f)] + \
15
  [f for f in glob.glob('./crazy_functions/*.py') if ('test_project' not in f) and ('gpt_log' not in f)]
 
16
  i_say_show_user_buffer = []
17
 
18
- # 随便显示点什么防止卡顿的感觉
19
  for index, fp in enumerate(file_manifest):
20
  # if 'test_project' in fp: continue
21
  with open(fp, 'r', encoding='utf-8') as f:
22
  file_content = f.read()
23
- i_say_show_user =f'[{index}/{len(file_manifest)}] 接下来请将以下代码中包含的所有中文转化为英文,只输出代码: {os.path.abspath(fp)}'
24
  i_say_show_user_buffer.append(i_say_show_user)
25
  chatbot.append((i_say_show_user, "[Local Message] 等待多线程操作,中间过程不予显示."))
26
  yield chatbot, history, '正常'
27
 
28
- # 任务函数
 
 
 
 
 
 
 
 
 
 
29
  mutable_return = [None for _ in file_manifest]
 
30
  def thread_worker(fp,index):
 
 
 
31
  with open(fp, 'r', encoding='utf-8') as f:
32
  file_content = f.read()
33
- i_say = f'接下来请将以下代码中包含的所有中文转化为英文,只输出代码,文件名是{fp},文件代码是 ```{file_content}```'
34
- # ** gpt request **
35
- gpt_say = predict_no_ui_long_connection(inputs=i_say, top_p=top_p, temperature=temperature, history=history, sys_prompt=sys_prompt)
36
- mutable_return[index] = gpt_say
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- # 所有线程同时开始执行任务函数
39
  handles = [threading.Thread(target=thread_worker, args=(fp,index)) for index, fp in enumerate(file_manifest)]
40
  for h in handles:
41
  h.daemon = True
@@ -43,19 +128,23 @@ def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt,
43
  chatbot.append(('开始了吗?', f'多线程操作已经开始'))
44
  yield chatbot, history, '正常'
45
 
46
- # 循环轮询各个线程是否执行完毕
47
  cnt = 0
48
  while True:
49
- time.sleep(1)
 
50
  th_alive = [h.is_alive() for h in handles]
51
  if not any(th_alive): break
52
- stat = ['执行中' if alive else '已完成' for alive in th_alive]
53
- stat_str = '|'.join(stat)
54
- cnt += 1
55
- chatbot[-1] = (chatbot[-1][0], f'多线程操作已经开始,完成情况: {stat_str}' + ''.join(['.']*(cnt%4)))
 
 
 
56
  yield chatbot, history, '正常'
57
 
58
- # 把结果写入文件
59
  for index, h in enumerate(handles):
60
  h.join() # 这里其实不需要join了,肯定已经都结束了
61
  fp = file_manifest[index]
@@ -63,13 +152,17 @@ def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt,
63
  i_say_show_user = i_say_show_user_buffer[index]
64
 
65
  where_to_relocate = f'gpt_log/generated_english_version/{fp}'
66
- with open(where_to_relocate, 'w+', encoding='utf-8') as f: f.write(gpt_say.lstrip('```').rstrip('```'))
 
 
 
 
67
  chatbot.append((i_say_show_user, f'[Local Message] 已完成{os.path.abspath(fp)}的转化,\n\n存入{os.path.abspath(where_to_relocate)}'))
68
  history.append(i_say_show_user); history.append(gpt_say)
69
  yield chatbot, history, '正常'
70
  time.sleep(1)
71
 
72
- # 备份一个文件
73
  res = write_results_to_file(history)
74
  chatbot.append(("生成一份任务执行报告", res))
75
  yield chatbot, history, '正常'
 
1
  import threading
2
  from predict import predict_no_ui_long_connection
3
+ from toolbox import CatchException, write_results_to_file, report_execption
4
 
5
def extract_code_block_carefully(txt):
    """Return the text enclosed between the first and the last ``` fence.

    The input is split on the triple-backtick fence. With zero or one fence
    there is no closed block to extract, so the input is returned untouched.
    Otherwise everything before the first fence and after the last fence is
    dropped; any fences in between are kept verbatim. Text following the
    opening fence on the same line (e.g. a language tag) is NOT removed.
    """
    fence = '```'
    _before_first_fence, *after_first_fence = txt.split(fence)
    # len(after_first_fence) equals the number of fences found in txt.
    if len(after_first_fence) <= 1:
        return txt
    # Drop the trailing segment (text after the last fence) and restore the
    # inner fences that the split consumed.
    return fence.join(after_first_fence[:-1])
12
+
13
def breakdown_txt_to_satisfy_token_limit(txt, limit, must_break_at_empty_line=True, get_token_fn=None):
    """Split ``txt`` at line boundaries into pieces of at most ``limit`` tokens.

    The text is cut recursively: a cut line is estimated from the ratio
    ``limit / token_count``, then the cut moves upwards line by line until the
    leading part is below ``limit``. A first pass only cuts at empty lines; if
    that is impossible a second pass cuts at any line break.

    Joining the returned pieces with ``"\\n"`` reproduces ``txt`` exactly.

    Args:
        txt: The text to split.
        limit: Maximum number of tokens allowed per piece.
        must_break_at_empty_line: Kept for interface compatibility. It is not
            consulted: the empty-line pass is always attempted first and the
            any-line pass is always the fallback.
        get_token_fn: Optional callable mapping a string to its token count.
            When omitted, the GPT-2 tokenizer from ``transformers`` is loaded
            and used.

    Returns:
        A list of strings, each within the token limit.

    Raises:
        RuntimeError: If even the any-line pass cannot find a usable cut, e.g.
            because a single line is longer than ``limit`` tokens.
    """
    if get_token_fn is None:
        # Imported lazily so that the module can be loaded without transformers.
        from transformers import GPT2TokenizerFast
        tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

        def count_tokens(text):
            return len(tokenizer(text)["input_ids"])
    else:
        count_tokens = get_token_fn

    def cut(txt_tocut, break_at_empty_line):  # recursive
        total_tokens = count_tokens(txt_tocut)
        if total_tokens <= limit:
            return [txt_tocut]
        lines = txt_tocut.split('\n')
        # Proportional guess of the line where the token budget is exhausted.
        estimated_line_cut = int(limit / total_tokens * len(lines))
        # Walk upwards from the estimate. Index 0 is excluded because cutting
        # there yields an empty head and makes no progress; an empty range
        # (estimate of 0 or 1) falls through to the RuntimeError below instead
        # of leaving the loop variable unbound.
        for cnt in reversed(range(1, estimated_line_cut)):
            if break_at_empty_line and lines[cnt] != "":
                continue
            prev = "\n".join(lines[:cnt])
            if count_tokens(prev) < limit:
                post = "\n".join(lines[cnt:])
                # Chain the head with the recursively split remainder.
                return [prev] + cut(post, break_at_empty_line)
        raise RuntimeError("存在一行极长的文本!")

    try:
        return cut(txt, True)
    except RuntimeError:
        # No acceptable empty-line cut exists; retry allowing any line break.
        return cut(txt, False)
43
+
44
+
45
def break_txt_into_half_at_some_linebreak(txt):
    """Split ``txt`` into two halves at the line break nearest the middle line.

    The text is split on ``"\\n"``; the first ``n // 2`` lines form the first
    half and the remaining lines form the second half. Returns a 2-tuple of
    strings ``(first_half, second_half)``; for a single-line input the first
    half is the empty string.
    """
    rows = txt.split('\n')
    midpoint = len(rows) // 2
    halves = (rows[:midpoint], rows[midpoint:])
    return tuple("\n".join(half) for half in halves)
51
 
52
 
53
  @CatchException
54
  def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt, WEB_PORT):
55
+ # 第1步:清空历史,以免输入溢出
56
+ history = []
57
+
58
+ # 第2步:尝试导入依赖,如果缺少依赖,则给出安装建议
59
+ try:
60
+ import openai, transformers
61
+ except:
62
+ report_execption(chatbot, history,
63
+ a = f"解析项目: {txt}",
64
+ b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade openai transformers```。")
65
+ yield chatbot, history, '正常'
66
+ return
67
+
68
+ # 第3步:集合文件
69
+ import time, glob, os, shutil, re, openai
70
  os.makedirs('gpt_log/generated_english_version', exist_ok=True)
71
  os.makedirs('gpt_log/generated_english_version/crazy_functions', exist_ok=True)
72
  file_manifest = [f for f in glob.glob('./*.py') if ('test_project' not in f) and ('gpt_log' not in f)] + \
73
  [f for f in glob.glob('./crazy_functions/*.py') if ('test_project' not in f) and ('gpt_log' not in f)]
74
+ # file_manifest = ['./toolbox.py']
75
  i_say_show_user_buffer = []
76
 
77
+ # 第4步:随便显示点什么防止卡顿的感觉
78
  for index, fp in enumerate(file_manifest):
79
  # if 'test_project' in fp: continue
80
  with open(fp, 'r', encoding='utf-8') as f:
81
  file_content = f.read()
82
+ i_say_show_user =f'[{index}/{len(file_manifest)}] 接下来请将以下代码中包含的所有中文转化为英文,只输出转化后的英文代码,请用代码块输出代码: {os.path.abspath(fp)}'
83
  i_say_show_user_buffer.append(i_say_show_user)
84
  chatbot.append((i_say_show_user, "[Local Message] 等待多线程操作,中间过程不予显示."))
85
  yield chatbot, history, '正常'
86
 
87
+
88
+ # 第5步:Token限制下的截断与处理
89
+ MAX_TOKEN = 2500
90
+ # from transformers import GPT2TokenizerFast
91
+ # print('加载tokenizer中')
92
+ # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
93
+ # get_token_cnt = lambda txt: len(tokenizer(txt)["input_ids"])
94
+ # print('加载tokenizer结束')
95
+
96
+
97
+ # 第6步:任务函数
98
  mutable_return = [None for _ in file_manifest]
99
+ observe_window = [[""] for _ in file_manifest]
100
  def thread_worker(fp,index):
101
+ if index > 10:
102
+ time.sleep(60)
103
+ print('Openai 限制免费用户每分钟20次请求,降低请求频率中。')
104
  with open(fp, 'r', encoding='utf-8') as f:
105
  file_content = f.read()
106
+ i_say_template = lambda fp, file_content: f'接下来请将以下代码中包含的所有中文转化为英文,只输出代码,文件名是{fp},文件代码是 ```{file_content}```'
107
+ try:
108
+ gpt_say = ""
109
+ # 分解代码文件
110
+ file_content_breakdown = breakdown_txt_to_satisfy_token_limit(file_content, MAX_TOKEN)
111
+ for file_content_partial in file_content_breakdown:
112
+ i_say = i_say_template(fp, file_content_partial)
113
+ # # ** gpt request **
114
+ gpt_say_partial = predict_no_ui_long_connection(inputs=i_say, top_p=top_p, temperature=temperature, history=[], sys_prompt=sys_prompt, observe_window=observe_window[index])
115
+ gpt_say_partial = extract_code_block_carefully(gpt_say_partial)
116
+ gpt_say += gpt_say_partial
117
+ mutable_return[index] = gpt_say
118
+ except ConnectionAbortedError as token_exceed_err:
119
+ print('至少一个线程任务Token溢出而失败', e)
120
+ except Exception as e:
121
+ print('至少一个线程任务意外失败', e)
122
 
123
+ # 第7步:所有线程同时开始执行任务函数
124
  handles = [threading.Thread(target=thread_worker, args=(fp,index)) for index, fp in enumerate(file_manifest)]
125
  for h in handles:
126
  h.daemon = True
 
128
  chatbot.append(('开始了吗?', f'多线程操作已经开始'))
129
  yield chatbot, history, '正常'
130
 
131
+ # 第8步:循环轮询各个线程是否执行完毕
132
  cnt = 0
133
  while True:
134
+ cnt += 1
135
+ time.sleep(0.2)
136
  th_alive = [h.is_alive() for h in handles]
137
  if not any(th_alive): break
138
+ # 更好的UI视觉效果
139
+ observe_win = []
140
+ for thread_index, alive in enumerate(th_alive):
141
+ observe_win.append("[ ..."+observe_window[thread_index][0][-60:].replace('\n','').replace('```','...').replace(' ','.').replace('<br/>','.....').replace('$','.')+"... ]")
142
+ stat = [f'执行中: {obs}\n\n' if alive else '已完成\n\n' for alive, obs in zip(th_alive, observe_win)]
143
+ stat_str = ''.join(stat)
144
+ chatbot[-1] = (chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + ''.join(['.']*(cnt%10+1)))
145
  yield chatbot, history, '正常'
146
 
147
+ # 第9步:把结果写入文件
148
  for index, h in enumerate(handles):
149
  h.join() # 这里其实不需要join了,肯定已经都结束了
150
  fp = file_manifest[index]
 
152
  i_say_show_user = i_say_show_user_buffer[index]
153
 
154
  where_to_relocate = f'gpt_log/generated_english_version/{fp}'
155
+ if gpt_say is not None:
156
+ with open(where_to_relocate, 'w+', encoding='utf-8') as f:
157
+ f.write(gpt_say)
158
+ else: # 失败
159
+ shutil.copyfile(file_manifest[index], where_to_relocate)
160
  chatbot.append((i_say_show_user, f'[Local Message] 已完成{os.path.abspath(fp)}的转化,\n\n存入{os.path.abspath(where_to_relocate)}'))
161
  history.append(i_say_show_user); history.append(gpt_say)
162
  yield chatbot, history, '正常'
163
  time.sleep(1)
164
 
165
+ # 第10步:备份一个文件
166
  res = write_results_to_file(history)
167
  chatbot.append(("生成一份任务执行报告", res))
168
  yield chatbot, history, '正常'
predict.py CHANGED
@@ -71,9 +71,10 @@ def predict_no_ui(inputs, top_p, temperature, history=[], sys_prompt=""):
71
  raise ConnectionAbortedError("Json解析不合常规,可能是文本过长" + response.text)
72
 
73
 
74
- def predict_no_ui_long_connection(inputs, top_p, temperature, history=[], sys_prompt=""):
75
  """
76
  发送至chatGPT,等待回复,一次性完成,不显示中间过程。但内部用stream的方法避免有人中途掐网线。
 
77
  """
78
  headers, payload = generate_payload(inputs, top_p, temperature, history, system_prompt=sys_prompt, stream=True)
79
 
@@ -105,7 +106,10 @@ def predict_no_ui_long_connection(inputs, top_p, temperature, history=[], sys_pr
105
  delta = json_data["delta"]
106
  if len(delta) == 0: break
107
  if "role" in delta: continue
108
- if "content" in delta: result += delta["content"]; print(delta["content"], end='')
 
 
 
109
  else: raise RuntimeError("意外Json结构:"+delta)
110
  if json_data['finish_reason'] == 'length':
111
  raise ConnectionAbortedError("正常结束,但显示Token不足。")
 
71
  raise ConnectionAbortedError("Json解析不合常规,可能是文本过长" + response.text)
72
 
73
 
74
+ def predict_no_ui_long_connection(inputs, top_p, temperature, history=[], sys_prompt="", observe_window=None):
75
  """
76
  发送至chatGPT,等待回复,一次性完成,不显示中间过程。但内部用stream的方法避免有人中途掐网线。
77
+ observe_window:用于负责跨越线程传递已经输出的部分,大部分时候仅仅为了fancy的视觉效果,留空即可
78
  """
79
  headers, payload = generate_payload(inputs, top_p, temperature, history, system_prompt=sys_prompt, stream=True)
80
 
 
106
  delta = json_data["delta"]
107
  if len(delta) == 0: break
108
  if "role" in delta: continue
109
+ if "content" in delta:
110
+ result += delta["content"]
111
+ print(delta["content"], end='')
112
+ if observe_window is not None: observe_window[0] += delta["content"]
113
  else: raise RuntimeError("意外Json结构:"+delta)
114
  if json_data['finish_reason'] == 'length':
115
  raise ConnectionAbortedError("正常结束,但显示Token不足。")
requirements.txt CHANGED
@@ -3,3 +3,5 @@ requests[socks]
3
  mdtex2html
4
  Markdown
5
  latex2mathml
 
 
 
3
  mdtex2html
4
  Markdown
5
  latex2mathml
6
+ openai
7
+ transformers