qingxu99 committed
Commit f05862c
1 Parent(s): fc762cb

Json is good

crazy_functions/批量总结PDF文档.py CHANGED
@@ -41,8 +41,8 @@ def clean_text(raw_text):
     """
     对从 PDF 提取出的原始文本进行清洗和格式化处理。
     1. 对原始文本进行归一化处理。
-    2. 替换跨行的连词,例如 “Espe-\ncially” 转换为 “Especially”。
-    3. 根据 heuristic 规则判断换行符是否是段落分隔,并相应地进行替换。
+    2. 替换跨行的连词
+    3. 根据 heuristic 规则判断换行符是否是段落分隔,并相应地进行替换
     """
     # 对文本进行归一化处理
     normalized_text = normalize_text(raw_text)
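For context, the docstring of clean_text (in crazy_functions/批量总结PDF文档.py) says the function 1) normalizes the raw text pulled out of the PDF, 2) re-joins words hyphenated across line breaks (the removed example: “Espe-\ncially” becomes “Especially”), and 3) applies a heuristic to decide whether each newline is a real paragraph break. The snippet below is only a rough sketch of that kind of cleanup, not the repository's implementation: unicodedata.normalize stands in for the project's normalize_text, and the sentence-punctuation heuristic is an assumption.

import re
import unicodedata

def clean_text_sketch(raw_text: str) -> str:
    # 1. Normalize the text (NFKC here is only a stand-in for normalize_text).
    text = unicodedata.normalize("NFKC", raw_text)
    # 2. Merge words split across lines: "Espe-\ncially" -> "Especially".
    text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)
    # 3. Heuristic: a newline after sentence-ending punctuation starts a new
    #    paragraph; any other newline is treated as a soft wrap.
    merged = []
    for line in text.split("\n"):
        if merged and not merged[-1].rstrip().endswith((".", "!", "?", ":")):
            merged[-1] = merged[-1].rstrip() + " " + line.lstrip()
        else:
            merged.append(line)
    return "\n\n".join(l.strip() for l in merged if l.strip())

print(clean_text_sketch("Espe-\ncially the intro-\nduction.\nNext paragraph."))
# Especially the introduction.
#
# Next paragraph.
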
docs/translate_english.json CHANGED
The diff for this file is too large to render. See raw diff
 
multi_language.py CHANGED
@@ -110,7 +110,7 @@ def read_map_from_json(language):
     if os.path.exists(f'docs/translate_{language.lower()}.json'):
         with open(f'docs/translate_{language.lower()}.json', 'r', encoding='utf8') as f:
             res = json.load(f)
-            res = {k:v for k, v in res.items() if v is not None}
+            res = {k:v for k, v in res.items() if v is not None and contains_chinese(k)}
             return res
     return {}
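The new dictionary filter also drops cached entries whose key contains no Chinese characters, using contains_chinese, which is presumably defined elsewhere in multi_language.py (it is not part of this diff). A minimal re-implementation, written here purely as an assumption of what such a check does:

import re

def contains_chinese(text):
    # Hypothetical stand-in for the helper used above; the real one may differ.
    # True if the string holds at least one CJK unified ideograph.
    return re.search(r"[\u4e00-\u9fff]", text) is not None

print(contains_chinese("translate this"))    # False
print(contains_chinese("批量总结PDF文档"))    # True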
 
@@ -181,6 +181,8 @@ def trans(word_to_translate, language, special=False):
             try:
                 res_before_trans = eval(result[i-1])
                 res_after_trans = eval(result[i])
+                if len(res_before_trans) != len(res_after_trans):
+                    raise RuntimeError
                 for a,b in zip(res_before_trans, res_after_trans):
                     translated_result[a] = b
             except:
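The added length check protects the eval-based path in trans: if the model echoes back fewer (or more) entries than were sent, zip would silently pair up only the common prefix and the mismatch would never surface; raising RuntimeError instead hands control to the surrounding except block, which falls back to None values (see the next hunk's context). A small illustration of the failure mode being guarded against:

# zip stops at the shorter sequence, so a truncated reply would silently drop keys.
keys = ["保存对话", "加载对话", "清空对话"]
values = ["Save chat", "Load chat"]      # one translation missing
print(dict(zip(keys, values)))           # {'保存对话': 'Save chat', '加载对话': 'Load chat'} -- last key lost

try:
    if len(keys) != len(values):
        raise RuntimeError               # mirrors the new guard in trans()
except RuntimeError:
    print("length mismatch detected, falling back to None translations")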
@@ -196,6 +198,57 @@ def trans(word_to_translate, language, special=False):
                 translated_result[a] = None
     return translated_result
 
+
+def trans_json(word_to_translate, language, special=False):
+    if len(word_to_translate) == 0: return {}
+    from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
+    from toolbox import get_conf, ChatBotWithCookies
+    proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \
+        get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY')
+    llm_kwargs = {
+        'api_key': API_KEY,
+        'llm_model': LLM_MODEL,
+        'top_p':1.0,
+        'max_length': None,
+        'temperature':0.1,
+    }
+    import random
+    N_EACH_REQ = random.randint(16, 32)
+    random.shuffle(word_to_translate)
+    word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
+    inputs_array = [{k:"#" for k in s} for s in word_to_translate_split]
+    inputs_array = [ json.dumps(i, ensure_ascii=False) for i in inputs_array]
+
+    inputs_show_user_array = inputs_array
+    history_array = [[] for _ in inputs_array]
+    sys_prompt_array = [f"Replace each json value `#` with translated results in {LANG}, e.g., \"原始文本\":\"TranslatedText\". Keep Json format. Do not answer #." for _ in inputs_array]
+    chatbot = ChatBotWithCookies(llm_kwargs)
+    gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
+        inputs_array,
+        inputs_show_user_array,
+        llm_kwargs,
+        chatbot,
+        history_array,
+        sys_prompt_array,
+    )
+    while True:
+        try:
+            gpt_say = next(gpt_say_generator)
+            print(gpt_say[1][0][1])
+        except StopIteration as e:
+            result = e.value
+            break
+    translated_result = {}
+    for i, r in enumerate(result):
+        if i%2 == 1:
+            try:
+                translated_result.update(json.loads(result[i]))
+            except:
+                print(result[i])
+                print(result)
+    return translated_result
+
+
 def step_1_core_key_translate():
     def extract_chinese_characters(file_path):
         syntax = []
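trans_json is the change behind the commit title: rather than asking the model to return a Python list that trans later parses with eval, each request is now a JSON object whose values are all the placeholder "#", and the reply is merged with json.loads. The function leans on a split_list helper that is not shown in this diff, and LANG is a module-level setting of multi_language.py. The sketch below only illustrates the batching and the request/response format; split_list is re-implemented here as an assumption, and the reply is faked.

import json
import random

def split_list(lst, n):
    # Assumed behaviour of the repo's split_list helper (not shown in this diff):
    # chop a list into chunks of at most n items.
    return [lst[i:i + n] for i in range(0, len(lst), n)]

LANG = "English"   # stand-in for the module-level language setting
word_to_translate = ["批量总结PDF文档", "保存当前的对话", "解析整个Python项目"]

N_EACH_REQ = random.randint(16, 32)
random.shuffle(word_to_translate)
chunks = split_list(word_to_translate, N_EACH_REQ)

# One request payload per chunk: every value starts out as the placeholder "#".
inputs_array = [json.dumps({k: "#" for k in chunk}, ensure_ascii=False) for chunk in chunks]
print(inputs_array[0])   # e.g. {"保存当前的对话": "#", "批量总结PDF文档": "#", "解析整个Python项目": "#"}

# A well-behaved reply keeps the keys and fills in the values, so it can be
# merged into the translation map with a plain json.loads (no eval needed).
fake_reply = '{"保存当前的对话": "Save the current chat", "批量总结PDF文档": "Batch-summarize PDF documents", "解析整个Python项目": "Parse an entire Python project"}'
translated_result = {}
translated_result.update(json.loads(fake_reply))
print(translated_result["保存当前的对话"])   # Save the current chat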
@@ -310,6 +363,7 @@ def step_2_core_key_translate():
        splitted_string = advanced_split(splitted_string, spliter="]", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="【", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="】", include_spliter=False)
+       splitted_string = advanced_split(splitted_string, spliter="?", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=",", include_spliter=False)
@@ -318,6 +372,9 @@ def step_2_core_key_translate():
        splitted_string = advanced_split(splitted_string, spliter=";", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="`", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=" ", include_spliter=False)
+       splitted_string = advanced_split(splitted_string, spliter="- ", include_spliter=False)
+       splitted_string = advanced_split(splitted_string, spliter="---", include_spliter=False)
+
        # --------------------------------------
        for j, s in enumerate(splitted_string): # .com
            if '.com' in s: continue
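The two hunks above simply extend the chain of advanced_split calls with more separators ("?", "- ", "---") so UI strings are cut into smaller fragments before translation. advanced_split itself is defined earlier in multi_language.py and is not part of this diff; the sketch below is only an assumed approximation of its behaviour, splitting every fragment of a list on one separator:

def advanced_split(splitted_string, spliter=" ", include_spliter=False):
    # Assumed approximation for illustration; the real helper may differ.
    out = []
    for s in splitted_string:
        if spliter not in s:
            out.append(s)
            continue
        for part in s.split(spliter):
            if part:
                out.append(part + spliter if include_spliter else part)
    return out

fragments = ["保存当前的对话?", "新功能- 高优先级"]
for sep in ["?", "- "]:
    fragments = advanced_split(fragments, spliter=sep, include_spliter=False)
print(fragments)   # ['保存当前的对话', '新功能', '高优先级']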
@@ -377,7 +434,7 @@ def step_2_core_key_translate():
        need_translate.append(d)


-    up = trans(need_translate, language=LANG, special=False)
+    up = trans_json(need_translate, language=LANG, special=False)
    map_to_json(up, language=LANG)
    cached_translation = read_map_from_json(language=LANG)
    cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))
 