worldqwq committed on
Commit
31aaedc
·
1 Parent(s): 4c3dc51

Modified Suggestion function with argument word and threshold

Browse files
Files changed (1) hide show
  1. SRT.py +30 -8
SRT.py CHANGED
@@ -405,7 +405,25 @@ class SRT_script():
405
  seg.source_text = seg.source_text.lower().replace(word, term_enzh_dict.get(word))
406
  logging.info("replace term: " + word + " --> " + term_enzh_dict.get(word) + " in time stamp {}".format(i+1))
407
  logging.info("source text becomes: " + seg.source_text)
408
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  def spell_check_term(self):
410
  ## known bug: I've will be replaced because i've is not in the dict
411
  logging.info("performing spell check")
@@ -419,17 +437,21 @@ class SRT_script():
419
  word = ready_words[i]
420
  [real_word, pos] = self.get_real_word(word)
421
  if not dict.check(word[:pos]):
422
- suggest = term_spellDict.suggest(real_word)
423
- if suggest and enchant.utils.levenshtein(word, suggest[0]) < (len(word)+len(suggest[0]))/4: # relax spell check
 
 
 
 
424
 
425
  # with open("dislog.log","a") as log:
426
  # if not os.path.exists("dislog.log"):
427
  # log.write("word \t suggest \t levenshtein \n")
428
- logging.info(word + "\t" + suggest[0] + "\t" + str(enchant.utils.levenshtein(word, suggest[0]))+'\n')
429
- #print(word + ":" + suggest[0] + ":---:levenshtein:" + str(enchant.utils.levenshtein(word, suggest[0])))
430
- new_word = word.replace(word[:pos],suggest[0])
431
- else:
432
- new_word = word
433
  else:
434
  new_word = word
435
  ready_words[i] = new_word
 
405
  seg.source_text = seg.source_text.lower().replace(word, term_enzh_dict.get(word))
406
  logging.info("replace term: " + word + " --> " + term_enzh_dict.get(word) + " in time stamp {}".format(i+1))
407
  logging.info("source text becomes: " + seg.source_text)
408
+
409
+
410
+ comp_dict = []
411
+
412
def fetchfunc(self, word, threshold):
    """Return the glossary term closest to *word*, or *word* unchanged.

    The glossary is read from ``./finetune_data/dict_enzh.csv`` the first
    time it is needed and cached on ``self.comp_dict`` (first CSV column
    as key, second as value). Every key is compared with *word* by
    Levenshtein distance; the nearest key is returned only when its
    distance is strictly below ``threshold * len(word)``.

    Args:
        word: The token to look up.
        threshold: Maximum allowed edit distance, expressed as a fraction
            of ``len(word)``.

    Returns:
        The nearest glossary key when it is close enough, otherwise
        *word* itself.
    """
    import enchant

    # Lazy one-time load; an empty cache (list or dict) triggers the read.
    if not self.comp_dict:
        with open("./finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
            # Rows with fewer than two columns (e.g. blank lines) are
            # skipped instead of raising IndexError.
            self.comp_dict = {
                rows[0]: rows[1] for rows in reader(f) if len(rows) >= 2
            }

    levenshtein = enchant.utils.levenshtein
    max_distance = threshold * len(word)

    # Seed with the empty string so that a candidate must beat len(word)
    # edits before it is considered at all. The best distance is tracked
    # alongside the best term so it is not recomputed on every iteration.
    best_term = ""
    best_distance = levenshtein(word, best_term)
    for candidate in self.comp_dict:
        distance = levenshtein(word, candidate)
        if distance < best_distance:
            best_term = candidate
            best_distance = distance

    # Never hand back the empty seed as a "correction".
    if best_term and best_distance < max_distance:
        return best_term
    return word
426
+
427
  def spell_check_term(self):
428
  ## known bug: I've will be replaced because i've is not in the dict
429
  logging.info("performing spell check")
 
437
  word = ready_words[i]
438
  [real_word, pos] = self.get_real_word(word)
439
  if not dict.check(word[:pos]):
440
+ new_word = word.replace(word[:pos],self.fetchfunc(word[:pos],0.5))
441
+
442
+ logging.info(word + "\t" + self.fetchfunc(word[:pos],0.5) + "\t" + str(enchant.utils.levenshtein(word, self.fetchfunc(word[:pos],0.5)))+'\n')
443
+
444
+ #suggest = term_spellDict.suggest(real_word)
445
+ #if suggest and enchant.utils.levenshtein(word, suggest[0]) < (len(word)+len(suggest[0]))/4: # relax spell check
446
 
447
  # with open("dislog.log","a") as log:
448
  # if not os.path.exists("dislog.log"):
449
  # log.write("word \t suggest \t levenshtein \n")
450
+ # logging.info(word + "\t" + suggest[0] + "\t" + str(enchant.utils.levenshtein(word, suggest[0]))+'\n')
451
+ # #print(word + ":" + suggest[0] + ":---:levenshtein:" + str(enchant.utils.levenshtein(word, suggest[0])))
452
+ # new_word = word.replace(word[:pos],suggest[0])
453
+ #else:
454
+ # new_word = word
455
  else:
456
  new_word = word
457
  ready_words[i] = new_word