patharanor committed on
Commit
2f990e6
·
1 Parent(s): eb69374

feat: optimized performance of pretty func when executing long words

Browse files
Files changed (2) hide show
  1. app.py +7 -1
  2. utils/thai_word.py +16 -10
app.py CHANGED
@@ -4,6 +4,8 @@ import numpy as np
4
  from transformers import pipeline
5
  from utils.thai_word import ThaiWord
6
  from pythainlp.tokenize import word_tokenize
 
 
7
 
8
  MODEL_NAME = "biodatlab/whisper-th-medium-combined"
9
  DEVICE = 0 if torch.cuda.is_available() else "cpu"
@@ -37,7 +39,11 @@ def transcribe(audio):
37
  # pretty text
38
  tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
39
  print(tokens)
40
- result = f'pretty: {thw.pretty(tokens)}\n\n original: {text}'
 
 
 
 
41
  else:
42
  result = 'โปรดลองพูดอีกครั้ง'
43
  except Exception as e:
 
4
  from transformers import pipeline
5
  from utils.thai_word import ThaiWord
6
  from pythainlp.tokenize import word_tokenize
7
+ from collections import deque
8
+ from copy import deepcopy
9
 
10
  MODEL_NAME = "biodatlab/whisper-th-medium-combined"
11
  DEVICE = 0 if torch.cuda.is_available() else "cpu"
 
39
  # pretty text
40
  tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
41
  print(tokens)
42
+
43
+ if len(tokens) > 0:
44
+ result = f'pretty: {thw.pretty(deque(deepcopy(tokens)))}\n\n original: {text}'
45
+ else:
46
+ result = 'pretty: ไม่สามารถตัดคำได้'
47
  else:
48
  result = 'โปรดลองพูดอีกครั้ง'
49
  except Exception as e:
utils/thai_word.py CHANGED
@@ -1,4 +1,5 @@
1
  from pythainlp.util import text_to_num, text_to_arabic_digit
 
2
 
3
  class ThaiWord:
4
 
@@ -24,40 +25,45 @@ class ThaiWord:
24
  else:
25
  try:
26
  num = text_to_num("".join(words))
 
 
27
  if len(num) > 0:
28
  num = num[0]
 
 
29
  except Exception:
30
  for word in words:
31
  num = f'{num}{text_to_arabic_digit(word)}'
32
 
33
- return f' {int(num):,} '
34
 
35
- def pretty(self, words) -> str:
36
  has_start_number = False
37
  number = []
38
- text = []
39
 
40
- for idx, word in enumerate(words):
 
41
  if has_start_number:
42
  if self.is_number(word) or self.is_digit(word):
43
  number.append(word)
44
  else:
45
- text.append(self.words_to_number(number))
46
  has_start_number = False
47
  number.clear()
48
 
 
49
  if not has_start_number:
50
  if self.is_start_number(word):
51
  has_start_number = True
52
  number.append(word)
53
  else:
54
- text.append(word)
55
 
56
- if idx == len(words)-1 and len(number) > 0:
57
- text.append(self.words_to_number(number))
58
 
59
-
60
- return ''.join(text)
61
 
62
  def is_start_number(self, word) -> bool:
63
  has_start_number = False
 
1
  from pythainlp.util import text_to_num, text_to_arabic_digit
2
+ from collections import deque
3
 
4
  class ThaiWord:
5
 
 
25
  else:
26
  try:
27
  num = text_to_num("".join(words))
28
+
29
+ # get numeric only in sentence
30
  if len(num) > 0:
31
  num = num[0]
32
+ num = f' {int(num):,} '
33
+
34
  except Exception:
35
  for word in words:
36
  num = f'{num}{text_to_arabic_digit(word)}'
37
 
38
+ return num
39
 
40
+ def pretty(self, words: deque) -> str:
41
  has_start_number = False
42
  number = []
43
+ text = ''
44
 
45
+ while len(words) > 0:
46
+ word = words.popleft()
47
  if has_start_number:
48
  if self.is_number(word) or self.is_digit(word):
49
  number.append(word)
50
  else:
51
+ text = f'{text}{self.words_to_number(number)}'
52
  has_start_number = False
53
  number.clear()
54
 
55
+ # detect the first numeric in sentence
56
  if not has_start_number:
57
  if self.is_start_number(word):
58
  has_start_number = True
59
  number.append(word)
60
  else:
61
+ text = f'{text}{word}'
62
 
63
+ if len(words) == 0 and len(number) > 0:
64
+ text = f'{text}{self.words_to_number(number)}'
65
 
66
+ return text
 
67
 
68
  def is_start_number(self, word) -> bool:
69
  has_start_number = False