Spaces:

patharanor
/

asr-th

Runtime error

App Files Files Community

patharanor committed on Feb 12, 2024

Commit

2f990e6

1 Parent(s): eb69374

feat: optimized performance of pretty func when executing on long words

Browse files

Files changed (2) hide show

app.py +7 -1
utils/thai_word.py +16 -10

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ import numpy as np
 from transformers import pipeline
 from utils.thai_word import ThaiWord
 from pythainlp.tokenize import word_tokenize
 MODEL_NAME = "biodatlab/whisper-th-medium-combined"
 DEVICE = 0 if torch.cuda.is_available() else "cpu"
@@ -37,7 +39,11 @@ def transcribe(audio):
             # pretty text
             tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
             print(tokens)
-            result = f'pretty: {thw.pretty(tokens)}\n\n original: {text}'
         else:
             result = 'โปรดลองพูดอีกครั้ง'
     except Exception as e:

 from transformers import pipeline
 from utils.thai_word import ThaiWord
 from pythainlp.tokenize import word_tokenize
+from collections import deque
+from copy import deepcopy
 MODEL_NAME = "biodatlab/whisper-th-medium-combined"
 DEVICE = 0 if torch.cuda.is_available() else "cpu"
             # pretty text
             tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
             print(tokens)
+            if len(tokens) > 0:
+                result = f'pretty: {thw.pretty(deque(deepcopy(tokens)))}\n\n original: {text}'
+            else:
+                result = 'pretty: ไม่สามารถตัดคำได้'
         else:
             result = 'โปรดลองพูดอีกครั้ง'
     except Exception as e:

utils/thai_word.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from pythainlp.util import text_to_num, text_to_arabic_digit
 class ThaiWord:
@@ -24,40 +25,45 @@ class ThaiWord:
         else:
             try:
                 num = text_to_num("".join(words))
                 if len(num) > 0:
                     num = num[0]
             except Exception:
                 for word in words:
                     num = f'{num}{text_to_arabic_digit(word)}'
-        return f' {int(num):,} '
-    def pretty(self, words) -> str:
         has_start_number = False
         number = []
-        text = []
-        for idx, word in enumerate(words):
             if has_start_number:
                 if self.is_number(word) or self.is_digit(word):
                     number.append(word)
                 else:
-                    text.append(self.words_to_number(number))
                     has_start_number = False
                     number.clear()
             if not has_start_number:
                 if self.is_start_number(word):
                     has_start_number = True
                     number.append(word)
                 else:
-                    text.append(word)
-            if idx == len(words)-1 and len(number) > 0:
-                text.append(self.words_to_number(number))
-        return ''.join(text)
     def is_start_number(self, word) -> bool:
         has_start_number = False

 from pythainlp.util import text_to_num, text_to_arabic_digit
+from collections import deque
 class ThaiWord:
         else:
             try:
                 num = text_to_num("".join(words))
+                # get numeric only in sentence
                 if len(num) > 0:
                     num = num[0]
+                    num = f' {int(num):,} '
             except Exception:
                 for word in words:
                     num = f'{num}{text_to_arabic_digit(word)}'
+        return num
+    def pretty(self, words: deque) -> str:
         has_start_number = False
         number = []
+        text = ''
+        while len(words) > 0:
+            word = words.popleft()
             if has_start_number:
                 if self.is_number(word) or self.is_digit(word):
                     number.append(word)
                 else:
+                    text = f'{text}{self.words_to_number(number)}'
                     has_start_number = False
                     number.clear()
+            # detect the first numeric in sentence
             if not has_start_number:
                 if self.is_start_number(word):
                     has_start_number = True
                     number.append(word)
                 else:
+                    text = f'{text}{word}'
+            if len(words) == 0 and len(number) > 0:
+                text = f'{text}{self.words_to_number(number)}'
+        return text
     def is_start_number(self, word) -> bool:
         has_start_number = False