Spaces:
Runtime error
Runtime error
Commit
·
2f990e6
1
Parent(s):
eb69374
feat: optimize performance of pretty func when processing long words
Browse files
- app.py +7 -1
- utils/thai_word.py +16 -10
app.py
CHANGED
@@ -4,6 +4,8 @@ import numpy as np
|
|
4 |
from transformers import pipeline
|
5 |
from utils.thai_word import ThaiWord
|
6 |
from pythainlp.tokenize import word_tokenize
|
|
|
|
|
7 |
|
8 |
MODEL_NAME = "biodatlab/whisper-th-medium-combined"
|
9 |
DEVICE = 0 if torch.cuda.is_available() else "cpu"
|
@@ -37,7 +39,11 @@ def transcribe(audio):
|
|
37 |
# pretty text
|
38 |
tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
|
39 |
print(tokens)
|
40 |
-
|
|
|
|
|
|
|
|
|
41 |
else:
|
42 |
result = 'โปรดลองพูดอีกครั้ง'
|
43 |
except Exception as e:
|
|
|
4 |
from transformers import pipeline
|
5 |
from utils.thai_word import ThaiWord
|
6 |
from pythainlp.tokenize import word_tokenize
|
7 |
+
from collections import deque
|
8 |
+
from copy import deepcopy
|
9 |
|
10 |
MODEL_NAME = "biodatlab/whisper-th-medium-combined"
|
11 |
DEVICE = 0 if torch.cuda.is_available() else "cpu"
|
|
|
39 |
# pretty text
|
40 |
tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
|
41 |
print(tokens)
|
42 |
+
|
43 |
+
if len(tokens) > 0:
|
44 |
+
result = f'pretty: {thw.pretty(deque(deepcopy(tokens)))}\n\n original: {text}'
|
45 |
+
else:
|
46 |
+
result = 'pretty: ไม่สามารถตัดคำได้'
|
47 |
else:
|
48 |
result = 'โปรดลองพูดอีกครั้ง'
|
49 |
except Exception as e:
|
utils/thai_word.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from pythainlp.util import text_to_num, text_to_arabic_digit
|
|
|
2 |
|
3 |
class ThaiWord:
|
4 |
|
@@ -24,40 +25,45 @@ class ThaiWord:
|
|
24 |
else:
|
25 |
try:
|
26 |
num = text_to_num("".join(words))
|
|
|
|
|
27 |
if len(num) > 0:
|
28 |
num = num[0]
|
|
|
|
|
29 |
except Exception:
|
30 |
for word in words:
|
31 |
num = f'{num}{text_to_arabic_digit(word)}'
|
32 |
|
33 |
-
return
|
34 |
|
35 |
-
def pretty(self, words) -> str:
|
36 |
has_start_number = False
|
37 |
number = []
|
38 |
-
text =
|
39 |
|
40 |
-
|
|
|
41 |
if has_start_number:
|
42 |
if self.is_number(word) or self.is_digit(word):
|
43 |
number.append(word)
|
44 |
else:
|
45 |
-
text
|
46 |
has_start_number = False
|
47 |
number.clear()
|
48 |
|
|
|
49 |
if not has_start_number:
|
50 |
if self.is_start_number(word):
|
51 |
has_start_number = True
|
52 |
number.append(word)
|
53 |
else:
|
54 |
-
text
|
55 |
|
56 |
-
if
|
57 |
-
text
|
58 |
|
59 |
-
|
60 |
-
return ''.join(text)
|
61 |
|
62 |
def is_start_number(self, word) -> bool:
|
63 |
has_start_number = False
|
|
|
1 |
from pythainlp.util import text_to_num, text_to_arabic_digit
|
2 |
+
from collections import deque
|
3 |
|
4 |
class ThaiWord:
|
5 |
|
|
|
25 |
else:
|
26 |
try:
|
27 |
num = text_to_num("".join(words))
|
28 |
+
|
29 |
+
# get numeric only in sentence
|
30 |
if len(num) > 0:
|
31 |
num = num[0]
|
32 |
+
num = f' {int(num):,} '
|
33 |
+
|
34 |
except Exception:
|
35 |
for word in words:
|
36 |
num = f'{num}{text_to_arabic_digit(word)}'
|
37 |
|
38 |
+
return num
|
39 |
|
40 |
+
def pretty(self, words: deque) -> str:
|
41 |
has_start_number = False
|
42 |
number = []
|
43 |
+
text = ''
|
44 |
|
45 |
+
while len(words) > 0:
|
46 |
+
word = words.popleft()
|
47 |
if has_start_number:
|
48 |
if self.is_number(word) or self.is_digit(word):
|
49 |
number.append(word)
|
50 |
else:
|
51 |
+
text = f'{text}{self.words_to_number(number)}'
|
52 |
has_start_number = False
|
53 |
number.clear()
|
54 |
|
55 |
+
# detect the first numeric in sentence
|
56 |
if not has_start_number:
|
57 |
if self.is_start_number(word):
|
58 |
has_start_number = True
|
59 |
number.append(word)
|
60 |
else:
|
61 |
+
text = f'{text}{word}'
|
62 |
|
63 |
+
if len(words) == 0 and len(number) > 0:
|
64 |
+
text = f'{text}{self.words_to_number(number)}'
|
65 |
|
66 |
+
return text
|
|
|
67 |
|
68 |
def is_start_number(self, word) -> bool:
|
69 |
has_start_number = False
|