alakxender committed on
Commit
821340b
·
1 Parent(s): 0d01a94

normalize-lib

Browse files
Files changed (4) hide show
  1. app.py +2 -2
  2. lib/dv_sentence_end.map +0 -74
  3. lib/normalize_dv.py +0 -135
  4. requirements.txt +1 -0
app.py CHANGED
@@ -5,7 +5,7 @@ from transformers import VitsTokenizer, VitsModel, set_seed
5
  import tempfile
6
  import numpy as np
7
  from scipy.io.wavfile import write
8
- from lib.normalize_dv import normalize_dv
9
 
10
  models = {
11
  "MMS TTS Base": "Dhivehi/mms-tts-div",
@@ -37,7 +37,7 @@ def tts(text:str, model_name:str):
37
 
38
  # normalize the dv text from written to spoken
39
  print (f"Normalizing: {text}")
40
- text = normalize_dv(text)
41
  print (f"Normalized: {text}")
42
 
43
  # Preprocess the input text
 
5
  import tempfile
6
  import numpy as np
7
  from scipy.io.wavfile import write
8
+ from dv_normalize.dv_sentence import spoken_dv
9
 
10
  models = {
11
  "MMS TTS Base": "Dhivehi/mms-tts-div",
 
37
 
38
  # normalize the dv text from written to spoken
39
  print (f"Normalizing: {text}")
40
+ text = spoken_dv(text)
41
  print (f"Normalized: {text}")
42
 
43
  # Preprocess the input text
lib/dv_sentence_end.map DELETED
@@ -1,74 +0,0 @@
1
- normalized,ending
2
- ވެއެވެ,ވޭ
3
- ލޮއެވެ,ލޮ
4
- ޓައެވެ,ޓާ
5
- ފުޅެވެ,ފުޅު
6
- ގެއެވެ,ގެ
7
- ހުރެއެވެ,ހުރޭ
8
- ފައެވެ,ފަ
9
- ކެކެވެ,ކެއް
10
- މެކެވެ,މެއް
11
- ރެއެވެ,ރޭ
12
- ލެވެ,ލު
13
- ދެވެ,ދު
14
- ތުއެވެ,ތު
15
- ހެކެވެ,ހެއް
16
- ނޫނެވެ,ނޫން
17
- ންނެވެ,ން
18
- ދުނެވެ,ދުން
19
- ތަނެވެ,ތަން
20
- ރެކެވެ,ރެއް
21
- ބެއެވެ,ބޭ
22
- މެއެވެ,
23
- ތަށެވެ,ތަށް
24
- ޅައެވެ,ޅަ
25
- ކެވެ,އް
26
- މައެވެ,މަ
27
- ޔަށެވެ,ޔަށް
28
- ދުމެވެ,ދުން
29
- ށެކެވެ,ށެއް
30
- ވިއެވެ,ވި
31
- ތީއެވެ,ތީ
32
- ނެއެވެ,ނެ
33
- ކަށެވެ,ކަށް
34
- ނެެއެވެ,ނެ
35
- ރެވެ,ރު
36
- ޓަށެވެ,ޓަށް
37
- ޖެއެވެ,ޖެ
38
- އްބެވެ,ވި
39
- ޅެވެ,ޅު
40
- އިންނެވެ,އިން
41
- ގަތެވެ,ގަތް
42
- އެކެވެ,އެއް
43
- އައެވެ,އައޭ
44
- ޅެކެވެ,ޅެއް
45
- ގައެވެ,ގައި
46
- ތެކެވެ,ތެއް
47
- ފާތެވެ,ފާތު
48
- ބަހެކެވެ,ބަސް
49
- ކައެވެ,ކައި
50
- ގާމެވެ,ގާމު
51
- ހުއްޓެވެ,ހުރި
52
- ތަކެވެ,ތައް
53
- ޤަށެވެ,ޤަށް
54
- ހަށެވެ,ހަށް
55
- ޔާއެވެ,ޔާ
56
- އަށެވެ,އަށް
57
- ޅެމެވެ,ޅެން
58
- ދަށެވެ,ދަށް
59
- ޔުމެވެ,ޔުން
60
- ބަހެވެ,ބަސް
61
- ވައެވެ,ވަ
62
- ވީއެވެ,ވީ
63
- ލެއެވެ,ލެ
64
- ޗަށެވެ,ޗަށް
65
- ނަށެވެ,ނަށް
66
- ރުމެވެ,ރުން
67
- ދެއެވެ,ދޭ
68
- ވަހެވެ,ވަސް
69
- ތައެވެ,ތަ
70
- ރަށެވެ,ރަށް
71
- މުމެވެ,މުން
72
- ކްޓެވެ,ކްޓު
73
- ޑރ,ޑޮކްޓަރ
74
- ޏެވެ,ބުނި
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/normalize_dv.py DELETED
@@ -1,135 +0,0 @@
1
- import pandas as pd
2
- import re
3
-
4
- def fix_sentence_end(text, csv_path="lib/dv_sentence_end.map"):
5
- # end map from : https://github.com/Sofwath/dv_speech_text_data/tree/master/tools
6
- try:
7
- df = pd.read_csv(csv_path, sep=",", header=0)
8
- text_map = df.set_index('normalized')['ending'].to_dict()
9
-
10
- for normalized, ending in text_map.items():
11
- text = text.replace(normalized, ending)
12
-
13
- text = text.replace('އެވެ', '').replace('ށެވެ', '')
14
- return text
15
- except Exception as e:
16
- print(f"An error occurred: {e}")
17
- return text
18
-
19
- def remove_special_characters(text):
20
- pattern = r'[^\w\sހށނރބޅކއވމފދތލގޏސޑޒޓޔޕޖޗޘޙޚޛޜޝޞޟޠޡޢޣޤޥަާިީުޫެޭޮޯްޱ޲޳޴޵޶޷޸޹޺޻޼޽޾޿\s]'
21
- return re.sub(pattern, '', text)
22
-
23
- def int_to_dv(num, thousands=False,is_spoken=False):
24
-
25
- d = {
26
- 0: ["ސުމެއް","ސުމެއް"],
27
- 1: ["އެއް","އެކެއް"],
28
- 2: ["ދެ","ދޭ","ދުއި"],
29
- 3: ["ތިން","ތިނެއް"],
30
- 4: ["ހަތަރު","ހަތަރެއް"],
31
- 5: ["ފަސް","ފަހެއް"],
32
- 6: ["ހަ","ހައެއް"],
33
- 7: ["ހަތް","ހަތެއް"],
34
- 8: ["އަށް","އަށެއް"],
35
- 9: ["ނުވަ","ނުވައެއް"],
36
- 10: ["ދިހަ","ދިހައެއް"],
37
- 11: ["އެގާރަ","އެގާރަ"],
38
- 12: ["ބާރަ","ބާރަ"],
39
- 13: ["ތޭރަ","ތޭރަ"],
40
- 14: ["ސާދަ","ސާދަ"],
41
- 15: ["ފަނަރަ","ފަނަރަ"],
42
- 16: ["ސޯޅަ","ސޯޅަ"],
43
- 17: ["ސަތާރަ","ސަތާރަ"],
44
- 18: ["އަށާރަ","އަށާރަ"],
45
- 19: ["ނަވާރަ","ނަވާރަ"],
46
- 20: ["ވިހި","ވިހި"],
47
- 21: ["އެކާވީސް","އެކާވީސް"],
48
- 22: ["ބާވީސް","ބާވީސް"],
49
- 23: ["ތޭވީސް","ތޭވީސް"],
50
- 24: ["ސައުވީސް","ސައުވީސް"],
51
- 25: ["ފަންސަވީސް","ފަންސަވީސް"],
52
- 26: ["ސައްބީސް","ސައްބީސް"],
53
- 27: ["ހަތާވީސް","ހަތާވީސް"],
54
- 28: ["އަށާވީސް","އަށާވީސް"],
55
- 29: ["އޮނަތިރީސް","އޮނަތިރީސް"],
56
- 30: ["ތިރީސް","ތިރީސް"],
57
- 40: ["ސާޅީސް","ސާޅީސް"],
58
- 50: ["ފަންސާސް","ފަންސާސް"],
59
- 60: ["ފަސްދޮޅަސް","ފަސްދޮޅަސް"],
60
- 70: ["ހައްދިހަ","ހައްދިހަ"],
61
- 80: ["އައްޑިހަ","އައްޑިހަ"],
62
- 90: ["ނުވަދިހަ","ނުވަދިހަ"],
63
- }
64
-
65
- k = 1000
66
- m = k * 1000
67
- b = m * 1000
68
- t = b * 1000
69
-
70
- assert 0 <= num
71
-
72
- if num < 30:
73
- return d[num][0 if thousands else 1]
74
-
75
- if num < 100:
76
- # At this point we will check if we want to return the number for written form or spoken form
77
- if is_spoken == True:
78
- thousands = True
79
-
80
- index = 0 if thousands else 1
81
-
82
- return d[num][1] if num % 10 == 0 else d[num // 10 * 10][1] + ' ' + d[num % 10][index]
83
-
84
- if num < k:
85
- # At this point we will check if we want to return the number for written form or spoken form
86
- if is_spoken == True:
87
- thousands = True
88
-
89
- hundreds = num // 100
90
- remainder = num % 100
91
- hundreds_text = d[hundreds][2] + ' ސައްތަ' if hundreds == 2 else d[hundreds][0] + ' ސަތޭކަ'
92
- return hundreds_text if remainder == 0 else hundreds_text + ' ' + int_to_dv(remainder, thousands)
93
-
94
- if num < m:
95
- thousands_text = int_to_dv(num // k, True) + ' ހާސް'
96
- return thousands_text if num % k == 0 else thousands_text + ' ' + int_to_dv(num % k, False,is_spoken)
97
-
98
- if num < b:
99
- millions_text = int_to_dv(num // m, True) + ' މިލިއަން'
100
- return millions_text if num % m == 0 else millions_text + ' ' + int_to_dv(num % m, False,is_spoken)
101
-
102
- if num < t:
103
- billions_text = int_to_dv(num // b, True) + ' ބިލިއަން'
104
- return billions_text if num % b == 0 else billions_text + ' ' + int_to_dv(num % b, False,is_spoken)
105
-
106
- trillions_text = int_to_dv(num // t, False) + ' ޓްރިލިއަން'
107
- return trillions_text if num % t == 0 else trillions_text + ' ' + int_to_dv(num % t, False,is_spoken)
108
-
109
- # Function to replace decimal points with the word 'point'
110
- def replace_decimal_points(text):
111
- # Use regex to find numbers with decimal points and replace '.' with 'point'
112
- return re.sub(r'(\d+)\.(\d+)', r'\1 ޕޮއިންޓު \2', text)
113
-
114
- # Function to add a space after each digit in numbers bigger than 999999
115
- def add_space_after_digits(text):
116
- # Use regex to find numbers bigger than 999999 and add space after each digit
117
- #return re.sub(r'(?<!\d)(\d{1,})(?=\D|$)', lambda x: ' '.join(x.group(0)), text)
118
- return re.sub(r'(?<!\d)(\d{10,})(?!\d)', lambda x: ' '.join(x.group(0)), text)
119
-
120
-
121
- def replace_digits_with_dv(text):
122
- text= text.replace(",","") # we dont won't thousand sep if any
123
- text = replace_decimal_points(text) # replace dot
124
- text = add_space_after_digits(text) # add some space if its a big num
125
- digits = re.findall(r'\d+', text)
126
- for digit in digits:
127
- dv = int_to_dv(int(digit),is_spoken=True)
128
- text = text.replace(digit, dv)
129
- return text
130
-
131
- def normalize_dv(text:str):
132
- text = fix_sentence_end(text)
133
- text = replace_digits_with_dv(text)
134
- text = remove_special_characters(text)
135
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -129,3 +129,4 @@ wcwidth==0.2.13
129
  websockets==11.0.3
130
  xxhash==3.4.1
131
  yarl==1.9.4
 
 
129
  websockets==11.0.3
130
  xxhash==3.4.1
131
  yarl==1.9.4
132
+ dv-normalizer