Spaces:

alakxender
/

tts-dhivehi-demo-mms

Running

App Files Files Community

alakxender committed on Nov 23, 2024

Commit

821340b

1 Parent(s): 0d01a94

normalize-lib

Browse files

Files changed (4) hide show

app.py +2 -2
lib/dv_sentence_end.map +0 -74
lib/normalize_dv.py +0 -135
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ from transformers import VitsTokenizer, VitsModel, set_seed
 import tempfile
 import numpy as np
 from scipy.io.wavfile import write
-from lib.normalize_dv import normalize_dv
 models = {
     "MMS TTS Base": "Dhivehi/mms-tts-div",
@@ -37,7 +37,7 @@ def tts(text:str, model_name:str):
     # normalize the dv text from written to spoken
     print (f"Normalizing: {text}")
-    text = normalize_dv(text)
     print (f"Normalized: {text}")
     # Preprocess the input text

 import tempfile
 import numpy as np
 from scipy.io.wavfile import write
+from dv_normalize.dv_sentence import spoken_dv
 models = {
     "MMS TTS Base": "Dhivehi/mms-tts-div",
     # normalize the dv text from written to spoken
     print (f"Normalizing: {text}")
+    text = spoken_dv(text)
     print (f"Normalized: {text}")
     # Preprocess the input text

lib/dv_sentence_end.map DELETED Viewed

@@ -1,74 +0,0 @@
-normalized,ending
-ވެއެވެ,ވޭ
-ލޮއެވެ,ލޮ
-ޓައެވެ,ޓާ
-ފުޅެވެ,ފުޅު
-ގެއެވެ,ގެ
-ހުރެއެވެ,ހުރޭ
-ފައެވެ,ފަ
-ކެކެވެ,ކެއް
-މެކެވެ,މެއް
-ރެއެވެ,ރޭ
-ލެވެ,ލު
-ދެވެ,ދު
-ތުއެވެ,ތު
-ހެކެވެ,ހެއް
-ނޫނެވެ,ނޫން
-ންނެވެ,ން
-ދުނެވެ,ދުން
-ތަނެވެ,ތަން
-ރެކެވެ,ރެއް
-ބެއެވެ,ބޭ
-މެއެވެ,
-ތަށެވެ,ތަށް
-ޅައެވެ,ޅަ
-ކެވެ,އް
-މައެވެ,މަ
-ޔަށެވެ,ޔަށް
-ދުމެވެ,ދުން
-ށެކެވެ,ށެއް
-ވިއެވެ,ވި
-ތީއެވެ,ތީ
-ނެއެވެ,ނެ
-ކަށެވެ,ކަށް
-ނެެއެވެ,ނެ
-ރެވެ,ރު
-ޓަށެވެ,ޓަށް
-ޖެއެވެ,ޖެ
-އްބެވެ,ވި
-ޅެވެ,ޅު
-އިންނެވެ,އިން
-ގަތެވެ,ގަތް
-އެކެވެ,އެއް
-އައެވެ,އައޭ
-ޅެކެވެ,ޅެއް
-ގައެވެ,ގައި
-ތެކެވެ,ތެއް
-ފާތެވެ,ފާތު
-ބަހެކެވެ,ބަސް
-ކައެވެ,ކައި
-ގާމެވެ,ގާމު
-ހުއްޓެވެ,ހުރި
-ތަކެވެ,ތައް
-ޤަށެވެ,ޤަށް
-ހަށެވެ,ހަށް
-ޔާއެވެ,ޔާ
-އަށެވެ,އަށް
-ޅެމެވެ,ޅެން
-ދަށެވެ,ދަށް
-ޔުމެވެ,ޔުން
-ބަހެވެ,ބަސް
-ވައެވެ,ވަ
-ވީއެވެ,ވީ
-ލެއެވެ,ލެ
-ޗަށެވެ,ޗަށް
-ނަށެވެ,ނަށް
-ރުމެވެ,ރުން
-ދެއެވެ,ދޭ
-ވަހެވެ,ވަސް
-ތައެވެ,ތަ
-ރަށެވެ,ރަށް
-މުމެވެ,މުން
-ކްޓެވެ,ކްޓު
-ޑރ,ޑޮކްޓަރ
-ޏެވެ,ބުނި

lib/normalize_dv.py DELETED Viewed

@@ -1,135 +0,0 @@
-import pandas as pd
-import re
-def fix_sentence_end(text, csv_path="lib/dv_sentence_end.map"):
-    # end map from : https://github.com/Sofwath/dv_speech_text_data/tree/master/tools
-    try:
-        df = pd.read_csv(csv_path, sep=",", header=0)
-        text_map = df.set_index('normalized')['ending'].to_dict()
-        for normalized, ending in text_map.items():
-            text = text.replace(normalized, ending)
-        text = text.replace('އެވެ', '').replace('ށެވެ', '')
-        return text
-    except Exception as e:
-        print(f"An error occurred: {e}")
-        return text
-def remove_special_characters(text):
-    pattern = r'[^\w\sހށނރބޅކއވމފދތލގޏސޑޒޓޔޕޖޗޘޙޚޛޜޝޞޟޠޡޢޣޤޥަާިީުޫެޭޮޯްޱ޲޳޴޵޶޷޸޹޺޻޼޽޾޿\s]'
-    return re.sub(pattern, '', text)
-def int_to_dv(num, thousands=False,is_spoken=False):
-    d = {
-        0: ["ސުމެއް","ސުމެއް"],
-        1: ["އެއް","އެކެއް"],
-        2: ["ދެ","ދޭ","ދުއި"],
-        3: ["ތިން","ތިނެއް"],
-        4: ["ހަތަރު","ހަތަރެއް"],
-        5: ["ފަސް","ފަހެއް"],
-        6: ["ހަ","ހައެއް"],
-        7: ["ހަތް","ހަތެއް"],
-        8: ["އަށް","އަށެއް"],
-        9: ["ނުވަ","ނުވައެއް"],
-        10: ["ދިހަ","ދިހައެއް"],
-        11: ["އެގާރަ","އެގާރަ"],
-        12: ["ބާރަ","ބާރަ"],
-        13: ["ތޭރަ","ތޭރަ"],
-        14: ["ސާދަ","ސާދަ"],
-        15: ["ފަނަރަ","ފަނަރަ"],
-        16: ["ސޯޅަ","ސޯޅަ"],
-        17: ["ސަތާރަ","ސަތާރަ"],
-        18: ["އަށާރަ","އަށާރަ"],
-        19: ["ނަވާރަ","ނަވާރަ"],
-        20: ["ވިހި","ވިހި"],
-        21: ["އެކާވީސް","އެކާވީސް"],
-        22: ["ބާވީސް","ބާވީސް"],
-        23: ["ތޭވީސް","ތޭވީސް"],
-        24: ["ސައުވީސް","ސައުވީސް"],
-        25: ["ފަންސަވީސް","ފަންސަވީސް"],
-        26: ["ސައްބީސް","ސައްބީސް"],
-        27: ["ހަތާވީސް","ހަތާވީސް"],
-        28: ["އަށާވީސް","އަށާވީސް"],
-        29: ["އޮނަތިރީސް","އޮނަތިރީސް"],
-        30: ["ތިރީސް","ތިރީސް"],
-        40: ["ސާޅީސް","ސާޅީސް"],
-        50: ["ފަންސާސް","ފަންސާސް"],
-        60: ["ފަސްދޮޅަސް","ފަސްދޮޅަސް"],
-        70: ["ހައްދިހަ","ހައްދިހަ"],
-        80: ["އައްޑިހަ","އައްޑިހަ"],
-        90: ["ނުވަދިހަ","ނުވަދިހަ"],
-    }
-    k = 1000
-    m = k * 1000
-    b = m * 1000
-    t = b * 1000
-    assert 0 <= num
-    if num < 30:
-        return d[num][0 if thousands else 1]
-    if num < 100:
-        # At this point we will check if we want to return the number for written form or spoken form
-        if is_spoken == True:
-            thousands = True
-        index = 0 if thousands else 1
-        return d[num][1] if num % 10 == 0 else d[num // 10 * 10][1] + ' ' + d[num % 10][index]
-    if num < k:
-        # At this point we will check if we want to return the number for written form or spoken form
-        if is_spoken == True:
-            thousands = True
-        hundreds = num // 100
-        remainder = num % 100
-        hundreds_text = d[hundreds][2] + ' ސައްތަ' if hundreds == 2 else d[hundreds][0] + ' ސަތޭކަ'
-        return hundreds_text if remainder == 0 else hundreds_text + ' ' + int_to_dv(remainder, thousands)
-    if num < m:
-        thousands_text = int_to_dv(num // k, True) + ' ހާސް'
-        return thousands_text if num % k == 0 else thousands_text + ' ' + int_to_dv(num % k, False,is_spoken)
-    if num < b:
-        millions_text = int_to_dv(num // m, True) + ' މިލިއަން'
-        return millions_text if num % m == 0 else millions_text + ' ' + int_to_dv(num % m, False,is_spoken)
-    if num < t:
-        billions_text = int_to_dv(num // b, True) + ' ބިލިއަން'
-        return billions_text if num % b == 0 else billions_text + ' ' + int_to_dv(num % b, False,is_spoken)
-    trillions_text = int_to_dv(num // t, False) + ' ޓްރިލިއަން'
-    return trillions_text if num % t == 0 else trillions_text + ' ' + int_to_dv(num % t, False,is_spoken)
-# Function to replace decimal points with the word 'point'
-def replace_decimal_points(text):
-    # Use regex to find numbers with decimal points and replace '.' with 'point'
-    return re.sub(r'(\d+)\.(\d+)', r'\1 ޕޮއިންޓު \2', text)
-# Function to add a space between the digits of numbers that are 10 or more digits long
-def add_space_after_digits(text):
-    # Use regex to find runs of 10 or more digits and put a space between each digit
-    #return re.sub(r'(?<!\d)(\d{1,})(?=\D|$)', lambda x: ' '.join(x.group(0)), text)
-    return re.sub(r'(?<!\d)(\d{10,})(?!\d)', lambda x: ' '.join(x.group(0)), text)
-def replace_digits_with_dv(text):
-    text= text.replace(",","") # we don't want thousands separators, if any
-    text = replace_decimal_points(text) # replace dot
-    text = add_space_after_digits(text) # add some space if its a big num
-    digits = re.findall(r'\d+', text)
-    for digit in digits:
-        dv = int_to_dv(int(digit),is_spoken=True)
-        text = text.replace(digit, dv)
-    return text
-def normalize_dv(text:str):
-    text = fix_sentence_end(text)
-    text = replace_digits_with_dv(text)
-    text = remove_special_characters(text)
-    return text

requirements.txt CHANGED Viewed

@@ -129,3 +129,4 @@ wcwidth==0.2.13
 websockets==11.0.3
 xxhash==3.4.1
 yarl==1.9.4

 websockets==11.0.3
 xxhash==3.4.1
 yarl==1.9.4
+dv-normalizer