Update app.py
Browse files
app.py
CHANGED
@@ -57,15 +57,7 @@ def check_voice_files():
|
|
57 |
else:
|
58 |
return "**All voice files are present.** 🎉"
|
59 |
|
60 |
-
# Initialize Hindi tokenizer
|
61 |
-
def load_hindi_tokenizer():
|
62 |
-
"""
|
63 |
-
Loads a pre-trained Hindi tokenizer from Hugging Face.
|
64 |
-
"""
|
65 |
-
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert", use_fast=True)
|
66 |
-
return tokenizer
|
67 |
|
68 |
-
hindi_tokenizer = load_hindi_tokenizer()
|
69 |
|
70 |
# New function to split text into chunks of 100 tokens using the Hindi tokenizer
|
71 |
def split_text_into_chunks(text, max_tokens=100, language="en"):
|
@@ -74,26 +66,16 @@ def split_text_into_chunks(text, max_tokens=100, language="en"):
|
|
74 |
Inserts a newline after each chunk.
|
75 |
Uses a specialized tokenizer for Hindi language.
|
76 |
"""
|
77 |
-
if language == "hi":
|
78 |
-
tokens = hindi_tokenizer.tokenize(text)
|
79 |
-
else:
|
80 |
-
tokens = text.split() # Fallback to simple splitting for other languages
|
81 |
|
82 |
chunks = []
|
83 |
for i in range(0, len(tokens), max_tokens):
|
84 |
-
|
85 |
-
# Convert tokens back to string for Hindi
|
86 |
-
chunk = hindi_tokenizer.convert_tokens_to_string(tokens[i:i + max_tokens])
|
87 |
-
else:
|
88 |
-
chunk = ' '.join(tokens[i:i + max_tokens])
|
89 |
chunks.append(chunk)
|
90 |
return '\n'.join(chunks)
|
91 |
|
92 |
@spaces.GPU(duration=120)
|
93 |
def tts_generate(text, voice, language):
|
94 |
# Check for Hindi language and split text if necessary
|
95 |
-
if language == "hi":
|
96 |
-
text = split_text_into_chunks(text, max_tokens=100, language=language)
|
97 |
|
98 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
|
99 |
temp_audio_path = temp_audio.name
|
|
|
57 |
else:
|
58 |
return "**All voice files are present.** 🎉"
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
|
|
61 |
|
62 |
# New function to split text into chunks of 100 tokens using the Hindi tokenizer
|
63 |
def split_text_into_chunks(text, max_tokens=100, language="en"):
|
|
|
66 |
Inserts a newline after each chunk.
|
67 |
Uses a specialized tokenizer for Hindi language.
|
68 |
"""
|
|
|
|
|
|
|
|
|
69 |
|
70 |
chunks = []
|
71 |
for i in range(0, len(tokens), max_tokens):
|
72 |
+
chunk = ' '.join(tokens[i:i + max_tokens])
|
|
|
|
|
|
|
|
|
73 |
chunks.append(chunk)
|
74 |
return '\n'.join(chunks)
|
75 |
|
76 |
@spaces.GPU(duration=120)
|
77 |
def tts_generate(text, voice, language):
|
78 |
# Check for Hindi language and split text if necessary
|
|
|
|
|
79 |
|
80 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
|
81 |
temp_audio_path = temp_audio.name
|