nikkmitra committed on
Commit
726d00d
·
verified ·
1 Parent(s): 0855517

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -19
app.py CHANGED
@@ -57,15 +57,7 @@ def check_voice_files():
57
  else:
58
  return "**All voice files are present.** 🎉"
59
 
60
- # Initialize Hindi tokenizer
61
- def load_hindi_tokenizer():
62
- """
63
- Loads a pre-trained Hindi tokenizer from Hugging Face.
64
- """
65
- tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert", use_fast=True)
66
- return tokenizer
67
 
68
- hindi_tokenizer = load_hindi_tokenizer()
69
 
70
  # New function to split text into chunks of 100 tokens using the Hindi tokenizer
71
  def split_text_into_chunks(text, max_tokens=100, language="en"):
@@ -74,26 +66,16 @@ def split_text_into_chunks(text, max_tokens=100, language="en"):
74
  Inserts a newline after each chunk.
75
  Uses a specialized tokenizer for Hindi language.
76
  """
77
- if language == "hi":
78
- tokens = hindi_tokenizer.tokenize(text)
79
- else:
80
- tokens = text.split() # Fallback to simple splitting for other languages
81
 
82
  chunks = []
83
  for i in range(0, len(tokens), max_tokens):
84
- if language == "hi":
85
- # Convert tokens back to string for Hindi
86
- chunk = hindi_tokenizer.convert_tokens_to_string(tokens[i:i + max_tokens])
87
- else:
88
- chunk = ' '.join(tokens[i:i + max_tokens])
89
  chunks.append(chunk)
90
  return '\n'.join(chunks)
91
 
92
  @spaces.GPU(duration=120)
93
  def tts_generate(text, voice, language):
94
  # Check for Hindi language and split text if necessary
95
- if language == "hi":
96
- text = split_text_into_chunks(text, max_tokens=100, language=language)
97
 
98
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
99
  temp_audio_path = temp_audio.name
 
57
  else:
58
  return "**All voice files are present.** 🎉"
59
 
 
 
 
 
 
 
 
60
 
 
61
 
62
  # New function to split text into chunks of 100 tokens using the Hindi tokenizer
63
  def split_text_into_chunks(text, max_tokens=100, language="en"):
 
66
  Inserts a newline after each chunk.
67
  Uses a specialized tokenizer for Hindi language.
68
  """
 
 
 
 
69
 
70
  chunks = []
71
  for i in range(0, len(tokens), max_tokens):
72
+ chunk = ' '.join(tokens[i:i + max_tokens])
 
 
 
 
73
  chunks.append(chunk)
74
  return '\n'.join(chunks)
75
 
76
  @spaces.GPU(duration=120)
77
  def tts_generate(text, voice, language):
78
  # Check for Hindi language and split text if necessary
 
 
79
 
80
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
81
  temp_audio_path = temp_audio.name