Spaces:
Sleeping
Sleeping
from transformers import VitsModel, AutoTokenizer | |
import torch | |
import scipy.io.wavfile | |
import util | |
# Hugging Face checkpoint used for speech synthesis
model_id = "facebook/mms-tts-uig-script_arabic"

# Run on the GPU when CUDA is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the tokenizer and the VITS model once at import time; the model is
# placed on the selected device right after loading
tts_tokenizer = AutoTokenizer.from_pretrained(model_id)
tts_model = VitsModel.from_pretrained(model_id).to(device)
def generate_audio(input_text, script):
    """
    Synthesize speech for the given input text and write it to a WAV file.

    Args:
        input_text: Text to synthesize.
        script: Name of the script ``input_text`` is written in. Any value
            other than "Uyghur Arabic" causes the text to be passed through
            ``util.ug_latn_to_arab`` before tokenization.

    Returns:
        Path of the WAV file that was written ("tts_output.wav").
    """
    # Convert text to Uyghur Arabic if needed
    # (presumably a Latin-to-Arabic script conversion; see util.ug_latn_to_arab)
    if script != "Uyghur Arabic":
        input_text = util.ug_latn_to_arab(input_text)

    # Tokenize and move inputs to the same device as the model
    tts_inputs = tts_tokenizer(input_text, return_tensors="pt").to(device)

    # Perform inference without tracking gradients, then bring the waveform
    # back to the CPU so it can be converted to a NumPy array
    with torch.no_grad():
        waveform = tts_model(**tts_inputs).waveform.cpu()

    # Take the sample rate from the loaded model's config rather than
    # hard-coding it, so the WAV header stays correct if the checkpoint
    # changes; fall back to the previous fixed value if it is not exposed
    sample_rate = getattr(tts_model.config, "sampling_rate", 16000)

    # NOTE: this is a fixed relative path, not a unique temporary file; each
    # call overwrites the output of the previous one
    output_path = "tts_output.wav"

    # Only the first item along the leading dimension of the waveform is saved
    scipy.io.wavfile.write(output_path, rate=sample_rate, data=waveform.numpy()[0])

    # Return the audio file path
    return output_path