import gradio as gr
import soundfile as sf
import torch
from transformers import AutoModel, AutoModelForPreTraining, AutoTokenizer
# Load the tokenizer and model for Bulgarian TTS (MMS / VITS architecture).
# AutoModel resolves a "vits" checkpoint to VitsModel, whose forward pass
# produces a `waveform` tensor. AutoModelForPreTraining has no registered
# mapping for VITS, so it cannot load this checkpoint correctly.
tokenizer = AutoTokenizer.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
model = AutoModel.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
# Text-to-speech conversion function
def tts_generate(text):
    """Synthesize Bulgarian speech for *text*.

    Args:
        text: Input string to vocalize.

    Returns:
        A ``(sample_rate, waveform)`` tuple in the order Gradio's audio
        component expects: the sample rate in Hz first, then a 1-D numpy
        float array with the audio samples.
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # VITS-based MMS TTS models expose the synthesized audio as
    # `outputs.waveform` with shape (batch, num_samples); `logits` are not
    # audio. Fall back to logits only if the loaded model class produces no
    # waveform.  NOTE(review): assumes the checkpoint is VITS — confirm.
    waveform = getattr(outputs, "waveform", None)
    if waveform is None:
        waveform = outputs["logits"]
    # MMS TTS checkpoints run at 16 kHz; read the true rate from the model
    # config rather than hard-coding 22050.
    sample_rate = getattr(model.config, "sampling_rate", 16000)
    # Gradio's "audio" output expects (sample_rate, data) — the original
    # returned (data, sample_rate). Squeeze the batch dim so data is 1-D.
    return sample_rate, waveform.squeeze().numpy()
# Assemble the web UI: one free-text input wired to the TTS function,
# rendered back to the user as playable audio.
iface = gr.Interface(
    description="Enter text to generate speech in Bulgarian.",
    title="Bulgarian TTS (Text-to-Speech)",
    outputs="audio",
    inputs="text",
    fn=tts_generate,
)
# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    iface.launch()