# bgtts/app.py
import gradio as gr
import torch
from transformers import AutoTokenizer, VitsModel
import soundfile as sf

# Load the tokenizer and model for Bulgarian TTS (text-to-speech).
# MMS TTS checkpoints are VITS models, so load them with VitsModel to get the
# synthesized waveform directly (AutoModelForPreTraining does not expose it).
tokenizer = AutoTokenizer.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
model = VitsModel.from_pretrained("Opit/mms_tts_bulgarian_finetuning")

# Text-to-speech conversion function
def tts_generate(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # The VITS model returns the synthesized speech as a (batch, samples)
    # waveform tensor; take the first item and convert it to a NumPy array.
    waveform = outputs.waveform[0].cpu().numpy()
    # Gradio's audio output expects a (sample_rate, data) tuple; the sampling
    # rate is read from the model config (16 kHz for MMS TTS).
    return model.config.sampling_rate, waveform

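# Optional local sanity check (a sketch, assuming the checkpoint loads as a
# VITS model and synthesizes Bulgarian text as expected). Uncomment to write a
# short clip to disk with soundfile before launching the Gradio UI:
# sample_rate, waveform = tts_generate("Здравей, свят!")  # "Hello, world!" in Bulgarian
# sf.write("sample.wav", waveform, sample_rate)
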
# Create Gradio interface
iface = gr.Interface(
    fn=tts_generate,
    inputs="text",
    outputs="audio",
    title="Bulgarian TTS (Text-to-Speech)",
    description="Enter text to generate speech in Bulgarian."
)

# Run the interface
if __name__ == "__main__":
    iface.launch()