import gradio as gr
import torch
from transformers import AutoTokenizer, VitsModel

# Load the tokenizer and model for Bulgarian TTS (text-to-speech).
# MMS TTS checkpoints are VITS models, so VitsModel exposes the synthesized waveform directly.
tokenizer = AutoTokenizer.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
model = VitsModel.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
model.eval()

# Text-to-speech conversion function: takes text, returns (sample_rate, waveform)
def tts_generate(text):
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    # VITS-based MMS models return the synthesized speech as a waveform tensor of
    # shape (batch, samples); squeeze it to a 1-D NumPy array for Gradio.
    waveform = outputs.waveform.squeeze().cpu().numpy()

    # Gradio's audio output expects (sample_rate, samples). Use the sampling rate
    # stored in the model config (16 kHz for MMS TTS checkpoints).
    return model.config.sampling_rate, waveform

# Create Gradio interface
iface = gr.Interface(
    fn=tts_generate,
    inputs="text",
    outputs="audio",
    title="Bulgarian TTS (Text-to-Speech)",
    description="Enter text to generate speech in Bulgarian."
)

# Run the interface
if __name__ == "__main__":
    iface.launch()
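
# A minimal usage sketch without the Gradio UI (the Bulgarian sample text is only
# illustrative; writing to disk assumes the optional soundfile package is installed):
#   sample_rate, audio = tts_generate("Здравей, свят!")
#   import soundfile as sf
#   sf.write("sample.wav", audio, sample_rate)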