import gradio as gr
import torch
from transformers import AutoTokenizer, VitsModel

# Load the tokenizer and model for Bulgarian TTS (text-to-speech).
# MMS TTS checkpoints are VITS models, so VitsModel exposes the synthesized waveform directly.
tokenizer = AutoTokenizer.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
model = VitsModel.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
model.eval()

# Text-to-speech conversion function: takes text, returns (sample_rate, waveform)
def tts_generate(text):
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    # VITS-based MMS models return the synthesized speech as a waveform tensor of
    # shape (batch, samples); squeeze it to a 1-D NumPy array for Gradio.
    waveform = outputs.waveform.squeeze().cpu().numpy()

    # Gradio's audio output expects (sample_rate, samples). Use the sampling rate
    # stored in the model config (16 kHz for MMS TTS checkpoints).
    return model.config.sampling_rate, waveform

# Create Gradio interface
iface = gr.Interface(
    fn=tts_generate,
    inputs="text",
    outputs="audio",
    title="Bulgarian TTS (Text-to-Speech)",
    description="Enter text to generate speech in Bulgarian."
)

# Run the interface
if __name__ == "__main__":
    iface.launch()
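
# A minimal usage sketch without the Gradio UI (the Bulgarian sample text is only
# illustrative; writing to disk assumes the optional soundfile package is installed):
#   sample_rate, audio = tts_generate("Здравей, свят!")
#   import soundfile as sf
#   sf.write("sample.wav", audio, sample_rate)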