File size: 1,539 Bytes
6a81069 8dd10aa 805ef56 ff0bf3d 805ef56 6a81069 ff0bf3d bb16e26 ff0bf3d bb16e26 ff0bf3d 6a81069 ff0bf3d bb16e26 ff0bf3d 8d444a7 ff0bf3d 8d444a7 6a81069 ff0bf3d 740beea bb16e26 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import gradio as gr
import soundfile as sf
import torch
from transformers import AutoModelForPreTraining, AutoTokenizer, VitsModel
# Load the tokenizer and model for Bulgarian TTS (Text-to-Speech).
# MMS-TTS checkpoints use the VITS architecture, which the
# AutoModelForPreTraining mapping does not cover, so the concrete VitsModel
# class is used instead; its forward pass returns the audio as `waveform`.
# NOTE(review): assumes this fine-tuned checkpoint is a VITS model like the
# upstream MMS-TTS releases -- confirm against the repo's config.json.
MODEL_ID = "Opit/mms_tts_bulgarian_finetuning"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = VitsModel.from_pretrained(MODEL_ID)
# TTS conversion function (text-to-speech)
def tts_generate(text):
    """Synthesize speech for *text* and return it in Gradio's audio format.

    Args:
        text: The text to speak, as entered in the Gradio text input.

    Returns:
        A ``(sample_rate, samples)`` tuple -- the order Gradio's audio output
        expects. ``sample_rate`` is an int in Hz and ``samples`` is a float32
        NumPy array with the batch dimension removed.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    # Fail early with a readable UI message instead of an opaque model error.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # VITS models expose the generated audio as `waveform` with shape
    # (batch, samples). The `logits` fallback keeps the original lookup for
    # any model head that does not provide a `waveform` field.
    waveform = getattr(outputs, "waveform", None)
    if waveform is None:
        waveform = outputs["logits"]

    # One text in, one clip out: drop the batch dimension so Gradio receives
    # a plain sample array rather than a (1, samples) matrix.
    samples = waveform[0].cpu().float().numpy()

    # Prefer the rate the model was trained at; 22050 Hz was the original
    # hard-coded assumption and is kept only as a last resort.
    sample_rate = getattr(model.config, "sampling_rate", 22050)

    # Gradio's audio output takes (sample_rate, data), not (data, sample_rate).
    return sample_rate, samples
# Wire the synthesis function into a Gradio UI: text input, audio output.
iface = gr.Interface(
    tts_generate,
    "text",
    "audio",
    description="Enter text to generate speech in Bulgarian.",
    title="Bulgarian TTS (Text-to-Speech)",
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|