|
import torch |
|
from transformers import VitsModel, AutoTokenizer |
|
import numpy as np |
|
import scipy.io.wavfile as wavfile |
|
import gradio as gr |
|
|
|
_TTS_CHECKPOINT = "facebook/mms-tts-eng"

# Lazily populated cache so the checkpoint is loaded once per process rather
# than on every request.
_TTS_CACHE = {}


def _load_tts():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use."""
    if "model" not in _TTS_CACHE:
        # Store the tokenizer first: the presence of "model" is the readiness
        # flag, so both entries exist whenever the check above passes.
        _TTS_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)
        _TTS_CACHE["model"] = VitsModel.from_pretrained(_TTS_CHECKPOINT)
    return _TTS_CACHE["model"], _TTS_CACHE["tokenizer"]


def _waveform_to_wav_bytes(waveform, sampling_rate):
    """Peak-normalise a waveform tensor and encode it as 16-bit PCM WAV bytes."""
    import io

    samples = waveform.squeeze().cpu().numpy()

    # Guard the division: an all-zero (silent) or empty waveform has no peak
    # to normalise against and would otherwise turn into NaNs.
    peak = np.abs(samples).max() if samples.size else 0.0
    if peak > 0:
        samples = samples / peak

    pcm = np.int16(samples * 32767)

    # Encode in memory instead of going through a fixed file on disk, so
    # overlapping requests cannot overwrite each other's audio.
    buffer = io.BytesIO()
    wavfile.write(buffer, rate=sampling_rate, data=pcm)
    return buffer.getvalue()


def yes(texte):
    """Synthesise speech for ``texte`` and return it as WAV file bytes.

    Args:
        texte: The text to convert to speech.

    Returns:
        bytes: The content of a 16-bit PCM WAV file, written at the sampling
        rate declared in the model configuration.
    """
    model, tokenizer = _load_tts()
    inputs = tokenizer(texte, return_tensors="pt")

    # Inference only; no gradients are needed.
    with torch.no_grad():
        waveform = model(**inputs).waveform

    return _waveform_to_wav_bytes(waveform, model.config.sampling_rate)
|
|
|
# Expose `yes` through a Gradio interface with a text input and an audio
# output, then start the app.
text = gr.Interface(
    fn=yes,
    inputs="text",
    outputs="audio",
)
text.launch(debug=True)
|
|
|
|