|
import torch |
|
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor |
|
|
|
# Checkpoint identifier handed to both ``from_pretrained`` calls below, so the
# processor and the model always come from the same source.
MODEL_ID = "microsoft/speecht5_tts"

# Loaded once, at import time, and shared by every call in this module.
# The processor is created first, then the model.
processor, model = (
    SpeechT5Processor.from_pretrained(MODEL_ID),
    SpeechT5ForTextToSpeech.from_pretrained(MODEL_ID),
)
|
|
|
def synthesize_speech(text, speaker_embeddings=None, vocoder=None):
    """Generate speech for *text* with the module-level SpeechT5 model.

    Args:
        text: The string to synthesize.
        speaker_embeddings: Optional tensor describing the target voice
            (presumably an x-vector of shape ``(1, speaker_embedding_dim)``
            -- confirm against the checkpoint's config). It is forwarded
            unchanged to ``generate_speech``; no default voice is invented
            here, so when it is ``None`` the library decides whether that is
            acceptable.
            NOTE(review): recent ``transformers`` releases appear to raise
            ``ValueError`` when no speaker embeddings are given -- verify
            against the installed version.
        vocoder: Optional vocoder module (e.g. ``SpeechT5HifiGan``). It is
            moved to the same device as the model before generation.

    Returns:
        The tensor produced by ``model.generate_speech``, moved to the CPU.
        Per the SpeechT5 documentation this is a waveform when a vocoder is
        supplied and a log-mel spectrogram otherwise.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Moving a module to the device it already lives on is a no-op, so this
    # is safe to repeat on every call.
    model.to(device)

    # The processor picks its input from keyword arguments, so the text must
    # be passed as ``text=`` rather than positionally.
    inputs = processor(text=text, return_tensors="pt").to(device)

    # Every tensor/module taking part in generation has to share one device.
    if speaker_embeddings is not None:
        speaker_embeddings = speaker_embeddings.to(device)
    if vocoder is not None:
        vocoder.to(device)

    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=speaker_embeddings,
            vocoder=vocoder,
        )

    # The output is audio (or a spectrogram), not token ids, so it must not
    # go through the tokenizer's ``decode``; return the tensor itself.
    return speech.cpu()
|
|