Spaces:

unijoh
/

metaambod

Runtime error

File size: 1,567 Bytes

ba0da04
 
41d6250
ba0da04
 
1180f3c
19401eb
 
 
ba0da04
 
 
 
 
 
 
 
 
0083046
ba0da04
58f6d57
 
0083046
 
 
 
 
ba0da04
 
 
 
 
 
 
0083046
ba0da04
a90990e
ba0da04
 
0083046
 
 
ba0da04
0083046
 
 
58f6d57
ba0da04
58f6d57

import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
import logging
import numpy as np
import soundfile as sf

# Configure the root logger once, at DEBUG level, before anything else logs.
logging.basicConfig(level=logging.DEBUG)

# Checkpoint identifier shared by the processor and the model.
MODEL_ID = "microsoft/speecht5_tts"

# Load the processor and model a single time at import; a failure is logged
# and then re-raised so the module does not come up half-initialised.
try:
    processor, model = (
        SpeechT5Processor.from_pretrained(MODEL_ID),
        SpeechT5ForTextToSpeech.from_pretrained(MODEL_ID),
    )
    logging.info("Model and processor loaded successfully.")
except Exception as load_error:
    logging.error(f"Error loading model or processor: {load_error}")
    raise

def synthesize_speech(
    text,
    speaker_embeddings=None,
    vocoder=None,
    output_path="output.wav",
):
    """Synthesize ``text`` with the module-level model and write it to a WAV file.

    The text is tokenised with the module-level ``processor``, passed to
    ``model.generate`` on CUDA when available (CPU otherwise), and the result
    is flattened, clipped to [-1, 1] and written with ``soundfile`` at 16 kHz.

    Args:
        text: Text to speak. Anything that is not a non-blank string is
            rejected.
        speaker_embeddings: Optional speaker embedding tensor forwarded to
            ``model.generate``. NOTE(review): the transformers SpeechT5
            documentation describes this as required for text-to-speech
            generation; with the default ``None`` the call is made exactly as
            before and is expected to fail (and return ``None``) on releases
            that enforce it.
        vocoder: Optional vocoder module forwarded to ``model.generate``. It
            is moved to the selected device in place. NOTE(review): per the
            SpeechT5 documentation, without a vocoder ``generate`` returns a
            mel spectrogram rather than a waveform, so the written file will
            not be intelligible audio unless one is supplied.
        output_path: Destination of the audio file. Defaults to the original
            fixed ``"output.wav"``; pass a unique path when several requests
            may run concurrently.

    Returns:
        ``output_path`` on success, or ``None`` if the input was rejected or
        any step of synthesis failed (the failure is logged with traceback).
    """
    # Reject None / non-string / blank input explicitly instead of relying on
    # an AttributeError being swallowed by the broad handler below.
    if not isinstance(text, str) or not text.strip():
        logging.error("Text input is empty.")
        return None

    try:
        inputs = processor(text, return_tensors="pt")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        inputs = inputs.to(device)

        # Only forward the optional arguments when the caller supplied them,
        # so the default call is identical to the original one.
        generate_kwargs = {}
        if speaker_embeddings is not None:
            generate_kwargs["speaker_embeddings"] = speaker_embeddings.to(device)
        if vocoder is not None:
            generate_kwargs["vocoder"] = vocoder.to(device)

        with torch.no_grad():
            speech = model.generate(**inputs, **generate_kwargs)

        logging.info("Speech generated successfully.")

        waveform = speech.cpu().numpy().flatten()
        # Clip (not rescale) samples into the [-1, 1] range.
        waveform = np.clip(waveform, -1.0, 1.0)

        # Write a WAV file whose path the caller (e.g. Gradio) can serve.
        sf.write(output_path, waveform, 16000)
        return output_path
    except Exception as e:
        # Boundary handler: callers get None rather than an exception, but
        # the full traceback is kept in the log for diagnosis.
        logging.exception(f"Error during speech synthesis: {e}")
        return None