# Kabatubare's picture
# Update app.py
# 903ecce verified
# raw
# history blame
# 2.65 kB
import gradio as gr
import torchaudio
import speechbrain
from speechbrain.pretrained import EncoderClassifier, Tacotron2, HIFIGAN, ASR
import os
import soundfile as sf
# Synthesized WAV files are written into this directory; create it up front.
os.makedirs("output_audio", exist_ok=True)

# Pretrained SpeechBrain models, loaded once at import time. Each `source`
# names the pretrained model and `savedir` the local directory passed to
# `from_hparams` for it.
encoder = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="models/encoder",
)
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="models/tacotron2",
)
hifigan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="models/hifigan",
)
# NOTE(review): confirm that `ASR` is actually exported by
# `speechbrain.pretrained` in the installed version; the ASR interface
# documented there is `EncoderDecoderASR`.
asr = ASR.from_hparams(
    source="speechbrain/asr-transformer-librispeech",
    savedir="models/asr",
)
def speech_to_text(input_audio):
    """Transcribe an audio file with the module-level ``asr`` model.

    Args:
        input_audio: Either a filesystem path, or an object that exposes the
            path as ``.name`` (a temporary-file wrapper, which is what the
            Gradio Audio input is documented to pass with ``type="file"``).

    Returns:
        Whatever ``asr.transcribe_file`` returns for that path.
    """
    # transcribe_file is handed a path, so unwrap temp-file objects; plain
    # path strings have no `.name` and pass through unchanged.
    audio_path = getattr(input_audio, "name", input_audio)
    # The previous version also decoded the file with torchaudio.load and then
    # discarded the result; that redundant decode has been removed.
    return asr.transcribe_file(audio_path)
def speech_to_speech(input_audio, target_text):
    """Synthesize ``target_text`` and save it as a WAV file.

    The input audio is loaded, resampled to 16 kHz when its sample rate
    differs, and passed through ``encoder.encode_batch``. The resulting
    embedding is handed to ``tacotron2.encode_text`` together with the text,
    and the mel output is decoded by ``hifigan.decode_batch``.

    Args:
        input_audio: Audio source accepted by ``torchaudio.load``.
        target_text: Text to synthesize.

    Returns:
        The path of the written file,
        ``"output_audio/synthesized_speech.wav"``.
    """
    encoder_rate = 16000

    # Load the reference audio and bring it to the rate used for encoding.
    signal, fs = torchaudio.load(input_audio)
    if fs != encoder_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=fs, new_freq=encoder_rate
        )
        signal = resampler(signal)
    embedding = encoder.encode_batch(signal)

    # NOTE(review): SpeechBrain documents Tacotron2.encode_text as taking only
    # the text; confirm the installed version accepts a second (embedding)
    # argument, otherwise this call raises TypeError.
    mel_spec, _mel_len, _align = tacotron2.encode_text(target_text, embedding)
    audio = hifigan.decode_batch(mel_spec)

    # Flatten to a NumPy array and write it out at 22050 Hz.
    samples = audio.squeeze().cpu().numpy()
    destination = "output_audio/synthesized_speech.wav"
    sf.write(destination, samples, 22050)
    return destination
def text_to_speech(text):
    """Synthesize ``text`` and save it as a WAV file.

    The text goes through ``tacotron2.encode_text``; the mel output is then
    decoded by ``hifigan.decode_batch`` and written at 22050 Hz.

    Args:
        text: Text to synthesize.

    Returns:
        The path of the written file, ``"output_audio/text_to_speech.wav"``.
    """
    # Only the mel output is needed; the other two results are unused.
    mel_spec, _mel_len, _align = tacotron2.encode_text(text)
    audio = hifigan.decode_batch(mel_spec)

    samples = audio.squeeze().cpu().numpy()
    destination = "output_audio/text_to_speech.wav"
    sf.write(destination, samples, 22050)
    return destination
# gr.Interface wraps ONE callable with its input/output components; it does
# not dispatch over dicts keyed by task name. Each task therefore gets its own
# Interface, and the three are grouped as tabs under a single launchable app.
# Each entry: (tab name, callback, input component(s), output component).
_TASKS = [
    (
        "Speech to Text",
        speech_to_text,
        gr.inputs.Audio(source="upload", type="file"),
        gr.outputs.Textbox(label="Transcription"),
    ),
    (
        "Text to Speech",
        text_to_speech,
        gr.inputs.Textbox(label="Text"),
        gr.outputs.Audio(type="file", label="Synthesized Speech"),
    ),
    (
        "Speech to Speech",
        speech_to_speech,
        [
            gr.inputs.Audio(source="upload", type="file"),
            gr.inputs.Textbox(label="Target Text"),
        ],
        gr.outputs.Audio(type="file", label="Synthesized Speech"),
    ),
]

# Title, description and layout are set on each per-task Interface, so every
# tab shows them.
iface = gr.TabbedInterface(
    [
        gr.Interface(
            fn=task_fn,
            inputs=task_inputs,
            outputs=task_outputs,
            title="Speech Processing App",
            description="Upload an audio file or enter text to perform various speech processing tasks.",
            layout="vertical",
        )
        for _, task_fn, task_inputs, task_outputs in _TASKS
    ],
    tab_names=[task_name for task_name, *_ in _TASKS],
)

if __name__ == "__main__":
    iface.launch()