Spaces:

LPhilp1943
/

speech_2_speech_voice_cloning

Build error

App Files Files Community

LPhilp1943 committed on Mar 17, 2024

Commit

d8238c0

verified ·

1 Parent(s): c95f875

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -63

app.py CHANGED Viewed

@@ -1,72 +1,54 @@
 import os
 import sys
-import subprocess
 import gradio as gr
-import torch
-import soundfile as sf
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-import librosa
 from TTS.api import TTS
-from TTS.utils.manage import ModelManager
-def install_sentencepiece():  # Best-effort install of sentencepiece: pip first, then apt-get on POSIX.
-    try:
-        # Attempting to install sentencepiece via pip
-        subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])  # runs pip under the current interpreter
-    except subprocess.CalledProcessError:
-        # Attempt to install sentencepiece via system package manager if pip install fails
-        if os.name == "posix":
-            os.system("sudo apt-get install -y libprotobuf10 protobuf-compiler libprotobuf-dev")  # exit status is not checked
-            os.system("sudo apt-get install -y libsentencepiece-dev")  # exit status is not checked
-        else:
-            raise OSError("Automatic installation of SentencePiece is not supported on this OS")  # no fallback outside POSIX
-# Call the function to attempt installing SentencePiece
-install_sentencepiece()  # module top level: runs whenever this file is executed
-# Agreeing to Coqui TTS terms of service and setting up environment variables
 os.environ["COQUI_TOS_AGREED"] = "1"
-os.makedirs("output_audio", exist_ok=True)  # directory used by text_to_speech's default output_path
-# Initialize ASR model
-asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
-asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
-asr_model.eval()  # switch the model to evaluation mode
-# Dynamically list and select TTS model
-tts_manager = ModelManager()  # NOTE(review): not referenced again in the visible code
 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
 tts = TTS(model_name, gpu=False)  # gpu=False: CPU execution requested
-def resample_audio(input_audio_path, target_sr=16000):  # Read an audio file and return its samples at target_sr.
-    waveform, sr = sf.read(input_audio_path)  # NOTE(review): no downmix is done here — confirm inputs are mono
-    if sr != target_sr:  # resample only when the file's rate differs from the target
-        waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
-    return waveform
-def speech_to_text(input_audio_path):  # Transcribe an audio file with the module-level asr_processor / asr_model.
-    waveform = resample_audio(input_audio_path)  # uses the default target_sr of 16000
-    input_values = asr_processor(waveform, return_tensors="pt").input_values
-    with torch.no_grad():  # gradients are not needed for inference
-        logits = asr_model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)  # greedy choice along the last dimension
-    transcription = asr_processor.batch_decode(predicted_ids)[0]  # first item of the decoded batch
-    return transcription.strip()
-def text_to_speech(text, speaker_wav_path, output_path="output_audio/output.wav"):  # Synthesize text to output_path, passing speaker_wav_path as speaker_wav.
-    if not text.strip():
-        return "Empty text input."  # NOTE(review): a message string is returned where the success path returns a file path
-    tts.tts_to_file(text=text, file_path=output_path, speaker_wav=speaker_wav_path)  # NOTE(review): no language argument is passed
-    return output_path
-def speech_to_speech(input_audio, text_input=None):  # Synthesize speech using input_audio as the speaker reference.
-    speaker_wav_path = input_audio
-    if text_input is None:  # only None triggers transcription; an empty string does not
-        text_input = speech_to_text(input_audio)
-    return text_to_speech(text_input, speaker_wav_path)
-iface = gr.Interface(fn=speech_to_speech,  # inputs map positionally to (input_audio, text_input)
-                     inputs=[gr.Audio(type="filepath"), gr.Textbox(optional=True)],  # audio is passed as a file path
-                     outputs=gr.Audio())
-iface.launch()

 import os
 import sys
+from fastapi import Request
 import gradio as gr
 from TTS.api import TTS
+# Agree to Coqui TTS terms of service
 os.environ["COQUI_TOS_AGREED"] = "1"
+# Initialize TTS with the desired model
 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
 tts = TTS(model_name, gpu=False)  # module-level instance used by predict()
+tts.to("cpu")  # Use CPU for inference
+def predict(prompt, language, audio_file_path, use_mic, agree):
+    if not agree:
+        return "You must agree to the Terms & Condition!", None
+    if use_mic and not audio_file_path:
+        return "Please provide a microphone recording or disable the 'Use Microphone' option.", None
+    if len(prompt) < 2 or len(prompt) > 50000:
+        return "Prompt text length must be between 2 and 50000 characters.", None
+    speaker_wav = audio_file_path
+    output_path = "output.wav"
+    try:
+        tts.tts_to_file(text=prompt, file_path=output_path, speaker_wav=speaker_wav, language=language)
+    except Exception as e:
+        print(f"Error during TTS generation: {e}", file=sys.stderr)
+        return "An error occurred during TTS generation.", None
+    return gr.Audio(file_path=output_path), output_path
+iface = gr.Interface(
+    fn=predict,
+    inputs=[  # order matches predict's parameters: prompt, language, audio_file_path, use_mic, agree
+        gr.Textbox(label="Text Prompt"),
+        gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en"),
+        gr.Audio(label="Reference Audio", type="filepath"),  # handed to predict as a file path
+        gr.Checkbox(label="Use Microphone as Reference", value=False),
+        gr.Checkbox(label="Agree to Terms & Conditions", value=True),
+    ],
+    outputs=[gr.Audio(label="Synthesised Audio"), "text"],  # predict's return values are mapped in this order: audio, then text
+    title="XTTS Text-to-Speech",
+    description="A web interface for Coqui's TTS model to generate speech from text.",
+    examples=[
+        # Example inputs
+        ["Hello, World !", "en", "path/to/example_audio.wav", False, True],  # NOTE(review): placeholder audio path — point at a real file
+    ]
+)
+iface.launch(debug=True)