reallynicejam committed on
Commit
a2afd3b
·
verified ·
1 Parent(s): 9e18d98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -65
app.py CHANGED
@@ -1,70 +1,18 @@
1
  import gradio as gr
2
- import numpy as np
3
- import IPython.display as ipd
4
- from pathlib import Path
5
- from fairseq import hub_utils
6
- from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
7
- from fairseq.models.speech_to_text.hub_interface import S2THubInterface
8
- from fairseq.models.text_to_speech import CodeHiFiGANVocoder
9
- from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
10
- from huggingface_hub import snapshot_download
11
- import json
12
- import sounddevice as sd
13
 
14
# Load the speech-to-unit translation model (Hokkien -> English, per the
# repo id "xm_transformer_s2ut_hk-en") from the Hugging Face Hub.
# Returns the model ensemble, its config, and the fairseq task object.
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    "facebook/xm_transformer_s2ut_hk-en",
    arg_overrides={"config_yaml": "config.yaml", "task": "speech_to_text"},
)

# Sequence generator used later to decode the model's output units.
generator = task.build_generator([models[0]], cfg)
21
-
22
# Load the text-to-speech vocoder (CodeHiFiGAN: discrete units -> waveform).
library_name = "fairseq"
cache_dir = (Path.home() / ".cache" / library_name).as_posix()
# Download the vocoder checkpoint snapshot into the local fairseq cache;
# snapshot_download returns the directory the files landed in.
cache_dir = snapshot_download(
    f"facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur",
    cache_dir=cache_dir,
    library_name=library_name,
)

# Resolve the checkpoint and its config through fairseq's hub utilities.
x = hub_utils.from_pretrained(
    cache_dir,
    "model.pt",
    ".",
    archive_map=CodeHiFiGANVocoder.hub_models(),
    config_yaml="config.json",
    fp16=False,
    is_vocoder=True,
)

# Vocoder configuration shipped alongside the downloaded checkpoint.
with open(f"{x['args']['data']}/config.json") as f:
    vocoder_cfg = json.load(f)

# NOTE(review): `assert` is stripped under `python -O`; an explicit raise
# would be more robust for this sanity check — confirm before relying on it.
assert len(x["args"]["model_path"]) == 1, "Too many vocoder models in the input"

# Wrap the vocoder in fairseq's hub interface for unit -> audio synthesis.
vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
tts_model = VocoderHubInterface(vocoder_cfg, vocoder)
48
-
49
-
50
def record_and_transcribe_synthesize():
    """Record ~5 s of audio, translate it with the S2UT model, and
    re-synthesize speech with the vocoder.

    Returns an IPython.display.Audio object wrapping the generated waveform.

    NOTE(review): sounddevice records on the *server's* default input
    device, so this cannot capture a remote web user's microphone — verify
    the intended deployment.
    NOTE(review): Gradio's "audio" output expects a filepath or a
    (sample_rate, ndarray) tuple; returning an IPython display object
    looks wrong for a Gradio handler — confirm.
    """
    # Record audio using sounddevice (blocking until `duration` elapses)
    sr = 16000  # Sample rate; presumably what the S2UT model expects — TODO confirm
    duration = 5  # Recording duration in seconds
    audio = sd.rec(int(sr * duration), samplerate=sr, channels=1, dtype=np.int16)
    sd.wait()  # block until the recording buffer is filled

    # Speech-to-Text: build a fairseq sample and decode the output units.
    # NOTE(review): get_model_input is passed the raw int16 ndarray here;
    # confirm it accepts an in-memory array rather than an audio file path.
    sample = S2THubInterface.get_model_input(task, audio)
    unit = S2THubInterface.get_prediction(task, models[0], generator, sample)

    # Text-to-Speech: vocode the predicted units back into a waveform.
    tts_sample = tts_model.get_model_input(unit)
    wav, sr = tts_model.get_prediction(tts_sample)

    return ipd.Audio(wav, rate=sr)
66
-
67
-
68
# Gradio Interface: no browser-side input (inputs=None); the handler does
# its own server-side recording via sounddevice and returns audio.
iface = gr.Interface(fn=record_and_transcribe_synthesize, inputs=None, outputs="audio")
iface.launch()
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
2
 
3
def audio_receiver(audio_data):
    """Echo handler: hand the captured microphone audio straight back.

    Gradio passes in whatever the microphone component produced; the value
    is returned unchanged so the interface simply plays it back.
    """
    return audio_data
7
+
8
# Create a Gradio Interface that echoes microphone input back as audio.
# Fix: dropped `capture_session=True` — that keyword was deprecated and then
# removed from gradio, so `gr.Interface` no longer accepts it and passing it
# raises TypeError on any recent release. Continuous capture is covered by
# the microphone input component together with `live=True`.
iface = gr.Interface(
    fn=audio_receiver,
    inputs="microphone",
    outputs="audio",
    live=True,  # re-run the handler automatically as the input changes
)
16
 
17
# Launch the Gradio Interface (starts the local web server and blocks
# while serving the UI).
iface.launch()