reallynicejam committed on
Commit
a2afd3b
·
verified ·
1 Parent(s): 9e18d98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -65
app.py CHANGED
@@ -1,70 +1,18 @@
1
  import gradio as gr
2
- import numpy as np
3
- import IPython.display as ipd
4
- from pathlib import Path
5
- from fairseq import hub_utils
6
- from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
7
- from fairseq.models.speech_to_text.hub_interface import S2THubInterface
8
- from fairseq.models.text_to_speech import CodeHiFiGANVocoder
9
- from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
10
- from huggingface_hub import snapshot_download
11
- import json
12
- import sounddevice as sd
13
 
14
# Load the speech-to-unit translation model (Hokkien -> English, per the
# repo id "xm_transformer_s2ut_hk-en") from the Hugging Face Hub.
# Returns the model ensemble, its config, and the fairseq task object.
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    "facebook/xm_transformer_s2ut_hk-en",
    arg_overrides={"config_yaml": "config.yaml", "task": "speech_to_text"},
)

# Sequence generator used later to decode the model's output units.
generator = task.build_generator([models[0]], cfg)
21
-
22
# Load the text-to-speech vocoder (CodeHiFiGAN: discrete units -> waveform).
library_name = "fairseq"
cache_dir = (Path.home() / ".cache" / library_name).as_posix()
# Download the vocoder checkpoint snapshot into the local fairseq cache;
# snapshot_download returns the directory the files landed in.
cache_dir = snapshot_download(
    f"facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur",
    cache_dir=cache_dir,
    library_name=library_name,
)

# Resolve the checkpoint and its config through fairseq's hub utilities.
x = hub_utils.from_pretrained(
    cache_dir,
    "model.pt",
    ".",
    archive_map=CodeHiFiGANVocoder.hub_models(),
    config_yaml="config.json",
    fp16=False,
    is_vocoder=True,
)

# Vocoder configuration shipped alongside the downloaded checkpoint.
with open(f"{x['args']['data']}/config.json") as f:
    vocoder_cfg = json.load(f)

# NOTE(review): `assert` is stripped under `python -O`; an explicit raise
# would be more robust for this sanity check — confirm before relying on it.
assert len(x["args"]["model_path"]) == 1, "Too many vocoder models in the input"

# Wrap the vocoder in fairseq's hub interface for unit -> audio synthesis.
vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
tts_model = VocoderHubInterface(vocoder_cfg, vocoder)
48
-
49
-
50
def record_and_transcribe_synthesize():
    """Record ~5 s of audio, translate it with the S2UT model, and
    re-synthesize speech with the vocoder.

    Returns an IPython.display.Audio object wrapping the generated waveform.

    NOTE(review): sounddevice records on the *server's* default input
    device, so this cannot capture a remote web user's microphone — verify
    the intended deployment.
    NOTE(review): Gradio's "audio" output expects a filepath or a
    (sample_rate, ndarray) tuple; returning an IPython display object
    looks wrong for a Gradio handler — confirm.
    """
    # Record audio using sounddevice (blocking until `duration` elapses)
    sr = 16000  # Sample rate; presumably what the S2UT model expects — TODO confirm
    duration = 5  # Recording duration in seconds
    audio = sd.rec(int(sr * duration), samplerate=sr, channels=1, dtype=np.int16)
    sd.wait()  # block until the recording buffer is filled

    # Speech-to-Text: build a fairseq sample and decode the output units.
    # NOTE(review): get_model_input is passed the raw int16 ndarray here;
    # confirm it accepts an in-memory array rather than an audio file path.
    sample = S2THubInterface.get_model_input(task, audio)
    unit = S2THubInterface.get_prediction(task, models[0], generator, sample)

    # Text-to-Speech: vocode the predicted units back into a waveform.
    tts_sample = tts_model.get_model_input(unit)
    wav, sr = tts_model.get_prediction(tts_sample)

    return ipd.Audio(wav, rate=sr)
66
-
67
-
68
# Gradio Interface: no browser-side input (inputs=None); the handler does
# its own server-side recording via sounddevice and returns audio.
iface = gr.Interface(fn=record_and_transcribe_synthesize, inputs=None, outputs="audio")
iface.launch()
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
2
 
3
def audio_receiver(audio_data):
    """Echo handler: hand the captured microphone audio straight back.

    Gradio passes in whatever the microphone component produced; the value
    is returned unchanged so the interface simply plays it back.
    """
    return audio_data
7
+
8
# Create a Gradio Interface that echoes microphone input back as audio.
# Fix: dropped `capture_session=True` — that keyword was deprecated and then
# removed from gradio, so `gr.Interface` no longer accepts it and passing it
# raises TypeError on any recent release. Continuous capture is covered by
# the microphone input component together with `live=True`.
iface = gr.Interface(
    fn=audio_receiver,
    inputs="microphone",
    outputs="audio",
    live=True,  # re-run the handler automatically as the input changes
)
16
 
17
# Launch the Gradio Interface (starts the local web server and blocks
# while serving the UI).
iface.launch()