"""Gradio demo with text-to-speech, speech-to-text and language-ID tabs."""

import logging

import gradio as gr
import librosa

from asr import transcribe
from tts import synthesize

logger = logging.getLogger(__name__)

# Rate (Hz) the audio is resampled to when it is loaded for identification.
LID_SAMPLING_RATE = 16_000


def identify(microphone, file_upload):
    """Return the language label for a microphone recording or uploaded file.

    Args:
        microphone: Path of the microphone recording, or ``None`` if unused.
        file_upload: Path of the uploaded audio file, or ``None`` if unused.

    Returns:
        An error string when neither input is given; otherwise a mapping of
        language name to confidence. When both inputs are given, the
        microphone recording is used and the upload is ignored.

    NOTE(review): the returned mapping is hard-coded to ``{"Faroese": 1.0}``
    and the decoded audio is not fed to any model -- this looks like a stub;
    confirm before relying on the result.
    """
    if microphone is None and file_upload is None:
        return "ERROR: Provide an audio file or use the microphone."

    if microphone is not None and file_upload is not None:
        # Previously this message was returned as the result, so nothing was
        # identified even though the text promises the microphone input is
        # used. Log it and carry on with the microphone recording instead.
        logger.warning(
            "WARNING: Using microphone input. Uploaded file will be ignored."
        )

    audio_fp = microphone if microphone is not None else file_upload

    # The decoded samples are not used yet; the call is kept so that an
    # unreadable audio file still raises exactly as it did before.
    librosa.load(audio_fp, sr=LID_SAMPLING_RATE, mono=True)

    return {"Faroese": 1.0}


demo = gr.Blocks()

# Speech-to-text tab: microphone and upload inputs, plain-text output.
mms_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs="text",
    title="Speech-to-text",
    description="Transcribe audio!",
    allow_flagging="never",
)

# Text-to-speech tab: input text plus a speed slider, audio output.
mms_synthesize = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Text(label="Input text"),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Speed"),
    ],
    outputs=gr.Audio(label="Generated Audio", type="numpy"),
    title="Text-to-speech",
    description="Generate audio!",
    allow_flagging="never",
)

# Language-identification tab: same two audio inputs, single top label shown.
mms_identify = gr.Interface(
    fn=identify,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs=gr.Label(num_top_classes=1),
    title="Language Identification",
    description="Identify the language of audio!",
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [mms_synthesize, mms_transcribe, mms_identify],
        ["Text-to-speech", "Speech-to-text", "Language Identification"],
    )

# Launched at import time, as in the original script.
demo.launch()