import gradio as gr

from asr import transcribe_audio  # Project ASR function: audio -> text transcription
from lid import detect_language   # Project language-ID function: audio -> language code
from tts import synthesize        # Project TTS function: (text, lang, speed) -> (audio, ...)


def process_audio(audio_data):
    """Run the full ASR -> language-ID -> TTS pipeline on one audio input.

    Args:
        audio_data: Audio from the Gradio ``Audio`` component with
            ``type="numpy"`` — presumably a ``(sample_rate, ndarray)`` tuple,
            or ``None`` when the user submits without recording.
            TODO(review): confirm the exact shape against the asr/lid APIs.

    Returns:
        A ``(generated_text, speech_output)`` tuple: the combined
        language/transcription text and the synthesized speech audio.
        Returns a message and ``None`` audio when no input was provided.
    """
    # Guard: with live=True Gradio can fire the callback before any audio
    # exists; bail out instead of crashing inside the ASR model.
    if audio_data is None:
        return "No audio received. Please record or upload audio first.", None

    # Step 1: Perform ASR (Audio-to-Text)
    transcription = transcribe_audio(audio_data)

    # Step 2: Detect language
    language = detect_language(audio_data)

    # Step 3: Generate Text Response based on ASR result (Future model generation)
    # Replace this with your model inference logic
    generated_text = f"Detected Language: {language}\n\nTranscription: {transcription}"

    # Step 4: Convert generated text into speech using TTS.
    # synthesize() returns a (speech, extra) pair; the second value is unused here.
    speech_output, _ = synthesize(text=generated_text, lang=language, speed=1.0)

    return generated_text, speech_output


# Define the Gradio Interface.
# NOTE: recent Gradio versions removed the Audio 'source' argument, so only
# type="numpy" is passed.
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="numpy"),
    outputs=[
        gr.Textbox(label="Generated Text"),
        gr.Audio(label="Generated Speech"),
    ],
    live=True,
)

if __name__ == "__main__":
    interface.launch()