"""Gradio front end for MediVox, a voice-and-vision "AI doctor" demo.

The app records the user's voice from the microphone, transcribes it with
``transcribe_with_groq``, sends the instruction prompt plus the transcript
together with an encoded image to ``analyze_image_with_query``, and turns the
textual reply into speech with ``text_to_speech_with_gtts``.
"""

import contextlib
import logging
import os
import tempfile
from typing import Any, Optional, Tuple

import gradio as gr
import soundfile as sf  # For audio handling
from dotenv import load_dotenv

from brain import encode_image, analyze_image_with_query
from patientvoice import record_audio, transcribe_with_groq
from doctorvoice import text_to_speech_with_gtts

# Load environment variables (GROQ_API_KEY is read from the environment below).
load_dotenv()

logger = logging.getLogger(__name__)

# Model identifiers passed through to the helper modules.
# NOTE(review): "preview" model ids tend to be retired by providers; confirm
# that this vision model is still served before deploying.
STT_MODEL = "whisper-large-v3"
VISION_MODEL = "llama-3.2-11b-vision-preview"

# Path handed to the text-to-speech helper for its output.
# NOTE(review): this path is shared by every request, so concurrent users can
# overwrite each other's audio. Left unchanged because the helper's handling
# of ``output_filepath`` is not visible from this file.
TTS_OUTPUT_PATH = "final.mp3"

# Instruction prompt sent to the vision model ahead of the patient's words.
system_prompt = (
    "You have to act as a professional doctor, i know you are not but this "
    "is for learning purpose. What's in this image?. Do you find anything "
    "wrong with it medically? If you make a differential, suggest some "
    "remedies for them. Donot add any numbers or special characters in your "
    "response. Your response should be in one long paragraph. Also always "
    "answer as if you are answering to a real person. Donot say 'In the "
    "image I see' but say 'With what I see, I think you have ....' Dont "
    "respond as an AI model in markdown, your answer should mimic that of an "
    "actual doctor not an AI bot, Keep your answer concise (max 2 "
    "sentences). No preamble, start your answer right away please"
)


def _transcribe_audio(audio_data: Optional[Tuple[int, Any]]) -> Tuple[str, str]:
    """Transcribe microphone audio.

    Args:
        audio_data: ``(sample_rate, samples)`` as delivered by the
            ``gr.Audio(type="numpy")`` input, or ``None`` when nothing was
            recorded.

    Returns:
        ``(display_text, patient_text)``. ``display_text`` is what the UI
        shows (the transcript, or a status/error message). ``patient_text``
        is the transcript to forward to the model, and is empty whenever no
        usable transcript exists so that status messages are never sent to
        the model as if the patient had said them.
    """
    if audio_data is None:
        return "No audio provided", ""

    sample_rate, samples = audio_data
    wav_path = None
    try:
        # A unique file per request: a fixed name in the working directory
        # would be clobbered by concurrent requests.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            wav_path = tmp.name
        sf.write(wav_path, samples, sample_rate)
        transcript = transcribe_with_groq(
            GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
            audio_filepath=wav_path,
            stt_model=STT_MODEL,
        )
    except Exception as e:  # UI boundary: report the failure, don't crash.
        logger.exception("Error in transcription")
        return f"Error in transcription: {e}", ""
    finally:
        if wav_path is not None:
            with contextlib.suppress(OSError):
                os.remove(wav_path)

    return transcript, transcript


def _build_query(patient_text: str) -> str:
    """Return the model query: the system prompt, then the patient's words."""
    if not patient_text:
        return system_prompt
    # Explicit separator so the prompt's last word and the transcript's first
    # word are not fused together.
    return f"{system_prompt} {patient_text}"


def process_inputs(audio_data, image_filepath):
    """Run one consultation: speech-to-text, image analysis, text-to-speech.

    Args:
        audio_data: ``(sample_rate, samples)`` from the microphone input, or
            ``None`` if the user recorded nothing.
        image_filepath: Path of the uploaded image, or a falsy value if no
            image was provided.

    Returns:
        A 3-tuple ``(speech_to_text_output, doctor_response, voice_of_doctor)``
        matching the three Gradio outputs. ``voice_of_doctor`` is whatever
        ``text_to_speech_with_gtts`` returns, or ``None`` if it raised.
        Failures in any stage are reported as text in the corresponding
        output rather than raised.
    """
    speech_to_text_output, patient_text = _transcribe_audio(audio_data)

    # Handle the image input with error handling
    if image_filepath:
        try:
            doctor_response = analyze_image_with_query(
                query=_build_query(patient_text),
                encoded_image=encode_image(image_filepath),
                model=VISION_MODEL,
            )
        except Exception as e:  # UI boundary: show the error to the user.
            logger.exception("Error in image analysis")
            doctor_response = f"Error in image analysis: {e}"
    else:
        doctor_response = "No image provided for me to analyze"

    # Generate doctor's voice with error handling
    try:
        # NOTE(review): assumes the helper returns something gr.Audio can
        # render (e.g. the output path) -- confirm against doctorvoice.
        voice_of_doctor = text_to_speech_with_gtts(
            input_text=doctor_response,
            output_filepath=TTS_OUTPUT_PATH,
        )
    except Exception:
        voice_of_doctor = None
        logger.exception("Error in text-to-speech")

    return speech_to_text_output, doctor_response, voice_of_doctor


# Create the interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="numpy", label="Speak to the Doctor"),
        gr.Image(type="filepath", label="Upload an Image (Optional)"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="Doctor's Response"),
        gr.Audio(label="Doctor's Voice"),
    ],
    title="MediVox : AI Doctor with Vision and Voice",
    css=".gradio-container {text-align: center;}",
)

# Only start the server when run as a script, so importing this module
# (e.g. to reuse ``process_inputs`` or ``iface``) has no blocking side effect.
if __name__ == "__main__":
    iface.launch()