gauravgulati619 committed on
Commit 8693c7e · verified · 1 Parent(s): f78f4fc

Update app.py

Files changed (1)
  1. app.py +50 -20
app.py CHANGED
@@ -1,16 +1,14 @@
 import os
 import gradio as gr
-import pathlib
-import torch
-import faiss
-from sentence_transformers import SentenceTransformer
-
+import soundfile as sf
 from brain import encode_image, analyze_image_with_query
-from patientvoice import record_audio, transcribe_with_groq
-from doctorvoice import text_to_speech_with_gtts, text_to_speech_with_elevenlabs
+from patientvoice import transcribe_with_groq
+from doctorvoice import text_to_speech_with_elevenlabs
 from dotenv import load_dotenv
+
+# Load environment variables
 load_dotenv()
-system_prompt="""You have to act as a professional doctor, i know you are not but this is for learning purpose.
+system_prompt = """You have to act as a professional doctor, i know you are not but this is for learning purpose.
 What's in this image?. Do you find anything wrong with it medically?
 If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
 your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
@@ -18,36 +16,68 @@ system_prompt="""You have to act as a professional doctor, i know you are not bu
 Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
 Keep your answer concise (max 5 sentences). No preamble, start your answer right away please"""
 
-
-def process_inputs(audio_filepath, image_filepath):
-    speech_to_text_output = transcribe_with_groq(GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
-                                                 audio_filepath=audio_filepath,
-                                                 stt_model="whisper-large-v3")
+def process_inputs(audio_data, image_filepath):
+    # Handle audio input from microphone
+    if audio_data is not None:
+        sample_rate, audio_array = audio_data
+        audio_filepath = "temp_audio.wav"
+        sf.write(audio_filepath, audio_array, sample_rate)
+    else:
+        audio_filepath = None
 
-    # Handle the image input
+    # Transcribe audio with error handling
+    if audio_filepath:
+        try:
+            speech_to_text_output = transcribe_with_groq(
+                GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
+                audio_filepath=audio_filepath,
+                stt_model="whisper-large-v3"
+            )
+        except Exception as e:
+            speech_to_text_output = f"Error in transcription: {str(e)}"
+    else:
+        speech_to_text_output = "No audio provided"
+
+    # Process image input with error handling
     if image_filepath:
-        doctor_response = analyze_image_with_query(query=system_prompt+speech_to_text_output, encoded_image=encode_image(image_filepath), model="llama-3.2-11b-vision-preview")
+        try:
+            doctor_response = analyze_image_with_query(
+                query=system_prompt + speech_to_text_output,
+                encoded_image=encode_image(image_filepath),
+                model="llama-3.2-11b-vision-preview"
+            )
+        except Exception as e:
+            doctor_response = f"Error in image analysis: {str(e)}"
     else:
         doctor_response = "No image provided for me to analyze"
 
-    voice_of_doctor = text_to_speech_with_elevenlabs(input_text=doctor_response, output_filepath="final.mp3")
+    # Generate doctor's voice with error handling
+    try:
+        voice_of_doctor = text_to_speech_with_elevenlabs(
+            input_text=doctor_response,
+            output_filepath="final.mp3"
+        )
+    except Exception as e:
+        voice_of_doctor = None
+        print(f"Error in text-to-speech: {str(e)}")
 
     return speech_to_text_output, doctor_response, voice_of_doctor
 
-# Create the interface
+# Define Gradio interface
 iface = gr.Interface(
     fn=process_inputs,
     inputs=[
-        gr.Audio(sources=["microphone"], type="filepath"),
-        gr.Image(type="filepath")
+        gr.Audio(sources=["microphone"], type="numpy", label="Speak to the Doctor"),
+        gr.Image(type="filepath", label="Upload an Image (Optional)")
     ],
     outputs=[
         gr.Textbox(label="Speech to Text"),
         gr.Textbox(label="Doctor's Response"),
         gr.Audio(label="Doctor's Voice")
     ],
-    title="MediVox : AI Doctor with Vision and Voice",
+    title="MediVox: AI Doctor with Vision and Voice",
     css=".gradio-container {text-align: center;}"
 )
 
-iface.launch(debug=True)
+# Launch the interface
+iface.launch()
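
For reference, here is a minimal standalone sketch (not part of the commit) of the audio path the updated process_inputs assumes: gr.Audio(type="numpy") hands the callback a (sample_rate, numpy_array) tuple, which is written to a temporary WAV with soundfile before transcription. Only the temp_audio.wav name and the sf.write call are mirrored from the diff; the synthetic tone and the round-trip check are illustrative assumptions.

import numpy as np
import soundfile as sf

# Fake one second of microphone input at 16 kHz: a quiet 440 Hz tone.
# (Real Gradio recordings typically arrive as int16 samples; float32 also works here.)
sample_rate = 16000
t = np.linspace(0, 1, sample_rate, endpoint=False)
audio_array = (0.2 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

# Shape of the payload gr.Audio(type="numpy") passes to process_inputs.
audio_data = (sample_rate, audio_array)

# Same conversion the updated process_inputs performs before calling Groq STT.
sr, arr = audio_data
sf.write("temp_audio.wav", arr, sr)

# Round-trip check: the WAV should load back with the original sample rate.
loaded, loaded_sr = sf.read("temp_audio.wav")
assert loaded_sr == sample_rate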