Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,16 +1,14 @@
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
-
import
|
4 |
-
import torch
|
5 |
-
import faiss
|
6 |
-
from sentence_transformers import SentenceTransformer
|
7 |
-
|
8 |
from brain import encode_image, analyze_image_with_query
|
9 |
-
from patientvoice import
|
10 |
-
from doctorvoice import
|
11 |
from dotenv import load_dotenv
|
|
|
|
|
12 |
load_dotenv()
|
13 |
-
system_prompt="""You have to act as a professional doctor, i know you are not but this is for learning purpose.
|
14 |
What's in this image?. Do you find anything wrong with it medically?
|
15 |
If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
|
16 |
your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
|
@@ -18,36 +16,68 @@ system_prompt="""You have to act as a professional doctor, i know you are not bu
|
|
18 |
Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
|
19 |
Keep your answer concise (max 5 sentences). No preamble, start your answer right away please"""
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
#
|
28 |
if image_filepath:
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
else:
|
31 |
doctor_response = "No image provided for me to analyze"
|
32 |
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
return speech_to_text_output, doctor_response, voice_of_doctor
|
36 |
|
37 |
-
#
|
38 |
iface = gr.Interface(
|
39 |
fn=process_inputs,
|
40 |
inputs=[
|
41 |
-
gr.Audio(sources=["microphone"], type="
|
42 |
-
gr.Image(type="filepath")
|
43 |
],
|
44 |
outputs=[
|
45 |
gr.Textbox(label="Speech to Text"),
|
46 |
gr.Textbox(label="Doctor's Response"),
|
47 |
gr.Audio(label="Doctor's Voice")
|
48 |
],
|
49 |
-
title="MediVox
|
50 |
css=".gradio-container {text-align: center;}"
|
51 |
)
|
52 |
|
53 |
-
|
|
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
+
import soundfile as sf
|
|
|
|
|
|
|
|
|
4 |
from brain import encode_image, analyze_image_with_query
|
5 |
+
from patientvoice import transcribe_with_groq
|
6 |
+
from doctorvoice import text_to_speech_with_elevenlabs
|
7 |
from dotenv import load_dotenv
|
8 |
+
|
9 |
+
# Load environment variables
|
10 |
load_dotenv()
|
11 |
+
system_prompt = """You have to act as a professional doctor, i know you are not but this is for learning purpose.
|
12 |
What's in this image?. Do you find anything wrong with it medically?
|
13 |
If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
|
14 |
your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
|
|
|
16 |
Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
|
17 |
Keep your answer concise (max 5 sentences). No preamble, start your answer right away please"""
|
18 |
|
19 |
+
def process_inputs(audio_data, image_filepath):
    """Run the full patient -> doctor pipeline for one Gradio request.

    Args:
        audio_data: Gradio microphone input as a ``(sample_rate, numpy_array)``
            tuple, or ``None`` when nothing was recorded.
        image_filepath: Path of the uploaded image file, or ``None``.

    Returns:
        Tuple ``(speech_to_text_output, doctor_response, voice_of_doctor)``:
        the transcript (or an error/placeholder string), the model's answer
        (or an error/placeholder string), and the synthesized speech result
        (``None`` when text-to-speech failed).
    """
    # Persist the raw microphone samples to a WAV file, since the
    # transcription API expects a file path rather than an in-memory array.
    # NOTE(review): the fixed filename means concurrent requests would
    # clobber each other's audio — consider tempfile for unique names.
    if audio_data is not None:
        sample_rate, audio_array = audio_data
        audio_filepath = "temp_audio.wav"
        sf.write(audio_filepath, audio_array, sample_rate)
    else:
        audio_filepath = None

    # Transcribe the audio, reporting failures as text instead of crashing
    # the request.
    if audio_filepath:
        try:
            speech_to_text_output = transcribe_with_groq(
                GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
                audio_filepath=audio_filepath,
                stt_model="whisper-large-v3"
            )
        except Exception as e:
            speech_to_text_output = f"Error in transcription: {str(e)}"
        finally:
            # Bug fix: the temp WAV used to be left on disk after every
            # request; delete it once transcription has been attempted.
            try:
                os.remove(audio_filepath)
            except OSError:
                pass
    else:
        speech_to_text_output = "No audio provided"

    # Analyze the image (if any), folding the transcript into the prompt so
    # the model sees the patient's spoken question.
    if image_filepath:
        try:
            doctor_response = analyze_image_with_query(
                query=system_prompt + speech_to_text_output,
                encoded_image=encode_image(image_filepath),
                model="llama-3.2-11b-vision-preview"
            )
        except Exception as e:
            doctor_response = f"Error in image analysis: {str(e)}"
    else:
        doctor_response = "No image provided for me to analyze"

    # Synthesize the doctor's reply; fall back to no audio on failure so the
    # text response is still returned to the UI.
    try:
        voice_of_doctor = text_to_speech_with_elevenlabs(
            input_text=doctor_response,
            output_filepath="final.mp3"
        )
    except Exception as e:
        voice_of_doctor = None
        print(f"Error in text-to-speech: {str(e)}")

    return speech_to_text_output, doctor_response, voice_of_doctor
|
65 |
|
66 |
+
# Build the Gradio UI: microphone audio plus an optional image go in;
# the transcript, the doctor's text answer and its spoken version come out.
mic_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak to the Doctor")
image_input = gr.Image(type="filepath", label="Upload an Image (Optional)")

transcript_box = gr.Textbox(label="Speech to Text")
response_box = gr.Textbox(label="Doctor's Response")
voice_player = gr.Audio(label="Doctor's Voice")

iface = gr.Interface(
    fn=process_inputs,
    inputs=[mic_input, image_input],
    outputs=[transcript_box, response_box, voice_player],
    title="MediVox: AI Doctor with Vision and Voice",
    css=".gradio-container {text-align: center;}",
)

# Start the web app.
iface.launch()
|