# Spaces: Running  (hosting-page status banner captured with the source; not part of the program)
import os
import tempfile

import gradio as gr
import soundfile as sf  # For audio handling
from dotenv import load_dotenv

from brain import encode_image, analyze_image_with_query
from patientvoice import record_audio, transcribe_with_groq
from doctorvoice import text_to_speech_with_gtts
# Load environment variables (e.g. GROQ_API_KEY, read below via os.environ)
# before any request handler runs.
load_dotenv()
# Instruction text that process_inputs places in front of the patient's
# transcribed speech when it builds the query for the vision model.
system_prompt = """You have to act as a professional doctor, i know you are not but this is for learning purpose.
What's in this image?. Do you find anything wrong with it medically?
If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
Donot say 'In the image I see' but say 'With what I see, I think you have ....'
Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
Keep your answer concise (max 2 sentences). No preamble, start your answer right away please"""
def _transcribe_microphone_audio(audio_data):
    """Transcribe one microphone recording.

    Args:
        audio_data: ``None`` or a ``(sample_rate, samples)`` pair as delivered
            by the ``gr.Audio(type="numpy")`` input.

    Returns:
        A ``(display_text, transcript)`` pair. ``display_text`` is what the
        "Speech to Text" box shows (transcript, placeholder, or error
        message). ``transcript`` is the recognised text, or ``None`` when
        there is nothing usable to pass on to the vision model.
    """
    if audio_data is None:
        return "No audio provided", None

    wav_path = None
    try:
        sample_rate, audio_array = audio_data
        # A unique file per request: a fixed name in the working directory
        # would be overwritten when two requests are handled at once.
        fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        sf.write(wav_path, audio_array, sample_rate)
        transcript = transcribe_with_groq(
            GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
            audio_filepath=wav_path,
            stt_model="whisper-large-v3",
        )
    except Exception as e:
        # UI boundary: report the failure in the textbox instead of crashing.
        return f"Error in transcription: {str(e)}", None
    finally:
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not fail the request.
                pass
    return transcript, transcript


def process_inputs(audio_data, image_filepath):
    """Run one consultation: speech-to-text, image analysis, text-to-speech.

    Args:
        audio_data: ``None`` or a ``(sample_rate, samples)`` pair from the
            microphone input.
        image_filepath: Path of the uploaded image, or a falsy value when no
            image was supplied.

    Returns:
        A 3-tuple ``(speech_to_text_output, doctor_response, voice_of_doctor)``
        matching the three Gradio outputs. ``voice_of_doctor`` is whatever
        ``text_to_speech_with_gtts`` returns, or ``None`` if it raised.
        Failures in any stage are reported as text rather than raised.
    """
    speech_to_text_output, transcript = _transcribe_microphone_audio(audio_data)

    # Handle the image input with error handling
    if image_filepath:
        try:
            # Only a real transcript is appended to the prompt; placeholder
            # and error messages are for display and are not sent to the model.
            # The space keeps the prompt's last word from fusing with the
            # first word of the transcript.
            if transcript:
                query = f"{system_prompt} {transcript}"
            else:
                query = system_prompt
            doctor_response = analyze_image_with_query(
                query=query,
                encoded_image=encode_image(image_filepath),
                model="llama-3.2-11b-vision-preview",
            )
        except Exception as e:
            doctor_response = f"Error in image analysis: {str(e)}"
    else:
        doctor_response = "No image provided for me to analyze"

    # Generate doctor's voice with error handling
    try:
        voice_of_doctor = text_to_speech_with_gtts(
            input_text=doctor_response,
            output_filepath="final.mp3",
        )
    except Exception as e:
        voice_of_doctor = None
        print(f"Error in text-to-speech: {str(e)}")

    return speech_to_text_output, doctor_response, voice_of_doctor
# Create the interface: microphone audio and an image go in; the transcript,
# the textual answer, and the spoken answer come out of process_inputs.
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="numpy", label="Speak to the Doctor"),
        gr.Image(type="filepath", label="Upload an Image (Optional)"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="Doctor's Response"),
        gr.Audio(label="Doctor's Voice"),
    ],
    title="MediVox : AI Doctor with Vision and Voice",
    css=".gradio-container {text-align: center;}",
)

iface.launch()