gauravgulati619 committed on
Commit 8693c7e · verified · 1 Parent(s): f78f4fc

Update app.py

Files changed (1)
  1. app.py +50 -20
app.py CHANGED
@@ -1,16 +1,14 @@
 import os
 import gradio as gr
-import pathlib
-import torch
-import faiss
-from sentence_transformers import SentenceTransformer
-
+import soundfile as sf
 from brain import encode_image, analyze_image_with_query
-from patientvoice import record_audio, transcribe_with_groq
-from doctorvoice import text_to_speech_with_gtts, text_to_speech_with_elevenlabs
+from patientvoice import transcribe_with_groq
+from doctorvoice import text_to_speech_with_elevenlabs
 from dotenv import load_dotenv
+
+# Load environment variables
 load_dotenv()
-system_prompt="""You have to act as a professional doctor, i know you are not but this is for learning purpose.
+system_prompt = """You have to act as a professional doctor, i know you are not but this is for learning purpose.
 What's in this image?. Do you find anything wrong with it medically?
 If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
 your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
@@ -18,36 +16,68 @@ system_prompt="""You have to act as a professional doctor, i know you are not bu
 Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
 Keep your answer concise (max 5 sentences). No preamble, start your answer right away please"""
 
-
-def process_inputs(audio_filepath, image_filepath):
-    speech_to_text_output = transcribe_with_groq(GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
-                                                 audio_filepath=audio_filepath,
-                                                 stt_model="whisper-large-v3")
+def process_inputs(audio_data, image_filepath):
+    # Handle audio input from microphone
+    if audio_data is not None:
+        sample_rate, audio_array = audio_data
+        audio_filepath = "temp_audio.wav"
+        sf.write(audio_filepath, audio_array, sample_rate)
+    else:
+        audio_filepath = None
 
-    # Handle the image input
+    # Transcribe audio with error handling
+    if audio_filepath:
+        try:
+            speech_to_text_output = transcribe_with_groq(
+                GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
+                audio_filepath=audio_filepath,
+                stt_model="whisper-large-v3"
+            )
+        except Exception as e:
+            speech_to_text_output = f"Error in transcription: {str(e)}"
+    else:
+        speech_to_text_output = "No audio provided"
+
+    # Process image input with error handling
     if image_filepath:
-        doctor_response = analyze_image_with_query(query=system_prompt+speech_to_text_output, encoded_image=encode_image(image_filepath), model="llama-3.2-11b-vision-preview")
+        try:
+            doctor_response = analyze_image_with_query(
+                query=system_prompt + speech_to_text_output,
+                encoded_image=encode_image(image_filepath),
+                model="llama-3.2-11b-vision-preview"
+            )
+        except Exception as e:
+            doctor_response = f"Error in image analysis: {str(e)}"
     else:
         doctor_response = "No image provided for me to analyze"
 
-    voice_of_doctor = text_to_speech_with_elevenlabs(input_text=doctor_response, output_filepath="final.mp3")
+    # Generate doctor's voice with error handling
+    try:
+        voice_of_doctor = text_to_speech_with_elevenlabs(
+            input_text=doctor_response,
+            output_filepath="final.mp3"
+        )
+    except Exception as e:
+        voice_of_doctor = None
+        print(f"Error in text-to-speech: {str(e)}")
 
     return speech_to_text_output, doctor_response, voice_of_doctor
 
-# Create the interface
+# Define Gradio interface
 iface = gr.Interface(
     fn=process_inputs,
     inputs=[
-        gr.Audio(sources=["microphone"], type="filepath"),
-        gr.Image(type="filepath")
+        gr.Audio(sources=["microphone"], type="numpy", label="Speak to the Doctor"),
+        gr.Image(type="filepath", label="Upload an Image (Optional)")
     ],
     outputs=[
         gr.Textbox(label="Speech to Text"),
         gr.Textbox(label="Doctor's Response"),
         gr.Audio(label="Doctor's Voice")
     ],
-    title="MediVox : AI Doctor with Vision and Voice",
+    title="MediVox: AI Doctor with Vision and Voice",
     css=".gradio-container {text-align: center;}"
 )
 
-iface.launch(debug=True)
+# Launch the interface
+iface.launch()
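
For reference, here is a minimal standalone sketch (not part of the commit) of the audio path the updated process_inputs assumes: gr.Audio(type="numpy") hands the callback a (sample_rate, numpy_array) tuple, which is written to a temporary WAV with soundfile before transcription. Only the temp_audio.wav name and the sf.write call are mirrored from the diff; the synthetic tone and the round-trip check are illustrative assumptions.

import numpy as np
import soundfile as sf

# Fake one second of microphone input at 16 kHz: a quiet 440 Hz tone.
# (Real Gradio recordings typically arrive as int16 samples; float32 also works here.)
sample_rate = 16000
t = np.linspace(0, 1, sample_rate, endpoint=False)
audio_array = (0.2 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

# Shape of the payload gr.Audio(type="numpy") passes to process_inputs.
audio_data = (sample_rate, audio_array)

# Same conversion the updated process_inputs performs before calling Groq STT.
sr, arr = audio_data
sf.write("temp_audio.wav", arr, sr)

# Round-trip check: the WAV should load back with the original sample rate.
loaded, loaded_sr = sf.read("temp_audio.wav")
assert loaded_sr == sample_rate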