gauravgulati619 committed on
Commit
95841bc
0 Parent(s):

Initial commit: Complete MediVox application

Files changed (9)
  1. .gitattributes +1 -0
  2. .gitignore +23 -0
  3. README.md +60 -0
  4. app.py +153 -0
  5. brain.py +42 -0
  6. doctorvoice.py +112 -0
  7. packages.txt +11 -0
  8. patientvoice.py +57 -0
  9. requirements.txt +23 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ medical.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,23 @@
+ # Environment
+ .env
+ medenv/
+ venv/
+ __pycache__/
+
+ # Generated files
+ *.pyc
+ *.mp3
+ *.wav
+ *.jpg
+ download.jpg
+ Temp.mp3
+ final.mp3
+ patient_voice.mp3
+
+ # Large files
+ medical.pdf
+ vectorstore/
+
+ # IDE
+ .vscode/
+ .idea/
README.md ADDED
@@ -0,0 +1,60 @@
+ ---
+ title: MediVox - AI Doctor with Vision and Voice
+ emoji: 👨‍⚕️
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 4.16.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ # AI Doctor with Vision and Voice
+
+ This is an AI-powered medical assistant that can:
+ - Accept voice input from patients
+ - Analyze medical images
+ - Provide medical insights using RAG (Retrieval-Augmented Generation)
+ - Respond with natural voice output
+
+ ## Features
+
+ - Speech-to-Text using Whisper
+ - Image Analysis using Llama 3.2 Vision (via Groq)
+ - RAG using FAISS and a medical knowledge base
+ - Text-to-Speech using ElevenLabs
+ - Context-aware responses grounded in medical domain knowledge
+
+ ## Environment Variables Required
+
+ ```bash
+ GROQ_API_KEY=your_groq_api_key
+ ELEVENLABS_API_KEY=your_elevenlabs_api_key
+ ```
+
+ ## Usage
+
+ 1. Click the microphone button to record your question
+ 2. Upload or take a picture of the medical condition
+ 3. Wait for the AI doctor to analyze and respond
+ 4. Listen to the voice response or read the text output
+
+ ## Model Details
+
+ - Vision Model: Llama 3.2 11B Vision (via Groq)
+ - Speech-to-Text: Whisper Large V3
+ - Text Generation: Groq inference API
+ - Voice Generation: ElevenLabs
+ - Embeddings: sentence-transformers/all-mpnet-base-v2
+
+ ## Citation
+
+ If you use this space, please cite:
+ ```
+ @misc{medivoicebot2024,
+   author = {Your Name},
+   title = {AI Doctor with Vision and Voice},
+   year = {2024},
+   publisher = {Hugging Face Spaces},
+ }
+ ```
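
For orientation, here is a minimal sketch (not part of this commit) of how the pieces described in the README fit together outside the Gradio UI. It only uses functions defined in the files below (`transcribe_with_groq`, `encode_image`, `analyze_image_with_query`, `text_to_speech_with_elevenlabs`); the input filenames are hypothetical, `GROQ_API_KEY` and `ELEVENLABS_API_KEY` must be set, and the RAG context lookup from `app.py` is omitted for brevity.

```python
# Sketch of the MediVox pipeline wired together manually.
# Assumes GROQ_API_KEY and ELEVENLABS_API_KEY are set; "patient_voice.mp3"
# and "rash.jpg" are hypothetical example inputs.
import os

from patientvoice import transcribe_with_groq
from brain import encode_image, analyze_image_with_query
from doctorvoice import text_to_speech_with_elevenlabs

# 1) Speech to text with Whisper on Groq
question = transcribe_with_groq(
    GROQ_API_KEY=os.environ["GROQ_API_KEY"],
    audio_filepath="patient_voice.mp3",
    stt_model="whisper-large-v3",
)

# 2) Vision model answers the patient's question about the image
answer = analyze_image_with_query(
    query=question,
    encoded_image=encode_image("rash.jpg"),
    model="llama-3.2-11b-vision-preview",
)

# 3) Speak the answer with ElevenLabs and save it to final.mp3
text_to_speech_with_elevenlabs(input_text=answer, output_filepath="final.mp3")
```
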
app.py ADDED
@@ -0,0 +1,153 @@
+ import os
+ import gradio as gr
+ import pathlib
+ import torch
+ import faiss
+ from sentence_transformers import SentenceTransformer
+
+ from brain import encode_image, analyze_image_with_query
+ from patientvoice import record_audio, transcribe_with_groq
+ from doctorvoice import text_to_speech_with_gtts, text_to_speech_with_elevenlabs
+ from dotenv import load_dotenv
+ load_dotenv()
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.embeddings import Embeddings
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ # Check if CUDA is available
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {device}")
+
+ # Initialize embeddings model
+ class SentenceTransformerEmbeddings(Embeddings):
+     def __init__(self, model_name: str, device: str = None):
+         self.model = SentenceTransformer(model_name, device=device)
+
+     def embed_documents(self, texts: list[str]) -> list[list[float]]:
+         embeddings = self.model.encode(texts, convert_to_tensor=False)
+         return embeddings.tolist()
+
+     def embed_query(self, text: str) -> list[float]:
+         embedding = self.model.encode(text, convert_to_tensor=False)
+         return embedding.tolist()
+
+ embeddings = SentenceTransformerEmbeddings(
+     model_name="sentence-transformers/all-mpnet-base-v2",
+     device=device
+ )
+
+ # Define vectorstore paths consistently
+ VECTORSTORE_DIR = "vectorstore/db_faiss"
+ vectorstore_path = pathlib.Path(VECTORSTORE_DIR)
+
+ # Create vectorstore directory if it doesn't exist
+ vectorstore_path.mkdir(parents=True, exist_ok=True)
+
+ if not (vectorstore_path / "index.faiss").exists():
+     print("Creating new vectorstore...")
+     # Load and split the PDF
+     loader = PyPDFLoader("medical.pdf")
+     documents = loader.load()
+
+     # Split documents into chunks
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len,
+     )
+     texts = text_splitter.split_documents(documents)
+
+     # Create and save the vectorstore
+     vectorstore = FAISS.from_documents(texts, embeddings)
+
+     # If CUDA is available, convert index to GPU
+     if device == "cuda":
+         res = faiss.StandardGpuResources()  # Initialize GPU resources
+         index = vectorstore.index
+         gpu_index = faiss.index_cpu_to_gpu(res, 0, index)  # Move to GPU
+         vectorstore.index = gpu_index
+
+     # Save the vectorstore
+     vectorstore.save_local(VECTORSTORE_DIR)
+     print("Vectorstore created and saved successfully.")
+ else:
+     print("Loading existing vectorstore...")
+     # Load existing vectorstore
+     vectorstore = FAISS.load_local(
+         folder_path=VECTORSTORE_DIR,
+         embeddings=embeddings,
+         allow_dangerous_deserialization=True
+     )
+
+     # If CUDA is available, convert loaded index to GPU
+     if device == "cuda":
+         res = faiss.StandardGpuResources()  # Initialize GPU resources
+         index = vectorstore.index
+         gpu_index = faiss.index_cpu_to_gpu(res, 0, index)  # Move to GPU
+         vectorstore.index = gpu_index
+     print("Vectorstore loaded successfully.")
+
+ def get_relevant_context(query):
+     try:
+         # Search the vector store for relevant documents
+         docs = vectorstore.similarity_search(query, k=2)
+
+         # Extract and combine the content from retrieved documents
+         context = "\n".join([doc.page_content for doc in docs])
+
+         return context
+     except Exception as e:
+         print(f"Error in similarity search: {e}")
+         return "Could not retrieve relevant context."
+
+ # Update the system prompt to include the retrieved context
+ def get_enhanced_prompt(query, context):
+     enhanced_prompt = f"""You have to act as a professional doctor, I know you are not, but this is for learning purposes.
+     Use the following medical context to inform your response: {context}
+     What's in this image? Do you find anything wrong with it medically?
+     If you make a differential, suggest some remedies for them. Do not add any numbers or special characters in
+     your response. Your response should be in one long paragraph. Also always answer as if you are answering a real person.
+     Do not say 'In the image I see' but say 'With what I see, I think you have ....'
+     Do not respond as an AI model in markdown; your answer should mimic that of an actual doctor, not an AI bot.
+     Keep your answer concise (max 2 sentences). No preamble, start your answer right away please.
+     Question from patient: {query}"""
+     return enhanced_prompt
+
+ def process_inputs(audio_filepath, image_filepath):
+     speech_to_text_output = transcribe_with_groq(GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
+                                                  audio_filepath=audio_filepath,
+                                                  stt_model="whisper-large-v3")
+
+     # Get relevant context from the vector store
+     context = get_relevant_context(speech_to_text_output)
+
+     # Handle the image input
+     if image_filepath:
+         enhanced_prompt = get_enhanced_prompt(speech_to_text_output, context)
+         doctor_response = analyze_image_with_query(query=enhanced_prompt, encoded_image=encode_image(image_filepath), model="llama-3.2-11b-vision-preview")
+     else:
+         doctor_response = "No image provided for me to analyze"
+
+     voice_of_doctor = text_to_speech_with_elevenlabs(input_text=doctor_response, output_filepath="final.mp3")
+
+     return speech_to_text_output, doctor_response, voice_of_doctor
+
+
+ # Create the interface
+ iface = gr.Interface(
+     fn=process_inputs,
+     inputs=[
+         gr.Audio(sources=["microphone"], type="filepath"),
+         gr.Image(type="filepath")
+     ],
+     outputs=[
+         gr.Textbox(label="Speech to Text"),
+         gr.Textbox(label="Doctor's Response"),
+         gr.Audio(label="Doctor's Voice")  # plays the final.mp3 path returned by process_inputs
+     ],
+     title="AI Doctor with Vision and Voice"
+ )
+
+ iface.launch(debug=True)
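
A minimal standalone sketch (not part of this commit) of querying the FAISS index that app.py saves, useful for checking retrieval without launching the Gradio interface. It assumes vectorstore/db_faiss already exists and substitutes langchain_community's HuggingFaceEmbeddings wrapper (same underlying model) for the custom SentenceTransformerEmbeddings class, since importing app.py would also launch the UI; the query string is a hypothetical example.

```python
# Standalone retrieval check: load the saved FAISS index and run the same
# k=2 similarity search that get_relevant_context() performs in app.py.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
vectorstore = FAISS.load_local(
    folder_path="vectorstore/db_faiss",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

docs = vectorstore.similarity_search("symptoms of eczema", k=2)  # hypothetical query
context = "\n".join(doc.page_content for doc in docs)
print(context)
```
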
brain.py ADDED
@@ -0,0 +1,42 @@
+ import os
+ from dotenv import load_dotenv
+
+ # Load environment variables
+ load_dotenv()
+ GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+
+ import base64
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+ # Step 3: Set up the multimodal LLM
+ from groq import Groq
+
+ query = "Is there something wrong with my face?"
+ model = "llama-3.2-90b-vision-preview"
+
+ def analyze_image_with_query(query, model, encoded_image):
+     client = Groq()
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {
+                     "type": "text",
+                     "text": query
+                 },
+                 {
+                     "type": "image_url",
+                     "image_url": {
+                         "url": f"data:image/jpeg;base64,{encoded_image}",
+                     },
+                 },
+             ],
+         }]
+     chat_completion = client.chat.completions.create(
+         messages=messages,
+         model=model
+     )
+
+     return chat_completion.choices[0].message.content
doctorvoice.py ADDED
@@ -0,0 +1,112 @@
+ # If you don't use pipenv, uncomment the following:
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # Step 1a: Set up the Text-to-Speech (TTS) model with gTTS
+ import os
+ from gtts import gTTS
+
+ def text_to_speech_with_gtts_old(input_text, output_filepath):
+     language = "en"
+
+     audioobj = gTTS(
+         text=input_text,
+         lang=language,
+         slow=False
+     )
+     audioobj.save(output_filepath)
+
+
+ # input_text = "Hi"
+ # text_to_speech_with_gtts_old(input_text=input_text, output_filepath="gtts_testing.mp3")
+
+ # Step 1b: Set up the Text-to-Speech (TTS) model with ElevenLabs
+ import elevenlabs
+ from elevenlabs.client import ElevenLabs
+
+ ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
+
+ def text_to_speech_with_elevenlabs_old(input_text, output_filepath):
+     client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
+     audio = client.generate(
+         text=input_text,
+         voice="Emily",
+         output_format="mp3_22050_32",
+         model="eleven_turbo_v2"
+     )
+     elevenlabs.save(audio, output_filepath)
+
+ # text_to_speech_with_elevenlabs_old(input_text, output_filepath="elevenlabs_testing.mp3")
+
+ # Step 2: Play the generated audio
+ # The doctor's audio files don't play automatically once saved, so this step plays them back after generation.
+ import subprocess
+ import platform
+ from pydub import AudioSegment
+ from pydub.playback import play
+ import tempfile
+
+ def text_to_speech_with_gtts(input_text, output_filepath):
+     language = "en"
+
+     audioobj = gTTS(
+         text=input_text,
+         lang=language,
+         slow=False
+     )
+     audioobj.save(output_filepath)
+     os_name = platform.system()
+     try:
+         if os_name == "Darwin":  # macOS
+             subprocess.run(['afplay', output_filepath])
+         elif os_name == "Windows":  # Windows
+             subprocess.run(['powershell', '-c', f'(New-Object Media.SoundPlayer "{output_filepath}").PlaySync();'])
+         elif os_name == "Linux":  # Linux
+             subprocess.run(['aplay', output_filepath])  # Alternatives: 'mpg123' or 'ffplay'
+         else:
+             raise OSError("Unsupported operating system")
+     except Exception as e:
+         print(f"An error occurred while trying to play the audio: {e}")
+
+
+ # input_text = "Hi"
+ # text_to_speech_with_gtts(input_text=input_text, output_filepath="gtts_testing_autoplay.mp3")
+
+ def play_audio(file_path):
+     os_name = platform.system()
+     try:
+         if os_name == "Darwin":  # macOS
+             subprocess.run(['afplay', file_path])
+         elif os_name == "Windows":  # Windows
+             # Load the MP3 and convert it to WAV for playback
+             audio = AudioSegment.from_mp3(file_path)
+             # Create a temporary WAV file
+             with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
+                 wav_path = temp_wav.name
+                 audio.export(wav_path, format='wav')
+             # Play the WAV file
+             subprocess.run(['powershell', '-c', f'(New-Object Media.SoundPlayer "{wav_path}").PlaySync();'])
+             # Clean up the temporary file
+             os.unlink(wav_path)
+         elif os_name == "Linux":  # Linux
+             subprocess.run(['mpg123', file_path])  # Using mpg123 for MP3 playback
+         else:
+             raise OSError("Unsupported operating system")
+     except Exception as e:
+         print(f"An error occurred while trying to play the audio: {e}")
+
+ def text_to_speech_with_elevenlabs(input_text, output_filepath):
+     client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
+     audio = client.generate(
+         text=input_text,
+         voice="Aria",
+         output_format="mp3_22050_32",
+         model="eleven_turbo_v2"
+     )
+     elevenlabs.save(audio, output_filepath)
+
+     # Play the audio
+     play_audio(output_filepath)
+     return output_filepath
+
+ # text_to_speech_with_elevenlabs(input_text, output_filepath="elevenlabs_testing_autoplay.mp3")
packages.txt ADDED
@@ -0,0 +1,11 @@
+ python3-dev
+ portaudio19-dev
+ python3-pyaudio
+ ffmpeg
+ libsndfile1
+ build-essential
+ pkg-config
+ git
+ libasound2-dev
+ python3-all-dev
+ libportaudio2
patientvoice.py ADDED
@@ -0,0 +1,57 @@
+ import logging
+ import speech_recognition as sr
+ from pydub import AudioSegment
+ from io import BytesIO
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ def record_audio(file_path, timeout=20, phrase_time_limit=None):
+     """
+     Simplified function to record audio from the microphone and save it as an MP3 file.
+
+     Args:
+     file_path (str): Path to save the recorded audio file.
+     timeout (int): Maximum time to wait for a phrase to start (in seconds).
+     phrase_time_limit (int): Maximum time for the phrase to be recorded (in seconds).
+     """
+     recognizer = sr.Recognizer()
+
+     try:
+         with sr.Microphone() as source:
+             logging.info("Adjusting for ambient noise...")
+             recognizer.adjust_for_ambient_noise(source, duration=1)
+             logging.info("Start speaking now...")
+
+             # Record the audio
+             audio_data = recognizer.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
+             logging.info("Recording complete.")
+
+             # Convert the recorded audio to an MP3 file
+             wav_data = audio_data.get_wav_data()
+             audio_segment = AudioSegment.from_wav(BytesIO(wav_data))
+             audio_segment.export(file_path, format="mp3", bitrate="128k")
+
+             logging.info(f"Audio saved to {file_path}")
+
+     except Exception as e:
+         logging.error(f"An error occurred: {e}")
+
+ import os
+ from groq import Groq
+ from dotenv import load_dotenv
+
+ load_dotenv()
+ GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+ stt_model = "whisper-large-v3"
+
+ def transcribe_with_groq(stt_model, audio_filepath, GROQ_API_KEY):
+     client = Groq(api_key=GROQ_API_KEY)
+
+     with open(audio_filepath, "rb") as audio_file:
+         transcription = client.audio.transcriptions.create(
+             model=stt_model,
+             file=audio_file,
+             language="en"
+         )
+
+     return transcription.text
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ gradio
+ python-dotenv
+ groq
+ langchain
+ langchain-core
+ langchain-community
+ sentence-transformers
+ chromadb
+ PyPDF2
+ transformers
+ torch
+ torchaudio
+ SpeechRecognition
+ pydub
+ ffmpeg-python
+ gTTS
+ elevenlabs
+ faiss-cpu
+ requests
+ numpy
+ typing-inspect
+ typing_extensions
+ pypdf