Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -10,132 +10,29 @@ from patientvoice import record_audio, transcribe_with_groq
|
|
10 |
from doctorvoice import text_to_speech_with_gtts, text_to_speech_with_elevenlabs
|
11 |
from dotenv import load_dotenv
|
12 |
load_dotenv()
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
18 |
|
19 |
-
# Check if CUDA is available
|
20 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
21 |
-
print(f"Using device: {device}")
|
22 |
-
|
23 |
-
# Initialize embeddings model
|
24 |
-
class SentenceTransformerEmbeddings(Embeddings):
|
25 |
-
def __init__(self, model_name: str, device: str = None):
|
26 |
-
self.model = SentenceTransformer(model_name, device=device)
|
27 |
-
|
28 |
-
def embed_documents(self, texts: list[str]) -> list[list[float]]:
|
29 |
-
embeddings = self.model.encode(texts, convert_to_tensor=False)
|
30 |
-
return embeddings.tolist()
|
31 |
-
|
32 |
-
def embed_query(self, text: str) -> list[float]:
|
33 |
-
embedding = self.model.encode(text, convert_to_tensor=False)
|
34 |
-
return embedding.tolist()
|
35 |
-
|
36 |
-
embeddings = SentenceTransformerEmbeddings(
|
37 |
-
model_name="sentence-transformers/all-MiniLM-L6-v2",
|
38 |
-
device=device
|
39 |
-
)
|
40 |
-
|
41 |
-
# Define vectorstore paths consistently
|
42 |
-
VECTORSTORE_DIR = "vectorstore/db_faiss"
|
43 |
-
vectorstore_path = pathlib.Path(VECTORSTORE_DIR)
|
44 |
-
|
45 |
-
# Create vectorstore directory if it doesn't exist
|
46 |
-
vectorstore_path.mkdir(parents=True, exist_ok=True)
|
47 |
-
|
48 |
-
if not (vectorstore_path / "index.faiss").exists():
|
49 |
-
print("Creating new vectorstore...")
|
50 |
-
# Load and split the PDF
|
51 |
-
loader = PyPDFLoader("medical.pdf")
|
52 |
-
documents = loader.load()
|
53 |
-
|
54 |
-
# Split documents into chunks
|
55 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
56 |
-
chunk_size=2000,
|
57 |
-
chunk_overlap=100,
|
58 |
-
length_function=len,
|
59 |
-
)
|
60 |
-
texts = text_splitter.split_documents(documents)
|
61 |
-
|
62 |
-
# Create and save the vectorstore
|
63 |
-
vectorstore = FAISS.from_documents(texts, embeddings)
|
64 |
-
|
65 |
-
# If CUDA is available, convert index to GPU
|
66 |
-
if device == "cuda":
|
67 |
-
res = faiss.StandardGpuResources() # Initialize GPU resources
|
68 |
-
index = vectorstore.index
|
69 |
-
gpu_index = faiss.index_cpu_to_gpu(res, 0, index) # Move to GPU
|
70 |
-
vectorstore.index = gpu_index
|
71 |
-
|
72 |
-
# Save the vectorstore
|
73 |
-
vectorstore.save_local(VECTORSTORE_DIR)
|
74 |
-
print("Vectorstore created and saved successfully.")
|
75 |
-
else:
|
76 |
-
print("Loading existing vectorstore...")
|
77 |
-
# Load existing vectorstore
|
78 |
-
vectorstore = FAISS.load_local(
|
79 |
-
folder_path=VECTORSTORE_DIR,
|
80 |
-
embeddings=embeddings,
|
81 |
-
allow_dangerous_deserialization=True
|
82 |
-
)
|
83 |
-
|
84 |
-
# If CUDA is available, convert loaded index to GPU
|
85 |
-
if device == "cuda":
|
86 |
-
res = faiss.StandardGpuResources() # Initialize GPU resources
|
87 |
-
index = vectorstore.index
|
88 |
-
gpu_index = faiss.index_cpu_to_gpu(res, 0, index) # Move to GPU
|
89 |
-
vectorstore.index = gpu_index
|
90 |
-
print("Vectorstore loaded successfully.")
|
91 |
-
|
92 |
-
def get_relevant_context(query):
|
93 |
-
try:
|
94 |
-
# Search the vector store for relevant documents
|
95 |
-
docs = vectorstore.similarity_search(query, k=2)
|
96 |
-
|
97 |
-
# Extract and combine the content from retrieved documents
|
98 |
-
context = "\n".join([doc.page_content for doc in docs])
|
99 |
-
|
100 |
-
return context
|
101 |
-
except Exception as e:
|
102 |
-
print(f"Error in similarity search: {e}")
|
103 |
-
return "Could not retrieve relevant context."
|
104 |
-
|
105 |
-
# Update system prompt to include retrieved context
|
106 |
-
def get_enhanced_prompt(query, context):
|
107 |
-
enhanced_prompt = f"""You have to act as a professional doctor, i know you are not but this is for learning purpose.
|
108 |
-
Use the following medical context to inform your response: {context}
|
109 |
-
What's in this image?. Do you find anything wrong with it medically?
|
110 |
-
If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
|
111 |
-
your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
|
112 |
-
Donot say 'In the image I see' but say 'With what I see, I think you have ....'
|
113 |
-
Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
|
114 |
-
Keep your answer concise (max 2 sentences). No preamble, start your answer right away please.
|
115 |
-
Question from patient: {query}"""
|
116 |
-
return enhanced_prompt
|
117 |
|
118 |
def process_inputs(audio_filepath, image_filepath):
|
119 |
speech_to_text_output = transcribe_with_groq(GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
|
120 |
audio_filepath=audio_filepath,
|
121 |
stt_model="whisper-large-v3")
|
122 |
|
123 |
-
# Get relevant context from the vector store
|
124 |
-
context = get_relevant_context(speech_to_text_output)
|
125 |
-
|
126 |
# Handle the image input
|
127 |
if image_filepath:
|
128 |
-
|
129 |
-
doctor_response = analyze_image_with_query(query=enhanced_prompt, encoded_image=encode_image(image_filepath), model="llama-3.2-11b-vision-preview")
|
130 |
else:
|
131 |
doctor_response = "No image provided for me to analyze"
|
132 |
|
133 |
-
|
134 |
-
output_filepath = "output_audio.mp3"
|
135 |
-
voice_of_doctor = text_to_speech_with_elevenlabs(input_text=doctor_response, output_filepath=output_filepath)
|
136 |
-
|
137 |
-
return speech_to_text_output, doctor_response, output_filepath
|
138 |
|
|
|
139 |
|
140 |
# Create the interface
|
141 |
iface = gr.Interface(
|
|
|
10 |
from doctorvoice import text_to_speech_with_gtts, text_to_speech_with_elevenlabs
|
11 |
from dotenv import load_dotenv
|
12 |
load_dotenv()
|
13 |
+
system_prompt="""You have to act as a professional doctor, i know you are not but this is for learning purpose.
|
14 |
+
What's in this image?. Do you find anything wrong with it medically?
|
15 |
+
If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
|
16 |
+
your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
|
17 |
+
Donot say 'In the image I see' but say 'With what I see, I think you have ....'
|
18 |
+
Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
|
19 |
+
Keep your answer concise (max 5 sentences). No preamble, start your answer right away please"""
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
def process_inputs(audio_filepath, image_filepath):
|
23 |
speech_to_text_output = transcribe_with_groq(GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
|
24 |
audio_filepath=audio_filepath,
|
25 |
stt_model="whisper-large-v3")
|
26 |
|
|
|
|
|
|
|
27 |
# Handle the image input
|
28 |
if image_filepath:
|
29 |
+
doctor_response = analyze_image_with_query(query=system_prompt+speech_to_text_output, encoded_image=encode_image(image_filepath), model="llama-3.2-11b-vision-preview")
|
|
|
30 |
else:
|
31 |
doctor_response = "No image provided for me to analyze"
|
32 |
|
33 |
+
voice_of_doctor = text_to_speech_with_elevenlabs(input_text=doctor_response, output_filepath="final.mp3")
|
|
|
|
|
|
|
|
|
34 |
|
35 |
+
return speech_to_text_output, doctor_response, voice_of_doctor
|
36 |
|
37 |
# Create the interface
|
38 |
iface = gr.Interface(
|