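"""AI Doctor with Vision and Voice.

Gradio app that transcribes a patient's spoken query with Groq Whisper,
retrieves supporting context from a FAISS vectorstore built over medical.pdf,
sends the query plus an uploaded image to a vision model, and speaks the
doctor's response back with ElevenLabs text-to-speech.
"""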
import os
import gradio as gr
import pathlib
import torch
import faiss
from sentence_transformers import SentenceTransformer

from brain import encode_image, analyze_image_with_query
from patientvoice import record_audio, transcribe_with_groq
from doctorvoice import text_to_speech_with_gtts, text_to_speech_with_elevenlabs
from dotenv import load_dotenv
load_dotenv()
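# The .env file must provide GROQ_API_KEY (used for transcription below);
# the ElevenLabs helper in doctorvoice is assumed to read its own API key
# from the environment as well.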
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# LangChain Embeddings wrapper around a SentenceTransformer model
class SentenceTransformerEmbeddings(Embeddings):
    def __init__(self, model_name: str, device: str | None = None):
        self.model = SentenceTransformer(model_name, device=device)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        embeddings = self.model.encode(texts, convert_to_tensor=False)
        return embeddings.tolist()

    def embed_query(self, text: str) -> list[float]:
        embedding = self.model.encode(text, convert_to_tensor=False)
        return embedding.tolist()

embeddings = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    device=device
)

def create_vectorstore():
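    """Build the FAISS vectorstore from medical.pdf on first run, else load it."""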
    # Define vectorstore paths consistently
    VECTORSTORE_DIR = "vectorstore/db_faiss"
    vectorstore_path = pathlib.Path(VECTORSTORE_DIR)
    
    # Create vectorstore directory if it doesn't exist
    vectorstore_path.mkdir(parents=True, exist_ok=True)
    
    if not (vectorstore_path / "index.faiss").exists():
        print("Creating new vectorstore...")
        # Load and split the PDF
        loader = PyPDFLoader("medical.pdf")
        documents = loader.load()
        
        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=100,
            length_function=len,
        )
        texts = text_splitter.split_documents(documents)
        
        # Create and save the vectorstore
        vectorstore = FAISS.from_documents(texts, embeddings)
        
        # Save the vectorstore while the index is still on the CPU;
        # FAISS cannot serialize a GPU index directly
        vectorstore.save_local(VECTORSTORE_DIR)
        print("Vectorstore created and saved successfully.")
        
        # If CUDA is available, move the index to the GPU for faster search
        if device == "cuda":
            res = faiss.StandardGpuResources()  # Initialize GPU resources
            vectorstore.index = faiss.index_cpu_to_gpu(res, 0, vectorstore.index)
    else:
        print("Loading existing vectorstore...")
        # Load existing vectorstore
        vectorstore = FAISS.load_local(
            folder_path=VECTORSTORE_DIR,
            embeddings=embeddings,
            allow_dangerous_deserialization=True
        )
        
        # If CUDA is available, move the loaded index to the GPU
        if device == "cuda":
            res = faiss.StandardGpuResources()  # Initialize GPU resources
            vectorstore.index = faiss.index_cpu_to_gpu(res, 0, vectorstore.index)
        print("Vectorstore loaded successfully.")

# Build or load the vectorstore once at startup so queries can search it
vectorstore = create_vectorstore()

def get_relevant_context(query):
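    """Return up to two relevant chunks from the vectorstore, prefixed for the prompt."""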
    try:
        # Search the vector store for relevant documents
        docs = vectorstore.similarity_search(query, k=2)
        if not docs:
            return ""
        
        # Extract and combine the content from retrieved documents
        context = "\n".join([doc.page_content for doc in docs])
        return "Use the following medical context to inform your response: " + context
        
        
    except Exception as e:
        print(f"Error in similarity search: {e}")
        return "Could not retrieve relevant context."

# Update system prompt to include retrieved context
def get_enhanced_prompt(query, context):
    enhanced_prompt = f"""### **Patient Information**:  
**Patient Query**: {query}  
{context}  
"""  

    return enhanced_prompt

def process_inputs(audio_filepath, image_filepath):
    # Gradio passes None when no audio was recorded
    if not audio_filepath:
        return "No audio recorded.", "Please record a question first.", None

    speech_to_text_output = transcribe_with_groq(GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
                                                 audio_filepath=audio_filepath,
                                                 stt_model="whisper-large-v3")

    # Get relevant context from the vector store
    context = get_relevant_context(speech_to_text_output)
    
    # Handle the image input
    if image_filepath:
        enhanced_prompt = get_enhanced_prompt(speech_to_text_output, context)
        doctor_response = analyze_image_with_query(query=enhanced_prompt, encoded_image=encode_image(image_filepath), model="llama-3.2-90b-vision-preview")
    else:
        doctor_response = "No image provided for me to analyze"

    # Generate audio response and return the filepath
    output_filepath = "output_audio.mp3"
    text_to_speech_with_elevenlabs(input_text=doctor_response, output_filepath=output_filepath)

    return speech_to_text_output, doctor_response, output_filepath


# Create the interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="Doctor's Response"),
        gr.Audio(label="Doctor's Voice")
    ],
    title="AI Doctor with Vision and Voice"
)

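# Launch the Gradio UI; debug=True surfaces full tracebacks in the console.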
if __name__ == "__main__":
    iface.launch(debug=True)