# Digital_Detectives_RAG_System / kaggle notebook.py
# Tesneem's picture
# Update kaggle notebook.py
# f6eb1bd verified
#RAG Code written by Farhikhta Farzan
#MONGODB created by Farhikhta Farzan
#Documents and research gathered by Keira James, Farhikhta Farzan, and Tesneem Essa
# Import libraries.
# Gradio.
import gradio as gr
# File loading and environment variables.
import os
import sys
# Gradio.
from gradio.themes.base import Base
# HuggingFace LLM.
from huggingface_hub import InferenceClient
# Langchain.
from langchain.document_loaders import TextLoader
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
# MongoDB.
from pymongo import MongoClient
# Function type hints.
from typing import Dict, Any
#Secrets
from kaggle_secrets import UserSecretsClient
# Directory holding the dataset files mounted into the Kaggle session.
directory_path = "/kaggle/input/rag-dataset/RAG"
sys.path.append(directory_path)
print("sys.path =", sys.path)

# Raw directory listing (entry names only). The bare expression displays the
# listing when it is the last line of a notebook cell.
my_txts = os.listdir(directory_path)
my_txts

# Build one TextLoader per regular file. os.listdir() also returns
# sub-directories, which cannot be opened as text files, so those entries are
# skipped here instead of failing later when the loaders are run.
loaders = []
for my_txt in my_txts:
    my_txt_path = os.path.join(directory_path, my_txt)
    if not os.path.isfile(my_txt_path):
        continue
    loaders.append(TextLoader(my_txt_path))

print("len(loaders) =", len(loaders))
loaders
# Run every loader; `data` holds one load() result per input file.
data = [txt_loader.load() for txt_loader in loaders]
print("len(data) =", len(data), "\n")

# Show what was loaded from the first file (notebook cell display).
data[0]
# Splitter configured with chunk_size=500 and chunk_overlap=50.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Split each file's documents separately: `docs` is a list of chunk lists,
# one inner list per entry of `data`.
docs = [text_splitter.split_documents(file_docs) for file_docs in data]

# Flatten the per-file chunk lists into a single list, preserving order.
merged_documents = [chunk for file_chunks in docs for chunk in file_chunks]

# Report how many chunks were produced and dump them for inspection.
print("len(merged_documents) =", len(merged_documents))
print(merged_documents)
# Read the MongoDB Atlas connection string from the Kaggle secret store and
# open a client for the cluster.
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
MONGO_URI = user_secrets.get_secret("MONGO_URI")
cluster = MongoClient(MONGO_URI)

# Database, collection and vector-search index names.
DB_NAME = "files"
COLLECTION_NAME = "files_collection"
vector_search_index = "vector_index"

# Handle to the specific collection inside the database.
database = cluster[DB_NAME]
MONGODB_COLLECTION = database[COLLECTION_NAME]
from kaggle_secrets import UserSecretsClient

# Hugging Face API token, stored as the Kaggle secret named "hugging_face".
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("hugging_face")

# Embedding client for the all-mpnet-base-v2 sentence-transformers model,
# authenticated with the token above.
embedding_model = HuggingFaceInferenceAPIEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    api_key=HF_TOKEN,
)
# Reference only: the call that was used to populate the MongoDB collection.
# It is kept disabled here (presumably so the chunks are not inserted again).
#
#   vector_search = MongoDBAtlasVectorSearch.from_documents(
#       documents=merged_documents,
#       embedding=embedding_model,
#       collection=MONGODB_COLLECTION,
#       index_name=vector_search_index,
#   )

# Attach to the existing collection; the namespace is "<database>.<collection>".
namespace = ".".join((DB_NAME, COLLECTION_NAME))
vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    connection_string=MONGO_URI,
    namespace=namespace,
    embedding=embedding_model,
    index_name=vector_search_index,
)
# Sanity check of the vector store: fetch the 25 chunks most similar to a
# sample question and print them.
query = "why EfficientNetB0?"
results = vector_search.similarity_search(query, k=25)

print("\n")
print(results)
# Maximum number of chunks the retriever returns per query.
k = 10

# Intended minimum relevance score (0.80) for a chunk to be used.
# NOTE(review): with search_type="similarity" this value is only forwarded as
# an extra search kwarg and may not filter anything; LangChain documents score
# thresholding under search_type="similarity_score_threshold" — confirm before
# relying on it.
score_threshold = 0.80

# Build the retriever on top of the vector store.
# Available search types: similarity, mmr, similarity_score_threshold.
# https://api.python.langchain.com/en/latest/vectorstores/langchain_core.vectorstores.VectorStore.html#langchain_core.vectorstores.VectorStore.as_retriever
retriever_1 = vector_search.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=k, score_threshold=score_threshold),
)
# Client used for the chat-completion calls, authenticated with HF_TOKEN.
hf_client = InferenceClient(api_key=HF_TOKEN)

# Prompt text with two placeholders: {context} (retrieved chunks) and
# {question} (the user's query).
_PROMPT_TEMPLATE = """Use the following pieces of context to answer the question at the end.
START OF CONTEXT:
{context}
END OF CONTEXT:
START OF QUESTION:
{question}
END OF QUESTION:
If you do not know the answer, just say that you do not know.
NEVER assume things.
"""

prompt = PromptTemplate.from_template(_PROMPT_TEMPLATE)
def format_docs(docs):
    """Join the text of the given documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values, in input order, separated by one blank
        line (``"\\n\\n"``). An empty input yields an empty string.
    """
    contents = [item.page_content for item in docs]
    return "\n\n".join(contents)
def generate_response(input_dict: Dict[str, Any]) -> str:
    """Ask the chat model to answer a question using the retrieved context.

    Args:
        input_dict: Values for the placeholders of the module-level ``prompt``
            template; it must also contain a ``"question"`` entry, which is
            sent as the user message.

    Returns:
        The message content of the first choice in the model's response.
    """
    # The filled-in template (context + question + instructions) is sent as
    # the system message; the bare question is repeated as the user message.
    system_message = {"role": "system", "content": prompt.format(**input_dict)}
    user_message = {"role": "user", "content": input_dict["question"]}

    # This is the LLM call.
    completion = hf_client.chat.completions.create(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        messages=[system_message, user_message],
        max_tokens=1000,
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
# First stage of the chain: from the incoming query build a mapping with
#   "context"  -> retrieved chunks joined into one string by format_docs
#   "question" -> the query itself, passed through unchanged
chain_inputs = {
    "context": retriever_1 | RunnableLambda(format_docs),
    "question": RunnablePassthrough(),
}

# Second stage: hand that mapping to generate_response to obtain the answer.
rag_chain = chain_inputs | RunnableLambda(generate_response)
# Sample question 1: run it through the RAG chain and show the exchange.
query = "what is scaling?"
answer = rag_chain.invoke(query)

print("\nQuestion:", query)
print("Answer:", answer)

# Retrieve the chunks matched for this question; they are kept in `documents`
# but not printed in this cell.
documents = retriever_1.invoke(query)
from pprint import pprint

# Sample question 2: run it through the RAG chain and show the exchange.
query = "How the GUI was implemented?"
answer = rag_chain.invoke(query)
print("\nQuestion:", query)
print("Answer:", answer)

# Get source documents related to the query.
documents = retriever_1.invoke(query)
print("\nSource documents:")
# Print the chunks retrieved for THIS query. `results` must not be used here:
# it still holds the hits of the earlier "why EfficientNetB0?" search.
pprint(documents)
# Sample question 3: same question again, this time printing the retrieved
# chunks as plain text instead of as document objects.
query = "How the GUI was implemented?"
answer = rag_chain.invoke(query)

print("\nQuestion:", query)
print("Answer:", answer)

# Fetch the chunks for the query and join their text with format_docs.
documents = retriever_1.invoke(query)
formatted_docs = format_docs(documents)

print("\nSource Documents:\n", formatted_docs)