Spaces:

hunterXdk
/

RagModels

Sleeping

App Files Files Community

RagModels / app-before.py

hunterXdk

Renamed

6a00edf verified 26 days ago

raw

history blame contribute delete

3.6 kB

	import os
	import streamlit as st
	from PyPDF2 import PdfReader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.embeddings import HuggingFaceEmbeddings

	from langchain.vectorstores import FAISS
	from langchain.chains.question_answering import load_qa_chain
	from langchain.prompts import PromptTemplate
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import torch


	# Load a Hugging Face model (e.g., LLaMA or Falcon)
	# NOTE(review): the checkpoint name below contains "embed", which suggests an
	# embedding model, yet it is loaded with AutoModelForCausalLM and used for
	# text generation in chat_with_huggingface — confirm this is a generative
	# checkpoint or swap in one.
	model_name = "mixedbread-ai/mxbai-embed-2d-large-v1" # Replace with your preferred model
	# Tokenizer and model are created once at import time and shared as module
	# globals by chat_with_huggingface.
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	# Weights are requested in float16.
	# NOTE(review): presumably intended for GPU inference — verify on the target hardware.
	model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)



	def get_pdf_text(pdf_docs) -> str:
	    """Concatenate the extracted text of every page of every PDF.

	    Args:
	        pdf_docs: Iterable of PDF sources accepted by ``PdfReader``
	            (``main`` passes the files returned by ``st.file_uploader``).

	    Returns:
	        The text of all pages joined in document and page order with no
	        separator; an empty string when ``pdf_docs`` is empty.
	    """
	    # Join once instead of growing a string with ``+=`` for every page.
	    # ``or ""`` keeps a page that yields no text from breaking the join.
	    return "".join(
	        page.extract_text() or ""
	        for pdf in pdf_docs
	        for page in PdfReader(pdf).pages
	    )

	# chunk_size = 1000, chunk_overlap = 200 (for shorter PDFs)
	def get_text_chunks(text, chunk_size=10000, chunk_overlap=1000):
	    """Split ``text`` into overlapping chunks ready for embedding.

	    Args:
	        text: The full text to split.
	        chunk_size: Value passed to the splitter as ``chunk_size``. The
	            default (10000) is the value that used to be hard-coded; a
	            smaller value such as 1000 suits shorter PDFs.
	        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
	            The default (1000) is the value that used to be hard-coded;
	            200 pairs with a ``chunk_size`` of 1000.

	    Returns:
	        The chunks returned by ``RecursiveCharacterTextSplitter.split_text``.
	    """
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )
	    return splitter.split_text(text)

	# Converting into Vector data/store (can also be stored)
	def get_vector_store(text_chunks):
	    """Embed ``text_chunks`` and save the resulting FAISS index locally.

	    The chunks are embedded with the
	    ``sentence-transformers/all-MiniLM-L6-v2`` model and the index is saved
	    under the name ``faiss_index``. Nothing is returned.
	    """
	    embedder = HuggingFaceEmbeddings(
	        model_name="sentence-transformers/all-MiniLM-L6-v2"
	    )
	    index = FAISS.from_texts(texts=text_chunks, embedding=embedder)
	    index.save_local("faiss_index")



	def chat_with_huggingface(context, query) -> str:
	    """Generate an answer to ``query`` using ``context`` as the only source.

	    Uses the module-level ``tokenizer`` and ``model``.

	    Args:
	        context: Text the answer should be drawn from.
	        query: The user's question.

	    Returns:
	        The generated answer text, without the prompt that produced it.
	    """
	    # Interpolate the arguments into the prompt. Previously the template was
	    # sent to the model with its ``{context}``/``{query}`` placeholders
	    # unfilled, so both arguments were ignored.
	    prompt = (
	        "Answer the query as detailed as possible from the provided context.\n"
	        "If the answer is not in the context, just say, "
	        '"Answer is not available in the provided documents".\n'
	        f"Context: {context}\n"
	        f"Query: {query}\n"
	        "Answer:"
	    )
	    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
	    prompt_length = inputs["input_ids"].shape[-1]
	    outputs = model.generate(
	        **inputs,
	        # Budget the answer only: ``max_length`` also counts prompt tokens,
	        # so a long context would leave no room to generate.
	        max_new_tokens=500,
	        # ``temperature`` only takes effect when sampling is enabled.
	        do_sample=True,
	        temperature=0.3,
	    )
	    # The generated sequence starts with the prompt tokens; decode only the
	    # continuation so the caller gets the answer rather than an echo.
	    answer_tokens = outputs[0][prompt_length:]
	    return tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()

	def get_conversation_chain():
	    """Return a callable that answers a question from retrieved documents.

	    The returned function expects a dict with the keys ``"input_documents"``
	    (a sequence of objects exposing ``page_content``) and ``"question"``.
	    Only the first document is used as context. The result is a dict of the
	    form ``{"output_text": <answer>}``.
	    """
	    def huggingface_chain(inputs):
	        # Context comes from the first document in the list only.
	        first_document = inputs["input_documents"][0]
	        answer = chat_with_huggingface(
	            first_document.page_content,
	            inputs["question"],
	        )
	        return {"output_text": answer}

	    return huggingface_chain

	def user_input(user_question):
	    """Answer ``user_question`` from the saved FAISS index and show the reply.

	    Loads the index saved as ``faiss_index``, retrieves the documents most
	    similar to the question, runs them through the conversation chain and
	    writes the answer to the Streamlit page.

	    Args:
	        user_question: The question typed by the user.
	    """
	    # Asking a question before any PDFs were processed used to fail inside
	    # ``FAISS.load_local``; tell the user what to do instead.
	    if not os.path.exists("faiss_index"):
	        st.warning(
	            "No processed documents found. Upload your PDFs and click "
	            "'Submit & Process' first."
	        )
	        return

	    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

	    # SECURITY NOTE: loading the index deserializes pickled data, hence the
	    # explicit opt-in flag. This is only safe while "faiss_index" is written
	    # by this app itself; never point it at an index from an untrusted source.
	    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
	    docs = new_db.similarity_search(user_question)

	    # The chain indexes the first document, so an empty result must be
	    # handled before calling it.
	    if not docs:
	        st.write("Reply: ", "Answer is not available in the provided documents")
	        return

	    chain = get_conversation_chain()
	    response = chain({"input_documents": docs, "question": user_question})

	    print(response)
	    st.write("Reply: ", response["output_text"])

	# Frontend page Processor
	def main():
	    """Render the Streamlit page: a question box and a PDF-upload sidebar.

	    A non-empty question is answered via ``user_input``. In the sidebar,
	    "Submit & Process" extracts the text of the uploaded PDFs, chunks it and
	    saves the vector index used to answer questions.
	    """
	    st.set_page_config(page_title="PDF Chatbot")
	    st.header("PDF Chatbot made for Pooja")

	    user_question = st.text_input("Puchiye kuch apne documents se:")

	    if user_question:
	        user_input(user_question)

	    with st.sidebar:
	        st.title("Menu:")
	        pdf_docs = st.file_uploader(
	            "Apne PDFs yaha pe upload karo then click on 'Process'",
	            accept_multiple_files=True,
	        )
	        if st.button("Submit & Process"):
	            if not pdf_docs:
	                # Nothing uploaded: building an index from no text would fail.
	                st.warning("Please upload at least one PDF before processing.")
	            else:
	                with st.spinner("Ruko Padh raha hu..."):
	                    raw_text = get_pdf_text(pdf_docs)
	                    text_chunks = get_text_chunks(raw_text)
	                    # Only build the index when there is something to embed
	                    # (e.g. scanned PDFs may contain no extractable text).
	                    if text_chunks:
	                        get_vector_store(text_chunks)
	                if text_chunks:
	                    st.success("Saare documents padh liya. Ab swaal pucho 😤")
	                else:
	                    st.warning("No extractable text was found in the uploaded PDFs.")


	# Script entry point: start the app only when this file is run directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()