"""Retrieval-Augmented Generation (RAG) tool over a PDF indexed in Neo4j.

The module wires together:
  * a Neo4j vector index holding embedded PDF chunks,
  * OpenAI embeddings for indexing and retrieval,
  * a LangChain RAG chain (retriever -> prompt -> LLM -> string output),
and exposes the whole pipeline as a single LangChain ``Tool``.

SECURITY NOTE: the original version of this file embedded a live OpenAI API
key and a Neo4j password directly in source.  Those secrets have been removed;
all credentials are now read from environment variables
(``OPENAI_API_KEY``, ``NEO4J_URI``, ``NEO4J_USERNAME``, ``NEO4J_PASSWORD``).
The leaked credentials should be rotated immediately.
"""

from pydantic import BaseModel, Field
from langchain.tools import Tool
from langchain_community.vectorstores import Neo4jVector
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub
import os


class RAGToolConfig(BaseModel):
    """Connection and data-source settings for the RAG tool.

    Credentials default to environment variables so that no secret lives in
    source control.  ``pdf_path`` points at the document to index when no
    vector store exists yet.
    """

    NEO4J_URI: str = Field(
        default_factory=lambda: os.environ.get("NEO4J_URI", "")
    )
    NEO4J_USERNAME: str = Field(
        default_factory=lambda: os.environ.get("NEO4J_USERNAME", "neo4j")
    )
    NEO4J_PASSWORD: str = Field(
        default_factory=lambda: os.environ.get("NEO4J_PASSWORD", "")
    )
    pdf_path: str = Field(default="/mnt/d/atx/hragent/rag/Sirca_Paints.pdf")


class RAGToolImplementation:
    """Builds (or reuses) the Neo4j vector store and answers queries via RAG.

    Parameters
    ----------
    config:
        Connection settings and the PDF source path.
    llm:
        Any LangChain-compatible chat model; it is invoked by the RAG chain.
    """

    def __init__(self, config: RAGToolConfig, llm):
        self.config = config
        self.llm = llm  # LLM instance used at the generation step of the chain
        self.embedding_model = OpenAIEmbeddings()
        self.vectorstore = self._initialize_vectorstore()
        self.rag_chain = self._setup_rag_chain()

    def _initialize_vectorstore(self):
        """Return a Neo4j vector store, creating and populating it on first use.

        Strategy: optimistically connect to an existing index and probe it
        with a tiny similarity search; if anything fails (index missing,
        connection problem, ...), fall back to loading the PDF, chunking it,
        and building a fresh index from the chunks.
        """
        try:
            # Try to load an existing vector store.
            vectorstore = Neo4jVector(
                url=self.config.NEO4J_URI,
                username=self.config.NEO4J_USERNAME,
                password=self.config.NEO4J_PASSWORD,
                embedding=self.embedding_model,
                index_name="pdf_embeddings",
                node_label="PDFChunk",
                text_node_property="text",
                embedding_node_property="embedding",
            )
            # Probe query: raises if the index does not exist yet.
            vectorstore.similarity_search("Test query", k=1)
            print("Existing vector store loaded.")
        except Exception as e:
            # Broad catch is deliberate: any failure means "rebuild the index".
            print(f"Creating new vector store. Error: {e}")

            # Load and process the PDF.
            loader = PyPDFLoader(self.config.pdf_path)
            docs = loader.load()

            # Split the document into overlapping chunks; the overlap keeps
            # context that straddles chunk boundaries retrievable.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=200
            )
            splits = text_splitter.split_documents(docs)

            # Create a new vector store from the chunks.
            vectorstore = Neo4jVector.from_documents(
                documents=splits,
                embedding=self.embedding_model,
                url=self.config.NEO4J_URI,
                username=self.config.NEO4J_USERNAME,
                password=self.config.NEO4J_PASSWORD,
                index_name="pdf_embeddings",
                node_label="PDFChunk",
                text_node_property="text",
                embedding_node_property="embedding",
            )
            print("New vector store created and loaded.")

        return vectorstore

    def _setup_rag_chain(self):
        """Compose retriever -> prompt -> LLM -> string parser into one chain."""
        retriever = self.vectorstore.as_retriever()
        # Community-maintained default RAG prompt from the LangChain hub.
        prompt = hub.pull("rlm/rag-prompt")

        def format_docs(docs):
            # Join retrieved chunks into a single context string for the prompt.
            return "\n\n".join(doc.page_content for doc in docs)

        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | self.llm  # generation step
            | StrOutputParser()
        )
        return rag_chain

    def run(self, query: str) -> str:
        """Answer *query* with the RAG chain.

        Never raises: any failure is reported as an error string, which is
        the convention LangChain ``Tool.func`` callers expect.
        """
        try:
            return self.rag_chain.invoke(query)
        except Exception as e:
            return f"An error occurred while processing the query: {str(e)}"


def create_rag_tool(config: RAGToolConfig = None, llm=None):
    """Build the LangChain ``Tool`` wrapping the RAG pipeline.

    ``config`` defaults to a fresh :class:`RAGToolConfig` built per call —
    this also avoids evaluating a mutable/pydantic default at import time.
    """
    if config is None:
        config = RAGToolConfig()
    implementation = RAGToolImplementation(config, llm)
    return Tool(
        name="RAGTool",
        description=(
            "Retrieval-Augmented Generation Tool for querying PDF content "
            "about Sirca Paints"
        ),
        func=implementation.run,
    )


if __name__ == "__main__":
    # Example usage (requires OPENAI_API_KEY and NEO4J_* in the environment):
    llm = ChatOpenAI(model="gpt-4", temperature=0)
    rag_tool = create_rag_tool(llm=llm)
    result = rag_tool.run("What is spil ethics?")
    print(result)