# HyDE + ReRank RAG for Freight Rates

# Import modules and classes
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine import TransformQueryEngine
from langchain_nvidia_ai_endpoints import NVIDIARerank
from langchain_core.documents import Document as LangDocument
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.llms.nvidia import NVIDIA
from llama_index.embeddings.nvidia import NVIDIAEmbedding
from llama_index.core import Document as LlamaDocument
from llama_index.core import Settings
from llama_parse import LlamaParse
import streamlit as st
import os

# Read the API keys from environment variables
nvidia_api_key = os.getenv("NVIDIA_KEY")
llamaparse_api_key = os.getenv("PARSE_KEY")

# Initialize the NVIDIA LLM, reranker, and embedding clients
client = NVIDIA(
    model="meta/llama-3.1-8b-instruct",
    api_key=nvidia_api_key,
    temperature=0.2,
    top_p=0.7,
    max_tokens=1024
)

embed_model = NVIDIAEmbedding(
    model="nvidia/nv-embedqa-e5-v5",
    api_key=nvidia_api_key,
    truncate="NONE"
)

reranker = NVIDIARerank(
    model="nvidia/nv-rerankqa-mistral-4b-v3",
    api_key=nvidia_api_key,
)

# Set the NVIDIA models globally
Settings.embed_model = embed_model
Settings.llm = client

# Parse the local PDF document
parser = LlamaParse(
    api_key=llamaparse_api_key,
    result_type="markdown",
    verbose=True
)

# Resolve the dataset path relative to the script's directory
script_dir = os.path.dirname(os.path.abspath(__file__))
data_file = os.path.join(script_dir, "FreightsDataset.pdf")

# Load and parse the PDF document
documents = parser.load_data(data_file)
print("Document parsed")

# Split parsed text into chunks that fit the embedding model's input limit.
# Note: lengths are counted in characters as a rough proxy for tokens.
def split_text(text, max_chars=512):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(word)
        if current_length + word_length + 1 > max_chars:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length + 1
        else:
            current_chunk.append(word)
            current_length += word_length + 1
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Wrap each chunk in a Document. VectorStoreIndex.from_documents computes the
# embeddings itself with the configured embed_model, so there is no need to
# precompute them here (from_documents does not accept an embeddings argument).
all_documents = []
for doc in documents:
    for chunk in split_text(doc.text):
        all_documents.append(LlamaDocument(text=chunk))
print("Chunks prepared")

# Create and persist the index with NVIDIAEmbedding
index = VectorStoreIndex.from_documents(all_documents, embed_model=embed_model)
index.set_index_id("vector_index")
index.storage_context.persist("./storage")
print("Index created")

# Load the index from storage
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context, index_id="vector_index")
print("Index loaded")

# Initialize HyDEQueryTransform and TransformQueryEngine
hyde = HyDEQueryTransform(include_original=True)
query_engine = index.as_query_engine()
hyde_query_engine = TransformQueryEngine(query_engine, hyde)

# Query the index with HyDE and use the output as LLM context
def query_model_with_context(question):
    # Run the HyDE-transformed query engine; its answer serves as a
    # hypothetical document that enriches the retrieval query
    hyde_response = hyde_query_engine.query(question)
    print(f"HyDE Response: {hyde_response}")
    hyde_query = hyde_response if isinstance(hyde_response, str) else hyde_response.response

    # Use the hypothetical document to retrieve relevant nodes
    retriever = index.as_retriever(similarity_top_k=3)
    nodes = retriever.retrieve(hyde_query)
    for node in nodes:
        print(node)

    # Rerank the retrieved nodes by relevance to the original question
    ranked_documents = reranker.compress_documents(
        query=question,
        documents=[LangDocument(page_content=node.text) for node in nodes]
    )

    # Print the most relevant node and use it as context
    print(f"Most relevant node: {ranked_documents[0].page_content}")
    context = ranked_documents[0].page_content

    # Send the context and question to the NVIDIA Llama 3.1 8B model
    messages = [
        ChatMessage(role=MessageRole.SYSTEM, content=context),
        ChatMessage(role=MessageRole.USER, content=str(question))
    ]

    # client.chat returns a ChatResponse; the answer text is in message.content
    completion = client.chat(messages)
    response_text = (completion.message.content or "").strip()
    return response_text

# Streamlit UI
st.title("Chat with HyDE and Rerank RAG Freights App")
question = st.text_input("Enter a relevant question to chat with the attached FreightsDataset file:")
if st.button("Submit"):
    if question:
        st.write("**RAG Response:**")
        response = query_model_with_context(question)
        st.write(response)
    else:
        st.warning("Please enter a question.")
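
# Usage (a minimal sketch; it assumes this script is saved as app.py and that
# FreightsDataset.pdf sits next to it — both names are taken from the code
# above, not from any packaging convention):
#
#   export NVIDIA_KEY="<your NVIDIA API key>"
#   export PARSE_KEY="<your LlamaParse API key>"
#   streamlit run app.py
#
# Streamlit serves the app locally (http://localhost:8501 by default); enter a
# question about the dataset and click Submit to see the reranked RAG response.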