from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory


def load_llm():
    """
    Downloads the Q4_K_M GGUF build of ALLaM-7B-Instruct from the Hugging Face Hub
    and loads it via llama-cpp.
    """
    # 1) Download the GGUF model from Hugging Face
    model_file = hf_hub_download(
        repo_id="bartowski/ALLaM-AI_ALLaM-7B-Instruct-preview-GGUF",
        filename="ALLaM-AI_ALLaM-7B-Instruct-preview-Q4_K_M.gguf",
        local_dir="./models",
        local_dir_use_symlinks=False
    )

    # 2) Load the model with llama-cpp via LangChain's LlamaCpp wrapper
    llm = LlamaCpp(
        model_path=model_file,
        flash_attn=False,
        n_ctx=2048,    # or 4096 depending on your needs
        n_batch=512,   # or even 256 depending on your hardware
        chat_format="chatml"
    )

    return llm


def build_conversational_chain(vectorstore):
    """
    Creates a ConversationalRetrievalChain using the local llama-cpp-based LLM
    and a ConversationBufferMemory for multi-turn Q&A.
    """
    llm = load_llm()

    # Store chat history in memory so the chain can handle multi-turn conversations
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
        memory=memory,
        verbose=True
    )

    return qa_chain
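

# --- Usage sketch (illustrative, not part of the original) -------------------
# A minimal example of wiring the chain together, assuming a small FAISS index
# built with HuggingFaceEmbeddings. The embedding model name, the sample text,
# and the question below are placeholders; any vector store exposing
# .as_retriever() would work the same way.
if __name__ == "__main__":
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS

    # Build a toy vector store; in practice this would hold your document chunks.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_texts(
        ["ALLaM is a 7B instruction-tuned Arabic/English language model."],
        embedding=embeddings,
    )

    chain = build_conversational_chain(vectorstore)

    # The chain reads and updates chat_history through ConversationBufferMemory,
    # so repeated calls continue the same conversation.
    result = chain({"question": "What is ALLaM?"})
    print(result["answer"])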