```python
import os

from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory


def load_llm():
    """
    Downloads the Q4_K_M GGUF build of ALLaM-7B-Instruct-preview from the
    bartowski Hugging Face repository and loads it via llama-cpp.
    """
    # 1) Download the GGUF model from Hugging Face
    model_file = hf_hub_download(
        repo_id="bartowski/ALLaM-AI_ALLaM-7B-Instruct-preview-GGUF",
        filename="ALLaM-AI_ALLaM-7B-Instruct-preview-Q4_K_M.gguf",
        local_dir="./models",
        local_dir_use_symlinks=False
    )

    # 2) Load the model with llama-cpp via LangChain's LlamaCpp
    llm = LlamaCpp(
        model_path=model_file,
        flash_attn=False,
        n_ctx=2048,   # or 4096, depending on your needs
        n_batch=512,  # or even 256, depending on your hardware
        chat_format='chatml'
    )

    return llm
```
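Before wiring the model into a retrieval chain, it can help to smoke-test it on its own. The snippet below is an illustrative sketch, not part of the original app; the prompt is an arbitrary example.

```python
# Illustrative smoke test (assumption: just checking the model loads and
# generates; the prompt text is made up).
llm = load_llm()
print(llm("Q: What does RAG stand for in NLP? A:"))
```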
```python
def build_conversational_chain(vectorstore):
    """
    Creates a ConversationalRetrievalChain using the local llama-cpp-based LLM
    and a ConversationBufferMemory for multi-turn Q&A.
    """
    llm = load_llm()

    # We'll store chat history in memory so the chain can handle multi-turn conversations
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
        memory=memory,
        verbose=True
    )

    return qa_chain
```
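To see the chain end to end, here is a minimal usage sketch. The FAISS vector store, the sentence-transformers embedding model, and the sample text and question are all assumptions for illustration; any LangChain vector store exposing `as_retriever()` would slot in the same way.

```python
# Minimal end-to-end sketch (assumptions: FAISS + sentence-transformers
# embeddings; the indexed text and the question are made up for illustration).
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_texts(
    ["ALLaM-7B-Instruct-preview is an instruction-tuned Arabic/English model."],
    embedding=embeddings,
)

qa_chain = build_conversational_chain(vectorstore)

# Each call appends to the ConversationBufferMemory, so follow-up questions
# can refer back to earlier turns.
result = qa_chain({"question": "Which model does this app run locally?"})
print(result["answer"])
```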