import warnings
warnings.filterwarnings("ignore")

import textwrap

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)

from langchain import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
class CFG:
    # LLM (the repo actually loaded below is the 7b chat model)
    model_name = 'llama2-7b-chat'  # alternatives: wizardlm, llama2-13b-chat, mistral-7B
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting (parameters used when the FAISS index was built)
    split_chunk_size = 800
    split_overlap = 0

    # embeddings
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # number of similar passages to retrieve
    k = 6

    # paths
    Embeddings_path = 'C:/Studies/main project/codes/final/model/cse-vectordb/faiss_index_hp'
    # Output_folder = './cse-vectordb'

model_repo = 'daryl149/llama-2-7b-chat-hf'
tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)

# 4-bit NF4 quantization so the model fits on a single consumer GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    quantization_config=bnb_config,
    device_map='auto',
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

max_len = 2048  # total sequence length (prompt + generated tokens)
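# Optional diagnostic (a sketch): get_memory_footprint() reports the memory the
# loaded weights occupy; with 4-bit NF4 a 7B model is typically around 4 GB.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")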
### Hugging Face pipeline
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    # do_sample = True,  # temperature/top_p only take effect when sampling is enabled
    max_length=max_len,
    temperature=CFG.temperature,
    top_p=CFG.top_p,
    repetition_penalty=CFG.repetition_penalty,
)
### LangChain wrapper around the Hugging Face pipeline
llm = HuggingFacePipeline(pipeline=pipe)
### embeddings model
# NOTE: all-MiniLM-L6-v2 is a plain sentence-transformers model, not an INSTRUCTOR
# model, but the FAISS index below must be loaded with the same embeddings class
# it was built with, so HuggingFaceInstructEmbeddings is kept here.
embeddings = HuggingFaceInstructEmbeddings(
    model_name=CFG.embeddings_model_repo,
    model_kwargs={"device": "cuda"},
)
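# Quick sanity check (a sketch): the embedding dimension must match the one the
# FAISS index was built with; all-MiniLM-L6-v2 produces 384-dimensional vectors.
print(f"Embedding dimension: {len(embeddings.embed_query('test'))}")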
### load the prebuilt FAISS vector store
vectordb = FAISS.load_local(
    CFG.Embeddings_path,  # from input folder
    # CFG.Output_folder + '/faiss_index_hp',  # from output folder
    embeddings,
    allow_dangerous_deserialization=True,  # explicit opt-in to unpickling a locally created index
)
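# Optional smoke test (a sketch; the query string is hypothetical): confirm the
# index answers similarity searches before wiring up the full chain.
for doc in vectordb.similarity_search("sample query", k=2):
    print(doc.page_content[:120])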
prompt_template = """
Don't try to make up an answer; if you don't know, just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.
{context}
Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)
# search_type is a top-level argument of as_retriever, not a search_kwarg
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": CFG.k})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # alternatives: map_reduce, map_rerank, refine
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False,
)
print("Hello") |