import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time

import langchain
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import LLMChain, RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)


class CFG:
    # LLMs
    model_name = 'llama2-13b-chat'  # wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting
    split_chunk_size = 800
    split_overlap = 0

    # embeddings
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # similar passages
    k = 6

    # paths
    Embeddings_path = 'C:/Studies/main project/codes/final/model/cse-vectordb/faiss_index_hp'
    # Output_folder = './cse-vectordb'


### load the base model and tokenizer, quantized to 4-bit
model_repo = 'daryl149/llama-2-7b-chat-hf'

tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    quantization_config = bnb_config,
    device_map = 'auto',
    low_cpu_mem_usage = True,
    trust_remote_code = True
)

max_len = 2048

### hugging face pipeline
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
    # do_sample = True,
    max_length = max_len,
    temperature = CFG.temperature,
    top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty
)

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)

### download embeddings model
embeddings = HuggingFaceInstructEmbeddings(
    model_name = CFG.embeddings_model_repo,
    model_kwargs = {"device": "cuda"}
)

### load vector DB embeddings
vectordb = FAISS.load_local(
    CFG.Embeddings_path,  # from input folder
    # CFG.Output_folder + '/faiss_index_hp',  # from output folder
    embeddings,
    allow_dangerous_deserialization = True
)

### prompt template for the QA chain
prompt_template = """
Don't try to make up an answer; if you don't know, just say that you don't know.
Answer in the same language the question was asked in.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    template = prompt_template,
    input_variables = ["context", "question"]
)

### retriever: search_type is a top-level argument, not part of search_kwargs
retriever = vectordb.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": CFG.k}
)

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",  # map_reduce, map_rerank, stuff, refine
    retriever = retriever,
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)

print("Hello")
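
### Usage sketch (illustrative, not part of the setup above). The helper names
### `wrap_answer` and `ask` are hypothetical; they assume the chain returns a dict
### with 'result' and 'source_documents' keys, which is the standard RetrievalQA
### output when return_source_documents=True.

def wrap_answer(text, width=100):
    # preserve the model's own line breaks, wrap each line for readability
    lines = text.split('\n')
    wrapped = [textwrap.fill(line, width=width) for line in lines]
    return '\n'.join(wrapped)


def ask(query):
    # run the full retrieve-then-generate pipeline for a single question
    response = qa_chain.invoke({"query": query})  # on older LangChain versions: qa_chain({"query": query})
    print(wrap_answer(response["result"].strip()))
    # list the chunks that were stuffed into the prompt as context
    for doc in response["source_documents"]:
        print("Source:", doc.metadata.get("source", "unknown"),
              "| page:", doc.metadata.get("page", "?"))
    return response


# example call (illustrative question, uncomment to try):
# ask("What topics are covered in the indexed documents?")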