import warnings
warnings.filterwarnings("ignore")
import os
import glob
import textwrap
import time
import langchain
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate, LLMChain
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)
class CFG:
    # LLMs
    model_name = 'llama2-13b-chat'  # wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting
    split_chunk_size = 800
    split_overlap = 0

    # embeddings
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # similar passages
    k = 6

    # paths
    Embeddings_path = 'C:/Studies/main project/codes/final/model/cse-vectordb/faiss_index_hp'
    # Output_folder = './cse-vectordb'

model_repo = 'daryl149/llama-2-7b-chat-hf'
tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    quantization_config = bnb_config,
    device_map = 'auto',
    low_cpu_mem_usage = True,
    trust_remote_code = True
)
max_len = 2048
### hugging face pipeline
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
    # do_sample = True,
    max_length = max_len,
    temperature = CFG.temperature,
    top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty
)
### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)
### download embeddings model
embeddings = HuggingFaceInstructEmbeddings(
    model_name = CFG.embeddings_model_repo,
    model_kwargs = {"device": "cuda"}
)
### load vector DB embeddings
vectordb = FAISS.load_local(
    CFG.Embeddings_path,  # from input folder
    # CFG.Output_folder + '/faiss_index_hp',  # from output folder
    embeddings,
    allow_dangerous_deserialization = True
)
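
# Optional sanity check (illustrative sketch, not from the original script): confirm the
# index loads and returns passages. The query string below is a placeholder assumption.
sample_docs = vectordb.similarity_search("sample query", k = CFG.k)
print(f"Vector DB loaded: {len(sample_docs)} passages returned for the sample query")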
prompt_template = """
Don't try to make up an answer; if you don't know, just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""
PROMPT = PromptTemplate(
    template = prompt_template,
    input_variables = ["context", "question"]
)
retriever = vectordb.as_retriever(search_type = "similarity", search_kwargs = {"k": CFG.k})
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",  # map_reduce, map_rerank, stuff, refine
    retriever = retriever,
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)
print("Hello") |