from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
import chromadb
from chromadb.config import Settings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import gradio as gr
#############################################################################
model_id = "marcolorenzi98/tinyllama-enron-v1"
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
##############################################################################
model_config = transformers.AutoConfig.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    # quantization_config=bnb_config,  # 4-bit config is defined above but left disabled here
    device_map='auto'
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
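# Note: TinyLlama-style tokenizers do not always define a pad token; if the
# text-generation pipeline warns about padding, a common workaround (an
# assumption, not part of the original script) is to reuse the EOS token:
# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token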
##############################################################################
embedding = SpacyEmbeddings(model_name="en_core_web_sm")
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'Enron_case_RAG/Langchain_ChromaDB'
# load from disk
db3 = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
    collection_name="Enron_vectorstore"
)
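# Optional sanity check that the persisted collection was found on disk
# (a sketch; it assumes the vectorstore was built beforehand at the path above):
# print(f"Loaded {db3._collection.count()} vectors from {persist_directory}")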
##############################################################################
query_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto"
)
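# If answers come back truncated, a generation cap can be passed to the pipeline
# call above, e.g. max_new_tokens=256 (the value is an assumed example, not part
# of the original script).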
llm = HuggingFacePipeline(pipeline=query_pipeline)
retriever = db3.as_retriever()
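# as_retriever() defaults to plain similarity search; the number of chunks fed
# into the prompt can be tuned if needed (k=4 below is an assumed example):
# retriever = db3.as_retriever(search_kwargs={"k": 4})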
##############################################################################
def gradio_rag(query):
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        verbose=True)
    print(f"Query: {query}\n")
    time_1 = time()
    result = qa.run(query)
    time_2 = time()
    print(f"Inference time: {round(time_2 - time_1, 3)} sec.")
    print("\nResult: ", result)
    return result  # return the answer so Gradio can display it in the output textbox
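# Quick smoke test outside the UI, using the example query from the Gradio
# placeholder below (uncomment to run locally):
# print(gradio_rag("who is Sheila Chang"))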
###############################################################################
demo = gr.Interface(
    fn=gradio_rag,
    inputs=gr.Textbox(label="Please write your request here:", placeholder="example: who is Sheila Chang", lines=5),
    outputs=gr.Textbox(label="Answer:"),
    title='Tiny Llama RAG on Enron Scandal',
    description="A RAG system based on the SLM Tiny Llama, fine-tuned on the Enron scandal emails dataset.",
    allow_flagging="never"
)
demo.launch(debug=False)