Spaces:

not-lain
/

RAG-Chatbot

Running on Zero

File size: 4,367 Bytes

07ffad3
42df98c
1b7e4b0
07ffad3
 
31d7c4a
07ffad3
 
eaca477
1b7e4b0
43ae797
07ffad3
 
31d7c4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b7e4b0
31d7c4a
 
 
 
 
1b7e4b0
31d7c4a
 
 
 
07ffad3
31d7c4a
 
 
1b7e4b0
 
42df98c
31d7c4a
 
 
 
 
 
eaca477
31d7c4a
1b7e4b0
31d7c4a
 
 
 
 
 
07ffad3
 
bc4ea4b
6f57262
31d7c4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42df98c
07ffad3
31d7c4a
 
07ffad3
31d7c4a
07ffad3
 
 
 
 
31d7c4a
07ffad3
 
 
31d7c4a
 
 
 
 
07ffad3
 
18b530b
07ffad3
 
e4b2161
 
31630cf
 
18b530b
31d7c4a
18b530b
31d7c4a
0e5560b
07ffad3
 
18b530b
ef4a283
 
 
 
 
 
 
 
 
 
21c647d
 
ef4a283
 
7449dcb
 
8b048b4
ef4a283
18b530b

import gradio as gr
from datasets import load_dataset

import os
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
import torch
from threading import Thread
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import time

# Hugging Face access token, read from the environment; a missing HF_TOKEN
# raises KeyError at import time. It is passed to the tokenizer/model loaders below.
token = os.environ["HF_TOKEN"]
# Sentence-embedding model that search() uses to embed incoming queries.
ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

# Load the "embedded" revision of the dataset.
# NOTE(review): presumably its "embeddings" column was produced with the same
# embedding model as ST — confirm against the dataset card.
dataset = load_dataset("not-lain/wikipedia",revision = "embedded")

# Only the "train" split is searched; the FAISS index is registered under the
# name "embeddings", which is the name search() queries.
data = dataset["train"]
data = data.add_faiss_index("embeddings") # column name that has the embeddings of the dataset


# Checkpoint used for answer generation; loaded below with the HF token.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# use quantization to lower GPU usage
# (4-bit NF4 weights with double quantization; computation runs in bfloat16)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id,token=token)
# device_map="auto" leaves device placement to the library; talk() moves its
# inputs to model.device before generating.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
    token=token
)
# Token ids passed as eos_token_id to generate(): the tokenizer's EOS token
# plus the id of the "<|eot_id|>" token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# System message that talk() places before the user message in every request.
SYS_PROMPT = """You are an assistant for answering questions.
You are given the extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "I do not know." Don't make up an answer."""



def search(query: str, k: int = 3):
    """Embed ``query`` and fetch its ``k`` nearest rows from the indexed dataset.

    The query is encoded with the module-level sentence-embedding model ``ST``
    and compared against the "embeddings" index registered on ``data``.

    Returns:
        A ``(scores, retrieved_examples)`` tuple as produced by
        ``data.get_nearest_examples``.
    """
    query_vector = ST.encode(query)
    distances, matches = data.get_nearest_examples("embeddings", query_vector, k=k)
    return distances, matches

def format_prompt(prompt,retrieved_documents,k):
    """Build the user message from the question and the retrieved passages.

    Args:
        prompt: the user's question.
        retrieved_documents: mapping whose "text" entry is a sequence of
            passage strings (as returned by the retrieval step).
        k: maximum number of passages to include.

    Returns:
        ``"Question:<prompt>\\nContext:"`` followed by each included passage
        terminated by a newline.
    """
    # Slice instead of indexing range(k): retrieval may return fewer than k
    # passages, and indexing past the end would raise IndexError.
    passages = retrieved_documents["text"][:max(k, 0)]
    context = "".join(f"{passage}\n" for passage in passages)
    return f"Question:{prompt}\nContext:{context}"


@spaces.GPU
def talk(prompt,history=None):
    """Stream a retrieval-augmented answer to ``prompt``.

    The most similar passage is retrieved for the prompt, combined with the
    question into a user message, and sent to the model together with the
    system prompt. Generation runs in a background thread and its output is
    relayed through a ``TextIteratorStreamer``.

    Args:
        prompt: the user's message.
        history: earlier chat turns supplied by the chat UI. It is accepted
            for interface compatibility but not used; every request is
            answered from the current prompt alone.

    Yields:
        The answer text accumulated so far, growing with each streamed chunk.
    """
    k = 1 # number of retrieved documents
    _scores, retrieved_documents = search(prompt, k)
    formatted_prompt = format_prompt(prompt,retrieved_documents,k)
    formatted_prompt = formatted_prompt[:2000] # to avoid GPU OOM
    messages = [{"role":"system","content":SYS_PROMPT},{"role":"user","content":formatted_prompt}]
    # tell the model to generate
    input_ids = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt"
    ).to(model.device)
    # Generate exactly once, in a worker thread, so tokens can be yielded as
    # they arrive. (A second, blocking generate() call whose result was
    # discarded used to run here first, doubling latency and GPU time.)
    streamer = TextIteratorStreamer(
            tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
        )
    generate_kwargs = dict(
        input_ids= input_ids,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        temperature=0.75,
        eos_token_id=terminators,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Yield the full text so far on every chunk, not just the new piece.
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)


# Markdown heading passed as the chat interface title.
TITLE = "# RAG"

# Markdown text passed as the chat interface description; lists the resources
# the app is built from.
DESCRIPTION = """
A rag pipeline with a chatbot feature

Resources used to build this project :

* embedding model : https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
* dataset : https://huggingface.co/datasets/not-lain/wikipedia
* faiss docs : https://huggingface.co/docs/datasets/v2.18.0/en/package_reference/main_classes#datasets.Dataset.add_faiss_index 
* chatbot : https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct
* Full documentation : https://huggingface.co/blog/not-lain/rag-chatbot-using-llama3 
"""


# Chat window widget: bubble layout with label, copy and share buttons enabled.
chat_window = gr.Chatbot(
    layout="bubble",
    bubble_full_width=False,
    show_label=True,
    show_copy_button=True,
    show_share_button=True,
)

# Chat UI driven by the `talk` generator, with the title, description and one
# example prompt defined for this app.
demo = gr.ChatInterface(
    fn=talk,
    chatbot=chat_window,
    title=TITLE,
    description=DESCRIPTION,
    examples=["what's anarchy ? "],
    type="tuples",
    theme="Soft",
    autofocus=False,
    autoscroll=False,
)

# Start the web app as soon as the script runs.
demo.launch(debug=True)