ShravanHN committed on
Commit 557609c · 1 Parent(s): ce5d73d

added rag implementation for the model and specified a sys prompt

Files changed (2)
  1. app.py +42 -34
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,8 +1,8 @@
+ import spaces
  import gradio as gr
  import os
- import spaces
- from transformers import GemmaTokenizer, AutoModelForCausalLM
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
+ import torch
  from threading import Thread
  
  # Set an environment variable
@@ -37,33 +37,52 @@ h1 {
  }
  
  """
+ # Load the tokenizer and model with quantization
+ model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+ 
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="auto",
+     quantization_config=bnb_config,
+     torch_dtype=torch.bfloat16
+ )
+ model.generation_config.pad_token_id = tokenizer.pad_token_id
  
- # Load the tokenizer and model
- tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
- model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto") # to("cuda:0")
  terminators = [
      tokenizer.eos_token_id,
      tokenizer.convert_tokens_to_ids("<|eot_id|>")
  ]
  
+ SYS_PROMPT = """
+ Extract all relevant keywords and add quantity from the following text and format the result in nested JSON, ignoring personal details and focusing only on the scope of work as shown in the example:
+ Good JSON example: {'lobby': {'frcm': {'replace': {'carpet': 1, 'carpet_pad': 1, 'base': 1, 'window_treatments': 1, 'artwork_and_decorative_accessories': 1, 'portable_lighting': 1, 'upholstered_furniture_and_decorative_pillows': 1, 'millwork': 1} } } }
+ Bad JSON example: {'lobby': { 'frcm': { 'replace': [ 'carpet', 'carpet_pad', 'base', 'window_treatments', 'artwork_and_decorative_accessories', 'portable_lighting', 'upholstered_furniture_and_decorative_pillows', 'millwork'] } } }
+ Make sure to fetch details from the provided text and ignore unnecessary information. The response should be in JSON format only, without any additional comments.
+ """
+ 
  @spaces.GPU(duration=120)
- def chat_llama3_8b(message: str,
-                    history: list,
-                    temperature: float,
-                    max_new_tokens: int
-                    ) -> str:
+ def chat_llama3_8b(message: str, history: list, temperature: float, max_new_tokens: int):
      """
      Generate a streaming response using the llama3-8b model.
+ 
      Args:
          message (str): The input message.
          history (list): The conversation history used by ChatInterface.
          temperature (float): The temperature for generating the response.
          max_new_tokens (int): The maximum number of new tokens to generate.
+ 
      Returns:
          str: The generated response.
      """
-     conversation = []
-     message+= "Extract all relevant keywords and add quantity from the following text and format the result in nested JSON, ignoring personal details and focusing only on the scope of work as shown in the example: {'lobby': {'frcm': {'replace': {'carpet': 1, 'carpet_pad': 1, 'base': 1, 'window_treatments': 1, 'artwork_and_decorative_accessories': 1, 'portable_lighting': 1, 'upholstered_furniture_and_decorative_pillows': 1, 'millwork': 1} } } }"
+     conversation = [{"role": "system", "content": SYS_PROMPT}]
+ 
      for user, assistant in history:
          conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
      conversation.append({"role": "user", "content": message})
@@ -71,14 +90,15 @@ def chat_llama3_8b(message: str,
      input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
  
      streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
  
      generate_kwargs = dict(
-         input_ids= input_ids,
+         input_ids=input_ids,
          streamer=streamer,
          max_new_tokens=max_new_tokens,
          do_sample=True,
          temperature=temperature,
          eos_token_id=terminators,
+         pad_token_id=tokenizer.eos_token_id
      )
      if temperature == 0:
          generate_kwargs['do_sample'] = False
@@ -89,39 +109,27 @@ def chat_llama3_8b(message: str,
      outputs = []
      for text in streamer:
          outputs.append(text)
-         #print(outputs)
          yield "".join(outputs)
  
  
  # Gradio block
- chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
+ chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
  
  with gr.Blocks(fill_height=True, css=css) as demo:
- 
      gr.Markdown(DESCRIPTION)
+ 
      gr.ChatInterface(
          fn=chat_llama3_8b,
          chatbot=chatbot,
          fill_height=True,
          additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
          additional_inputs=[
-             gr.Slider(minimum=0,
-                       maximum=1,
-                       step=0.1,
-                       value=0.95,
-                       label="Temperature",
-                       render=False),
-             gr.Slider(minimum=128,
-                       maximum=9012,
-                       step=1,
-                       value=512,
-                       label="Max new tokens",
-                       render=False ),
-         ],
- 
-     )
+             gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature", render=False),
+             gr.Slider(minimum=128, maximum=9012, step=1, value=512, label="Max new tokens", render=False),
+         ]
+     )
  
      gr.Markdown(LICENSE)
  
  if __name__ == "__main__":
-     demo.launch(show_error=True)
+     demo.launch(show_error=True, debug=True)
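
Note: the hunks above never touch the line that actually launches generation, so it does not appear in the diff. For orientation, a minimal sketch of the thread-plus-streamer pattern `chat_llama3_8b` relies on is shown below; `model` and the names in `generate_kwargs` come from the diff, while the `Thread` wiring and the `stream_reply` helper are assumptions for illustration, not the committed code.

    from threading import Thread

    def stream_reply(generate_kwargs, streamer):
        # Hypothetical helper: model.generate runs on a background thread while
        # TextIteratorStreamer yields decoded text chunks on the caller's side.
        Thread(target=model.generate, kwargs=generate_kwargs).start()
        outputs = []
        for text in streamer:          # blocks until the next chunk is ready
            outputs.append(text)
            yield "".join(outputs)     # Gradio re-renders the growing reply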
requirements.txt CHANGED
@@ -1,3 +1,4 @@
  accelerate
  transformers
- SentencePiece
+ SentencePiece
+ bitsandbytes
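
`bitsandbytes` is the runtime dependency behind the new `BitsAndBytesConfig(load_in_4bit=True, ...)` call in app.py. A quick, purely illustrative sanity check that the quantization stack is importable on the target hardware (4-bit loading needs a CUDA device) could look like this:

    import torch
    import bitsandbytes as bnb
    from transformers import BitsAndBytesConfig  # added import used by app.py

    # If this prints a version and CUDA is available, the 4-bit load should work.
    print("bitsandbytes", bnb.__version__, "| CUDA available:", torch.cuda.is_available())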