ShravanHN committed on
Commit 557609c · 1 Parent(s): ce5d73d

added rag implementation for the model and specified a sys prompt

Files changed (2)
  1. app.py +42 -34
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,8 +1,8 @@
+ import spaces
  import gradio as gr
  import os
- import spaces
- from transformers import GemmaTokenizer, AutoModelForCausalLM
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
+ import torch
  from threading import Thread
  
  # Set an environment variable
@@ -37,33 +37,52 @@ h1 {
  }
  
  """
+ # Load the tokenizer and model with quantization
+ model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+ 
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="auto",
+     quantization_config=bnb_config,
+     torch_dtype=torch.bfloat16
+ )
+ model.generation_config.pad_token_id = tokenizer.pad_token_id
  
- # Load the tokenizer and model
- tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
- model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto") # to("cuda:0")
  terminators = [
      tokenizer.eos_token_id,
      tokenizer.convert_tokens_to_ids("<|eot_id|>")
  ]
  
+ SYS_PROMPT = """
+ Extract all relevant keywords and add quantity from the following text and format the result in nested JSON, ignoring personal details and focusing only on the scope of work as shown in the example:
+ Good JSON example: {'lobby': {'frcm': {'replace': {'carpet': 1, 'carpet_pad': 1, 'base': 1, 'window_treatments': 1, 'artwork_and_decorative_accessories': 1, 'portable_lighting': 1, 'upholstered_furniture_and_decorative_pillows': 1, 'millwork': 1} } } }
+ Bad JSON example: {'lobby': { 'frcm': { 'replace': [ 'carpet', 'carpet_pad', 'base', 'window_treatments', 'artwork_and_decorative_accessories', 'portable_lighting', 'upholstered_furniture_and_decorative_pillows', 'millwork'] } } }
+ Make sure to fetch details from the provided text and ignore unnecessary information. The response should be in JSON format only, without any additional comments.
+ """
+ 
  @spaces.GPU(duration=120)
- def chat_llama3_8b(message: str,
-                    history: list,
-                    temperature: float,
-                    max_new_tokens: int
-                    ) -> str:
+ def chat_llama3_8b(message: str, history: list, temperature: float, max_new_tokens: int):
      """
      Generate a streaming response using the llama3-8b model.
+ 
      Args:
          message (str): The input message.
          history (list): The conversation history used by ChatInterface.
          temperature (float): The temperature for generating the response.
          max_new_tokens (int): The maximum number of new tokens to generate.
+ 
      Returns:
          str: The generated response.
      """
-     conversation = []
-     message+= "Extract all relevant keywords and add quantity from the following text and format the result in nested JSON, ignoring personal details and focusing only on the scope of work as shown in the example: {'lobby': {'frcm': {'replace': {'carpet': 1, 'carpet_pad': 1, 'base': 1, 'window_treatments': 1, 'artwork_and_decorative_accessories': 1, 'portable_lighting': 1, 'upholstered_furniture_and_decorative_pillows': 1, 'millwork': 1} } } }"
+     conversation = [{"role": "system", "content": SYS_PROMPT}]
+ 
      for user, assistant in history:
          conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
      conversation.append({"role": "user", "content": message})
@@ -71,14 +90,15 @@ def chat_llama3_8b(message: str,
      input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
  
      streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
  
      generate_kwargs = dict(
-         input_ids= input_ids,
+         input_ids=input_ids,
          streamer=streamer,
          max_new_tokens=max_new_tokens,
          do_sample=True,
          temperature=temperature,
          eos_token_id=terminators,
+         pad_token_id=tokenizer.eos_token_id
      )
      if temperature == 0:
          generate_kwargs['do_sample'] = False
@@ -89,39 +109,27 @@ def chat_llama3_8b(message: str,
      outputs = []
      for text in streamer:
          outputs.append(text)
-         #print(outputs)
          yield "".join(outputs)
  
  
  # Gradio block
- chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
+ chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
  
  with gr.Blocks(fill_height=True, css=css) as demo:
- 
      gr.Markdown(DESCRIPTION)
+ 
      gr.ChatInterface(
          fn=chat_llama3_8b,
          chatbot=chatbot,
          fill_height=True,
          additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
          additional_inputs=[
-             gr.Slider(minimum=0,
-                       maximum=1,
-                       step=0.1,
-                       value=0.95,
-                       label="Temperature",
-                       render=False),
-             gr.Slider(minimum=128,
-                       maximum=9012,
-                       step=1,
-                       value=512,
-                       label="Max new tokens",
-                       render=False ),
-         ],
- 
-     )
+             gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature", render=False),
+             gr.Slider(minimum=128, maximum=9012, step=1, value=512, label="Max new tokens", render=False),
+         ]
+     )
  
      gr.Markdown(LICENSE)
  
  if __name__ == "__main__":
-     demo.launch(show_error=True)
+     demo.launch(show_error=True, debug=True)
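
Note: the hunks above never touch the line that actually launches generation, so it does not appear in the diff. For orientation, a minimal sketch of the thread-plus-streamer pattern `chat_llama3_8b` relies on is shown below; `model` and the names in `generate_kwargs` come from the diff, while the `Thread` wiring and the `stream_reply` helper are assumptions for illustration, not the committed code.

    from threading import Thread

    def stream_reply(generate_kwargs, streamer):
        # Hypothetical helper: model.generate runs on a background thread while
        # TextIteratorStreamer yields decoded text chunks on the caller's side.
        Thread(target=model.generate, kwargs=generate_kwargs).start()
        outputs = []
        for text in streamer:          # blocks until the next chunk is ready
            outputs.append(text)
            yield "".join(outputs)     # Gradio re-renders the growing reply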
requirements.txt CHANGED
@@ -1,3 +1,4 @@
  accelerate
  transformers
- SentencePiece
+ SentencePiece
+ bitsandbytes
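
`bitsandbytes` is the runtime dependency behind the new `BitsAndBytesConfig(load_in_4bit=True, ...)` call in app.py. A quick, purely illustrative sanity check that the quantization stack is importable on the target hardware (4-bit loading needs a CUDA device) could look like this:

    import torch
    import bitsandbytes as bnb
    from transformers import BitsAndBytesConfig  # added import used by app.py

    # If this prints a version and CUDA is available, the 4-bit load should work.
    print("bitsandbytes", bnb.__version__, "| CUDA available:", torch.cuda.is_available())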