charanhu committed on
Commit 7df47b5 • 1 Parent(s): 01951c6

quantization_config

Files changed (1)
  1. app.py +11 -2
app.py CHANGED
@@ -1,9 +1,18 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
 
-tokenizer = AutoTokenizer.from_pretrained("TheBloke/SOLAR-10.7B-Instruct-v1.0-uncensored-GPTQ")
+tokenizer = AutoTokenizer.from_pretrained("TheBloke/SOLAR-10.7B-Instruct-v1.0-uncensored-GPTQ", )
 model = AutoModelForCausalLM.from_pretrained("TheBloke/SOLAR-10.7B-Instruct-v1.0-uncensored-GPTQ")
 
+# Add the quantization config with disable_exllama=True
+quantization_config = {
+    'disable_exllama': True,
+}
+
+model = torch.quantization.quantize_dynamic(
+    model, quantization_config=quantization_config,
+)
+
 def generate_response(prompt):
     conversation = [{'role': 'user', 'content': prompt}]
     prompt = tokenizer.apply_chat_template(conversation, tokenizer=False, add_generation_prompt=True)
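
As committed, the added block cannot run: torch is never imported, torch.quantization.quantize_dynamic has no quantization_config parameter (it takes a qconfig_spec mapping and a dtype), and disable_exllama is a GPTQ loading option in transformers, not a torch dynamic-quantization flag; the newly imported AutoConfig also goes unused. A minimal sketch of what the commit message appears to be after, assuming transformers' GPTQConfig (which requires the optimum and auto-gptq backends, plus accelerate for device_map="auto"):

# A sketch, not the committed code: load the GPTQ checkpoint with the
# exllama kernels disabled via transformers' GPTQConfig.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "TheBloke/SOLAR-10.7B-Instruct-v1.0-uncensored-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    # Older transformers releases accept disable_exllama=True; newer ones
    # deprecate it in favor of use_exllama=False.
    quantization_config=GPTQConfig(bits=4, disable_exllama=True),
)

Disabling exllama is the usual workaround when the exllama kernels cannot run in the target environment. Separately, the unchanged generate_response lines pass tokenizer=False to apply_chat_template; the parameter is tokenize=False.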