Abhinav Kulkarni committed on
Commit
2b601a6
1 Parent(s): 9410a1e

Updated README

Browse files
Files changed (1) hide show
  1. README.md +5 -0
README.md CHANGED
@@ -43,6 +43,7 @@ git clone https://github.com/mit-han-lab/llm-awq \
43
  ```
44
 
45
  ```python
 
46
  import torch
47
  from awq.quantize.quantizer import real_quantize_model_weight
48
  from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
@@ -84,6 +85,7 @@ prompt = f'''What is the difference between nuclear fusion and fission?
84
  ###Response:'''
85
 
86
  input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
 
87
  output = model.generate(
88
  inputs=input_ids,
89
  temperature=0.7,
@@ -93,6 +95,9 @@ output = model.generate(
93
  repetition_penalty=1.1,
94
  eos_token_id=tokenizer.eos_token_id,
95
  streamer=streamer)
 
 
 
96
  ```
97
 
98
  ## Evaluation
 
43
  ```
44
 
45
  ```python
46
+ import time
47
  import torch
48
  from awq.quantize.quantizer import real_quantize_model_weight
49
  from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
 
85
  ###Response:'''
86
 
87
  input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
88
+ t1 = time.time()
89
  output = model.generate(
90
  inputs=input_ids,
91
  temperature=0.7,
 
95
  repetition_penalty=1.1,
96
  eos_token_id=tokenizer.eos_token_id,
97
  streamer=streamer)
98
+ t2 = time.time()
99
+ print("*"*80)
100
+ print(f"Generated {num_tokens/(t2-t1):.2f} token/s; {(t2-t1)*1000/num_tokens:.2f} ms/token")
101
  ```
102
 
103
  ## Evaluation