mychen76 committed on
Commit
2259d9e
1 Parent(s): aa3f36b

Update README.md

Files changed (1): README.md +40 -1
README.md CHANGED
@@ -81,4 +81,43 @@ with torch.inference_mode():
  outputs = model.generate(**inputs, max_new_tokens=512)
  result_text = tokenizer.batch_decode(outputs)[0]
  print(result_text)
- ```
+ ```
+
+ # Load model in 4-bit
+ ```python
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+ # 4-bit NF4 quantization with nested (double) quantization; compute in bfloat16
+ bnb_config = BitsAndBytesConfig(
+     llm_int8_enable_fp32_cpu_offload=True,
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16,
+ )
+ # Control model memory allocation between devices for low GPU resources (0 or "cpu")
+ device_map = {
+     "transformer.word_embeddings": 0,
+     "transformer.word_embeddings_layernorm": 0,
+     "lm_head": 0,
+     "transformer.h": 0,
+     "transformer.ln_f": 0,
+     "model.embed_tokens": 0,
+     "model.layers": 0,
+     "model.norm": 0,
+ }
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Model to use for inference
+ model_id = "mychen76/mistral7b_ocr_to_json_v1"
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     trust_remote_code=True,
+     torch_dtype=torch.float16,
+     quantization_config=bnb_config,
+     device_map=device_map,
+ )
+ # Tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+ ```
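
For reference, the 4-bit model loaded above can be exercised the same way as the full-precision example earlier in this README. Below is a minimal usage sketch, assuming the snippet above has already run; the prompt string is a hypothetical placeholder, since the model's actual OCR-to-JSON prompt format is defined elsewhere in the README.

```python
# Hypothetical usage sketch for the 4-bit model loaded above.
# The prompt is a placeholder; real inputs should follow the OCR-to-JSON
# prompt format documented elsewhere in this README.
prompt = "[INST] Convert this receipt OCR text to JSON: TOTAL 12.50 [/INST]"

# Optional: check the quantized footprint (roughly a quarter of fp16).
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens=512)
result_text = tokenizer.batch_decode(outputs)[0]
print(result_text)
```

Note that every entry in the device_map above is pinned to GPU 0; because llm_int8_enable_fp32_cpu_offload=True is set, individual modules could instead be mapped to "cpu" to trade inference speed for GPU memory on smaller cards.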