Ravi theja K committed on
Commit
c2646ca
1 Parent(s): 60dca91

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -0
app.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Demo script: 8-bit quantized inference with Snowflake Arctic instruct.

Downloads the Snowflake/snowflake-arctic-instruct checkpoint, loads it
sharded across accelerators with DeepSpeed 8-bit weight quantization,
and generates an answer to a small algebra prompt.
"""

import os

# Enable hf_transfer for faster checkpoint download.
# Must be set before any huggingface_hub download is triggered.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepspeed.linear.config import QuantizationConfig

tokenizer = AutoTokenizer.from_pretrained(
    "Snowflake/snowflake-arctic-instruct",
    trust_remote_code=True,
)

# 8-bit weight quantization via DeepSpeed to shrink the memory footprint.
quant_config = QuantizationConfig(q_bits=8)

model = AutoModelForCausalLM.from_pretrained(
    "Snowflake/snowflake-arctic-instruct",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map="auto",                            # let accelerate place shards
    ds_quantization_config=quant_config,
    max_memory={i: "150GiB" for i in range(8)},   # per-device budget, 8 devices
    torch_dtype=torch.bfloat16,
)

content = "5x + 35 = 7x - 60 + 10. Solve for x"
messages = [{"role": "user", "content": content}]

# Fix: move inputs to the model's entry device instead of hard-coding "cuda".
# With device_map="auto" the model may be sharded/offloaded, and a literal
# "cuda" string breaks on non-default device placements.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(input_ids=input_ids, max_new_tokens=256)
print(tokenizer.decode(outputs[0]))