arjunanand13 committed on
Commit a595339
1 Parent(s): 3e51289

Update app.py

Files changed (1)
  1. app.py +13 -27
app.py CHANGED
@@ -30,33 +30,19 @@ device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

  # set quantization configuration to load large model with less GPU memory
  # this requires the `bitsandbytes` library
- bnb_config = transformers.BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_quant_type='nf4',
-     bnb_4bit_use_double_quant=True,
-     bnb_4bit_compute_dtype=bfloat16
- )
-
- model_config = transformers.AutoConfig.from_pretrained(
-     model_id,
-     token=HF_TOKEN,
- )
-
- model = transformers.AutoModelForCausalLM.from_pretrained(
-     model_id,
-     trust_remote_code=True,
-     config=model_config,
-     quantization_config=bnb_config,
-     device_map='auto',
- )
-
- # enable evaluation mode to allow model inference
- model.eval()
- print(f"Model loaded on {device}")
-
- tokenizer = transformers.AutoTokenizer.from_pretrained(
-     model_id,
- )
+ # bnb_config = transformers.BitsAndBytesConfig(
+ #     load_in_4bit=True,
+ #     bnb_4bit_quant_type='nf4',
+ #     bnb_4bit_use_double_quant=True,
+ #     bnb_4bit_compute_dtype=bfloat16
+ # )
+
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
+ model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto")  # to("cuda:0")
+ terminators = [
+     tokenizer.eos_token_id,
+     tokenizer.convert_tokens_to_ids("<|eot_id|>")
+ ]

  """
  Setting up the stop list to define stopping criteria.
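
Note (not part of this commit): the new code loads Meta-Llama-3-8B-Instruct without quantization. If GPU memory becomes a constraint again, the commented-out 4-bit configuration can be combined with the new model id. A minimal sketch, assuming `torch`, `transformers`, and `bitsandbytes` are installed:

import torch
import transformers

# 4-bit NF4 quantization, as in the configuration this commit comments out
# (hypothetical combination with the new model id, not part of the commit).
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    quantization_config=bnb_config,  # requires the `bitsandbytes` library
    device_map='auto',
)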
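
Also for context: the `terminators` list added here is needed because the Llama 3 instruct model ends each assistant turn with the `<|eot_id|>` token rather than only the tokenizer's default EOS token, so both ids have to be treated as stop tokens. A minimal usage sketch, assuming the `tokenizer`, `model`, and `terminators` defined in the new code (the prompt is illustrative):

# Build a Llama 3 chat prompt and stop generation on either terminator.
messages = [
    {"role": "user", "content": "Summarise what this Space does."},
]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,  # stops on both eos_token_id and <|eot_id|>
)
print(tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True))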