arjunanand13 committed on
Commit a595339
1 Parent(s): 3e51289

Update app.py

Files changed (1)
  1. app.py +13 -27
app.py CHANGED
@@ -30,33 +30,19 @@ device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

  # set quantization configuration to load large model with less GPU memory
  # this requires the `bitsandbytes` library
- bnb_config = transformers.BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_quant_type='nf4',
-     bnb_4bit_use_double_quant=True,
-     bnb_4bit_compute_dtype=bfloat16
- )
-
- model_config = transformers.AutoConfig.from_pretrained(
-     model_id,
-     token=HF_TOKEN,
- )
-
- model = transformers.AutoModelForCausalLM.from_pretrained(
-     model_id,
-     trust_remote_code=True,
-     config=model_config,
-     quantization_config=bnb_config,
-     device_map='auto',
- )
-
- # enable evaluation mode to allow model inference
- model.eval()
- print(f"Model loaded on {device}")
-
- tokenizer = transformers.AutoTokenizer.from_pretrained(
-     model_id,
- )
+ # bnb_config = transformers.BitsAndBytesConfig(
+ #     load_in_4bit=True,
+ #     bnb_4bit_quant_type='nf4',
+ #     bnb_4bit_use_double_quant=True,
+ #     bnb_4bit_compute_dtype=bfloat16
+ # )
+
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
+ model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto")  # to("cuda:0")
+ terminators = [
+     tokenizer.eos_token_id,
+     tokenizer.convert_tokens_to_ids("<|eot_id|>")
+ ]

  """
  Setting up the stop list to define stopping criteria.
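
Note (not part of this commit): the new code loads Meta-Llama-3-8B-Instruct without quantization. If GPU memory becomes a constraint again, the commented-out 4-bit configuration can be combined with the new model id. A minimal sketch, assuming `torch`, `transformers`, and `bitsandbytes` are installed:

import torch
import transformers

# 4-bit NF4 quantization, as in the configuration this commit comments out
# (hypothetical combination with the new model id, not part of the commit).
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    quantization_config=bnb_config,  # requires the `bitsandbytes` library
    device_map='auto',
)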
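
Also for context: the `terminators` list added here is needed because the Llama 3 instruct model ends each assistant turn with the `<|eot_id|>` token rather than only the tokenizer's default EOS token, so both ids have to be treated as stop tokens. A minimal usage sketch, assuming the `tokenizer`, `model`, and `terminators` defined in the new code (the prompt is illustrative):

# Build a Llama 3 chat prompt and stop generation on either terminator.
messages = [
    {"role": "user", "content": "Summarise what this Space does."},
]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,  # stops on both eos_token_id and <|eot_id|>
)
print(tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True))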