arjunanand13 committed on
Commit
1a8e9fc
1 Parent(s): 0501f59

Update app.py

Files changed (1)
  1. app.py +44 -42
app.py CHANGED
@@ -30,49 +30,51 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 model_id = 'meta-llama/Meta-Llama-3-8B'
 device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
 
-# set quantization configuration to load large model with less GPU memory
-# this requires the `bitsandbytes` library
-# bnb_config = transformers.BitsAndBytesConfig(
-# load_in_4bit=True,
-# bnb_4bit_quant_type='nf4',
-# bnb_4bit_use_double_quant=True,
-# bnb_4bit_compute_dtype=bfloat16
-# )
-
-# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct",token=HF_TOKEN)
-# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto",token=HF_TOKEN) # to("cuda:0")
-# terminators = [
-# tokenizer.eos_token_id,
-# tokenizer.convert_tokens_to_ids("<|eot_id|>")
-# ]
-
-
-model_config = transformers.AutoConfig.from_pretrained(
-    model_id,
-    token=HF_TOKEN,
-    # use_auth_token=hf_auth
-)
-model = transformers.AutoModelForCausalLM.from_pretrained(
-    model_id,
-    trust_remote_code=True,
-    config=model_config,
-    # quantization_config=bnb_config,
-    token=HF_TOKEN,
-    # use_auth_token=hf_auth
-)
-model.eval()
-tokenizer = transformers.AutoTokenizer.from_pretrained(
-    model_id,
-    token=HF_TOKEN,
-    # use_auth_token=hf_auth
-)
-generate_text = transformers.pipeline(
-    model=self.model, tokenizer=self.tokenizer,
-    return_full_text=True,
-    task='text-generation',
-    temperature=0.01,
-    max_new_tokens=512
-)
+
+"""set quantization configuration to load large model with less GPU memory
+this requires the `bitsandbytes` library"""
+bnb_config = transformers.BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type='nf4',
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=bfloat16
+)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct",token=HF_TOKEN)
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto",token=HF_TOKEN) # to("cuda:0")
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>")
+]
+
+"""CPU"""
+
+# model_config = transformers.AutoConfig.from_pretrained(
+# model_id,
+# token=HF_TOKEN,
+# # use_auth_token=hf_auth
+# )
+# model = transformers.AutoModelForCausalLM.from_pretrained(
+# model_id,
+# trust_remote_code=True,
+# config=model_config,
+# # quantization_config=bnb_config,
+# token=HF_TOKEN,
+# # use_auth_token=hf_auth
+# )
+# model.eval()
+# tokenizer = transformers.AutoTokenizer.from_pretrained(
+# model_id,
+# token=HF_TOKEN,
+# # use_auth_token=hf_auth
+# )
+# generate_text = transformers.pipeline(
+# model=self.model, tokenizer=self.tokenizer,
+# return_full_text=True,
+# task='text-generation',
+# temperature=0.01,
+# max_new_tokens=512
+# )
 
 """
 Setting up the stop list to define stopping criteria.
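
For reference (not part of this commit), below is a minimal, self-contained sketch of how the pieces enabled in this change, the 4-bit bnb_config and the terminators list, would typically be wired together for generation. The quantization_config=bnb_config argument and the model.generate call are illustrative assumptions on my part; the committed code builds the config but loads the model with device_map="auto" only and does not show a generation call for terminators.

import os
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

HF_TOKEN = os.environ.get("HF_TOKEN", None)
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit NF4 quantization config, mirroring the block enabled in this commit
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,  # assumption: the committed call does not pass this
    token=HF_TOKEN,
)

# Llama 3 Instruct can end a turn with either the eos token or <|eot_id|>
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

inputs = tokenizer("What is NF4 quantization?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128, eos_token_id=terminators)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))

The direct model.generate call is only there to show how the terminators list is meant to be consumed; app.py itself routes generation through transformers.pipeline (commented out in this revision) and the stopping-criteria setup that follows.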