Spaces:

emeses
/

lab2

Sleeping

emeses committed on Dec 5, 2024

Commit

9744f58

1 Parent(s): 25da20e

Update space

Files changed (2) hide show

README.md CHANGED Viewed

@@ -7,4 +7,4 @@ sdk: gradio
 sdk_version: 4.19.2
 app_file: app.py
 pinned: false
----

 sdk_version: 4.19.2
 app_file: app.py
 pinned: false
+gpu: true

app.py CHANGED Viewed

@@ -2,7 +2,8 @@ import gradio as gr
 from huggingface_hub import InferenceClient
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
-import torch
 device = "cuda" if torch.cuda.is_available() else "cpu"
 base_model = AutoModelForCausalLM.from_pretrained(
@@ -11,8 +12,21 @@ base_model = AutoModelForCausalLM.from_pretrained(
     torch_dtype=torch.float16
 )
 # Load model and tokenizer
-base_model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.2-3B-Instruct-bnb-4bit")
 model = PeftModel.from_pretrained(base_model, "emeses/lab2_model")
 tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B-Instruct-bnb-4bit")

 from huggingface_hub import InferenceClient
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
+import torch
+from transformers import BitsAndBytesConfig
 device = "cuda" if torch.cuda.is_available() else "cpu"
 base_model = AutoModelForCausalLM.from_pretrained(
     torch_dtype=torch.float16
 )
+# Configure quantization
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+)
 # Load model and tokenizer
+base_model = AutoModelForCausalLM.from_pretrained(
+    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
+    device_map="auto",
+    torch_dtype=torch.float16,
+    quantization_config=bnb_config
+)
 model = PeftModel.from_pretrained(base_model, "emeses/lab2_model")
 tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B-Instruct-bnb-4bit")