Futuresony committed: Update app.py
app.py
CHANGED
@@ -7,19 +7,32 @@ from peft import PeftModel # For loading adapter files
 BASE_MODEL_PATH = "unsloth/Llama-3.2-3B-Instruct"  # Replace with your base model path
 ADAPTER_PATH = "Futuresony/future_ai_12_10_2024.gguf/adapter"  # Your Hugging Face repo
 
+# Function to clean rope_scaling in model config
+def clean_rope_scaling(config):
+    if "rope_scaling" in config:
+        valid_rope_scaling = {"type": "linear", "factor": config["rope_scaling"].get("factor", 1.0)}
+        config["rope_scaling"] = valid_rope_scaling
+    return config
+
 # Load base model and tokenizer
 print("Loading base model and tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
-model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, torch_dtype=torch.float16, device_map="auto")
 
-# Load
+# Load and clean the model config
+config = LlamaConfig.from_pretrained(BASE_MODEL_PATH)
+clean_config = clean_rope_scaling(config.to_dict())
+
+# Load model with cleaned config
+model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, config=clean_config, torch_dtype=torch.float16, device_map="auto")
+
+# Load adapter using PEFT
 print("Loading adapter...")
 model = PeftModel.from_pretrained(model, ADAPTER_PATH)
 
 # Set model to evaluation mode
 model.eval()
 
-#
+# Function to generate responses
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -28,7 +41,6 @@ def respond(
     temperature,
     top_p,
 ):
-    # Format chat messages
     messages = [{"role": "system", "content": system_message}]
     for val in history:
         if val[0]:
@@ -38,10 +50,8 @@ def respond(
 
     messages.append({"role": "user", "content": message})
 
-    #
+    # Prepare input
    input_text = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
-
-    # Tokenize input text
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
 
    # Generate response
@@ -51,12 +61,10 @@ def respond(
        top_p=top_p,
        do_sample=True,
    )
-
    output_ids = model.generate(**inputs, generation_config=generation_config)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
-    return response.split("assistant:")[-1].strip()
-
 
+    return response.split("assistant:")[-1].strip()
 
 # Gradio Interface
 demo = gr.ChatInterface(
@@ -71,3 +79,4 @@ demo = gr.ChatInterface(
 
 if __name__ == "__main__":
     demo.launch()
+
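Reviewer note: the clean_rope_scaling helper added here works around Llama 3.x checkpoints whose rope_scaling config block carries extra keys (for example rope_type and the low/high frequency factors) that transformers releases predating Llama 3.1 support refuse to validate. One caveat: AutoModelForCausalLM.from_pretrained expects a PretrainedConfig instance for its config argument, while clean_rope_scaling(config.to_dict()) hands it a plain dict. A minimal sketch of the same idea that mutates the LlamaConfig object directly, assuming LlamaConfig is imported from transformers at the top of app.py:

import torch
from transformers import AutoModelForCausalLM, LlamaConfig

config = LlamaConfig.from_pretrained(BASE_MODEL_PATH)
if getattr(config, "rope_scaling", None):
    # Keep only the two keys that older transformers releases validate.
    config.rope_scaling = {"type": "linear", "factor": config.rope_scaling.get("factor", 1.0)}

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    config=config,  # a PretrainedConfig instance, not a dict
    torch_dtype=torch.float16,
    device_map="auto",
)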
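PeftModel.from_pretrained keeps the adapter as separate LoRA matrices on top of the frozen base weights, so every forward pass pays for the extra adapter matmuls. If the Space never needs to swap adapters at runtime, PEFT's merge_and_unload() can fold the deltas into the base model once at startup; a sketch, assuming the adapter at ADAPTER_PATH is a LoRA adapter:

model = PeftModel.from_pretrained(model, ADAPTER_PATH)
model = model.merge_and_unload()  # fold LoRA deltas into the base weights, drop the PEFT wrapper
model.eval()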
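respond() flattens the chat into plain "role: content" lines and then splits the decoded output on "assistant:", which breaks whenever the model itself emits that string. Instruct checkpoints of Llama 3 ship a chat template, so the prompt can instead be built with tokenizer.apply_chat_template and only the newly generated tokens decoded. A sketch under those assumptions, using the module-level tokenizer and model (respond_with_template and its max_new_tokens parameter are hypothetical names mirroring the original signature):

def respond_with_template(message, history, system_message, max_new_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    output_ids = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    # Decode only the tokens generated after the prompt.
    return tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)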