Vladislav Sokolovskii committed
Commit: cb07a8a
Parent(s): 2420712

Update the handler

Files changed:
- handler.py (+6, -40)
- requirements.txt (+6, -11)
handler.py CHANGED

@@ -3,22 +3,19 @@ from typing import Dict, List, Any
 from unsloth import FastLanguageModel
 from unsloth.chat_templates import get_chat_template
 import torch
+from huggingface_hub import login
+import os
 
 class EndpointHandler:
-    def __init__(self, path="
-    #
-
-
-        # Define the relative path to the LoRA adapter
-        lora_path = os.path.join(current_dir, "llama3.1-70b-4bit-of-v1-lora")
-
+    def __init__(self, path=""):
+        # access_token = os.environ["HUGGINGFACE_TOKEN"]
+        # login(token=access_token)
         # Load the model and tokenizer
         self.model, self.tokenizer = FastLanguageModel.from_pretrained(
-            model_name =
+            model_name = path, # Use the current directory path
            max_seq_length = 2048,
            dtype = None,
            load_in_4bit = True,
-            # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
        )
        FastLanguageModel.for_inference(self.model)
 
@@ -72,34 +69,3 @@ class EndpointHandler:
         last_response = response_lines[-1] if response_lines else ""
 
         return [last_response]
-
-
-# if __name__ == "__main__":
-#     handler = EndpointHandler()
-
-#     print("Chat with the model. Type 'quit' to exit.")
-
-#     system_message = input("Enter system message (optional): ")
-#     history = []
-
-#     while True:
-#         user_input = input("You: ")
-#         if user_input.lower() == 'quit':
-#             break
-
-#         data = {
-#             "inputs": history + [{"role": "user", "content": user_input}],
-#             "parameters": {
-#                 "system_message": system_message,
-#                 "max_new_tokens": 512,
-#                 "temperature": 0.2,
-#                 "top_p": 0.5
-#             }
-#         }
-
-#         response = handler(data)[0]
-#         print(f"Model: {response}")
-
-#         history.append({"role": "user", "content": user_input})
-#         history.append({"role": "assistant", "content": response})
-
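The commit also drops the commented-out local chat loop at the bottom of handler.py. For a quick local smoke test of the updated handler, a minimal sketch along the same lines is shown below; the payload shape mirrors the removed example, while importing EndpointHandler from handler.py and pointing path at the current directory are assumptions, not part of this commit.

    # Minimal local smoke test for the updated handler (a sketch, not part
    # of this commit). Assumes handler.py and the model weights live in the
    # current directory, so path="." stands in for the endpoint's model path.
    from handler import EndpointHandler

    handler = EndpointHandler(path=".")

    # Payload shape taken from the commented-out example removed above:
    # chat history under "inputs", generation settings under "parameters".
    data = {
        "inputs": [{"role": "user", "content": "Hello, who are you?"}],
        "parameters": {
            "system_message": "You are a helpful assistant.",
            "max_new_tokens": 512,
            "temperature": 0.2,
            "top_p": 0.5,
        },
    }

    # The handler returns a single-element list, as "return [last_response]"
    # in the visible code shows.
    response = handler(data)[0]
    print(f"Model: {response}")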
requirements.txt CHANGED

@@ -1,13 +1,8 @@
+torchvision
 xformers<0.0.27
-
-
-torchvision==0.17.0
-transformers==4.42.3
+trl==0.8.6
+transformers==4.44.2
 bitsandbytes==0.43.3
-
-
-
-git+https://github.com/unslothai/unsloth.git@933d9fe2cb2459f949ee2250e90a5b610d277eab
-
-# Note: Install with --no-deps flag for xformers and trl
-# pip install --no-deps "xformers<0.0.27" "trl<0.9.0"
+peft==0.12.0
+accelerate>=0.34.2
+git+https://github.com/unslothai/unsloth.git
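The old requirements.txt carried a note about installing xformers and trl with --no-deps; that note is gone, so it may be worth verifying that the environment actually matches the new pins. A small runtime check is sketched below; the package list is copied from the updated file, everything else is an assumption.

    # Quick environment check against the pins in the updated
    # requirements.txt (a sketch, not part of this commit).
    from importlib.metadata import version, PackageNotFoundError

    pinned = {
        "transformers": "4.44.2",
        "trl": "0.8.6",
        "bitsandbytes": "0.43.3",
        "peft": "0.12.0",
    }

    for package, expected in pinned.items():
        try:
            installed = version(package)
        except PackageNotFoundError:
            print(f"{package}: NOT INSTALLED (expected {expected})")
            continue
        status = "ok" if installed == expected else f"MISMATCH (expected {expected})"
        print(f"{package} {installed}: {status}")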