Spaces:

Mahavaury2
/

llama2-consent-chatbot

Runtime error

App Files Community

Mahavaury2 committed on Jan 20

Commit

2c933ed

verified ·

1 Parent(s): 2392d9d

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -26

app.py CHANGED Viewed

@@ -1,63 +1,59 @@
 import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
-# Hugging Face Hub repo id of the model and tokenizer to load
 MODEL_NAME = "mistralai/Mistral-7B-v0.1"
-# Load the tokenizer that belongs to MODEL_NAME
 print("Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_NAME,
-    trust_remote_code=True
 )
-# Load the model with 4-bit quantization requested
-# (Even though we set device_map="auto", on a free Space there's no GPU, so it stays on CPU.)
 print("Loading model in 4-bit...")
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
     torch_dtype=torch.float16,
-    device_map="auto",        # let the loader choose device placement
-    load_in_4bit=True,        # bitsandbytes quantization; NOTE(review): presumably needs a CUDA GPU - confirm it loads on a CPU-only Space
-    trust_remote_code=True    # NOTE(review): Mistral looks natively supported by transformers, so this flag is likely unnecessary - confirm
 )
 model.eval()
-def chat_mistral(prompt):
     """
-    Generate text for ``prompt`` with the module-level model and return the decoded first output sequence.
     """
-    # Tokenize the prompt into PyTorch tensors and move them to the model's device
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    # Generate with gradient tracking disabled (inference only)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=128,       # cap the generated length to avoid OOM
             temperature=0.7,
-            repetition_penalty=1.1
         )
-    # Decode the first returned sequence, dropping special tokens
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return response
-# Build the Gradio UI: one prompt textbox in, one response textbox out
 demo = gr.Interface(
-    fn=chat_mistral,
     inputs=gr.Textbox(lines=3, label="Your Prompt"),
     outputs=gr.Textbox(label="Mistral 7B Response"),
     title="Mistral 7B (4-bit) Chat",
     description=(
-        "A minimal Mistral-7B demo running on free CPU. "
-        "Inference will be slow and might run out of memory. "
-        "Use short prompts!"
-    )
 )
-# Launch the Gradio app only when this file is run as a script
 if __name__ == "__main__":
     demo.launch()

+from huggingface_hub import login
+import os
+# 1) Log in to the Hub with the HF_API_TOKEN env var so the gated Mistral repo can be downloaded (NOTE(review): os.getenv returns None if the variable is unset - confirm the Space secret exists)
+login(token=os.getenv("HF_API_TOKEN"))
 import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 MODEL_NAME = "mistralai/Mistral-7B-v0.1"
 print("Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_NAME,
+    trust_remote_code=True,           # NOTE(review): Mistral looks natively supported by transformers, so this flag is likely unnecessary - confirm
+    token=os.getenv("HF_API_TOKEN"),  # Hub access token read from the HF_API_TOKEN environment variable
 )
 print("Loading model in 4-bit...")
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
     torch_dtype=torch.float16,
+    device_map="auto",        # loader chooses placement; on a free Space (no GPU) this means CPU
+    load_in_4bit=True,        # bitsandbytes 4-bit quantization; NOTE(review): presumably needs a CUDA GPU - confirm it loads on a CPU-only Space
+    trust_remote_code=True,   # same flag as for the tokenizer above
+    token=os.getenv("HF_API_TOKEN"),  # same Hub token as for the tokenizer above
 )
 model.eval()
+def generate_text(prompt):
     """
+    Tokenize ``prompt``, generate up to 128 new tokens with the module-level Mistral 7B (4-bit) model, and return the decoded first output sequence.
+    NOTE: Inference will be very slow on CPU and might run out of memory.
     """
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
+            max_new_tokens=128,     # cap the generated length; keep small to avoid OOM
             temperature=0.7,
+            repetition_penalty=1.2,  # NOTE(review): temperature above is set without do_sample - confirm it actually takes effect
         )
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 demo = gr.Interface(
+    fn=generate_text,  # callback wired to the prompt and response textboxes below
     inputs=gr.Textbox(lines=3, label="Your Prompt"),
     outputs=gr.Textbox(label="Mistral 7B Response"),
     title="Mistral 7B (4-bit) Chat",
     description=(
+        "A minimal Mistral 7B example running on free CPU. "
+        "Very slow, may OOM with big prompts."
+    ),
 )
 if __name__ == "__main__":
     demo.launch()