Spestly committed
Commit 992813f · verified · 1 parent: 03a4543

Update app.py

Files changed (1):
  1. app.py +49 -64
app.py CHANGED
@@ -2,83 +2,75 @@ import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import gc
-from threading import Lock
+
+# Global model and tokenizer
+model = None
+tokenizer = None
 
 def load_model():
-    """Load and optimize the model for CPU usage"""
+    global model, tokenizer
    model_name = "Spestly/Athena-1-1.5B"
 
-    # Load model with CPU optimizations
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         low_cpu_mem_usage=True,
-        torch_dtype=torch.float32,  # Use float32 for CPU
+        torch_dtype=torch.float32,
         device_map="cpu"
     )
-
-    # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-    # Enable memory efficient settings
-    model.eval()  # Set to evaluation mode
-
-    return model, tokenizer
-
-# Global lock for thread safety
-generation_lock = Lock()
+    model.eval()
 
 def generate_response(input_text, history):
-    """Generate a response with CPU-optimized settings"""
+    global model, tokenizer
+
+    # Load model if not loaded
+    if model is None or tokenizer is None:
+        load_model()
+
     try:
-        with generation_lock:  # Ensure thread safety
-            instruction = "You are an AI assistant. Respond to the following instruction clearly and concisely:\n"
-            formatted_input = instruction + input_text
-
-            # Generate with optimized settings for CPU
-            inputs = tokenizer(
-                formatted_input,
-                return_tensors="pt",
-                truncation=True,
-                max_length=256  # Reduced for CPU memory constraints
-            )
-
-            with torch.no_grad():  # Memory efficiency
-                outputs = model.generate(
-                    **inputs,
-                    max_new_tokens=100,  # Reduced for CPU
-                    do_sample=True,
-                    top_k=50,
-                    top_p=0.9,
-                    temperature=0.7,
-                    pad_token_id=tokenizer.pad_token_id,
-                    eos_token_id=tokenizer.eos_token_id,
-                    use_cache=True,
-                    repetition_penalty=1.2,
-                    num_beams=1  # Disable beam search for speed
-                )
-
-            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-            # Manual garbage collection
-            gc.collect()
-
-            # Update history
-            history = history + f"\nUser: {input_text}\nAthena: {response}\n"
-            return response, history
+        instruction = "You are an AI assistant. Respond to the following instruction clearly and concisely:\n"
+        formatted_input = instruction + input_text
+
+        inputs = tokenizer(
+            formatted_input,
+            return_tensors="pt",
+            truncation=True,
+            max_length=256
+        )
+
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=100,
+                do_sample=True,
+                top_k=50,
+                top_p=0.9,
+                temperature=0.7,
+                pad_token_id=tokenizer.pad_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                use_cache=True,
+                repetition_penalty=1.2,
+                num_beams=1
+            )
+
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        # Manual garbage collection
+        gc.collect()
+
+        # Update history
+        history = history + f"\nUser: {input_text}\nAthena: {response}\n"
+        return response, history
 
     except Exception as e:
         return f"Error: {str(e)}", history
 
-# Load the model and tokenizer globally
-print("Loading model...")
-model, tokenizer = load_model()
-print("Model loaded successfully!")
+print("Starting app...")
 
-# Define the Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("""
     # Athena-1.1.5B Chat
-    Small, Smart, Super.
+    Smaller and Smarter.
     """)
 
     # Initialize state and components
@@ -128,12 +120,5 @@ with gr.Blocks() as demo:
         outputs=[output_text, state]
     )
 
-    # Launch with CPU-optimized settings
-    demo.launch(
-        server_name="0.0.0.0",
-        share=False,
-        show_error=True,
-        enable_queue=True,
-        max_threads=2,  # Match your vCPU count
-        server_port=7860
-    )
+if __name__ == "__main__":
+    demo.launch()
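
In short, the commit swaps eager, import-time model loading (guarded by a threading.Lock) for lazy initialization: model and tokenizer become module-level globals that start as None and are populated by load_model() on the first chat request, so the Space's UI can come up before the weights are downloaded. Below is a condensed, self-contained sketch of the resulting pattern; the helper name generate and its prompt-only signature are simplifications for illustration, not part of the actual app.py:

import gc

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Module-level cache, as in the new app.py: nothing is downloaded at
# import time, so the app starts serving before the weights arrive.
model = None
tokenizer = None

def load_model():
    global model, tokenizer
    model_name = "Spestly/Athena-1-1.5B"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float32,  # safe dtype for CPU inference
        device_map="cpu",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.eval()  # inference only

def generate(prompt: str) -> str:
    # Lazy initialization: the first caller pays the load cost once;
    # every later call reuses the cached globals.
    if model is None or tokenizer is None:
        load_model()
    inputs = tokenizer(prompt, return_tensors="pt",
                       truncation=True, max_length=256)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
        )
    gc.collect()  # mirror the app's manual GC to cap CPU memory use
    return tokenizer.decode(outputs[0], skip_special_tokens=True)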
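The launch() change is likely load-bearing as well: recent Gradio releases no longer accept enable_queue as a launch() argument (queueing is instead configured via demo.queue() before launching), so trimming the call to a bare demo.launch() sidesteps that, and the if __name__ == "__main__" guard is the conventional entry point for a Spaces app.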