AurelioAguirre committed · Commit f35f208 · 1 Parent(s): cfaa883

Refactored
app/__init__.py ADDED
File without changes
app/api.py ADDED
@@ -0,0 +1,286 @@
import os
from pathlib import Path
from threading import Thread
import torch
from typing import Optional, Iterator, List
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from utils.logging import setup_logger

class LLMApi:
    def __init__(self, config: dict):
        """Initialize the LLM API with configuration."""
        self.logger = setup_logger(config, "llm_api")
        self.logger.info("Initializing LLM API")

        # Set up paths
        self.base_path = Path(config["model"]["base_path"])
        self.models_path = self.base_path / config["folders"]["models"]
        self.cache_path = self.base_path / config["folders"]["cache"]

        self.model = None
        self.model_name = None
        self.tokenizer = None

        # Generation parameters from config
        gen_config = config["model"]["generation"]
        self.max_new_tokens = gen_config["max_new_tokens"]
        self.do_sample = gen_config["do_sample"]
        self.temperature = gen_config["temperature"]
        self.repetition_penalty = gen_config["repetition_penalty"]

        self.generation_config = {
            "max_new_tokens": self.max_new_tokens,
            "do_sample": self.do_sample,
            "temperature": self.temperature,
            "repetition_penalty": self.repetition_penalty,
            "eos_token_id": None,
            "pad_token_id": None
        }

        # Create necessary directories
        self.models_path.mkdir(parents=True, exist_ok=True)
        self.cache_path.mkdir(parents=True, exist_ok=True)

        # Set cache directory for transformers
        os.environ['TRANSFORMERS_CACHE'] = str(self.cache_path)

        self.logger.info("LLM API initialized successfully")

    def download_model(self, model_name: str) -> None:
        """
        Download a model and its tokenizer to the models directory.

        Args:
            model_name: The name of the model to download (e.g., "norallm/normistral-11b-warm")
        """
        self.logger.info(f"Starting download of model: {model_name}")
        try:
            model_path = self.models_path / model_name.split('/')[-1]

            # Download and save model
            model = AutoModelForCausalLM.from_pretrained(model_name)
            tokenizer = AutoTokenizer.from_pretrained(model_name)

            self.logger.info(f"Saving model to {model_path}")
            model.save_pretrained(model_path)
            tokenizer.save_pretrained(model_path)

            self.logger.info(f"Successfully downloaded model: {model_name}")
        except Exception as e:
            self.logger.error(f"Failed to download model {model_name}: {str(e)}")
            raise

    def initialize_model(self, model_name: str) -> None:
        """
        Initialize a model and tokenizer, either from local storage or by downloading.

        Args:
            model_name: The name of the model to initialize
        """
        self.logger.info(f"Initializing model: {model_name}")
        try:
            self.model_name = model_name
            local_model_path = self.models_path / model_name.split('/')[-1]

            # Check if model exists locally
            if local_model_path.exists():
                self.logger.info(f"Loading model from local path: {local_model_path}")
                model_path = local_model_path
            else:
                self.logger.info(f"Loading model from source: {model_name}")
                model_path = model_name

            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="auto",
                load_in_8bit=True,
                torch_dtype=torch.float16
            )
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)

            # Update generation config with tokenizer-specific values
            self.generation_config["eos_token_id"] = self.tokenizer.eos_token_id
            self.generation_config["pad_token_id"] = self.tokenizer.eos_token_id

            self.logger.info(f"Successfully initialized model: {model_name}")
        except Exception as e:
            self.logger.error(f"Failed to initialize model {model_name}: {str(e)}")
            raise

    def has_chat_template(self) -> bool:
        """Check if the current model has a chat template."""
        try:
            self.tokenizer.apply_chat_template(
                [{"role": "user", "content": "test"}],
                tokenize=False,
            )
            return True
        except (ValueError, AttributeError):
            return False

    def _prepare_prompt(self, prompt: str, system_message: Optional[str] = None) -> str:
        """
        Prepare the prompt text, either using the model's chat template if available,
        or falling back to a simple OpenAI-style format.
        """
        try:
            messages = []
            if system_message:
                messages.append({"role": "system", "content": system_message})
            messages.append({"role": "user", "content": prompt})

            return self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        except (ValueError, AttributeError):
            template = ""
            if system_message:
                template += f"System: {system_message}\n\n"
            template += f"User: {prompt}\n\nAssistant: "
            return template

    def generate_response(
        self,
        prompt: str,
        system_message: Optional[str] = None,
        max_new_tokens: Optional[int] = None
    ) -> str:
        """
        Generate a complete response for the given prompt.
        """
        self.logger.debug(f"Generating response for prompt: {prompt[:50]}...")

        if self.model is None:
            raise RuntimeError("Model not initialized. Call initialize_model first.")

        try:
            text = self._prepare_prompt(prompt, system_message)
            inputs = self.tokenizer([text], return_tensors="pt")

            # Remove token_type_ids if present
            model_inputs = {k: v.to(self.model.device) for k, v in inputs.items()
                            if k != 'token_type_ids'}

            generation_config = self.generation_config.copy()
            if max_new_tokens:
                generation_config["max_new_tokens"] = max_new_tokens

            generated_ids = self.model.generate(
                **model_inputs,
                **generation_config
            )

            generated_ids = [
                output_ids[len(input_ids):]
                for input_ids, output_ids in zip(model_inputs['input_ids'], generated_ids)
            ]

            response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
            self.logger.debug(f"Generated response: {response[:50]}...")
            return response

        except Exception as e:
            self.logger.error(f"Error generating response: {str(e)}")
            raise

    def generate_stream(
        self,
        prompt: str,
        system_message: Optional[str] = None,
        max_new_tokens: Optional[int] = None
    ) -> Iterator[str]:
        """
        Generate a streaming response for the given prompt.
        """
        self.logger.debug(f"Starting streaming generation for prompt: {prompt[:50]}...")

        if self.model is None:
            raise RuntimeError("Model not initialized. Call initialize_model first.")

        try:
            text = self._prepare_prompt(prompt, system_message)
            inputs = self.tokenizer([text], return_tensors="pt")

            # Remove token_type_ids if present
            model_inputs = {k: v.to(self.model.device) for k, v in inputs.items()
                            if k != 'token_type_ids'}

            # Configure generation
            generation_config = self.generation_config.copy()
            if max_new_tokens:
                generation_config["max_new_tokens"] = max_new_tokens

            # Set up streaming
            streamer = TextIteratorStreamer(self.tokenizer)
            generation_kwargs = dict(
                **model_inputs,
                **generation_config,
                streamer=streamer
            )

            # Create a thread to run the generation
            thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
            thread.start()

            # Yield the generated text in chunks
            for new_text in streamer:
                self.logger.debug(f"Generated chunk: {new_text[:50]}...")
                yield new_text

        except Exception as e:
            self.logger.error(f"Error in streaming generation: {str(e)}")
            raise

    def generate_embedding(self, text: str) -> List[float]:
        """
        Generate a single embedding vector for a chunk of text.
        Returns a list of floats representing the text embedding.
        """
        self.logger.debug(f"Generating embedding for text: {text[:50]}...")

        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Model not initialized. Call initialize_model first.")

        try:
            # Tokenize the input text and ensure input_ids are Long type
            inputs = self.tokenizer(text, return_tensors='pt')
            input_ids = inputs.input_ids.to(dtype=torch.long, device=self.model.device)

            # Get the model's dtype from its parameters for the attention mask
            model_dtype = next(self.model.parameters()).dtype

            # Create an attention mask with matching dtype
            attention_mask = torch.zeros(
                input_ids.size(0),
                1,
                input_ids.size(1),
                input_ids.size(1),
                device=input_ids.device,
                dtype=model_dtype
            )

            # Get model outputs
            with torch.no_grad():
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    output_hidden_states=True,
                    return_dict=True
                )

            # Get the last hidden state
            last_hidden_state = outputs.hidden_states[-1]

            # Average the hidden state over all tokens (excluding padding)
            embedding = last_hidden_state[0].mean(dim=0)

            # Convert to regular Python list
            embedding_list = embedding.cpu().tolist()
            self.logger.debug(f"Generated embedding of length: {len(embedding_list)}")
            return embedding_list

        except Exception as e:
            self.logger.error(f"Error generating embedding: {str(e)}")
            raise
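
For orientation, here is a minimal sketch of driving LLMApi directly, outside of FastAPI. It assumes the repository root is the working directory (so `utils` is importable and `app/config.yaml` resolves), a CUDA GPU with bitsandbytes available (the loader uses 8-bit quantization), and the default model name from the config; the real entry points are app/main.py and app/routes.py.

import yaml
from app.api import LLMApi

# Load the same YAML config that app/main.py reads (path assumed relative to repo root)
with open("app/config.yaml") as f:
    config = yaml.safe_load(f)

api = LLMApi(config)
api.initialize_model(config["model"]["defaults"]["model_name"])

# Blocking generation
print(api.generate_response("What does a tokenizer do?",
                            system_message="You are a helpful assistant."))

# Streaming generation
for chunk in api.generate_stream("What does a tokenizer do?"):
    print(chunk, end="", flush=True)
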
app/config.yaml ADDED
@@ -0,0 +1,30 @@
server:
  host: "0.0.0.0"
  port: 8000

model:
  base_path: "."
  generation:
    max_new_tokens: 256
    do_sample: true
    temperature: 0.7
    repetition_penalty: 1.1
  defaults:
    model_name: "Qwen/Qwen2.5-Coder-3B-Instruct"

folders:
  models: "models"
  cache: ".cache"
  logs: "logs"

logging:
  level: "INFO"
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file: "llm_api.log"

api:
  version: "v1"
  prefix: "/api"
  cors:
    origins: ["*"]
    credentials: true
app/env_template ADDED
@@ -0,0 +1,26 @@
# Hugging Face Authentication
HF_TOKEN=your_token_here

# CUDA Device Configuration
CUDA_VISIBLE_DEVICES=0,1  # Specify GPUs to use (e.g., 0 for first GPU, 0,1 for first two)

# Memory Management
PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
CUDA_LAUNCH_BLOCKING=1  # Set to 1 for debugging
CUDA_AUTO_BOOST=0  # Disable auto boost for consistent performance

# Cache Paths
CUDA_CACHE_PATH=/path/to/cuda/cache
TRANSFORMERS_CACHE=/path/to/transformers/cache

# Performance Settings
TF_ENABLE_ONEDNN_OPTS=1
TF_GPU_ALLOCATOR=cuda_malloc_async

# Model Settings
TRANSFORMERS_OFFLINE=0  # Set to 1 for offline mode

# Logging
LOG_LEVEL=INFO  # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL

# Add any additional environment-specific variables below
app/main.py ADDED
@@ -0,0 +1,145 @@
import yaml
import sys
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
from .api import LLMApi
from .routes import router, init_router
from utils.logging import setup_logger
from huggingface_hub import login
from pathlib import Path
from dotenv import load_dotenv
import os

def validate_hf(config):
    """
    Validate Hugging Face authentication.
    Checks for a .env file, loads environment variables, and attempts HF login if a token exists.
    """
    logger = setup_logger(config, "hf_validation")

    # Check for .env file
    env_path = Path('.env')
    if env_path.exists():
        logger.info("Found .env file, loading environment variables")
        load_dotenv()
    else:
        logger.warning("No .env file found. Fine if you're on Hugging Face, but you need one to run locally on your PC.")

    # Check for HF token
    hf_token = os.getenv('HF_TOKEN')
    if not hf_token:
        logger.error("No HF_TOKEN found in environment variables")
        return False

    try:
        # Attempt login
        login(token=hf_token)
        logger.info("Successfully authenticated with Hugging Face")
        return True
    except Exception as e:
        logger.error(f"Failed to authenticate with Hugging Face: {str(e)}")
        return False

def load_config():
    """Load configuration from yaml file"""
    with open("app/config.yaml", "r") as f:
        return yaml.safe_load(f)

def create_app():
    config = load_config()
    logger = setup_logger(config, "main")
    logger.info("Starting LLM API server")

    app = FastAPI(
        title="LLM API",
        description="API for Large Language Model operations",
        version=config["api"]["version"]
    )

    # Add CORS middleware
    app.add_middleware(
        CORSMiddleware,
        allow_origins=config["api"]["cors"]["origins"],
        allow_credentials=config["api"]["cors"]["credentials"],
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Initialize routes with config
    init_router(config)

    app.include_router(router, prefix=f"{config['api']['prefix']}/{config['api']['version']}")

    logger.info("FastAPI application created successfully")
    return app

def test_locally():
    """Run local tests for development and debugging"""
    config = load_config()
    logger = setup_logger(config, "test")
    logger.info("Starting local tests")

    api = LLMApi(config)
    model_name = config["model"]["defaults"]["model_name"]

    logger.info(f"Testing with model: {model_name}")

    # Test download
    logger.info("Testing model download...")
    api.download_model(model_name)
    logger.info("Download complete")

    # Test initialization
    logger.info("Initializing model...")
    api.initialize_model(model_name)
    logger.info("Model initialized")

    # Test embedding
    test_text = "Dette er en test av embeddings generering fra en teknisk tekst om HMS rutiner på arbeidsplassen."
    logger.info("Testing embedding generation...")
    embedding = api.generate_embedding(test_text)
    logger.info(f"Generated embedding of length: {len(embedding)}")
    logger.info(f"First few values: {embedding[:5]}")

    # Test generation
    test_prompts = [
        "Tell me what happens in a nuclear reactor.",
    ]

    # Test regular generation
    logger.info("Testing regular generation:")
    for prompt in test_prompts:
        logger.info(f"Prompt: {prompt}")
        response = api.generate_response(
            prompt=prompt,
            system_message="You are a helpful assistant."
        )
        logger.info(f"Response: {response}")

    # Test streaming generation
    logger.info("Testing streaming generation:")
    logger.info(f"Prompt: {test_prompts[0]}")
    for chunk in api.generate_stream(
        prompt=test_prompts[0],
        system_message="You are a helpful assistant."
    ):
        print(chunk, end="", flush=True)
    print("\n")

    logger.info("Local tests completed")

app = create_app()

if __name__ == "__main__":
    config = load_config()
    validate_hf(config)
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        test_locally()
    else:
        uvicorn.run(
            "app.main:app",
            host=config["server"]["host"],
            port=config["server"]["port"],
            reload=True
        )
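
Because main.py uses relative imports and builds `app` at import time, the intended launch is as a package module from the repository root (`python -m app.main`, or `python -m app.main test` for the local test path). As a hedged alternative, the app object can also be served programmatically:

# Sketch: serve the already-constructed FastAPI app without the CLI entry point.
# Assumes the repository root as working directory so app/config.yaml is found.
import uvicorn
from app.main import app

uvicorn.run(app, host="0.0.0.0", port=8000)
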
app/routes.py ADDED
@@ -0,0 +1,349 @@
from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional, List, Dict, Union
from .api import LLMApi
from utils.logging import setup_logger
from utils.helpers import get_system_info, format_memory_size
from utils.validation import validate_model_path
import psutil
from pathlib import Path

router = APIRouter()
logger = None
api = None
config = None

def init_router(config_dict: dict):
    """Initialize router with config and LLM API instance"""
    global logger, api, config
    config = config_dict
    logger = setup_logger(config, "api_routes")
    api = LLMApi(config)
    logger.info("Router initialized with LLM API instance")

class GenerateRequest(BaseModel):
    prompt: str
    system_message: Optional[str] = None
    max_new_tokens: Optional[int] = None

class EmbeddingRequest(BaseModel):
    text: str

class EmbeddingResponse(BaseModel):
    embedding: List[float]
    dimension: int

class SystemStatusResponse(BaseModel):
    """Pydantic model for system status response"""
    cpu: Optional[Dict[str, Union[float, str]]] = None
    memory: Optional[Dict[str, Union[float, str]]] = None
    gpu: Optional[Dict[str, Union[bool, str, float]]] = None
    storage: Optional[Dict[str, str]] = None
    model: Optional[Dict[str, Union[bool, str]]] = None

class ValidationResponse(BaseModel):
    # Union[bool, str] so that error messages from failed checks still validate
    config_validation: Dict[str, Union[bool, str]]
    model_validation: Dict[str, Union[bool, str]]
    folder_validation: Dict[str, Union[bool, str]]
    overall_status: str
    issues: List[str]

@router.get("/system/validate",
            response_model=ValidationResponse,
            summary="Validate System Configuration",
            description="Validates system configuration, folders, and model setup")
async def validate_system():
    """
    Validates:
    - Configuration parameters
    - Model setup
    - Folder structure
    - Required permissions
    """
    logger.info("Starting system validation")
    issues = []

    # Validate configuration
    try:
        config_status = {
            "has_required_fields": True,  # Check if all required config fields exist
            "valid_paths": True,          # Check if paths are valid
            "valid_parameters": True      # Check if parameters are within acceptable ranges
        }

        # Example validation checks
        if not api.models_path.exists():
            config_status["valid_paths"] = False
            issues.append("Models directory does not exist")

        if api.temperature < 0 or api.temperature > 2:
            config_status["valid_parameters"] = False
            issues.append("Temperature parameter out of valid range (0-2)")

    except Exception as e:
        logger.error(f"Configuration validation failed: {str(e)}")
        config_status = {"error": str(e)}
        issues.append(f"Config validation error: {str(e)}")

    # Validate model setup
    try:
        model_status = {
            "model_files_exist": False,
            "model_loadable": False,
            "tokenizer_valid": False
        }

        if api.model_name:
            model_path = api.models_path / api.model_name.split('/')[-1]
            model_status["model_files_exist"] = validate_model_path(model_path)

            if not model_status["model_files_exist"]:
                issues.append("Model files are missing or incomplete")

            model_status["model_loadable"] = api.model is not None
            model_status["tokenizer_valid"] = api.tokenizer is not None

    except Exception as e:
        logger.error(f"Model validation failed: {str(e)}")
        model_status = {"error": str(e)}
        issues.append(f"Model validation error: {str(e)}")

    # Validate folder structure and permissions
    try:
        folder_status = {
            "models_folder": api.models_path.exists(),
            "cache_folder": api.cache_path.exists(),
            "logs_folder": Path(api.base_path / "logs").exists(),
            "write_permissions": False
        }

        # Test write permissions by attempting to create a test file
        test_file = api.models_path / ".test_write"
        try:
            test_file.touch()
            test_file.unlink()
            folder_status["write_permissions"] = True
        except OSError:
            folder_status["write_permissions"] = False
            issues.append("Insufficient write permissions in models directory")

    except Exception as e:
        logger.error(f"Folder validation failed: {str(e)}")
        folder_status = {"error": str(e)}
        issues.append(f"Folder validation error: {str(e)}")

    # Determine overall status
    if not issues:
        overall_status = "valid"
    elif len(issues) < 3:
        overall_status = "warning"
    else:
        overall_status = "invalid"

    validation_response = ValidationResponse(
        config_validation=config_status,
        model_validation=model_status,
        folder_validation=folder_status,
        overall_status=overall_status,
        issues=issues
    )

    logger.info(f"System validation completed with status: {overall_status}")
    return validation_response


@router.get("/system/status",
            response_model=SystemStatusResponse,
            summary="Check System Status",
            description="Returns comprehensive system status including CPU, Memory, GPU, Storage, and Model information")
async def check_system():
    """
    Get system status including:
    - CPU usage
    - Memory usage
    - GPU availability and usage
    - Storage status for model and cache directories
    - Current model status
    """
    logger.info("Checking system status")
    status = SystemStatusResponse()
    system_info = None

    # Check CPU
    try:
        system_info = get_system_info()
        status.cpu = {
            "usage_percent": system_info["cpu_percent"],
            "status": "healthy" if system_info["cpu_percent"] < 90 else "high"
        }
        logger.debug(f"CPU status retrieved: {status.cpu}")
    except Exception as e:
        logger.error(f"Failed to get CPU info: {str(e)}")
        status.cpu = {"status": "error", "message": str(e)}

    # Check Memory
    try:
        if not system_info:
            system_info = get_system_info()
        status.memory = {
            "usage_percent": system_info["memory_percent"],
            "status": "healthy" if system_info["memory_percent"] < 90 else "critical",
            "available": format_memory_size(psutil.virtual_memory().available)
        }
        logger.debug(f"Memory status retrieved: {status.memory}")
    except Exception as e:
        logger.error(f"Failed to get memory info: {str(e)}")
        status.memory = {"status": "error", "message": str(e)}

    # Check GPU
    try:
        if not system_info:
            system_info = get_system_info()
        status.gpu = {
            "available": system_info["gpu_available"],
            "memory_used": format_memory_size(system_info["gpu_memory_used"]),
            "memory_total": format_memory_size(system_info["gpu_memory_total"]),
            "utilization_percent": system_info["gpu_memory_used"] / system_info["gpu_memory_total"] * 100 if system_info["gpu_available"] else 0
        }
        logger.debug(f"GPU status retrieved: {status.gpu}")
    except Exception as e:
        logger.error(f"Failed to get GPU info: {str(e)}")
        status.gpu = {"status": "error", "message": str(e)}

    # Check Storage
    try:
        models_path = Path(api.models_path)
        cache_path = Path(api.cache_path)
        status.storage = {
            "models_directory": str(models_path),
            "models_size": format_memory_size(sum(f.stat().st_size for f in models_path.glob('**/*') if f.is_file())),
            "cache_directory": str(cache_path),
            "cache_size": format_memory_size(sum(f.stat().st_size for f in cache_path.glob('**/*') if f.is_file()))
        }
        logger.debug(f"Storage status retrieved: {status.storage}")
    except Exception as e:
        logger.error(f"Failed to get storage info: {str(e)}")
        status.storage = {"status": "error", "message": str(e)}

    # Check Model Status
    try:
        current_model_path = api.models_path / api.model_name.split('/')[-1] if api.model_name else None
        status.model = {
            "is_loaded": api.model is not None,
            "current_model": api.model_name,
            "is_valid": validate_model_path(current_model_path) if current_model_path else False,
            "has_chat_template": api.has_chat_template() if api.model else False
        }
        logger.debug(f"Model status retrieved: {status.model}")
    except Exception as e:
        logger.error(f"Failed to get model status: {str(e)}")
        status.model = {"status": "error", "message": str(e)}

    logger.info("System status check completed")
    return status


@router.post("/generate")
async def generate_text(request: GenerateRequest):
    """Generate text response from prompt"""
    logger.info(f"Received generation request for prompt: {request.prompt[:50]}...")
    try:
        response = api.generate_response(
            prompt=request.prompt,
            system_message=request.system_message,
            max_new_tokens=request.max_new_tokens or api.max_new_tokens
        )
        logger.info("Successfully generated response")
        return {"generated_text": response}
    except Exception as e:
        logger.error(f"Error in generate_text endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/generate/stream")
async def generate_stream(request: GenerateRequest):
    """Generate streaming text response from prompt"""
    logger.info(f"Received streaming generation request for prompt: {request.prompt[:50]}...")
    try:
        # Wrap the generator in a StreamingResponse so FastAPI streams chunks
        # to the client instead of trying to serialize the generator object.
        return StreamingResponse(
            api.generate_stream(
                prompt=request.prompt,
                system_message=request.system_message,
                max_new_tokens=request.max_new_tokens or api.max_new_tokens
            ),
            media_type="text/plain"
        )
    except Exception as e:
        logger.error(f"Error in generate_stream endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/embedding", response_model=EmbeddingResponse)
async def generate_embedding(request: EmbeddingRequest):
    """Generate embedding vector from text"""
    logger.info(f"Received embedding request for text: {request.text[:50]}...")
    try:
        embedding = api.generate_embedding(request.text)
        logger.info(f"Successfully generated embedding of dimension {len(embedding)}")
        return EmbeddingResponse(
            embedding=embedding,
            dimension=len(embedding)
        )
    except Exception as e:
        logger.error(f"Error in generate_embedding endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/model/download",
             summary="Download default or specified model",
             description="Downloads model files. Uses default model from config if none specified.")
async def download_model(model_name: Optional[str] = None):
    """Download model files to local storage"""
    try:
        # Use model name from config if none provided
        model_to_download = model_name or config["model"]["defaults"]["model_name"]
        logger.info(f"Received request to download model: {model_to_download}")

        api.download_model(model_to_download)
        logger.info(f"Successfully downloaded model: {model_to_download}")

        return {
            "status": "success",
            "message": f"Model {model_to_download} downloaded",
            "model_name": model_to_download
        }
    except Exception as e:
        logger.error(f"Error downloading model: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/model/initialize",
             summary="Initialize default or specified model",
             description="Initialize model for use. Uses default model from config if none specified.")
async def initialize_model(model_name: Optional[str] = None):
    """Initialize a model for use"""
    try:
        # Use model name from config if none provided
        model_to_init = model_name or config["model"]["defaults"]["model_name"]
        logger.info(f"Received request to initialize model: {model_to_init}")

        api.initialize_model(model_to_init)
        logger.info(f"Successfully initialized model: {model_to_init}")

        return {
            "status": "success",
            "message": f"Model {model_to_init} initialized",
            "model_name": model_to_init
        }
    except Exception as e:
        logger.error(f"Error initializing model: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.get("/models/status")
async def get_model_status():
    """Get current model status"""
    try:
        status = {
            "model_loaded": api.model is not None,
            "current_model": api.model_name if api.model_name else None,
            "has_chat_template": api.has_chat_template() if api.model else False
        }
        logger.info(f"Retrieved model status: {status}")
        return status
    except Exception as e:
        logger.error(f"Error getting model status: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
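
As a usage illustration (not part of the commit), a small client sketch against these routes with the `requests` library, assuming a local server and the `/api/v1` prefix from config.yaml:

import requests

BASE = "http://localhost:8000/api/v1"  # host/port and prefix assumed from config.yaml

requests.post(f"{BASE}/model/initialize")  # load the default model from config

r = requests.post(f"{BASE}/generate",
                  json={"prompt": "Hello", "system_message": "You are a helpful assistant."})
print(r.json()["generated_text"])

# The streaming endpoint returns plain-text chunks
with requests.post(f"{BASE}/generate/stream", json={"prompt": "Hello"}, stream=True) as s:
    for chunk in s.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)

emb = requests.post(f"{BASE}/embedding", json={"text": "a short test sentence"}).json()
print(emb["dimension"])
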
utils/__init__.py ADDED
File without changes
utils/errors.py ADDED
@@ -0,0 +1,94 @@
from typing import Any

class ModelNotFoundError(Exception):
    """Error raised when a model cannot be found or accessed"""
    def __init__(self, model_name: str, original_error: Exception = None):
        self.model_name = model_name
        self.original_error = original_error

        message = (
            f"Could not find or access model: '{model_name}'\n\n"
            f"This could be because:\n"
            f"1. The model name is misspelled - double check the name\n"
            f"2. The model requires authentication - you need to:\n"
            f"   - Log in to Hugging Face (huggingface.co)\n"
            f"   - Accept the model's terms of use on its page\n"
            f"   - Create an access token in your HF account settings\n"
            f"   - Set the token as an environment variable: export HUGGING_FACE_HUB_TOKEN=your_token\n\n"
            f"Original error: {str(original_error)}"
        )
        super().__init__(message)

class ModelLoadError(Exception):
    """Error raised when a model fails to load"""
    def __init__(self, model_name: str, load_type: str, original_error: Exception = None):
        self.model_name = model_name
        self.load_type = load_type
        self.original_error = original_error

        message = (
            f"Failed to load model: '{model_name}' using {load_type} precision\n\n"
            f"Common reasons:\n"
            f"1. Not enough GPU memory - This model requires more VRAM than available\n"
            f"   - Try using 8-bit quantization (load_in_8bit=True)\n"
            f"   - Try using 4-bit quantization (load_in_4bit=True)\n"
            f"   - Or use a smaller model\n"
            f"2. Incorrect model parameters - Check the model card for correct loading parameters\n"
            f"3. Corrupted model files - Try removing the model folder and downloading again\n\n"
            f"Original error: {str(original_error)}"
        )
        super().__init__(message)

class InvalidConfigurationError(Exception):
    """Error raised when configuration is invalid"""
    def __init__(self, param_name: str, current_value: Any, expected_value: str, original_error: Exception = None):
        self.param_name = param_name
        self.current_value = current_value
        self.expected_value = expected_value
        self.original_error = original_error

        message = (
            f"Invalid configuration parameter: '{param_name}'\n\n"
            f"Current value: {current_value}\n"
            f"Expected value: {expected_value}\n\n"
            f"Please update your config.yaml file with the correct value\n"
            f"Original error: {str(original_error)}"
        )
        super().__init__(message)

class GenerationError(Exception):
    """Error raised when text generation fails"""
    def __init__(self, stage: str, original_error: Exception = None):
        self.stage = stage
        self.original_error = original_error

        message = (
            f"Text generation failed during {stage}\n\n"
            f"This could be because:\n"
            f"1. The model ran out of memory during generation\n"
            f"   - Try reducing max_new_tokens\n"
            f"   - Try reducing the input text length\n"
            f"2. The input prompt might be too complex or long\n"
            f"3. The model might be in an inconsistent state\n"
            f"   - Try reinitializing the model\n\n"
            f"Original error: {str(original_error)}"
        )
        super().__init__(message)

# Usage examples:
"""
# When model not found:
raise ModelNotFoundError("mistralai/Mistral-7B-v0.1", original_error=e)

# When model fails to load:
raise ModelLoadError("mistralai/Mistral-7B-v0.1", "8-bit quantization", original_error=e)

# When config is invalid:
raise InvalidConfigurationError(
    "temperature",
    2.5,
    "a value between 0.0 and 2.0",
    original_error=e
)

# When generation fails:
raise GenerationError("token generation", original_error=e)
"""
utils/helpers.py ADDED
@@ -0,0 +1,36 @@
import psutil
import torch
from pathlib import Path
from typing import Dict, Any

def get_system_info() -> Dict[str, Any]:
    """Get system resource information"""
    return {
        "cpu_percent": psutil.cpu_percent(),
        "memory_percent": psutil.virtual_memory().percent,
        "gpu_available": torch.cuda.is_available(),
        "gpu_memory_used": torch.cuda.memory_allocated() if torch.cuda.is_available() else 0,
        "gpu_memory_total": torch.cuda.get_device_properties(0).total_memory if torch.cuda.is_available() else 0
    }

def calculate_optimal_batch_size(model_size: int, available_memory: int) -> int:
    """Calculate optimal batch size based on model size and available memory"""
    memory_per_sample = model_size * 1.5  # Rough estimate including overhead
    return max(1, int(available_memory // memory_per_sample))

def ensure_folder_structure(config: Dict) -> None:
    """Ensure all necessary folders exist"""
    folders = [
        Path(config["folders"]["models"]),
        Path(config["folders"]["cache"]),
        Path(config["folders"]["logs"])
    ]
    for folder in folders:
        folder.mkdir(parents=True, exist_ok=True)

def format_memory_size(size_bytes: int) -> str:
    """Format memory size to human readable format"""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024:
            return f"{size_bytes:.2f}{unit}"
        size_bytes /= 1024
    return f"{size_bytes:.2f}PB"
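
A quick illustration of these helpers (output values vary by machine; the GPU fields are 0 on CPU-only hosts):

from utils.helpers import get_system_info, format_memory_size

info = get_system_info()
print(f"CPU: {info['cpu_percent']}%  GPU available: {info['gpu_available']}")
print("GPU memory total:", format_memory_size(info["gpu_memory_total"]))  # e.g. "8.00GB"
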
utils/logging.py ADDED
@@ -0,0 +1,29 @@
import logging
from pathlib import Path

def setup_logger(config: dict, name: str = None) -> logging.Logger:
    """Set up logger with configuration from config file."""
    logger = logging.getLogger(name or __name__)

    # Set level from config
    level = getattr(logging, config["logging"]["level"].upper())
    logger.setLevel(level)

    # Avoid attaching duplicate handlers when the same logger is requested again
    if logger.handlers:
        return logger

    # Create logs directory if it doesn't exist
    log_path = Path(config["folders"]["logs"])
    log_path.mkdir(exist_ok=True)

    # Create handlers
    file_handler = logging.FileHandler(log_path / config["logging"]["file"])
    console_handler = logging.StreamHandler()

    # Create formatter
    formatter = logging.Formatter(config["logging"]["format"])
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    # Add handlers
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    return logger
utils/validation.py ADDED
@@ -0,0 +1,23 @@
from typing import Dict, Any
from pathlib import Path

def validate_model_path(model_path: Path) -> bool:
    """Validate that a model path exists and contains the necessary files"""
    if not model_path.exists():
        return False
    # A saved model needs its config plus at least one weight file
    # (newer transformers versions write model.safetensors instead of pytorch_model.bin)
    if not (model_path / 'config.json').exists():
        return False
    weight_patterns = ['pytorch_model*.bin', 'model*.safetensors']
    return any(any(model_path.glob(pattern)) for pattern in weight_patterns)

def validate_generation_params(params: Dict[str, Any]) -> Dict[str, Any]:
    """Validate and normalize generation parameters"""
    validated = params.copy()

    # Ensure temperature is within bounds
    if 'temperature' in validated:
        validated['temperature'] = max(0.0, min(2.0, validated['temperature']))

    # Ensure max_new_tokens is reasonable
    if 'max_new_tokens' in validated:
        validated['max_new_tokens'] = max(1, min(4096, validated['max_new_tokens']))

    return validated
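
For example, out-of-range generation parameters are clamped rather than rejected:

from utils.validation import validate_generation_params

print(validate_generation_params({"temperature": 3.0, "max_new_tokens": 10_000}))
# -> {'temperature': 2.0, 'max_new_tokens': 4096}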