AurelioAguirre committed · Commit f35f208 · 1 Parent(s): cfaa883

Refactored
app/__init__.py ADDED
File without changes
app/api.py ADDED
@@ -0,0 +1,286 @@
import os
from pathlib import Path
from threading import Thread
import torch
from typing import Optional, Iterator, List
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from utils.logging import setup_logger

class LLMApi:
    def __init__(self, config: dict):
        """Initialize the LLM API with configuration."""
        self.logger = setup_logger(config, "llm_api")
        self.logger.info("Initializing LLM API")

        # Set up paths
        self.base_path = Path(config["model"]["base_path"])
        self.models_path = self.base_path / config["folders"]["models"]
        self.cache_path = self.base_path / config["folders"]["cache"]

        self.model = None
        self.model_name = None
        self.tokenizer = None

        # Generation parameters from config
        gen_config = config["model"]["generation"]
        self.max_new_tokens = gen_config["max_new_tokens"]
        self.do_sample = gen_config["do_sample"]
        self.temperature = gen_config["temperature"]
        self.repetition_penalty = gen_config["repetition_penalty"]

        self.generation_config = {
            "max_new_tokens": self.max_new_tokens,
            "do_sample": self.do_sample,
            "temperature": self.temperature,
            "repetition_penalty": self.repetition_penalty,
            "eos_token_id": None,
            "pad_token_id": None
        }

        # Create necessary directories
        self.models_path.mkdir(parents=True, exist_ok=True)
        self.cache_path.mkdir(parents=True, exist_ok=True)

        # Set cache directory for transformers
        os.environ['TRANSFORMERS_CACHE'] = str(self.cache_path)

        self.logger.info("LLM API initialized successfully")

    def download_model(self, model_name: str) -> None:
        """
        Download a model and its tokenizer to the models directory.

        Args:
            model_name: The name of the model to download (e.g., "norallm/normistral-11b-warm")
        """
        self.logger.info(f"Starting download of model: {model_name}")
        try:
            model_path = self.models_path / model_name.split('/')[-1]

            # Download and save model
            model = AutoModelForCausalLM.from_pretrained(model_name)
            tokenizer = AutoTokenizer.from_pretrained(model_name)

            self.logger.info(f"Saving model to {model_path}")
            model.save_pretrained(model_path)
            tokenizer.save_pretrained(model_path)

            self.logger.info(f"Successfully downloaded model: {model_name}")
        except Exception as e:
            self.logger.error(f"Failed to download model {model_name}: {str(e)}")
            raise

    def initialize_model(self, model_name: str) -> None:
        """
        Initialize a model and tokenizer, either from local storage or by downloading.

        Args:
            model_name: The name of the model to initialize
        """
        self.logger.info(f"Initializing model: {model_name}")
        try:
            self.model_name = model_name
            local_model_path = self.models_path / model_name.split('/')[-1]

            # Check if model exists locally
            if local_model_path.exists():
                self.logger.info(f"Loading model from local path: {local_model_path}")
                model_path = local_model_path
            else:
                self.logger.info(f"Loading model from source: {model_name}")
                model_path = model_name

            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="auto",
                load_in_8bit=True,
                torch_dtype=torch.float16
            )
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)

            # Update generation config with tokenizer-specific values
            self.generation_config["eos_token_id"] = self.tokenizer.eos_token_id
            self.generation_config["pad_token_id"] = self.tokenizer.eos_token_id

            self.logger.info(f"Successfully initialized model: {model_name}")
        except Exception as e:
            self.logger.error(f"Failed to initialize model {model_name}: {str(e)}")
            raise

    def has_chat_template(self) -> bool:
        """Check if the current model has a chat template."""
        try:
            self.tokenizer.apply_chat_template(
                [{"role": "user", "content": "test"}],
                tokenize=False,
            )
            return True
        except (ValueError, AttributeError):
            return False

    def _prepare_prompt(self, prompt: str, system_message: Optional[str] = None) -> str:
        """
        Prepare the prompt text, either using the model's chat template if available,
        or falling back to a simple OpenAI-style format.
        """
        try:
            messages = []
            if system_message:
                messages.append({"role": "system", "content": system_message})
            messages.append({"role": "user", "content": prompt})

            return self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        except (ValueError, AttributeError):
            template = ""
            if system_message:
                template += f"System: {system_message}\n\n"
            template += f"User: {prompt}\n\nAssistant: "
            return template

    def generate_response(
        self,
        prompt: str,
        system_message: Optional[str] = None,
        max_new_tokens: Optional[int] = None
    ) -> str:
        """
        Generate a complete response for the given prompt.
        """
        self.logger.debug(f"Generating response for prompt: {prompt[:50]}...")

        if self.model is None:
            raise RuntimeError("Model not initialized. Call initialize_model first.")

        try:
            text = self._prepare_prompt(prompt, system_message)
            inputs = self.tokenizer([text], return_tensors="pt")

            # Remove token_type_ids if present
            model_inputs = {k: v.to(self.model.device) for k, v in inputs.items()
                            if k != 'token_type_ids'}

            generation_config = self.generation_config.copy()
            if max_new_tokens:
                generation_config["max_new_tokens"] = max_new_tokens

            generated_ids = self.model.generate(
                **model_inputs,
                **generation_config
            )

            generated_ids = [
                output_ids[len(input_ids):]
                for input_ids, output_ids in zip(model_inputs['input_ids'], generated_ids)
            ]

            response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
            self.logger.debug(f"Generated response: {response[:50]}...")
            return response

        except Exception as e:
            self.logger.error(f"Error generating response: {str(e)}")
            raise

    def generate_stream(
        self,
        prompt: str,
        system_message: Optional[str] = None,
        max_new_tokens: Optional[int] = None
    ) -> Iterator[str]:
        """
        Generate a streaming response for the given prompt.
        """
        self.logger.debug(f"Starting streaming generation for prompt: {prompt[:50]}...")

        if self.model is None:
            raise RuntimeError("Model not initialized. Call initialize_model first.")

        try:
            text = self._prepare_prompt(prompt, system_message)
            inputs = self.tokenizer([text], return_tensors="pt")

            # Remove token_type_ids if present
            model_inputs = {k: v.to(self.model.device) for k, v in inputs.items()
                            if k != 'token_type_ids'}

            # Configure generation
            generation_config = self.generation_config.copy()
            if max_new_tokens:
                generation_config["max_new_tokens"] = max_new_tokens

            # Set up streaming
            streamer = TextIteratorStreamer(self.tokenizer)
            generation_kwargs = dict(
                **model_inputs,
                **generation_config,
                streamer=streamer
            )

            # Create a thread to run the generation
            thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
            thread.start()

            # Yield the generated text in chunks
            for new_text in streamer:
                self.logger.debug(f"Generated chunk: {new_text[:50]}...")
                yield new_text

        except Exception as e:
            self.logger.error(f"Error in streaming generation: {str(e)}")
            raise

    def generate_embedding(self, text: str) -> List[float]:
        """
        Generate a single embedding vector for a chunk of text.
        Returns a list of floats representing the text embedding.
        """
        self.logger.debug(f"Generating embedding for text: {text[:50]}...")

        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Model not initialized. Call initialize_model first.")

        try:
            # Tokenize the input text and ensure input_ids are Long type
            inputs = self.tokenizer(text, return_tensors='pt')
            input_ids = inputs.input_ids.to(dtype=torch.long, device=self.model.device)

            # Get the model's dtype from its parameters for the attention mask
            model_dtype = next(self.model.parameters()).dtype

            # Create an attention mask with matching dtype
            attention_mask = torch.zeros(
                input_ids.size(0),
                1,
                input_ids.size(1),
                input_ids.size(1),
                device=input_ids.device,
                dtype=model_dtype
            )

            # Get model outputs
            with torch.no_grad():
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    output_hidden_states=True,
                    return_dict=True
                )

            # Get the last hidden state
            last_hidden_state = outputs.hidden_states[-1]

            # Average the hidden state over all tokens (excluding padding)
            embedding = last_hidden_state[0].mean(dim=0)

            # Convert to regular Python list
            embedding_list = embedding.cpu().tolist()
            self.logger.debug(f"Generated embedding of length: {len(embedding_list)}")
            return embedding_list

        except Exception as e:
            self.logger.error(f"Error generating embedding: {str(e)}")
            raise
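
For orientation, here is a minimal sketch of driving LLMApi directly, outside of FastAPI. It assumes the repository root is the working directory (so `utils` is importable and `app/config.yaml` resolves), a CUDA GPU with bitsandbytes available (the loader uses 8-bit quantization), and the default model name from the config; the real entry points are app/main.py and app/routes.py.

import yaml
from app.api import LLMApi

# Load the same YAML config that app/main.py reads (path assumed relative to repo root)
with open("app/config.yaml") as f:
    config = yaml.safe_load(f)

api = LLMApi(config)
api.initialize_model(config["model"]["defaults"]["model_name"])

# Blocking generation
print(api.generate_response("What does a tokenizer do?",
                            system_message="You are a helpful assistant."))

# Streaming generation
for chunk in api.generate_stream("What does a tokenizer do?"):
    print(chunk, end="", flush=True)
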
app/config.yaml ADDED
@@ -0,0 +1,30 @@
server:
  host: "0.0.0.0"
  port: 8000

model:
  base_path: "."
  generation:
    max_new_tokens: 256
    do_sample: true
    temperature: 0.7
    repetition_penalty: 1.1
  defaults:
    model_name: "Qwen/Qwen2.5-Coder-3B-Instruct"

folders:
  models: "models"
  cache: ".cache"
  logs: "logs"

logging:
  level: "INFO"
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file: "llm_api.log"

api:
  version: "v1"
  prefix: "/api"
  cors:
    origins: ["*"]
    credentials: true
app/env_template ADDED
@@ -0,0 +1,26 @@
# Hugging Face Authentication
HF_TOKEN=your_token_here

# CUDA Device Configuration
CUDA_VISIBLE_DEVICES=0,1  # Specify GPUs to use (e.g., 0 for first GPU, 0,1 for first two)

# Memory Management
PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
CUDA_LAUNCH_BLOCKING=1  # Set to 1 for debugging
CUDA_AUTO_BOOST=0  # Disable auto boost for consistent performance

# Cache Paths
CUDA_CACHE_PATH=/path/to/cuda/cache
TRANSFORMERS_CACHE=/path/to/transformers/cache

# Performance Settings
TF_ENABLE_ONEDNN_OPTS=1
TF_GPU_ALLOCATOR=cuda_malloc_async

# Model Settings
TRANSFORMERS_OFFLINE=0  # Set to 1 for offline mode

# Logging
LOG_LEVEL=INFO  # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL

# Add any additional environment-specific variables below
app/main.py ADDED
@@ -0,0 +1,145 @@
import yaml
import sys
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
from .api import LLMApi
from .routes import router, init_router
from utils.logging import setup_logger
from huggingface_hub import login
from pathlib import Path
from dotenv import load_dotenv
import os

def validate_hf(config):
    """
    Validate Hugging Face authentication.
    Checks for a .env file, loads environment variables, and attempts HF login if a token exists.
    """
    logger = setup_logger(config, "hf_validation")

    # Check for .env file
    env_path = Path('.env')
    if env_path.exists():
        logger.info("Found .env file, loading environment variables")
        load_dotenv()
    else:
        logger.warning("No .env file found. Fine if you're on Hugging Face, but you need one to run locally on your PC.")

    # Check for HF token
    hf_token = os.getenv('HF_TOKEN')
    if not hf_token:
        logger.error("No HF_TOKEN found in environment variables")
        return False

    try:
        # Attempt login
        login(token=hf_token)
        logger.info("Successfully authenticated with Hugging Face")
        return True
    except Exception as e:
        logger.error(f"Failed to authenticate with Hugging Face: {str(e)}")
        return False

def load_config():
    """Load configuration from yaml file"""
    with open("app/config.yaml", "r") as f:
        return yaml.safe_load(f)

def create_app():
    config = load_config()
    logger = setup_logger(config, "main")
    logger.info("Starting LLM API server")

    app = FastAPI(
        title="LLM API",
        description="API for Large Language Model operations",
        version=config["api"]["version"]
    )

    # Add CORS middleware
    app.add_middleware(
        CORSMiddleware,
        allow_origins=config["api"]["cors"]["origins"],
        allow_credentials=config["api"]["cors"]["credentials"],
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Initialize routes with config
    init_router(config)

    app.include_router(router, prefix=f"{config['api']['prefix']}/{config['api']['version']}")

    logger.info("FastAPI application created successfully")
    return app

def test_locally():
    """Run local tests for development and debugging"""
    config = load_config()
    logger = setup_logger(config, "test")
    logger.info("Starting local tests")

    api = LLMApi(config)
    model_name = config["model"]["defaults"]["model_name"]

    logger.info(f"Testing with model: {model_name}")

    # Test download
    logger.info("Testing model download...")
    api.download_model(model_name)
    logger.info("Download complete")

    # Test initialization
    logger.info("Initializing model...")
    api.initialize_model(model_name)
    logger.info("Model initialized")

    # Test embedding
    test_text = "Dette er en test av embeddings generering fra en teknisk tekst om HMS rutiner på arbeidsplassen."
    logger.info("Testing embedding generation...")
    embedding = api.generate_embedding(test_text)
    logger.info(f"Generated embedding of length: {len(embedding)}")
    logger.info(f"First few values: {embedding[:5]}")

    # Test generation
    test_prompts = [
        "Tell me what happens in a nuclear reactor.",
    ]

    # Test regular generation
    logger.info("Testing regular generation:")
    for prompt in test_prompts:
        logger.info(f"Prompt: {prompt}")
        response = api.generate_response(
            prompt=prompt,
            system_message="You are a helpful assistant."
        )
        logger.info(f"Response: {response}")

    # Test streaming generation
    logger.info("Testing streaming generation:")
    logger.info(f"Prompt: {test_prompts[0]}")
    for chunk in api.generate_stream(
        prompt=test_prompts[0],
        system_message="You are a helpful assistant."
    ):
        print(chunk, end="", flush=True)
    print("\n")

    logger.info("Local tests completed")

app = create_app()

if __name__ == "__main__":
    config = load_config()
    validate_hf(config)
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        test_locally()
    else:
        uvicorn.run(
            "app.main:app",
            host=config["server"]["host"],
            port=config["server"]["port"],
            reload=True
        )
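
Because main.py uses relative imports and builds `app` at import time, the intended launch is as a package module from the repository root (`python -m app.main`, or `python -m app.main test` for the local test path). As a hedged alternative, the app object can also be served programmatically:

# Sketch: serve the already-constructed FastAPI app without the CLI entry point.
# Assumes the repository root as working directory so app/config.yaml is found.
import uvicorn
from app.main import app

uvicorn.run(app, host="0.0.0.0", port=8000)
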
app/routes.py ADDED
@@ -0,0 +1,349 @@
from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional, List, Dict, Union
from .api import LLMApi
from utils.logging import setup_logger
from utils.helpers import get_system_info, format_memory_size
from utils.validation import validate_model_path
import psutil
from pathlib import Path

router = APIRouter()
logger = None
api = None
config = None

def init_router(config_dict: dict):
    """Initialize router with config and LLM API instance"""
    global logger, api, config
    config = config_dict
    logger = setup_logger(config, "api_routes")
    api = LLMApi(config)
    logger.info("Router initialized with LLM API instance")

class GenerateRequest(BaseModel):
    prompt: str
    system_message: Optional[str] = None
    max_new_tokens: Optional[int] = None

class EmbeddingRequest(BaseModel):
    text: str

class EmbeddingResponse(BaseModel):
    embedding: List[float]
    dimension: int

class SystemStatusResponse(BaseModel):
    """Pydantic model for system status response"""
    cpu: Optional[Dict[str, Union[float, str]]] = None
    memory: Optional[Dict[str, Union[float, str]]] = None
    gpu: Optional[Dict[str, Union[bool, str, float]]] = None
    storage: Optional[Dict[str, str]] = None
    model: Optional[Dict[str, Union[bool, str]]] = None

class ValidationResponse(BaseModel):
    # Union[bool, str] so that error messages from failed checks still validate
    config_validation: Dict[str, Union[bool, str]]
    model_validation: Dict[str, Union[bool, str]]
    folder_validation: Dict[str, Union[bool, str]]
    overall_status: str
    issues: List[str]

@router.get("/system/validate",
            response_model=ValidationResponse,
            summary="Validate System Configuration",
            description="Validates system configuration, folders, and model setup")
async def validate_system():
    """
    Validates:
    - Configuration parameters
    - Model setup
    - Folder structure
    - Required permissions
    """
    logger.info("Starting system validation")
    issues = []

    # Validate configuration
    try:
        config_status = {
            "has_required_fields": True,  # Check if all required config fields exist
            "valid_paths": True,          # Check if paths are valid
            "valid_parameters": True      # Check if parameters are within acceptable ranges
        }

        # Example validation checks
        if not api.models_path.exists():
            config_status["valid_paths"] = False
            issues.append("Models directory does not exist")

        if api.temperature < 0 or api.temperature > 2:
            config_status["valid_parameters"] = False
            issues.append("Temperature parameter out of valid range (0-2)")

    except Exception as e:
        logger.error(f"Configuration validation failed: {str(e)}")
        config_status = {"error": str(e)}
        issues.append(f"Config validation error: {str(e)}")

    # Validate model setup
    try:
        model_status = {
            "model_files_exist": False,
            "model_loadable": False,
            "tokenizer_valid": False
        }

        if api.model_name:
            model_path = api.models_path / api.model_name.split('/')[-1]
            model_status["model_files_exist"] = validate_model_path(model_path)

            if not model_status["model_files_exist"]:
                issues.append("Model files are missing or incomplete")

            model_status["model_loadable"] = api.model is not None
            model_status["tokenizer_valid"] = api.tokenizer is not None

    except Exception as e:
        logger.error(f"Model validation failed: {str(e)}")
        model_status = {"error": str(e)}
        issues.append(f"Model validation error: {str(e)}")

    # Validate folder structure and permissions
    try:
        folder_status = {
            "models_folder": api.models_path.exists(),
            "cache_folder": api.cache_path.exists(),
            "logs_folder": Path(api.base_path / "logs").exists(),
            "write_permissions": False
        }

        # Test write permissions by attempting to create a test file
        test_file = api.models_path / ".test_write"
        try:
            test_file.touch()
            test_file.unlink()
            folder_status["write_permissions"] = True
        except OSError:
            folder_status["write_permissions"] = False
            issues.append("Insufficient write permissions in models directory")

    except Exception as e:
        logger.error(f"Folder validation failed: {str(e)}")
        folder_status = {"error": str(e)}
        issues.append(f"Folder validation error: {str(e)}")

    # Determine overall status
    if not issues:
        overall_status = "valid"
    elif len(issues) < 3:
        overall_status = "warning"
    else:
        overall_status = "invalid"

    validation_response = ValidationResponse(
        config_validation=config_status,
        model_validation=model_status,
        folder_validation=folder_status,
        overall_status=overall_status,
        issues=issues
    )

    logger.info(f"System validation completed with status: {overall_status}")
    return validation_response


@router.get("/system/status",
            response_model=SystemStatusResponse,
            summary="Check System Status",
            description="Returns comprehensive system status including CPU, Memory, GPU, Storage, and Model information")
async def check_system():
    """
    Get system status including:
    - CPU usage
    - Memory usage
    - GPU availability and usage
    - Storage status for model and cache directories
    - Current model status
    """
    logger.info("Checking system status")
    status = SystemStatusResponse()
    system_info = None

    # Check CPU
    try:
        system_info = get_system_info()
        status.cpu = {
            "usage_percent": system_info["cpu_percent"],
            "status": "healthy" if system_info["cpu_percent"] < 90 else "high"
        }
        logger.debug(f"CPU status retrieved: {status.cpu}")
    except Exception as e:
        logger.error(f"Failed to get CPU info: {str(e)}")
        status.cpu = {"status": "error", "message": str(e)}

    # Check Memory
    try:
        if not system_info:
            system_info = get_system_info()
        status.memory = {
            "usage_percent": system_info["memory_percent"],
            "status": "healthy" if system_info["memory_percent"] < 90 else "critical",
            "available": format_memory_size(psutil.virtual_memory().available)
        }
        logger.debug(f"Memory status retrieved: {status.memory}")
    except Exception as e:
        logger.error(f"Failed to get memory info: {str(e)}")
        status.memory = {"status": "error", "message": str(e)}

    # Check GPU
    try:
        if not system_info:
            system_info = get_system_info()
        status.gpu = {
            "available": system_info["gpu_available"],
            "memory_used": format_memory_size(system_info["gpu_memory_used"]),
            "memory_total": format_memory_size(system_info["gpu_memory_total"]),
            "utilization_percent": system_info["gpu_memory_used"] / system_info["gpu_memory_total"] * 100 if system_info["gpu_available"] else 0
        }
        logger.debug(f"GPU status retrieved: {status.gpu}")
    except Exception as e:
        logger.error(f"Failed to get GPU info: {str(e)}")
        status.gpu = {"status": "error", "message": str(e)}

    # Check Storage
    try:
        models_path = Path(api.models_path)
        cache_path = Path(api.cache_path)
        status.storage = {
            "models_directory": str(models_path),
            "models_size": format_memory_size(sum(f.stat().st_size for f in models_path.glob('**/*') if f.is_file())),
            "cache_directory": str(cache_path),
            "cache_size": format_memory_size(sum(f.stat().st_size for f in cache_path.glob('**/*') if f.is_file()))
        }
        logger.debug(f"Storage status retrieved: {status.storage}")
    except Exception as e:
        logger.error(f"Failed to get storage info: {str(e)}")
        status.storage = {"status": "error", "message": str(e)}

    # Check Model Status
    try:
        current_model_path = api.models_path / api.model_name.split('/')[-1] if api.model_name else None
        status.model = {
            "is_loaded": api.model is not None,
            "current_model": api.model_name,
            "is_valid": validate_model_path(current_model_path) if current_model_path else False,
            "has_chat_template": api.has_chat_template() if api.model else False
        }
        logger.debug(f"Model status retrieved: {status.model}")
    except Exception as e:
        logger.error(f"Failed to get model status: {str(e)}")
        status.model = {"status": "error", "message": str(e)}

    logger.info("System status check completed")
    return status


@router.post("/generate")
async def generate_text(request: GenerateRequest):
    """Generate text response from prompt"""
    logger.info(f"Received generation request for prompt: {request.prompt[:50]}...")
    try:
        response = api.generate_response(
            prompt=request.prompt,
            system_message=request.system_message,
            max_new_tokens=request.max_new_tokens or api.max_new_tokens
        )
        logger.info("Successfully generated response")
        return {"generated_text": response}
    except Exception as e:
        logger.error(f"Error in generate_text endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/generate/stream")
async def generate_stream(request: GenerateRequest):
    """Generate streaming text response from prompt"""
    logger.info(f"Received streaming generation request for prompt: {request.prompt[:50]}...")
    try:
        # Wrap the generator in a StreamingResponse so FastAPI streams chunks
        # to the client instead of trying to serialize the generator object.
        return StreamingResponse(
            api.generate_stream(
                prompt=request.prompt,
                system_message=request.system_message,
                max_new_tokens=request.max_new_tokens or api.max_new_tokens
            ),
            media_type="text/plain"
        )
    except Exception as e:
        logger.error(f"Error in generate_stream endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/embedding", response_model=EmbeddingResponse)
async def generate_embedding(request: EmbeddingRequest):
    """Generate embedding vector from text"""
    logger.info(f"Received embedding request for text: {request.text[:50]}...")
    try:
        embedding = api.generate_embedding(request.text)
        logger.info(f"Successfully generated embedding of dimension {len(embedding)}")
        return EmbeddingResponse(
            embedding=embedding,
            dimension=len(embedding)
        )
    except Exception as e:
        logger.error(f"Error in generate_embedding endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/model/download",
             summary="Download default or specified model",
             description="Downloads model files. Uses default model from config if none specified.")
async def download_model(model_name: Optional[str] = None):
    """Download model files to local storage"""
    try:
        # Use model name from config if none provided
        model_to_download = model_name or config["model"]["defaults"]["model_name"]
        logger.info(f"Received request to download model: {model_to_download}")

        api.download_model(model_to_download)
        logger.info(f"Successfully downloaded model: {model_to_download}")

        return {
            "status": "success",
            "message": f"Model {model_to_download} downloaded",
            "model_name": model_to_download
        }
    except Exception as e:
        logger.error(f"Error downloading model: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/model/initialize",
             summary="Initialize default or specified model",
             description="Initialize model for use. Uses default model from config if none specified.")
async def initialize_model(model_name: Optional[str] = None):
    """Initialize a model for use"""
    try:
        # Use model name from config if none provided
        model_to_init = model_name or config["model"]["defaults"]["model_name"]
        logger.info(f"Received request to initialize model: {model_to_init}")

        api.initialize_model(model_to_init)
        logger.info(f"Successfully initialized model: {model_to_init}")

        return {
            "status": "success",
            "message": f"Model {model_to_init} initialized",
            "model_name": model_to_init
        }
    except Exception as e:
        logger.error(f"Error initializing model: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.get("/models/status")
async def get_model_status():
    """Get current model status"""
    try:
        status = {
            "model_loaded": api.model is not None,
            "current_model": api.model_name if api.model_name else None,
            "has_chat_template": api.has_chat_template() if api.model else False
        }
        logger.info(f"Retrieved model status: {status}")
        return status
    except Exception as e:
        logger.error(f"Error getting model status: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
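
As a usage illustration (not part of the commit), a small client sketch against these routes with the `requests` library, assuming a local server and the `/api/v1` prefix from config.yaml:

import requests

BASE = "http://localhost:8000/api/v1"  # host/port and prefix assumed from config.yaml

requests.post(f"{BASE}/model/initialize")  # load the default model from config

r = requests.post(f"{BASE}/generate",
                  json={"prompt": "Hello", "system_message": "You are a helpful assistant."})
print(r.json()["generated_text"])

# The streaming endpoint returns plain-text chunks
with requests.post(f"{BASE}/generate/stream", json={"prompt": "Hello"}, stream=True) as s:
    for chunk in s.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)

emb = requests.post(f"{BASE}/embedding", json={"text": "a short test sentence"}).json()
print(emb["dimension"])
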
utils/__init__.py ADDED
File without changes
utils/errors.py ADDED
@@ -0,0 +1,94 @@
from typing import Any

class ModelNotFoundError(Exception):
    """Error raised when a model cannot be found or accessed"""
    def __init__(self, model_name: str, original_error: Exception = None):
        self.model_name = model_name
        self.original_error = original_error

        message = (
            f"Could not find or access model: '{model_name}'\n\n"
            f"This could be because:\n"
            f"1. The model name is misspelled - double check the name\n"
            f"2. The model requires authentication - you need to:\n"
            f"   - Log in to Hugging Face (huggingface.co)\n"
            f"   - Accept the model's terms of use on its page\n"
            f"   - Create an access token in your HF account settings\n"
            f"   - Set the token as an environment variable: export HUGGING_FACE_HUB_TOKEN=your_token\n\n"
            f"Original error: {str(original_error)}"
        )
        super().__init__(message)

class ModelLoadError(Exception):
    """Error raised when a model fails to load"""
    def __init__(self, model_name: str, load_type: str, original_error: Exception = None):
        self.model_name = model_name
        self.load_type = load_type
        self.original_error = original_error

        message = (
            f"Failed to load model: '{model_name}' using {load_type} precision\n\n"
            f"Common reasons:\n"
            f"1. Not enough GPU memory - This model requires more VRAM than available\n"
            f"   - Try using 8-bit quantization (load_in_8bit=True)\n"
            f"   - Try using 4-bit quantization (load_in_4bit=True)\n"
            f"   - Or use a smaller model\n"
            f"2. Incorrect model parameters - Check the model card for correct loading parameters\n"
            f"3. Corrupted model files - Try removing the model folder and downloading again\n\n"
            f"Original error: {str(original_error)}"
        )
        super().__init__(message)

class InvalidConfigurationError(Exception):
    """Error raised when configuration is invalid"""
    def __init__(self, param_name: str, current_value: Any, expected_value: str, original_error: Exception = None):
        self.param_name = param_name
        self.current_value = current_value
        self.expected_value = expected_value
        self.original_error = original_error

        message = (
            f"Invalid configuration parameter: '{param_name}'\n\n"
            f"Current value: {current_value}\n"
            f"Expected value: {expected_value}\n\n"
            f"Please update your config.yaml file with the correct value\n"
            f"Original error: {str(original_error)}"
        )
        super().__init__(message)

class GenerationError(Exception):
    """Error raised when text generation fails"""
    def __init__(self, stage: str, original_error: Exception = None):
        self.stage = stage
        self.original_error = original_error

        message = (
            f"Text generation failed during {stage}\n\n"
            f"This could be because:\n"
            f"1. The model ran out of memory during generation\n"
            f"   - Try reducing max_new_tokens\n"
            f"   - Try reducing the input text length\n"
            f"2. The input prompt might be too complex or long\n"
            f"3. The model might be in an inconsistent state\n"
            f"   - Try reinitializing the model\n\n"
            f"Original error: {str(original_error)}"
        )
        super().__init__(message)

# Usage examples:
"""
# When model not found:
raise ModelNotFoundError("mistralai/Mistral-7B-v0.1", original_error=e)

# When model fails to load:
raise ModelLoadError("mistralai/Mistral-7B-v0.1", "8-bit quantization", original_error=e)

# When config is invalid:
raise InvalidConfigurationError(
    "temperature",
    2.5,
    "a value between 0.0 and 2.0",
    original_error=e
)

# When generation fails:
raise GenerationError("token generation", original_error=e)
"""
utils/helpers.py ADDED
@@ -0,0 +1,36 @@
import psutil
import torch
from pathlib import Path
from typing import Dict, Any

def get_system_info() -> Dict[str, Any]:
    """Get system resource information"""
    return {
        "cpu_percent": psutil.cpu_percent(),
        "memory_percent": psutil.virtual_memory().percent,
        "gpu_available": torch.cuda.is_available(),
        "gpu_memory_used": torch.cuda.memory_allocated() if torch.cuda.is_available() else 0,
        "gpu_memory_total": torch.cuda.get_device_properties(0).total_memory if torch.cuda.is_available() else 0
    }

def calculate_optimal_batch_size(model_size: int, available_memory: int) -> int:
    """Calculate optimal batch size based on model size and available memory"""
    memory_per_sample = model_size * 1.5  # Rough estimate including overhead
    return max(1, int(available_memory // memory_per_sample))

def ensure_folder_structure(config: Dict) -> None:
    """Ensure all necessary folders exist"""
    folders = [
        Path(config["folders"]["models"]),
        Path(config["folders"]["cache"]),
        Path(config["folders"]["logs"])
    ]
    for folder in folders:
        folder.mkdir(parents=True, exist_ok=True)

def format_memory_size(size_bytes: int) -> str:
    """Format memory size to human readable format"""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024:
            return f"{size_bytes:.2f}{unit}"
        size_bytes /= 1024
    return f"{size_bytes:.2f}PB"
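
A quick illustration of these helpers (output values vary by machine; the GPU fields are 0 on CPU-only hosts):

from utils.helpers import get_system_info, format_memory_size

info = get_system_info()
print(f"CPU: {info['cpu_percent']}%  GPU available: {info['gpu_available']}")
print("GPU memory total:", format_memory_size(info["gpu_memory_total"]))  # e.g. "8.00GB"
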
utils/logging.py ADDED
@@ -0,0 +1,29 @@
import logging
from pathlib import Path

def setup_logger(config: dict, name: str = None) -> logging.Logger:
    """Set up logger with configuration from config file."""
    logger = logging.getLogger(name or __name__)

    # Set level from config
    level = getattr(logging, config["logging"]["level"].upper())
    logger.setLevel(level)

    # Avoid attaching duplicate handlers when the same logger is requested again
    if logger.handlers:
        return logger

    # Create logs directory if it doesn't exist
    log_path = Path(config["folders"]["logs"])
    log_path.mkdir(exist_ok=True)

    # Create handlers
    file_handler = logging.FileHandler(log_path / config["logging"]["file"])
    console_handler = logging.StreamHandler()

    # Create formatter
    formatter = logging.Formatter(config["logging"]["format"])
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    # Add handlers
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    return logger
utils/validation.py ADDED
@@ -0,0 +1,23 @@
from typing import Dict, Any
from pathlib import Path

def validate_model_path(model_path: Path) -> bool:
    """Validate that a model path exists and contains the necessary files"""
    if not model_path.exists():
        return False
    # A saved model needs its config plus at least one weight file
    # (newer transformers versions write model.safetensors instead of pytorch_model.bin)
    if not (model_path / 'config.json').exists():
        return False
    weight_patterns = ['pytorch_model*.bin', 'model*.safetensors']
    return any(any(model_path.glob(pattern)) for pattern in weight_patterns)

def validate_generation_params(params: Dict[str, Any]) -> Dict[str, Any]:
    """Validate and normalize generation parameters"""
    validated = params.copy()

    # Ensure temperature is within bounds
    if 'temperature' in validated:
        validated['temperature'] = max(0.0, min(2.0, validated['temperature']))

    # Ensure max_new_tokens is reasonable
    if 'max_new_tokens' in validated:
        validated['max_new_tokens'] = max(1, min(4096, validated['max_new_tokens']))

    return validated
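
For example, out-of-range generation parameters are clamped rather than rejected:

from utils.validation import validate_generation_params

print(validate_generation_params({"temperature": 3.0, "max_new_tokens": 10_000}))
# -> {'temperature': 2.0, 'max_new_tokens': 4096}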