AurelioAguirre committed on
Commit 19b1be5 · 1 Parent(s): 712d19c

Massive update, added download and convert options.

.idea/Inference-Server.iml CHANGED
@@ -4,6 +4,7 @@
   <exclude-output />
   <content url="file://$MODULE_DIR$">
     <excludeFolder url="file://$MODULE_DIR$/myenv" />
+    <excludeFolder url="file://$MODULE_DIR$/venv" />
   </content>
   <orderEntry type="inheritedJdk" />
   <orderEntry type="sourceFolder" forTests="false" />
README.md CHANGED
@@ -24,4 +24,8 @@ folders
   LLM-Engine
   Main
   main.py
+  routes.py
+  checkpoints
+  meta
+
  ```
client/__init__.py ADDED
File without changes
client/client.py ADDED
@@ -0,0 +1,275 @@
+import requests
+import json
+import sseclient
+import sys
+from pathlib import Path
+import yaml
+from typing import Optional
+import os
+
+from litgpt.scripts.convert_hf_checkpoint import convert_hf_checkpoint
+from litgpt.scripts.download import download_from_hub
+
+DEFAULT_CONFIG = {
+    'server': {'url': 'http://localhost:7860'},
+    'model': {
+        'name': 'Qwen2.5-Coder-7B-Instruct',
+        'download_location': 'huihui-ai/Qwen2.5-Coder-7B-Instruct-abliterated',
+        'folder_path': 'huihui-ai/Qwen2.5-Coder-7B-Instruct-abliterated',
+        'model_filename': 'model.safetensors'
+    }
+}
+
+def get_project_root(config: dict) -> Path:
+    client_dir = Path(__file__).parent
+    return (client_dir / config['project']['root_dir']).resolve()
+
+def get_checkpoints_dir(config: dict) -> Path:
+    root = get_project_root(config)
+    return root / config['project']['checkpoints_dir']
+
+class LLMClient:
+    def __init__(self, config: dict):
+        self.config = config
+        self.base_url = config['server']['url'].rstrip('/')
+        self.session = requests.Session()
+        self.checkpoints_dir = get_checkpoints_dir(config)
+
+    def download_model(
+        self,
+        repo_id: Optional[str] = None,
+        access_token: Optional[str] = os.getenv("HF_TOKEN"),
+    ) -> None:
+        repo_id = repo_id or self.config['model']['folder_path']
+
+        print(f"\nDownloading model from: {repo_id}")
+        download_from_hub(
+            repo_id=repo_id,
+            model_name=self.config['model']['name'],
+            access_token=access_token,
+            tokenizer_only=False,
+            checkpoint_dir=self.checkpoints_dir
+        )
+
+    def convert_model(
+        self,
+        folder_path: Optional[str] = None,
+        model_name: Optional[str] = None,
+    ) -> None:
+        """Convert downloaded model to LitGPT format."""
+        folder_path = folder_path or self.config['model']['folder_path']
+        model_name = model_name or self.config['model']['name']
+
+        model_dir = self.checkpoints_dir / folder_path
+        print(f"\nConverting model in: {model_dir}")
+        print(f"Using model name: {model_name}")
+
+        try:
+            convert_hf_checkpoint(
+                checkpoint_dir=model_dir,
+                model_name=model_name
+            )
+            print("Conversion complete!")
+        except ValueError as e:
+            if "is not a supported config name" in str(e):
+                print(f"\nNote: Model '{model_name}' isn't in LitGPT's predefined configs.")
+                print("You may need to use the model's safetensors files directly.")
+            raise
+
+    def initialize_model(
+        self,
+        folder_path: Optional[str] = None,
+        mode: Optional[str] = None,
+        **kwargs
+    ) -> dict:
+        """Initialize a converted model using the standard initialize endpoint."""
+        url = f"{self.base_url}/initialize"
+
+        folder_path = folder_path or self.config['model']['folder_path']
+        mode = mode or self.config['hardware']['mode']
+
+        # Debug prints
+        print(f"\nDebug - Attempting to initialize model with:")
+        print(f"Model path: {folder_path}")
+        print(f"Mode: {mode}")
+
+        payload = {
+            "model_path": folder_path,  # This is what the regular initialize endpoint expects
+            "mode": mode,
+            "precision": self.config['hardware'].get('precision'),
+            "quantize": self.config['hardware'].get('quantize'),
+            "gpu_count": self.config['hardware'].get('gpu_count', 'auto'),
+            **kwargs
+        }
+
+        response = self.session.post(url, json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def generate_stream(
+        self,
+        prompt: str,
+        max_new_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None
+    ):
+        url = f"{self.base_url}/generate/stream"
+
+        gen_config = self.config.get('generation', {})
+        payload = {
+            "prompt": prompt,
+            "max_new_tokens": max_new_tokens or gen_config.get('max_new_tokens', 50),
+            "temperature": temperature or gen_config.get('temperature', 1.0),
+            "top_k": top_k or gen_config.get('top_k'),
+            "top_p": top_p or gen_config.get('top_p', 1.0)
+        }
+
+        response = self.session.post(url, json=payload, stream=True)
+        response.raise_for_status()
+
+        client = sseclient.SSEClient(response)
+        for event in client.events():
+            yield json.loads(event.data)
+
+def clear_screen():
+    os.system('cls' if os.name == 'nt' else 'clear')
+
+def load_config(config_path: str = "client_config.yaml") -> dict:
+    try:
+        with open(config_path, 'r') as f:
+            config = yaml.safe_load(f)
+        return config
+    except Exception as e:
+        print(f"Warning: Could not load config file: {str(e)}")
+        print("Using default configuration.")
+        return DEFAULT_CONFIG
+
+
+
+def main():
+    config = load_config()
+    client = LLMClient(config)
+
+    while True:
+        clear_screen()
+        print("\nLLM Engine Client")
+        print("================")
+        print(f"Server: {client.base_url}")
+        print(f"Current Model: {config['model']['name']}")
+        print("\nOptions:")
+        print("1. Download Model")
+        print("2. Convert Model")
+        print("3. Initialize Model")
+        print("4. Generate Text (Streaming)")
+        print("5. Exit")
+
+        choice = input("\nEnter your choice (1-5): ").strip()
+
+        if choice == "1":
+            try:
+                print("\nDownload Model")
+                print("==============")
+                print(f"Default location: {config['model']['download_location']}")
+                if input("\nUse default? (Y/n): ").lower() != 'n':
+                    repo_id = config['model']['download_location']
+                else:
+                    repo_id = input("Enter download location: ").strip()
+
+                access_token = input("Enter HF access token (or press Enter to use HF_TOKEN env var): ").strip() or None
+                client.download_model(repo_id=repo_id, access_token=access_token)
+                print("\nModel downloaded successfully!")
+                input("\nPress Enter to continue...")
+
+            except Exception as e:
+                print(f"\nError: {str(e)}")
+                input("\nPress Enter to continue...")
+
+        elif choice == "2":
+            try:
+                print("\nConvert Model")
+                print("=============")
+                print(f"Default folder path: {config['model']['folder_path']}")
+                print(f"Default model name: {config['model']['name']}")
+                if input("\nUse defaults? (Y/n): ").lower() != 'n':
+                    folder_path = config['model']['folder_path']
+                    model_name = config['model']['name']
+                else:
+                    folder_path = input("Enter folder path: ").strip()
+                    model_name = input("Enter model name: ").strip()
+
+                client.convert_model(
+                    folder_path=folder_path,
+                    model_name=model_name
+                )
+                print("\nModel converted successfully!")
+                input("\nPress Enter to continue...")
+
+            except Exception as e:
+                print(f"\nError: {str(e)}")
+                input("\nPress Enter to continue...")
+
+        elif choice == "3":
+            try:
+                print("\nInitialize Model")
+                print("================")
+                print(f"Default folder path: {config['model']['folder_path']}")
+                if input("\nUse defaults? (Y/n): ").lower() != 'n':
+                    result = client.initialize_model()
+                else:
+                    folder_path = input("Enter model folder path: ").strip()
+                    mode = input("Enter mode (cpu/gpu): ").strip()
+                    result = client.initialize_model(
+                        folder_path=folder_path,
+                        mode=mode
+                    )
+                print("\nSuccess! Model initialized.")
+                print(json.dumps(result, indent=2))
+                input("\nPress Enter to continue...")
+
+            except Exception as e:
+                print(f"\nError: {str(e)}")
+                input("\nPress Enter to continue...")
+
+        elif choice == "4":
+            try:
+                print("\nGenerate Text (Streaming)")
+                print("========================")
+                prompt = input("Enter your prompt: ").strip()
+
+                print("\nGenerating (Ctrl+C to stop)...")
+                print("\nResponse:")
+                try:
+                    for chunk in client.generate_stream(prompt=prompt):
+                        if "error" in chunk:
+                            print(f"\nError: {chunk['error']}")
+                            break
+
+                        token = chunk.get("token", "")
+                        is_finished = chunk.get("metadata", {}).get("is_finished", False)
+
+                        if is_finished:
+                            print("\n[Generation Complete]")
+                            break
+
+                        print(token, end="", flush=True)
+
+                except KeyboardInterrupt:
+                    print("\n\n[Generation Stopped]")
+
+                input("\nPress Enter to continue...")
+
+            except Exception as e:
+                print(f"\nError: {str(e)}")
+                input("\nPress Enter to continue...")
+
+        elif choice == "5":
+            print("\nGoodbye!")
+            break
+
+        else:
+            print("\nInvalid choice. Please try again.")
+            input("\nPress Enter to continue...")
+
+if __name__ == "__main__":
+    main()
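
The same download → convert → initialize → stream flow that the menu above walks through can also be driven as a script. The following is a minimal sketch, assuming the FastAPI server from `main/` is already running at the URL in `client_config.yaml` and that the snippet is executed from inside the `client/` directory:

```python
# Minimal scripted use of LLMClient (sketch; the server must already be running).
from client import LLMClient, load_config

config = load_config("client_config.yaml")
client = LLMClient(config)

client.download_model()           # repo_id defaults to config['model']['folder_path']
client.convert_model()            # writes lit_model.pth next to the downloaded files
print(client.initialize_model())  # POST /initialize with the hardware settings from the config

for chunk in client.generate_stream("Hello!", max_new_tokens=32):
    if chunk.get("metadata", {}).get("is_finished"):
        break
    print(chunk.get("token", ""), end="", flush=True)
```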
client/client_config.yaml ADDED
@@ -0,0 +1,33 @@
+# Project Configuration
+project:
+  root_dir: ".."
+  checkpoints_dir: "checkpoints"
+
+# Server Configuration
+server:
+  url: "http://localhost:7860"
+
+# Model Configuration
+model:
+  name: "Llama-3.2-3B"
+  download_location: "huihui-ai/Llama-3.2-3B-Instruct-abliterated"
+  folder_path: "huihui-ai/Llama-3.2-3B-Instruct-abliterated"
+  model_filename: "lit_model.pth"
+  config_filename: "config.json"
+  tokenizer_filename: "tokenizer.json"
+
+# Hardware Configuration
+hardware:
+  mode: "gpu"
+  precision: "16-true"
+  # Precision Options: "32-true", "16-mixed", "16-true", "bf16-mixed", "bf16-true"
+  quantize: "bnb.int8"
+  # Quantization Options: "bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"
+  gpu_count: "auto"
+
+# Generation Parameters
+generation:
+  max_new_tokens: 500
+  temperature: 1.0
+  top_k: null
+  top_p: 1.0
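
The `project` block above is what ties the client to the engine: `root_dir` is resolved relative to the `client/` directory, so with these values both sides end up sharing the same `checkpoints` folder at the repository root. A small sketch of that resolution, mirroring `get_project_root()`/`get_checkpoints_dir()` in `client.py` (the working directory is assumed to be the repository root):

```python
from pathlib import Path
import yaml

# Mirror of get_project_root()/get_checkpoints_dir() in client/client.py.
config = yaml.safe_load(open("client/client_config.yaml"))
root = (Path("client") / config["project"]["root_dir"]).resolve()  # ".." -> repo root
checkpoints_dir = root / config["project"]["checkpoints_dir"]      # <repo-root>/checkpoints
print(checkpoints_dir)
```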
main/hf_downloader.py ADDED
@@ -0,0 +1,97 @@
+import os
+import argparse
+from transformers import AutoTokenizer, AutoModel
+from huggingface_hub import login, HfApi
+import logging
+from tqdm import tqdm
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+def setup_auth(token):
+    """Setup Hugging Face authentication"""
+    try:
+        login(token)
+        logger.info("Successfully authenticated with Hugging Face")
+    except Exception as e:
+        logger.error(f"Authentication failed: {str(e)}")
+        raise
+
+def list_models(pattern=None):
+    """List available models matching the pattern"""
+    try:
+        api = HfApi()
+        models = api.list_models(pattern=pattern, full=True)
+        return [(model.modelId, model.downloads) for model in models]
+    except Exception as e:
+        logger.error(f"Failed to list models: {str(e)}")
+        raise
+
+def download_model(model_name, output_dir):
+    """Download model and tokenizer"""
+    try:
+        logger.info(f"Downloading model: {model_name}")
+
+        # Create output directory if it doesn't exist
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Download tokenizer
+        logger.info("Downloading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        tokenizer.save_pretrained(os.path.join(output_dir, model_name))
+
+        # Download model
+        logger.info("Downloading model...")
+        model = AutoModel.from_pretrained(model_name)
+        model.save_pretrained(os.path.join(output_dir, model_name))
+
+        logger.info(f"Successfully downloaded {model_name} to {output_dir}")
+        return True
+    except Exception as e:
+        logger.error(f"Failed to download model {model_name}: {str(e)}")
+        raise
+
+def main():
+    parser = argparse.ArgumentParser(description='Download models from Hugging Face')
+    parser.add_argument('--token', type=str, help='Hugging Face API token')
+    parser.add_argument('--model', type=str, help='Model name to download')
+    parser.add_argument('--output', type=str, default='./models',
+                        help='Output directory for downloaded models')
+    parser.add_argument('--search', type=str, help='Search pattern for models')
+    parser.add_argument('--list', action='store_true',
+                        help='List available models matching the search pattern')
+
+    args = parser.parse_args()
+
+    try:
+        # Setup authentication if token provided
+        if args.token:
+            setup_auth(args.token)
+
+        # List models if requested
+        if args.list:
+            logger.info(f"Searching for models matching: {args.search}")
+            models = list_models(args.search)
+            print("\nAvailable models:")
+            for model_id, downloads in sorted(models, key=lambda x: x[1], reverse=True):
+                print(f"- {model_id} (Downloads: {downloads:,})")
+            return
+
+        # Download specific model
+        if args.model:
+            download_model(args.model, args.output)
+        else:
+            logger.error("Please specify a model to download using --model")
+            return
+
+    except KeyboardInterrupt:
+        logger.info("\nOperation cancelled by user")
+    except Exception as e:
+        logger.error(f"An error occurred: {str(e)}")
+
+if __name__ == "__main__":
+    main()
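
A hedged usage sketch for this standalone helper; the import path assumes `main/` is importable from the repository root, and the output directory is only an example. The equivalent CLI call would be `python main/hf_downloader.py --model <repo-id> --output ./models`.

```python
# Drive the helper above directly instead of through argparse.
import os
from main.hf_downloader import setup_auth, download_model

token = os.getenv("HF_TOKEN")
if token:
    setup_auth(token)  # only needed for gated/private repositories

# Saves the tokenizer and weights under ./models/<repo-id>/
download_model("huihui-ai/Llama-3.2-3B-Instruct-abliterated", "./models")
```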
main/main.py CHANGED
@@ -39,10 +39,12 @@ def main():
     logger.info("Available endpoints:")
     logger.info(" - /")
     logger.info(" - /health")
+    logger.info(" - /models")
     logger.info(" - /initialize")
     logger.info(" - /generate")
-    logger.info(" - /initialize/custom")
     logger.info(" - /generate/stream")
+    logger.info(" - /download")
+    logger.info(" - /convert")
     logger.info(" - /docs")
     logger.info(" - /redoc")
     logger.info(" - /openapi.json")
main/routes.py CHANGED
@@ -1,11 +1,14 @@
+
 from fastapi import APIRouter, HTTPException
 from fastapi.responses import StreamingResponse
-from pydantic import BaseModel
-from typing import Optional, Union, AsyncGenerator
+from pydantic import BaseModel, Field
+from typing import Optional, Union, AsyncGenerator, List
 import torch
 import logging
 from pathlib import Path
 from litgpt.api import LLM
+from litgpt.scripts.download import download_from_hub
+from litgpt.scripts.convert_hf_checkpoint import convert_hf_checkpoint
 import json
 import asyncio
 
@@ -19,224 +22,204 @@ router = APIRouter()
 llm_instance = None
 
 class InitializeRequest(BaseModel):
-    """
-    Configuration for model initialization including model path
-    """
-    mode: str = "cpu"
-    precision: Optional[str] = None
-    quantize: Optional[str] = None
-    gpu_count: Union[str, int] = "auto"
-    model_path: str
+    """Configuration for model initialization including model path"""
+    mode: str = Field(default="cpu", description="Execution mode ('cpu' or 'gpu')")
+    precision: Optional[str] = Field(None, description="Precision format (e.g., 'bf16-true', 'bf16-mixed')")
+    quantize: Optional[str] = Field(None, description="Quantization format (e.g., 'bnb.nf4')")
+    gpu_count: Union[str, int] = Field(default="auto", description="Number of GPUs to use or 'auto'")
+    model_path: str = Field(..., description="Path to the model relative to checkpoints directory")
 
 class GenerateRequest(BaseModel):
-    prompt: str
-    max_new_tokens: int = 50
-    temperature: float = 1.0
-    top_k: Optional[int] = None
-    top_p: float = 1.0
-    return_as_token_ids: bool = False
-    stream: bool = False
-
-# A Pydantic model for the streaming generation request
+    """Request parameters for text generation"""
+    prompt: str = Field(..., description="Input text prompt for generation")
+    max_new_tokens: int = Field(default=50, description="Maximum number of tokens to generate")
+    temperature: float = Field(default=1.0, description="Sampling temperature")
+    top_k: Optional[int] = Field(None, description="Top-k sampling parameter")
+    top_p: float = Field(default=1.0, description="Top-p sampling parameter")
+    return_as_token_ids: bool = Field(default=False, description="Whether to return token IDs instead of text")
+    stream: bool = Field(default=False, description="Whether to stream the response")
+
 class StreamGenerateRequest(BaseModel):
-    prompt: str
-    max_new_tokens: int = 50
-    temperature: float = 1.0
-    top_k: Optional[int] = None
-    top_p: float = 1.0
+    """Request parameters for streaming text generation"""
+    prompt: str = Field(..., description="Input text prompt for generation")
+    max_new_tokens: int = Field(default=50, description="Maximum number of tokens to generate")
+    temperature: float = Field(default=1.0, description="Sampling temperature")
+    top_k: Optional[int] = Field(None, description="Top-k sampling parameter")
+    top_p: float = Field(default=1.0, description="Top-p sampling parameter")
+
+class DownloadModelRequest(BaseModel):
+    """Request to download a model from HuggingFace"""
+    repo_id: str = Field(
+        ...,
+        description="HuggingFace repository ID (e.g., 'huihui-ai/Llama-3.2-3B-Instruct-abliterated')"
+    )
+    model_name: str = Field(
+        ...,
+        description="Model architecture name (e.g., 'Llama-3.2-3B-Instruct')"
+    )
+    access_token: Optional[str] = Field(
+        None,
+        description="HuggingFace access token for private models"
+    )
 
-class InitializeCustomRequest(BaseModel):
-    """
-    Configuration for custom model initialization using from_pretrained
-    """
-    mode: str = "cpu"
-    precision: Optional[str] = None
-    quantize: Optional[str] = None
-    gpu_count: Union[str, int] = "auto"
-    folder_path: str  # Path to the model folder relative to checkpoints
-    model_filename: str  # Name of the model file (e.g., "lit_model.pth")
-    config_filename: str = "config.json"  # Default config filename
-    tokenizer_filename: Optional[str] = "tokenizer.json"  # Optional tokenizer filename
-
-
-@router.post("/initialize/custom")
-async def initialize_custom_model(request: InitializeCustomRequest):
-    """
-    Initialize a custom model using from_pretrained method.
-    This is for models that are already downloaded and stored in the checkpoints directory.
+class ConvertModelRequest(BaseModel):
+    """Request to convert a downloaded model"""
+    folder_path: str = Field(
+        ...,
+        description="Path relative to checkpoints where model was downloaded"
+    )
+    model_name: str = Field(
+        ...,
+        description="Model architecture name for conversion"
+    )
+
+class ModelResponse(BaseModel):
+    """Model information response"""
+    name: str = Field(..., description="Full model name including organization")
+    path: str = Field(..., description="Relative path in checkpoints directory")
+    downloaded: bool = Field(..., description="Whether the model files are downloaded")
+    converted: bool = Field(..., description="Whether the model is converted to LitGPT format")
+    has_safetensors: bool = Field(..., description="Whether safetensors files are present")
+    files: List[str] = Field(..., description="List of files in model directory")
+
+class ModelsListResponse(BaseModel):
+    """Response for listing models"""
+    models: List[ModelResponse] = Field(..., description="List of available models")
+
+@router.post(
+    "/download",
+    response_model=dict,
+    summary="Download a model from HuggingFace Hub",
+    description="Downloads a model from HuggingFace to the LLM Engine's checkpoints directory",
+    response_description="Download status and location information"
+)
+async def download_model(request: DownloadModelRequest):
     """
-    global llm_instance
+    Download a model from HuggingFace Hub.
+
+    - Downloads model files to the checkpoints directory
+    - Creates necessary subdirectories
+    - Handles authentication for private models
 
+    Returns:
+        A JSON object containing download status and path information
+    """
     try:
         # Get the project root directory and construct paths
-        project_root = Path(__file__).parent
+        project_root = Path(__file__).parent.parent
         checkpoints_dir = project_root / "checkpoints"
-        model_dir = checkpoints_dir / request.folder_path
-
-        logger.info(f"Loading custom model from directory: {model_dir}")
+        logger.info(f"Downloading model {request.repo_id} to {checkpoints_dir}")
+
+        download_from_hub(
+            repo_id=request.repo_id,
+            model_name=request.model_name,
+            access_token=request.access_token,
+            checkpoint_dir=checkpoints_dir,
+            tokenizer_only=False
+        )
 
-        # Verify that all required files exist
-        model_path = model_dir / request.model_filename
-        config_path = model_dir / request.config_filename
+        return {
+            "status": "success",
+            "message": f"Model downloaded to {checkpoints_dir / request.repo_id}",
+            "path": str(request.repo_id)
+        }
 
-        if not model_path.exists():
-            raise HTTPException(
-                status_code=400,
-                detail=f"Model file not found: {request.model_filename}"
-            )
+    except Exception as e:
+        logger.error(f"Error downloading model: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error downloading model: {str(e)}")
+
+@router.post(
+    "/convert",
+    response_model=dict,
+    summary="Convert a model to LitGPT format",
+    description="Converts a downloaded model to the LitGPT format required for inference",
+    response_description="Conversion status and location information"
+)
+async def convert_model(request: ConvertModelRequest):
+    """
+    Convert a downloaded model to LitGPT format.
 
-        if not config_path.exists():
-            raise HTTPException(
-                status_code=400,
-                detail=f"Config file not found: {request.config_filename}"
-            )
+    - Converts model files to LitGPT's format
+    - Creates lit_model.pth file
+    - Maintains original files
 
-        # Check for tokenizer if specified
-        tokenizer_path = None
-        if request.tokenizer_filename:
-            tokenizer_path = model_dir / request.tokenizer_filename
-            if not tokenizer_path.exists():
-                raise HTTPException(
-                    status_code=400,
-                    detail=f"Tokenizer file not found: {request.tokenizer_filename}"
-                )
-
-        # Load the model using from_pretrained
-        llm_instance = LLM.from_pretrained(
-            path=str(model_dir),
-            model_file=request.model_filename,
-            config_file=request.config_filename,
-            tokenizer_file=request.tokenizer_filename if request.tokenizer_filename else None,
-            distribute=None if request.precision or request.quantize else "auto"
-        )
+    Returns:
+        A JSON object containing conversion status and path information
+    """
+    try:
+        project_root = Path(__file__).parent.parent
+        checkpoints_dir = project_root / "checkpoints"
+        model_dir = checkpoints_dir / request.folder_path
 
-        # If manual distribution is needed
-        if request.precision or request.quantize:
-            llm_instance.distribute(
-                accelerator="cuda" if request.mode == "gpu" else "cpu",
-                devices=request.gpu_count,
-                precision=request.precision,
-                quantize=request.quantize
+        if not model_dir.exists():
+            raise HTTPException(
+                status_code=404,
+                detail=f"Model directory not found: {request.folder_path}"
             )
 
-        # Log success and memory stats
-        logger.info(
-            f"Custom model initialized successfully with config:\n"
-            f"Mode: {request.mode}\n"
-            f"Precision: {request.precision}\n"
-            f"Quantize: {request.quantize}\n"
-            f"GPU Count: {request.gpu_count}\n"
-            f"Model Directory: {model_dir}\n"
-            f"Model File: {request.model_filename}\n"
-            f"Config File: {request.config_filename}\n"
-            f"Tokenizer File: {request.tokenizer_filename}\n"
-            f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
-            f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
+        logger.info(f"Converting model in {model_dir}")
+        convert_hf_checkpoint(
+            checkpoint_dir=model_dir,
+            model_name=request.model_name
         )
 
         return {
-            "success": True,
-            "message": "Custom model initialized successfully",
-            "model_info": {
-                "folder": str(model_dir),
-                "model_file": request.model_filename,
-                "config_file": request.config_filename,
-                "tokenizer_file": request.tokenizer_filename
-            }
+            "status": "success",
+            "message": f"Model converted successfully",
+            "path": str(request.folder_path)
         }
 
     except Exception as e:
-        logger.error(f"Error initializing custom model: {str(e)}")
-        # Print detailed memory statistics on failure
-        logger.error(f"GPU Memory Stats:\n"
-                     f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB\n"
-                     f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB\n"
-                     f"Max Allocated: {torch.cuda.max_memory_allocated()/1024**3:.2f}GB")
-        raise HTTPException(status_code=500, detail=f"Error initializing custom model: {str(e)}")
-
-
-# Endpoint for streaming generation
-@router.post("/generate/stream")
-async def generate_stream(request: StreamGenerateRequest):
+        logger.error(f"Error converting model: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error converting model: {str(e)}")
+
+@router.get(
+    "/models",
+    response_model=ModelsListResponse,
+    summary="List available models",
+    description="Lists all models in the checkpoints directory with their status",
+    response_description="List of models with their details and status"
+)
+async def list_models():
     """
-    Generate text using the initialized model with streaming response.
-    Returns a StreamingResponse that yields JSON-formatted chunks of text.
+    List all models in the checkpoints directory.
+
+    Returns:
+        A JSON object containing:
+        - List of models
+        - Each model's download status
+        - Each model's conversion status
+        - Available files for each model
     """
-    global llm_instance
-
-    if llm_instance is None:
-        raise HTTPException(
-            status_code=400,
-            detail="Model not initialized. Call /initialize first."
-        )
-
-    async def event_generator() -> AsyncGenerator[str, None]:
-        try:
-            # Start the generation with streaming enabled
-            async for token in llm_instance.generate(
-                prompt=request.prompt,
-                max_new_tokens=request.max_new_tokens,
-                temperature=request.temperature,
-                top_k=request.top_k,
-                top_p=request.top_p,
-                stream=True  # Enable streaming
-            ):
-                # Create a JSON response for each token
-                chunk = {
-                    "token": token,
-                    "metadata": {
-                        "prompt": request.prompt,
-                        "is_finished": False
-                    }
-                }
-                # Format as SSE data
-                yield f"data: {json.dumps(chunk)}\n\n"
-
-                # Small delay to prevent overwhelming the client
-                await asyncio.sleep(0.01)
-
-            # Send final message indicating completion
-            final_chunk = {
-                "token": "",
-                "metadata": {
-                    "prompt": request.prompt,
-                    "is_finished": True
-                }
-            }
-            yield f"data: {json.dumps(final_chunk)}\n\n"
-
-        except Exception as e:
-            logger.error(f"Error in stream generation: {str(e)}")
-            error_chunk = {
-                "error": str(e),
-                "metadata": {
-                    "prompt": request.prompt,
-                    "is_finished": True
-                }
-            }
-            yield f"data: {json.dumps(error_chunk)}\n\n"
-
-    return StreamingResponse(
-        event_generator(),
-        media_type="text/event-stream",
-        headers={
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive',
-        }
-    )
+    try:
+        project_root = Path(__file__).parent.parent
+        checkpoints_dir = project_root / "checkpoints"
+        models = []
+
+        if checkpoints_dir.exists():
+            for org_dir in checkpoints_dir.iterdir():
+                if org_dir.is_dir():
+                    for model_dir in org_dir.iterdir():
+                        if model_dir.is_dir():
+                            files = [f.name for f in model_dir.iterdir()]
+                            has_safetensors = any(f.endswith('.safetensors') for f in files)
+                            has_lit_model = 'lit_model.pth' in files
+
+                            model_info = ModelResponse(
+                                name=f"{org_dir.name}/{model_dir.name}",
+                                path=str(model_dir.relative_to(checkpoints_dir)),
+                                downloaded=True,
+                                converted=has_lit_model,
+                                has_safetensors=has_safetensors,
+                                files=files
+                            )
+                            models.append(model_info)
+
+        return ModelsListResponse(models=models)
 
-@router.get("/")
-async def root():
-    """Root endpoint to verify service is running"""
-    return {
-        "status": "running",
-        "service": "LLM Engine",
-        "endpoints": {
-            "initialize": "/initialize",
-            "generate": "/generate",
-            "health": "/health"
-        }
-    }
+    except Exception as e:
+        logger.error(f"Error listing models: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error listing models: {str(e)}")
 
 @router.post("/initialize")
 async def initialize_model(request: InitializeRequest):
@@ -247,7 +230,7 @@ async def initialize_model(request: InitializeRequest):
 
     try:
         # Get the project root directory (where main.py is located)
-        project_root = Path(__file__).parent
+        project_root = Path(__file__).parent.parent
        checkpoints_dir = project_root / "checkpoints"
         logger.info(f"Checkpoint dir is: {checkpoints_dir}")
 
@@ -344,10 +327,80 @@ async def generate(request: GenerateRequest):
         logger.error(f"Error generating text: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}")
 
+@router.post("/generate/stream")
+async def generate_stream(request: StreamGenerateRequest):
+    """
+    Generate text using the initialized model with streaming response.
+    Returns a StreamingResponse that yields JSON-formatted chunks of text.
+    """
+    global llm_instance
+
+    if llm_instance is None:
+        raise HTTPException(
+            status_code=400,
+            detail="Model not initialized. Call /initialize first."
+        )
+
+    async def event_generator() -> AsyncGenerator[str, None]:
+        try:
+            # Start the generation with streaming enabled
+            for token in llm_instance.generate(
+                prompt=request.prompt,
+                max_new_tokens=request.max_new_tokens,
+                temperature=request.temperature,
+                top_k=request.top_k,
+                top_p=request.top_p,
+                stream=True  # Enable streaming
+            ):
+                # Create a JSON response for each token
+                chunk = {
+                    "token": token,
+                    "metadata": {
+                        "prompt": request.prompt,
+                        "is_finished": False
+                    }
+                }
+                # Format as SSE data
+                yield f"data: {json.dumps(chunk)}\n\n"
+
+                # Small delay to prevent overwhelming the client
+                await asyncio.sleep(0.01)
+
+            # Send final message indicating completion
+            final_chunk = {
+                "token": "",
+                "metadata": {
+                    "prompt": request.prompt,
+                    "is_finished": True
+                }
+            }
+            yield f"data: {json.dumps(final_chunk)}\n\n"
+
+        except Exception as e:
+            logger.error(f"Error in stream generation: {str(e)}")
+            error_chunk = {
+                "error": str(e),
+                "metadata": {
+                    "prompt": request.prompt,
+                    "is_finished": True
+                }
+            }
+            yield f"data: {json.dumps(error_chunk)}\n\n"
+
+    return StreamingResponse(
+        event_generator(),
+        media_type="text/event-stream",
+        headers={
+            'Cache-Control': 'no-cache',
+            'Connection': 'keep-alive',
+        }
+    )
+
 @router.get("/health")
 async def health_check():
     """
     Check if the service is running and model is loaded.
+    Returns status information including model details if loaded.
     """
     global llm_instance
 
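
Taken together, the new routes give the server a download → convert → list → initialize → stream lifecycle over plain HTTP. The following is a hedged end-to-end sketch against a local instance; the base URL, repo ID, and model name are the defaults from `client_config.yaml`, not fixed values:

```python
import json
import requests

BASE = "http://localhost:7860"  # server.url from client_config.yaml
REPO = "huihui-ai/Llama-3.2-3B-Instruct-abliterated"
NAME = "Llama-3.2-3B"

# 1. Pull the checkpoint from HuggingFace into the server's checkpoints/ directory.
requests.post(f"{BASE}/download", json={"repo_id": REPO, "model_name": NAME}).raise_for_status()

# 2. Convert the downloaded files to LitGPT format (lit_model.pth).
requests.post(f"{BASE}/convert", json={"folder_path": REPO, "model_name": NAME}).raise_for_status()

# 3. Check that the model now shows up as converted.
print(requests.get(f"{BASE}/models").json())

# 4. Load it, then stream tokens from /generate/stream (SSE lines prefixed with "data: ").
requests.post(f"{BASE}/initialize", json={"model_path": REPO, "mode": "gpu"}).raise_for_status()
with requests.post(f"{BASE}/generate/stream",
                   json={"prompt": "Hello", "max_new_tokens": 32}, stream=True) as resp:
    for line in resp.iter_lines():
        if line.startswith(b"data: "):
            chunk = json.loads(line[len(b"data: "):])
            if chunk.get("metadata", {}).get("is_finished"):
                break
            print(chunk.get("token", ""), end="", flush=True)
```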