AurelioAguirre committed on
Commit d828ce4
1 Parent(s): b3cf4b4

First commit

Files changed (6)
  1. dockerfile +43 -0
  2. main/__init__.py +0 -0
  3. main/api.py +0 -0
  4. main/main.py +179 -0
  5. requirements.txt +7 -0
  6. setup_project.py +48 -0
dockerfile ADDED
@@ -0,0 +1,43 @@
+ # Use NVIDIA CUDA base image
+ FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 as base
+
+ # Set working directory to /code (Hugging Face Spaces convention)
+ WORKDIR /code
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     python3.10 \
+     python3-pip \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Python packages
+ COPY requirements.txt .
+ RUN pip3 install --no-cache-dir -r requirements.txt
+
+ # Install any additional dependencies needed for litgpt
+ RUN pip3 install --no-cache-dir \
+     einops \
+     xformers \
+     bitsandbytes \
+     accelerate \
+     sentencepiece
+
+ # Copy the application code
+ COPY . .
+
+ # Create model directory structure
+ RUN mkdir -p /code/checkout/meta \
+     /code/checkout/microsoft \
+     /code/checkout/mistralai
+
+ # Set environment variables
+ ENV PYTHONPATH=/code
+ ENV LLM_ENGINE_HOST=0.0.0.0
+ ENV LLM_ENGINE_PORT=8001
+
+ # Expose the port the app runs on
+ EXPOSE 8001
+
+ # Command to run the application
+ # main.py lives in the main/ package copied into /code
+ CMD ["python3", "main/main.py"]
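
For reference, a minimal sketch of one way to build and launch this image locally, driven from Python since the repository contains no shell scripts. The image tag, the use of the NVIDIA container runtime (`--gpus all`), and the helper name `build_and_run` are assumptions, not part of this commit.

# Hypothetical build-and-run helper (not part of this commit).
# Assumes the Docker CLI and the NVIDIA container runtime are installed;
# the image tag "llm-engine" is an arbitrary choice.
import subprocess

def build_and_run(tag: str = "llm-engine") -> None:
    # Build the image from the lowercase "dockerfile" at the project root
    subprocess.run(["docker", "build", "-f", "dockerfile", "-t", tag, "."], check=True)
    # Run it with GPU access, publishing the port exposed above (8001)
    subprocess.run(
        ["docker", "run", "--rm", "--gpus", "all", "-p", "8001:8001", tag],
        check=True,
    )

if __name__ == "__main__":
    build_and_run()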
main/__init__.py ADDED
File without changes
main/api.py ADDED
File without changes
main/main.py ADDED
@@ -0,0 +1,179 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from typing import Optional, Dict, Any, Union
+ import torch
+ import logging
+ from pathlib import Path
+ from litgpt.api import LLM
+ import os
+ import uvicorn
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI(title="LLM Engine Service")
+
+ # Global variable to store the LLM instance
+ llm_instance = None
+
+ class InitializeRequest(BaseModel):
+     """
+     Configuration for model initialization including model path
+     """
+     mode: str = "cpu"
+     precision: Optional[str] = None
+     quantize: Optional[str] = None
+     gpu_count: Union[str, int] = "auto"
+     model_path: str
+
+ class GenerateRequest(BaseModel):
+     prompt: str
+     max_new_tokens: int = 50
+     temperature: float = 1.0
+     top_k: Optional[int] = None
+     top_p: float = 1.0
+     return_as_token_ids: bool = False
+     stream: bool = False
+
+ @app.post("/initialize")
+ async def initialize_model(request: InitializeRequest):
+     """
+     Initialize the LLM model with specified configuration.
+     """
+     global llm_instance
+
+     try:
+         if request.precision is None and request.quantize is None:
+             # Use auto distribution from load when no specific precision or quantization is set
+             llm_instance = LLM.load(
+                 model=request.model_path,
+                 distribute="auto"  # Let the load function handle distribution automatically
+             )
+
+             logger.info(
+                 f"Model initialized with auto settings:\n"
+                 f"Model Path: {request.model_path}\n"
+                 f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
+                 f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
+             )
+         else:
+             # Original initialization path for when specific settings are requested
+             llm_instance = LLM.load(
+                 model=request.model_path,
+                 distribute=None  # We'll distribute manually
+             )
+
+             # Distribute the model according to the configuration
+             llm_instance.distribute(
+                 accelerator="cuda" if request.mode == "gpu" else "cpu",
+                 devices=request.gpu_count,
+                 precision=request.precision,
+                 quantize=request.quantize
+             )
+
+             logger.info(
+                 f"Model initialized successfully with config:\n"
+                 f"Mode: {request.mode}\n"
+                 f"Precision: {request.precision}\n"
+                 f"Quantize: {request.quantize}\n"
+                 f"GPU Count: {request.gpu_count}\n"
+                 f"Model Path: {request.model_path}\n"
+                 f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
+                 f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
+             )
+
+         return {"success": True, "message": "Model initialized successfully"}
+
+     except Exception as e:
+         logger.error(f"Error initializing model: {str(e)}")
+         # Print detailed memory statistics on failure
+         logger.error(f"GPU Memory Stats:\n"
+                      f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB\n"
+                      f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB\n"
+                      f"Max Allocated: {torch.cuda.max_memory_allocated()/1024**3:.2f}GB")
+         raise HTTPException(status_code=500, detail=f"Error initializing model: {str(e)}")
+
+ @app.post("/generate")
+ async def generate(request: GenerateRequest):
+     """
+     Generate text using the initialized model.
+     """
+     global llm_instance
+
+     if llm_instance is None:
+         raise HTTPException(status_code=400, detail="Model not initialized. Call /initialize first.")
+
+     try:
+         if request.stream:
+             # For streaming responses, we need to handle differently
+             # This is a placeholder as the actual streaming implementation
+             # would need to use StreamingResponse from FastAPI
+             raise HTTPException(
+                 status_code=400,
+                 detail="Streaming is not currently supported through the API"
+             )
+
+         generated_text = llm_instance.generate(
+             prompt=request.prompt,
+             max_new_tokens=request.max_new_tokens,
+             temperature=request.temperature,
+             top_k=request.top_k,
+             top_p=request.top_p,
+             return_as_token_ids=request.return_as_token_ids,
+             stream=False  # Force stream to False for now
+         )
+
+         response = {
+             "generated_text": generated_text if not request.return_as_token_ids else generated_text.tolist(),
+             "metadata": {
+                 "prompt": request.prompt,
+                 "max_new_tokens": request.max_new_tokens,
+                 "temperature": request.temperature,
+                 "top_k": request.top_k,
+                 "top_p": request.top_p
+             }
+         }
+
+         return response
+
+     except Exception as e:
+         logger.error(f"Error generating text: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}")
+
+ @app.get("/health")
+ async def health_check():
+     """
+     Check if the service is running and model is loaded.
+     """
+     global llm_instance
+
+     status = {
+         "status": "healthy",
+         "model_loaded": llm_instance is not None,
+     }
+
+     if llm_instance is not None:
+         status["model_info"] = {
+             "model_path": llm_instance.config.name,
+             "device": str(next(llm_instance.model.parameters()).device)
+         }
+
+     return status
+
+ def main():
+     # Load environment variables or configuration here
+     host = os.getenv("LLM_ENGINE_HOST", "0.0.0.0")
+     port = int(os.getenv("LLM_ENGINE_PORT", "8001"))
+
+     # Start the server
+     uvicorn.run(
+         app,
+         host=host,
+         port=port,
+         log_level="info",
+         reload=False
+     )
+
+ if __name__ == "__main__":
+     main()
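
For context, a minimal client sketch showing how the endpoints above might be exercised once the service is running. The base URL, the `requests` dependency, and the example checkpoint path under `checkout/` are assumptions rather than part of this commit.

# Hypothetical client for the endpoints defined above (not part of this commit).
# Assumes the service is reachable on localhost:8001 and that a litgpt
# checkpoint exists at the example path below.
import requests

BASE_URL = "http://localhost:8001"

# Load a model on GPU via the auto-distribution path (no precision/quantize set)
init_resp = requests.post(
    f"{BASE_URL}/initialize",
    json={"mode": "gpu", "model_path": "checkout/microsoft/phi-2"},  # example path
    timeout=600,
)
init_resp.raise_for_status()

# Confirm the model is loaded
print(requests.get(f"{BASE_URL}/health", timeout=10).json())

# Generate a short completion
gen_resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Hello, world!", "max_new_tokens": 50, "temperature": 0.8},
    timeout=600,
)
print(gen_resp.json()["generated_text"])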
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi==0.109.0
+ uvicorn==0.27.0
+ pydantic==2.5.3
+ torch==2.5.0
+ transformers==4.36.2
+ litgpt[all]
+ python-dotenv==1.0.0
setup_project.py ADDED
@@ -0,0 +1,48 @@
+ import os
+ import subprocess
+ import sys
+ import venv
+ from pathlib import Path
+
+ def setup_project():
+     # Ensure we're in the right directory
+     project_dir = Path(__file__).parent.absolute()
+     os.chdir(project_dir)
+
+     print("Setting up the project...")
+
+     # Create virtual environment if it doesn't exist
+     venv_dir = project_dir / "myenv"
+     if not venv_dir.exists():
+         print("Creating virtual environment...")
+         venv.create(venv_dir, with_pip=True)
+
+     # Determine the path to the Python executable in the virtual environment
+     if sys.platform == "win32":
+         python_executable = venv_dir / "Scripts" / "python.exe"
+         pip_executable = venv_dir / "Scripts" / "pip.exe"
+     else:
+         python_executable = venv_dir / "bin" / "python"
+         pip_executable = venv_dir / "bin" / "pip"
+
+     # Upgrade pip
+     print("Upgrading pip...")
+     subprocess.run([str(python_executable), "-m", "pip", "install", "--upgrade", "pip"])
+
+     # Install requirements
+     print("Installing requirements...")
+     requirements_file = project_dir / "requirements.txt"
+     if requirements_file.exists():
+         subprocess.run([str(pip_executable), "install", "-r", "requirements.txt"])
+     else:
+         print("Warning: requirements.txt not found!")
+
+     print("\nSetup completed successfully!")
+     print("\nTo activate the virtual environment:")
+     if sys.platform == "win32":
+         print(f"  {venv_dir}\\Scripts\\activate")
+     else:
+         print(f"  source {venv_dir}/bin/activate")
+
+ if __name__ == "__main__":
+     setup_project()