AurelioAguirre committed
Commit a189e20
1 Parent(s): 925480a

Upload 6 files

Files changed (3)
  1. Dockerfile +7 -5
  2. main/env_template +55 -0
  3. main/main.py +31 -29
Dockerfile CHANGED
@@ -35,12 +35,14 @@ RUN mkdir -p /app/checkpoints && \
 ARG HF_TOKEN
 ENV HF_TOKEN=${HF_TOKEN}
 
-# Download the Llama 2 model using litgpt
+# Download both models using litgpt
 # Only proceed if HF_TOKEN is provided
 RUN if [ -n "$HF_TOKEN" ]; then \
-    python -c "from huggingface_hub import login; from litgpt.cli import download; login('${HF_TOKEN}'); download('meta-llama/Llama-2-3b-chat-hf', '/app/checkpoints')"; \
+    python -c "from huggingface_hub import login; from litgpt.cli import download; login('${HF_TOKEN}'); \
+    download('meta-llama/Llama-2-3b-chat-hf', '/app/checkpoints'); \
+    download('mistralai/Mistral-7B-Instruct-v0.3', '/app/checkpoints')"; \
 else \
-    echo "No Hugging Face token provided. Model will need to be downloaded separately."; \
+    echo "No Hugging Face token provided. Models will need to be downloaded separately."; \
 fi
 
 # Set environment variables
@@ -48,7 +50,7 @@ ENV LLM_ENGINE_HOST=0.0.0.0
 ENV LLM_ENGINE_PORT=8001
 
 # Update MODEL_PATH for the new model
-ENV MODEL_PATH=/app/checkpoints/meta-llama/Llama-2-3b-chat-hf
+ENV MODEL_PATH=/app/checkpoints/mistralai/Mistral-7B-Instruct-v0.3
 
 # Expose both ports:
 # 8001 for FastAPI
@@ -56,4 +58,4 @@ ENV MODEL_PATH=/app/checkpoints/meta-llama/Llama-2-3b-chat-hf
 EXPOSE 8001 7860
 
 # Command to run the application
-CMD ["python", "main/main.py"]
+CMD ["python", "main/main.py"]
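
If the image is built without HF_TOKEN, the models still have to be fetched before the server can serve requests. A minimal sketch of doing that inside the running container, mirroring the calls used in the RUN step above (the token value is a placeholder; litgpt and huggingface_hub are assumed to be installed, as in the image):

    # Sketch: manual download when HF_TOKEN was not passed at build time.
    # Mirrors the Dockerfile's RUN step; the token below is a placeholder.
    from huggingface_hub import login
    from litgpt.cli import download

    login("hf_xxx")  # your Hugging Face access token
    download("meta-llama/Llama-2-3b-chat-hf", "/app/checkpoints")
    download("mistralai/Mistral-7B-Instruct-v0.3", "/app/checkpoints")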
main/env_template ADDED
@@ -0,0 +1,55 @@
+# Service URLs Configuration
+LLM_ENGINE_URL=http://localhost:8001
+RAG_ENGINE_URL=http://localhost:8002
+
+# LLM Engine Server Configuration
+LLM_ENGINE_HOST=0.0.0.0
+LLM_ENGINE_PORT=8001
+
+# RAG Engine Server Configuration (if running locally)
+RAG_ENGINE_HOST=0.0.0.0
+RAG_ENGINE_PORT=8002
+
+# Base Paths Configuration
+BAS_MODEL_PATH=/path/to/your/model
+BAS_RESOURCES=/path/to/resources
+
+# CUDA Memory Management
+PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,garbage_collection_threshold:0.8,expandable_segments:True
+
+# Other memory-related settings
+CUDA_LAUNCH_BLOCKING=0
+CUDA_VISIBLE_DEVICES=0
+
+# Logging Configuration
+LOG_LEVEL=INFO # DEBUG, INFO, WARNING, ERROR, CRITICAL
+
+# GPU Configuration (optional)
+# CUDA_VISIBLE_DEVICES=0,1 # Specify which GPUs to use
+
+# Memory Configuration (optional)
+# MAX_GPU_MEMORY=16Gi # Maximum GPU memory to use
+# MAX_CPU_MEMORY=32Gi # Maximum CPU memory to use
+
+# Security (if needed)
+# API_KEY=your-api-key-here
+# SSL_CERT_PATH=/path/to/cert
+# SSL_KEY_PATH=/path/to/key
+
+# Development Settings
+# DEBUG=True # Enable debug mode
+# RELOAD=False # Enable auto-reload for development
+
+# Model Default Parameters (optional)
+# DEFAULT_MAX_NEW_TOKENS=50
+# DEFAULT_TEMPERATURE=1.0
+# DEFAULT_TOP_K=50
+# DEFAULT_TOP_P=1.0
+
+# Cache Settings (optional)
+# CACHE_DIR=/path/to/cache
+# MAX_CACHE_SIZE=10Gi
+
+# Monitoring (optional)
+# ENABLE_METRICS=True
+# PROMETHEUS_PORT=9090
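
The diff does not show main.py reading this template, so how the values are consumed is an assumption; one common pattern is to copy env_template to .env and load it with python-dotenv at startup, roughly:

    # Sketch (assumes python-dotenv is installed and env_template was copied to .env).
    import os
    from dotenv import load_dotenv

    load_dotenv()  # populate os.environ from .env

    host = os.getenv("LLM_ENGINE_HOST", "0.0.0.0")
    port = int(os.getenv("LLM_ENGINE_PORT", "8001"))
    log_level = os.getenv("LOG_LEVEL", "INFO")
    print(f"LLM engine will bind to {host}:{port} (log level {log_level})")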
main/main.py CHANGED
@@ -44,27 +44,29 @@ async def initialize_model(request: InitializeRequest):
     global llm_instance
 
     try:
-        if request.precision is None and request.quantize is None:
-            # Use auto distribution from load when no specific precision or quantization is set
-            llm_instance = LLM.load(
-                model=request.model_path,
-                distribute="auto"  # Let the load function handle distribution automatically
-            )
-
-            logger.info(
-                f"Model initialized with auto settings:\n"
-                f"Model Path: {request.model_path}\n"
-                f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
-                f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
-            )
+        # Get the project root directory (where main.py is located)
+        project_root = Path(__file__).parent
+        checkpoints_dir = project_root / "checkpoints"
+
+        # For LitGPT downloaded models, path includes organization
+        if "/" in request.model_path:
+            # e.g., "mistralai/Mistral-7B-Instruct-v0.3"
+            org, model_name = request.model_path.split("/")
+            model_path = str(checkpoints_dir / org / model_name)
         else:
-            # Original initialization path for when specific settings are requested
-            llm_instance = LLM.load(
-                model=request.model_path,
-                distribute=None  # We'll distribute manually
-            )
+            # Fallback for direct model paths
+            model_path = str(checkpoints_dir / request.model_path)
+
+        logger.info(f"Using model path: {model_path}")
 
-            # Distribute the model according to the configuration
+        # Load the model
+        llm_instance = LLM.load(
+            model=model_path,
+            distribute=None if request.precision or request.quantize else "auto"
+        )
+
+        # If manual distribution is needed
+        if request.precision or request.quantize:
             llm_instance.distribute(
                 accelerator="cuda" if request.mode == "gpu" else "cpu",
                 devices=request.gpu_count,
@@ -72,16 +74,16 @@
                 quantize=request.quantize
             )
 
-        logger.info(
-            f"Model initialized successfully with config:\n"
-            f"Mode: {request.mode}\n"
-            f"Precision: {request.precision}\n"
-            f"Quantize: {request.quantize}\n"
-            f"GPU Count: {request.gpu_count}\n"
-            f"Model Path: {request.model_path}\n"
-            f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
-            f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
-        )
+        logger.info(
+            f"Model initialized successfully with config:\n"
+            f"Mode: {request.mode}\n"
+            f"Precision: {request.precision}\n"
+            f"Quantize: {request.quantize}\n"
+            f"GPU Count: {request.gpu_count}\n"
+            f"Model Path: {model_path}\n"
+            f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
+            f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
+        )
 
         return {"success": True, "message": "Model initialized successfully"}
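
The new checkpoint-path logic can be checked without loading a model; a small sketch that reproduces it outside the endpoint (resolve_model_path is an illustrative helper, not part of main.py):

    # Sketch of the path resolution introduced above; resolve_model_path is illustrative only.
    from pathlib import Path

    def resolve_model_path(requested: str, base: Path = Path(__file__).parent) -> str:
        checkpoints_dir = base / "checkpoints"
        if "/" in requested:
            # e.g. "mistralai/Mistral-7B-Instruct-v0.3" -> <checkpoints>/mistralai/Mistral-7B-Instruct-v0.3
            org, model_name = requested.split("/")
            return str(checkpoints_dir / org / model_name)
        # fallback for a bare model directory name
        return str(checkpoints_dir / requested)

    print(resolve_model_path("mistralai/Mistral-7B-Instruct-v0.3"))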