Spaces:
Paused
Paused
Upload 6 files
#2
by
AurelioAguirre
- opened
- Dockerfile +7 -5
- main/env_template +55 -0
- main/main.py +31 -29
Dockerfile
CHANGED
@@ -35,12 +35,14 @@ RUN mkdir -p /app/checkpoints && \
|
|
35 |
ARG HF_TOKEN
|
36 |
ENV HF_TOKEN=${HF_TOKEN}
|
37 |
|
38 |
-
# Download
|
39 |
# Only proceed if HF_TOKEN is provided
|
40 |
RUN if [ -n "$HF_TOKEN" ]; then \
|
41 |
-
python -c "from huggingface_hub import login; from litgpt.cli import download; login('${HF_TOKEN}');
|
|
|
|
|
42 |
else \
|
43 |
-
echo "No Hugging Face token provided.
|
44 |
fi
|
45 |
|
46 |
# Set environment variables
|
@@ -48,7 +50,7 @@ ENV LLM_ENGINE_HOST=0.0.0.0
|
|
48 |
ENV LLM_ENGINE_PORT=8001
|
49 |
|
50 |
# Update MODEL_PATH for the new model
|
51 |
-
ENV MODEL_PATH=/app/checkpoints/
|
52 |
|
53 |
# Expose both ports:
|
54 |
# 8001 for FastAPI
|
@@ -56,4 +58,4 @@ ENV MODEL_PATH=/app/checkpoints/meta-llama/Llama-2-3b-chat-hf
|
|
56 |
EXPOSE 8001 7860
|
57 |
|
58 |
# Command to run the application
|
59 |
-
CMD ["python", "main/main.py"]
|
|
|
35 |
ARG HF_TOKEN
|
36 |
ENV HF_TOKEN=${HF_TOKEN}
|
37 |
|
38 |
+
# Download both models using litgpt
|
39 |
# Only proceed if HF_TOKEN is provided
|
40 |
RUN if [ -n "$HF_TOKEN" ]; then \
|
41 |
+
python -c "from huggingface_hub import login; from litgpt.cli import download; login('${HF_TOKEN}'); \
|
42 |
+
download('meta-llama/Llama-2-3b-chat-hf', '/app/checkpoints'); \
|
43 |
+
download('mistralai/Mistral-7B-Instruct-v0.3', '/app/checkpoints')"; \
|
44 |
else \
|
45 |
+
echo "No Hugging Face token provided. Models will need to be downloaded separately."; \
|
46 |
fi
|
47 |
|
48 |
# Set environment variables
|
|
|
50 |
ENV LLM_ENGINE_PORT=8001
|
51 |
|
52 |
# Update MODEL_PATH for the new model
|
53 |
+
ENV MODEL_PATH=/app/checkpoints/mistralai/Mistral-7B-Instruct-v0.3
|
54 |
|
55 |
# Expose both ports:
|
56 |
# 8001 for FastAPI
|
|
|
58 |
EXPOSE 8001 7860
|
59 |
|
60 |
# Command to run the application
|
61 |
+
CMD ["python", "main/main.py"]
|
main/env_template
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Service URLs Configuration
|
2 |
+
LLM_ENGINE_URL=http://localhost:8001
|
3 |
+
RAG_ENGINE_URL=http://localhost:8002
|
4 |
+
|
5 |
+
# LLM Engine Server Configuration
|
6 |
+
LLM_ENGINE_HOST=0.0.0.0
|
7 |
+
LLM_ENGINE_PORT=8001
|
8 |
+
|
9 |
+
# RAG Engine Server Configuration (if running locally)
|
10 |
+
RAG_ENGINE_HOST=0.0.0.0
|
11 |
+
RAG_ENGINE_PORT=8002
|
12 |
+
|
13 |
+
# Base Paths Configuration
|
14 |
+
BAS_MODEL_PATH=/path/to/your/model
|
15 |
+
BAS_RESOURCES=/path/to/resources
|
16 |
+
|
17 |
+
# CUDA Memory Management
|
18 |
+
PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,garbage_collection_threshold:0.8,expandable_segments:True
|
19 |
+
|
20 |
+
# Other CUDA runtime settings (kernel launch blocking, visible devices)
|
21 |
+
CUDA_LAUNCH_BLOCKING=0
|
22 |
+
CUDA_VISIBLE_DEVICES=0
|
23 |
+
|
24 |
+
# Logging Configuration
|
25 |
+
# Allowed values: DEBUG, INFO, WARNING, ERROR, CRITICAL
LOG_LEVEL=INFO
|
26 |
+
|
27 |
+
# GPU Configuration (optional)
|
28 |
+
# CUDA_VISIBLE_DEVICES=0,1 # Specify which GPUs to use
|
29 |
+
|
30 |
+
# Memory Configuration (optional)
|
31 |
+
# MAX_GPU_MEMORY=16Gi # Maximum GPU memory to use
|
32 |
+
# MAX_CPU_MEMORY=32Gi # Maximum CPU memory to use
|
33 |
+
|
34 |
+
# Security (if needed)
|
35 |
+
# API_KEY=your-api-key-here
|
36 |
+
# SSL_CERT_PATH=/path/to/cert
|
37 |
+
# SSL_KEY_PATH=/path/to/key
|
38 |
+
|
39 |
+
# Development Settings
|
40 |
+
# DEBUG=True # Enable debug mode
|
41 |
+
# RELOAD=False # Enable auto-reload for development
|
42 |
+
|
43 |
+
# Model Default Parameters (optional)
|
44 |
+
# DEFAULT_MAX_NEW_TOKENS=50
|
45 |
+
# DEFAULT_TEMPERATURE=1.0
|
46 |
+
# DEFAULT_TOP_K=50
|
47 |
+
# DEFAULT_TOP_P=1.0
|
48 |
+
|
49 |
+
# Cache Settings (optional)
|
50 |
+
# CACHE_DIR=/path/to/cache
|
51 |
+
# MAX_CACHE_SIZE=10Gi
|
52 |
+
|
53 |
+
# Monitoring (optional)
|
54 |
+
# ENABLE_METRICS=True
|
55 |
+
# PROMETHEUS_PORT=9090
|
main/main.py
CHANGED
@@ -44,27 +44,29 @@ async def initialize_model(request: InitializeRequest):
|
|
44 |
global llm_instance
|
45 |
|
46 |
try:
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
f"Model Path: {request.model_path}\n"
|
57 |
-
f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
|
58 |
-
f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
|
59 |
-
)
|
60 |
else:
|
61 |
-
#
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
)
|
66 |
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
llm_instance.distribute(
|
69 |
accelerator="cuda" if request.mode == "gpu" else "cpu",
|
70 |
devices=request.gpu_count,
|
@@ -72,16 +74,16 @@ async def initialize_model(request: InitializeRequest):
|
|
72 |
quantize=request.quantize
|
73 |
)
|
74 |
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
|
86 |
return {"success": True, "message": "Model initialized successfully"}
|
87 |
|
|
|
44 |
global llm_instance
|
45 |
|
46 |
try:
|
47 |
+
# Resolve the directory containing main.py (the main/ folder, not the repository root)
|
48 |
+
project_root = Path(__file__).parent
|
49 |
+
checkpoints_dir = project_root / "checkpoints"
|
50 |
+
|
51 |
+
# For LitGPT downloaded models, path includes organization
|
52 |
+
if "/" in request.model_path:
|
53 |
+
# e.g., "mistralai/Mistral-7B-Instruct-v0.3"
|
54 |
+
org, model_name = request.model_path.split("/")
|
55 |
+
model_path = str(checkpoints_dir / org / model_name)
|
|
|
|
|
|
|
|
|
56 |
else:
|
57 |
+
# Fallback for direct model paths
|
58 |
+
model_path = str(checkpoints_dir / request.model_path)
|
59 |
+
|
60 |
+
logger.info(f"Using model path: {model_path}")
|
|
|
61 |
|
62 |
+
# Load the model
|
63 |
+
llm_instance = LLM.load(
|
64 |
+
model=model_path,
|
65 |
+
distribute=None if request.precision or request.quantize else "auto"
|
66 |
+
)
|
67 |
+
|
68 |
+
# Precision or quantization was requested, so auto-distribution was skipped at load time; distribute manually
|
69 |
+
if request.precision or request.quantize:
|
70 |
llm_instance.distribute(
|
71 |
accelerator="cuda" if request.mode == "gpu" else "cpu",
|
72 |
devices=request.gpu_count,
|
|
|
74 |
quantize=request.quantize
|
75 |
)
|
76 |
|
77 |
+
logger.info(
|
78 |
+
f"Model initialized successfully with config:\n"
|
79 |
+
f"Mode: {request.mode}\n"
|
80 |
+
f"Precision: {request.precision}\n"
|
81 |
+
f"Quantize: {request.quantize}\n"
|
82 |
+
f"GPU Count: {request.gpu_count}\n"
|
83 |
+
f"Model Path: {model_path}\n"
|
84 |
+
f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
|
85 |
+
f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
|
86 |
+
)
|
87 |
|
88 |
return {"success": True, "message": "Model initialized successfully"}
|
89 |
|