Files changed (3) hide show
  1. Dockerfile +7 -5
  2. main/env_template +55 -0
  3. main/main.py +31 -29
Dockerfile CHANGED
@@ -35,12 +35,14 @@ RUN mkdir -p /app/checkpoints && \
35
  ARG HF_TOKEN
36
  ENV HF_TOKEN=${HF_TOKEN}
37
 
38
- # Download the Llama 2 model using litgpt
39
  # Only proceed if HF_TOKEN is provided
40
  RUN if [ -n "$HF_TOKEN" ]; then \
41
- python -c "from huggingface_hub import login; from litgpt.cli import download; login('${HF_TOKEN}'); download('meta-llama/Llama-2-3b-chat-hf', '/app/checkpoints')"; \
 
 
42
  else \
43
- echo "No Hugging Face token provided. Model will need to be downloaded separately."; \
44
  fi
45
 
46
  # Set environment variables
@@ -48,7 +50,7 @@ ENV LLM_ENGINE_HOST=0.0.0.0
48
  ENV LLM_ENGINE_PORT=8001
49
 
50
  # Update MODEL_PATH for the new model
51
- ENV MODEL_PATH=/app/checkpoints/meta-llama/Llama-2-3b-chat-hf
52
 
53
  # Expose both ports:
54
  # 8001 for FastAPI
@@ -56,4 +58,4 @@ ENV MODEL_PATH=/app/checkpoints/meta-llama/Llama-2-3b-chat-hf
56
  EXPOSE 8001 7860
57
 
58
  # Command to run the application
59
- CMD ["python", "main/main.py"]
 
35
  ARG HF_TOKEN
36
  ENV HF_TOKEN=${HF_TOKEN}
37
 
38
+ # Download both models using litgpt
39
  # Only proceed if HF_TOKEN is provided
40
  RUN if [ -n "$HF_TOKEN" ]; then \
41
+ python -c "from huggingface_hub import login; from litgpt.cli import download; login('${HF_TOKEN}'); \
42
+ download('meta-llama/Llama-2-3b-chat-hf', '/app/checkpoints'); \
43
+ download('mistralai/Mistral-7B-Instruct-v0.3', '/app/checkpoints')"; \
44
  else \
45
+ echo "No Hugging Face token provided. Models will need to be downloaded separately."; \
46
  fi
47
 
48
  # Set environment variables
 
50
  ENV LLM_ENGINE_PORT=8001
51
 
52
  # Update MODEL_PATH for the new model
53
+ ENV MODEL_PATH=/app/checkpoints/mistralai/Mistral-7B-Instruct-v0.3
54
 
55
  # Expose both ports:
56
  # 8001 for FastAPI
 
58
  EXPOSE 8001 7860
59
 
60
  # Command to run the application
61
+ CMD ["python", "main/main.py"]
main/env_template ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Service URLs Configuration
2
+ LLM_ENGINE_URL=http://localhost:8001
3
+ RAG_ENGINE_URL=http://localhost:8002
4
+
5
+ # LLM Engine Server Configuration
6
+ LLM_ENGINE_HOST=0.0.0.0
7
+ LLM_ENGINE_PORT=8001
8
+
9
+ # RAG Engine Server Configuration (if running locally)
10
+ RAG_ENGINE_HOST=0.0.0.0
11
+ RAG_ENGINE_PORT=8002
12
+
13
+ # Base Paths Configuration
14
+ BAS_MODEL_PATH=/path/to/your/model
15
+ BAS_RESOURCES=/path/to/resources
16
+
17
+ # CUDA Memory Management
18
+ PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,garbage_collection_threshold:0.8,expandable_segments:True
19
+
20
+ # Other CUDA runtime settings (kernel launch mode and visible devices)
21
+ CUDA_LAUNCH_BLOCKING=0
22
+ CUDA_VISIBLE_DEVICES=0
23
+
24
+ # Logging Configuration
25
+ LOG_LEVEL=INFO # DEBUG, INFO, WARNING, ERROR, CRITICAL
26
+
27
+ # GPU Configuration (optional)
28
+ # CUDA_VISIBLE_DEVICES=0,1 # Specify which GPUs to use
29
+
30
+ # Memory Configuration (optional)
31
+ # MAX_GPU_MEMORY=16Gi # Maximum GPU memory to use
32
+ # MAX_CPU_MEMORY=32Gi # Maximum CPU memory to use
33
+
34
+ # Security (if needed)
35
+ # API_KEY=your-api-key-here
36
+ # SSL_CERT_PATH=/path/to/cert
37
+ # SSL_KEY_PATH=/path/to/key
38
+
39
+ # Development Settings
40
+ # DEBUG=True # Enable debug mode
41
+ # RELOAD=False # Enable auto-reload for development
42
+
43
+ # Model Default Parameters (optional)
44
+ # DEFAULT_MAX_NEW_TOKENS=50
45
+ # DEFAULT_TEMPERATURE=1.0
46
+ # DEFAULT_TOP_K=50
47
+ # DEFAULT_TOP_P=1.0
48
+
49
+ # Cache Settings (optional)
50
+ # CACHE_DIR=/path/to/cache
51
+ # MAX_CACHE_SIZE=10Gi
52
+
53
+ # Monitoring (optional)
54
+ # ENABLE_METRICS=True
55
+ # PROMETHEUS_PORT=9090
main/main.py CHANGED
@@ -44,27 +44,29 @@ async def initialize_model(request: InitializeRequest):
44
  global llm_instance
45
 
46
  try:
47
- if request.precision is None and request.quantize is None:
48
- # Use auto distribution from load when no specific precision or quantization is set
49
- llm_instance = LLM.load(
50
- model=request.model_path,
51
- distribute="auto" # Let the load function handle distribution automatically
52
- )
53
-
54
- logger.info(
55
- f"Model initialized with auto settings:\n"
56
- f"Model Path: {request.model_path}\n"
57
- f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
58
- f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
59
- )
60
  else:
61
- # Original initialization path for when specific settings are requested
62
- llm_instance = LLM.load(
63
- model=request.model_path,
64
- distribute=None # We'll distribute manually
65
- )
66
 
67
- # Distribute the model according to the configuration
 
 
 
 
 
 
 
68
  llm_instance.distribute(
69
  accelerator="cuda" if request.mode == "gpu" else "cpu",
70
  devices=request.gpu_count,
@@ -72,16 +74,16 @@ async def initialize_model(request: InitializeRequest):
72
  quantize=request.quantize
73
  )
74
 
75
- logger.info(
76
- f"Model initialized successfully with config:\n"
77
- f"Mode: {request.mode}\n"
78
- f"Precision: {request.precision}\n"
79
- f"Quantize: {request.quantize}\n"
80
- f"GPU Count: {request.gpu_count}\n"
81
- f"Model Path: {request.model_path}\n"
82
- f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
83
- f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
84
- )
85
 
86
  return {"success": True, "message": "Model initialized successfully"}
87
 
 
44
  global llm_instance
45
 
46
  try:
47
+ # Directory containing main.py (i.e. main/, not the repository root); checkpoints are resolved under it
48
+ project_root = Path(__file__).parent
49
+ checkpoints_dir = project_root / "checkpoints"
50
+
51
+ # For LitGPT downloaded models, path includes organization
52
+ if "/" in request.model_path:
53
+ # e.g., "mistralai/Mistral-7B-Instruct-v0.3"
54
+ org, model_name = request.model_path.split("/")
55
+ model_path = str(checkpoints_dir / org / model_name)
 
 
 
 
56
  else:
57
+ # Fallback: no organization prefix, so resolve the name directly under checkpoints_dir
58
+ model_path = str(checkpoints_dir / request.model_path)
59
+
60
+ logger.info(f"Using model path: {model_path}")
 
61
 
62
+ # Load the model
63
+ llm_instance = LLM.load(
64
+ model=model_path,
65
+ distribute=None if request.precision or request.quantize else "auto"
66
+ )
67
+
68
+ # If manual distribution is needed
69
+ if request.precision or request.quantize:
70
  llm_instance.distribute(
71
  accelerator="cuda" if request.mode == "gpu" else "cpu",
72
  devices=request.gpu_count,
 
74
  quantize=request.quantize
75
  )
76
 
77
+ logger.info(
78
+ f"Model initialized successfully with config:\n"
79
+ f"Mode: {request.mode}\n"
80
+ f"Precision: {request.precision}\n"
81
+ f"Quantize: {request.quantize}\n"
82
+ f"GPU Count: {request.gpu_count}\n"
83
+ f"Model Path: {model_path}\n"
84
+ f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
85
+ f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
86
+ )
87
 
88
  return {"success": True, "message": "Model initialized successfully"}
89