---
# Compose stack for the catdog experiment: train -> eval / server -> client.
#
# Ordering between services is coordinated through a flag file,
# /app/checkpoints/train_done.flag, which `train` creates after it finishes
# and which `eval` and `server` poll for. The checkpoints directory is a host
# bind mount, so the flag persists between runs unless it is removed.

services:
  # Runs the training entry point, then src.create_artifacts, then writes the
  # completion flag that eval and server wait on.
  train:
    build:
      context: .
    # The script must go through a shell: Compose does not interpret `&&`
    # or multiple commands in a plain string command.
    command:
      - sh
      - -c
      - |
        set -e
        # Drop a flag left behind by an earlier run so that eval/server do
        # not start against checkpoints from that earlier run.
        rm -f /app/checkpoints/train_done.flag
        python -m src.train_optuna_callbacks \
          experiment=catdog_experiment \
          ++task_name=train ++train=True ++test=False
        python -m src.create_artifacts
        # Absolute path: this is the exact file eval and server poll for.
        touch /app/checkpoints/train_done.flag
    volumes:
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./artifacts:/app/artifacts
      - ./logs:/app/logs
    environment:
      - PYTHONUNBUFFERED=1
      - PYTHONPATH=/app
    shm_size: '4g'
    networks:
      - default
    env_file:
      - .env
    # Reserve one NVIDIA GPU for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

  # Waits for the training flag, then runs the same entry point in test mode.
  eval:
    build:
      context: .
    command:
      - sh
      - -c
      - |
        while [ ! -f /app/checkpoints/train_done.flag ]; do
          sleep 10
        done && \
        python -m src.train_optuna_callbacks \
          experiment=catdog_experiment \
          ++task_name=test ++train=False ++test=True
    volumes:
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./artifacts:/app/artifacts
      - ./logs:/app/logs
    environment:
      - PYTHONUNBUFFERED=1
      - PYTHONPATH=/app
    shm_size: '4g'
    networks:
      - default
    env_file:
      - .env
    # Reserve one NVIDIA GPU for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

  # Waits for the training flag, then starts src.server; container port 8080
  # is published on host port 8080.
  server:
    build:
      context: .
    command:
      - sh
      - -c
      - |
        while [ ! -f /app/checkpoints/train_done.flag ]; do
          sleep 10
        done && \
        python -m src.server
    volumes:
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./artifacts:/app/artifacts
      - ./logs:/app/logs
    environment:
      - PYTHONUNBUFFERED=1
      - PYTHONPATH=/app
      - SERVER_URL=http://localhost:8080
    shm_size: '4g'
    networks:
      - default
    env_file:
      - .env
    ports:
      - "8080:8080"
    # Reserve one NVIDIA GPU for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

  # Polls the server's /health URL every 5 seconds until curl succeeds, then
  # runs ./run_client.sh.
  client:
    build:
      context: .
    # NOTE(review): `curl -s` succeeds on any HTTP response, including error
    # statuses; consider `-f` once /health is confirmed to return 2xx.
    command:
      - sh
      - -c
      - |
        until curl -s http://server:8080/health; do
          echo "Waiting for server to be ready..."
          sleep 5
        done && \
        ./run_client.sh
    volumes:
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./artifacts:/app/artifacts
      - ./logs:/app/logs
    environment:
      - PYTHONUNBUFFERED=1
      - PYTHONPATH=/app
      - SERVER_URL=http://server:8080
    shm_size: '4g'
    networks:
      - default
    env_file:
      - .env
    # Reserve one NVIDIA GPU for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

# Named volumes. None of the services above reference these: they all
# bind-mount host directories (./data, ./checkpoints, ...) instead.
volumes:
  data: {}
  checkpoints: {}
  artifacts: {}
  logs: {}

networks:
  default: {}