File size: 2,813 Bytes
a3f0b6e
7c37fc5
 
 
 
3fa4d71
53f077b
 
a3f0b6e
7c37fc5
 
 
 
c6e88ba
7c37fc5
 
beb5662
36ed17a
7c37fc5
 
 
035df3d
 
 
 
 
 
 
1b0bd15
7c37fc5
240148f
 
7c37fc5
53f077b
7c37fc5
 
 
 
 
240148f
7c37fc5
 
 
36ed17a
7c37fc5
 
beb5662
035df3d
 
 
 
 
 
 
1b0bd15
7c37fc5
1b0bd15
53f077b
 
 
1b0bd15
53f077b
 
 
 
 
 
 
 
1b0bd15
53f077b
 
 
 
1b0bd15
 
 
035df3d
 
 
 
 
 
 
1b0bd15
 
 
 
 
24e4bf5
 
1b0bd15
 
 
 
 
 
 
 
 
 
 
 
 
 
240148f
035df3d
 
 
 
 
 
 
 
1b0bd15
a3f0b6e
7c37fc5
 
 
 
a3f0b6e
 
7c37fc5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
services:
  # Training job: runs src.train_optuna_callbacks with training enabled, then
  # src.create_artifacts, then writes the flag file that the eval and server
  # services in this file poll for before starting.
  train:
    build:
      context: .
    # Compose does not pass `command` through a shell, so a `&&` chain has to be
    # handed to `sh -c` explicitly (exec form: one list item per argv entry).
    command:
      - sh
      - '-c'
      - |
        rm -f /app/checkpoints/train_done.flag &&
        python -m src.train_optuna_callbacks experiment=catdog_experiment ++task_name=train ++train=True ++test=False &&
        python -m src.create_artifacts &&
        touch /app/checkpoints/train_done.flag
    # Notes on the script above:
    #  - `rm -f` clears a flag left behind by an earlier run; ./checkpoints is a
    #    host bind mount, so the file would otherwise persist. A waiter that
    #    checks before this line runs can still see the stale flag.
    #  - The flag uses the absolute container path, the same one the waiting
    #    services test, instead of a path relative to the working directory.
    volumes:
      # Host directories bind-mounted into the container under /app.
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./artifacts:/app/artifacts
      - ./logs:/app/logs
    environment:
      - PYTHONUNBUFFERED=1
      - PYTHONPATH=/app
    shm_size: '4g'
    networks:
      - default
    env_file:
      - .env
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities: [gpu]

  # Evaluation job: waits for /app/checkpoints/train_done.flag to appear
  # (checking every 10 seconds), then runs src.train_optuna_callbacks with
  # training disabled and testing enabled.
  eval:
    build:
      context: .
    env_file:
      - .env
    environment:
      PYTHONUNBUFFERED: '1'
      PYTHONPATH: '/app'
    # Exec form: the folded scalar is joined into one line and given to `sh -c`.
    command:
      - sh
      - '-c'
      - >-
        while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done &&
        python -m src.train_optuna_callbacks experiment=catdog_experiment
        ++task_name=test ++train=False ++test=True
    volumes:
      # Host directories bind-mounted into the container under /app.
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./artifacts:/app/artifacts
      - ./logs:/app/logs
    shm_size: "4g"
    networks:
      - default
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities:
                - gpu
  

  # Inference server: waits for /app/checkpoints/train_done.flag to appear
  # (checking every 10 seconds), then starts src.server. Container port 8080 is
  # published on host port 8080.
  server:
    build:
      context: .
    env_file:
      - .env
    environment:
      PYTHONUNBUFFERED: '1'
      PYTHONPATH: '/app'
      SERVER_URL: 'http://localhost:8080'
    # Exec form: the folded scalar is joined into one line and given to `sh -c`.
    command:
      - sh
      - '-c'
      - >-
        while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done &&
        python -m src.server
    ports:
      - '8080:8080'
    volumes:
      # Host directories bind-mounted into the container under /app.
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./artifacts:/app/artifacts
      - ./logs:/app/logs
    shm_size: "4g"
    networks:
      - default
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities:
                - gpu
      
  # Client job: polls the server's /health endpoint every 5 seconds until it
  # answers successfully, then runs ./run_client.sh.
  client:
    build:
      context: .
    # Exec form with an explicit shell. `curl -f` makes HTTP error responses
    # (status >= 400) count as "not ready"; with plain `-s`, any HTTP response,
    # including an error page, would end the wait loop.
    command:
      - sh
      - '-c'
      - |
        until curl -sf http://server:8080/health; do
          echo "Waiting for server to be ready..."
          sleep 5
        done &&
        ./run_client.sh
    volumes:
      # Host directories bind-mounted into the container under /app.
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./artifacts:/app/artifacts
      - ./logs:/app/logs
    environment:
      - PYTHONUNBUFFERED=1
      - PYTHONPATH=/app
      # Uses the compose service name `server` as the host name.
      - SERVER_URL=http://server:8080
    shm_size: '4g'
    networks:
      - default
    env_file:
      - .env
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities: [gpu]

  
# Named volumes, each declared with default settings.
# NOTE(review): the services visible in this file mount host paths (./data,
# ./checkpoints, ./artifacts, ./logs) rather than these named volumes; confirm
# whether these declarations are still needed.
volumes:
  data: {}
  checkpoints: {}
  artifacts: {}
  logs: {}

# Explicit declaration of the `default` network with no custom options; every
# service in this file attaches to it.
networks:
  default: {}