Sergidev committed on
Commit da440bd · verified · 1 Parent(s): 8db0b4f

Update Dockerfile

Files changed (1)
  1. Dockerfile +70 -9
Dockerfile CHANGED
@@ -3,6 +3,7 @@ FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
     git \
+    git-lfs \
     python3.10 \
     python3-pip \
     python-is-python3 \
@@ -14,6 +15,10 @@ RUN apt-get update && apt-get install -y \
 
 WORKDIR /app
 
+# Create a non-root user
+RUN useradd -m -u 1000 user && \
+    chown -R user:user /app
+
 # Install basic Python packages first
 RUN pip3 install --no-cache-dir \
     packaging \
@@ -59,8 +64,15 @@ RUN git clone -b self-lengthen https://github.com/quanshr/FastChat.git && \
 # Install LLaMA Factory
 RUN pip3 install --no-cache-dir llamafactory
 
-# Create directories for models and results
-RUN mkdir -p models results
+# Create directories and set permissions
+RUN mkdir -p models results && \
+    chown -R user:user /app
+
+# Switch to non-root user
+USER user
+
+# Initialize git-lfs
+RUN git lfs install
 
 # Set environment variables
 ENV CUDA_VISIBLE_DEVICES=0
@@ -74,27 +86,76 @@ ENV MAX_ITER=3
 
 # Create startup script
 RUN echo '#!/bin/bash\n\
+\n\
+# Function to wait for service\n\
+wait_for_service() {\n\
+    local host="$1"\n\
+    local port="$2"\n\
+    local retries=30\n\
+    while ! nc -z "$host" "$port" > /dev/null 2>&1; do\n\
+        retries=$((retries-1))\n\
+        if [ "$retries" -eq 0 ]; then\n\
+            echo "Service $host:$port is not available after maximum retries"\n\
+            exit 1\n\
+        fi\n\
+        echo "Waiting for service $host:$port..."\n\
+        sleep 2\n\
+    done\n\
+}\n\
+\n\
 # Download model if needed\n\
 if [ ! -d "$MODEL_PATH" ]; then\n\
-    mkdir -p $MODEL_PATH\n\
-    git lfs install\n\
-    git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct $MODEL_PATH\n\
+    echo "Downloading model..."\n\
+    mkdir -p "$MODEL_PATH"\n\
+    git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct "$MODEL_PATH"\n\
 fi\n\
 \n\
+# Start FastChat services\n\
+python -m fastchat.serve.controller \
+    --host 0.0.0.0 \
+    --port 21001 > controller.log 2>&1 &\n\
+\n\
+# Wait for controller\n\
+wait_for_service localhost 21001\n\
+\n\
+python -m fastchat.serve.openai_api_server \
+    --controller-address http://localhost:21001 \
+    --host 0.0.0.0 \
+    --port 8000 > api_server.log 2>&1 &\n\
+\n\
+# Wait for API server\n\
+wait_for_service localhost 8000\n\
+\n\
+# Start model worker\n\
+python -m fastchat.serve.vllm_worker \
+    --model-names Qwen/Qwen2-7B-Instruct \
+    --model-path "$MODEL_PATH" \
+    --controller-address http://localhost:21001 \
+    --host localhost \
+    --port 8080 \
+    --worker-address http://localhost:8080 > worker.log 2>&1 &\n\
+\n\
+# Wait for model worker\n\
+wait_for_service localhost 8080\n\
+\n\
 # Run the training process\n\
 cd /app/qwen\n\
-bash run.sh --base_model=$MODEL_PATH --instruct_count=$INSTRUCT_COUNT --max_iter=$MAX_ITER\n\
-python collect_data.py\n\
+bash run.sh --base_model="$MODEL_PATH" --instruct_count="$INSTRUCT_COUNT" --max_iter="$MAX_ITER"\n\
 \n\
 # Start the web interface\n\
 python app.py\n' > /app/start.sh && \
     chmod +x /app/start.sh
 
+# Install netcat for service checking
+USER root
+RUN apt-get update && apt-get install -y netcat-openbsd && rm -rf /var/lib/apt/lists/*
+USER user
+
 # Create a simple web interface
-COPY app.py .
+COPY --chown=user:user app.py .
 
 # Expose port for web interface
-EXPOSE 7860
+EXPOSE 7860 8000 21001 8080
 
 # Command to run
 ENTRYPOINT ["/app/start.sh"]
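
For local testing, a minimal build-and-run sketch follows. The image tag (self-lengthen) and the host port mappings are illustrative assumptions, not part of the commit, and a GPU host with the NVIDIA Container Toolkit is presumed.

    # Build the image from the directory containing this Dockerfile and app.py
    docker build -t self-lengthen .

    # Run with GPU access; 7860 serves the app.py web interface and
    # 8000 the FastChat OpenAI-compatible API, matching the EXPOSE line
    docker run --gpus all -p 7860:7860 -p 8000:8000 self-lengthen

    # Once the vLLM worker has registered with the controller, the API server
    # can be probed (assumes FastChat's standard OpenAI-compatible routes)
    curl http://localhost:8000/v1/models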