Sergidev committed on
Commit da440bd · verified · 1 Parent(s): 8db0b4f

Update Dockerfile

Files changed (1)
  1. Dockerfile +70 -9
Dockerfile CHANGED
@@ -3,6 +3,7 @@ FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
     git \
+    git-lfs \
     python3.10 \
     python3-pip \
     python-is-python3 \
@@ -14,6 +15,10 @@ RUN apt-get update && apt-get install -y \
 
 WORKDIR /app
 
+# Create a non-root user
+RUN useradd -m -u 1000 user && \
+    chown -R user:user /app
+
 # Install basic Python packages first
 RUN pip3 install --no-cache-dir \
     packaging \
@@ -59,8 +64,15 @@ RUN git clone -b self-lengthen https://github.com/quanshr/FastChat.git && \
 # Install LLaMA Factory
 RUN pip3 install --no-cache-dir llamafactory
 
-# Create directories for models and results
-RUN mkdir -p models results
+# Create directories and set permissions
+RUN mkdir -p models results && \
+    chown -R user:user /app
+
+# Switch to non-root user
+USER user
+
+# Initialize git-lfs
+RUN git lfs install
 
 # Set environment variables
 ENV CUDA_VISIBLE_DEVICES=0
@@ -74,27 +86,76 @@ ENV MAX_ITER=3
 
 # Create startup script
 RUN echo '#!/bin/bash\n\
+\n\
+# Function to wait for service\n\
+wait_for_service() {\n\
+    local host="$1"\n\
+    local port="$2"\n\
+    local retries=30\n\
+    while ! nc -z "$host" "$port" > /dev/null 2>&1; do\n\
+        retries=$((retries-1))\n\
+        if [ "$retries" -eq 0 ]; then\n\
+            echo "Service $host:$port is not available after maximum retries"\n\
+            exit 1\n\
+        fi\n\
+        echo "Waiting for service $host:$port..."\n\
+        sleep 2\n\
+    done\n\
+}\n\
+\n\
 # Download model if needed\n\
 if [ ! -d "$MODEL_PATH" ]; then\n\
-    mkdir -p $MODEL_PATH\n\
-    git lfs install\n\
-    git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct $MODEL_PATH\n\
+    echo "Downloading model..."\n\
+    mkdir -p "$MODEL_PATH"\n\
+    git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct "$MODEL_PATH"\n\
 fi\n\
 \n\
+# Start FastChat services\n\
+python -m fastchat.serve.controller \
+    --host 0.0.0.0 \
+    --port 21001 > controller.log 2>&1 &\n\
+\n\
+# Wait for controller\n\
+wait_for_service localhost 21001\n\
+\n\
+python -m fastchat.serve.openai_api_server \
+    --controller-address http://localhost:21001 \
+    --host 0.0.0.0 \
+    --port 8000 > api_server.log 2>&1 &\n\
+\n\
+# Wait for API server\n\
+wait_for_service localhost 8000\n\
+\n\
+# Start model worker\n\
+python -m fastchat.serve.vllm_worker \
+    --model-names Qwen/Qwen2-7B-Instruct \
+    --model-path "$MODEL_PATH" \
+    --controller-address http://localhost:21001 \
+    --host localhost \
+    --port 8080 \
+    --worker-address http://localhost:8080 > worker.log 2>&1 &\n\
+\n\
+# Wait for model worker\n\
+wait_for_service localhost 8080\n\
+\n\
 # Run the training process\n\
 cd /app/qwen\n\
-bash run.sh --base_model=$MODEL_PATH --instruct_count=$INSTRUCT_COUNT --max_iter=$MAX_ITER\n\
-python collect_data.py\n\
+bash run.sh --base_model="$MODEL_PATH" --instruct_count="$INSTRUCT_COUNT" --max_iter="$MAX_ITER"\n\
 \n\
 # Start the web interface\n\
 python app.py\n' > /app/start.sh && \
     chmod +x /app/start.sh
 
+# Install netcat for service checking
+USER root
+RUN apt-get update && apt-get install -y netcat-openbsd && rm -rf /var/lib/apt/lists/*
+USER user
+
 # Create a simple web interface
-COPY app.py .
+COPY --chown=user:user app.py .
 
 # Expose port for web interface
-EXPOSE 7860
+EXPOSE 7860 8000 21001 8080
 
 # Command to run
 ENTRYPOINT ["/app/start.sh"]
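
For local testing, a minimal build-and-run sketch follows. The image tag (self-lengthen) and the host port mappings are illustrative assumptions, not part of the commit, and a GPU host with the NVIDIA Container Toolkit is presumed.

    # Build the image from the directory containing this Dockerfile and app.py
    docker build -t self-lengthen .

    # Run with GPU access; 7860 serves the app.py web interface and
    # 8000 the FastChat OpenAI-compatible API, matching the EXPOSE line
    docker run --gpus all -p 7860:7860 -p 8000:8000 self-lengthen

    # Once the vLLM worker has registered with the controller, the API server
    # can be probed (assumes FastChat's standard OpenAI-compatible routes)
    curl http://localhost:8000/v1/models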