FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
# Install system dependencies
RUN apt-get update && apt-get install -y \
    git \
    git-lfs \
    python3.10 \
    python3-pip \
    python-is-python3 \
    wget \
    ninja-build \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Create a non-root user
RUN useradd -m -u 1000 user && \
    chown -R user:user /app
# Install basic Python packages first
RUN pip3 install --no-cache-dir \
    packaging \
    setuptools \
    wheel \
    numpy \
    torch==2.4.0
# CUDA environment variables (the toolkit itself is already provided by the nvidia/cuda devel base image)
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
# Clone Self-Lengthen repository
RUN git clone https://github.com/QwenLM/Self-Lengthen.git && \
    mv Self-Lengthen/* . && \
    rm -rf Self-Lengthen
# Install dependencies in order
COPY requirements.txt .
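# Note: the install below pins packages explicitly rather than reading from requirements.txt.
# flash-attn is installed with --no-build-isolation so its build can import the torch version installed above.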
RUN pip3 install --no-cache-dir \
    transformers==4.43.2 \
    accelerate \
    peft \
    datasets \
    sentencepiece \
    protobuf \
    tiktoken \
    scipy \
    gradio \
    "cn2an>=0.5.22" \
    "langdetect>=1.0.9" \
    openai \
    tqdm \
    && pip3 install --no-cache-dir flash-attn --no-build-isolation \
    && pip3 install --no-cache-dir vllm==0.5.5 vllm-flash-attn
# Install FastChat
RUN git clone -b self-lengthen https://github.com/quanshr/FastChat.git && \
    cd FastChat && \
    pip3 install ".[model_worker,webui]"
# Install LLaMA Factory
RUN pip3 install --no-cache-dir llamafactory
# Create directories and set permissions
RUN mkdir -p models results && \
    chown -R user:user /app
# Switch to non-root user
USER user
# Initialize git-lfs
RUN git lfs install
# Set environment variables
ENV CUDA_VISIBLE_DEVICES=0
ENV WORLD_SIZE=1
ENV RANK=0
ENV MASTER_ADDR=localhost
ENV MASTER_PORT=29500
ENV MODEL_PATH=/app/models/base_model
ENV INSTRUCT_COUNT=5000
ENV MAX_ITER=3
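# These ENV values are defaults; they can be overridden at runtime, e.g.
#   docker run -e INSTRUCT_COUNT=1000 -e MAX_ITER=2 <image>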
# Create startup script
RUN echo '#!/bin/bash\n\
\n\
# Function to wait for service\n\
wait_for_service() {\n\
    local host="$1"\n\
    local port="$2"\n\
    local retries=30\n\
    while ! nc -z "$host" "$port" > /dev/null 2>&1; do\n\
        retries=$((retries-1))\n\
        if [ "$retries" -eq 0 ]; then\n\
            echo "Service $host:$port is not available after maximum retries"\n\
            exit 1\n\
        fi\n\
        echo "Waiting for service $host:$port..."\n\
        sleep 2\n\
    done\n\
}\n\
\n\
# Download model if needed\n\
if [ ! -d "$MODEL_PATH" ]; then\n\
    echo "Downloading model..."\n\
    mkdir -p "$MODEL_PATH"\n\
    git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct "$MODEL_PATH"\n\
fi\n\
\n\
# Start FastChat services\n\
python -m fastchat.serve.controller \
    --host 0.0.0.0 \
    --port 21001 > controller.log 2>&1 &\n\
\n\
# Wait for controller\n\
wait_for_service localhost 21001\n\
\n\
python -m fastchat.serve.openai_api_server \
    --controller-address http://localhost:21001 \
    --host 0.0.0.0 \
    --port 8000 > api_server.log 2>&1 &\n\
\n\
# Wait for API server\n\
wait_for_service localhost 8000\n\
\n\
# Start model worker\n\
python -m fastchat.serve.vllm_worker \
    --model-names Qwen/Qwen2-7B-Instruct \
    --model-path "$MODEL_PATH" \
    --controller-address http://localhost:21001 \
    --host localhost \
    --port 8080 \
    --worker-address http://localhost:8080 > worker.log 2>&1 &\n\
\n\
# Wait for model worker\n\
wait_for_service localhost 8080\n\
\n\
# Run the training process\n\
cd /app/qwen\n\
bash run.sh --base_model="$MODEL_PATH" --instruct_count="$INSTRUCT_COUNT" --max_iter="$MAX_ITER"\n\
\n\
# Start the web interface\n\
python /app/app.py\n' > /app/start.sh && \
chmod +x /app/start.sh
# Install netcat for service checking
USER root
RUN apt-get update && apt-get install -y netcat-openbsd && rm -rf /var/lib/apt/lists/*
USER user
# Copy in the simple web interface app
COPY --chown=user:user app.py .
# Expose the web UI (7860), FastChat API server (8000), controller (21001), and model worker (8080)
EXPOSE 7860 8000 21001 8080
# Command to run
ENTRYPOINT ["/app/start.sh"]
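# Example local usage (illustrative only; the image tag and flags below are assumptions, not taken from this Space):
#   docker build -t self-lengthen .
#   docker run --gpus all -p 7860:7860 self-lengthen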