FROM nvidia/cuda:12.1.0-devel-ubuntu22.04

# Install system dependencies
RUN apt-get update && apt-get install -y \
    git \
    git-lfs \
    python3.10 \
    python3-pip \
    python-is-python3 \
    wget \
    ninja-build \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*
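
# gcc/g++ and ninja-build above are what flash-attn (installed further down)
# needs to compile its CUDA extensions.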

WORKDIR /app

# Create a non-root user
RUN useradd -m -u 1000 user && \
    chown -R user:user /app

# Install basic Python packages first
RUN pip3 install --no-cache-dir \
    packaging \
    setuptools \
    wheel \
    numpy \
    torch==2.4.0
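
# Optional sanity check: confirm pip pulled a CUDA build of torch. Reading the
# version strings needs no GPU at build time.
# RUN python3 -c "import torch; print(torch.__version__, torch.version.cuda)"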

# Point builds at the CUDA toolkit that ships with the devel base image
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

# Clone Self-Lengthen repository
RUN git clone https://github.com/QwenLM/Self-Lengthen.git && \
    mv Self-Lengthen/* . && \
    rm -rf Self-Lengthen
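
# Note: this clones whatever the default branch currently points at, so the
# image is not reproducible over time. Pinning would look like this (the SHA
# is a placeholder, not a real revision):
# RUN git clone https://github.com/QwenLM/Self-Lengthen.git && \
#     git -C Self-Lengthen checkout <commit-sha> && \
#     mv Self-Lengthen/* . && rm -rf Self-Lengthen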

# Install dependencies in order (requirements.txt is copied for reference;
# the pinned packages below are what is actually installed)
COPY requirements.txt .
RUN pip3 install --no-cache-dir \
    transformers==4.43.2 \
    accelerate \
    peft \
    datasets \
    sentencepiece \
    protobuf \
    tiktoken \
    scipy \
    gradio \
    "cn2an>=0.5.22" \
    "langdetect>=1.0.9" \
    openai \
    tqdm \
    && pip3 install --no-cache-dir flash-attn --no-build-isolation \
    && pip3 install --no-cache-dir vllm==0.5.5 vllm-flash-attn
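
# Ordering note: flash-attn is installed with --no-build-isolation so its
# build sees the torch already installed above; with isolation enabled, pip
# would compile it in a clean environment that has no torch.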

# Install FastChat
RUN git clone -b self-lengthen https://github.com/quanshr/FastChat.git && \
    cd FastChat && \
    pip3 install ".[model_worker,webui]"

# Install LLaMA Factory
RUN pip3 install --no-cache-dir llamafactory

# Create directories and set permissions
RUN mkdir -p models results && \
    chown -R user:user /app

# Switch to non-root user
USER user

# Initialize git-lfs
RUN git lfs install
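
# Without this, the Hugging Face clone in start.sh would fetch LFS pointer
# stubs instead of the actual model weights.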

# Set environment variables
ENV CUDA_VISIBLE_DEVICES=0
ENV WORLD_SIZE=1
ENV RANK=0
ENV MASTER_ADDR=localhost
ENV MASTER_PORT=29500
ENV MODEL_PATH=/app/models/base_model
ENV INSTRUCT_COUNT=5000
ENV MAX_ITER=3
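
# WORLD_SIZE/RANK/MASTER_ADDR/MASTER_PORT describe a single-process,
# single-node torch distributed setup; MODEL_PATH, INSTRUCT_COUNT and
# MAX_ITER are consumed by start.sh below.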

# Create the startup script (/bin/sh's echo expands the \n escapes below)
RUN echo '#!/bin/bash\n\
\n\
# Function to wait for service\n\
wait_for_service() {\n\
    local host="$1"\n\
    local port="$2"\n\
    local retries=30\n\
    while ! nc -z "$host" "$port" > /dev/null 2>&1; do\n\
        retries=$((retries-1))\n\
        if [ "$retries" -eq 0 ]; then\n\
            echo "Service $host:$port is not available after maximum retries"\n\
            exit 1\n\
        fi\n\
        echo "Waiting for service $host:$port..."\n\
        sleep 2\n\
    done\n\
}\n\
\n\
# Download model if needed\n\
if [ ! -d "$MODEL_PATH" ]; then\n\
    echo "Downloading model..."\n\
    mkdir -p "$MODEL_PATH"\n\
    git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct "$MODEL_PATH"\n\
fi\n\
\n\
# Start FastChat services\n\
python -m fastchat.serve.controller \
    --host 0.0.0.0 \
    --port 21001 > controller.log 2>&1 &\n\
\n\
# Wait for controller\n\
wait_for_service localhost 21001\n\
\n\
python -m fastchat.serve.openai_api_server \
    --controller-address http://localhost:21001 \
    --host 0.0.0.0 \
    --port 8000 > api_server.log 2>&1 &\n\
\n\
# Wait for API server\n\
wait_for_service localhost 8000\n\
\n\
# Start model worker\n\
python -m fastchat.serve.vllm_worker \
    --model-names Qwen/Qwen2-7B-Instruct \
    --model-path "$MODEL_PATH" \
    --controller-address http://localhost:21001 \
    --host localhost \
    --port 8080 \
    --worker-address http://localhost:8080 > worker.log 2>&1 &\n\
\n\
# Wait for model worker\n\
wait_for_service localhost 8080\n\
\n\
# Run the training process\n\
cd /app/qwen\n\
bash run.sh --base_model="$MODEL_PATH" --instruct_count="$INSTRUCT_COUNT" --max_iter="$MAX_ITER"\n\
\n\
# Start the web interface (app.py lives in /app, not /app/qwen)\n\
cd /app\n\
python app.py\n' > /app/start.sh && \
chmod +x /app/start.sh
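
# start.sh brings the stack up in dependency order: controller (21001), then
# the OpenAI-compatible API server (8000), then the vLLM worker (8080), and
# gates each step on a TCP port check before moving on.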

# Install netcat, used by wait_for_service in start.sh
USER root
RUN apt-get update && apt-get install -y netcat-openbsd && rm -rf /var/lib/apt/lists/*
USER user

# Copy in the simple web interface
COPY --chown=user:user app.py .

# Expose the web UI (7860), OpenAI API (8000), controller (21001), and worker (8080) ports
EXPOSE 7860 8000 21001 8080

# Launch the whole pipeline on container start
ENTRYPOINT ["/app/start.sh"]