Add Docker image for CUDA-12.1.

2026-05-20 08:25:47 +08:00 · 2024-01-08 14:22:05 +08:00
parent 4aab1d490b
commit 23a01b0696
10 changed files with 146 additions and 5 deletions
--- a/README.md
+++ b/README.md
@@ -1049,6 +1049,7 @@ To simplify the deployment process, we provide docker images with pre-built envi
 1. Install the correct version of Nvidia driver depending on the image to use:
  - `qwenllm/qwen:cu117` (**recommended**): `>= 515.48.07`
  - `qwenllm/qwen:cu114` (w/o flash-attention): `>= 470.82.01`
  - `qwenllm/qwen:cu121`: `>= 530.30.02`
  - `qwenllm/qwen:latest`: same as `qwenllm/qwen:cu117`
 2. Install and configure [docker](https://docs.docker.com/engine/install/) and [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html):
--- a/README_CN.md
+++ b/README_CN.md
@@ -1040,6 +1040,7 @@ print(response.choices[0].message.content)
 1. 根据需要使用的镜像版本，安装相应版本的Nvidia驱动：
  - `qwenllm/qwen:cu117`（**推荐**）：`>= 515.48.07`
  - `qwenllm/qwen:cu114`（不支持flash-attention）：`>= 470.82.01`
  - `qwenllm/qwen:cu121`：`>= 530.30.02`
  - `qwenllm/qwen:latest`：与`qwenllm/qwen:cu117`相同
 2. 安装并配置[docker](https://docs.docker.com/engine/install/)和[nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)：
--- a/README_ES.md
+++ b/README_ES.md
@@ -909,6 +909,7 @@ Para simplificar el proceso de despliegue, proporcionamos imágenes Docker con e
 1. Instale la versión correcta del controlador Nvidia en función de la imagen que vaya a utilizar:
  - `qwenllm/qwen:cu117` (**recomendado**): `>= 515.48.07`
  - `qwenllm/qwen:cu114` (w/o flash-attention): `>= 470.82.01`
  - `qwenllm/qwen:cu121`: `>= 530.30.02`
  - `qwenllm/qwen:latest`: igual que `qwenllm/qwen:cu117`
 2. Instale y configure [docker](https://docs.docker.com/engine/install/) y [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html):
--- a/README_FR.md
+++ b/README_FR.md
@@ -912,6 +912,7 @@ Pour simplifier le processus de déploiement, nous fournissons des images docker
 1. Installez la version correcte du pilote Nvidia en fonction de l'image à utiliser :
  - `qwenllm/qwen:cu117` (**recommandé**): `>= 515.48.07`
  - `qwenllm/qwen:cu114` (w/o flash-attention): `>= 470.82.01`
  - `qwenllm/qwen:cu121`: `>= 530.30.02`
  - `qwenllm/qwen:latest`: identique à `qwenllm/qwen:cu117`
 2. Installez et configurez [docker](https://docs.docker.com/engine/install/) et [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) :
--- a/README_JA.md
+++ b/README_JA.md
@@ -948,6 +948,7 @@ print(response.choices[0].message.content)
 1. 使用するイメージに応じて、正しいバージョンのNvidiaドライバをインストールしてください：
  - `qwenllm/qwen:cu117` (**recommended**): `>= 515.48.07`
  - `qwenllm/qwen:cu114` (w/o flash-attention): `>= 470.82.01`
  - `qwenllm/qwen:cu121`: `>= 530.30.02`
  - `qwenllm/qwen:latest`: same as `qwenllm/qwen:cu117`
 2. [Docker](https://docs.docker.com/engine/install/) と [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) をインストールして設定します：
--- a/docker/Dockerfile-cu121
+++ b/docker/Dockerfile-cu121
@@ -0,0 +1,122 @@
 # The base image can be overridden at build time with --build-arg from=<image>
 # (or just the CUDA release with --build-arg CUDA_VERSION=<version>).
 ARG CUDA_VERSION=12.1.0
 ARG from=nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04
 # Stage "base": CUDA/cuDNN devel image plus git, git-lfs, Python 3 and basic tools.
 FROM ${from} AS base
 # Re-declared so the pre-FROM build argument is also visible inside this stage.
 ARG from
 RUN <<EOF
 # Heredoc lines are not implicitly chained: stop at the first failing command.
 set -e
 # Only exported for this RUN step, so package configuration never waits for input.
 export DEBIAN_FRONTEND=noninteractive
 # apt-get is used instead of apt because apt's command-line interface is not stable for scripts.
 apt-get update -y
 apt-get upgrade -y
 apt-get install -y --no-install-recommends \
    git \
    git-lfs \
    python3 \
    python3-pip \
    python3-dev \
    wget \
    vim
 # Drop the package index in the same layer so it does not end up in the image.
 rm -rf /var/lib/apt/lists/*
 EOF
 # Make `python` resolve to python3; -f keeps the step from failing if the link already exists.
 RUN ln -sf /usr/bin/python3 /usr/bin/python
 RUN git lfs install
 # Stage "dev": project working directory plus the dependency manifests.
 FROM base AS dev
 # WORKDIR creates the directory when it is missing, so no separate mkdir step is needed.
 # Users can also mount '/data/shared/Qwen/' to keep the data
 WORKDIR /data/shared/Qwen/
 # COPY sources are resolved against the build context root and cannot reach its parent
 # directory, so the manifests are named without a '../' prefix.
 # NOTE(review): this assumes the build context is the repository root - confirm against the build command.
 COPY requirements.txt requirements_web_demo.txt ./
 # Stage "bundle_req": core Python dependencies.
 # Build with --build-arg BUNDLE_REQUIREMENTS=false to skip the installs.
 FROM dev AS bundle_req
 ARG BUNDLE_REQUIREMENTS=true
 RUN <<EOF
 # Without this, only the exit status of the last command decides whether the step
 # fails, so an earlier failed install would go unnoticed.
 set -e
 if [ "$BUNDLE_REQUIREMENTS" = "true" ]; then
    cd /data/shared/Qwen
    # --no-cache-dir keeps pip's download cache out of the image layer.
    # PyTorch is taken from the cu121 wheel index.
    pip3 install --no-cache-dir torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121
    pip3 install --no-cache-dir -r requirements.txt
    pip3 install --no-cache-dir -r requirements_web_demo.txt
    pip3 install --no-cache-dir transformers==4.36.0
 fi
 EOF
 # Stage "bundle_flash_attention": builds flash-attention v2.3.3 and its layer_norm extension from source.
 # Build with --build-arg BUNDLE_FLASH_ATTENTION=false to skip it.
 FROM bundle_req AS bundle_flash_attention
 ARG BUNDLE_FLASH_ATTENTION=true
 RUN <<EOF
 # Stop at the first failing command instead of relying on manual && chaining.
 set -e
 if [ "$BUNDLE_FLASH_ATTENTION" = "true" ]; then
    cd /data/shared/Qwen
    # Clone the pinned tag only when no flash-attention directory is present yet.
    test -d flash-attention || git clone -b v2.3.3 https://github.com/Dao-AILab/flash-attention
    cd /data/shared/Qwen/flash-attention
    # --no-cache-dir keeps pip's cache out of the image layer.
    pip3 install --no-cache-dir .
    pip3 install --no-cache-dir csrc/layer_norm
 fi
 EOF
 # Stage "bundle_finetune": packages for full-parameter, LoRA and Q-LoRA finetuning.
 # Build with --build-arg BUNDLE_FINETUNE=false to skip it.
 FROM bundle_flash_attention AS bundle_finetune
 ARG BUNDLE_FINETUNE=true
 RUN <<EOF
 # Without this, a failed pip or apt step would be masked by a later successful command.
 set -e
 if [ "$BUNDLE_FINETUNE" = "true" ]; then
    cd /data/shared/Qwen
    # Full-finetune / LoRA.
    pip3 install --no-cache-dir "deepspeed==0.12.6" "peft==0.7.1"
    # Q-LoRA.
    # OpenMPI is installed before mpi4py; the package index is removed again in this same layer.
    apt-get update -y
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libopenmpi-dev openmpi-bin
    rm -rf /var/lib/apt/lists/*
    pip3 install --no-cache-dir "optimum==1.14.0" "auto-gptq==0.5.0" mpi4py
 fi
 EOF
 # Stage "bundle_openai_api": web-server packages, presumably for openai_api.py (copied in the final stage).
 # Build with --build-arg BUNDLE_OPENAI_API=false to skip it.
 FROM bundle_finetune AS bundle_openai_api
 ARG BUNDLE_OPENAI_API=true
 RUN <<EOF
 # Fail the step on the first error, including a failed cd.
 set -e
 if [ "$BUNDLE_OPENAI_API" = "true" ]; then
    cd /data/shared/Qwen
    # --no-cache-dir keeps pip's cache out of the image layer.
    pip3 install --no-cache-dir fastapi uvicorn "openai<1.0.0" sse_starlette "pydantic<=1.10.13"
 fi
 EOF
 # Stage "bundle_vllm": vLLM and FastChat (model worker + web UI extras).
 # Build with --build-arg BUNDLE_VLLM=false to skip it.
 FROM bundle_openai_api AS bundle_vllm
 ARG BUNDLE_VLLM=true
 RUN <<EOF
 # Fail the step on the first error, including a failed cd.
 set -e
 if [ "$BUNDLE_VLLM" = "true" ]; then
    cd /data/shared/Qwen
    # --no-cache-dir keeps pip's cache out of the image layer.
    pip3 install --no-cache-dir vllm==0.2.7 "fschat[model_worker,webui]==0.2.34"
 fi
 EOF
 # Stage "final": adds the project scripts on top of the bundled dependencies and
 # starts the web demo by default.
 FROM bundle_vllm AS final
 ARG from
 # COPY sources are resolved against the build context root and cannot reach its parent
 # directory, so files are named without a '../' prefix. One instruction copies all
 # top-level files into the working directory as a single layer.
 # NOTE(review): this assumes the build context is the repository root - confirm against the build command.
 COPY requirements.txt requirements_web_demo.txt cli_demo.py web_demo.py openai_api.py finetune.py utils.py ./
 # NOTE(review): with a '/*' wildcard, a matched sub-directory has its contents copied
 # straight into the destination rather than being recreated there - confirm none of
 # these folders rely on nested directories.
 COPY ./examples/* ./examples/
 COPY ./eval/* ./eval/
 COPY ./finetune/* ./finetune/
 # Documents the port the default command below listens on.
 EXPOSE 80
 WORKDIR /data/shared/Qwen/
 # Default command: web demo on 0.0.0.0:80. The value passed to -c is presumably the
 # model checkpoint directory, expected to be mounted at run time - verify against web_demo.py.
 CMD ["python3", "web_demo.py", "--server-port", "80", "--server-name", "0.0.0.0", "-c", "/data/shared/Qwen/Qwen-Chat/"]
--- a/finetune.py
+++ b/finetune.py
@@ -272,7 +272,7 @@ def train():
    local_rank = training_args.local_rank
-    device_map = "auto"
+    device_map = None
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if lora_args.q_lora:
@@ -282,6 +282,19 @@ def train():
                "FSDP or ZeRO3 are incompatible with QLoRA."
            )
    is_chat_model = 'chat' in model_args.model_name_or_path.lower()
    if (
            training_args.use_lora
            and not lora_args.q_lora
            and deepspeed.is_deepspeed_zero3_enabled()
            and not is_chat_model
    ):
        raise RuntimeError("ZeRO3 is incompatible with LoRA when finetuning on base model.")
    model_load_kwargs = {}
    if deepspeed.is_deepspeed_zero3_enabled():
        model_load_kwargs['low_cpu_mem_usage'] = False
    # Set RoPE scaling factor
    config = transformers.AutoConfig.from_pretrained(
        model_args.model_name_or_path,
@@ -302,6 +315,7 @@ def train():
        )
        if training_args.use_lora and lora_args.q_lora
        else None,
        **model_load_kwargs,
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
@@ -314,7 +328,7 @@ def train():
    tokenizer.pad_token_id = tokenizer.eod_id
    if training_args.use_lora:
-        if lora_args.q_lora or 'chat' in model_args.model_name_or_path.lower():
+        if lora_args.q_lora or is_chat_model:
            modules_to_save = None
        else:
            modules_to_save = ["wte", "lm_head"]
--- a/finetune/finetune_ds.sh
+++ b/finetune/finetune_ds.sh
@@ -18,7 +18,7 @@ NNODES=${NNODES:-1}
 NODE_RANK=${NODE_RANK:-0}
 # The ip address of the rank-0 worker, for single-worker training, please set to localhost
-MASTER_ADDR=${MASTER_ADDR:localhost}
+MASTER_ADDR=${MASTER_ADDR:-localhost}
 # The port for communication
 MASTER_PORT=${MASTER_PORT:-6001}
--- a/finetune/finetune_lora_ds.sh
+++ b/finetune/finetune_lora_ds.sh
@@ -18,7 +18,7 @@ NNODES=${NNODES:-1}
 NODE_RANK=${NODE_RANK:-0}
 # The ip address of the rank-0 worker, for single-worker training, please set to localhost
-MASTER_ADDR=${MASTER_ADDR:localhost}
+MASTER_ADDR=${MASTER_ADDR:-localhost}
 # The port for communication
 MASTER_PORT=${MASTER_PORT:-6001}
--- a/finetune/finetune_qlora_ds.sh
+++ b/finetune/finetune_qlora_ds.sh
@@ -18,7 +18,7 @@ NNODES=${NNODES:-1}
 NODE_RANK=${NODE_RANK:-0}
 # The ip address of the rank-0 worker, for single-worker training, please set to localhost
-MASTER_ADDR=${MASTER_ADDR:localhost}
+MASTER_ADDR=${MASTER_ADDR:-localhost}
 # The port for communication
 MASTER_PORT=${MASTER_PORT:-6001}