main

分支 (36)

标签 (29)

管理

管理

main

bugfix-explicit-kv-scale

woosuk-gemma1

tpu-optimize-swap

tpu-n

compressed-tensors-fp8

cleanup-reqs

quant-supported-hardware

flip-quant-method-check

lora-chunk

add_4_gpu_test

revert-5253-refactor-checking-supported-quant-method

collect-env-transformers

optimize-prefix-caching-scheduling

fix-hashing-partial-blocks

remove-abstract-property

simon-mo/test-ignore-eos

revert-5136-marlin_24_ptx_ordered_metadata

sparseml-avoid-warning

detect-sparseml-config

v0.5.0.post1

v0.5.0

v0.4.3

v0.4.2

v0.4.1

v0.4.0.post1

v0.4.0

v0.3.3

v0.3.2

v0.3.1

v0.3.0

v0.2.7

v0.2.6

v0.2.5

v0.2.4

v0.2.3

v0.2.2

v0.2.1.post1

v0.2.1

v0.2.0

vllm
/
Dockerfile.rocm

# Default ROCm 6.1 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"

# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"

# Whether to install CK-based flash-attention
# If 0, will not install flash-attention
ARG BUILD_FA="1"
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
# architectures specified in `FA_GFX_ARCHS`
ARG TRY_FA_WHEEL="1"
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="23a2b1c2"

# Whether to build triton on rocm
ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="e0fc12c"

### Base image build stage
FROM $BASE_IMAGE AS base

# Import arg(s) defined before this build stage
ARG PYTORCH_ROCM_ARCH

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
RUN apt-get update && apt-get install -y \
    curl \
    ca-certificates \
    sudo \
    git \
    bzip2 \
    libx11-6 \
    build-essential \
    wget \
    unzip \
    tmux \
    ccache \
 && rm -rf /var/lib/apt/lists/*

# When launching the container, mount the code directory to /vllm-workspace
ARG APP_MOUNT=/vllm-workspace
WORKDIR ${APP_MOUNT}

RUN python3 -m pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install torch == 2.5.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
        *"rocm-6.1"*) \
            python3 -m pip uninstall -y torch torchvision \
            && python3 -m pip install --no-cache-dir --pre \
                torch==2.5.0.dev20240726 \
                torchvision==0.20.0.dev20240726 \
               --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
        *) ;; esac

ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:

ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV CCACHE_DIR=/root/.cache/ccache


### AMD-SMI build stage
FROM base AS build_amdsmi
# Build amdsmi wheel always
RUN cd /opt/rocm/share/amd_smi \
    && python3 -m pip wheel . --wheel-dir=/install


### Flash-Attention wheel build stage
FROM base AS build_fa
ARG BUILD_FA
ARG TRY_FA_WHEEL
ARG FA_WHEEL_URL
ARG FA_GFX_ARCHS
ARG FA_BRANCH
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
    if [ "$BUILD_FA" = "1" ]; then \
        if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
            # If a suitable wheel exists, we download it instead of building FA
            mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
        else \
            mkdir -p libs \
            && cd libs \
            && git clone https://github.com/ROCm/flash-attention.git \
            && cd flash-attention \
            && git checkout "${FA_BRANCH}" \
            && git submodule update --init \
            && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
        fi; \
    # Create an empty directory otherwise as later build stages expect one
    else mkdir -p /install; \
    fi


### Triton wheel build stage
FROM base AS build_triton
ARG BUILD_TRITON
ARG TRITON_BRANCH
# Build triton wheel if `BUILD_TRITON = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
    if [ "$BUILD_TRITON" = "1" ]; then \
    mkdir -p libs \
    && cd libs \
    && git clone https://github.com/OpenAI/triton.git \
    && cd triton \
    && git checkout "${TRITON_BRANCH}" \
    && cd python \
    && python3 setup.py bdist_wheel --dist-dir=/install; \
    # Create an empty directory otherwise as later build stages expect one
    else mkdir -p /install; \
    fi


### Final vLLM build stage
FROM base AS final
# Import the vLLM development directory from the build context
COPY . .

# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install --upgrade numba scipy huggingface-hub[cli]

# Make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# Silences the HF Tokenizers warning
ENV TOKENIZERS_PARALLELISM=false

RUN --mount=type=cache,target=${CCACHE_DIR} \
    --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -Ur requirements-rocm.txt \
    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
        *"rocm-6.1"*) \
            # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
            # Prevent interference if torch bundles its own HIP runtime
            && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
        *) ;; esac \
    && python3 setup.py clean --all \
    && python3 setup.py develop

# Copy amdsmi wheel into final image
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
    mkdir -p libs \
    && cp /install/*.whl libs \
    # Preemptively uninstall to avoid same-version no-installs
    && python3 -m pip uninstall -y amdsmi;

# Copy triton wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
    mkdir -p libs \
    && if ls /install/*.whl; then \
        cp /install/*.whl libs \
        # Preemptively uninstall to avoid same-version no-installs
        && python3 -m pip uninstall -y triton; fi

# Copy flash-attn wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
    mkdir -p libs \
    && if ls /install/*.whl; then \
        cp /install/*.whl libs \
        # Preemptively uninstall to avoid same-version no-installs
        && python3 -m pip uninstall -y flash-attn; fi

# Install wheels that were built to the final image
RUN --mount=type=cache,target=/root/.cache/pip \
    if ls libs/*.whl; then \
    python3 -m pip install libs/*.whl; fi

CMD ["/bin/bash"]