opendatahub-io · dtrifiro · Jun 13, 2024 · Jun 13, 2024
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
@@ -2,16 +2,15 @@
 ARG BASE_UBI_IMAGE_TAG=9.4
 ARG PYTHON_VERSION=3.11
 
-ARG TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0+PTX"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+
 
 ## Base Layer ##################################################################
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
 ARG PYTHON_VERSION
 
-# Some utils for dev/build purposes - tar required for kubectl cp
 RUN microdnf install -y \
     python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
-    which procps findutils tar git \
     && microdnf clean all
 
 WORKDIR /workspace
@@ -40,21 +39,61 @@ RUN microdnf install -y \
 ## CUDA Base ###################################################################
 FROM python-install as cuda-base
 
-ENV CUDA_VERSION=12.5.0
+# The NVIDIA operator won't allow deploying on CUDA 12.0 hosts if CUDA_VERSION
+# is set to 12.2.0, even though it's compatible, so 12.0.0 is advertised here
+# instead of 12.2.0 (the NV_CUDA_* package pins below stay on their 12.2 builds).
+ENV CUDA_VERSION=12.0.0 \
+    NV_CUDA_LIB_VERSION=12.2.0-1 \
+    NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+    NV_CUDA_CUDART_VERSION=12.2.53-1 \
+    NV_CUDA_COMPAT_VERSION=535.104.12
 
 RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
         https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
 
 RUN microdnf install -y \
-        cuda-nvcc-12-2 cuda-nvtx-12-2 cuda-libraries-devel-12-2 && \
-    microdnf clean all
+        cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \
+        cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \
+    && microdnf clean all
 
-ENV CUDA_HOME="/usr/local/cuda" \
+# Declared as ARG first: ENV expands ${CUDA_HOME} using values set before this instruction.
+ARG CUDA_HOME="/usr/local/cuda"
+ENV CUDA_HOME=${CUDA_HOME} \
     PATH="${CUDA_HOME}/bin:${PATH}" \
     LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
 
+
+## CUDA Development ############################################################
+FROM cuda-base AS cuda-devel
+
+ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \
+    NV_NVML_DEV_VERSION=12.2.81-1 \
+    NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \
+    NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \
+    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2
+# NV_CUDA_LIB_VERSION is inherited from the cuda-base stage; the rest are set above.
+RUN microdnf install -y \
+        cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \
+        cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \
+        cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \
+        cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \
+        cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \
+        libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \
+        libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \
+        libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
+    && microdnf clean all
+# Add the CUDA stubs directory to LIBRARY_PATH (searched by the compiler when linking).
+ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
+
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960: register the CUDA compat
+# libraries with the dynamic linker. Hopefully this won't be needed for
+# future versions of this docker image or of triton.
+RUN ldconfig /usr/local/cuda-12.2/compat/
+
 ## Python cuda base #################################################################
-FROM cuda-base AS python-cuda-base
+FROM cuda-devel AS python-cuda-base
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
@@ -64,8 +103,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
     pip install \
-        -r requirements-cuda.txt && \
-    find /opt/vllm/lib/ -name ".*\.so.*" -exec strip {} \;
+        -r requirements-cuda.txt
 
 ## Development #################################################################
 FROM python-cuda-base AS dev
@@ -88,8 +126,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-build.txt
 
 # install compiler cache to speed up compilation leveraging local or remote caching
-# git is required for the cutlass kernels
-RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all
+RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y ccache && microdnf clean all
 # install build dependencies
 
 # copy input files
@@ -123,12 +160,13 @@ COPY vllm vllm
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
-    env CFLAGS="-march=haswell" \
-        CXXFLAGS="$CFLAGS $CXXFLAGS" \
-        CMAKE_BUILD_TYPE=Release \
-        python3 setup.py bdist_wheel --dist-dir=dist
+    CMAKE_BUILD_TYPE=Release python3 setup.py bdist_wheel --dist-dir=dist
 
 ## Release #####################################################################
+# Note carried over from the non-UBI Dockerfile:
+# We used the base CUDA image because PyTorch installs its own CUDA libraries.
+# However, pynccl depends on the CUDA libraries, so we had to switch to the runtime image.
+# In the future it would be nice to have a container with PyTorch and CUDA without duplicating CUDA.
 FROM python-install AS vllm-openai
 
 WORKDIR /workspace
@@ -143,8 +181,7 @@ RUN microdnf install -y gcc \
 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose && \
-    find /opt/vllm/lib/ -regex ".*\.so.*" -exec strip {} \;
+    pip install dist/*.whl --verbose
 
 ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \