From e9437bee197dc2c6db348bcf107b7ec6ad533192 Mon Sep 17 00:00:00 2001 From: Maxime Hugues Date: Tue, 7 May 2024 15:22:36 -0500 Subject: [PATCH] Fix aws ofi nccl version expansion --- micro-benchmarks/nccl-tests/nccl-tests.Dockerfile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile b/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile index 8d3a814a..4a52a193 100644 --- a/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile +++ b/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile @@ -1,6 +1,6 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: MIT-0 -FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 +FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 ARG GDRCOPY_VERSION=v2.4.1 ARG EFA_INSTALLER_VERSION=1.31.0 @@ -88,6 +88,8 @@ RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \ ################################################### ## Install AWS-OFI-NCCL plugin RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev +#Switch from sh to bash to allow parameter expansion +SHELL ["/bin/bash", "-c"] RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ @@ -102,6 +104,8 @@ RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCC && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz +SHELL ["/bin/sh", "-c"] + ################################################### ## Install NCCL-tests RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ @@ -120,7 +124,7 @@ ENV OMPI_MCA_pml=^cm,ucx \ OMPI_MCA_btl=tcp,self \ OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\ OPAL_PREFIX=/opt/amazon/openmpi \ - NCCL_SOCKET_IFNAME=^docker,lo + NCCL_SOCKET_IFNAME=^docker,lo,veth_def_agent ## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516 ENV PMIX_MCA_gds=hash