From 9e0b5908c1c6e53cc2fb19adfa04ac38df1634cd Mon Sep 17 00:00:00 2001
From: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>
Date: Sat, 18 Jan 2025 07:14:39 +1100
Subject: [PATCH] Fix a bug in tensornet backend scratch pad allocation in
 multi-GPU mode (#2516)

* Fix a bug in default init of scratchpad: it must allocate memory after
  we've set the device

Signed-off-by: Thien Nguyen

* Add test

Signed-off-by: Thien Nguyen

* Add a check to prevent multiple allocate calls

Signed-off-by: Thien Nguyen

---------

Signed-off-by: Thien Nguyen
---
 .../cutensornet/simulator_cutensornet.cpp   |  2 +
 runtime/nvqir/cutensornet/tensornet_utils.h | 13 +++++-
 unittests/CMakeLists.txt                    | 23 ++++++++++
 unittests/mpi/tensornet_mpi_tester.cpp      | 44 +++++++++++++++++++
 4 files changed, 80 insertions(+), 2 deletions(-)
 create mode 100644 unittests/mpi/tensornet_mpi_tester.cpp

diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
index 676c9a38c5..3382ae1586 100644
--- a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
+++ b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
@@ -22,6 +22,8 @@ SimulatorTensorNetBase::SimulatorTensorNetBase()
       cudaq::mpi::is_initialized() ? cudaq::mpi::rank() % numDevices : 0;
   HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
   HANDLE_CUTN_ERROR(cutensornetCreate(&m_cutnHandle));
+  // The scratch pad must be allocated after we have selected the device.
+  scratchPad.allocate();
 }
 
 static std::vector<std::complex<double>>
diff --git a/runtime/nvqir/cutensornet/tensornet_utils.h b/runtime/nvqir/cutensornet/tensornet_utils.h
index 6d11f3bebb..fd605fab80 100644
--- a/runtime/nvqir/cutensornet/tensornet_utils.h
+++ b/runtime/nvqir/cutensornet/tensornet_utils.h
@@ -72,7 +72,12 @@ struct ScratchDeviceMem {
         2; // use half of available memory with alignment
   }
 
-  ScratchDeviceMem() {
+  // Allocate scratch device memory based on available memory
+  void allocate() {
+    if (d_scratch)
+      throw std::runtime_error(
+          "Multiple scratch device memory allocations is not allowed.");
+
     computeScratchSize();
     // Try allocate device memory
     auto errCode = cudaMalloc(&d_scratch, scratchSize);
@@ -86,7 +91,11 @@ struct ScratchDeviceMem {
       HANDLE_CUDA_ERROR(errCode);
     }
   }
-  ~ScratchDeviceMem() { HANDLE_CUDA_ERROR(cudaFree(d_scratch)); }
+
+  ~ScratchDeviceMem() {
+    if (scratchSize > 0)
+      HANDLE_CUDA_ERROR(cudaFree(d_scratch));
+  }
 };
 
 /// Initialize `cutensornet` MPI Comm
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index bc7f6581b8..9ff26f7510 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -198,6 +198,29 @@ if(TARGET nvqir-tensornet)
   message(STATUS "Building cutensornet backend tests.")
   create_tests_with_backend(tensornet "")
   create_tests_with_backend(tensornet-mps "")
+  if (MPI_CXX_FOUND)
+    # Count the number of GPUs
+    find_program(NVIDIA_SMI "nvidia-smi")
+    if(NVIDIA_SMI)
+      execute_process(COMMAND bash -c "nvidia-smi --list-gpus | wc -l" OUTPUT_VARIABLE NGPUS)
+      # Only build this test if we have more than 1 GPUs
+      if (${NGPUS} GREATER_EQUAL 2)
+        message(STATUS "Building cutensornet MPI tests.")
+        add_executable(test_tensornet_mpi mpi/tensornet_mpi_tester.cpp)
+        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE)
+          target_link_options(test_tensornet_mpi PRIVATE -Wl,--no-as-needed)
+        endif()
+        target_link_libraries(test_tensornet_mpi
+          PRIVATE
+          cudaq
+          cudaq-platform-default
+          nvqir-tensornet
+          gtest)
+        add_test(NAME TensornetMPITest COMMAND ${MPIEXEC} --allow-run-as-root -np 2 ${CMAKE_BINARY_DIR}/unittests/test_tensornet_mpi)
+        set_tests_properties(TensornetMPITest PROPERTIES LABELS "gpu_required;mgpus_required")
+      endif() # NGPUS
+    endif() # NVIDIA_SMI
+  endif() # MPI_CXX_FOUND
 endif()
 
 # Create an executable for SpinOp UnitTests
diff --git a/unittests/mpi/tensornet_mpi_tester.cpp b/unittests/mpi/tensornet_mpi_tester.cpp
new file mode 100644
index 0000000000..a770461117
--- /dev/null
+++ b/unittests/mpi/tensornet_mpi_tester.cpp
@@ -0,0 +1,44 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+#include <cudaq.h>
+#include <gtest/gtest.h>
+
+TEST(TensornetMPITester, checkInit) {
+  EXPECT_TRUE(cudaq::mpi::is_initialized());
+  std::cout << "Rank = " << cudaq::mpi::rank() << "\n";
+}
+
+TEST(TensornetMPITester, checkSimple) {
+  constexpr std::size_t numQubits = 50;
+  auto kernel = []() __qpu__ {
+    cudaq::qvector q(numQubits);
+    h(q[0]);
+    for (int i = 0; i < numQubits - 1; i++)
+      x<cudaq::ctrl>(q[i], q[i + 1]);
+    mz(q);
+  };
+
+  auto counts = cudaq::sample(100, kernel);
+
+  if (cudaq::mpi::rank() == 0) {
+    EXPECT_EQ(2, counts.size());
+
+    for (auto &[bits, count] : counts) {
+      printf("Observed: %s, %lu\n", bits.data(), count);
+      EXPECT_EQ(numQubits, bits.size());
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  cudaq::mpi::initialize();
+  const auto testResult = RUN_ALL_TESTS();
+  cudaq::mpi::finalize();
+  return testResult;
+}
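
Note: a minimal sketch (not part of the patch) of the call order this fix enforces,
assuming it sits in code that includes tensornet_utils.h and the CUDA runtime, so
HANDLE_CUDA_ERROR and ScratchDeviceMem are the ones shown above; the rank-to-device
mapping mirrors the constructor changed in simulator_cutensornet.cpp.

    // Select the GPU for this MPI rank first...
    int numDevices = 0;
    HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices));
    const int deviceId =
        cudaq::mpi::is_initialized() ? cudaq::mpi::rank() % numDevices : 0;
    HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
    // ...then allocate the scratch pad, so cudaMalloc lands on that device.
    ScratchDeviceMem scratchPad;
    scratchPad.allocate(); // a second allocate() call now throws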