Skip to content

Commit

Permalink
Fix a bug in tensornet backend scratch pad allocation in multi-GPU mode (#2516)

Browse files Browse the repository at this point in the history

* Fix a bug in default init of scratchpad: it must allocate memory after we've set the device

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>

* Add test

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>

* Add a check to prevent multiple allocate calls

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>

---------

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
  • Loading branch information
1tnguyen authored Jan 17, 2025
1 parent 742a31d commit 9e0b590
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 2 deletions.
2 changes: 2 additions & 0 deletions runtime/nvqir/cutensornet/simulator_cutensornet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ SimulatorTensorNetBase::SimulatorTensorNetBase()
cudaq::mpi::is_initialized() ? cudaq::mpi::rank() % numDevices : 0;
HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
HANDLE_CUTN_ERROR(cutensornetCreate(&m_cutnHandle));
// The scratch pad must be allocated after we have selected the device.
scratchPad.allocate();
}

static std::vector<std::complex<double>>
Expand Down
13 changes: 11 additions & 2 deletions runtime/nvqir/cutensornet/tensornet_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,12 @@ struct ScratchDeviceMem {
2; // use half of available memory with alignment
}

ScratchDeviceMem() {
// Allocate scratch device memory based on available memory
void allocate() {
if (d_scratch)
throw std::runtime_error(
"Multiple scratch device memory allocations is not allowed.");

computeScratchSize();
// Try allocate device memory
auto errCode = cudaMalloc(&d_scratch, scratchSize);
Expand All @@ -86,7 +91,11 @@ struct ScratchDeviceMem {
HANDLE_CUDA_ERROR(errCode);
}
}
~ScratchDeviceMem() { HANDLE_CUDA_ERROR(cudaFree(d_scratch)); }

// Free the scratch buffer if (and only if) allocate() ever ran.
// A scratchSize of 0 means allocate() was never called (or computed an
// empty size), so there is nothing to cudaFree; skipping the call avoids
// touching the CUDA runtime during teardown when no allocation exists.
// NOTE(review): this guards on scratchSize rather than d_scratch —
// presumably both are zero-initialized in the (unseen) struct header; confirm.
~ScratchDeviceMem() {
if (scratchSize > 0)
HANDLE_CUDA_ERROR(cudaFree(d_scratch));
}
};

/// Initialize `cutensornet` MPI Comm
Expand Down
23 changes: 23 additions & 0 deletions unittests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,29 @@ if(TARGET nvqir-tensornet)
message(STATUS "Building cutensornet backend tests.")
create_tests_with_backend(tensornet "")
create_tests_with_backend(tensornet-mps "")
if (MPI_CXX_FOUND)
# Count the number of GPUs via nvidia-smi (if available on this machine).
find_program(NVIDIA_SMI "nvidia-smi")
if(NVIDIA_SMI)
execute_process(COMMAND bash -c "nvidia-smi --list-gpus | wc -l" OUTPUT_VARIABLE NGPUS)
# Only build this test if at least 2 GPUs are present, since the MPI
# test launches 2 ranks, each expected to drive its own device.
if (${NGPUS} GREATER_EQUAL 2)
message(STATUS "Building cutensornet MPI tests.")
add_executable(test_tensornet_mpi mpi/tensornet_mpi_tester.cpp)
# GNU ld may drop the backend library unless --no-as-needed is passed.
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE)
target_link_options(test_tensornet_mpi PRIVATE -Wl,--no-as-needed)
endif()
target_link_libraries(test_tensornet_mpi
PRIVATE
cudaq
cudaq-platform-default
nvqir-tensornet
gtest)
# Run the test under mpiexec with 2 ranks; label it so CI can select
# or exclude it based on GPU/multi-GPU availability.
add_test(NAME TensornetMPITest COMMAND ${MPIEXEC} --allow-run-as-root -np 2 ${CMAKE_BINARY_DIR}/unittests/test_tensornet_mpi)
set_tests_properties(TensornetMPITest PROPERTIES LABELS "gpu_required;mgpus_required")
endif() # NGPUS
endif() # NVIDIA_SMI
endif() # MPI_CXX_FOUND
endif()

# Create an executable for SpinOp UnitTests
Expand Down
44 changes: 44 additions & 0 deletions unittests/mpi/tensornet_mpi_tester.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*******************************************************************************
* Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/
#include <cudaq.h>
#include <gtest/gtest.h>

// Sanity check: MPI must already be initialized (done in main() before
// RUN_ALL_TESTS) and each rank reports its id.
TEST(TensornetMPITester, checkInit) {
  const bool mpiReady = cudaq::mpi::is_initialized();
  EXPECT_TRUE(mpiReady);
  const int myRank = cudaq::mpi::rank();
  std::cout << "Rank = " << myRank << "\n";
}

// End-to-end multi-GPU sampling test: build a 50-qubit GHZ state
// (H on qubit 0, then a CNOT chain) and sample it. A GHZ state should
// yield exactly two observed bitstrings (all zeros and all ones).
TEST(TensornetMPITester, checkSimple) {
constexpr std::size_t numQubits = 50;
auto kernel = []() __qpu__ {
cudaq::qvector q(numQubits);
// Superposition on the first qubit, then entangle down the chain.
h(q[0]);
for (int i = 0; i < numQubits - 1; i++)
x<cudaq::ctrl>(q[i], q[i + 1]);
mz(q);
};

// 100 shots; in MPI mode the tensornet backend distributes the work.
auto counts = cudaq::sample(100, kernel);

// Only rank 0 validates results — presumably counts are gathered to the
// root rank; other ranks' local views are not checked here.
if (cudaq::mpi::rank() == 0) {
// GHZ: exactly two distinct outcomes expected.
EXPECT_EQ(2, counts.size());

for (auto &[bits, count] : counts) {
printf("Observed: %s, %lu\n", bits.data(), count);
// Every sampled bitstring must cover all 50 qubits.
EXPECT_EQ(numQubits, bits.size());
}
}
}

// Test driver: gtest consumes its command-line flags first, then MPI is
// brought up for the duration of the test run and torn down before exit.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  cudaq::mpi::initialize();
  const int status = RUN_ALL_TESTS();
  cudaq::mpi::finalize();
  return status;
}

0 comments on commit 9e0b590

Please sign in to comment.