From 9e0b5908c1c6e53cc2fb19adfa04ac38df1634cd Mon Sep 17 00:00:00 2001
From: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>
Date: Sat, 18 Jan 2025 07:14:39 +1100
Subject: [PATCH] Fix a bug in tensornet backend scratch pad allocation in
 multi-GPU mode (#2516)

* Fix a bug in default init of scratchpad: it must allocate memory after
  we've set the device

Signed-off-by: Thien Nguyen

* Add test

Signed-off-by: Thien Nguyen

* Add a check to prevent multiple allocate calls

Signed-off-by: Thien Nguyen

---------

Signed-off-by: Thien Nguyen
---
 .../cutensornet/simulator_cutensornet.cpp   |  2 +
 runtime/nvqir/cutensornet/tensornet_utils.h | 13 +++++-
 unittests/CMakeLists.txt                    | 23 ++++++++++
 unittests/mpi/tensornet_mpi_tester.cpp      | 44 +++++++++++++++++++
 4 files changed, 80 insertions(+), 2 deletions(-)
 create mode 100644 unittests/mpi/tensornet_mpi_tester.cpp

diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
index 676c9a38c5..3382ae1586 100644
--- a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
+++ b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
@@ -22,6 +22,8 @@ SimulatorTensorNetBase::SimulatorTensorNetBase()
       cudaq::mpi::is_initialized() ? cudaq::mpi::rank() % numDevices : 0;
   HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
   HANDLE_CUTN_ERROR(cutensornetCreate(&m_cutnHandle));
+  // The scratch pad must be allocated after we have selected the device.
+  scratchPad.allocate();
 }
 
 static std::vector<std::complex<double>>
diff --git a/runtime/nvqir/cutensornet/tensornet_utils.h b/runtime/nvqir/cutensornet/tensornet_utils.h
index 6d11f3bebb..fd605fab80 100644
--- a/runtime/nvqir/cutensornet/tensornet_utils.h
+++ b/runtime/nvqir/cutensornet/tensornet_utils.h
@@ -72,7 +72,12 @@ struct ScratchDeviceMem {
         2; // use half of available memory with alignment
   }
 
-  ScratchDeviceMem() {
+  // Allocate scratch device memory based on available memory
+  void allocate() {
+    if (d_scratch)
+      throw std::runtime_error(
+          "Multiple scratch device memory allocations is not allowed.");
+
     computeScratchSize();
     // Try allocate device memory
     auto errCode = cudaMalloc(&d_scratch, scratchSize);
@@ -86,7 +91,11 @@ struct ScratchDeviceMem {
       HANDLE_CUDA_ERROR(errCode);
     }
   }
-  ~ScratchDeviceMem() { HANDLE_CUDA_ERROR(cudaFree(d_scratch)); }
+
+  ~ScratchDeviceMem() {
+    if (scratchSize > 0)
+      HANDLE_CUDA_ERROR(cudaFree(d_scratch));
+  }
 };
 
 /// Initialize `cutensornet` MPI Comm
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index bc7f6581b8..9ff26f7510 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -198,6 +198,29 @@ if(TARGET nvqir-tensornet)
   message(STATUS "Building cutensornet backend tests.")
   create_tests_with_backend(tensornet "")
   create_tests_with_backend(tensornet-mps "")
+  if (MPI_CXX_FOUND)
+    # Count the number of GPUs
+    find_program(NVIDIA_SMI "nvidia-smi")
+    if(NVIDIA_SMI)
+      execute_process(COMMAND bash -c "nvidia-smi --list-gpus | wc -l" OUTPUT_VARIABLE NGPUS)
+      # Only build this test if we have more than 1 GPUs
+      if (${NGPUS} GREATER_EQUAL 2)
+        message(STATUS "Building cutensornet MPI tests.")
+        add_executable(test_tensornet_mpi mpi/tensornet_mpi_tester.cpp)
+        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE)
+          target_link_options(test_tensornet_mpi PRIVATE -Wl,--no-as-needed)
+        endif()
+        target_link_libraries(test_tensornet_mpi
+          PRIVATE
+          cudaq
+          cudaq-platform-default
+          nvqir-tensornet
+          gtest)
+        add_test(NAME TensornetMPITest COMMAND ${MPIEXEC} --allow-run-as-root -np 2 ${CMAKE_BINARY_DIR}/unittests/test_tensornet_mpi)
+        set_tests_properties(TensornetMPITest PROPERTIES LABELS "gpu_required;mgpus_required")
+      endif() # NGPUS
+    endif() # NVIDIA_SMI
+  endif() # MPI_CXX_FOUND
 endif()
 
 # Create an executable for SpinOp UnitTests
diff --git a/unittests/mpi/tensornet_mpi_tester.cpp b/unittests/mpi/tensornet_mpi_tester.cpp
new file mode 100644
index 0000000000..a770461117
--- /dev/null
+++ b/unittests/mpi/tensornet_mpi_tester.cpp
@@ -0,0 +1,44 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+#include <cudaq.h>
+#include <gtest/gtest.h>
+
+TEST(TensornetMPITester, checkInit) {
+  EXPECT_TRUE(cudaq::mpi::is_initialized());
+  std::cout << "Rank = " << cudaq::mpi::rank() << "\n";
+}
+
+TEST(TensornetMPITester, checkSimple) {
+  constexpr std::size_t numQubits = 50;
+  auto kernel = []() __qpu__ {
+    cudaq::qvector q(numQubits);
+    h(q[0]);
+    for (int i = 0; i < numQubits - 1; i++)
+      x<cudaq::ctrl>(q[i], q[i + 1]);
+    mz(q);
+  };
+
+  auto counts = cudaq::sample(100, kernel);
+
+  if (cudaq::mpi::rank() == 0) {
+    EXPECT_EQ(2, counts.size());
+
+    for (auto &[bits, count] : counts) {
+      printf("Observed: %s, %lu\n", bits.data(), count);
+      EXPECT_EQ(numQubits, bits.size());
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  cudaq::mpi::initialize();
+  const auto testResult = RUN_ALL_TESTS();
+  cudaq::mpi::finalize();
+  return testResult;
+}
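
Note: a minimal sketch (not part of the patch) of the call order this fix enforces,
assuming it sits in code that includes tensornet_utils.h and the CUDA runtime, so
HANDLE_CUDA_ERROR and ScratchDeviceMem are the ones shown above; the rank-to-device
mapping mirrors the constructor changed in simulator_cutensornet.cpp.

    // Select the GPU for this MPI rank first...
    int numDevices = 0;
    HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices));
    const int deviceId =
        cudaq::mpi::is_initialized() ? cudaq::mpi::rank() % numDevices : 0;
    HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
    // ...then allocate the scratch pad, so cudaMalloc lands on that device.
    ScratchDeviceMem scratchPad;
    scratchPad.allocate(); // a second allocate() call now throws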