NVIDIA · bmhowe23 · Jan 24, 2025 · Jan 23, 2025 · Jan 23, 2025 · Jan 23, 2025
@@ -1,2 +1,2 @@
 nvidia-mgpu-repo: cuda-quantum/cuquantum-mgpu.git
-nvidia-mgpu-commit: 806e7fe5c459f52296ae0d3bd8bc57c3ea806152
+nvidia-mgpu-commit: fdea89e034c7ac6b6b3d0e239d8924bcd2c08f96
diff --git a/docker/build/devdeps.Dockerfile b/docker/build/devdeps.Dockerfile
@@ -170,5 +170,5 @@ RUN apt-get update && apt-get install -y --no-install-recommends python3 python3
     && python3 -m pip install --no-cache-dir \
         ipython==8.15.0 pandoc==2.3 sphinx==5.3.0 sphinx_rtd_theme==1.2.0 sphinx-reredirects==0.1.2 \
         sphinx-copybutton==0.5.2 sphinx_inline_tabs==2023.4.21 enum-tools[sphinx] breathe==4.34.0 \
-        nbsphinx==0.9.2 sphinx_gallery==0.13.0 myst-parser==1.0.0 ipykernel==6.29.4 notebook==7.1.3 \
+        nbsphinx==0.9.2 sphinx_gallery==0.13.0 myst-parser==1.0.0 ipykernel==6.29.4 notebook==7.3.2 \
         ipywidgets==8.1.5
diff --git a/docker/release/cudaq.Dockerfile b/docker/release/cudaq.Dockerfile
@@ -62,7 +62,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && ln -s /bin/python3 /bin/python
 RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ python3-dev \
     # Ref: https://github.com/qutip/qutip/issues/2412
-    && python3 -m pip install --no-cache-dir notebook==7.1.3 "qutip<5" matplotlib \
+    && python3 -m pip install --no-cache-dir notebook==7.3.2 "qutip<5" matplotlib \
     && apt-get remove -y gcc g++ python3-dev \
     && apt-get autoremove -y && apt-get clean && rm -rf /var/lib/apt/lists/*
 

diff --git a/docker/release/cudaq.ext.Dockerfile b/docker/release/cudaq.ext.Dockerfile
@@ -44,7 +44,7 @@ RUN cuda_version_suffix=$(echo ${CUDA_VERSION} | tr . -) && \
     apt-get install -y --no-install-recommends curl jq 
 RUN if [ -x "$(command -v pip)" ]; then \
         apt-get install -y --no-install-recommends gcc libpython3-dev \
-        && pip install --no-cache-dir jupyterlab; \
+        && pip install --no-cache-dir jupyterlab==4.3.4; \
         if [ -n "$MPI_ROOT" ]; then \
             pip install --no-cache-dir mpi4py~=3.1; \
         fi; \

diff --git a/docs/sphinx/api/languages/python_api.rst b/docs/sphinx/api/languages/python_api.rst
@@ -157,6 +157,8 @@ Data Types
 .. autoclass:: cudaq.operator.cudm_state.CuDensityMatState
     :members:
 
+.. autoclass:: cudaq.operator.helpers.InitialState
+
 .. autofunction:: cudaq.operator.cudm_state.to_cupy_array
 
 .. autoclass:: cudaq::SampleResult

diff --git a/docs/sphinx/examples/python/dynamics/mgmn/initial_state.py b/docs/sphinx/examples/python/dynamics/mgmn/initial_state.py
@@ -0,0 +1,71 @@
+import cudaq
+from cudaq import operators, spin, Schedule, RungeKuttaIntegrator
+
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+
+# On a system with multiple GPUs, `mpiexec` can be used as follows:
+# `mpiexec -np <N> python3 multi_gpu.py `
+cudaq.mpi.initialize()
+
+# Set the target to our dynamics simulator
+cudaq.set_target("dynamics")
+
+# Large number of spins
+N = 20
+dimensions = {}
+for i in range(N):
+    dimensions[i] = 2
+
+# Observable is the average magnetization operator
+avg_magnetization_op = operators.zero()
+for i in range(N):
+    avg_magnetization_op += (spin.z(i) / N)
+
+# Arbitrary coupling constant
+g = 1.0
+# Construct the Hamiltonian
+H = operators.zero()
+for i in range(N):
+    H += 2 * np.pi * spin.x(i)
+    H += 2 * np.pi * spin.y(i)
+for i in range(N - 1):
+    H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1)
+    H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1)
+
+steps = np.linspace(0.0, 1, 200)
+schedule = Schedule(steps, ["time"])
+
+# Initial state (expressed as an enum)
+psi0 = cudaq.operator.InitialState.ZERO
+# This can also be used to initialize a uniformly-distributed wave-function instead.
+# `psi0 = cudaq.operator.InitialState.UNIFORM`
+
+# Run the simulation
+evolution_result = cudaq.evolve(H,
+                                dimensions,
+                                schedule,
+                                psi0,
+                                observables=[avg_magnetization_op],
+                                collapse_operators=[],
+                                store_intermediate_results=True,
+                                integrator=RungeKuttaIntegrator())
+
+exp_val = [
+    exp_vals[0].expectation()
+    for exp_vals in evolution_result.expectation_values()
+]
+
+if cudaq.mpi.rank() == 0:
+    # Plot the results
+    fig = plt.figure(figsize=(12, 6))
+    plt.plot(steps, exp_val)
+    plt.ylabel("Average Magnetization")
+    plt.xlabel("Time")
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)
+    fig.savefig("spin_model.png", dpi=fig.dpi)
+
+cudaq.mpi.finalize()
diff --git a/docs/sphinx/examples/python/dynamics/mgmn/multi_gpu.py b/docs/sphinx/examples/python/dynamics/mgmn/multi_gpu.py
@@ -0,0 +1,94 @@
+import cudaq
+from cudaq import operators, spin, Schedule, RungeKuttaIntegrator
+
+import numpy as np
+import cupy as cp
+import matplotlib.pyplot as plt
+import os
+
+# On a system with multiple GPUs, `mpiexec` can be used as follows:
+# `mpiexec -np <N> python3 multi_gpu.py `
+cudaq.mpi.initialize()
+
+# Set the target to our dynamics simulator
+cudaq.set_target("dynamics")
+
+# In this example, we solve the Quantum Heisenberg model (https://en.wikipedia.org/wiki/Quantum_Heisenberg_model),
+# which exhibits the so-called quantum quench effect.
+# e.g., see `Quantum quenches in the anisotropic spin-1/2 Heisenberg chain: different approaches to many-body dynamics far from equilibrium`
+# (New J. Phys. 12 055017)
+# Large number of spins
+N = 21
+dimensions = {}
+for i in range(N):
+    dimensions[i] = 2
+
+# Initial state: alternating spin up and down
+spin_state = ''
+for i in range(N):
+    spin_state += str(int(i % 2))
+
+# Observable is the staggered magnetization operator
+staggered_magnetization_op = operators.zero()
+for i in range(N):
+    if i % 2 == 0:
+        staggered_magnetization_op += spin.z(i)
+    else:
+        staggered_magnetization_op -= spin.z(i)
+
+staggered_magnetization_op /= N
+
+observe_results = []
+for g in [0.25, 4.0]:
+    # Heisenberg model spin coupling strength
+    Jx = 1.0
+    Jy = 1.0
+    Jz = g
+
+    # Construct the Hamiltonian
+    H = operators.zero()
+
+    for i in range(N - 1):
+        H += Jx * spin.x(i) * spin.x(i + 1)
+        H += Jy * spin.y(i) * spin.y(i + 1)
+        H += Jz * spin.z(i) * spin.z(i + 1)
+
+    steps = np.linspace(0.0, 1, 100)
+    schedule = Schedule(steps, ["time"])
+
+    # Prepare the initial state vector
+    psi0_ = cp.zeros(2**N, dtype=cp.complex128)
+    psi0_[int(spin_state, 2)] = 1.0
+    psi0 = cudaq.State.from_data(psi0_)
+
+    # Run the simulation
+    evolution_result = cudaq.evolve(H,
+                                    dimensions,
+                                    schedule,
+                                    psi0,
+                                    observables=[staggered_magnetization_op],
+                                    collapse_operators=[],
+                                    store_intermediate_results=True,
+                                    integrator=RungeKuttaIntegrator())
+
+    exp_val = [
+        exp_vals[0].expectation()
+        for exp_vals in evolution_result.expectation_values()
+    ]
+
+    observe_results.append((g, exp_val))
+
+if cudaq.mpi.rank() == 0:
+    # Plot the results
+    fig = plt.figure(figsize=(12, 6))
+    for g, exp_val in observe_results:
+        plt.plot(steps, exp_val, label=f'$ g = {g}$')
+    plt.legend(fontsize=16)
+    plt.ylabel("Staggered Magnetization")
+    plt.xlabel("Time")
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)
+    fig.savefig("heisenberg_model.png", dpi=fig.dpi)
+
+cudaq.mpi.finalize()
diff --git a/docs/sphinx/snippets/python/using/backends/dynamics.py b/docs/sphinx/snippets/python/using/backends/dynamics.py
@@ -172,3 +172,44 @@ def compute_value(param_name, step_idx):
 time_dependence = parameter_values(numpy.linspace(0, 1, 100))
 cudaq.evolve(system_operator, system_dimensions, time_dependence, initial_state)
 #[End Schedule2]
+
+import cudaq
+from cudaq import operators, spin, Schedule, RungeKuttaIntegrator
+
+N = 4
+dimensions = {}
+for i in range(N):
+    dimensions[i] = 2
+g = 1.0
+H = operators.zero()
+for i in range(N):
+    H += 2 * np.pi * spin.x(i)
+    H += 2 * np.pi * spin.y(i)
+for i in range(N - 1):
+    H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1)
+    H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1)
+
+steps = np.linspace(0.0, 1, 200)
+schedule = Schedule(steps, ["time"])
+
+#[Begin MPI]
+cudaq.mpi.initialize()
+
+# Set the target to our dynamics simulator
+cudaq.set_target("dynamics")
+
+# Initial state (expressed as an enum)
+psi0 = cudaq.operator.InitialState.ZERO
+
+# Run the simulation
+evolution_result = cudaq.evolve(H,
+                                dimensions,
+                                schedule,
+                                psi0,
+                                observables=[],
+                                collapse_operators=[],
+                                store_intermediate_results=True,
+                                integrator=RungeKuttaIntegrator())
+
+cudaq.mpi.finalize()
+#[End MPI]
diff --git a/docs/sphinx/using/backends/dynamics.rst b/docs/sphinx/using/backends/dynamics.rst
@@ -84,6 +84,8 @@ For example, we can plot the Pauli expectation value for the above simulation as
 In particular, for each time step, `evolve` captures an array of expectation values, one for each  
 observable. Hence, we convert them into sequences for plotting purposes.
 
+Examples that illustrate how to use the ``dynamics`` target are available 
+in the `CUDA-Q repository <https://github.com/NVIDIA/cuda-quantum/tree/main/docs/sphinx/examples/python/dynamics>`__. 
 
 Operator
 +++++++++++
@@ -272,4 +274,45 @@ backend target.
     If the output is a '`None`' string, it indicates that your Torch installation does not support CUDA.
     In this case, you need to install a CUDA-enabled Torch package via other mechanisms, e.g., building Torch from source or
     using their Docker images.
-
+
+.. _cudensitymat_mgmn:
+
+Multi-GPU Multi-Node Execution
++++++++++++++++++++++++++++++++
+
+The CUDA-Q ``dynamics`` target supports parallel execution on multiple GPUs.
+To enable parallel execution, the application must initialize MPI as follows.
+
+
+.. tab:: Python
+
+  .. literalinclude:: ../../snippets/python/using/backends/dynamics.py
+        :language: python
+        :start-after: [Begin MPI]
+        :end-before: [End MPI]
+
+  .. code:: bash 
+
+        mpiexec -np <N> python3 program.py 
+
+  where ``N`` is the number of processes.
+
+
+By initializing the MPI execution environment (via `cudaq.mpi.initialize()`) in the application code and
+invoking it via an MPI launcher, we have activated the multi-node multi-GPU feature of the ``dynamics`` target.
+Specifically, it will detect the number of processes (GPUs) and distribute the computation across all available GPUs.
+
+
+.. note::
+    The number of MPI processes must be a power of 2, one GPU per process.
+
+.. note::
+    Not all integrators are capable of handling distributed state. Errors will be raised if parallel execution is activated 
+    but the selected integrator does not support distributed state. 
+
+.. warning:: 
+    As of cuQuantum version 24.11, there are a couple of `known limitations <https://docs.nvidia.com/cuda/cuquantum/24.11.0/cudensitymat/index.html>`__ for parallel execution:
+
+    - Computing the expectation value of a mixed quantum state is not supported. Thus, `collapse_operators` are not supported if expectation calculation is required.
+
+    - Some combinations of quantum states and quantum many-body operators are not supported. Errors will be raised in those cases. 
diff --git a/docs/sphinx/using/backends/simulators.rst b/docs/sphinx/using/backends/simulators.rst
@@ -482,13 +482,26 @@ Specific aspects of the simulation can be configured by setting the following of
 * **`OMP_PLACES=cores`**: Set this environment variable to improve CPU parallelization.
 * **`OMP_NUM_THREADS=X`**: To enable CPU parallelization, set X to `NUMBER_OF_CORES_PER_NODE/NUMBER_OF_GPUS_PER_NODE`.
 * **`CUDAQ_TENSORNET_CONTROLLED_RANK=X`**: Specify the number of controlled qubits whereby the full tensor body of the controlled gate is expanded. If the number of controlled qubits is greater than this value, the gate is applied as a controlled tensor operator to the tensor network state. Default value is 1.
+* **`CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=X`**: Set this environment variable to `TRUE` (`ON`) or `FALSE` (`OFF`) to enable or disable contraction path reuse when computing expectation values. Default is `OFF`.
 
 .. note:: 
 
   This backend requires an NVIDIA GPU and CUDA runtime libraries. 
   If you do not have these dependencies installed, you may encounter an error stating `Invalid simulator requested`. 
   See the section :ref:`dependencies-and-compatibility` for more information about how to install dependencies.
 
+.. note:: 
+
+  When using contraction path reuse (`CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=TRUE`), :code:`tensornet` backends perform a single contraction path optimization with an opaque spin operator term. This path is then used to contract all the actual terms in the spin operator, hence saving the path finding time.
+
+  Because an opaque spin operator term is used as a placeholder for contraction path optimization, the resulting contraction path may be less optimal than a path optimized for the actual spin operator.
+  For instance, if the spin operator is sparse (acting on only a few qubits), a path optimized for the actual operator could simplify the contraction significantly; that benefit is lost when the path is reused.
+
+.. note:: 
+
+  :code:`tensornet` backends only return the overall expectation value for a :class:`cudaq.SpinOperator` when using the `cudaq::observe` method. 
+  Term-by-term expectation values will not be available in the resulting `ObserveResult` object.
+  If needed, these values can be computed by calling `cudaq::observe` on individual terms instead.  
 
 Matrix product state 
 +++++++++++++++++++++++++++++++++++

diff --git a/python/cudaq/operator/__init__.py b/python/cudaq/operator/__init__.py
@@ -9,5 +9,5 @@
 from .definitions import operators, spin
 from .evolution import evolve, evolve_async
 from .expressions import Operator, OperatorSum, ProductOperator, ElementaryOperator, ScalarOperator, RydbergHamiltonian
-from .helpers import NumericType
+from .helpers import NumericType, InitialState
 from .schedule import Schedule
diff --git a/python/cudaq/operator/cudm_solver.py b/python/cudaq/operator/cudm_solver.py
@@ -16,6 +16,7 @@
 from ..mlir._mlir_libs._quakeDialects import cudaq_runtime
 from .cudm_helpers import cudm, CudmStateType
 from .cudm_state import CuDensityMatState, as_cudm_state
+from .helpers import InitialState, InitialStateArgT
 from .integrator import BaseIntegrator
 from .integrators.builtin_integrators import RungeKuttaIntegrator, cuDensityMatTimeStepper
 import cupy
@@ -28,7 +29,7 @@ def evolve_dynamics(
         hamiltonian: Operator,
         dimensions: Mapping[int, int],
         schedule: Schedule,
-        initial_state: cudaq_runtime.State,
+        initial_state: InitialStateArgT,
         collapse_operators: Sequence[Operator] = [],
         observables: Sequence[Operator] = [],
         store_intermediate_results=False,
@@ -43,8 +44,21 @@ def evolve_dynamics(
     schedule.reset()
     hilbert_space_dims = tuple(dimensions[d] for d in range(len(dimensions)))
 
-    with ScopeTimer("evolve.as_cudm_state") as timer:
-        initial_state = as_cudm_state(initial_state)
+    # Check that the integrator can support distributed state if this is a distributed simulation.
+    if cudaq_runtime.mpi.is_initialized() and cudaq_runtime.mpi.num_ranks(
+    ) > 1 and integrator is not None and not integrator.support_distributed_state(
+    ):
+        raise ValueError(
+            f"Integrator {type(integrator).__name__} does not support distributed state."
+        )
+
+    if isinstance(initial_state, InitialState):
+        has_collapse_operators = len(collapse_operators) > 0
+        initial_state = CuDensityMatState.create_initial_state(
+            initial_state, hilbert_space_dims, has_collapse_operators)
+    else:
+        with ScopeTimer("evolve.as_cudm_state") as timer:
+            initial_state = as_cudm_state(initial_state)
 
     if not isinstance(initial_state, CuDensityMatState):
         raise ValueError("Unknown type")