Skip to content

Commit

Permalink
[JIT] Hook everything up to use the new argument synthesis (#2084)
Browse files Browse the repository at this point in the history
* [JIT] Hook everything up to use the new argument synthesis

Add option to nvq++ to use the new kernel launcher.

Add code to the runtime to implement the new launch sequence. This
receives the new vector of arguments protocol, uses the new argument
conversion, and then calls the new argument synthesis pass to specialize
the kernel for JIT compilation.

Add a couple of tests to smoke test this new implementation.

* Add missing override.
Add ArgumentConversion module to the Python library to resolve symbols.
  • Loading branch information
schweitzpgi authored Aug 15, 2024
1 parent a817c65 commit 78f0715
Show file tree
Hide file tree
Showing 15 changed files with 146 additions and 25 deletions.
6 changes: 3 additions & 3 deletions include/cudaq/Optimizer/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ inline std::unique_ptr<mlir::Pass> createPySynthCallableBlockArgs() {
/// Helper function to build an argument synthesis pass. The names of the
/// functions and the substitutions text can be built as an unzipped pair of
/// lists.
std::unique_ptr<mlir::Pass> createArgumentSynthesisPass(
const mlir::ArrayRef<mlir::StringRef> &funcNames,
const mlir::ArrayRef<mlir::StringRef> &substitutions);
std::unique_ptr<mlir::Pass>
createArgumentSynthesisPass(mlir::ArrayRef<mlir::StringRef> funcNames,
mlir::ArrayRef<mlir::StringRef> substitutions);

// declarative passes
#define GEN_PASS_DECL
Expand Down
23 changes: 17 additions & 6 deletions lib/Optimizer/Transforms/ArgumentSynthesis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,9 @@ class ArgumentSynthesisPass
assert(*substMod && "module must have been created");

// 2. Go through the Module and process each substitution.
std::vector<bool> processedArgs(func.getFunctionType().getNumInputs());
std::vector<std::tuple<unsigned, Value, Value>> replacements;
SmallVector<bool> processedArgs(func.getFunctionType().getNumInputs());
SmallVector<std::tuple<unsigned, Value, Value>> replacements;
BitVector replacedArgs(processedArgs.size());
for (auto &op : *substMod) {
auto subst = dyn_cast<cudaq::cc::ArgumentSubstitutionOp>(op);
if (!subst) {
Expand Down Expand Up @@ -103,6 +104,17 @@ class ArgumentSynthesisPass
// OK, substitute the code for the argument.
Block &entry = func.getRegion().front();
processedArgs[pos] = true;
if (subst.getBody().front().empty()) {
// No code is present. Erase the argument if it is not used.
const auto numUses =
std::distance(entry.getArgument(pos).getUses().begin(),
entry.getArgument(pos).getUses().end());
LLVM_DEBUG(llvm::dbgs() << "maybe erasing an unused argument ("
<< std::to_string(numUses) << ")\n");
if (numUses == 0)
replacedArgs.set(pos);
continue;
}
OpBuilder builder{ctx};
Block *splitBlock = entry.splitBlock(entry.begin());
builder.setInsertionPointToEnd(&entry);
Expand All @@ -126,7 +138,6 @@ class ArgumentSynthesisPass
// function is still dead and can be removed by a DCE.

// 3. Replace the block argument values with the freshly inserted new code.
BitVector replacedArgs(processedArgs.size());
for (auto [pos, fromVal, toVal] : replacements) {
replacedArgs.set(pos);
fromVal.replaceAllUsesWith(toVal);
Expand All @@ -142,9 +153,9 @@ class ArgumentSynthesisPass
// Helper function that takes an unzipped pair of lists of function names and
// substitution code strings. This is meant to make adding this pass to a
// pipeline easier from within a tool (such as the JIT compiler).
std::unique_ptr<mlir::Pass> cudaq::opt::createArgumentSynthesisPass(
const ArrayRef<StringRef> &funcNames,
const ArrayRef<StringRef> &substitutions) {
std::unique_ptr<mlir::Pass>
cudaq::opt::createArgumentSynthesisPass(ArrayRef<StringRef> funcNames,
ArrayRef<StringRef> substitutions) {
SmallVector<std::string> pairs;
if (funcNames.size() == substitutions.size())
for (auto [name, text] : llvm::zip(funcNames, substitutions))
Expand Down
3 changes: 1 addition & 2 deletions lib/Optimizer/Transforms/GenKernelExecution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1329,8 +1329,7 @@ class GenerateKernelExecution
Value vecArgPtrs;
if (isCodegenArgumentGather(codegenKind)) {
// 1) Allocate and initialize a std::vector<void*> object.
const unsigned count =
cudaq::cc::numberOfHiddenArgs(addThisPtr, hiddenSRet);
const unsigned count = devFuncTy.getInputs().size();
auto stdVec = builder.create<cudaq::cc::AllocaOp>(
loc, cudaq::opt::factory::stlVectorType(ptrI8Ty));
auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, ptrI8Ty, count);
Expand Down
1 change: 1 addition & 0 deletions python/extension/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ declare_mlir_python_extension(CUDAQuantumPythonSources.Extension
../../runtime/cudaq/platform/common/QuantumExecutionQueue.cpp
../../runtime/cudaq/platform/default/rest_server/RemoteRuntimeClient.cpp
../../runtime/cudaq/platform/orca/OrcaQPU.cpp
../../runtime/common/ArgumentConversion.cpp

EMBED_CAPI_LINK_LIBS
CUDAQuantumMLIRCAPI
Expand Down
67 changes: 59 additions & 8 deletions runtime/common/BaseRemoteRESTQPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#pragma once

#include "common/ArgumentConversion.h"
#include "common/Environment.h"
#include "common/ExecutionContext.h"
#include "common/Executor.h"
Expand All @@ -17,6 +18,7 @@
#include "common/RuntimeMLIR.h"
#include "cudaq.h"
#include "cudaq/Frontend/nvqpp/AttributeNames.h"
#include "cudaq/Optimizer/Builder/Runtime.h"
#include "cudaq/Optimizer/CodeGen/OpenQASMEmitter.h"
#include "cudaq/Optimizer/CodeGen/Passes.h"
#include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
Expand Down Expand Up @@ -112,7 +114,8 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
/// @brief Invoke the kernel in the JIT engine
void invokeJITKernel(mlir::ExecutionEngine *jit,
const std::string &kernelName) {
auto funcPtr = jit->lookup(std::string("__nvqpp__mlirgen__") + kernelName);
auto funcPtr = jit->lookup(std::string(cudaq::runtime::cudaqGenPrefixName) +
kernelName);
if (!funcPtr) {
throw std::runtime_error(
"cudaq::builder failed to get kernelReg function.");
Expand Down Expand Up @@ -347,12 +350,24 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
return output_names;
}

/// Lower Quake code when only the legacy packed-argument pointer is
/// available. Delegates to the common overload with an empty raw-args list.
std::vector<cudaq::KernelExecution>
lowerQuakeCode(const std::string &kernelName, void *kernelArgs) {
  const std::vector<void *> noRawArgs;
  return lowerQuakeCode(kernelName, kernelArgs, noRawArgs);
}

/// Lower Quake code when only the raw argument-pointer vector is available.
/// Delegates to the common overload with a null legacy argument pointer.
std::vector<cudaq::KernelExecution>
lowerQuakeCode(const std::string &kernelName,
               const std::vector<void *> &rawArgs) {
  return lowerQuakeCode(kernelName, /*kernelArgs=*/nullptr, rawArgs);
}

/// @brief Extract the Quake representation for the given kernel name and
/// lower it to the code format required for the specific backend. The
/// lowering process is controllable via the configuration file in the
/// platform directory for the targeted backend.
std::vector<cudaq::KernelExecution>
lowerQuakeCode(const std::string &kernelName, void *kernelArgs) {
lowerQuakeCode(const std::string &kernelName, void *kernelArgs,
const std::vector<void *> &rawArgs) {

auto [m_module, contextPtr, updatedArgs] =
extractQuakeCodeAndContext(kernelName, kernelArgs);
Expand All @@ -361,7 +376,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {

// Extract the kernel name
auto func = m_module.lookupSymbol<mlir::func::FuncOp>(
std::string("__nvqpp__mlirgen__") + kernelName);
std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName);

// Create a new Module to clone the function into
auto location = mlir::FileLineColLoc::get(&context, "<builder>", 1, 1);
Expand Down Expand Up @@ -402,10 +417,26 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
throw std::runtime_error("Remote rest platform Quake lowering failed.");
};

if (updatedArgs) {
cudaq::info("Run Quake Synth.\n");
if (!rawArgs.empty() || updatedArgs) {
mlir::PassManager pm(&context);
pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
if (!rawArgs.empty()) {
cudaq::info("Run Argument Synth.\n");
opt::ArgumentConverter argCon(kernelName, moduleOp);
argCon.gen(rawArgs);
std::string kernName = cudaq::runtime::cudaqGenPrefixName + kernelName;
mlir::StringRef sr{kernName};
mlir::SmallVector<mlir::StringRef> kernels = {sr};
std::string substBuff;
llvm::raw_string_ostream ss(substBuff);
ss << argCon.getSubstitutionModule();
mlir::StringRef su{substBuff};
mlir::SmallVector<mlir::StringRef> substs = {su};
pm.addNestedPass<mlir::func::FuncOp>(
opt::createArgumentSynthesisPass(kernels, substs));
} else if (updatedArgs) {
cudaq::info("Run Quake Synth.\n");
pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
}
pm.addPass(mlir::createCanonicalizerPass());
if (disableMLIRthreading || enablePrintMLIREachPass)
moduleOp.getContext()->disableMultithreading();
Expand All @@ -418,7 +449,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
runPassPipeline(passPipelineConfig, moduleOp);

auto entryPointFunc = moduleOp.lookupSymbol<mlir::func::FuncOp>(
std::string("__nvqpp__mlirgen__") + kernelName);
std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName);
std::vector<std::size_t> mapping_reorder_idx;
if (auto mappingAttr = dyn_cast_if_present<mlir::ArrayAttr>(
entryPointFunc->getAttr("mapping_reorder_idx"))) {
Expand Down Expand Up @@ -448,7 +479,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {

// Get the ansatz
auto ansatz = moduleOp.lookupSymbol<mlir::func::FuncOp>(
std::string("__nvqpp__mlirgen__") + kernelName);
std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName);

// Create a new Module to clone the ansatz into it
auto tmpModuleOp = builder.create<mlir::ModuleOp>();
Expand Down Expand Up @@ -513,6 +544,21 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
return codes;
}

/// Launch a kernel via the streamlined raw-argument protocol: lower the
/// Quake code (running argument synthesis over \p rawArgs) and hand the
/// resulting codes to the common completion path.
void launchKernel(const std::string &kernelName,
                  const std::vector<void *> &rawArgs) override {
  cudaq::info("launching remote rest kernel ({})", kernelName);

  // TODO future iterations of this should support non-void return types.
  if (!executionContext)
    throw std::runtime_error(
        "Remote rest execution can only be performed via cudaq::sample(), "
        "cudaq::observe(), or cudaq::draw().");

  // Get the Quake code, lowered according to config file, and finish the
  // launch through the shared completion routine.
  completeLaunchKernel(kernelName, lowerQuakeCode(kernelName, rawArgs));
}

/// @brief Launch the kernel. Extract the Quake code and lower to
/// the representation required by the targeted backend. Handle all pertinent
/// modifications for the execution context as well as asynchronous or
Expand All @@ -530,6 +576,11 @@ class BaseRemoteRESTQPU : public cudaq::QPU {

// Get the Quake code, lowered according to config file.
auto codes = lowerQuakeCode(kernelName, args);
completeLaunchKernel(kernelName, std::move(codes));
}

void completeLaunchKernel(const std::string &kernelName,
std::vector<cudaq::KernelExecution> &&codes) {

// After performing lowerQuakeCode, check to see if we are simply drawing
// the circuit. If so, perform the trace here and then return.
Expand Down
5 changes: 5 additions & 0 deletions runtime/common/BaseRemoteSimulatorQPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,11 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU {
throw std::runtime_error("Failed to launch VQE. Error: " + errorMsg);
}

/// Streamlined raw-argument launch is not supported by this remote
/// simulator QPU type.
void launchKernel(const std::string &, const std::vector<void *> &) override {
  throw std::runtime_error("launch kernel on raw args not implemented");
}

void launchKernel(const std::string &name, void (*kernelFunc)(void *),
void *args, std::uint64_t voidStarSize,
std::uint64_t resultOffset) override {
Expand Down
5 changes: 5 additions & 0 deletions runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ class DefaultQPU : public cudaq::QPU {
kernelFunc(args);
}

/// Library-mode execution never dispatches through the raw-argument entry
/// point; landing here indicates a launcher selection error.
void launchKernel(const std::string &, const std::vector<void *> &) override {
  throw std::runtime_error("Wrong kernel launch point.");
}

/// Overrides setExecutionContext to forward it to the ExecutionManager
void setExecutionContext(cudaq::ExecutionContext *context) override {
ScopedTraceWithContext("DefaultPlatform::setExecutionContext",
Expand Down
5 changes: 5 additions & 0 deletions runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ class GPUEmulatedQPU : public cudaq::QPU {
kernelFunc(args);
}

/// Raw-argument kernel launch is not available on the GPU-emulated QPU.
void launchKernel(const std::string &, const std::vector<void *> &) override {
  throw std::runtime_error("not implemented");
}

/// Overrides setExecutionContext to forward it to the ExecutionManager
void setExecutionContext(cudaq::ExecutionContext *context) override {
cudaSetDevice(qpu_id);
Expand Down
6 changes: 5 additions & 1 deletion runtime/cudaq/platform/orca/OrcaQPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,10 @@ class OrcaRemoteRESTQPU : public cudaq::QPU {
void launchKernel(const std::string &kernelName, void (*kernelFunc)(void *),
void *args, std::uint64_t voidStarSize,
std::uint64_t resultOffset) override;
/// Streamlined raw-argument launch is not supported by the ORCA REST QPU.
void launchKernel(const std::string &, const std::vector<void *> &) override {
  throw std::runtime_error("launch kernel on raw args not implemented");
}
};

/// @brief This setTargetBackend override is in charge of reading the
Expand Down Expand Up @@ -321,4 +325,4 @@ cudaq::RestHeaders OrcaRemoteRESTQPU::getHeaders() {

} // namespace

CUDAQ_REGISTER_TYPE(cudaq::QPU, OrcaRemoteRESTQPU, orca)
CUDAQ_REGISTER_TYPE(cudaq::QPU, OrcaRemoteRESTQPU, orca)
2 changes: 2 additions & 0 deletions runtime/cudaq/platform/qpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ class QPU : public registry::RegisteredType<QPU> {
/// as a struct-packed void pointer and its corresponding size.
virtual void launchKernel(const std::string &name, void (*kernelFunc)(void *),
void *args, std::uint64_t, std::uint64_t) = 0;
virtual void launchKernel(const std::string &name,
const std::vector<void *> &rawArgs) = 0;

/// Launch serialized code for remote execution. Subtypes that support this
/// should override this function.
Expand Down
22 changes: 22 additions & 0 deletions runtime/cudaq/platform/quantum_platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,19 @@ void quantum_platform::launchKernel(std::string kernelName,
qpu->launchKernel(kernelName, kernelFunc, args, voidStarSize, resultOffset);
}

/// Launch \p kernelName through the raw-argument protocol, routing to the
/// QPU bound to the calling thread (QPU 0 when no binding exists).
void quantum_platform::launchKernel(std::string kernelName,
                                    const std::vector<void *> &rawArgs) {
  const auto threadKey =
      std::hash<std::thread::id>{}(std::this_thread::get_id());
  std::size_t qpuId = 0;
  if (const auto entry = threadToQpuId.find(threadKey);
      entry != threadToQpuId.end())
    qpuId = entry->second;

  platformQPUs[qpuId]->launchKernel(kernelName, rawArgs);
}

void quantum_platform::launchSerializedCodeExecution(
const std::string &name,
cudaq::SerializedCodeExecutionContext &serializeCodeExecutionObject) {
Expand Down Expand Up @@ -201,3 +214,12 @@ void cudaq::altLaunchKernel(const char *kernelName, void (*kernelFunc)(void *),
platform.launchKernel(kernName, kernelFunc, kernelArgs, argsSize,
resultOffset);
}

/// C-linkage entry point for the streamlined (raw-argument) launch protocol.
/// Records a trace event with the argument count, then forwards the launch
/// to the active quantum platform.
void cudaq::streamlinedLaunchKernel(const char *kernelName,
                                    const std::vector<void *> &rawArgs) {
  const std::size_t numArgs = rawArgs.size();
  ScopedTraceWithContext("streamlinedLaunchKernel", kernelName, numArgs);
  auto &platform = *cudaq::getQuantumPlatformInternal();
  platform.launchKernel(std::string{kernelName}, rawArgs);
}
13 changes: 13 additions & 0 deletions runtime/cudaq/platform/quantum_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ class quantum_platform {
void launchKernel(std::string kernelName, void (*kernelFunc)(void *),
void *args, std::uint64_t voidStarSize,
std::uint64_t resultOffset);
void launchKernel(std::string kernelName, const std::vector<void *> &);

// This method is the hook for executing SerializedCodeExecutionContext
// objects.
Expand Down Expand Up @@ -212,8 +213,20 @@ class quantum_platform {
/// tied to the quantum platform instance somehow. Note that the compiler cannot
/// provide that information.
extern "C" {
// Client-server (legacy) interface.
void altLaunchKernel(const char *kernelName, void (*kernel)(void *), void *args,
std::uint64_t argsSize, std::uint64_t resultOffset);
// Streamlined interface for launching kernels. Argument synthesis and JIT
// compilation *must* happen on the local machine.
void streamlinedLaunchKernel(const char *kernelName,
const std::vector<void *> &rawArgs);
// Hybrid of the client-server and streamlined approaches. Letting JIT
// compilation happen either early or late and can handle return values from
// each kernel launch.
void hybridLaunchKernel(const char *kernelName, void (*kernel)(void *),
void *args, std::uint64_t argsSize,
std::uint64_t resultOffset,
const std::vector<void *> &rawArgs);
}

} // namespace cudaq
Expand Down
1 change: 1 addition & 0 deletions targettests/execution/test-6.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

// REQUIRES: c++20
// RUN: nvq++ --target quantinuum --emulate %s -o %t && %t | FileCheck %s
// RUN: nvq++ -fkernel-exec-kind=2 --target quantinuum --emulate %s -o %t && %t | FileCheck %s

#include <cudaq.h>
#include <iostream>
Expand Down
6 changes: 2 additions & 4 deletions targettests/execution/to_integer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/

// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t
// RUN: if [ $(echo %cpp_std | cut -c4- ) -ge 20 ]; then \
// RUN: nvq++ --enable-mlir %s -o %t && %t; \
// RUN: fi
// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t
// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 --enable-mlir %s -o %t && %t

#include <cudaq.h>

Expand Down
6 changes: 5 additions & 1 deletion tools/nvqpp/nvq++.in
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ function f_option_handling {
-flower-to-cfg)
ENABLE_LOWER_TO_CFG=true
;;
-fkernel-exec-kind=*)
KERNEL_EXECUTION_KIND="{codegen=${1#*=}}"
;;
*)
# Pass any unrecognized options on to the clang++ tool.
ARGS="${ARGS} $1"
Expand Down Expand Up @@ -325,6 +328,7 @@ SHOW_VERSION=false
ENABLE_UNWIND_LOWERING=true
ENABLE_DEVICE_CODE_LOADERS=true
ENABLE_KERNEL_EXECUTION=true
KERNEL_EXECUTION_KIND=
ENABLE_AGGRESSIVE_EARLY_INLINE=true
ENABLE_LOWER_TO_CFG=true
ENABLE_APPLY_SPECIALIZATION=true
Expand Down Expand Up @@ -680,7 +684,7 @@ if ${ENABLE_APPLY_SPECIALIZATION}; then
fi
if ${ENABLE_KERNEL_EXECUTION}; then
RUN_OPT=true
OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "kernel-execution")
OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "kernel-execution${KERNEL_EXECUTION_KIND}")
fi
if ${ENABLE_AGGRESSIVE_EARLY_INLINE}; then
RUN_OPT=true
Expand Down

0 comments on commit 78f0715

Please sign in to comment.