Merge branch 'main' into main
sacpis authored Aug 11, 2024
2 parents 52e8ed9 + 7924b17 commit 084aaff
Showing 13 changed files with 382 additions and 59 deletions.
2 changes: 2 additions & 0 deletions include/cudaq/Optimizer/Builder/Runtime.h
@@ -23,5 +23,7 @@ static constexpr unsigned cudaqGenPrefixLength = sizeof(cudaqGenPrefixName) - 1;
/// compile time (see `cudaqGenPrefixName`) or it can be rewritten to call back
/// to the runtime library (and be handled at runtime).
static constexpr const char launchKernelFuncName[] = "altLaunchKernel";
static constexpr const char launchKernelVersion2FuncName[] =
"altLaunchKernelUsingLocalJIT";

} // namespace cudaq::runtime
3 changes: 3 additions & 0 deletions include/cudaq/Optimizer/Transforms/Passes.h
@@ -80,4 +80,7 @@ inline std::unique_ptr<mlir::Pass> createQuantumMemToReg() {
return createMemToReg(m2rOpt);
}

/// Name of `quake.wire_set` generated prior to mapping
static constexpr const char topologyAgnosticWiresetName[] = "wires";

} // namespace cudaq::opt
27 changes: 26 additions & 1 deletion include/cudaq/Optimizer/Transforms/Passes.td
@@ -11,6 +11,23 @@

include "mlir/Pass/PassBase.td"

def AddWireset : Pass<"add-wireset", "mlir::ModuleOp"> {
let summary = "Adds a topology-less `quake.wireset` to the module";
let description = [{
Adds a `quake.wireset` operation without topological information to the module.
}];
}

def AssignWireIndices : Pass<"assign-wire-indices", "mlir::func::FuncOp"> {
let summary = "Replaces wires with wires from a `quake.wireset`";
let description = [{
Replaces all instances of `quake.null_wire_op` with `quake.borrow_wire_op`s
from a common `quake.wireset` without any topological information.
Each wire is assigned a unique identifier (the index into the
`quake.wireset`) through this process.
}];
}
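Both passes are exposed through the pipeline names declared above: `add-wireset` runs on the module and `assign-wire-indices` runs on each `func.func`. As a rough sketch only (the pass-manager setup and dialect/pass registration are assumed to happen elsewhere), they could be scheduled from C++ through MLIR's textual pipeline parser:

    #include "mlir/Pass/PassManager.h"
    #include "mlir/Pass/PassRegistry.h"

    // Sketch: append the two new wireset passes to a pass manager that is
    // assumed to be anchored on builtin.module, using the pipeline names
    // declared in Passes.td.
    mlir::LogicalResult scheduleWiresetPasses(mlir::OpPassManager &modulePM) {
      return mlir::parsePassPipeline(
          "add-wireset,func.func(assign-wire-indices)", modulePM);
    }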

def ApplyControlNegations :
Pass<"apply-control-negations", "mlir::func::FuncOp"> {
let summary =
@@ -279,6 +296,11 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> {
Generate the kernel execution thunks. The kernel execution thunks allow
the control side (C++ code) to launch quantum kernels. This pass
generates the required glue code.

Specifying the alt-launch=2 option will generate different code that makes
use of library-side argument conversion and the argument synthesis pass.
More generally, this option can be used when JIT compiling kernels on the
client/host/local processor.
}];

let dependentDialects = ["cudaq::cc::CCDialect", "mlir::LLVM::LLVMDialect"];
@@ -288,11 +310,14 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> {
/*default=*/"\"-\"", "Name of output file.">,
Option<"startingArgIdx", "starting-arg-idx", "std::size_t", /*default=*/"0",
"The starting argument index for the argsCreator.">,
Option<"altLaunchVersion", "alt-launch", "std::size_t", /*default=*/"1",
"Specify the version of altLaunchKernel to be used.">
];
}
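With the new option in place, the version-2 code path can be requested through the standard pass-option syntax on the `kernel-execution` pipeline entry (alt-launch defaults to 1). A minimal sketch, again assuming the surrounding pass-manager setup:

    #include "mlir/Pass/PassManager.h"
    #include "mlir/Pass/PassRegistry.h"

    // Sketch: request the library-side-conversion / local-JIT launch codegen
    // by setting alt-launch=2 on the kernel-execution pass.
    mlir::LogicalResult useLocalJitLaunch(mlir::OpPassManager &modulePM) {
      return mlir::parsePassPipeline("kernel-execution{alt-launch=2}", modulePM);
    }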

def GetConcreteMatrix : Pass<"get-concrete-matrix", "mlir::func::FuncOp"> {
let summary = "Replace the unitary matrix generator function with concrete matrix.";
let summary =
"Replace the unitary matrix generator function with concrete matrix.";
let description = [{
Given a custom operation whose generator attribute is another function
within the module, such that if `LiftArrayAlloc` pass has run, there will
6 changes: 6 additions & 0 deletions lib/Optimizer/Builder/Intrinsics.cpp
@@ -298,6 +298,12 @@ static constexpr IntrinsicCode intrinsicTable[] = {
R"#(
func.func private @altLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64) -> ())#"},

{cudaq::runtime::
launchKernelVersion2FuncName, // altLaunchKernelUsingLocalJIT
{},
R"#(
func.func private @altLaunchKernelUsingLocalJIT(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>) -> ())#"},

{"free", {}, "func.func private @free(!cc.ptr<i8>) -> ()"},

{cudaq::llvmMemCopyIntrinsic, // llvm.memcpy.p0i8.p0i8.i64
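The MLIR-side declaration above takes three `!cc.ptr<i8>` operands and returns nothing. A plausible C-linkage prototype on the runtime-library side is sketched below; the parameter names and C++ types are assumptions inferred from how GenKernelExecution.cpp populates the call (kernel name, pointer to a std::vector<void*> of argument addresses, result buffer), not declarations taken from this commit.

    // Sketch only: a C-linkage prototype shape matching
    //   func.func private @altLaunchKernelUsingLocalJIT(
    //       !cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>) -> ()
    // The real runtime declaration may use different (pointer-compatible) types.
    extern "C" void altLaunchKernelUsingLocalJIT(
        const char *kernelName, // registered kernel name
        void *argPtrVector,     // address of a std::vector<void*> of argument addresses
        void *resultBuffer);    // slot the runtime may fill with a result pointer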
1 change: 1 addition & 0 deletions lib/Optimizer/Transforms/CMakeLists.txt
@@ -49,6 +49,7 @@ add_cudaq_library(OptTransforms
RefToVeqAlloc.cpp
RegToMem.cpp
StatePreparation.cpp
WiresToWiresets.cpp

DEPENDS
OptTransformsPassIncGen
195 changes: 150 additions & 45 deletions lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -1116,10 +1116,11 @@ class GenerateKernelExecution
/// library. Pass along the thunk, so the runtime can call the quantum
/// circuit. These entry points are `operator()` member functions in a class,
/// so account for the `this` argument here.
void genNewHostEntryPoint(Location loc, OpBuilder &builder,
FunctionType funcTy, cudaq::cc::StructType structTy,
LLVM::GlobalOp kernelNameObj, func::FuncOp thunk,
func::FuncOp rewriteEntry, bool addThisPtr) {
void genNewHostEntryPoint1(Location loc, OpBuilder &builder,
FunctionType funcTy,
cudaq::cc::StructType structTy,
LLVM::GlobalOp kernelNameObj, func::FuncOp thunk,
func::FuncOp rewriteEntry, bool addThisPtr) {
auto *ctx = builder.getContext();
auto i64Ty = builder.getI64Type();
auto offset = funcTy.getNumInputs();
@@ -1392,6 +1393,91 @@ class GenerateKernelExecution
builder.create<func::ReturnOp>(loc, results);
}

void genNewHostEntryPoint2(Location loc, OpBuilder &builder,
FunctionType devFuncTy,
LLVM::GlobalOp kernelNameObj,
func::FuncOp hostFunc, bool addThisPtr) {
const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(devFuncTy);
const unsigned count =
cudaq::cc::numberOfHiddenArgs(addThisPtr, hiddenSRet);
auto *ctx = builder.getContext();
auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type());

// 0) Point our builder into the entry block of the function.
Block *hostFuncEntryBlock = hostFunc.addEntryBlock();

OpBuilder::InsertionGuard guard(builder);
builder.setInsertionPointToStart(hostFuncEntryBlock);

// 1) Allocate and initialize a std::vector<void*> object.
auto stdVec = builder.create<cudaq::cc::AllocaOp>(
loc, cudaq::opt::factory::stlVectorType(i8PtrTy));
auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, i8PtrTy, count);
Value buffer = builder.create<cudaq::cc::AllocaOp>(loc, arrPtrTy);
auto i64Ty = builder.getI64Type();
auto buffSize = builder.create<cudaq::cc::SizeOfOp>(loc, i64Ty, arrPtrTy);
auto ptrPtrTy = cudaq::cc::PointerType::get(i8PtrTy);
auto cast1 = builder.create<cudaq::cc::CastOp>(loc, ptrPtrTy, buffer);
auto ptr3Ty = cudaq::cc::PointerType::get(ptrPtrTy);
auto stdVec0 = builder.create<cudaq::cc::CastOp>(loc, ptr3Ty, stdVec);
builder.create<cudaq::cc::StoreOp>(loc, cast1, stdVec0);
auto cast2 = builder.create<cudaq::cc::CastOp>(loc, i64Ty, buffer);
auto endBuff = builder.create<arith::AddIOp>(loc, cast2, buffSize);
auto cast3 = builder.create<cudaq::cc::CastOp>(loc, ptrPtrTy, endBuff);
auto stdVec1 = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptr3Ty, stdVec, ArrayRef<cudaq::cc::ComputePtrArg>{1});
builder.create<cudaq::cc::StoreOp>(loc, cast3, stdVec1);
auto stdVec2 = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptr3Ty, stdVec, ArrayRef<cudaq::cc::ComputePtrArg>{2});
builder.create<cudaq::cc::StoreOp>(loc, cast3, stdVec2);
auto zero = builder.create<arith::ConstantIntOp>(loc, 0, 64);
auto nullPtr = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, zero);

// 2) Iterate over the arguments passed in and populate the vector.
SmallVector<BlockArgument> blockArgs{dropAnyHiddenArguments(
hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)};
for (auto iter : llvm::enumerate(blockArgs)) {
std::int32_t i = iter.index();
auto pos = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrPtrTy, buffer, ArrayRef<cudaq::cc::ComputePtrArg>{i});
auto blkArg = iter.value();
if (isa<cudaq::cc::PointerType>(blkArg.getType())) {
auto castArg = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, blkArg);
builder.create<cudaq::cc::StoreOp>(loc, castArg, pos);
continue;
}
auto temp = builder.create<cudaq::cc::AllocaOp>(loc, blkArg.getType());
builder.create<cudaq::cc::StoreOp>(loc, blkArg, temp);
auto castTemp = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, temp);
builder.create<cudaq::cc::StoreOp>(loc, castTemp, pos);
}

auto resultBuffer = builder.create<cudaq::cc::AllocaOp>(loc, i8PtrTy);
builder.create<cudaq::cc::StoreOp>(loc, nullPtr, resultBuffer);
auto castResultBuffer =
builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, resultBuffer);
auto castStdvec = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, stdVec);
Value loadKernName = builder.create<LLVM::AddressOfOp>(
loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()),
kernelNameObj.getSymName());
auto castKernelNameObj =
builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, loadKernName);
builder.create<func::CallOp>(
loc, std::nullopt, cudaq::runtime::launchKernelVersion2FuncName,
ArrayRef<Value>{castKernelNameObj, castStdvec, castResultBuffer});

// FIXME: Drop any results on the floor for now and return random data left
// on the stack. (Maintains parity with existing kernel launch.)
if (hostFunc.getFunctionType().getResults().empty()) {
builder.create<func::ReturnOp>(loc);
return;
}
// There can only be 1 return type in C++, so this is safe.
Value garbage = builder.create<cudaq::cc::UndefOp>(
loc, hostFunc.getFunctionType().getResult(0));
builder.create<func::ReturnOp>(loc, garbage);
}
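To make the builder calls above easier to follow, here is a hand-written C++ approximation of what genNewHostEntryPoint2 emits for a hypothetical kernel taking (double, int*). The function and kernel names are illustrative, and details such as hidden this/sret arguments are omitted.

    #include <vector>

    extern "C" void altLaunchKernelUsingLocalJIT(const char *, void *, void *);

    // Approximation of the generated host entry point for a kernel(double, int*).
    // Value arguments are spilled to a stack slot and their address is stored;
    // pointer arguments are stored directly.
    void hostEntryPointApprox(double d, int *p) {
      std::vector<void *> args(2);
      double dSlot = d;       // stack temporary for the by-value argument
      args[0] = &dSlot;
      args[1] = p;            // pointer argument forwarded as-is
      void *result = nullptr; // null result buffer, as in the generated code
      altLaunchKernelUsingLocalJIT("kernelName", &args, &result);
      // Any kernel result is currently dropped (see the FIXME above).
    }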

/// A kernel function that takes a quantum type argument (also known as a pure
/// device kernel) cannot be called directly from C++ (classical) code. It
/// must be called via other quantum code.
@@ -1422,11 +1508,18 @@ class GenerateKernelExecution
if (!mangledNameMap || mangledNameMap.empty())
return;
auto irBuilder = cudaq::IRBuilder::atBlockEnd(module.getBody());
if (failed(irBuilder.loadIntrinsic(module,
cudaq::runtime::launchKernelFuncName))) {
module.emitError("could not load altLaunchKernel intrinsic.");
return;
}
if (altLaunchVersion == 1)
if (failed(irBuilder.loadIntrinsic(
module, cudaq::runtime::launchKernelFuncName))) {
module.emitError("could not load altLaunchKernel intrinsic.");
return;
}
if (altLaunchVersion == 2)
if (failed(irBuilder.loadIntrinsic(
module, cudaq::runtime::launchKernelVersion2FuncName))) {
module.emitError("could not load altLaunchKernelUsingLocalJIT intrinsic.");
return;
}

auto loc = module.getLoc();
auto ptrType = cudaq::cc::PointerType::get(builder.getI8Type());
@@ -1526,37 +1619,47 @@ class GenerateKernelExecution
cudaq::opt::factory::toHostSideFuncType(funcTy, hasThisPtr, module);
}

// Generate the function that computes the return offset.
genReturnOffsetFunction(loc, builder, funcTy, structTy, classNameStr);
func::FuncOp thunk;
func::FuncOp argsCreatorFunc;

// Generate thunk, `<kernel>.thunk`, to call back to the MLIR code.
auto thunk = genThunkFunction(loc, builder, classNameStr, structTy,
funcTy, funcOp);
if (altLaunchVersion == 1) {
// Generate the function that computes the return offset.
genReturnOffsetFunction(loc, builder, funcTy, structTy, classNameStr);

// Generate the argsCreator function used by synthesis.
mlir::func::FuncOp argsCreatorFunc;
if (startingArgIdx == 0) {
argsCreatorFunc =
genKernelArgsCreatorFunction(loc, builder, funcTy, structTy,
classNameStr, hostFuncTy, hasThisPtr);
} else {
// We are operating in a very special case where we want the argsCreator
// function to ignore the first `startingArgIdx` arguments. In this
// situation, the argsCreator function will not be compatible with the
// other helper functions created in this pass, so it is assumed that
// the caller is OK with that.
auto structTy_argsCreator =
cudaq::opt::factory::buildInvokeStructType(funcTy, startingArgIdx);
argsCreatorFunc = genKernelArgsCreatorFunction(
loc, builder, funcTy, structTy_argsCreator, classNameStr,
hostFuncTy, hasThisPtr);
// Generate thunk, `<kernel>.thunk`, to call back to the MLIR code.
thunk = genThunkFunction(loc, builder, classNameStr, structTy, funcTy,
funcOp);

// Generate the argsCreator function used by synthesis.
if (startingArgIdx == 0) {
argsCreatorFunc = genKernelArgsCreatorFunction(
loc, builder, funcTy, structTy, classNameStr, hostFuncTy,
hasThisPtr);
} else {
// We are operating in a very special case where we want the
// argsCreator function to ignore the first `startingArgIdx`
// arguments. In this situation, the argsCreator function will not be
// compatible with the other helper functions created in this pass, so
// it is assumed that the caller is OK with that.
auto structTy_argsCreator =
cudaq::opt::factory::buildInvokeStructType(funcTy,
startingArgIdx);
argsCreatorFunc = genKernelArgsCreatorFunction(
loc, builder, funcTy, structTy_argsCreator, classNameStr,
hostFuncTy, hasThisPtr);
}
}

// Generate a new mangled function on the host side to call the
// callback function.
if (hostEntryNeeded)
genNewHostEntryPoint(loc, builder, funcTy, structTy, kernelNameObj,
thunk, hostFunc, hasThisPtr);
if (hostEntryNeeded) {
if (altLaunchVersion == 1)
genNewHostEntryPoint1(loc, builder, funcTy, structTy, kernelNameObj,
thunk, hostFunc, hasThisPtr);
else
genNewHostEntryPoint2(loc, builder, funcTy, kernelNameObj, hostFunc,
hasThisPtr);
}

// Generate a function at startup to register this kernel as having
// been processed for kernel execution.
@@ -1576,17 +1679,19 @@
builder.create<func::CallOp>(loc, std::nullopt, cudaqRegisterKernelName,
ValueRange{castKernRef});

// Register the argsCreator too
auto ptrPtrType = cudaq::cc::PointerType::get(ptrType);
auto argsCreatorFuncType = FunctionType::get(
ctx, {ptrPtrType, ptrPtrType}, {builder.getI64Type()});
Value loadArgsCreator = builder.create<func::ConstantOp>(
loc, argsCreatorFuncType, argsCreatorFunc.getName());
auto castLoadArgsCreator = builder.create<cudaq::cc::FuncToPtrOp>(
loc, ptrType, loadArgsCreator);
builder.create<func::CallOp>(
loc, std::nullopt, cudaqRegisterArgsCreator,
ValueRange{castKernRef, castLoadArgsCreator});
if (altLaunchVersion == 1) {
// Register the argsCreator too
auto ptrPtrType = cudaq::cc::PointerType::get(ptrType);
auto argsCreatorFuncType = FunctionType::get(
ctx, {ptrPtrType, ptrPtrType}, {builder.getI64Type()});
Value loadArgsCreator = builder.create<func::ConstantOp>(
loc, argsCreatorFuncType, argsCreatorFunc.getName());
auto castLoadArgsCreator = builder.create<cudaq::cc::FuncToPtrOp>(
loc, ptrType, loadArgsCreator);
builder.create<func::CallOp>(
loc, std::nullopt, cudaqRegisterArgsCreator,
ValueRange{castKernRef, castLoadArgsCreator});
}

// Check if this is a lambda mangled name
auto demangledPtr = abi::__cxa_demangle(mangledName.str().c_str(),
(Diffs for the remaining 7 changed files are not shown.)
