Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/NVIDIA/cuda-quantum into st…
Browse files Browse the repository at this point in the history
…ate-synthesis-remote-sim-new
  • Loading branch information
annagrin committed Aug 15, 2024
2 parents 7021438 + a817c65 commit 60c1389
Show file tree
Hide file tree
Showing 13 changed files with 635 additions and 451 deletions.
5 changes: 3 additions & 2 deletions include/cudaq/Optimizer/Builder/Runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ static constexpr unsigned cudaqGenPrefixLength = sizeof(cudaqGenPrefixName) - 1;
/// compile time (see `cudaqGenPrefixName`) or it can be rewritten to call back
/// to the runtime library (and be handled at runtime).
static constexpr const char launchKernelFuncName[] = "altLaunchKernel";
static constexpr const char launchKernelVersion2FuncName[] =
"altLaunchKernelUsingLocalJIT";
static constexpr const char launchKernelStreamlinedFuncName[] =
"streamlinedLaunchKernel";
static constexpr const char launchKernelHybridFuncName[] = "hybridLaunchKernel";

} // namespace cudaq::runtime
25 changes: 23 additions & 2 deletions include/cudaq/Optimizer/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,27 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> {
use of library side argument conversion and the argument synthesis pass.
More generally, this option can be used when JIT compiling kernels on the
client/host/local processor.

There are multiple code generation kinds that are supported for flexibility
and streamlining the kernel launch process. These tend to be related to the
target and runtime environment the compiler is being run in and can involve
some technical issues that require deeper understanding of the entire
process. In general, it is not recommended that users change this value.

```
codegen kind description

0         Hybrid. A combination of 1 and 2 that allows early and
streamlined JIT compilation but also supports return values
and dynamic parameters.
1 Client-server interchange format. Supports kernels that
return results and dynamic parameters.
2 Streamlined for JIT. The kernel will be converted to a
nullary function with no results. Return values from the
kernel are ignored, if present. All parameter values are to
be inlined by the JIT compiler, so this codegen kind does not
support any dynamic parameters.
```
}];

let dependentDialects = ["cudaq::cc::CCDialect", "mlir::LLVM::LLVMDialect"];
Expand All @@ -311,8 +332,8 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> {
/*default=*/"\"-\"", "Name of output file.">,
Option<"startingArgIdx", "starting-arg-idx", "std::size_t", /*default=*/"0",
"The starting argument index for the argsCreator.">,
Option<"altLaunchVersion", "alt-launch", "std::size_t", /*default=*/"1",
"Specify the version of altLaunchKernel to be used.">
Option<"codegenKind", "codegen", "std::size_t", /*default=*/"1",
"Set the kind of code to generate for the launches.">
];
}

Expand Down
22 changes: 15 additions & 7 deletions lib/Optimizer/Builder/Intrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,25 +293,33 @@ static constexpr IntrinsicCode intrinsicTable[] = {
return %3 : !cc.struct<{!cc.ptr<i8>, i64}>
})#"},

{cudaq::runtime::launchKernelFuncName, // altLaunchKernel
// altLaunchKernel(kernelName, thunk, commBuffer, buffSize, resultOffset)
{cudaq::runtime::launchKernelFuncName,
{},
R"#(
func.func private @altLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64) -> ())#"},

{cudaq::runtime::
launchKernelVersion2FuncName, // altLaunchKernelUsingLocalJIT
{"free", {}, "func.func private @free(!cc.ptr<i8>) -> ()"},

// hybridLaunchKernel(kernelName, thunk, commBuffer, buffSize,
// resultOffset, vectorArgPtrs)
{cudaq::runtime::launchKernelHybridFuncName,
{},
R"#(
func.func private @altLaunchKernelUsingLocalJIT(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>) -> ())#"},

{"free", {}, "func.func private @free(!cc.ptr<i8>) -> ()"},
func.func private @hybridLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64, !cc.ptr<i8>) -> ())#"},

{cudaq::llvmMemCopyIntrinsic, // llvm.memcpy.p0i8.p0i8.i64
{},
R"#(
func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr<i8>, !cc.ptr<i8>, i64, i1) -> ())#"},

{"malloc", {}, "func.func private @malloc(i64) -> !cc.ptr<i8>"}};
{"malloc", {}, "func.func private @malloc(i64) -> !cc.ptr<i8>"},

// streamlinedLaunchKernel(kernelName, vectorArgPtrs)
{cudaq::runtime::launchKernelStreamlinedFuncName,
{},
R"#(
func.func private @streamlinedLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>) -> ())#"}};

static constexpr std::size_t intrinsicTableSize =
sizeof(intrinsicTable) / sizeof(IntrinsicCode);
Expand Down
34 changes: 31 additions & 3 deletions lib/Optimizer/CodeGen/CCToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,33 @@ class SizeOfOpPattern : public ConvertOpToLLVMPattern<cudaq::cc::SizeOfOp> {
}
};

class OffsetOfOpPattern : public ConvertOpToLLVMPattern<cudaq::cc::OffsetOfOp> {
public:
  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

  /// Lower cc.offsetof by computing a GEP from a null base pointer and
  /// casting the resulting address to the op's result type.
  // NOTE: LLVM plans to eventually drop support for this null-GEP offset
  // idiom. See: https://github.com/llvm/llvm-project/issues/71507
  LogicalResult
  matchAndRewrite(cudaq::cc::OffsetOfOp offsetOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = offsetOp.getLoc();
    auto structTy = offsetOp.getInputType();
    auto offsetTy = offsetOp.getType();

    // Gather the constant member indices used for the address computation.
    SmallVector<cudaq::cc::ComputePtrArg> indices;
    for (std::int32_t idx : offsetOp.getConstantIndices())
      indices.push_back(idx);

    // TODO: replace this with some target-specific memory layout computation
    // when we upgrade to a newer MLIR.
    auto ptrTy = cudaq::cc::PointerType::get(structTy);
    auto zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 64);
    auto nullBase = rewriter.create<cudaq::cc::CastOp>(loc, ptrTy, zero);
    Value addr =
        rewriter.create<cudaq::cc::ComputePtrOp>(loc, ptrTy, nullBase, indices);
    rewriter.replaceOpWithNewOp<cudaq::cc::CastOp>(offsetOp, offsetTy, addr);
    return success();
  }
};

class StdvecDataOpPattern
: public ConvertOpToLLVMPattern<cudaq::cc::StdvecDataOp> {
public:
Expand Down Expand Up @@ -647,7 +674,8 @@ void cudaq::opt::populateCCToLLVMPatterns(LLVMTypeConverter &typeConverter,
ComputePtrOpPattern, CreateStringLiteralOpPattern,
ExtractValueOpPattern, FuncToPtrOpPattern, GlobalOpPattern,
InsertValueOpPattern, InstantiateCallableOpPattern,
LoadOpPattern, PoisonOpPattern, SizeOfOpPattern,
StdvecDataOpPattern, StdvecInitOpPattern, StdvecSizeOpPattern,
StoreOpPattern, UndefOpPattern>(typeConverter);
LoadOpPattern, OffsetOfOpPattern, PoisonOpPattern,
SizeOfOpPattern, StdvecDataOpPattern, StdvecInitOpPattern,
StdvecSizeOpPattern, StoreOpPattern, UndefOpPattern>(
typeConverter);
}
Loading

0 comments on commit 60c1389

Please sign in to comment.