From 418dddbe92c7e70d63e1ff10a2f6cd0469d92990 Mon Sep 17 00:00:00 2001
From: Eric Schweitz
Date: Mon, 12 Aug 2024 18:45:28 -0700
Subject: [PATCH 1/4] Add interface to exclude any permutation of arguments
 (#2068) when performing argument conversion.

There is both a "drop_front" style interface to support the legacy style and
a fully generalized interface to allow the filtering of any permutation of
arguments.
---
 runtime/common/ArgumentConversion.cpp | 34 ++++++++++++++++++++++++++-
 runtime/common/ArgumentConversion.h   | 10 ++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 7edb146c6f..d72b056869 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -278,8 +278,10 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
   FunctionType fromFuncTy = fun.getFunctionType();
   for (auto iter :
        llvm::enumerate(llvm::zip(fromFuncTy.getInputs(), arguments))) {
-    Type argTy = std::get<0>(iter.value());
     void *argPtr = std::get<1>(iter.value());
+    if (!argPtr)
+      continue;
+    Type argTy = std::get<0>(iter.value());
     unsigned i = iter.index();
     auto buildSubst = [&, i = i](Ts &&...ts) {
       builder.setInsertionPointToEnd(substModule.getBody());
@@ -360,3 +362,33 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
     substitutions.emplace_back(std::move(subst));
   }
 }
+
+void cudaq::opt::ArgumentConverter::gen(
+    const std::vector<void *> &arguments,
+    const std::unordered_set<unsigned> &exclusions) {
+  std::vector<void *> partialArgs;
+  for (auto iter : llvm::enumerate(arguments)) {
+    if (exclusions.contains(iter.index())) {
+      partialArgs.push_back(nullptr);
+      continue;
+    }
+    partialArgs.push_back(iter.value());
+  }
+  gen(partialArgs);
+}
+
+void cudaq::opt::ArgumentConverter::gen_drop_front(
+    const std::vector<void *> &arguments, unsigned numDrop) {
+  // If we're dropping all the arguments, we're done.
+  if (numDrop >= arguments.size())
+    return;
+  std::vector<void *> partialArgs;
+  for (void *arg : arguments) {
+    if (numDrop) {
+      --numDrop;
+      partialArgs.push_back(nullptr);
+      continue;
+    }
+    partialArgs.push_back(arg);
+  }
+  gen(partialArgs);
+}
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index cefc27aed9..f94ff86e59 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -12,6 +12,7 @@
 #include "cudaq/Optimizer/Dialect/CC/CCTypes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Types.h"
+#include <unordered_set>

 namespace cudaq {
 class state;
@@ -29,6 +30,15 @@ class ArgumentConverter {
   /// The arguments are those presented to the kernel, kernelName.
   void gen(const std::vector<void *> &arguments);

+  /// Generate a substitution ModuleOp but include only the arguments that do
+  /// not appear in the set of \p exclusions.
+  void gen(const std::vector<void *> &arguments,
+           const std::unordered_set<unsigned> &exclusions);
+
+  /// Generate a substitution ModuleOp but drop the first \p numDrop arguments
+  /// and thereby exclude them from the substitutions.
+  void gen_drop_front(const std::vector<void *> &arguments, unsigned numDrop);
+
   /// Get the list of substitutions that were generated by `gen()`.
   mlir::SmallVector &getSubstitutions() {
     return substitutions;

From ada3ab4c842c54d037885288ccf8b92fe4947103 Mon Sep 17 00:00:00 2001
From: Eric Schweitz
Date: Tue, 13 Aug 2024 17:56:04 -0700
Subject: [PATCH 2/4] [core] Add support for kernel return values. (#2082)

* Add interface to exclude any permutation of arguments when performing
  argument conversion.
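A minimal sketch of how the new interfaces might be called (the surrounding
function, argument pointers, and include path are hypothetical, not part of
the patch):

```c++
#include "common/ArgumentConversion.h" // assumed include path
#include <unordered_set>
#include <vector>

// Hypothetical call site exercising both new entry points.
void convertArgs(cudaq::opt::ArgumentConverter &converter, void *thisPtr,
                 int *numQubits, double *theta) {
  std::vector<void *> args = {thisPtr, numQubits, theta};
  // Generalized interface: exclude any permutation of positions (here 0).
  converter.gen(args, std::unordered_set<unsigned>{0});
  // Legacy style, equivalent in this case: drop the leading `this` pointer.
  // converter.gen_drop_front(args, /*numDrop=*/1);
}
```

Excluded positions are forwarded as null pointers internally, so the remaining
arguments keep their original indices when substitutions are generated.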
There is both a "drop_front" style interface to support the legacy style and a fully generalized interface to allow the filtering of any permutation of arguments. * Workaround bogus warning about doxygen tag. * [core] Add support for kernel return values. GKE has always supported return values from kernels. However, the existing runtime intermediate layers do not use them and just drop them on the floor, if they are present. These patches fuse the old GKE with the new argument gathering approach in a hybrid format. Specifically, GKE will construct both a data packet with the argument data packed in it *and* a std::vector with pointers to the argument values. A new runtime entry point can use either strategy for the incoming direction. For the outgoing direction, "synthesizing" the result value into the calling code is nonsensical and an actual value must be returned, which was already supported by the old altLaunchKernel protocol. --- include/cudaq/Optimizer/Builder/Runtime.h | 5 +- include/cudaq/Optimizer/Transforms/Passes.td | 25 +- lib/Optimizer/Builder/Intrinsics.cpp | 22 +- .../Transforms/GenKernelExecution.cpp | 758 +++++++++--------- test/Quake/kernel_exec-1.qke | 110 ++- test/Quake/kernel_exec-2.qke | 4 +- test/Quake/return_vector.qke | 8 +- 7 files changed, 514 insertions(+), 418 deletions(-) diff --git a/include/cudaq/Optimizer/Builder/Runtime.h b/include/cudaq/Optimizer/Builder/Runtime.h index bf81843fd9..c25a5cd2ee 100644 --- a/include/cudaq/Optimizer/Builder/Runtime.h +++ b/include/cudaq/Optimizer/Builder/Runtime.h @@ -23,7 +23,8 @@ static constexpr unsigned cudaqGenPrefixLength = sizeof(cudaqGenPrefixName) - 1; /// compile time (see `cudaqGenPrefixName`) or it can be rewritten to call back /// to the runtime library (and be handled at runtime). static constexpr const char launchKernelFuncName[] = "altLaunchKernel"; -static constexpr const char launchKernelVersion2FuncName[] = - "altLaunchKernelUsingLocalJIT"; +static constexpr const char launchKernelStreamlinedFuncName[] = + "streamlinedLaunchKernel"; +static constexpr const char launchKernelHybridFuncName[] = "hybridLaunchKernel"; } // namespace cudaq::runtime diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td index e8e5a79b0d..8fa3eb8bd0 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.td +++ b/include/cudaq/Optimizer/Transforms/Passes.td @@ -302,6 +302,27 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> { use of library side argument conversion and the argument synthesis pass. More generally, this option can be used when JIT compiling kernels on the client/host/local processor. + + There are multiple code generation kinds that are supported for flexibility + and streamlining the kernel launch process. These tend to be related to the + target and runtime environment the compiler is being run in and can involve + some technical issues that require deeper understanding of the entire + process. In general, it is not recommended for user's to change this value. + + ``` + codegen kind description + + 0 Hybrid. A combination of 1 and 2 that allowed early and + streamlined JIT compilation but also supports return values + and dynamic parameters. + 1 Client-server interchange format. Supports kernels that + return results and dynamic parameters. + 2 Streamlined for JIT. The kernel will be converted to a + nullary function with no results. Return values from the + kernel are ignored, if present. 
All parameter values are to + be inlined by the JIT compiler, so this codegen kind does not + support any dynamic parameters. + ``` }]; let dependentDialects = ["cudaq::cc::CCDialect", "mlir::LLVM::LLVMDialect"]; @@ -311,8 +332,8 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> { /*default=*/"\"-\"", "Name of output file.">, Option<"startingArgIdx", "starting-arg-idx", "std::size_t", /*default=*/"0", "The starting argument index for the argsCreator.">, - Option<"altLaunchVersion", "alt-launch", "std::size_t", /*default=*/"1", - "Specify the version of altLaunchKernel to be used."> + Option<"codegenKind", "codegen", "std::size_t", /*default=*/"1", + "Set the kind of code to generate for the launches."> ]; } diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index 12030de199..5daceec94b 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -293,25 +293,33 @@ static constexpr IntrinsicCode intrinsicTable[] = { return %3 : !cc.struct<{!cc.ptr, i64}> })#"}, - {cudaq::runtime::launchKernelFuncName, // altLaunchKernel + // altLaunchKernel(kernelName, thunk, commBuffer, buffSize, resultOffset) + {cudaq::runtime::launchKernelFuncName, {}, R"#( func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ())#"}, - {cudaq::runtime:: - launchKernelVersion2FuncName, // altLaunchKernelUsingLocalJIT + {"free", {}, "func.func private @free(!cc.ptr) -> ()"}, + + // hybridLaunchKernel(kernelName, thunk, commBuffer, buffSize, + // resultOffset, vectorArgPtrs) + {cudaq::runtime::launchKernelHybridFuncName, {}, R"#( - func.func private @altLaunchKernelUsingLocalJIT(!cc.ptr, !cc.ptr, !cc.ptr) -> ())#"}, - - {"free", {}, "func.func private @free(!cc.ptr) -> ()"}, + func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> ())#"}, {cudaq::llvmMemCopyIntrinsic, // llvm.memcpy.p0i8.p0i8.i64 {}, R"#( func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) -> ())#"}, - {"malloc", {}, "func.func private @malloc(i64) -> !cc.ptr"}}; + {"malloc", {}, "func.func private @malloc(i64) -> !cc.ptr"}, + + // streamlinedLaunchKernel(kernelName, vectorArgPtrs) + {cudaq::runtime::launchKernelStreamlinedFuncName, + {}, + R"#( + func.func private @streamlinedLaunchKernel(!cc.ptr, !cc.ptr) -> ())#"}}; static constexpr std::size_t intrinsicTableSize = sizeof(intrinsicTable) / sizeof(IntrinsicCode); diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 78176e5387..7c87b9f4a7 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -34,19 +34,29 @@ namespace cudaq::opt { using namespace mlir; -namespace { // Define some constant function name strings. -static constexpr const char cudaqRegisterLambdaName[] = +static constexpr const char CudaqRegisterLambdaName[] = "cudaqRegisterLambdaName"; -static constexpr const char cudaqRegisterArgsCreator[] = +static constexpr const char CudaqRegisterArgsCreator[] = "cudaqRegisterArgsCreator"; -static constexpr const char cudaqRegisterKernelName[] = +static constexpr const char CudaqRegisterKernelName[] = "cudaqRegisterKernelName"; /// This value is used to indicate that a kernel does not return a result. static constexpr std::uint64_t NoResultOffset = std::numeric_limits::max(); +/// Generate code for packing arguments as raw data. 
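+/// For illustration, this predicate and isCodegenArgumentGather below
+/// partition the codegen kinds as follows:
+///
+///    kind   isCodegenPackedData   isCodegenArgumentGather
+///     0            true                   true
+///     1            true                   false
+///     2            false                  true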
+static bool isCodegenPackedData(std::size_t kind) { + return kind == 0 || kind == 1; +} + +/// Generate code that gathers the arguments for conversion and synthesis. +static bool isCodegenArgumentGather(std::size_t kind) { + return kind == 0 || kind == 2; +} + +namespace { class GenerateKernelExecution : public cudaq::opt::impl::GenerateKernelExecutionBase< GenerateKernelExecution> { @@ -1116,227 +1126,311 @@ class GenerateKernelExecution /// library. Pass along the thunk, so the runtime can call the quantum /// circuit. These entry points are `operator()` member functions in a class, /// so account for the `this` argument here. - void genNewHostEntryPoint1(Location loc, OpBuilder &builder, - FunctionType funcTy, - cudaq::cc::StructType structTy, - LLVM::GlobalOp kernelNameObj, func::FuncOp thunk, - func::FuncOp rewriteEntry, bool addThisPtr) { + void genNewHostEntryPoint(Location loc, OpBuilder &builder, + FunctionType devFuncTy, + LLVM::GlobalOp kernelNameObj, func::FuncOp hostFunc, + bool addThisPtr, cudaq::cc::StructType structTy, + func::FuncOp thunkFunc) { auto *ctx = builder.getContext(); auto i64Ty = builder.getI64Type(); - auto offset = funcTy.getNumInputs(); + auto offset = devFuncTy.getNumInputs(); auto thunkTy = getThunkType(ctx); auto structPtrTy = cudaq::cc::PointerType::get(structTy); - Block *rewriteEntryBlock = rewriteEntry.addEntryBlock(); + Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); + const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(devFuncTy); OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(rewriteEntryBlock); - Value stVal = builder.create(loc, structTy); + builder.setInsertionPointToStart(hostFuncEntryBlock); + auto i8Ty = builder.getI8Type(); + auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - // Process all the arguments for the original call, ignoring any hidden - // arguments (such as the `this` pointer). - auto zero = builder.create(loc, 0, 64); - Value extraBytes = zero; - bool hasTrailingData = false; - SmallVector blockArgs{dropAnyHiddenArguments( - rewriteEntryBlock->getArguments(), funcTy, addThisPtr)}; - std::int32_t idx = 0; - SmallVector blockValues(blockArgs.size()); - std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); - for (auto iter = blockArgs.begin(), end = blockArgs.end(); iter != end; - ++iter, ++idx) { - Value arg = *iter; - Type inTy = arg.getType(); - Type quakeTy = funcTy.getInput(idx); - // If the argument is a callable, skip it. - if (isa(quakeTy)) - continue; - // If the argument is an empty struct, skip it. - if (auto strTy = dyn_cast(quakeTy)) - if (strTy.isEmpty()) + Value temp; + Value castTemp; + Value resultOffset; + Value castLoadThunk; + Value extendedStructSize; + if (isCodegenPackedData(codegenKind)) { + Value stVal = builder.create(loc, structTy); + + // Process all the arguments for the original call, ignoring any hidden + // arguments (such as the `this` pointer). + auto zero = builder.create(loc, 0, 64); + Value extraBytes = zero; + bool hasTrailingData = false; + SmallVector blockArgs{dropAnyHiddenArguments( + hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; + std::int32_t idx = 0; + SmallVector blockValues(blockArgs.size()); + std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); + for (auto iter = blockArgs.begin(), end = blockArgs.end(); iter != end; + ++iter, ++idx) { + Value arg = *iter; + Type inTy = arg.getType(); + Type quakeTy = devFuncTy.getInput(idx); + // If the argument is a callable, skip it. 
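+        // A callable carries no runtime argument data: the callee is resolved
+        // at compile time, so there is nothing to marshal into the buffer.
+        // Empty structs are skipped below for the same reason.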
+ if (isa(quakeTy)) continue; + // If the argument is an empty struct, skip it. + if (auto strTy = dyn_cast(quakeTy)) + if (strTy.isEmpty()) + continue; - if (auto stdvecTy = dyn_cast(quakeTy)) { - // Per the CUDA-Q spec, an entry point kernel must take a `[const] - // std::vector` value argument. - // Should the spec stipulate that pure device kernels must pass by - // read-only reference, i.e., take `const std::vector &` arguments? - auto ptrInTy = cast(inTy); - // If this is a std::vector, unpack it. - if (stdvecTy.getElementType() == builder.getI1Type()) { - // Create a mock vector of i8 and populate the bools, 1 per char. - Value temp = builder.create( - loc, ptrInTy.getElementType()); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ArrayRef{temp, arg}); - arg = blockValues[idx] = temp; + if (auto stdvecTy = dyn_cast(quakeTy)) { + // Per the CUDA-Q spec, an entry point kernel must take a `[const] + // std::vector` value argument. + // Should the spec stipulate that pure device kernels must pass by + // read-only reference, i.e., take `const std::vector &` arguments? + auto ptrInTy = cast(inTy); + // If this is a std::vector, unpack it. + if (stdvecTy.getElementType() == builder.getI1Type()) { + // Create a mock vector of i8 and populate the bools, 1 per char. + Value tmp = builder.create( + loc, ptrInTy.getElementType()); + builder.create(loc, std::nullopt, + cudaq::stdvecBoolUnpackToInitList, + ArrayRef{tmp, arg}); + arg = blockValues[idx] = tmp; + } + // FIXME: call the `size` member function. For expediency, assume this + // is an std::vector and the size is the scaled delta between the + // first two pointers. Use the unscaled size for now. + auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( + loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); + stVal = p1; + extraBytes = p2; + hasTrailingData = true; + continue; } - // FIXME: call the `size` member function. For expediency, assume this - // is an std::vector and the size is the scaled delta between the - // first two pointers. Use the unscaled size for now. - auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( - loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); - stVal = p1; - extraBytes = p2; - hasTrailingData = true; - continue; - } - if (auto strTy = dyn_cast(quakeTy)) { - if (!isa(arg.getType())) { - // If argument is not a pointer, then struct was promoted into a - // register. - auto *parent = builder.getBlock()->getParentOp(); - auto module = parent->getParentOfType(); - auto tmp = builder.create(loc, quakeTy); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arg.getType()), tmp); - if (cudaq::opt::factory::isX86_64(module)) { - builder.create(loc, arg, cast); - if (cudaq::opt::factory::structUsesTwoArguments(quakeTy)) { - auto arrTy = cudaq::cc::ArrayType::get(builder.getI8Type()); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arrTy), tmp); - auto hiPtr = builder.create( - loc, cudaq::cc::PointerType::get(builder.getI8Type()), cast, - cudaq::cc::ComputePtrArg{8}); - ++iter; - Value nextArg = *iter; - auto cast2 = builder.create( - loc, cudaq::cc::PointerType::get(nextArg.getType()), hiPtr); - builder.create(loc, nextArg, cast2); + if (auto strTy = dyn_cast(quakeTy)) { + if (!isa(arg.getType())) { + // If argument is not a pointer, then struct was promoted into a + // register. 
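+          // On x86-64, a small struct argument may arrive split across two
+          // registers; the code below spills it to a stack temporary (storing
+          // the second register at byte offset 8 when the ABI splits it) so
+          // it can be copied into the message buffer as ordinary memory.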
+ auto *parent = builder.getBlock()->getParentOp(); + auto module = parent->getParentOfType(); + auto tmp = builder.create(loc, quakeTy); + auto cast = builder.create( + loc, cudaq::cc::PointerType::get(arg.getType()), tmp); + if (cudaq::opt::factory::isX86_64(module)) { + builder.create(loc, arg, cast); + if (cudaq::opt::factory::structUsesTwoArguments(quakeTy)) { + auto arrTy = cudaq::cc::ArrayType::get(builder.getI8Type()); + auto cast = builder.create( + loc, cudaq::cc::PointerType::get(arrTy), tmp); + auto hiPtr = builder.create( + loc, cudaq::cc::PointerType::get(builder.getI8Type()), cast, + cudaq::cc::ComputePtrArg{8}); + ++iter; + Value nextArg = *iter; + auto cast2 = builder.create( + loc, cudaq::cc::PointerType::get(nextArg.getType()), hiPtr); + builder.create(loc, nextArg, cast2); + } + } else { + builder.create(loc, arg, cast); } - } else { - builder.create(loc, arg, cast); + // Load the assembled (sub-)struct and insert into the buffer value. + Value v = builder.create(loc, tmp); + stVal = builder.create( + loc, stVal.getType(), stVal, v, idx); + continue; } - // Load the assembled (sub-)struct and insert into the buffer value. - Value v = builder.create(loc, tmp); - stVal = builder.create(loc, stVal.getType(), - stVal, v, idx); + if (!cudaq::cc::isDynamicType(strTy)) { + // struct is static size, so just load the value (byval ptr). + Value v = builder.create(loc, arg); + stVal = builder.create( + loc, stVal.getType(), stVal, v, idx); + continue; + } + auto genTy = cast( + cudaq::opt::factory::genArgumentBufferType(strTy)); + Value zero = builder.create(loc, 0, 64); + auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( + loc, builder, strTy, arg, zero, genTy); + stVal = builder.create( + loc, stVal.getType(), stVal, quakeVal, idx); + extraBytes = + builder.create(loc, extraBytes, recursiveSize); + hasTrailingData = true; continue; } - if (!cudaq::cc::isDynamicType(strTy)) { - // struct is static size, so just load the value (byval ptr). - Value v = builder.create(loc, arg); - stVal = builder.create(loc, stVal.getType(), - stVal, v, idx); + if (auto ptrTy = dyn_cast(inTy)) { + if (isa(ptrTy.getElementType())) { + // Special case: if the argument is a `cudaq::state*`, then just + // pass the pointer. We can do that in this case because the + // synthesis step (which will receive the argument data) is assumed + // to run in the same memory space. + Value argPtr = builder.create(loc, inTy, arg); + stVal = builder.create( + loc, stVal.getType(), stVal, argPtr, idx); + } continue; } - auto genTy = cast( - cudaq::opt::factory::genArgumentBufferType(strTy)); - Value zero = builder.create(loc, 0, 64); - auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( - loc, builder, strTy, arg, zero, genTy); + stVal = builder.create(loc, stVal.getType(), - stVal, quakeVal, idx); - extraBytes = - builder.create(loc, extraBytes, recursiveSize); - hasTrailingData = true; - continue; + stVal, arg, idx); } - if (auto ptrTy = dyn_cast(inTy)) { - if (isa(ptrTy.getElementType())) { - // Special case: if the argument is a `cudaq::state*`, then just pass - // the pointer. We can do that in this case because the synthesis step - // (which will receive the argument data) is assumed to run in the - // same memory space. - Value argPtr = builder.create(loc, inTy, arg); - stVal = builder.create(loc, stVal.getType(), - stVal, argPtr, idx); + + // Compute the struct size without the trailing bytes, structSize, and + // with the trailing bytes, extendedStructSize. 
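+      // Message buffer layout, for illustration:
+      //   [ fixed-size struct (scalars, vector lengths) | trailing data ]
+      //   |<----------------- structSize -------------->|
+      //   |<------------------- extendedStructSize -------------------->|
+      // The trailing region holds std::vector payloads and other
+      // dynamically sized members.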
+ auto nullSt = builder.create(loc, structPtrTy, zero); + Value structSize = + builder.create(loc, i64Ty, structTy); + extendedStructSize = + builder.create(loc, structSize, extraBytes); + + // Allocate our struct to save the argument to. + auto buff = + builder.create(loc, i8Ty, extendedStructSize); + + temp = builder.create(loc, structPtrTy, buff); + + // Store the arguments to the argument section. + builder.create(loc, stVal, temp); + + auto structPtrArrTy = + cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(structTy)); + temp = builder.create(loc, structPtrArrTy, buff); + + // Append the vector data to the end of the struct. + if (hasTrailingData) { + Value vecToBuffer = builder.create( + loc, ptrI8Ty, buff, SmallVector{structSize}); + // Ignore any hidden `this` argument. + for (auto inp : llvm::enumerate(blockValues)) { + Value arg = inp.value(); + Type inTy = arg.getType(); + std::int32_t idx = inp.index(); + Type quakeTy = devFuncTy.getInput(idx); + if (auto stdvecTy = dyn_cast(quakeTy)) { + auto bytes = builder.create( + loc, builder.getI64Type(), stVal, idx); + assert(stdvecTy == devFuncTy.getInput(idx)); + auto ptrInTy = cast(inTy); + vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, + vecToBuffer, ptrInTy); + if (stdvecTy.getElementType() == builder.getI1Type()) { + auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type()); + auto heapPtr = builder.create( + loc, cudaq::cc::PointerType::get(ptrI1Ty), arg, + ArrayRef{0}); + auto loadHeapPtr = + builder.create(loc, heapPtr); + Value heapCast = builder.create( + loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr); + builder.create(loc, std::nullopt, "free", + ArrayRef{heapCast}); + } + } else if (auto strTy = dyn_cast(quakeTy)) { + if (cudaq::cc::isDynamicType(strTy)) + vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg, + temp, vecToBuffer); + } } - continue; } - - stVal = builder.create(loc, stVal.getType(), - stVal, arg, idx); + Value loadThunk = + builder.create(loc, thunkTy, thunkFunc.getName()); + castLoadThunk = + builder.create(loc, ptrI8Ty, loadThunk); + castTemp = builder.create(loc, ptrI8Ty, temp); + resultOffset = + genComputeReturnOffset(loc, builder, devFuncTy, structTy, nullSt); } - // Compute the struct size without the trailing bytes, structSize, and with - // the trailing bytes, extendedStructSize. - auto nullSt = builder.create(loc, structPtrTy, zero); - Value structSize = - builder.create(loc, i64Ty, structTy); - Value extendedStructSize = - builder.create(loc, structSize, extraBytes); - - // Allocate our struct to save the argument to. - auto i8Ty = builder.getI8Type(); - auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - auto buff = - builder.create(loc, i8Ty, extendedStructSize); - - Value temp = builder.create(loc, structPtrTy, buff); - - // Store the arguments to the argument section. - builder.create(loc, stVal, temp); - - auto structPtrArrTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(structTy)); - temp = builder.create(loc, structPtrArrTy, buff); - - // Append the vector data to the end of the struct. - if (hasTrailingData) { - Value vecToBuffer = builder.create( - loc, ptrI8Ty, buff, SmallVector{structSize}); - // Ignore any hidden `this` argument. 
- for (auto inp : llvm::enumerate(blockValues)) { - Value arg = inp.value(); - Type inTy = arg.getType(); - std::int32_t idx = inp.index(); - Type quakeTy = funcTy.getInput(idx); - if (auto stdvecTy = dyn_cast(quakeTy)) { - auto bytes = builder.create( - loc, builder.getI64Type(), stVal, idx); - assert(stdvecTy == funcTy.getInput(idx)); - auto ptrInTy = cast(inTy); - vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, - vecToBuffer, ptrInTy); - if (stdvecTy.getElementType() == builder.getI1Type()) { - auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type()); - auto heapPtr = builder.create( - loc, cudaq::cc::PointerType::get(ptrI1Ty), arg, - ArrayRef{0}); - auto loadHeapPtr = builder.create(loc, heapPtr); - Value heapCast = builder.create( - loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{heapCast}); - } - } else if (auto strTy = dyn_cast(quakeTy)) { - if (cudaq::cc::isDynamicType(strTy)) - vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg, - temp, vecToBuffer); + Value vecArgPtrs; + if (isCodegenArgumentGather(codegenKind)) { + // 1) Allocate and initialize a std::vector object. + const unsigned count = + cudaq::cc::numberOfHiddenArgs(addThisPtr, hiddenSRet); + auto stdVec = builder.create( + loc, cudaq::opt::factory::stlVectorType(ptrI8Ty)); + auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, ptrI8Ty, count); + Value buffer = builder.create(loc, arrPtrTy); + auto i64Ty = builder.getI64Type(); + auto buffSize = builder.create(loc, i64Ty, arrPtrTy); + auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); + auto cast1 = builder.create(loc, ptrPtrTy, buffer); + auto ptr3Ty = cudaq::cc::PointerType::get(ptrPtrTy); + auto stdVec0 = builder.create(loc, ptr3Ty, stdVec); + builder.create(loc, cast1, stdVec0); + auto cast2 = builder.create(loc, i64Ty, buffer); + auto endBuff = builder.create(loc, cast2, buffSize); + auto cast3 = builder.create(loc, ptrPtrTy, endBuff); + auto stdVec1 = builder.create( + loc, ptr3Ty, stdVec, ArrayRef{1}); + builder.create(loc, cast3, stdVec1); + auto stdVec2 = builder.create( + loc, ptr3Ty, stdVec, ArrayRef{2}); + builder.create(loc, cast3, stdVec2); + + // 2) Iterate over the arguments passed in and populate the vector. + SmallVector blockArgs{dropAnyHiddenArguments( + hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; + for (auto iter : llvm::enumerate(blockArgs)) { + std::int32_t i = iter.index(); + auto pos = builder.create( + loc, ptrPtrTy, buffer, ArrayRef{i}); + auto blkArg = iter.value(); + if (isa(blkArg.getType())) { + auto castArg = + builder.create(loc, ptrI8Ty, blkArg); + builder.create(loc, castArg, pos); + continue; } + auto temp = builder.create(loc, blkArg.getType()); + builder.create(loc, blkArg, temp); + auto castTemp = builder.create(loc, ptrI8Ty, temp); + builder.create(loc, castTemp, pos); } + vecArgPtrs = builder.create(loc, ptrI8Ty, stdVec); } // Prepare to call the `launchKernel` runtime library entry point. Value loadKernName = builder.create( loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), kernelNameObj.getSymName()); - Value loadThunk = - builder.create(loc, thunkTy, thunk.getName()); auto castLoadKernName = builder.create(loc, ptrI8Ty, loadKernName); - auto castLoadThunk = - builder.create(loc, ptrI8Ty, loadThunk); - auto castTemp = builder.create(loc, ptrI8Ty, temp); - - auto resultOffset = - genComputeReturnOffset(loc, builder, funcTy, structTy, nullSt); // Generate the call to `launchKernel`. 
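// The three runtime entry points, as declared in the intrinsics table above:
//   altLaunchKernel(kernelName, thunk, commBuffer, buffSize, resultOffset)
//   hybridLaunchKernel(kernelName, thunk, commBuffer, buffSize, resultOffset,
//                      vectorArgPtrs)
//   streamlinedLaunchKernel(kernelName, vectorArgPtrs)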
- builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelFuncName, - ArrayRef{castLoadKernName, castLoadThunk, castTemp, - extendedStructSize, resultOffset}); - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy); + switch (codegenKind) { + case 0: { + assert(vecArgPtrs && "vector must be initialized"); + builder.create( + loc, std::nullopt, cudaq::runtime::launchKernelHybridFuncName, + ArrayRef{castLoadKernName, castLoadThunk, castTemp, + extendedStructSize, resultOffset, vecArgPtrs}); + } break; + case 1: { + builder.create( + loc, std::nullopt, cudaq::runtime::launchKernelFuncName, + ArrayRef{castLoadKernName, castLoadThunk, castTemp, + extendedStructSize, resultOffset}); + } break; + case 2: { + assert(vecArgPtrs && "vector must be initialized"); + builder.create( + loc, std::nullopt, cudaq::runtime::launchKernelStreamlinedFuncName, + ArrayRef{castLoadKernName, vecArgPtrs}); + // For this codegen kind, we drop any results on the floor and return + // random data in registers and/or off the stack. This maintains parity + // with any pre-existing kernel launchers. + SmallVector garbage; + for (auto ty : hostFunc.getFunctionType().getResults()) + garbage.push_back(builder.create(loc, ty)); + builder.create(loc, garbage); + return; + } + default: + hostFunc.emitOpError("codegen kind is invalid"); + return; + } // If and only if this kernel returns a value, unpack and load the // result value(s) from the struct returned by `launchKernel` and return // them to our caller. SmallVector results; - const bool multiResult = funcTy.getResults().size() > 1; - for (auto res : llvm::enumerate(funcTy.getResults())) { + const bool multiResult = devFuncTy.getResults().size() > 1; + for (auto res : llvm::enumerate(devFuncTy.getResults())) { int off = res.index() + offset; if (auto vecTy = dyn_cast(res.value())) { auto eleTy = vecTy.getElementType(); @@ -1352,7 +1446,7 @@ class GenerateKernelExecution auto vecLen = builder.create(loc, gep1); if (vecTy.getElementType() == builder.getI1Type()) { genStdvecBoolFromInitList(loc, builder, - rewriteEntryBlock->getArguments().front(), + hostFuncEntryBlock->getArguments().front(), dataPtr, vecLen); } else { cudaq::IRBuilder irBuilder(builder); @@ -1362,7 +1456,7 @@ class GenerateKernelExecution return; } genStdvecTFromInitList(loc, builder, - rewriteEntryBlock->getArguments().front(), + hostFuncEntryBlock->getArguments().front(), dataPtr, tSize, vecLen); } offset++; @@ -1378,11 +1472,11 @@ class GenerateKernelExecution if (multiResult) return builder.create( loc, cudaq::cc::PointerType::get(res.value()), - rewriteEntryBlock->getArguments().front(), + hostFuncEntryBlock->getArguments().front(), SmallVector{off}); return builder.create( loc, cudaq::cc::PointerType::get(res.value()), - rewriteEntryBlock->getArguments().front()); + hostFuncEntryBlock->getArguments().front()); }(); builder.create(loc, loadVal, sretPtr); } else { @@ -1393,91 +1487,6 @@ class GenerateKernelExecution builder.create(loc, results); } - void genNewHostEntryPoint2(Location loc, OpBuilder &builder, - FunctionType devFuncTy, - LLVM::GlobalOp kernelNameObj, - func::FuncOp hostFunc, bool addThisPtr) { - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(devFuncTy); - const unsigned count = - cudaq::cc::numberOfHiddenArgs(addThisPtr, hiddenSRet); - auto *ctx = builder.getContext(); - auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type()); - - // 0) Pointer our builder into the entry block of the function. 
- Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); - - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(hostFuncEntryBlock); - - // 1) Allocate and initialize a std::vector object. - auto stdVec = builder.create( - loc, cudaq::opt::factory::stlVectorType(i8PtrTy)); - auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, i8PtrTy, count); - Value buffer = builder.create(loc, arrPtrTy); - auto i64Ty = builder.getI64Type(); - auto buffSize = builder.create(loc, i64Ty, arrPtrTy); - auto ptrPtrTy = cudaq::cc::PointerType::get(i8PtrTy); - auto cast1 = builder.create(loc, ptrPtrTy, buffer); - auto ptr3Ty = cudaq::cc::PointerType::get(ptrPtrTy); - auto stdVec0 = builder.create(loc, ptr3Ty, stdVec); - builder.create(loc, cast1, stdVec0); - auto cast2 = builder.create(loc, i64Ty, buffer); - auto endBuff = builder.create(loc, cast2, buffSize); - auto cast3 = builder.create(loc, ptrPtrTy, endBuff); - auto stdVec1 = builder.create( - loc, ptr3Ty, stdVec, ArrayRef{1}); - builder.create(loc, cast3, stdVec1); - auto stdVec2 = builder.create( - loc, ptr3Ty, stdVec, ArrayRef{2}); - builder.create(loc, cast3, stdVec2); - auto zero = builder.create(loc, 0, 64); - auto nullPtr = builder.create(loc, i8PtrTy, zero); - - // 2) Iterate over the arguments passed in and populate the vector. - SmallVector blockArgs{dropAnyHiddenArguments( - hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; - for (auto iter : llvm::enumerate(blockArgs)) { - std::int32_t i = iter.index(); - auto pos = builder.create( - loc, ptrPtrTy, buffer, ArrayRef{i}); - auto blkArg = iter.value(); - if (isa(blkArg.getType())) { - auto castArg = builder.create(loc, i8PtrTy, blkArg); - builder.create(loc, castArg, pos); - continue; - } - auto temp = builder.create(loc, blkArg.getType()); - builder.create(loc, blkArg, temp); - auto castTemp = builder.create(loc, i8PtrTy, temp); - builder.create(loc, castTemp, pos); - } - - auto resultBuffer = builder.create(loc, i8PtrTy); - builder.create(loc, nullPtr, resultBuffer); - auto castResultBuffer = - builder.create(loc, i8PtrTy, resultBuffer); - auto castStdvec = builder.create(loc, i8PtrTy, stdVec); - Value loadKernName = builder.create( - loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), - kernelNameObj.getSymName()); - auto castKernelNameObj = - builder.create(loc, i8PtrTy, loadKernName); - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelVersion2FuncName, - ArrayRef{castKernelNameObj, castStdvec, castResultBuffer}); - - // FIXME: Drop any results on the floor for now and return random data left - // on the stack. (Maintains parity with existing kernel launch.) - if (hostFunc.getFunctionType().getResults().empty()) { - builder.create(loc); - return; - } - // There can only be 1 return type in C++, so this is safe. - Value garbage = builder.create( - loc, hostFunc.getFunctionType().getResult(0)); - builder.create(loc, garbage); - } - /// A kernel function that takes a quantum type argument (also known as a pure /// device kernel) cannot be called directly from C++ (classical) code. It /// must be called via other quantum code. 
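/// For example, a kernel whose signature contains a !quake.veq or !quake.ref
/// argument has no classical representation for that argument, so no
/// host-side entry point can be generated for it.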
@@ -1491,6 +1500,88 @@ class GenerateKernelExecution return true; } + LLVM::LLVMFuncOp registerKernelForExecution(Location loc, OpBuilder &builder, + const std::string &classNameStr, + LLVM::GlobalOp kernelNameObj, + func::FuncOp argsCreatorFunc, + StringRef mangledName) { + auto module = getOperation(); + auto *ctx = builder.getContext(); + auto ptrType = cudaq::cc::PointerType::get(builder.getI8Type()); + auto initFun = builder.create( + loc, classNameStr + ".kernelRegFunc", + LLVM::LLVMFunctionType::get(cudaq::opt::factory::getVoidType(ctx), {})); + OpBuilder::InsertionGuard guard(builder); + auto *initFunEntry = initFun.addEntryBlock(); + builder.setInsertionPointToStart(initFunEntry); + auto kernRef = builder.create( + loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), + kernelNameObj.getSymName()); + auto castKernRef = builder.create(loc, ptrType, kernRef); + builder.create(loc, std::nullopt, CudaqRegisterKernelName, + ValueRange{castKernRef}); + + if (isCodegenPackedData(codegenKind)) { + // Register the argsCreator too + auto ptrPtrType = cudaq::cc::PointerType::get(ptrType); + auto argsCreatorFuncType = FunctionType::get( + ctx, {ptrPtrType, ptrPtrType}, {builder.getI64Type()}); + Value loadArgsCreator = builder.create( + loc, argsCreatorFuncType, argsCreatorFunc.getName()); + auto castLoadArgsCreator = + builder.create(loc, ptrType, loadArgsCreator); + builder.create( + loc, std::nullopt, CudaqRegisterArgsCreator, + ValueRange{castKernRef, castLoadArgsCreator}); + } + + // Check if this is a lambda mangled name + auto demangledPtr = abi::__cxa_demangle(mangledName.str().c_str(), nullptr, + nullptr, nullptr); + if (demangledPtr) { + std::string demangledName(demangledPtr); + demangledName = + std::regex_replace(demangledName, std::regex("::operator()(.*)"), ""); + if (demangledName.find("$_") != std::string::npos) { + auto insertPoint = builder.saveInsertionPoint(); + builder.setInsertionPointToStart(module.getBody()); + + // Create the function if it doesn't already exist. + if (!module.lookupSymbol(CudaqRegisterLambdaName)) + builder.create( + module.getLoc(), CudaqRegisterLambdaName, + LLVM::LLVMFunctionType::get( + cudaq::opt::factory::getVoidType(ctx), + {cudaq::opt::factory::getPointerType(ctx), + cudaq::opt::factory::getPointerType(ctx)})); + + // Create this global name, it is unique for any lambda + // bc classNameStr contains the parentFunc + varName + auto lambdaName = builder.create( + loc, + cudaq::opt::factory::getStringType(ctx, demangledName.size() + 1), + /*isConstant=*/true, LLVM::Linkage::External, + classNameStr + ".lambdaName", + builder.getStringAttr(demangledName + '\0'), /*alignment=*/0); + + builder.restoreInsertionPoint(insertPoint); + auto lambdaRef = builder.create( + loc, cudaq::opt::factory::getPointerType(lambdaName.getType()), + lambdaName.getSymName()); + + auto castLambdaRef = builder.create( + loc, cudaq::opt::factory::getPointerType(ctx), lambdaRef); + auto castKernelRef = builder.create( + loc, cudaq::opt::factory::getPointerType(ctx), castKernRef); + builder.create(loc, std::nullopt, CudaqRegisterLambdaName, + ValueRange{castLambdaRef, castKernelRef}); + } + } + + builder.create(loc, ValueRange{}); + return initFun; + } + void runOnOperation() override { auto module = getOperation(); DataLayoutAnalysis dla(module); // caches module's data layout information. 
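  // Overview of what follows for each kernel found in the mangled-name map:
  // load the launch intrinsic that matches the selected codegen kind, then
  // generate the thunk, the argsCreator (packed-data kinds only), the
  // host-side entry point, and the startup registration function.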
@@ -1508,26 +1599,40 @@ class GenerateKernelExecution if (!mangledNameMap || mangledNameMap.empty()) return; auto irBuilder = cudaq::IRBuilder::atBlockEnd(module.getBody()); - if (altLaunchVersion == 1) + switch (codegenKind) { + case 0: + if (failed(irBuilder.loadIntrinsic( + module, cudaq::runtime::launchKernelHybridFuncName))) { + module.emitError("could not load altLaunchKernel intrinsic."); + return; + } + break; + case 1: if (failed(irBuilder.loadIntrinsic( module, cudaq::runtime::launchKernelFuncName))) { module.emitError("could not load altLaunchKernel intrinsic."); return; } - if (altLaunchVersion == 2) + break; + case 2: if (failed(irBuilder.loadIntrinsic( - module, cudaq::runtime::launchKernelVersion2FuncName))) { + module, cudaq::runtime::launchKernelStreamlinedFuncName))) { module.emitError("could not load altLaunchKernel intrinsic."); return; } + break; + default: + module.emitError("invalid codegen kind value."); + return; + } auto loc = module.getLoc(); auto ptrType = cudaq::cc::PointerType::get(builder.getI8Type()); auto regKern = builder.create( - loc, cudaqRegisterKernelName, FunctionType::get(ctx, {ptrType}, {})); + loc, CudaqRegisterKernelName, FunctionType::get(ctx, {ptrType}, {})); regKern.setPrivate(); auto regArgs = builder.create( - loc, cudaqRegisterArgsCreator, + loc, CudaqRegisterArgsCreator, FunctionType::get(ctx, {ptrType, ptrType}, {})); regArgs.setPrivate(); @@ -1622,7 +1727,7 @@ class GenerateKernelExecution func::FuncOp thunk; func::FuncOp argsCreatorFunc; - if (altLaunchVersion == 1) { + if (isCodegenPackedData(codegenKind)) { // Generate the function that computes the return offset. genReturnOffsetFunction(loc, builder, funcTy, structTy, classNameStr); @@ -1652,94 +1757,15 @@ class GenerateKernelExecution // Generate a new mangled function on the host side to call the // callback function. - if (hostEntryNeeded) { - if (altLaunchVersion == 1) - genNewHostEntryPoint1(loc, builder, funcTy, structTy, kernelNameObj, - thunk, hostFunc, hasThisPtr); - else - genNewHostEntryPoint2(loc, builder, funcTy, kernelNameObj, hostFunc, - hasThisPtr); - } + if (hostEntryNeeded) + genNewHostEntryPoint(loc, builder, funcTy, kernelNameObj, hostFunc, + hasThisPtr, structTy, thunk); // Generate a function at startup to register this kernel as having // been processed for kernel execution. 
- auto initFun = builder.create( - loc, classNameStr + ".kernelRegFunc", - LLVM::LLVMFunctionType::get(cudaq::opt::factory::getVoidType(ctx), - {})); - { - OpBuilder::InsertionGuard guard(builder); - auto *initFunEntry = initFun.addEntryBlock(); - builder.setInsertionPointToStart(initFunEntry); - auto kernRef = builder.create( - loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), - kernelNameObj.getSymName()); - auto castKernRef = - builder.create(loc, ptrType, kernRef); - builder.create(loc, std::nullopt, cudaqRegisterKernelName, - ValueRange{castKernRef}); - - if (altLaunchVersion == 1) { - // Register the argsCreator too - auto ptrPtrType = cudaq::cc::PointerType::get(ptrType); - auto argsCreatorFuncType = FunctionType::get( - ctx, {ptrPtrType, ptrPtrType}, {builder.getI64Type()}); - Value loadArgsCreator = builder.create( - loc, argsCreatorFuncType, argsCreatorFunc.getName()); - auto castLoadArgsCreator = builder.create( - loc, ptrType, loadArgsCreator); - builder.create( - loc, std::nullopt, cudaqRegisterArgsCreator, - ValueRange{castKernRef, castLoadArgsCreator}); - } - - // Check if this is a lambda mangled name - auto demangledPtr = abi::__cxa_demangle(mangledName.str().c_str(), - nullptr, nullptr, nullptr); - if (demangledPtr) { - std::string demangledName(demangledPtr); - demangledName = std::regex_replace( - demangledName, std::regex("::operator()(.*)"), ""); - if (demangledName.find("$_") != std::string::npos) { - auto insertPoint = builder.saveInsertionPoint(); - builder.setInsertionPointToStart(module.getBody()); - - // Create the function if it doesn't already exist. - if (!module.lookupSymbol(cudaqRegisterLambdaName)) - builder.create( - module.getLoc(), cudaqRegisterLambdaName, - LLVM::LLVMFunctionType::get( - cudaq::opt::factory::getVoidType(ctx), - {cudaq::opt::factory::getPointerType(ctx), - cudaq::opt::factory::getPointerType(ctx)})); - - // Create this global name, it is unique for any lambda - // bc classNameStr contains the parentFunc + varName - auto lambdaName = builder.create( - loc, - cudaq::opt::factory::getStringType(ctx, - demangledName.size() + 1), - /*isConstant=*/true, LLVM::Linkage::External, - classNameStr + ".lambdaName", - builder.getStringAttr(demangledName + '\0'), /*alignment=*/0); - - builder.restoreInsertionPoint(insertPoint); - auto lambdaRef = builder.create( - loc, cudaq::opt::factory::getPointerType(lambdaName.getType()), - lambdaName.getSymName()); - - auto castLambdaRef = builder.create( - loc, cudaq::opt::factory::getPointerType(ctx), lambdaRef); - auto castKernelRef = builder.create( - loc, cudaq::opt::factory::getPointerType(ctx), castKernRef); - builder.create( - loc, std::nullopt, cudaqRegisterLambdaName, - ValueRange{castLambdaRef, castKernelRef}); - } - } - - builder.create(loc, ValueRange{}); - } + auto initFun = + registerKernelForExecution(loc, builder, classNameStr, kernelNameObj, + argsCreatorFunc, mangledName); // Create a global with a default ctor to be run at program startup. 
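// (Illustration: the effect is like the C++ idiom of registering at static
// initialization time, e.g. `static bool reg = (kernelRegFunc(), true);`,
// executed before main() runs.)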
// The ctor will execute the above function, which will register this diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index b0287073a7..2a768a2061 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -7,7 +7,8 @@ // ========================================================================== // // RUN: cudaq-opt --kernel-execution %s | FileCheck %s -// RUN: cudaq-opt --kernel-execution=alt-launch=2 %s | FileCheck --check-prefix=ALT2 %s +// RUN: cudaq-opt --kernel-execution=codegen=2 %s | FileCheck --check-prefix=STREAM %s +// RUN: cudaq-opt --kernel-execution=codegen=0 %s | FileCheck --check-prefix=HYBRID %s module attributes {quake.mangled_name_map = { __nvqpp__mlirgen__ghz = "_ZN3ghzclEi"}} { @@ -96,13 +97,13 @@ module attributes {quake.mangled_name_map = { // CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr> // CHECK: cc.store %[[VAL_4]], %[[VAL_10]] : !cc.ptr> // CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> // CHECK: %[[VAL_13:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_15:.*]] = cc.func_ptr %[[VAL_13]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr x ?>>) -> !cc.ptr // CHECK: %[[VAL_17:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_8]], %[[VAL_18]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_11]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr // CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr @@ -160,35 +161,74 @@ module attributes {quake.mangled_name_map = { // CHECK: } -// ALT2-LABEL: func.func @_ZN3ghzclEi( -// ALT2-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// ALT2: %[[VAL_2:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// ALT2: %[[VAL_3:.*]] = cc.alloca !cc.array x 1> -// ALT2: %[[VAL_4:.*]] = cc.sizeof !cc.array x 1> : i64 -// ALT2: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> !cc.ptr> -// ALT2: %[[VAL_6:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// ALT2: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr>> -// ALT2: %[[VAL_7:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> i64 -// ALT2: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_4]] : i64 -// ALT2: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (i64) -> !cc.ptr> -// ALT2: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// ALT2: cc.store %[[VAL_9]], %[[VAL_10]] : !cc.ptr>> -// ALT2: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// ALT2: cc.store %[[VAL_9]], %[[VAL_11]] : !cc.ptr>> -// ALT2: %[[VAL_12:.*]] = arith.constant 0 : i64 -// ALT2: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (i64) -> !cc.ptr -// ALT2: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_3]][0] : (!cc.ptr x 1>>) -> !cc.ptr> -// ALT2: %[[VAL_15:.*]] = cc.alloca i32 -// ALT2: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr -// ALT2: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr -// ALT2: cc.store %[[VAL_16]], 
%[[VAL_14]] : !cc.ptr> -// ALT2: %[[VAL_17:.*]] = cc.alloca !cc.ptr -// ALT2: cc.store %[[VAL_13]], %[[VAL_17]] : !cc.ptr> -// ALT2: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr>) -> !cc.ptr -// ALT2: %[[VAL_19:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// ALT2: %[[VAL_20:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// ALT2: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!llvm.ptr>) -> !cc.ptr -// ALT2: call @altLaunchKernelUsingLocalJIT(%[[VAL_21]], %[[VAL_19]], %[[VAL_18]]) : (!cc.ptr, !cc.ptr, !cc.ptr) -> () -// ALT2: %[[VAL_22:.*]] = cc.undef f64 -// ALT2: return %[[VAL_22]] : f64 -// ALT2: } +// STREAM-LABEL: func.func @_ZN3ghzclEi( +// STREAM-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { +// STREAM: %[[VAL_2:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// STREAM: %[[VAL_3:.*]] = cc.alloca !cc.array x 1> +// STREAM: %[[VAL_4:.*]] = cc.sizeof !cc.array x 1> : i64 +// STREAM: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAM: %[[VAL_6:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAM: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr>> +// STREAM: %[[VAL_7:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> i64 +// STREAM: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_4]] : i64 +// STREAM: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (i64) -> !cc.ptr> +// STREAM: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAM: cc.store %[[VAL_9]], %[[VAL_10]] : !cc.ptr>> +// STREAM: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAM: cc.store %[[VAL_9]], %[[VAL_11]] : !cc.ptr>> +// STREAM: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_3]][0] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAM: %[[VAL_15:.*]] = cc.alloca i32 +// STREAM: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr +// STREAM: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr +// STREAM: cc.store %[[VAL_16]], %[[VAL_14]] : !cc.ptr> +// STREAM: %[[VAL_19:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// STREAM: %[[VAL_20:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// STREAM: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!llvm.ptr>) -> !cc.ptr +// STREAM: call @streamlinedLaunchKernel(%[[VAL_21]], %[[VAL_19]]) : (!cc.ptr, !cc.ptr) -> () +// STREAM: %[[VAL_22:.*]] = cc.undef f64 +// STREAM: return %[[VAL_22]] : f64 +// STREAM: } + +// HYBRID-LABEL: func.func @_ZN3ghzclEi( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { +// HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> +// HYBRID: %[[VAL_3:.*]] = arith.constant 0 : i64 +// HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> +// HYBRID: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (i64) -> !cc.ptr> +// HYBRID: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_7:.*]] = arith.addi %[[VAL_6]], %[[VAL_3]] : i64 +// HYBRID: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_7]] : i64] +// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr> +// HYBRID: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr> +// HYBRID: %[[VAL_10:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr x ?>> +// HYBRID: %[[VAL_11:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_12:.*]] = cc.func_ptr %[[VAL_11]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// HYBRID: %[[VAL_13:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr 
x ?>>) -> !cc.ptr +// HYBRID: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 +// HYBRID: %[[VAL_16:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// HYBRID: %[[VAL_17:.*]] = cc.alloca !cc.array x 1> +// HYBRID: %[[VAL_18:.*]] = cc.sizeof !cc.array x 1> : i64 +// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_20:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_19]], %[[VAL_20]] : !cc.ptr>> +// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> i64 +// HYBRID: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_18]] : i64 +// HYBRID: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (i64) -> !cc.ptr> +// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_16]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_23]], %[[VAL_24]] : !cc.ptr>> +// HYBRID: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_16]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_23]], %[[VAL_25]] : !cc.ptr>> +// HYBRID: %[[VAL_26:.*]] = cc.compute_ptr %[[VAL_17]][0] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_27:.*]] = cc.alloca i32 +// HYBRID: cc.store %[[VAL_1]], %[[VAL_27]] : !cc.ptr +// HYBRID: %[[VAL_28:.*]] = cc.cast %[[VAL_27]] : (!cc.ptr) -> !cc.ptr +// HYBRID: cc.store %[[VAL_28]], %[[VAL_26]] : !cc.ptr> +// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// HYBRID: %[[VAL_30:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// HYBRID: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!llvm.ptr>) -> !cc.ptr +// HYBRID: call @hybridLaunchKernel(%[[VAL_31]], %[[VAL_12]], %[[VAL_13]], %[[VAL_7]], %[[VAL_15]], %[[VAL_29]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> () +// HYBRID: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_10]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr +// HYBRID: %[[VAL_33:.*]] = cc.load %[[VAL_32]] : !cc.ptr +// HYBRID: return %[[VAL_33]] : f64 +// HYBRID: } diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke index e5b8a7f24c..fa3a8a5492 100644 --- a/test/Quake/kernel_exec-2.qke +++ b/test/Quake/kernel_exec-2.qke @@ -65,12 +65,12 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { // CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_21]], %[[VAL_26]], %[[VAL_22]], %[[VAL_23]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () // CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_21]] : // CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_90]][%[[VAL_22]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> // CHECK: %[[VAL_29:.*]] = constant @function_hawaiian.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_31:.*]] = cc.func_ptr %[[VAL_29]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr x ?>>) -> !cc.ptr // CHECK: %[[VAL_33:.*]] = arith.constant 2147483647 : i64 +// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: return // CHECK: } diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke index 
79f090bb4c..59a546a046 100644 --- a/test/Quake/return_vector.qke +++ b/test/Quake/return_vector.qke @@ -54,12 +54,12 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> // CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr, i64}>>) -> i64 +// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr> @@ -121,12 +121,12 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> // CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr, i64}>>) -> i64 +// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr> From d117ad7b7615312e624c7b941997e41064630f23 Mon Sep 17 00:00:00 2001 From: Ben Howe <141149032+bmhowe23@users.noreply.github.com> Date: Tue, 13 Aug 2024 23:15:57 -0500 Subject: [PATCH 3/4] Follow-up to #1603 - fix `kronprod` bug occurring with Nvidia simulators (#2077) --- python/tests/builder/test_qalloc_init.py | 19 +++++++++++++++++++ .../custatevec/CuStateVecCircuitSimulator.cu | 2 +- unittests/integration/get_state_tester.cpp | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/python/tests/builder/test_qalloc_init.py b/python/tests/builder/test_qalloc_init.py index e0cc626f02..623c383f3c 100644 --- a/python/tests/builder/test_qalloc_init.py +++ b/python/tests/builder/test_qalloc_init.py @@ -171,6 +171,25 @@ def test_kernel_complex_params_rotate_f64(): assert '10' in counts +@skipIfNvidiaFP64NotInstalled +def test_kernel_complex_force_kron(): + cudaq.reset_target() + cudaq.set_target('nvidia-fp64') + + c = [0. 
+ 0j] * 1024
+    c[1023] = 1j
+
+    kernel, vec = cudaq.make_kernel(list[complex])
+    p = kernel.qalloc(1)
+    q = kernel.qalloc(vec)
+    kernel.mz(p)
+    kernel.mz(q)
+
+    counts = cudaq.sample(kernel, c)
+    assert len(counts) == 1
+    assert '01111111111' in counts
+
+
 @skipIfNvidiaNotInstalled
 def test_kernel_complex_params_rotate_f32():
     cudaq.reset_target()
diff --git a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu
index ed034822d7..4693eefd36 100644
--- a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu
+++ b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu
@@ -95,7 +95,7 @@ void kronprod(uint32_t n_blocks, int32_t threads_per_block,
               void *arr0) {
   cudaKronprod<<<n_blocks, threads_per_block>>>(
       tsize1, reinterpret_cast(arr1),
-      (1UL << tsize2), reinterpret_cast(arr2),
+      tsize2, reinterpret_cast(arr2),
       reinterpret_cast(arr0));
 }
 
diff --git a/unittests/integration/get_state_tester.cpp b/unittests/integration/get_state_tester.cpp
index 00f3ca1dec..452c55c8bb 100644
--- a/unittests/integration/get_state_tester.cpp
+++ b/unittests/integration/get_state_tester.cpp
@@ -165,3 +165,22 @@ CUDAQ_TEST(GetStateTester, checkOverlapFromHostVector) {
   EXPECT_NEAR(1.0, state.overlap(hostState).real(), 1e-3);
 }
 #endif
+
+CUDAQ_TEST(GetStateTester, checkKron) {
+  auto force_kron = [](std::vector<std::complex<double>> vec) __qpu__ {
+    cudaq::qubit a;
+    cudaq::qvector qvec(vec);
+  };
+  // Construct a 6-qubit |111111> state
+  const int num_qubits_input_state = 6;
+  std::vector<std::complex<double>> hostStateData(
+      1 << num_qubits_input_state);
+  hostStateData[hostStateData.size() - 1] = 1.0;
+
+  auto counts = cudaq::sample(force_kron, hostStateData);
+
+  // Expect a single state with a deterministic outcome
+  EXPECT_EQ(counts.size(), 1);
+  EXPECT_EQ(counts.begin()->first,
+            "0" + std::string(num_qubits_input_state, '1'));
+}

From a817c65a0639b2c604786f93f300b529142ce06f Mon Sep 17 00:00:00 2001
From: Eric Schweitz
Date: Wed, 14 Aug 2024 14:46:53 -0700
Subject: [PATCH 4/4] [core] Simplify autogenerated code. (#2083)

Use the cc.offsetof operation to reduce the size of the generated code.
---
 lib/Optimizer/CodeGen/CCToLLVM.cpp            | 34 +++++++++++++++++--
 .../Transforms/GenKernelExecution.cpp         | 22 ++++--------
 test/Quake/kernel_exec-1.qke                  | 16 ++++-----
 test/Quake/return_vector.qke                  | 10 ++----
 4 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/lib/Optimizer/CodeGen/CCToLLVM.cpp b/lib/Optimizer/CodeGen/CCToLLVM.cpp
index aa025656b8..01596ae760 100644
--- a/lib/Optimizer/CodeGen/CCToLLVM.cpp
+++ b/lib/Optimizer/CodeGen/CCToLLVM.cpp
@@ -495,6 +495,33 @@ class SizeOfOpPattern : public ConvertOpToLLVMPattern<cudaq::cc::SizeOfOp> {
   }
 };
 
+class OffsetOfOpPattern : public ConvertOpToLLVMPattern<cudaq::cc::OffsetOfOp> {
+public:
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  // Use the GEP approach for now. LLVM is planning to remove support for this
+  // at some point. See: https://github.com/llvm/llvm-project/issues/71507
+  LogicalResult
+  matchAndRewrite(cudaq::cc::OffsetOfOp offsetOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto inputTy = offsetOp.getInputType();
+    SmallVector<cudaq::cc::ComputePtrArg> args;
+    for (std::int32_t i : offsetOp.getConstantIndices())
+      args.push_back(i);
+    auto resultTy = offsetOp.getType();
+    auto loc = offsetOp.getLoc();
+    // TODO: replace this with some target-specific memory layout computation
+    // when we upgrade to a newer MLIR.
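+    // What follows is the classic offsetof idiom: materialize a zero,
+    // reinterpret it as a null pointer to the aggregate type, compute the
+    // address of the requested member, and read that address back as an
+    // integer, i.e. offsetof(T, m) == (size_t)&(((T *)0)->m).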
+    auto zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 64);
+    auto ptrTy = cudaq::cc::PointerType::get(inputTy);
+    auto nul = rewriter.create<cudaq::cc::CastOp>(loc, ptrTy, zero);
+    Value nextPtr =
+        rewriter.create<cudaq::cc::ComputePtrOp>(loc, ptrTy, nul, args);
+    rewriter.replaceOpWithNewOp<cudaq::cc::CastOp>(offsetOp, resultTy, nextPtr);
+    return success();
+  }
+};
+
 class StdvecDataOpPattern
     : public ConvertOpToLLVMPattern<cudaq::cc::StdvecDataOp> {
 public:
@@ -647,7 +674,8 @@ void cudaq::opt::populateCCToLLVMPatterns(LLVMTypeConverter &typeConverter,
                   ComputePtrOpPattern, CreateStringLiteralOpPattern,
                   ExtractValueOpPattern, FuncToPtrOpPattern, GlobalOpPattern,
                   InsertValueOpPattern, InstantiateCallableOpPattern,
-                  LoadOpPattern, PoisonOpPattern, SizeOfOpPattern,
-                  StdvecDataOpPattern, StdvecInitOpPattern, StdvecSizeOpPattern,
-                  StoreOpPattern, UndefOpPattern>(typeConverter);
+                  LoadOpPattern, OffsetOfOpPattern, PoisonOpPattern,
+                  SizeOfOpPattern, StdvecDataOpPattern, StdvecInitOpPattern,
+                  StdvecSizeOpPattern, StoreOpPattern, UndefOpPattern>(
+                      typeConverter);
 }
diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp
index 7c87b9f4a7..7927a1995d 100644
--- a/lib/Optimizer/Transforms/GenKernelExecution.cpp
+++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -287,18 +287,13 @@ class GenerateKernelExecution
   Value genComputeReturnOffset(Location loc, OpBuilder &builder,
                                FunctionType funcTy,
-                               cudaq::cc::StructType msgStructTy,
-                               Value nullSt) {
-    auto i64Ty = builder.getI64Type();
+                               cudaq::cc::StructType msgStructTy) {
     if (funcTy.getNumResults() == 0)
       return builder.create<arith::ConstantIntOp>(loc, NoResultOffset, 64);
-    auto members = msgStructTy.getMembers();
     std::int32_t numKernelArgs = funcTy.getNumInputs();
-    auto resTy = cudaq::cc::PointerType::get(members[numKernelArgs]);
-    auto gep = builder.create<cudaq::cc::ComputePtrOp>(
-        loc, resTy, nullSt,
-        SmallVector<cudaq::cc::ComputePtrArg>{numKernelArgs});
-    return builder.create<cudaq::cc::CastOp>(loc, i64Ty, gep);
+    auto i64Ty = builder.getI64Type();
+    return builder.create<cudaq::cc::OffsetOfOp>(
+        loc, i64Ty, msgStructTy, ArrayRef<std::int32_t>{numKernelArgs});
   }
 
   /// Create a function that determines the return value offset in the message
@@ -315,11 +310,8 @@ class GenerateKernelExecution
     OpBuilder::InsertionGuard guard(builder);
     auto *entry = returnOffsetFunc.addEntryBlock();
     builder.setInsertionPointToStart(entry);
-    auto ptrTy = cudaq::cc::PointerType::get(msgStructTy);
-    auto zero = builder.create<arith::ConstantIntOp>(loc, 0, 64);
-    auto basePtr = builder.create<cudaq::cc::CastOp>(loc, ptrTy, zero);
     auto result =
-        genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy, basePtr);
+        genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy);
     builder.create<func::ReturnOp>(loc, result);
   }
 
@@ -1272,7 +1264,6 @@ class GenerateKernelExecution
     // Compute the struct size without the trailing bytes, structSize, and
     // with the trailing bytes, extendedStructSize.
- auto nullSt = builder.create(loc, structPtrTy, zero); Value structSize = builder.create(loc, i64Ty, structTy); extendedStructSize = @@ -1332,8 +1323,7 @@ class GenerateKernelExecution castLoadThunk = builder.create(loc, ptrI8Ty, loadThunk); castTemp = builder.create(loc, ptrI8Ty, temp); - resultOffset = - genComputeReturnOffset(loc, builder, devFuncTy, structTy, nullSt); + resultOffset = genComputeReturnOffset(loc, builder, devFuncTy, structTy); } Value vecArgPtrs; diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index 2a768a2061..4463b01c71 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -87,10 +87,9 @@ module attributes {quake.mangled_name_map = { // CHECK-LABEL: func.func @_ZN3ghzclEi( // CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 +// CHECK-DAG: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (i64) -> !cc.ptr> // CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 // CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : i64 // CHECK: %[[VAL_9:.*]] = cc.alloca i8[%[[VAL_8]] : i64] @@ -100,8 +99,7 @@ module attributes {quake.mangled_name_map = { // CHECK: %[[VAL_13:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> // CHECK: %[[VAL_15:.*]] = cc.func_ptr %[[VAL_13]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_17:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_18:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 // CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> // CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_8]], %[[VAL_18]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () @@ -132,8 +130,8 @@ module attributes {quake.mangled_name_map = { // CHECK: } // CHECK-LABEL: func.func @ghz.argsCreator( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { // CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> // CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> // CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 @@ -194,7 +192,6 @@ module attributes {quake.mangled_name_map = { // HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> // HYBRID: %[[VAL_3:.*]] = arith.constant 0 : i64 // HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// HYBRID: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (i64) -> !cc.ptr> // HYBRID: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 // HYBRID: %[[VAL_7:.*]] = arith.addi %[[VAL_6]], %[[VAL_3]] : i64 // HYBRID: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_7]] : i64] @@ -204,8 +201,7 @@ module attributes {quake.mangled_name_map = { // HYBRID: %[[VAL_11:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> // HYBRID: %[[VAL_12:.*]] = cc.func_ptr %[[VAL_11]] : 
((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // HYBRID: %[[VAL_13:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr -// HYBRID: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 +// HYBRID: %[[VAL_15:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 // HYBRID: %[[VAL_16:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> // HYBRID: %[[VAL_17:.*]] = cc.alloca !cc.array x 1> // HYBRID: %[[VAL_18:.*]] = cc.sizeof !cc.array x 1> : i64 diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke index 59a546a046..3394ada1a2 100644 --- a/test/Quake/return_vector.qke +++ b/test/Quake/return_vector.qke @@ -46,18 +46,15 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 4 : i64 // CHECK: %[[VAL_4:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> // CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_5]] : (i64) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 // CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] // CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>> // CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr, i64}>>) -> i64 +// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 // CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> // CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () @@ -113,18 +110,15 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 8 : i64 // CHECK: %[[VAL_4:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> // CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_5]] : (i64) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 // CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] // CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>> // CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, i64}>}>>) -> 
!cc.ptr, i64}>> -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr, i64}>>) -> i64 +// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 // CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> // CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ()
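
Note: the kernel_exec-1.qke and return_vector.qke churn above is mechanical.
A minimal sketch of the simplification, hand-written here with illustrative
value names (not the FileCheck captures the tests use), for the message
struct !cc.struct<{i32, f64}> from kernel_exec-1.qke:

```
// Before patch 4: four ops to recover a compile-time layout constant.
%zero = arith.constant 0 : i64
%null = cc.cast %zero : (i64) -> !cc.ptr<!cc.struct<{i32, f64}>>
%slot = cc.compute_ptr %null[1] : (!cc.ptr<!cc.struct<{i32, f64}>>) -> !cc.ptr<f64>
%off = cc.cast %slot : (!cc.ptr<f64>) -> i64

// After patch 4: one op expresses the same layout query.
%off2 = cc.offsetof !cc.struct<{i32, f64}> [1] : i64
```

OffsetOfOpPattern in CCToLLVM.cpp still lowers the op through the same
null-pointer GEP, so the layout computation is now spelled once in the
conversion pattern rather than at every kernel launch site.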