diff --git a/include/cudaq/Optimizer/Builder/Runtime.h b/include/cudaq/Optimizer/Builder/Runtime.h index bf81843fd9..c25a5cd2ee 100644 --- a/include/cudaq/Optimizer/Builder/Runtime.h +++ b/include/cudaq/Optimizer/Builder/Runtime.h @@ -23,7 +23,8 @@ static constexpr unsigned cudaqGenPrefixLength = sizeof(cudaqGenPrefixName) - 1; /// compile time (see `cudaqGenPrefixName`) or it can be rewritten to call back /// to the runtime library (and be handled at runtime). static constexpr const char launchKernelFuncName[] = "altLaunchKernel"; -static constexpr const char launchKernelVersion2FuncName[] = - "altLaunchKernelUsingLocalJIT"; +static constexpr const char launchKernelStreamlinedFuncName[] = + "streamlinedLaunchKernel"; +static constexpr const char launchKernelHybridFuncName[] = "hybridLaunchKernel"; } // namespace cudaq::runtime diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td index e8e5a79b0d..8fa3eb8bd0 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.td +++ b/include/cudaq/Optimizer/Transforms/Passes.td @@ -302,6 +302,27 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> { use of library side argument conversion and the argument synthesis pass. More generally, this option can be used when JIT compiling kernels on the client/host/local processor. + + There are multiple code generation kinds that are supported for flexibility + and streamlining the kernel launch process. These tend to be related to the + target and runtime environment the compiler is being run in and can involve + some technical issues that require deeper understanding of the entire + process. In general, it is not recommended for users to change this value. + + ``` + codegen kind description + + 0 Hybrid. A combination of 1 and 2 that allows early and + streamlined JIT compilation but also supports return values + and dynamic parameters. + 1 Client-server interchange format. 
Supports kernels that + return results and dynamic parameters. + 2 Streamlined for JIT. The kernel will be converted to a + nullary function with no results. Return values from the + kernel are ignored, if present. All parameter values are to + be inlined by the JIT compiler, so this codegen kind does not + support any dynamic parameters. + ``` }]; let dependentDialects = ["cudaq::cc::CCDialect", "mlir::LLVM::LLVMDialect"]; @@ -311,8 +332,8 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> { /*default=*/"\"-\"", "Name of output file.">, Option<"startingArgIdx", "starting-arg-idx", "std::size_t", /*default=*/"0", "The starting argument index for the argsCreator.">, - Option<"altLaunchVersion", "alt-launch", "std::size_t", /*default=*/"1", - "Specify the version of altLaunchKernel to be used."> + Option<"codegenKind", "codegen", "std::size_t", /*default=*/"1", + "Set the kind of code to generate for the launches."> ]; } diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index 12030de199..5daceec94b 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -293,25 +293,33 @@ static constexpr IntrinsicCode intrinsicTable[] = { return %3 : !cc.struct<{!cc.ptr, i64}> })#"}, - {cudaq::runtime::launchKernelFuncName, // altLaunchKernel + // altLaunchKernel(kernelName, thunk, commBuffer, buffSize, resultOffset) + {cudaq::runtime::launchKernelFuncName, {}, R"#( func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ())#"}, - {cudaq::runtime:: - launchKernelVersion2FuncName, // altLaunchKernelUsingLocalJIT + {"free", {}, "func.func private @free(!cc.ptr) -> ()"}, + + // hybridLaunchKernel(kernelName, thunk, commBuffer, buffSize, + // resultOffset, vectorArgPtrs) + {cudaq::runtime::launchKernelHybridFuncName, {}, R"#( - func.func private @altLaunchKernelUsingLocalJIT(!cc.ptr, !cc.ptr, !cc.ptr) -> ())#"}, - - {"free", {}, "func.func private @free(!cc.ptr) -> 
()"}, + func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> ())#"}, {cudaq::llvmMemCopyIntrinsic, // llvm.memcpy.p0i8.p0i8.i64 {}, R"#( func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) -> ())#"}, - {"malloc", {}, "func.func private @malloc(i64) -> !cc.ptr"}}; + {"malloc", {}, "func.func private @malloc(i64) -> !cc.ptr"}, + + // streamlinedLaunchKernel(kernelName, vectorArgPtrs) + {cudaq::runtime::launchKernelStreamlinedFuncName, + {}, + R"#( + func.func private @streamlinedLaunchKernel(!cc.ptr, !cc.ptr) -> ())#"}}; static constexpr std::size_t intrinsicTableSize = sizeof(intrinsicTable) / sizeof(IntrinsicCode); diff --git a/lib/Optimizer/CodeGen/CCToLLVM.cpp b/lib/Optimizer/CodeGen/CCToLLVM.cpp index aa025656b8..01596ae760 100644 --- a/lib/Optimizer/CodeGen/CCToLLVM.cpp +++ b/lib/Optimizer/CodeGen/CCToLLVM.cpp @@ -495,6 +495,33 @@ class SizeOfOpPattern : public ConvertOpToLLVMPattern { } }; +class OffsetOfOpPattern : public ConvertOpToLLVMPattern { +public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + // Use the GEP approach for now. LLVM is planning to remove support for this + // at some point. See: https://github.com/llvm/llvm-project/issues/71507 + LogicalResult + matchAndRewrite(cudaq::cc::OffsetOfOp offsetOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto inputTy = offsetOp.getInputType(); + SmallVector args; + for (std::int32_t i : offsetOp.getConstantIndices()) + args.push_back(i); + auto resultTy = offsetOp.getType(); + auto loc = offsetOp.getLoc(); + // TODO: replace this with some target-specific memory layout computation + // when we upgrade to a newer MLIR. 
+ auto zero = rewriter.create(loc, 0, 64); + auto ptrTy = cudaq::cc::PointerType::get(inputTy); + auto nul = rewriter.create(loc, ptrTy, zero); + Value nextPtr = + rewriter.create(loc, ptrTy, nul, args); + rewriter.replaceOpWithNewOp(offsetOp, resultTy, nextPtr); + return success(); + } +}; + class StdvecDataOpPattern : public ConvertOpToLLVMPattern { public: @@ -647,7 +674,8 @@ void cudaq::opt::populateCCToLLVMPatterns(LLVMTypeConverter &typeConverter, ComputePtrOpPattern, CreateStringLiteralOpPattern, ExtractValueOpPattern, FuncToPtrOpPattern, GlobalOpPattern, InsertValueOpPattern, InstantiateCallableOpPattern, - LoadOpPattern, PoisonOpPattern, SizeOfOpPattern, - StdvecDataOpPattern, StdvecInitOpPattern, StdvecSizeOpPattern, - StoreOpPattern, UndefOpPattern>(typeConverter); + LoadOpPattern, OffsetOfOpPattern, PoisonOpPattern, + SizeOfOpPattern, StdvecDataOpPattern, StdvecInitOpPattern, + StdvecSizeOpPattern, StoreOpPattern, UndefOpPattern>( + typeConverter); } diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 78176e5387..7927a1995d 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -34,19 +34,29 @@ namespace cudaq::opt { using namespace mlir; -namespace { // Define some constant function name strings. -static constexpr const char cudaqRegisterLambdaName[] = +static constexpr const char CudaqRegisterLambdaName[] = "cudaqRegisterLambdaName"; -static constexpr const char cudaqRegisterArgsCreator[] = +static constexpr const char CudaqRegisterArgsCreator[] = "cudaqRegisterArgsCreator"; -static constexpr const char cudaqRegisterKernelName[] = +static constexpr const char CudaqRegisterKernelName[] = "cudaqRegisterKernelName"; /// This value is used to indicate that a kernel does not return a result. static constexpr std::uint64_t NoResultOffset = std::numeric_limits::max(); +/// Generate code for packing arguments as raw data. 
+static bool isCodegenPackedData(std::size_t kind) { + return kind == 0 || kind == 1; +} + +/// Generate code that gathers the arguments for conversion and synthesis. +static bool isCodegenArgumentGather(std::size_t kind) { + return kind == 0 || kind == 2; +} + +namespace { class GenerateKernelExecution : public cudaq::opt::impl::GenerateKernelExecutionBase< GenerateKernelExecution> { @@ -277,18 +287,13 @@ class GenerateKernelExecution Value genComputeReturnOffset(Location loc, OpBuilder &builder, FunctionType funcTy, - cudaq::cc::StructType msgStructTy, - Value nullSt) { - auto i64Ty = builder.getI64Type(); + cudaq::cc::StructType msgStructTy) { if (funcTy.getNumResults() == 0) return builder.create(loc, NoResultOffset, 64); - auto members = msgStructTy.getMembers(); std::int32_t numKernelArgs = funcTy.getNumInputs(); - auto resTy = cudaq::cc::PointerType::get(members[numKernelArgs]); - auto gep = builder.create( - loc, resTy, nullSt, - SmallVector{numKernelArgs}); - return builder.create(loc, i64Ty, gep); + auto i64Ty = builder.getI64Type(); + return builder.create( + loc, i64Ty, msgStructTy, ArrayRef{numKernelArgs}); } /// Create a function that determines the return value offset in the message @@ -305,11 +310,8 @@ class GenerateKernelExecution OpBuilder::InsertionGuard guard(builder); auto *entry = returnOffsetFunc.addEntryBlock(); builder.setInsertionPointToStart(entry); - auto ptrTy = cudaq::cc::PointerType::get(msgStructTy); - auto zero = builder.create(loc, 0, 64); - auto basePtr = builder.create(loc, ptrTy, zero); auto result = - genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy, basePtr); + genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy); builder.create(loc, result); } @@ -1116,227 +1118,309 @@ class GenerateKernelExecution /// library. Pass along the thunk, so the runtime can call the quantum /// circuit. These entry points are `operator()` member functions in a class, /// so account for the `this` argument here. 
- void genNewHostEntryPoint1(Location loc, OpBuilder &builder, - FunctionType funcTy, - cudaq::cc::StructType structTy, - LLVM::GlobalOp kernelNameObj, func::FuncOp thunk, - func::FuncOp rewriteEntry, bool addThisPtr) { + void genNewHostEntryPoint(Location loc, OpBuilder &builder, + FunctionType devFuncTy, + LLVM::GlobalOp kernelNameObj, func::FuncOp hostFunc, + bool addThisPtr, cudaq::cc::StructType structTy, + func::FuncOp thunkFunc) { auto *ctx = builder.getContext(); auto i64Ty = builder.getI64Type(); - auto offset = funcTy.getNumInputs(); + auto offset = devFuncTy.getNumInputs(); auto thunkTy = getThunkType(ctx); auto structPtrTy = cudaq::cc::PointerType::get(structTy); - Block *rewriteEntryBlock = rewriteEntry.addEntryBlock(); + Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); + const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(devFuncTy); OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(rewriteEntryBlock); - Value stVal = builder.create(loc, structTy); + builder.setInsertionPointToStart(hostFuncEntryBlock); + auto i8Ty = builder.getI8Type(); + auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - // Process all the arguments for the original call, ignoring any hidden - // arguments (such as the `this` pointer). - auto zero = builder.create(loc, 0, 64); - Value extraBytes = zero; - bool hasTrailingData = false; - SmallVector blockArgs{dropAnyHiddenArguments( - rewriteEntryBlock->getArguments(), funcTy, addThisPtr)}; - std::int32_t idx = 0; - SmallVector blockValues(blockArgs.size()); - std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); - for (auto iter = blockArgs.begin(), end = blockArgs.end(); iter != end; - ++iter, ++idx) { - Value arg = *iter; - Type inTy = arg.getType(); - Type quakeTy = funcTy.getInput(idx); - // If the argument is a callable, skip it. - if (isa(quakeTy)) - continue; - // If the argument is an empty struct, skip it. 
- if (auto strTy = dyn_cast(quakeTy)) - if (strTy.isEmpty()) + Value temp; + Value castTemp; + Value resultOffset; + Value castLoadThunk; + Value extendedStructSize; + if (isCodegenPackedData(codegenKind)) { + Value stVal = builder.create(loc, structTy); + + // Process all the arguments for the original call, ignoring any hidden + // arguments (such as the `this` pointer). + auto zero = builder.create(loc, 0, 64); + Value extraBytes = zero; + bool hasTrailingData = false; + SmallVector blockArgs{dropAnyHiddenArguments( + hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; + std::int32_t idx = 0; + SmallVector blockValues(blockArgs.size()); + std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); + for (auto iter = blockArgs.begin(), end = blockArgs.end(); iter != end; + ++iter, ++idx) { + Value arg = *iter; + Type inTy = arg.getType(); + Type quakeTy = devFuncTy.getInput(idx); + // If the argument is a callable, skip it. + if (isa(quakeTy)) continue; + // If the argument is an empty struct, skip it. + if (auto strTy = dyn_cast(quakeTy)) + if (strTy.isEmpty()) + continue; - if (auto stdvecTy = dyn_cast(quakeTy)) { - // Per the CUDA-Q spec, an entry point kernel must take a `[const] - // std::vector` value argument. - // Should the spec stipulate that pure device kernels must pass by - // read-only reference, i.e., take `const std::vector &` arguments? - auto ptrInTy = cast(inTy); - // If this is a std::vector, unpack it. - if (stdvecTy.getElementType() == builder.getI1Type()) { - // Create a mock vector of i8 and populate the bools, 1 per char. - Value temp = builder.create( - loc, ptrInTy.getElementType()); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ArrayRef{temp, arg}); - arg = blockValues[idx] = temp; + if (auto stdvecTy = dyn_cast(quakeTy)) { + // Per the CUDA-Q spec, an entry point kernel must take a `[const] + // std::vector` value argument. 
+ // Should the spec stipulate that pure device kernels must pass by + // read-only reference, i.e., take `const std::vector &` arguments? + auto ptrInTy = cast(inTy); + // If this is a std::vector, unpack it. + if (stdvecTy.getElementType() == builder.getI1Type()) { + // Create a mock vector of i8 and populate the bools, 1 per char. + Value tmp = builder.create( + loc, ptrInTy.getElementType()); + builder.create(loc, std::nullopt, + cudaq::stdvecBoolUnpackToInitList, + ArrayRef{tmp, arg}); + arg = blockValues[idx] = tmp; + } + // FIXME: call the `size` member function. For expediency, assume this + // is an std::vector and the size is the scaled delta between the + // first two pointers. Use the unscaled size for now. + auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( + loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); + stVal = p1; + extraBytes = p2; + hasTrailingData = true; + continue; } - // FIXME: call the `size` member function. For expediency, assume this - // is an std::vector and the size is the scaled delta between the - // first two pointers. Use the unscaled size for now. - auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( - loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); - stVal = p1; - extraBytes = p2; - hasTrailingData = true; - continue; - } - if (auto strTy = dyn_cast(quakeTy)) { - if (!isa(arg.getType())) { - // If argument is not a pointer, then struct was promoted into a - // register. 
- auto *parent = builder.getBlock()->getParentOp(); - auto module = parent->getParentOfType(); - auto tmp = builder.create(loc, quakeTy); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arg.getType()), tmp); - if (cudaq::opt::factory::isX86_64(module)) { - builder.create(loc, arg, cast); - if (cudaq::opt::factory::structUsesTwoArguments(quakeTy)) { - auto arrTy = cudaq::cc::ArrayType::get(builder.getI8Type()); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arrTy), tmp); - auto hiPtr = builder.create( - loc, cudaq::cc::PointerType::get(builder.getI8Type()), cast, - cudaq::cc::ComputePtrArg{8}); - ++iter; - Value nextArg = *iter; - auto cast2 = builder.create( - loc, cudaq::cc::PointerType::get(nextArg.getType()), hiPtr); - builder.create(loc, nextArg, cast2); + if (auto strTy = dyn_cast(quakeTy)) { + if (!isa(arg.getType())) { + // If argument is not a pointer, then struct was promoted into a + // register. + auto *parent = builder.getBlock()->getParentOp(); + auto module = parent->getParentOfType(); + auto tmp = builder.create(loc, quakeTy); + auto cast = builder.create( + loc, cudaq::cc::PointerType::get(arg.getType()), tmp); + if (cudaq::opt::factory::isX86_64(module)) { + builder.create(loc, arg, cast); + if (cudaq::opt::factory::structUsesTwoArguments(quakeTy)) { + auto arrTy = cudaq::cc::ArrayType::get(builder.getI8Type()); + auto cast = builder.create( + loc, cudaq::cc::PointerType::get(arrTy), tmp); + auto hiPtr = builder.create( + loc, cudaq::cc::PointerType::get(builder.getI8Type()), cast, + cudaq::cc::ComputePtrArg{8}); + ++iter; + Value nextArg = *iter; + auto cast2 = builder.create( + loc, cudaq::cc::PointerType::get(nextArg.getType()), hiPtr); + builder.create(loc, nextArg, cast2); + } + } else { + builder.create(loc, arg, cast); } - } else { - builder.create(loc, arg, cast); + // Load the assembled (sub-)struct and insert into the buffer value. 
+ Value v = builder.create(loc, tmp); + stVal = builder.create( + loc, stVal.getType(), stVal, v, idx); + continue; } - // Load the assembled (sub-)struct and insert into the buffer value. - Value v = builder.create(loc, tmp); - stVal = builder.create(loc, stVal.getType(), - stVal, v, idx); + if (!cudaq::cc::isDynamicType(strTy)) { + // struct is static size, so just load the value (byval ptr). + Value v = builder.create(loc, arg); + stVal = builder.create( + loc, stVal.getType(), stVal, v, idx); + continue; + } + auto genTy = cast( + cudaq::opt::factory::genArgumentBufferType(strTy)); + Value zero = builder.create(loc, 0, 64); + auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( + loc, builder, strTy, arg, zero, genTy); + stVal = builder.create( + loc, stVal.getType(), stVal, quakeVal, idx); + extraBytes = + builder.create(loc, extraBytes, recursiveSize); + hasTrailingData = true; continue; } - if (!cudaq::cc::isDynamicType(strTy)) { - // struct is static size, so just load the value (byval ptr). - Value v = builder.create(loc, arg); - stVal = builder.create(loc, stVal.getType(), - stVal, v, idx); + if (auto ptrTy = dyn_cast(inTy)) { + if (isa(ptrTy.getElementType())) { + // Special case: if the argument is a `cudaq::state*`, then just + // pass the pointer. We can do that in this case because the + // synthesis step (which will receive the argument data) is assumed + // to run in the same memory space. 
+ Value argPtr = builder.create(loc, inTy, arg); + stVal = builder.create( + loc, stVal.getType(), stVal, argPtr, idx); + } continue; } - auto genTy = cast( - cudaq::opt::factory::genArgumentBufferType(strTy)); - Value zero = builder.create(loc, 0, 64); - auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( - loc, builder, strTy, arg, zero, genTy); + stVal = builder.create(loc, stVal.getType(), - stVal, quakeVal, idx); - extraBytes = - builder.create(loc, extraBytes, recursiveSize); - hasTrailingData = true; - continue; + stVal, arg, idx); } - if (auto ptrTy = dyn_cast(inTy)) { - if (isa(ptrTy.getElementType())) { - // Special case: if the argument is a `cudaq::state*`, then just pass - // the pointer. We can do that in this case because the synthesis step - // (which will receive the argument data) is assumed to run in the - // same memory space. - Value argPtr = builder.create(loc, inTy, arg); - stVal = builder.create(loc, stVal.getType(), - stVal, argPtr, idx); + + // Compute the struct size without the trailing bytes, structSize, and + // with the trailing bytes, extendedStructSize. + Value structSize = + builder.create(loc, i64Ty, structTy); + extendedStructSize = + builder.create(loc, structSize, extraBytes); + + // Allocate our struct to save the argument to. + auto buff = + builder.create(loc, i8Ty, extendedStructSize); + + temp = builder.create(loc, structPtrTy, buff); + + // Store the arguments to the argument section. + builder.create(loc, stVal, temp); + + auto structPtrArrTy = + cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(structTy)); + temp = builder.create(loc, structPtrArrTy, buff); + + // Append the vector data to the end of the struct. + if (hasTrailingData) { + Value vecToBuffer = builder.create( + loc, ptrI8Ty, buff, SmallVector{structSize}); + // Ignore any hidden `this` argument. 
+ for (auto inp : llvm::enumerate(blockValues)) { + Value arg = inp.value(); + Type inTy = arg.getType(); + std::int32_t idx = inp.index(); + Type quakeTy = devFuncTy.getInput(idx); + if (auto stdvecTy = dyn_cast(quakeTy)) { + auto bytes = builder.create( + loc, builder.getI64Type(), stVal, idx); + assert(stdvecTy == devFuncTy.getInput(idx)); + auto ptrInTy = cast(inTy); + vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, + vecToBuffer, ptrInTy); + if (stdvecTy.getElementType() == builder.getI1Type()) { + auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type()); + auto heapPtr = builder.create( + loc, cudaq::cc::PointerType::get(ptrI1Ty), arg, + ArrayRef{0}); + auto loadHeapPtr = + builder.create(loc, heapPtr); + Value heapCast = builder.create( + loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr); + builder.create(loc, std::nullopt, "free", + ArrayRef{heapCast}); + } + } else if (auto strTy = dyn_cast(quakeTy)) { + if (cudaq::cc::isDynamicType(strTy)) + vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg, + temp, vecToBuffer); + } } - continue; } - - stVal = builder.create(loc, stVal.getType(), - stVal, arg, idx); + Value loadThunk = + builder.create(loc, thunkTy, thunkFunc.getName()); + castLoadThunk = + builder.create(loc, ptrI8Ty, loadThunk); + castTemp = builder.create(loc, ptrI8Ty, temp); + resultOffset = genComputeReturnOffset(loc, builder, devFuncTy, structTy); } - // Compute the struct size without the trailing bytes, structSize, and with - // the trailing bytes, extendedStructSize. - auto nullSt = builder.create(loc, structPtrTy, zero); - Value structSize = - builder.create(loc, i64Ty, structTy); - Value extendedStructSize = - builder.create(loc, structSize, extraBytes); - - // Allocate our struct to save the argument to. 
- auto i8Ty = builder.getI8Type(); - auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - auto buff = - builder.create(loc, i8Ty, extendedStructSize); - - Value temp = builder.create(loc, structPtrTy, buff); - - // Store the arguments to the argument section. - builder.create(loc, stVal, temp); - - auto structPtrArrTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(structTy)); - temp = builder.create(loc, structPtrArrTy, buff); - - // Append the vector data to the end of the struct. - if (hasTrailingData) { - Value vecToBuffer = builder.create( - loc, ptrI8Ty, buff, SmallVector{structSize}); - // Ignore any hidden `this` argument. - for (auto inp : llvm::enumerate(blockValues)) { - Value arg = inp.value(); - Type inTy = arg.getType(); - std::int32_t idx = inp.index(); - Type quakeTy = funcTy.getInput(idx); - if (auto stdvecTy = dyn_cast(quakeTy)) { - auto bytes = builder.create( - loc, builder.getI64Type(), stVal, idx); - assert(stdvecTy == funcTy.getInput(idx)); - auto ptrInTy = cast(inTy); - vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, - vecToBuffer, ptrInTy); - if (stdvecTy.getElementType() == builder.getI1Type()) { - auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type()); - auto heapPtr = builder.create( - loc, cudaq::cc::PointerType::get(ptrI1Ty), arg, - ArrayRef{0}); - auto loadHeapPtr = builder.create(loc, heapPtr); - Value heapCast = builder.create( - loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{heapCast}); - } - } else if (auto strTy = dyn_cast(quakeTy)) { - if (cudaq::cc::isDynamicType(strTy)) - vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg, - temp, vecToBuffer); + Value vecArgPtrs; + if (isCodegenArgumentGather(codegenKind)) { + // 1) Allocate and initialize a std::vector object. 
+ const unsigned count = + cudaq::cc::numberOfHiddenArgs(addThisPtr, hiddenSRet); + auto stdVec = builder.create( + loc, cudaq::opt::factory::stlVectorType(ptrI8Ty)); + auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, ptrI8Ty, count); + Value buffer = builder.create(loc, arrPtrTy); + auto i64Ty = builder.getI64Type(); + auto buffSize = builder.create(loc, i64Ty, arrPtrTy); + auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); + auto cast1 = builder.create(loc, ptrPtrTy, buffer); + auto ptr3Ty = cudaq::cc::PointerType::get(ptrPtrTy); + auto stdVec0 = builder.create(loc, ptr3Ty, stdVec); + builder.create(loc, cast1, stdVec0); + auto cast2 = builder.create(loc, i64Ty, buffer); + auto endBuff = builder.create(loc, cast2, buffSize); + auto cast3 = builder.create(loc, ptrPtrTy, endBuff); + auto stdVec1 = builder.create( + loc, ptr3Ty, stdVec, ArrayRef{1}); + builder.create(loc, cast3, stdVec1); + auto stdVec2 = builder.create( + loc, ptr3Ty, stdVec, ArrayRef{2}); + builder.create(loc, cast3, stdVec2); + + // 2) Iterate over the arguments passed in and populate the vector. + SmallVector blockArgs{dropAnyHiddenArguments( + hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; + for (auto iter : llvm::enumerate(blockArgs)) { + std::int32_t i = iter.index(); + auto pos = builder.create( + loc, ptrPtrTy, buffer, ArrayRef{i}); + auto blkArg = iter.value(); + if (isa(blkArg.getType())) { + auto castArg = + builder.create(loc, ptrI8Ty, blkArg); + builder.create(loc, castArg, pos); + continue; } + auto temp = builder.create(loc, blkArg.getType()); + builder.create(loc, blkArg, temp); + auto castTemp = builder.create(loc, ptrI8Ty, temp); + builder.create(loc, castTemp, pos); } + vecArgPtrs = builder.create(loc, ptrI8Ty, stdVec); } // Prepare to call the `launchKernel` runtime library entry point. 
Value loadKernName = builder.create( loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), kernelNameObj.getSymName()); - Value loadThunk = - builder.create(loc, thunkTy, thunk.getName()); auto castLoadKernName = builder.create(loc, ptrI8Ty, loadKernName); - auto castLoadThunk = - builder.create(loc, ptrI8Ty, loadThunk); - auto castTemp = builder.create(loc, ptrI8Ty, temp); - - auto resultOffset = - genComputeReturnOffset(loc, builder, funcTy, structTy, nullSt); // Generate the call to `launchKernel`. - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelFuncName, - ArrayRef{castLoadKernName, castLoadThunk, castTemp, - extendedStructSize, resultOffset}); - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy); + switch (codegenKind) { + case 0: { + assert(vecArgPtrs && "vector must be initialized"); + builder.create( + loc, std::nullopt, cudaq::runtime::launchKernelHybridFuncName, + ArrayRef{castLoadKernName, castLoadThunk, castTemp, + extendedStructSize, resultOffset, vecArgPtrs}); + } break; + case 1: { + builder.create( + loc, std::nullopt, cudaq::runtime::launchKernelFuncName, + ArrayRef{castLoadKernName, castLoadThunk, castTemp, + extendedStructSize, resultOffset}); + } break; + case 2: { + assert(vecArgPtrs && "vector must be initialized"); + builder.create( + loc, std::nullopt, cudaq::runtime::launchKernelStreamlinedFuncName, + ArrayRef{castLoadKernName, vecArgPtrs}); + // For this codegen kind, we drop any results on the floor and return + // random data in registers and/or off the stack. This maintains parity + // with any pre-existing kernel launchers. 
+ SmallVector garbage; + for (auto ty : hostFunc.getFunctionType().getResults()) + garbage.push_back(builder.create(loc, ty)); + builder.create(loc, garbage); + return; + } + default: + hostFunc.emitOpError("codegen kind is invalid"); + return; + } // If and only if this kernel returns a value, unpack and load the // result value(s) from the struct returned by `launchKernel` and return // them to our caller. SmallVector results; - const bool multiResult = funcTy.getResults().size() > 1; - for (auto res : llvm::enumerate(funcTy.getResults())) { + const bool multiResult = devFuncTy.getResults().size() > 1; + for (auto res : llvm::enumerate(devFuncTy.getResults())) { int off = res.index() + offset; if (auto vecTy = dyn_cast(res.value())) { auto eleTy = vecTy.getElementType(); @@ -1352,7 +1436,7 @@ class GenerateKernelExecution auto vecLen = builder.create(loc, gep1); if (vecTy.getElementType() == builder.getI1Type()) { genStdvecBoolFromInitList(loc, builder, - rewriteEntryBlock->getArguments().front(), + hostFuncEntryBlock->getArguments().front(), dataPtr, vecLen); } else { cudaq::IRBuilder irBuilder(builder); @@ -1362,7 +1446,7 @@ class GenerateKernelExecution return; } genStdvecTFromInitList(loc, builder, - rewriteEntryBlock->getArguments().front(), + hostFuncEntryBlock->getArguments().front(), dataPtr, tSize, vecLen); } offset++; @@ -1378,11 +1462,11 @@ class GenerateKernelExecution if (multiResult) return builder.create( loc, cudaq::cc::PointerType::get(res.value()), - rewriteEntryBlock->getArguments().front(), + hostFuncEntryBlock->getArguments().front(), SmallVector{off}); return builder.create( loc, cudaq::cc::PointerType::get(res.value()), - rewriteEntryBlock->getArguments().front()); + hostFuncEntryBlock->getArguments().front()); }(); builder.create(loc, loadVal, sretPtr); } else { @@ -1393,91 +1477,6 @@ class GenerateKernelExecution builder.create(loc, results); } - void genNewHostEntryPoint2(Location loc, OpBuilder &builder, - FunctionType devFuncTy, - 
LLVM::GlobalOp kernelNameObj, - func::FuncOp hostFunc, bool addThisPtr) { - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(devFuncTy); - const unsigned count = - cudaq::cc::numberOfHiddenArgs(addThisPtr, hiddenSRet); - auto *ctx = builder.getContext(); - auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type()); - - // 0) Pointer our builder into the entry block of the function. - Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); - - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(hostFuncEntryBlock); - - // 1) Allocate and initialize a std::vector object. - auto stdVec = builder.create( - loc, cudaq::opt::factory::stlVectorType(i8PtrTy)); - auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, i8PtrTy, count); - Value buffer = builder.create(loc, arrPtrTy); - auto i64Ty = builder.getI64Type(); - auto buffSize = builder.create(loc, i64Ty, arrPtrTy); - auto ptrPtrTy = cudaq::cc::PointerType::get(i8PtrTy); - auto cast1 = builder.create(loc, ptrPtrTy, buffer); - auto ptr3Ty = cudaq::cc::PointerType::get(ptrPtrTy); - auto stdVec0 = builder.create(loc, ptr3Ty, stdVec); - builder.create(loc, cast1, stdVec0); - auto cast2 = builder.create(loc, i64Ty, buffer); - auto endBuff = builder.create(loc, cast2, buffSize); - auto cast3 = builder.create(loc, ptrPtrTy, endBuff); - auto stdVec1 = builder.create( - loc, ptr3Ty, stdVec, ArrayRef{1}); - builder.create(loc, cast3, stdVec1); - auto stdVec2 = builder.create( - loc, ptr3Ty, stdVec, ArrayRef{2}); - builder.create(loc, cast3, stdVec2); - auto zero = builder.create(loc, 0, 64); - auto nullPtr = builder.create(loc, i8PtrTy, zero); - - // 2) Iterate over the arguments passed in and populate the vector. 
- SmallVector blockArgs{dropAnyHiddenArguments( - hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; - for (auto iter : llvm::enumerate(blockArgs)) { - std::int32_t i = iter.index(); - auto pos = builder.create( - loc, ptrPtrTy, buffer, ArrayRef{i}); - auto blkArg = iter.value(); - if (isa(blkArg.getType())) { - auto castArg = builder.create(loc, i8PtrTy, blkArg); - builder.create(loc, castArg, pos); - continue; - } - auto temp = builder.create(loc, blkArg.getType()); - builder.create(loc, blkArg, temp); - auto castTemp = builder.create(loc, i8PtrTy, temp); - builder.create(loc, castTemp, pos); - } - - auto resultBuffer = builder.create(loc, i8PtrTy); - builder.create(loc, nullPtr, resultBuffer); - auto castResultBuffer = - builder.create(loc, i8PtrTy, resultBuffer); - auto castStdvec = builder.create(loc, i8PtrTy, stdVec); - Value loadKernName = builder.create( - loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), - kernelNameObj.getSymName()); - auto castKernelNameObj = - builder.create(loc, i8PtrTy, loadKernName); - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelVersion2FuncName, - ArrayRef{castKernelNameObj, castStdvec, castResultBuffer}); - - // FIXME: Drop any results on the floor for now and return random data left - // on the stack. (Maintains parity with existing kernel launch.) - if (hostFunc.getFunctionType().getResults().empty()) { - builder.create(loc); - return; - } - // There can only be 1 return type in C++, so this is safe. - Value garbage = builder.create( - loc, hostFunc.getFunctionType().getResult(0)); - builder.create(loc, garbage); - } - /// A kernel function that takes a quantum type argument (also known as a pure /// device kernel) cannot be called directly from C++ (classical) code. It /// must be called via other quantum code. 
@@ -1491,6 +1490,88 @@ class GenerateKernelExecution return true; } + LLVM::LLVMFuncOp registerKernelForExecution(Location loc, OpBuilder &builder, + const std::string &classNameStr, + LLVM::GlobalOp kernelNameObj, + func::FuncOp argsCreatorFunc, + StringRef mangledName) { + auto module = getOperation(); + auto *ctx = builder.getContext(); + auto ptrType = cudaq::cc::PointerType::get(builder.getI8Type()); + auto initFun = builder.create( + loc, classNameStr + ".kernelRegFunc", + LLVM::LLVMFunctionType::get(cudaq::opt::factory::getVoidType(ctx), {})); + OpBuilder::InsertionGuard guard(builder); + auto *initFunEntry = initFun.addEntryBlock(); + builder.setInsertionPointToStart(initFunEntry); + auto kernRef = builder.create( + loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), + kernelNameObj.getSymName()); + auto castKernRef = builder.create(loc, ptrType, kernRef); + builder.create(loc, std::nullopt, CudaqRegisterKernelName, + ValueRange{castKernRef}); + + if (isCodegenPackedData(codegenKind)) { + // Register the argsCreator too + auto ptrPtrType = cudaq::cc::PointerType::get(ptrType); + auto argsCreatorFuncType = FunctionType::get( + ctx, {ptrPtrType, ptrPtrType}, {builder.getI64Type()}); + Value loadArgsCreator = builder.create( + loc, argsCreatorFuncType, argsCreatorFunc.getName()); + auto castLoadArgsCreator = + builder.create(loc, ptrType, loadArgsCreator); + builder.create( + loc, std::nullopt, CudaqRegisterArgsCreator, + ValueRange{castKernRef, castLoadArgsCreator}); + } + + // Check if this is a lambda mangled name + auto demangledPtr = abi::__cxa_demangle(mangledName.str().c_str(), nullptr, + nullptr, nullptr); + if (demangledPtr) { + std::string demangledName(demangledPtr); + demangledName = + std::regex_replace(demangledName, std::regex("::operator()(.*)"), ""); + if (demangledName.find("$_") != std::string::npos) { + auto insertPoint = builder.saveInsertionPoint(); + builder.setInsertionPointToStart(module.getBody()); + + // Create the 
function if it doesn't already exist. + if (!module.lookupSymbol(CudaqRegisterLambdaName)) + builder.create( + module.getLoc(), CudaqRegisterLambdaName, + LLVM::LLVMFunctionType::get( + cudaq::opt::factory::getVoidType(ctx), + {cudaq::opt::factory::getPointerType(ctx), + cudaq::opt::factory::getPointerType(ctx)})); + + // Create this global name, it is unique for any lambda + // bc classNameStr contains the parentFunc + varName + auto lambdaName = builder.create( + loc, + cudaq::opt::factory::getStringType(ctx, demangledName.size() + 1), + /*isConstant=*/true, LLVM::Linkage::External, + classNameStr + ".lambdaName", + builder.getStringAttr(demangledName + '\0'), /*alignment=*/0); + + builder.restoreInsertionPoint(insertPoint); + auto lambdaRef = builder.create( + loc, cudaq::opt::factory::getPointerType(lambdaName.getType()), + lambdaName.getSymName()); + + auto castLambdaRef = builder.create( + loc, cudaq::opt::factory::getPointerType(ctx), lambdaRef); + auto castKernelRef = builder.create( + loc, cudaq::opt::factory::getPointerType(ctx), castKernRef); + builder.create(loc, std::nullopt, CudaqRegisterLambdaName, + ValueRange{castLambdaRef, castKernelRef}); + } + } + + builder.create(loc, ValueRange{}); + return initFun; + } + void runOnOperation() override { auto module = getOperation(); DataLayoutAnalysis dla(module); // caches module's data layout information. 
@@ -1508,26 +1589,40 @@ class GenerateKernelExecution if (!mangledNameMap || mangledNameMap.empty()) return; auto irBuilder = cudaq::IRBuilder::atBlockEnd(module.getBody()); - if (altLaunchVersion == 1) + switch (codegenKind) { + case 0: + if (failed(irBuilder.loadIntrinsic( + module, cudaq::runtime::launchKernelHybridFuncName))) { + module.emitError("could not load altLaunchKernel intrinsic."); + return; + } + break; + case 1: if (failed(irBuilder.loadIntrinsic( module, cudaq::runtime::launchKernelFuncName))) { module.emitError("could not load altLaunchKernel intrinsic."); return; } - if (altLaunchVersion == 2) + break; + case 2: if (failed(irBuilder.loadIntrinsic( - module, cudaq::runtime::launchKernelVersion2FuncName))) { + module, cudaq::runtime::launchKernelStreamlinedFuncName))) { module.emitError("could not load altLaunchKernel intrinsic."); return; } + break; + default: + module.emitError("invalid codegen kind value."); + return; + } auto loc = module.getLoc(); auto ptrType = cudaq::cc::PointerType::get(builder.getI8Type()); auto regKern = builder.create( - loc, cudaqRegisterKernelName, FunctionType::get(ctx, {ptrType}, {})); + loc, CudaqRegisterKernelName, FunctionType::get(ctx, {ptrType}, {})); regKern.setPrivate(); auto regArgs = builder.create( - loc, cudaqRegisterArgsCreator, + loc, CudaqRegisterArgsCreator, FunctionType::get(ctx, {ptrType, ptrType}, {})); regArgs.setPrivate(); @@ -1622,7 +1717,7 @@ class GenerateKernelExecution func::FuncOp thunk; func::FuncOp argsCreatorFunc; - if (altLaunchVersion == 1) { + if (isCodegenPackedData(codegenKind)) { // Generate the function that computes the return offset. genReturnOffsetFunction(loc, builder, funcTy, structTy, classNameStr); @@ -1652,94 +1747,15 @@ class GenerateKernelExecution // Generate a new mangled function on the host side to call the // callback function. 
- if (hostEntryNeeded) { - if (altLaunchVersion == 1) - genNewHostEntryPoint1(loc, builder, funcTy, structTy, kernelNameObj, - thunk, hostFunc, hasThisPtr); - else - genNewHostEntryPoint2(loc, builder, funcTy, kernelNameObj, hostFunc, - hasThisPtr); - } + if (hostEntryNeeded) + genNewHostEntryPoint(loc, builder, funcTy, kernelNameObj, hostFunc, + hasThisPtr, structTy, thunk); // Generate a function at startup to register this kernel as having // been processed for kernel execution. - auto initFun = builder.create( - loc, classNameStr + ".kernelRegFunc", - LLVM::LLVMFunctionType::get(cudaq::opt::factory::getVoidType(ctx), - {})); - { - OpBuilder::InsertionGuard guard(builder); - auto *initFunEntry = initFun.addEntryBlock(); - builder.setInsertionPointToStart(initFunEntry); - auto kernRef = builder.create( - loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), - kernelNameObj.getSymName()); - auto castKernRef = - builder.create(loc, ptrType, kernRef); - builder.create(loc, std::nullopt, cudaqRegisterKernelName, - ValueRange{castKernRef}); - - if (altLaunchVersion == 1) { - // Register the argsCreator too - auto ptrPtrType = cudaq::cc::PointerType::get(ptrType); - auto argsCreatorFuncType = FunctionType::get( - ctx, {ptrPtrType, ptrPtrType}, {builder.getI64Type()}); - Value loadArgsCreator = builder.create( - loc, argsCreatorFuncType, argsCreatorFunc.getName()); - auto castLoadArgsCreator = builder.create( - loc, ptrType, loadArgsCreator); - builder.create( - loc, std::nullopt, cudaqRegisterArgsCreator, - ValueRange{castKernRef, castLoadArgsCreator}); - } - - // Check if this is a lambda mangled name - auto demangledPtr = abi::__cxa_demangle(mangledName.str().c_str(), - nullptr, nullptr, nullptr); - if (demangledPtr) { - std::string demangledName(demangledPtr); - demangledName = std::regex_replace( - demangledName, std::regex("::operator()(.*)"), ""); - if (demangledName.find("$_") != std::string::npos) { - auto insertPoint = 
builder.saveInsertionPoint(); - builder.setInsertionPointToStart(module.getBody()); - - // Create the function if it doesn't already exist. - if (!module.lookupSymbol(cudaqRegisterLambdaName)) - builder.create( - module.getLoc(), cudaqRegisterLambdaName, - LLVM::LLVMFunctionType::get( - cudaq::opt::factory::getVoidType(ctx), - {cudaq::opt::factory::getPointerType(ctx), - cudaq::opt::factory::getPointerType(ctx)})); - - // Create this global name, it is unique for any lambda - // bc classNameStr contains the parentFunc + varName - auto lambdaName = builder.create( - loc, - cudaq::opt::factory::getStringType(ctx, - demangledName.size() + 1), - /*isConstant=*/true, LLVM::Linkage::External, - classNameStr + ".lambdaName", - builder.getStringAttr(demangledName + '\0'), /*alignment=*/0); - - builder.restoreInsertionPoint(insertPoint); - auto lambdaRef = builder.create( - loc, cudaq::opt::factory::getPointerType(lambdaName.getType()), - lambdaName.getSymName()); - - auto castLambdaRef = builder.create( - loc, cudaq::opt::factory::getPointerType(ctx), lambdaRef); - auto castKernelRef = builder.create( - loc, cudaq::opt::factory::getPointerType(ctx), castKernRef); - builder.create( - loc, std::nullopt, cudaqRegisterLambdaName, - ValueRange{castLambdaRef, castKernelRef}); - } - } - - builder.create(loc, ValueRange{}); - } + auto initFun = + registerKernelForExecution(loc, builder, classNameStr, kernelNameObj, + argsCreatorFunc, mangledName); // Create a global with a default ctor to be run at program startup. 
// The ctor will execute the above function, which will register this diff --git a/python/tests/builder/test_qalloc_init.py b/python/tests/builder/test_qalloc_init.py index e0cc626f02..623c383f3c 100644 --- a/python/tests/builder/test_qalloc_init.py +++ b/python/tests/builder/test_qalloc_init.py @@ -171,6 +171,25 @@ def test_kernel_complex_params_rotate_f64(): assert '10' in counts +@skipIfNvidiaFP64NotInstalled +def test_kernel_complex_force_kron(): + cudaq.reset_target() + cudaq.set_target('nvidia-fp64') + + c = [0. + 0j] * 1024 + c[1023] = 1j + + kernel, vec = cudaq.make_kernel(list[complex]) + p = kernel.qalloc(1) + q = kernel.qalloc(vec) + kernel.mz(p) + kernel.mz(q) + + counts = cudaq.sample(kernel, c) + assert len(counts) == 1 + assert '01111111111' in counts + + @skipIfNvidiaNotInstalled def test_kernel_complex_params_rotate_f32(): cudaq.reset_target() diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp index f39cf942a5..e76a24b5a0 100644 --- a/runtime/common/ArgumentConversion.cpp +++ b/runtime/common/ArgumentConversion.cpp @@ -340,8 +340,10 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector &arguments) { FunctionType fromFuncTy = fun.getFunctionType(); for (auto iter : llvm::enumerate(llvm::zip(fromFuncTy.getInputs(), arguments))) { - Type argTy = std::get<0>(iter.value()); void *argPtr = std::get<1>(iter.value()); + if (!argPtr) + continue; + Type argTy = std::get<0>(iter.value()); unsigned i = iter.index(); auto buildSubst = [&, i = i](Ts &&...ts) { builder.setInsertionPointToEnd(substModule.getBody()); @@ -422,3 +424,33 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector &arguments) { substitutions.emplace_back(std::move(subst)); } } + +void cudaq::opt::ArgumentConverter::gen( + const std::vector &arguments, + const std::unordered_set &exclusions) { + std::vector partialArgs; + for (auto iter : llvm::enumerate(arguments)) { + if (exclusions.contains(iter.index())) { + 
partialArgs.push_back(nullptr); + continue; + } + partialArgs.push_back(iter.value()); + } + gen(partialArgs); +} + +void cudaq::opt::ArgumentConverter::gen_drop_front( + const std::vector &arguments, unsigned numDrop) { + // If we're dropping all the arguments, we're done. + if (numDrop >= arguments.size()) + return; + std::vector partialArgs; + for (void *arg : arguments) { + if (numDrop--) { + partialArgs.push_back(nullptr); + continue; + } + partialArgs.push_back(arg); + } + gen(partialArgs); +} diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h index 3251e0d304..1e1efb9347 100644 --- a/runtime/common/ArgumentConversion.h +++ b/runtime/common/ArgumentConversion.h @@ -13,6 +13,7 @@ #include "cudaq/qis/state.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Types.h" +#include namespace cudaq::opt { @@ -53,6 +54,15 @@ class ArgumentConverter { /// The arguments are those presented to the kernel, kernelName. void gen(const std::vector &arguments); + /// Generate a substitution ModuleOp but include only the arguments that do + /// not appear in the set of \p exclusions. + void gen(const std::vector &arguments, + const std::unordered_set &exclusions); + + /// Generate a substitution ModuleOp but drop the first \p numDrop arguments + /// and thereby exclude them from the substitutions. + void gen_drop_front(const std::vector &arguments, unsigned numDrop); + /// Get the list of substitutions that were generated by `gen()`. 
mlir::SmallVector &getSubstitutions() { return substitutions; diff --git a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu index ed034822d7..4693eefd36 100644 --- a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu +++ b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu @@ -95,7 +95,7 @@ void kronprod(uint32_t n_blocks, int32_t threads_per_block, void *arr0) { cudaKronprod<<>>( tsize1, reinterpret_cast(arr1), - (1UL << tsize2), reinterpret_cast(arr2), + tsize2, reinterpret_cast(arr2), reinterpret_cast(arr0)); } diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index b0287073a7..4463b01c71 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -7,7 +7,8 @@ // ========================================================================== // // RUN: cudaq-opt --kernel-execution %s | FileCheck %s -// RUN: cudaq-opt --kernel-execution=alt-launch=2 %s | FileCheck --check-prefix=ALT2 %s +// RUN: cudaq-opt --kernel-execution=codegen=2 %s | FileCheck --check-prefix=STREAM %s +// RUN: cudaq-opt --kernel-execution=codegen=0 %s | FileCheck --check-prefix=HYBRID %s module attributes {quake.mangled_name_map = { __nvqpp__mlirgen__ghz = "_ZN3ghzclEi"}} { @@ -86,23 +87,21 @@ module attributes {quake.mangled_name_map = { // CHECK-LABEL: func.func @_ZN3ghzclEi( // CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 +// CHECK-DAG: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (i64) -> !cc.ptr> // CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 // CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : i64 
// CHECK: %[[VAL_9:.*]] = cc.alloca i8[%[[VAL_8]] : i64] // CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr> // CHECK: cc.store %[[VAL_4]], %[[VAL_10]] : !cc.ptr> // CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> // CHECK: %[[VAL_13:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_15:.*]] = cc.func_ptr %[[VAL_13]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_17:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_18:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_8]], %[[VAL_18]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_11]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr // CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr @@ -131,8 +130,8 @@ module attributes {quake.mangled_name_map = { // CHECK: } // CHECK-LABEL: func.func @ghz.argsCreator( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { // CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> // CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> // CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 @@ -160,35 +159,72 @@ module attributes {quake.mangled_name_map = { // CHECK: } -// ALT2-LABEL: func.func @_ZN3ghzclEi( -// ALT2-SAME: %[[VAL_0:.*]]: !cc.ptr, 
%[[VAL_1:.*]]: i32) -> f64 { -// ALT2: %[[VAL_2:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// ALT2: %[[VAL_3:.*]] = cc.alloca !cc.array x 1> -// ALT2: %[[VAL_4:.*]] = cc.sizeof !cc.array x 1> : i64 -// ALT2: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> !cc.ptr> -// ALT2: %[[VAL_6:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// ALT2: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr>> -// ALT2: %[[VAL_7:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> i64 -// ALT2: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_4]] : i64 -// ALT2: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (i64) -> !cc.ptr> -// ALT2: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// ALT2: cc.store %[[VAL_9]], %[[VAL_10]] : !cc.ptr>> -// ALT2: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// ALT2: cc.store %[[VAL_9]], %[[VAL_11]] : !cc.ptr>> -// ALT2: %[[VAL_12:.*]] = arith.constant 0 : i64 -// ALT2: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (i64) -> !cc.ptr -// ALT2: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_3]][0] : (!cc.ptr x 1>>) -> !cc.ptr> -// ALT2: %[[VAL_15:.*]] = cc.alloca i32 -// ALT2: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr -// ALT2: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr -// ALT2: cc.store %[[VAL_16]], %[[VAL_14]] : !cc.ptr> -// ALT2: %[[VAL_17:.*]] = cc.alloca !cc.ptr -// ALT2: cc.store %[[VAL_13]], %[[VAL_17]] : !cc.ptr> -// ALT2: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr>) -> !cc.ptr -// ALT2: %[[VAL_19:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// ALT2: %[[VAL_20:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// ALT2: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!llvm.ptr>) -> !cc.ptr -// ALT2: call @altLaunchKernelUsingLocalJIT(%[[VAL_21]], %[[VAL_19]], %[[VAL_18]]) : (!cc.ptr, !cc.ptr, !cc.ptr) -> () -// ALT2: %[[VAL_22:.*]] = cc.undef f64 -// ALT2: return %[[VAL_22]] : f64 -// ALT2: 
} +// STREAM-LABEL: func.func @_ZN3ghzclEi( +// STREAM-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { +// STREAM: %[[VAL_2:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// STREAM: %[[VAL_3:.*]] = cc.alloca !cc.array x 1> +// STREAM: %[[VAL_4:.*]] = cc.sizeof !cc.array x 1> : i64 +// STREAM: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAM: %[[VAL_6:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAM: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr>> +// STREAM: %[[VAL_7:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> i64 +// STREAM: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_4]] : i64 +// STREAM: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (i64) -> !cc.ptr> +// STREAM: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAM: cc.store %[[VAL_9]], %[[VAL_10]] : !cc.ptr>> +// STREAM: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAM: cc.store %[[VAL_9]], %[[VAL_11]] : !cc.ptr>> +// STREAM: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_3]][0] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAM: %[[VAL_15:.*]] = cc.alloca i32 +// STREAM: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr +// STREAM: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr +// STREAM: cc.store %[[VAL_16]], %[[VAL_14]] : !cc.ptr> +// STREAM: %[[VAL_19:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// STREAM: %[[VAL_20:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// STREAM: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!llvm.ptr>) -> !cc.ptr +// STREAM: call @streamlinedLaunchKernel(%[[VAL_21]], %[[VAL_19]]) : (!cc.ptr, !cc.ptr) -> () +// STREAM: %[[VAL_22:.*]] = cc.undef f64 +// STREAM: return %[[VAL_22]] : f64 +// STREAM: } + +// HYBRID-LABEL: func.func @_ZN3ghzclEi( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { +// HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> +// 
HYBRID: %[[VAL_3:.*]] = arith.constant 0 : i64 +// HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> +// HYBRID: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_7:.*]] = arith.addi %[[VAL_6]], %[[VAL_3]] : i64 +// HYBRID: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_7]] : i64] +// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr> +// HYBRID: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr> +// HYBRID: %[[VAL_10:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr x ?>> +// HYBRID: %[[VAL_11:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_12:.*]] = cc.func_ptr %[[VAL_11]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// HYBRID: %[[VAL_13:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr x ?>>) -> !cc.ptr +// HYBRID: %[[VAL_15:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// HYBRID: %[[VAL_16:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// HYBRID: %[[VAL_17:.*]] = cc.alloca !cc.array x 1> +// HYBRID: %[[VAL_18:.*]] = cc.sizeof !cc.array x 1> : i64 +// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_20:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_19]], %[[VAL_20]] : !cc.ptr>> +// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> i64 +// HYBRID: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_18]] : i64 +// HYBRID: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (i64) -> !cc.ptr> +// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_16]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_23]], %[[VAL_24]] : !cc.ptr>> +// HYBRID: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_16]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_23]], %[[VAL_25]] : !cc.ptr>> +// HYBRID: %[[VAL_26:.*]] = cc.compute_ptr %[[VAL_17]][0] : (!cc.ptr x 1>>) -> 
!cc.ptr> +// HYBRID: %[[VAL_27:.*]] = cc.alloca i32 +// HYBRID: cc.store %[[VAL_1]], %[[VAL_27]] : !cc.ptr +// HYBRID: %[[VAL_28:.*]] = cc.cast %[[VAL_27]] : (!cc.ptr) -> !cc.ptr +// HYBRID: cc.store %[[VAL_28]], %[[VAL_26]] : !cc.ptr> +// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// HYBRID: %[[VAL_30:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// HYBRID: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!llvm.ptr>) -> !cc.ptr +// HYBRID: call @hybridLaunchKernel(%[[VAL_31]], %[[VAL_12]], %[[VAL_13]], %[[VAL_7]], %[[VAL_15]], %[[VAL_29]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> () +// HYBRID: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_10]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr +// HYBRID: %[[VAL_33:.*]] = cc.load %[[VAL_32]] : !cc.ptr +// HYBRID: return %[[VAL_33]] : f64 +// HYBRID: } diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke index e5b8a7f24c..fa3a8a5492 100644 --- a/test/Quake/kernel_exec-2.qke +++ b/test/Quake/kernel_exec-2.qke @@ -65,12 +65,12 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { // CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_21]], %[[VAL_26]], %[[VAL_22]], %[[VAL_23]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () // CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_21]] : // CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_90]][%[[VAL_22]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> // CHECK: %[[VAL_29:.*]] = constant @function_hawaiian.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_31:.*]] = cc.func_ptr %[[VAL_29]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr x ?>>) -> !cc.ptr // CHECK: %[[VAL_33:.*]] = arith.constant 2147483647 : i64 +// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> +// CHECK: 
%[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: return // CHECK: } diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke index 79f090bb4c..3394ada1a2 100644 --- a/test/Quake/return_vector.qke +++ b/test/Quake/return_vector.qke @@ -46,20 +46,17 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 4 : i64 // CHECK: %[[VAL_4:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> // CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_5]] : (i64) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 // CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] // CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr, i64}>>) -> i64 +// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_0.kernelName : 
!llvm.ptr> +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr> @@ -113,20 +110,17 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 8 : i64 // CHECK: %[[VAL_4:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> // CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_5]] : (i64) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 // CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] // CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr, i64}>>) -> i64 +// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> +// 
CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr> diff --git a/unittests/integration/get_state_tester.cpp b/unittests/integration/get_state_tester.cpp index 00f3ca1dec..452c55c8bb 100644 --- a/unittests/integration/get_state_tester.cpp +++ b/unittests/integration/get_state_tester.cpp @@ -165,3 +165,22 @@ CUDAQ_TEST(GetStateTester, checkOverlapFromHostVector) { EXPECT_NEAR(1.0, state.overlap(hostState).real(), 1e-3); } #endif + +CUDAQ_TEST(GetStateTester, checkKron) { + auto force_kron = [](std::vector> vec) __qpu__ { + cudaq::qubit a; + cudaq::qvector qvec(vec); + }; + // Construct a 6-qubit |111111> state + const int num_qubits_input_state = 6; + std::vector> hostStateData( + 1 << num_qubits_input_state); + hostStateData[hostStateData.size() - 1] = 1.0; + + auto counts = cudaq::sample(force_kron, hostStateData); + + // Expect a single state with a deterministic outcome + EXPECT_EQ(counts.size(), 1); + EXPECT_EQ(counts.begin()->first, + "0" + std::string(num_qubits_input_state, '1')); +}