diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 56f65f4f79..ad31dff838 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -57,9 +57,9 @@ inline std::unique_ptr<mlir::Pass> createPySynthCallableBlockArgs() {
 /// Helper function to build an argument synthesis pass. The names of the
 /// functions and the substitutions text can be built as an unzipped pair of
 /// lists.
-std::unique_ptr<mlir::Pass> createArgumentSynthesisPass(
-    const mlir::ArrayRef<mlir::StringRef> &funcNames,
-    const mlir::ArrayRef<mlir::StringRef> &substitutions);
+std::unique_ptr<mlir::Pass>
+createArgumentSynthesisPass(mlir::ArrayRef<mlir::StringRef> funcNames,
+                            mlir::ArrayRef<mlir::StringRef> substitutions);
 
 // declarative passes
 #define GEN_PASS_DECL
diff --git a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
index 7281c0e21f..138e842373 100644
--- a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
+++ b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
@@ -73,8 +73,9 @@ class ArgumentSynthesisPass
     assert(*substMod && "module must have been created");
 
     // 2. Go through the Module and process each substitution.
-    std::vector<bool> processedArgs(func.getFunctionType().getNumInputs());
-    std::vector<std::tuple<unsigned, Value, Value>> replacements;
+    SmallVector<bool> processedArgs(func.getFunctionType().getNumInputs());
+    SmallVector<std::tuple<unsigned, Value, Value>> replacements;
+    BitVector replacedArgs(processedArgs.size());
     for (auto &op : *substMod) {
       auto subst = dyn_cast<cudaq::cc::ArgumentSubstitutionOp>(op);
       if (!subst) {
@@ -103,6 +104,17 @@ class ArgumentSynthesisPass
       // OK, substitute the code for the argument.
       Block &entry = func.getRegion().front();
       processedArgs[pos] = true;
+      if (subst.getBody().front().empty()) {
+        // No code is present. Erase the argument if it is not used.
+        const auto numUses =
+            std::distance(entry.getArgument(pos).getUses().begin(),
+                          entry.getArgument(pos).getUses().end());
+        LLVM_DEBUG(llvm::dbgs() << "maybe erasing an unused argument ("
+                                << std::to_string(numUses) << ")\n");
+        if (numUses == 0)
+          replacedArgs.set(pos);
+        continue;
+      }
       OpBuilder builder{ctx};
       Block *splitBlock = entry.splitBlock(entry.begin());
       builder.setInsertionPointToEnd(&entry);
@@ -126,7 +138,6 @@ class ArgumentSynthesisPass
     // function is still dead and can be removed by a DCE.
 
     // 3. Replace the block argument values with the freshly inserted new code.
-    BitVector replacedArgs(processedArgs.size());
     for (auto [pos, fromVal, toVal] : replacements) {
       replacedArgs.set(pos);
       fromVal.replaceAllUsesWith(toVal);
@@ -142,9 +153,9 @@ class ArgumentSynthesisPass
 // Helper function that takes an unzipped pair of lists of function names and
 // substitution code strings. This is meant to make adding this pass to a
 // pipeline easier from within a tool (such as the JIT compiler).
-std::unique_ptr<mlir::Pass> cudaq::opt::createArgumentSynthesisPass(
-    const ArrayRef<StringRef> &funcNames,
-    const ArrayRef<StringRef> &substitutions) {
+std::unique_ptr<mlir::Pass>
+cudaq::opt::createArgumentSynthesisPass(ArrayRef<StringRef> funcNames,
+                                        ArrayRef<StringRef> substitutions) {
   SmallVector<std::string> pairs;
   if (funcNames.size() == substitutions.size())
     for (auto [name, text] : llvm::zip(funcNames, substitutions))
diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp
index 7927a1995d..d57fc97f5f 100644
--- a/lib/Optimizer/Transforms/GenKernelExecution.cpp
+++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -1329,8 +1329,7 @@ class GenerateKernelExecution
     Value vecArgPtrs;
     if (isCodegenArgumentGather(codegenKind)) {
       // 1) Allocate and initialize a std::vector<void*> object.
-      const unsigned count =
-          cudaq::cc::numberOfHiddenArgs(addThisPtr, hiddenSRet);
+      const unsigned count = devFuncTy.getInputs().size();
       auto stdVec = builder.create<cudaq::cc::AllocaOp>(
           loc, cudaq::opt::factory::stlVectorType(ptrI8Ty));
       auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, ptrI8Ty, count);
diff --git a/python/extension/CMakeLists.txt b/python/extension/CMakeLists.txt
index 48225b9740..425bc1eea8 100644
--- a/python/extension/CMakeLists.txt
+++ b/python/extension/CMakeLists.txt
@@ -72,6 +72,7 @@ declare_mlir_python_extension(CUDAQuantumPythonSources.Extension
     ../../runtime/cudaq/platform/common/QuantumExecutionQueue.cpp
     ../../runtime/cudaq/platform/default/rest_server/RemoteRuntimeClient.cpp
     ../../runtime/cudaq/platform/orca/OrcaQPU.cpp
+    ../../runtime/common/ArgumentConversion.cpp
 
   EMBED_CAPI_LINK_LIBS
    CUDAQuantumMLIRCAPI
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 631e813056..570fa3e597 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include "common/ArgumentConversion.h"
 #include "common/Environment.h"
 #include "common/ExecutionContext.h"
 #include "common/Executor.h"
@@ -17,6 +18,7 @@
 #include "common/RuntimeMLIR.h"
 #include "cudaq.h"
 #include "cudaq/Frontend/nvqpp/AttributeNames.h"
+#include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/OpenQASMEmitter.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
@@ -112,7 +114,8 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
   /// @brief Invoke the kernel in the JIT engine
   void invokeJITKernel(mlir::ExecutionEngine *jit,
                        const std::string &kernelName) {
-    auto funcPtr = jit->lookup(std::string("__nvqpp__mlirgen__") + kernelName);
+    auto funcPtr = jit->lookup(std::string(cudaq::runtime::cudaqGenPrefixName) +
+                               kernelName);
     if (!funcPtr) {
       throw std::runtime_error(
           "cudaq::builder failed to get kernelReg function.");
@@ -347,12 +350,24 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     return output_names;
   }
 
+  std::vector<cudaq::KernelExecution>
+  lowerQuakeCode(const std::string &kernelName, void *kernelArgs) {
+    return lowerQuakeCode(kernelName, kernelArgs, {});
+  }
+
+  std::vector<cudaq::KernelExecution>
+  lowerQuakeCode(const std::string &kernelName,
+                 const std::vector<void *> &rawArgs) {
+    return lowerQuakeCode(kernelName, nullptr, rawArgs);
+  }
+
   /// @brief Extract the Quake representation for the given kernel name and
   /// lower it to the code format required for the specific backend. The
   /// lowering process is controllable via the configuration file in the
   /// platform directory for the targeted backend.
   std::vector<cudaq::KernelExecution>
-  lowerQuakeCode(const std::string &kernelName, void *kernelArgs) {
+  lowerQuakeCode(const std::string &kernelName, void *kernelArgs,
+                 const std::vector<void *> &rawArgs) {
 
     auto [m_module, contextPtr, updatedArgs] =
         extractQuakeCodeAndContext(kernelName, kernelArgs);
@@ -361,7 +376,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
 
     // Extract the kernel name
     auto func = m_module.lookupSymbol<mlir::func::FuncOp>(
-        std::string("__nvqpp__mlirgen__") + kernelName);
+        std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName);
 
     // Create a new Module to clone the function into
     auto location = mlir::FileLineColLoc::get(&context, "<builder>", 1, 1);
@@ -402,10 +417,26 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         throw std::runtime_error("Remote rest platform Quake lowering failed.");
     };
 
-    if (updatedArgs) {
-      cudaq::info("Run Quake Synth.\n");
+    if (!rawArgs.empty() || updatedArgs) {
       mlir::PassManager pm(&context);
-      pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
+      if (!rawArgs.empty()) {
+        cudaq::info("Run Argument Synth.\n");
+        opt::ArgumentConverter argCon(kernelName, moduleOp);
+        argCon.gen(rawArgs);
+        std::string kernName = cudaq::runtime::cudaqGenPrefixName + kernelName;
+        mlir::StringRef sr{kernName};
+        mlir::SmallVector<mlir::StringRef> kernels = {sr};
+        std::string substBuff;
+        llvm::raw_string_ostream ss(substBuff);
+        ss << argCon.getSubstitutionModule();
+        mlir::StringRef su{substBuff};
+        mlir::SmallVector<mlir::StringRef> substs = {su};
+        pm.addNestedPass<mlir::func::FuncOp>(
+            opt::createArgumentSynthesisPass(kernels, substs));
+      } else if (updatedArgs) {
+        cudaq::info("Run Quake Synth.\n");
+        pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
+      }
       pm.addPass(mlir::createCanonicalizerPass());
       if (disableMLIRthreading || enablePrintMLIREachPass)
         moduleOp.getContext()->disableMultithreading();
@@ -418,7 +449,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     runPassPipeline(passPipelineConfig, moduleOp);
 
     auto entryPointFunc = moduleOp.lookupSymbol<mlir::func::FuncOp>(
-        std::string("__nvqpp__mlirgen__") + kernelName);
+        std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName);
     std::vector<std::size_t> mapping_reorder_idx;
     if (auto mappingAttr = dyn_cast_if_present<mlir::ArrayAttr>(
             entryPointFunc->getAttr("mapping_reorder_idx"))) {
@@ -448,7 +479,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
 
         // Get the ansatz
         auto ansatz = moduleOp.lookupSymbol<mlir::func::FuncOp>(
-            std::string("__nvqpp__mlirgen__") + kernelName);
+            std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName);
 
         // Create a new Module to clone the ansatz into it
         auto tmpModuleOp = builder.create<mlir::ModuleOp>();
@@ -513,6 +544,21 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     return codes;
   }
 
+  /// @brief Launch the kernel from raw argument pointers: lower the Quake
+  /// code with `rawArgs` and pass the result to `completeLaunchKernel`.
+  void launchKernel(const std::string &kernelName,
+                    const std::vector<void *> &rawArgs) override {
+    cudaq::info("launching remote rest kernel ({})", kernelName);
+    // TODO future iterations of this should support non-void return types.
+    if (!executionContext)
+      throw std::runtime_error(
+          "Remote rest execution can only be performed via cudaq::sample(), "
+          "cudaq::observe(), or cudaq::draw().");
+
+    // Get the Quake code, lowered according to config file, and finish up.
+    completeLaunchKernel(kernelName, lowerQuakeCode(kernelName, rawArgs));
+  }
+
   /// @brief Launch the kernel. Extract the Quake code and lower to
   /// the representation required by the targeted backend. Handle all pertinent
   /// modifications for the execution context as well as asynchronous or
@@ -530,6 +576,11 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
 
     // Get the Quake code, lowered according to config file.
     auto codes = lowerQuakeCode(kernelName, args);
+    completeLaunchKernel(kernelName, std::move(codes));
+  }
+
+  void completeLaunchKernel(const std::string &kernelName,
+                            std::vector<cudaq::KernelExecution> &&codes) {
 
     // After performing lowerQuakeCode, check to see if we are simply drawing
     // the circuit. If so, perform the trace here and then return.
diff --git a/runtime/common/BaseRemoteSimulatorQPU.h b/runtime/common/BaseRemoteSimulatorQPU.h
index 2f63f16faf..9e833aa565 100644
--- a/runtime/common/BaseRemoteSimulatorQPU.h
+++ b/runtime/common/BaseRemoteSimulatorQPU.h
@@ -105,6 +105,11 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU {
       throw std::runtime_error("Failed to launch VQE. Error: " + errorMsg);
   }
 
+  void launchKernel(const std::string &name,
+                    const std::vector<void *> &rawArgs) override {
+    throw std::runtime_error("launch kernel on raw args not implemented");
+  }
+
   void launchKernel(const std::string &name, void (*kernelFunc)(void *),
                     void *args, std::uint64_t voidStarSize,
                     std::uint64_t resultOffset) override {
diff --git a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp
index 6e2ad949c1..988e029367 100644
--- a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp
+++ b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp
@@ -39,6 +39,11 @@ class DefaultQPU : public cudaq::QPU {
     kernelFunc(args);
   }
 
+  void launchKernel(const std::string &name,
+                    const std::vector<void *> &) override {
+    throw std::runtime_error("Wrong kernel launch point.");
+  }
+
   /// Overrides setExecutionContext to forward it to the ExecutionManager
   void setExecutionContext(cudaq::ExecutionContext *context) override {
     ScopedTraceWithContext("DefaultPlatform::setExecutionContext",
diff --git a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp
index 5abd45bdf7..eb62f6569a 100644
--- a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp
+++ b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp
@@ -44,6 +44,11 @@ class GPUEmulatedQPU : public cudaq::QPU {
     kernelFunc(args);
   }
 
+  void launchKernel(const std::string &name,
+                    const std::vector<void *> &rawArgs) override {
+    throw std::runtime_error("launch kernel on raw args not implemented");
+  }
+
   /// Overrides setExecutionContext to forward it to the ExecutionManager
   void setExecutionContext(cudaq::ExecutionContext *context) override {
     cudaSetDevice(qpu_id);
diff --git a/runtime/cudaq/platform/orca/OrcaQPU.cpp b/runtime/cudaq/platform/orca/OrcaQPU.cpp
index bfab6ac839..8c6a414b5a 100644
--- a/runtime/cudaq/platform/orca/OrcaQPU.cpp
+++ b/runtime/cudaq/platform/orca/OrcaQPU.cpp
@@ -175,6 +175,10 @@ class OrcaRemoteRESTQPU : public cudaq::QPU {
   void launchKernel(const std::string &kernelName, void (*kernelFunc)(void *),
                     void *args, std::uint64_t voidStarSize,
                     std::uint64_t resultOffset) override;
+  void launchKernel(const std::string &kernelName,
+                    const std::vector<void *> &rawArgs) override {
+    throw std::runtime_error("launch kernel on raw args not implemented");
+  }
 };
 
 /// @brief This setTargetBackend override is in charge of reading the
@@ -321,4 +325,4 @@ cudaq::RestHeaders OrcaRemoteRESTQPU::getHeaders() {
 
 } // namespace
 
-CUDAQ_REGISTER_TYPE(cudaq::QPU, OrcaRemoteRESTQPU, orca)
\ No newline at end of file
+CUDAQ_REGISTER_TYPE(cudaq::QPU, OrcaRemoteRESTQPU, orca)
diff --git a/runtime/cudaq/platform/qpu.h b/runtime/cudaq/platform/qpu.h
index 6f1a3024e3..0209c8bd7b 100644
--- a/runtime/cudaq/platform/qpu.h
+++ b/runtime/cudaq/platform/qpu.h
@@ -174,6 +174,8 @@ class QPU : public registry::RegisteredType<QPU> {
   /// as a struct-packed void pointer and its corresponding size.
   virtual void launchKernel(const std::string &name, void (*kernelFunc)(void *),
                             void *args, std::uint64_t, std::uint64_t) = 0;
+  virtual void launchKernel(const std::string &name,
+                            const std::vector<void *> &rawArgs) = 0;
 
   /// Launch serialized code for remote execution. Subtypes that support this
   /// should override this function.
diff --git a/runtime/cudaq/platform/quantum_platform.cpp b/runtime/cudaq/platform/quantum_platform.cpp
index 6f98283047..1c407df1f0 100644
--- a/runtime/cudaq/platform/quantum_platform.cpp
+++ b/runtime/cudaq/platform/quantum_platform.cpp
@@ -163,6 +163,19 @@ void quantum_platform::launchKernel(std::string kernelName,
   qpu->launchKernel(kernelName, kernelFunc, args, voidStarSize, resultOffset);
 }
 
+void quantum_platform::launchKernel(std::string kernelName,
+                                    const std::vector<void *> &rawArgs) {
+  // Select the QPU mapped to the calling thread in threadToQpuId; a thread
+  // without an entry uses QPU 0.
+  const auto threadKey =
+      std::hash<std::thread::id>{}(std::this_thread::get_id());
+  const auto found = threadToQpuId.find(threadKey);
+  const std::size_t qpuId = found == threadToQpuId.end() ? 0 : found->second;
+
+  // Forward the raw argument pointers to the selected QPU.
+  platformQPUs[qpuId]->launchKernel(kernelName, rawArgs);
+}
+
 void quantum_platform::launchSerializedCodeExecution(
     const std::string &name,
     cudaq::SerializedCodeExecutionContext &serializeCodeExecutionObject) {
@@ -201,3 +214,12 @@ void cudaq::altLaunchKernel(const char *kernelName, void (*kernelFunc)(void *),
   platform.launchKernel(kernName, kernelFunc, kernelArgs, argsSize,
                         resultOffset);
 }
+
+void cudaq::streamlinedLaunchKernel(const char *kernelName,
+                                    const std::vector<void *> &rawArgs) {
+  std::size_t numRawArgs = rawArgs.size();
+  ScopedTraceWithContext("streamlinedLaunchKernel", kernelName, numRawArgs);
+  // Forward the raw argument pointers to the current quantum platform.
+  auto &currentPlatform = *cudaq::getQuantumPlatformInternal();
+  currentPlatform.launchKernel(std::string{kernelName}, rawArgs);
+}
diff --git a/runtime/cudaq/platform/quantum_platform.h b/runtime/cudaq/platform/quantum_platform.h
index 17244340d0..5f972fd7bc 100644
--- a/runtime/cudaq/platform/quantum_platform.h
+++ b/runtime/cudaq/platform/quantum_platform.h
@@ -145,6 +145,7 @@ class quantum_platform {
   void launchKernel(std::string kernelName, void (*kernelFunc)(void *),
                     void *args, std::uint64_t voidStarSize,
                     std::uint64_t resultOffset);
+  void launchKernel(std::string kernelName, const std::vector<void *> &);
 
   // This method is the hook for executing SerializedCodeExecutionContext
   // objects.
@@ -212,8 +213,20 @@ class quantum_platform {
 /// tied to the quantum platform instance somehow. Note that the compiler cannot
 /// provide that information.
 extern "C" {
+// Client-server (legacy) interface.
 void altLaunchKernel(const char *kernelName, void (*kernel)(void *), void *args,
                      std::uint64_t argsSize, std::uint64_t resultOffset);
+// Streamlined interface for launching kernels. Argument synthesis and JIT
+// compilation *must* happen on the local machine.
+void streamlinedLaunchKernel(const char *kernelName,
+                             const std::vector<void *> &rawArgs);
+// Hybrid of the client-server and streamlined approaches. JIT compilation can
+// happen either early or late, and return values from each kernel launch can
+// be handled.
+void hybridLaunchKernel(const char *kernelName, void (*kernel)(void *),
+                        void *args, std::uint64_t argsSize,
+                        std::uint64_t resultOffset,
+                        const std::vector<void *> &rawArgs);
 }
 
 } // namespace cudaq
diff --git a/targettests/execution/test-6.cpp b/targettests/execution/test-6.cpp
index 24288cb89f..b0c6fc855f 100644
--- a/targettests/execution/test-6.cpp
+++ b/targettests/execution/test-6.cpp
@@ -8,6 +8,7 @@
 
 // REQUIRES: c++20
 // RUN: nvq++ --target quantinuum --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ -fkernel-exec-kind=2 --target quantinuum --emulate %s -o %t && %t | FileCheck %s
 
 #include <cudaq.h>
 #include <iostream>
diff --git a/targettests/execution/to_integer.cpp b/targettests/execution/to_integer.cpp
index 598ddd59c1..4890fd16f1 100644
--- a/targettests/execution/to_integer.cpp
+++ b/targettests/execution/to_integer.cpp
@@ -6,10 +6,8 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t
-// RUN: if [ $(echo %cpp_std | cut -c4- ) -ge 20 ]; then \
-// RUN:   nvq++ --enable-mlir %s -o %t && %t; \
-// RUN: fi
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 --enable-mlir %s -o %t && %t
 
 #include <cudaq.h>
 
diff --git a/tools/nvqpp/nvq++.in b/tools/nvqpp/nvq++.in
index ed1fd32741..98368cdf8e 100644
--- a/tools/nvqpp/nvq++.in
+++ b/tools/nvqpp/nvq++.in
@@ -95,6 +95,9 @@ function f_option_handling {
 	-flower-to-cfg)
 		ENABLE_LOWER_TO_CFG=true
 		;;
+	-fkernel-exec-kind=*)
+		KERNEL_EXECUTION_KIND="{codegen=${1#*=}}"
+		;;
 	*)
 		# Pass any unrecognized options on to the clang++ tool.
 		ARGS="${ARGS} $1"
@@ -325,6 +328,7 @@ SHOW_VERSION=false
 ENABLE_UNWIND_LOWERING=true
 ENABLE_DEVICE_CODE_LOADERS=true
 ENABLE_KERNEL_EXECUTION=true
+KERNEL_EXECUTION_KIND=
 ENABLE_AGGRESSIVE_EARLY_INLINE=true
 ENABLE_LOWER_TO_CFG=true
 ENABLE_APPLY_SPECIALIZATION=true
@@ -680,7 +684,7 @@ if ${ENABLE_APPLY_SPECIALIZATION}; then
 fi
 if ${ENABLE_KERNEL_EXECUTION}; then
 	RUN_OPT=true
-	OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "kernel-execution")
+	OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "kernel-execution${KERNEL_EXECUTION_KIND}")
 fi
 if ${ENABLE_AGGRESSIVE_EARLY_INLINE}; then
 	RUN_OPT=true