diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h index 56f65f4f79..ad31dff838 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.h +++ b/include/cudaq/Optimizer/Transforms/Passes.h @@ -57,9 +57,9 @@ inline std::unique_ptr createPySynthCallableBlockArgs() { /// Helper function to build an argument synthesis pass. The names of the /// functions and the substitutions text can be built as an unzipped pair of /// lists. -std::unique_ptr createArgumentSynthesisPass( - const mlir::ArrayRef &funcNames, - const mlir::ArrayRef &substitutions); +std::unique_ptr +createArgumentSynthesisPass(mlir::ArrayRef funcNames, + mlir::ArrayRef substitutions); // declarative passes #define GEN_PASS_DECL diff --git a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp index 7281c0e21f..138e842373 100644 --- a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp +++ b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp @@ -73,8 +73,9 @@ class ArgumentSynthesisPass assert(*substMod && "module must have been created"); // 2. Go through the Module and process each substitution. - std::vector processedArgs(func.getFunctionType().getNumInputs()); - std::vector> replacements; + SmallVector processedArgs(func.getFunctionType().getNumInputs()); + SmallVector> replacements; + BitVector replacedArgs(processedArgs.size()); for (auto &op : *substMod) { auto subst = dyn_cast(op); if (!subst) { @@ -103,6 +104,17 @@ class ArgumentSynthesisPass // OK, substitute the code for the argument. Block &entry = func.getRegion().front(); processedArgs[pos] = true; + if (subst.getBody().front().empty()) { + // No code is present. Erase the argument if it is not used. 
+ const auto numUses = + std::distance(entry.getArgument(pos).getUses().begin(), + entry.getArgument(pos).getUses().end()); + LLVM_DEBUG(llvm::dbgs() << "maybe erasing an unused argument (" + << std::to_string(numUses) << ")\n"); + if (numUses == 0) + replacedArgs.set(pos); + continue; + } OpBuilder builder{ctx}; Block *splitBlock = entry.splitBlock(entry.begin()); builder.setInsertionPointToEnd(&entry); @@ -126,7 +138,6 @@ class ArgumentSynthesisPass // function is still dead and can be removed by a DCE. // 3. Replace the block argument values with the freshly inserted new code. - BitVector replacedArgs(processedArgs.size()); for (auto [pos, fromVal, toVal] : replacements) { replacedArgs.set(pos); fromVal.replaceAllUsesWith(toVal); @@ -142,9 +153,9 @@ class ArgumentSynthesisPass // Helper function that takes an unzipped pair of lists of function names and // substitution code strings. This is meant to make adding this pass to a // pipeline easier from within a tool (such as the JIT compiler). -std::unique_ptr cudaq::opt::createArgumentSynthesisPass( - const ArrayRef &funcNames, - const ArrayRef &substitutions) { +std::unique_ptr +cudaq::opt::createArgumentSynthesisPass(ArrayRef funcNames, + ArrayRef substitutions) { SmallVector pairs; if (funcNames.size() == substitutions.size()) for (auto [name, text] : llvm::zip(funcNames, substitutions)) diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 7927a1995d..d57fc97f5f 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -1329,8 +1329,7 @@ class GenerateKernelExecution Value vecArgPtrs; if (isCodegenArgumentGather(codegenKind)) { // 1) Allocate and initialize a std::vector object. 
- const unsigned count = - cudaq::cc::numberOfHiddenArgs(addThisPtr, hiddenSRet); + const unsigned count = devFuncTy.getInputs().size(); auto stdVec = builder.create( loc, cudaq::opt::factory::stlVectorType(ptrI8Ty)); auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, ptrI8Ty, count); diff --git a/python/extension/CMakeLists.txt b/python/extension/CMakeLists.txt index 48225b9740..425bc1eea8 100644 --- a/python/extension/CMakeLists.txt +++ b/python/extension/CMakeLists.txt @@ -72,6 +72,7 @@ declare_mlir_python_extension(CUDAQuantumPythonSources.Extension ../../runtime/cudaq/platform/common/QuantumExecutionQueue.cpp ../../runtime/cudaq/platform/default/rest_server/RemoteRuntimeClient.cpp ../../runtime/cudaq/platform/orca/OrcaQPU.cpp + ../../runtime/common/ArgumentConversion.cpp EMBED_CAPI_LINK_LIBS CUDAQuantumMLIRCAPI diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h index 631e813056..570fa3e597 100644 --- a/runtime/common/BaseRemoteRESTQPU.h +++ b/runtime/common/BaseRemoteRESTQPU.h @@ -8,6 +8,7 @@ #pragma once +#include "common/ArgumentConversion.h" #include "common/Environment.h" #include "common/ExecutionContext.h" #include "common/Executor.h" @@ -17,6 +18,7 @@ #include "common/RuntimeMLIR.h" #include "cudaq.h" #include "cudaq/Frontend/nvqpp/AttributeNames.h" +#include "cudaq/Optimizer/Builder/Runtime.h" #include "cudaq/Optimizer/CodeGen/OpenQASMEmitter.h" #include "cudaq/Optimizer/CodeGen/Passes.h" #include "cudaq/Optimizer/Dialect/CC/CCDialect.h" @@ -112,7 +114,8 @@ class BaseRemoteRESTQPU : public cudaq::QPU { /// @brief Invoke the kernel in the JIT engine void invokeJITKernel(mlir::ExecutionEngine *jit, const std::string &kernelName) { - auto funcPtr = jit->lookup(std::string("__nvqpp__mlirgen__") + kernelName); + auto funcPtr = jit->lookup(std::string(cudaq::runtime::cudaqGenPrefixName) + + kernelName); if (!funcPtr) { throw std::runtime_error( "cudaq::builder failed to get kernelReg function."); @@ -347,12 +350,24 @@ class 
BaseRemoteRESTQPU : public cudaq::QPU { return output_names; } + std::vector + lowerQuakeCode(const std::string &kernelName, void *kernelArgs) { + return lowerQuakeCode(kernelName, kernelArgs, {}); + } + + std::vector + lowerQuakeCode(const std::string &kernelName, + const std::vector &rawArgs) { + return lowerQuakeCode(kernelName, nullptr, rawArgs); + } + /// @brief Extract the Quake representation for the given kernel name and /// lower it to the code format required for the specific backend. The /// lowering process is controllable via the configuration file in the /// platform directory for the targeted backend. std::vector - lowerQuakeCode(const std::string &kernelName, void *kernelArgs) { + lowerQuakeCode(const std::string &kernelName, void *kernelArgs, + const std::vector &rawArgs) { auto [m_module, contextPtr, updatedArgs] = extractQuakeCodeAndContext(kernelName, kernelArgs); @@ -361,7 +376,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU { // Extract the kernel name auto func = m_module.lookupSymbol( - std::string("__nvqpp__mlirgen__") + kernelName); + std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName); // Create a new Module to clone the function into auto location = mlir::FileLineColLoc::get(&context, "", 1, 1); @@ -402,10 +417,26 @@ class BaseRemoteRESTQPU : public cudaq::QPU { throw std::runtime_error("Remote rest platform Quake lowering failed."); }; - if (updatedArgs) { - cudaq::info("Run Quake Synth.\n"); + if (!rawArgs.empty() || updatedArgs) { mlir::PassManager pm(&context); - pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs)); + if (!rawArgs.empty()) { + cudaq::info("Run Argument Synth.\n"); + opt::ArgumentConverter argCon(kernelName, moduleOp); + argCon.gen(rawArgs); + std::string kernName = cudaq::runtime::cudaqGenPrefixName + kernelName; + mlir::StringRef sr{kernName}; + mlir::SmallVector kernels = {sr}; + std::string substBuff; + llvm::raw_string_ostream ss(substBuff); + ss << argCon.getSubstitutionModule(); + 
mlir::StringRef su{substBuff}; + mlir::SmallVector substs = {su}; + pm.addNestedPass( + opt::createArgumentSynthesisPass(kernels, substs)); + } else if (updatedArgs) { + cudaq::info("Run Quake Synth.\n"); + pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs)); + } pm.addPass(mlir::createCanonicalizerPass()); if (disableMLIRthreading || enablePrintMLIREachPass) moduleOp.getContext()->disableMultithreading(); @@ -418,7 +449,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU { runPassPipeline(passPipelineConfig, moduleOp); auto entryPointFunc = moduleOp.lookupSymbol( - std::string("__nvqpp__mlirgen__") + kernelName); + std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName); std::vector mapping_reorder_idx; if (auto mappingAttr = dyn_cast_if_present( entryPointFunc->getAttr("mapping_reorder_idx"))) { @@ -448,7 +479,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU { // Get the ansatz auto ansatz = moduleOp.lookupSymbol( - std::string("__nvqpp__mlirgen__") + kernelName); + std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName); // Create a new Module to clone the ansatz into it auto tmpModuleOp = builder.create(); @@ -513,6 +544,21 @@ class BaseRemoteRESTQPU : public cudaq::QPU { return codes; } + void launchKernel(const std::string &kernelName, + const std::vector &rawArgs) override { + cudaq::info("launching remote rest kernel ({})", kernelName); + + // TODO future iterations of this should support non-void return types. + if (!executionContext) + throw std::runtime_error( + "Remote rest execution can only be performed via cudaq::sample(), " + "cudaq::observe(), or cudaq::draw()."); + + // Get the Quake code, lowered according to config file. + auto codes = lowerQuakeCode(kernelName, rawArgs); + completeLaunchKernel(kernelName, std::move(codes)); + } + /// @brief Launch the kernel. Extract the Quake code and lower to /// the representation required by the targeted backend. 
Handle all pertinent /// modifications for the execution context as well as asynchronous or @@ -530,6 +576,11 @@ class BaseRemoteRESTQPU : public cudaq::QPU { // Get the Quake code, lowered according to config file. auto codes = lowerQuakeCode(kernelName, args); + completeLaunchKernel(kernelName, std::move(codes)); + } + + void completeLaunchKernel(const std::string &kernelName, + std::vector &&codes) { // After performing lowerQuakeCode, check to see if we are simply drawing // the circuit. If so, perform the trace here and then return. diff --git a/runtime/common/BaseRemoteSimulatorQPU.h b/runtime/common/BaseRemoteSimulatorQPU.h index 2f63f16faf..9e833aa565 100644 --- a/runtime/common/BaseRemoteSimulatorQPU.h +++ b/runtime/common/BaseRemoteSimulatorQPU.h @@ -105,6 +105,11 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { throw std::runtime_error("Failed to launch VQE. Error: " + errorMsg); } + void launchKernel(const std::string &name, + const std::vector &rawArgs) override { + throw std::runtime_error("launch kernel on raw args not implemented"); + } + void launchKernel(const std::string &name, void (*kernelFunc)(void *), void *args, std::uint64_t voidStarSize, std::uint64_t resultOffset) override { diff --git a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp index 6e2ad949c1..988e029367 100644 --- a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp +++ b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp @@ -39,6 +39,11 @@ class DefaultQPU : public cudaq::QPU { kernelFunc(args); } + void launchKernel(const std::string &name, + const std::vector &) override { + throw std::runtime_error("Wrong kernel launch point."); + } + /// Overrides setExecutionContext to forward it to the ExecutionManager void setExecutionContext(cudaq::ExecutionContext *context) override { ScopedTraceWithContext("DefaultPlatform::setExecutionContext", diff --git 
a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp index 5abd45bdf7..eb62f6569a 100644 --- a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp +++ b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp @@ -44,6 +44,11 @@ class GPUEmulatedQPU : public cudaq::QPU { kernelFunc(args); } + void launchKernel(const std::string &name, + const std::vector &rawArgs) override { + throw std::runtime_error("not implemented"); + } + /// Overrides setExecutionContext to forward it to the ExecutionManager void setExecutionContext(cudaq::ExecutionContext *context) override { cudaSetDevice(qpu_id); diff --git a/runtime/cudaq/platform/orca/OrcaQPU.cpp b/runtime/cudaq/platform/orca/OrcaQPU.cpp index bfab6ac839..8c6a414b5a 100644 --- a/runtime/cudaq/platform/orca/OrcaQPU.cpp +++ b/runtime/cudaq/platform/orca/OrcaQPU.cpp @@ -175,6 +175,10 @@ class OrcaRemoteRESTQPU : public cudaq::QPU { void launchKernel(const std::string &kernelName, void (*kernelFunc)(void *), void *args, std::uint64_t voidStarSize, std::uint64_t resultOffset) override; + void launchKernel(const std::string &kernelName, + const std::vector &rawArgs) override { + throw std::runtime_error("launch kernel on raw args not implemented"); + } }; /// @brief This setTargetBackend override is in charge of reading the @@ -321,4 +325,4 @@ cudaq::RestHeaders OrcaRemoteRESTQPU::getHeaders() { } // namespace -CUDAQ_REGISTER_TYPE(cudaq::QPU, OrcaRemoteRESTQPU, orca) \ No newline at end of file +CUDAQ_REGISTER_TYPE(cudaq::QPU, OrcaRemoteRESTQPU, orca) diff --git a/runtime/cudaq/platform/qpu.h b/runtime/cudaq/platform/qpu.h index 6f1a3024e3..0209c8bd7b 100644 --- a/runtime/cudaq/platform/qpu.h +++ b/runtime/cudaq/platform/qpu.h @@ -174,6 +174,8 @@ class QPU : public registry::RegisteredType { /// as a struct-packed void pointer and its corresponding size. 
virtual void launchKernel(const std::string &name, void (*kernelFunc)(void *), void *args, std::uint64_t, std::uint64_t) = 0; + virtual void launchKernel(const std::string &name, + const std::vector &rawArgs) = 0; /// Launch serialized code for remote execution. Subtypes that support this /// should override this function. diff --git a/runtime/cudaq/platform/quantum_platform.cpp b/runtime/cudaq/platform/quantum_platform.cpp index 6f98283047..1c407df1f0 100644 --- a/runtime/cudaq/platform/quantum_platform.cpp +++ b/runtime/cudaq/platform/quantum_platform.cpp @@ -163,6 +163,19 @@ void quantum_platform::launchKernel(std::string kernelName, qpu->launchKernel(kernelName, kernelFunc, args, voidStarSize, resultOffset); } +void quantum_platform::launchKernel(std::string kernelName, + const std::vector &rawArgs) { + std::size_t qpu_id = 0; + + auto tid = std::hash{}(std::this_thread::get_id()); + auto iter = threadToQpuId.find(tid); + if (iter != threadToQpuId.end()) + qpu_id = iter->second; + + auto &qpu = platformQPUs[qpu_id]; + qpu->launchKernel(kernelName, rawArgs); +} + void quantum_platform::launchSerializedCodeExecution( const std::string &name, cudaq::SerializedCodeExecutionContext &serializeCodeExecutionObject) { @@ -201,3 +214,12 @@ void cudaq::altLaunchKernel(const char *kernelName, void (*kernelFunc)(void *), platform.launchKernel(kernName, kernelFunc, kernelArgs, argsSize, resultOffset); } + +void cudaq::streamlinedLaunchKernel(const char *kernelName, + const std::vector &rawArgs) { + std::size_t argsSize = rawArgs.size(); + ScopedTraceWithContext("streamlinedLaunchKernel", kernelName, argsSize); + auto &platform = *cudaq::getQuantumPlatformInternal(); + std::string kernName = kernelName; + platform.launchKernel(kernName, rawArgs); +} diff --git a/runtime/cudaq/platform/quantum_platform.h b/runtime/cudaq/platform/quantum_platform.h index 17244340d0..5f972fd7bc 100644 --- a/runtime/cudaq/platform/quantum_platform.h +++ b/runtime/cudaq/platform/quantum_platform.h 
@@ -145,6 +145,7 @@ class quantum_platform { void launchKernel(std::string kernelName, void (*kernelFunc)(void *), void *args, std::uint64_t voidStarSize, std::uint64_t resultOffset); + void launchKernel(std::string kernelName, const std::vector &); // This method is the hook for executing SerializedCodeExecutionContext // objects. @@ -212,8 +213,20 @@ class quantum_platform { /// tied to the quantum platform instance somehow. Note that the compiler cannot /// provide that information. extern "C" { +// Client-server (legacy) interface. void altLaunchKernel(const char *kernelName, void (*kernel)(void *), void *args, std::uint64_t argsSize, std::uint64_t resultOffset); +// Streamlined interface for launching kernels. Argument synthesis and JIT +// compilation *must* happen on the local machine. +void streamlinedLaunchKernel(const char *kernelName, + const std::vector &rawArgs); +// Hybrid of the client-server and streamlined approaches. It lets JIT +// compilation happen either early or late and can handle return values from +// each kernel launch. +void hybridLaunchKernel(const char *kernelName, void (*kernel)(void *), + void *args, std::uint64_t argsSize, + std::uint64_t resultOffset, + const std::vector &rawArgs); } } // namespace cudaq diff --git a/targettests/execution/test-6.cpp b/targettests/execution/test-6.cpp index 24288cb89f..b0c6fc855f 100644 --- a/targettests/execution/test-6.cpp +++ b/targettests/execution/test-6.cpp @@ -8,6 +8,7 @@ // REQUIRES: c++20 // RUN: nvq++ --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ -fkernel-exec-kind=2 --target quantinuum --emulate %s -o %t && %t | FileCheck %s #include #include diff --git a/targettests/execution/to_integer.cpp b/targettests/execution/to_integer.cpp index 598ddd59c1..4890fd16f1 100644 --- a/targettests/execution/to_integer.cpp +++ b/targettests/execution/to_integer.cpp @@ -6,10 +6,8 @@ * the terms of the Apache License 2.0 which accompanies this distribution.
* ******************************************************************************/ -// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t -// RUN: if [ $(echo %cpp_std | cut -c4- ) -ge 20 ]; then \ -// RUN: nvq++ --enable-mlir %s -o %t && %t; \ -// RUN: fi +// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t +// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 --enable-mlir %s -o %t && %t #include diff --git a/tools/nvqpp/nvq++.in b/tools/nvqpp/nvq++.in index ed1fd32741..98368cdf8e 100644 --- a/tools/nvqpp/nvq++.in +++ b/tools/nvqpp/nvq++.in @@ -95,6 +95,9 @@ function f_option_handling { -flower-to-cfg) ENABLE_LOWER_TO_CFG=true ;; + -fkernel-exec-kind=*) + KERNEL_EXECUTION_KIND="{codegen=${1#*=}}" + ;; *) # Pass any unrecognized options on to the clang++ tool. ARGS="${ARGS} $1" @@ -325,6 +328,7 @@ SHOW_VERSION=false ENABLE_UNWIND_LOWERING=true ENABLE_DEVICE_CODE_LOADERS=true ENABLE_KERNEL_EXECUTION=true +KERNEL_EXECUTION_KIND= ENABLE_AGGRESSIVE_EARLY_INLINE=true ENABLE_LOWER_TO_CFG=true ENABLE_APPLY_SPECIALIZATION=true @@ -680,7 +684,7 @@ if ${ENABLE_APPLY_SPECIALIZATION}; then fi if ${ENABLE_KERNEL_EXECUTION}; then RUN_OPT=true - OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "kernel-execution") + OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "kernel-execution${KERNEL_EXECUTION_KIND}") fi if ${ENABLE_AGGRESSIVE_EARLY_INLINE}; then RUN_OPT=true