Skip to content

Commit

Permalink
[JIT] Hook everything up to use the new argument synthesis (#2084)
Browse files Browse the repository at this point in the history
* [JIT] Hook everything up to use the new argument synthesis

Add option to nvq++ to use the new kernel launcher.

Add code to the runtime to implement the new launch sequence. This
receives the new vector of arguments protocol, uses the new argument
conversion, and then calls the new argument synthesis pass to specialize
the kernel for JIT compilation.

Add a couple of tests to smoke test this new implementation.

* Add missing override.
Add ArgumentConversion module to the Python library to resolve symbols.
  • Loading branch information
schweitzpgi authored Aug 15, 2024
1 parent a817c65 commit 78f0715
Show file tree
Hide file tree
Showing 15 changed files with 146 additions and 25 deletions.
6 changes: 3 additions & 3 deletions include/cudaq/Optimizer/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ inline std::unique_ptr<mlir::Pass> createPySynthCallableBlockArgs() {
/// Helper function to build an argument synthesis pass. The names of the
/// functions and the substitutions text can be built as an unzipped pair of
/// lists.
std::unique_ptr<mlir::Pass> createArgumentSynthesisPass(
const mlir::ArrayRef<mlir::StringRef> &funcNames,
const mlir::ArrayRef<mlir::StringRef> &substitutions);
std::unique_ptr<mlir::Pass>
createArgumentSynthesisPass(mlir::ArrayRef<mlir::StringRef> funcNames,
mlir::ArrayRef<mlir::StringRef> substitutions);

// declarative passes
#define GEN_PASS_DECL
Expand Down
23 changes: 17 additions & 6 deletions lib/Optimizer/Transforms/ArgumentSynthesis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,9 @@ class ArgumentSynthesisPass
assert(*substMod && "module must have been created");

// 2. Go through the Module and process each substitution.
std::vector<bool> processedArgs(func.getFunctionType().getNumInputs());
std::vector<std::tuple<unsigned, Value, Value>> replacements;
SmallVector<bool> processedArgs(func.getFunctionType().getNumInputs());
SmallVector<std::tuple<unsigned, Value, Value>> replacements;
BitVector replacedArgs(processedArgs.size());
for (auto &op : *substMod) {
auto subst = dyn_cast<cudaq::cc::ArgumentSubstitutionOp>(op);
if (!subst) {
Expand Down Expand Up @@ -103,6 +104,17 @@ class ArgumentSynthesisPass
// OK, substitute the code for the argument.
Block &entry = func.getRegion().front();
processedArgs[pos] = true;
if (subst.getBody().front().empty()) {
// No code is present. Erase the argument if it is not used.
const auto numUses =
std::distance(entry.getArgument(pos).getUses().begin(),
entry.getArgument(pos).getUses().end());
LLVM_DEBUG(llvm::dbgs() << "maybe erasing an unused argument ("
<< std::to_string(numUses) << ")\n");
if (numUses == 0)
replacedArgs.set(pos);
continue;
}
OpBuilder builder{ctx};
Block *splitBlock = entry.splitBlock(entry.begin());
builder.setInsertionPointToEnd(&entry);
Expand All @@ -126,7 +138,6 @@ class ArgumentSynthesisPass
// function is still dead and can be removed by a DCE.

// 3. Replace the block argument values with the freshly inserted new code.
BitVector replacedArgs(processedArgs.size());
for (auto [pos, fromVal, toVal] : replacements) {
replacedArgs.set(pos);
fromVal.replaceAllUsesWith(toVal);
Expand All @@ -142,9 +153,9 @@ class ArgumentSynthesisPass
// Helper function that takes an unzipped pair of lists of function names and
// substitution code strings. This is meant to make adding this pass to a
// pipeline easier from within a tool (such as the JIT compiler).
std::unique_ptr<mlir::Pass> cudaq::opt::createArgumentSynthesisPass(
const ArrayRef<StringRef> &funcNames,
const ArrayRef<StringRef> &substitutions) {
std::unique_ptr<mlir::Pass>
cudaq::opt::createArgumentSynthesisPass(ArrayRef<StringRef> funcNames,
ArrayRef<StringRef> substitutions) {
SmallVector<std::string> pairs;
if (funcNames.size() == substitutions.size())
for (auto [name, text] : llvm::zip(funcNames, substitutions))
Expand Down
3 changes: 1 addition & 2 deletions lib/Optimizer/Transforms/GenKernelExecution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1329,8 +1329,7 @@ class GenerateKernelExecution
Value vecArgPtrs;
if (isCodegenArgumentGather(codegenKind)) {
// 1) Allocate and initialize a std::vector<void*> object.
const unsigned count =
cudaq::cc::numberOfHiddenArgs(addThisPtr, hiddenSRet);
const unsigned count = devFuncTy.getInputs().size();
auto stdVec = builder.create<cudaq::cc::AllocaOp>(
loc, cudaq::opt::factory::stlVectorType(ptrI8Ty));
auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, ptrI8Ty, count);
Expand Down
1 change: 1 addition & 0 deletions python/extension/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ declare_mlir_python_extension(CUDAQuantumPythonSources.Extension
../../runtime/cudaq/platform/common/QuantumExecutionQueue.cpp
../../runtime/cudaq/platform/default/rest_server/RemoteRuntimeClient.cpp
../../runtime/cudaq/platform/orca/OrcaQPU.cpp
../../runtime/common/ArgumentConversion.cpp

EMBED_CAPI_LINK_LIBS
CUDAQuantumMLIRCAPI
Expand Down
67 changes: 59 additions & 8 deletions runtime/common/BaseRemoteRESTQPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#pragma once

#include "common/ArgumentConversion.h"
#include "common/Environment.h"
#include "common/ExecutionContext.h"
#include "common/Executor.h"
Expand All @@ -17,6 +18,7 @@
#include "common/RuntimeMLIR.h"
#include "cudaq.h"
#include "cudaq/Frontend/nvqpp/AttributeNames.h"
#include "cudaq/Optimizer/Builder/Runtime.h"
#include "cudaq/Optimizer/CodeGen/OpenQASMEmitter.h"
#include "cudaq/Optimizer/CodeGen/Passes.h"
#include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
Expand Down Expand Up @@ -112,7 +114,8 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
/// @brief Invoke the kernel in the JIT engine
void invokeJITKernel(mlir::ExecutionEngine *jit,
const std::string &kernelName) {
auto funcPtr = jit->lookup(std::string("__nvqpp__mlirgen__") + kernelName);
auto funcPtr = jit->lookup(std::string(cudaq::runtime::cudaqGenPrefixName) +
kernelName);
if (!funcPtr) {
throw std::runtime_error(
"cudaq::builder failed to get kernelReg function.");
Expand Down Expand Up @@ -347,12 +350,24 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
return output_names;
}

/// Lower Quake code when only the legacy packed-argument pointer is
/// available. Delegates to the common overload with an empty raw-args list.
std::vector<cudaq::KernelExecution>
lowerQuakeCode(const std::string &kernelName, void *kernelArgs) {
  const std::vector<void *> noRawArgs;
  return lowerQuakeCode(kernelName, kernelArgs, noRawArgs);
}

/// Lower Quake code when only the raw argument-pointer vector is available.
/// Delegates to the common overload with a null legacy argument pointer.
std::vector<cudaq::KernelExecution>
lowerQuakeCode(const std::string &kernelName,
               const std::vector<void *> &rawArgs) {
  return lowerQuakeCode(kernelName, /*kernelArgs=*/nullptr, rawArgs);
}

/// @brief Extract the Quake representation for the given kernel name and
/// lower it to the code format required for the specific backend. The
/// lowering process is controllable via the configuration file in the
/// platform directory for the targeted backend.
std::vector<cudaq::KernelExecution>
lowerQuakeCode(const std::string &kernelName, void *kernelArgs) {
lowerQuakeCode(const std::string &kernelName, void *kernelArgs,
const std::vector<void *> &rawArgs) {

auto [m_module, contextPtr, updatedArgs] =
extractQuakeCodeAndContext(kernelName, kernelArgs);
Expand All @@ -361,7 +376,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {

// Extract the kernel name
auto func = m_module.lookupSymbol<mlir::func::FuncOp>(
std::string("__nvqpp__mlirgen__") + kernelName);
std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName);

// Create a new Module to clone the function into
auto location = mlir::FileLineColLoc::get(&context, "<builder>", 1, 1);
Expand Down Expand Up @@ -402,10 +417,26 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
throw std::runtime_error("Remote rest platform Quake lowering failed.");
};

if (updatedArgs) {
cudaq::info("Run Quake Synth.\n");
if (!rawArgs.empty() || updatedArgs) {
mlir::PassManager pm(&context);
pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
if (!rawArgs.empty()) {
cudaq::info("Run Argument Synth.\n");
opt::ArgumentConverter argCon(kernelName, moduleOp);
argCon.gen(rawArgs);
std::string kernName = cudaq::runtime::cudaqGenPrefixName + kernelName;
mlir::StringRef sr{kernName};
mlir::SmallVector<mlir::StringRef> kernels = {sr};
std::string substBuff;
llvm::raw_string_ostream ss(substBuff);
ss << argCon.getSubstitutionModule();
mlir::StringRef su{substBuff};
mlir::SmallVector<mlir::StringRef> substs = {su};
pm.addNestedPass<mlir::func::FuncOp>(
opt::createArgumentSynthesisPass(kernels, substs));
} else if (updatedArgs) {
cudaq::info("Run Quake Synth.\n");
pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
}
pm.addPass(mlir::createCanonicalizerPass());
if (disableMLIRthreading || enablePrintMLIREachPass)
moduleOp.getContext()->disableMultithreading();
Expand All @@ -418,7 +449,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
runPassPipeline(passPipelineConfig, moduleOp);

auto entryPointFunc = moduleOp.lookupSymbol<mlir::func::FuncOp>(
std::string("__nvqpp__mlirgen__") + kernelName);
std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName);
std::vector<std::size_t> mapping_reorder_idx;
if (auto mappingAttr = dyn_cast_if_present<mlir::ArrayAttr>(
entryPointFunc->getAttr("mapping_reorder_idx"))) {
Expand Down Expand Up @@ -448,7 +479,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {

// Get the ansatz
auto ansatz = moduleOp.lookupSymbol<mlir::func::FuncOp>(
std::string("__nvqpp__mlirgen__") + kernelName);
std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName);

// Create a new Module to clone the ansatz into it
auto tmpModuleOp = builder.create<mlir::ModuleOp>();
Expand Down Expand Up @@ -513,6 +544,21 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
return codes;
}

/// Launch a kernel via the streamlined raw-argument protocol: lower the
/// Quake code (running argument synthesis over \p rawArgs) and hand the
/// resulting codes to the common completion path.
void launchKernel(const std::string &kernelName,
                  const std::vector<void *> &rawArgs) override {
  cudaq::info("launching remote rest kernel ({})", kernelName);

  // TODO future iterations of this should support non-void return types.
  if (!executionContext)
    throw std::runtime_error(
        "Remote rest execution can only be performed via cudaq::sample(), "
        "cudaq::observe(), or cudaq::draw().");

  // Get the Quake code, lowered according to config file, and finish the
  // launch through the shared completion routine.
  completeLaunchKernel(kernelName, lowerQuakeCode(kernelName, rawArgs));
}

/// @brief Launch the kernel. Extract the Quake code and lower to
/// the representation required by the targeted backend. Handle all pertinent
/// modifications for the execution context as well as asynchronous or
Expand All @@ -530,6 +576,11 @@ class BaseRemoteRESTQPU : public cudaq::QPU {

// Get the Quake code, lowered according to config file.
auto codes = lowerQuakeCode(kernelName, args);
completeLaunchKernel(kernelName, std::move(codes));
}

void completeLaunchKernel(const std::string &kernelName,
std::vector<cudaq::KernelExecution> &&codes) {

// After performing lowerQuakeCode, check to see if we are simply drawing
// the circuit. If so, perform the trace here and then return.
Expand Down
5 changes: 5 additions & 0 deletions runtime/common/BaseRemoteSimulatorQPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,11 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU {
throw std::runtime_error("Failed to launch VQE. Error: " + errorMsg);
}

/// Streamlined raw-argument launch is not supported by this remote
/// simulator QPU type.
void launchKernel(const std::string &, const std::vector<void *> &) override {
  throw std::runtime_error("launch kernel on raw args not implemented");
}

void launchKernel(const std::string &name, void (*kernelFunc)(void *),
void *args, std::uint64_t voidStarSize,
std::uint64_t resultOffset) override {
Expand Down
5 changes: 5 additions & 0 deletions runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ class DefaultQPU : public cudaq::QPU {
kernelFunc(args);
}

/// Library-mode execution never dispatches through the raw-argument entry
/// point; landing here indicates a launcher selection error.
void launchKernel(const std::string &, const std::vector<void *> &) override {
  throw std::runtime_error("Wrong kernel launch point.");
}

/// Overrides setExecutionContext to forward it to the ExecutionManager
void setExecutionContext(cudaq::ExecutionContext *context) override {
ScopedTraceWithContext("DefaultPlatform::setExecutionContext",
Expand Down
5 changes: 5 additions & 0 deletions runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ class GPUEmulatedQPU : public cudaq::QPU {
kernelFunc(args);
}

/// Raw-argument kernel launch is not available on the GPU-emulated QPU.
void launchKernel(const std::string &, const std::vector<void *> &) override {
  throw std::runtime_error("not implemented");
}

/// Overrides setExecutionContext to forward it to the ExecutionManager
void setExecutionContext(cudaq::ExecutionContext *context) override {
cudaSetDevice(qpu_id);
Expand Down
6 changes: 5 additions & 1 deletion runtime/cudaq/platform/orca/OrcaQPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,10 @@ class OrcaRemoteRESTQPU : public cudaq::QPU {
void launchKernel(const std::string &kernelName, void (*kernelFunc)(void *),
void *args, std::uint64_t voidStarSize,
std::uint64_t resultOffset) override;
/// Streamlined raw-argument launch is not supported by the ORCA REST QPU.
void launchKernel(const std::string &, const std::vector<void *> &) override {
  throw std::runtime_error("launch kernel on raw args not implemented");
}
};

/// @brief This setTargetBackend override is in charge of reading the
Expand Down Expand Up @@ -321,4 +325,4 @@ cudaq::RestHeaders OrcaRemoteRESTQPU::getHeaders() {

} // namespace

CUDAQ_REGISTER_TYPE(cudaq::QPU, OrcaRemoteRESTQPU, orca)
CUDAQ_REGISTER_TYPE(cudaq::QPU, OrcaRemoteRESTQPU, orca)
2 changes: 2 additions & 0 deletions runtime/cudaq/platform/qpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ class QPU : public registry::RegisteredType<QPU> {
/// as a struct-packed void pointer and its corresponding size.
virtual void launchKernel(const std::string &name, void (*kernelFunc)(void *),
void *args, std::uint64_t, std::uint64_t) = 0;
virtual void launchKernel(const std::string &name,
const std::vector<void *> &rawArgs) = 0;

/// Launch serialized code for remote execution. Subtypes that support this
/// should override this function.
Expand Down
22 changes: 22 additions & 0 deletions runtime/cudaq/platform/quantum_platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,19 @@ void quantum_platform::launchKernel(std::string kernelName,
qpu->launchKernel(kernelName, kernelFunc, args, voidStarSize, resultOffset);
}

/// Launch \p kernelName through the raw-argument protocol, routing to the
/// QPU bound to the calling thread (QPU 0 when no binding exists).
void quantum_platform::launchKernel(std::string kernelName,
                                    const std::vector<void *> &rawArgs) {
  const auto threadKey =
      std::hash<std::thread::id>{}(std::this_thread::get_id());
  std::size_t qpuId = 0;
  if (const auto entry = threadToQpuId.find(threadKey);
      entry != threadToQpuId.end())
    qpuId = entry->second;

  platformQPUs[qpuId]->launchKernel(kernelName, rawArgs);
}

void quantum_platform::launchSerializedCodeExecution(
const std::string &name,
cudaq::SerializedCodeExecutionContext &serializeCodeExecutionObject) {
Expand Down Expand Up @@ -201,3 +214,12 @@ void cudaq::altLaunchKernel(const char *kernelName, void (*kernelFunc)(void *),
platform.launchKernel(kernName, kernelFunc, kernelArgs, argsSize,
resultOffset);
}

/// C-linkage entry point for the streamlined (raw-argument) launch protocol.
/// Records a trace event with the argument count, then forwards the launch
/// to the active quantum platform.
void cudaq::streamlinedLaunchKernel(const char *kernelName,
                                    const std::vector<void *> &rawArgs) {
  const std::size_t numArgs = rawArgs.size();
  ScopedTraceWithContext("streamlinedLaunchKernel", kernelName, numArgs);
  auto &platform = *cudaq::getQuantumPlatformInternal();
  platform.launchKernel(std::string{kernelName}, rawArgs);
}
13 changes: 13 additions & 0 deletions runtime/cudaq/platform/quantum_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ class quantum_platform {
void launchKernel(std::string kernelName, void (*kernelFunc)(void *),
void *args, std::uint64_t voidStarSize,
std::uint64_t resultOffset);
void launchKernel(std::string kernelName, const std::vector<void *> &);

// This method is the hook for executing SerializedCodeExecutionContext
// objects.
Expand Down Expand Up @@ -212,8 +213,20 @@ class quantum_platform {
/// tied to the quantum platform instance somehow. Note that the compiler cannot
/// provide that information.
extern "C" {
// Client-server (legacy) interface.
void altLaunchKernel(const char *kernelName, void (*kernel)(void *), void *args,
std::uint64_t argsSize, std::uint64_t resultOffset);
// Streamlined interface for launching kernels. Argument synthesis and JIT
// compilation *must* happen on the local machine.
void streamlinedLaunchKernel(const char *kernelName,
const std::vector<void *> &rawArgs);
// Hybrid of the client-server and streamlined approaches. Letting JIT
// compilation happen either early or late and can handle return values from
// each kernel launch.
void hybridLaunchKernel(const char *kernelName, void (*kernel)(void *),
void *args, std::uint64_t argsSize,
std::uint64_t resultOffset,
const std::vector<void *> &rawArgs);
}

} // namespace cudaq
Expand Down
1 change: 1 addition & 0 deletions targettests/execution/test-6.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

// REQUIRES: c++20
// RUN: nvq++ --target quantinuum --emulate %s -o %t && %t | FileCheck %s
// RUN: nvq++ -fkernel-exec-kind=2 --target quantinuum --emulate %s -o %t && %t | FileCheck %s

#include <cudaq.h>
#include <iostream>
Expand Down
6 changes: 2 additions & 4 deletions targettests/execution/to_integer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/

// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t
// RUN: if [ $(echo %cpp_std | cut -c4- ) -ge 20 ]; then \
// RUN: nvq++ --enable-mlir %s -o %t && %t; \
// RUN: fi
// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t
// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 --enable-mlir %s -o %t && %t

#include <cudaq.h>

Expand Down
6 changes: 5 additions & 1 deletion tools/nvqpp/nvq++.in
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ function f_option_handling {
-flower-to-cfg)
ENABLE_LOWER_TO_CFG=true
;;
-fkernel-exec-kind=*)
KERNEL_EXECUTION_KIND="{codegen=${1#*=}}"
;;
*)
# Pass any unrecognized options on to the clang++ tool.
ARGS="${ARGS} $1"
Expand Down Expand Up @@ -325,6 +328,7 @@ SHOW_VERSION=false
ENABLE_UNWIND_LOWERING=true
ENABLE_DEVICE_CODE_LOADERS=true
ENABLE_KERNEL_EXECUTION=true
KERNEL_EXECUTION_KIND=
ENABLE_AGGRESSIVE_EARLY_INLINE=true
ENABLE_LOWER_TO_CFG=true
ENABLE_APPLY_SPECIALIZATION=true
Expand Down Expand Up @@ -680,7 +684,7 @@ if ${ENABLE_APPLY_SPECIALIZATION}; then
fi
if ${ENABLE_KERNEL_EXECUTION}; then
RUN_OPT=true
OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "kernel-execution")
OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "kernel-execution${KERNEL_EXECUTION_KIND}")
fi
if ${ENABLE_AGGRESSIVE_EARLY_INLINE}; then
RUN_OPT=true
Expand Down

0 comments on commit 78f0715

Please sign in to comment.