diff --git a/include/cudaq/Optimizer/Builder/Runtime.h b/include/cudaq/Optimizer/Builder/Runtime.h index bf81843fd9..c25a5cd2ee 100644 --- a/include/cudaq/Optimizer/Builder/Runtime.h +++ b/include/cudaq/Optimizer/Builder/Runtime.h @@ -23,7 +23,8 @@ static constexpr unsigned cudaqGenPrefixLength = sizeof(cudaqGenPrefixName) - 1; /// compile time (see `cudaqGenPrefixName`) or it can be rewritten to call back /// to the runtime library (and be handled at runtime). static constexpr const char launchKernelFuncName[] = "altLaunchKernel"; -static constexpr const char launchKernelVersion2FuncName[] = - "altLaunchKernelUsingLocalJIT"; +static constexpr const char launchKernelStreamlinedFuncName[] = + "streamlinedLaunchKernel"; +static constexpr const char launchKernelHybridFuncName[] = "hybridLaunchKernel"; } // namespace cudaq::runtime diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td index e8e5a79b0d..8fa3eb8bd0 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.td +++ b/include/cudaq/Optimizer/Transforms/Passes.td @@ -302,6 +302,27 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> { use of library side argument conversion and the argument synthesis pass. More generally, this option can be used when JIT compiling kernels on the client/host/local processor. + + There are multiple code generation kinds that are supported for flexibility + and streamlining the kernel launch process. These tend to be related to the + target and runtime environment the compiler is being run in and can involve + some technical issues that require deeper understanding of the entire + process. In general, it is not recommended for users to change this value. + + ``` + codegen kind description + + 0 Hybrid. A combination of 1 and 2 that allows early and + streamlined JIT compilation but also supports return values + and dynamic parameters. + 1 Client-server interchange format. 
Supports kernels that + return results and dynamic parameters. + 2 Streamlined for JIT. The kernel will be converted to a + nullary function with no results. Return values from the + kernel are ignored, if present. All parameter values are to + be inlined by the JIT compiler, so this codegen kind does not + support any dynamic parameters. + ``` }]; let dependentDialects = ["cudaq::cc::CCDialect", "mlir::LLVM::LLVMDialect"]; @@ -311,8 +332,8 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> { /*default=*/"\"-\"", "Name of output file.">, Option<"startingArgIdx", "starting-arg-idx", "std::size_t", /*default=*/"0", "The starting argument index for the argsCreator.">, - Option<"altLaunchVersion", "alt-launch", "std::size_t", /*default=*/"1", - "Specify the version of altLaunchKernel to be used."> + Option<"codegenKind", "codegen", "std::size_t", /*default=*/"1", + "Set the kind of code to generate for the launches."> ]; } diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index 12030de199..5daceec94b 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -293,25 +293,33 @@ static constexpr IntrinsicCode intrinsicTable[] = { return %3 : !cc.struct<{!cc.ptr, i64}> })#"}, - {cudaq::runtime::launchKernelFuncName, // altLaunchKernel + // altLaunchKernel(kernelName, thunk, commBuffer, buffSize, resultOffset) + {cudaq::runtime::launchKernelFuncName, {}, R"#( func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> ())#"}, - {cudaq::runtime:: - launchKernelVersion2FuncName, // altLaunchKernelUsingLocalJIT + {"free", {}, "func.func private @free(!cc.ptr) -> ()"}, + + // hybridLaunchKernel(kernelName, thunk, commBuffer, buffSize, + // resultOffset, vectorArgPtrs) + {cudaq::runtime::launchKernelHybridFuncName, {}, R"#( - func.func private @altLaunchKernelUsingLocalJIT(!cc.ptr, !cc.ptr, !cc.ptr) -> ())#"}, - - {"free", {}, "func.func private @free(!cc.ptr) -> 
()"}, + func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> ())#"}, {cudaq::llvmMemCopyIntrinsic, // llvm.memcpy.p0i8.p0i8.i64 {}, R"#( func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) -> ())#"}, - {"malloc", {}, "func.func private @malloc(i64) -> !cc.ptr"}}; + {"malloc", {}, "func.func private @malloc(i64) -> !cc.ptr"}, + + // streamlinedLaunchKernel(kernelName, vectorArgPtrs) + {cudaq::runtime::launchKernelStreamlinedFuncName, + {}, + R"#( + func.func private @streamlinedLaunchKernel(!cc.ptr, !cc.ptr) -> ())#"}}; static constexpr std::size_t intrinsicTableSize = sizeof(intrinsicTable) / sizeof(IntrinsicCode); diff --git a/lib/Optimizer/CodeGen/CCToLLVM.cpp b/lib/Optimizer/CodeGen/CCToLLVM.cpp index aa025656b8..01596ae760 100644 --- a/lib/Optimizer/CodeGen/CCToLLVM.cpp +++ b/lib/Optimizer/CodeGen/CCToLLVM.cpp @@ -495,6 +495,33 @@ class SizeOfOpPattern : public ConvertOpToLLVMPattern { } }; +class OffsetOfOpPattern : public ConvertOpToLLVMPattern { +public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + // Use the GEP approach for now. LLVM is planning to remove support for this + // at some point. See: https://github.com/llvm/llvm-project/issues/71507 + LogicalResult + matchAndRewrite(cudaq::cc::OffsetOfOp offsetOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto inputTy = offsetOp.getInputType(); + SmallVector args; + for (std::int32_t i : offsetOp.getConstantIndices()) + args.push_back(i); + auto resultTy = offsetOp.getType(); + auto loc = offsetOp.getLoc(); + // TODO: replace this with some target-specific memory layout computation + // when we upgrade to a newer MLIR. 
+ auto zero = rewriter.create(loc, 0, 64); + auto ptrTy = cudaq::cc::PointerType::get(inputTy); + auto nul = rewriter.create(loc, ptrTy, zero); + Value nextPtr = + rewriter.create(loc, ptrTy, nul, args); + rewriter.replaceOpWithNewOp(offsetOp, resultTy, nextPtr); + return success(); + } +}; + class StdvecDataOpPattern : public ConvertOpToLLVMPattern { public: @@ -647,7 +674,8 @@ void cudaq::opt::populateCCToLLVMPatterns(LLVMTypeConverter &typeConverter, ComputePtrOpPattern, CreateStringLiteralOpPattern, ExtractValueOpPattern, FuncToPtrOpPattern, GlobalOpPattern, InsertValueOpPattern, InstantiateCallableOpPattern, - LoadOpPattern, PoisonOpPattern, SizeOfOpPattern, - StdvecDataOpPattern, StdvecInitOpPattern, StdvecSizeOpPattern, - StoreOpPattern, UndefOpPattern>(typeConverter); + LoadOpPattern, OffsetOfOpPattern, PoisonOpPattern, + SizeOfOpPattern, StdvecDataOpPattern, StdvecInitOpPattern, + StdvecSizeOpPattern, StoreOpPattern, UndefOpPattern>( + typeConverter); } diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 78176e5387..7927a1995d 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -34,19 +34,29 @@ namespace cudaq::opt { using namespace mlir; -namespace { // Define some constant function name strings. -static constexpr const char cudaqRegisterLambdaName[] = +static constexpr const char CudaqRegisterLambdaName[] = "cudaqRegisterLambdaName"; -static constexpr const char cudaqRegisterArgsCreator[] = +static constexpr const char CudaqRegisterArgsCreator[] = "cudaqRegisterArgsCreator"; -static constexpr const char cudaqRegisterKernelName[] = +static constexpr const char CudaqRegisterKernelName[] = "cudaqRegisterKernelName"; /// This value is used to indicate that a kernel does not return a result. static constexpr std::uint64_t NoResultOffset = std::numeric_limits::max(); +/// Generate code for packing arguments as raw data. 
+static bool isCodegenPackedData(std::size_t kind) { + return kind == 0 || kind == 1; +} + +/// Generate code that gathers the arguments for conversion and synthesis. +static bool isCodegenArgumentGather(std::size_t kind) { + return kind == 0 || kind == 2; +} + +namespace { class GenerateKernelExecution : public cudaq::opt::impl::GenerateKernelExecutionBase< GenerateKernelExecution> { @@ -277,18 +287,13 @@ class GenerateKernelExecution Value genComputeReturnOffset(Location loc, OpBuilder &builder, FunctionType funcTy, - cudaq::cc::StructType msgStructTy, - Value nullSt) { - auto i64Ty = builder.getI64Type(); + cudaq::cc::StructType msgStructTy) { if (funcTy.getNumResults() == 0) return builder.create(loc, NoResultOffset, 64); - auto members = msgStructTy.getMembers(); std::int32_t numKernelArgs = funcTy.getNumInputs(); - auto resTy = cudaq::cc::PointerType::get(members[numKernelArgs]); - auto gep = builder.create( - loc, resTy, nullSt, - SmallVector{numKernelArgs}); - return builder.create(loc, i64Ty, gep); + auto i64Ty = builder.getI64Type(); + return builder.create( + loc, i64Ty, msgStructTy, ArrayRef{numKernelArgs}); } /// Create a function that determines the return value offset in the message @@ -305,11 +310,8 @@ class GenerateKernelExecution OpBuilder::InsertionGuard guard(builder); auto *entry = returnOffsetFunc.addEntryBlock(); builder.setInsertionPointToStart(entry); - auto ptrTy = cudaq::cc::PointerType::get(msgStructTy); - auto zero = builder.create(loc, 0, 64); - auto basePtr = builder.create(loc, ptrTy, zero); auto result = - genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy, basePtr); + genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy); builder.create(loc, result); } @@ -1116,227 +1118,309 @@ class GenerateKernelExecution /// library. Pass along the thunk, so the runtime can call the quantum /// circuit. These entry points are `operator()` member functions in a class, /// so account for the `this` argument here. 
- void genNewHostEntryPoint1(Location loc, OpBuilder &builder, - FunctionType funcTy, - cudaq::cc::StructType structTy, - LLVM::GlobalOp kernelNameObj, func::FuncOp thunk, - func::FuncOp rewriteEntry, bool addThisPtr) { + void genNewHostEntryPoint(Location loc, OpBuilder &builder, + FunctionType devFuncTy, + LLVM::GlobalOp kernelNameObj, func::FuncOp hostFunc, + bool addThisPtr, cudaq::cc::StructType structTy, + func::FuncOp thunkFunc) { auto *ctx = builder.getContext(); auto i64Ty = builder.getI64Type(); - auto offset = funcTy.getNumInputs(); + auto offset = devFuncTy.getNumInputs(); auto thunkTy = getThunkType(ctx); auto structPtrTy = cudaq::cc::PointerType::get(structTy); - Block *rewriteEntryBlock = rewriteEntry.addEntryBlock(); + Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); + const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(devFuncTy); OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(rewriteEntryBlock); - Value stVal = builder.create(loc, structTy); + builder.setInsertionPointToStart(hostFuncEntryBlock); + auto i8Ty = builder.getI8Type(); + auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - // Process all the arguments for the original call, ignoring any hidden - // arguments (such as the `this` pointer). - auto zero = builder.create(loc, 0, 64); - Value extraBytes = zero; - bool hasTrailingData = false; - SmallVector blockArgs{dropAnyHiddenArguments( - rewriteEntryBlock->getArguments(), funcTy, addThisPtr)}; - std::int32_t idx = 0; - SmallVector blockValues(blockArgs.size()); - std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); - for (auto iter = blockArgs.begin(), end = blockArgs.end(); iter != end; - ++iter, ++idx) { - Value arg = *iter; - Type inTy = arg.getType(); - Type quakeTy = funcTy.getInput(idx); - // If the argument is a callable, skip it. - if (isa(quakeTy)) - continue; - // If the argument is an empty struct, skip it. 
- if (auto strTy = dyn_cast(quakeTy)) - if (strTy.isEmpty()) + Value temp; + Value castTemp; + Value resultOffset; + Value castLoadThunk; + Value extendedStructSize; + if (isCodegenPackedData(codegenKind)) { + Value stVal = builder.create(loc, structTy); + + // Process all the arguments for the original call, ignoring any hidden + // arguments (such as the `this` pointer). + auto zero = builder.create(loc, 0, 64); + Value extraBytes = zero; + bool hasTrailingData = false; + SmallVector blockArgs{dropAnyHiddenArguments( + hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; + std::int32_t idx = 0; + SmallVector blockValues(blockArgs.size()); + std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); + for (auto iter = blockArgs.begin(), end = blockArgs.end(); iter != end; + ++iter, ++idx) { + Value arg = *iter; + Type inTy = arg.getType(); + Type quakeTy = devFuncTy.getInput(idx); + // If the argument is a callable, skip it. + if (isa(quakeTy)) continue; + // If the argument is an empty struct, skip it. + if (auto strTy = dyn_cast(quakeTy)) + if (strTy.isEmpty()) + continue; - if (auto stdvecTy = dyn_cast(quakeTy)) { - // Per the CUDA-Q spec, an entry point kernel must take a `[const] - // std::vector` value argument. - // Should the spec stipulate that pure device kernels must pass by - // read-only reference, i.e., take `const std::vector &` arguments? - auto ptrInTy = cast(inTy); - // If this is a std::vector, unpack it. - if (stdvecTy.getElementType() == builder.getI1Type()) { - // Create a mock vector of i8 and populate the bools, 1 per char. - Value temp = builder.create( - loc, ptrInTy.getElementType()); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ArrayRef{temp, arg}); - arg = blockValues[idx] = temp; + if (auto stdvecTy = dyn_cast(quakeTy)) { + // Per the CUDA-Q spec, an entry point kernel must take a `[const] + // std::vector` value argument. 
+ // Should the spec stipulate that pure device kernels must pass by + // read-only reference, i.e., take `const std::vector &` arguments? + auto ptrInTy = cast(inTy); + // If this is a std::vector, unpack it. + if (stdvecTy.getElementType() == builder.getI1Type()) { + // Create a mock vector of i8 and populate the bools, 1 per char. + Value tmp = builder.create( + loc, ptrInTy.getElementType()); + builder.create(loc, std::nullopt, + cudaq::stdvecBoolUnpackToInitList, + ArrayRef{tmp, arg}); + arg = blockValues[idx] = tmp; + } + // FIXME: call the `size` member function. For expediency, assume this + // is an std::vector and the size is the scaled delta between the + // first two pointers. Use the unscaled size for now. + auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( + loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); + stVal = p1; + extraBytes = p2; + hasTrailingData = true; + continue; } - // FIXME: call the `size` member function. For expediency, assume this - // is an std::vector and the size is the scaled delta between the - // first two pointers. Use the unscaled size for now. - auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( - loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); - stVal = p1; - extraBytes = p2; - hasTrailingData = true; - continue; - } - if (auto strTy = dyn_cast(quakeTy)) { - if (!isa(arg.getType())) { - // If argument is not a pointer, then struct was promoted into a - // register. 
- auto *parent = builder.getBlock()->getParentOp(); - auto module = parent->getParentOfType(); - auto tmp = builder.create(loc, quakeTy); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arg.getType()), tmp); - if (cudaq::opt::factory::isX86_64(module)) { - builder.create(loc, arg, cast); - if (cudaq::opt::factory::structUsesTwoArguments(quakeTy)) { - auto arrTy = cudaq::cc::ArrayType::get(builder.getI8Type()); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arrTy), tmp); - auto hiPtr = builder.create( - loc, cudaq::cc::PointerType::get(builder.getI8Type()), cast, - cudaq::cc::ComputePtrArg{8}); - ++iter; - Value nextArg = *iter; - auto cast2 = builder.create( - loc, cudaq::cc::PointerType::get(nextArg.getType()), hiPtr); - builder.create(loc, nextArg, cast2); + if (auto strTy = dyn_cast(quakeTy)) { + if (!isa(arg.getType())) { + // If argument is not a pointer, then struct was promoted into a + // register. + auto *parent = builder.getBlock()->getParentOp(); + auto module = parent->getParentOfType(); + auto tmp = builder.create(loc, quakeTy); + auto cast = builder.create( + loc, cudaq::cc::PointerType::get(arg.getType()), tmp); + if (cudaq::opt::factory::isX86_64(module)) { + builder.create(loc, arg, cast); + if (cudaq::opt::factory::structUsesTwoArguments(quakeTy)) { + auto arrTy = cudaq::cc::ArrayType::get(builder.getI8Type()); + auto cast = builder.create( + loc, cudaq::cc::PointerType::get(arrTy), tmp); + auto hiPtr = builder.create( + loc, cudaq::cc::PointerType::get(builder.getI8Type()), cast, + cudaq::cc::ComputePtrArg{8}); + ++iter; + Value nextArg = *iter; + auto cast2 = builder.create( + loc, cudaq::cc::PointerType::get(nextArg.getType()), hiPtr); + builder.create(loc, nextArg, cast2); + } + } else { + builder.create(loc, arg, cast); } - } else { - builder.create(loc, arg, cast); + // Load the assembled (sub-)struct and insert into the buffer value. 
+ Value v = builder.create(loc, tmp); + stVal = builder.create( + loc, stVal.getType(), stVal, v, idx); + continue; } - // Load the assembled (sub-)struct and insert into the buffer value. - Value v = builder.create(loc, tmp); - stVal = builder.create(loc, stVal.getType(), - stVal, v, idx); + if (!cudaq::cc::isDynamicType(strTy)) { + // struct is static size, so just load the value (byval ptr). + Value v = builder.create(loc, arg); + stVal = builder.create( + loc, stVal.getType(), stVal, v, idx); + continue; + } + auto genTy = cast( + cudaq::opt::factory::genArgumentBufferType(strTy)); + Value zero = builder.create(loc, 0, 64); + auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( + loc, builder, strTy, arg, zero, genTy); + stVal = builder.create( + loc, stVal.getType(), stVal, quakeVal, idx); + extraBytes = + builder.create(loc, extraBytes, recursiveSize); + hasTrailingData = true; continue; } - if (!cudaq::cc::isDynamicType(strTy)) { - // struct is static size, so just load the value (byval ptr). - Value v = builder.create(loc, arg); - stVal = builder.create(loc, stVal.getType(), - stVal, v, idx); + if (auto ptrTy = dyn_cast(inTy)) { + if (isa(ptrTy.getElementType())) { + // Special case: if the argument is a `cudaq::state*`, then just + // pass the pointer. We can do that in this case because the + // synthesis step (which will receive the argument data) is assumed + // to run in the same memory space. 
+ Value argPtr = builder.create(loc, inTy, arg); + stVal = builder.create( + loc, stVal.getType(), stVal, argPtr, idx); + } continue; } - auto genTy = cast( - cudaq::opt::factory::genArgumentBufferType(strTy)); - Value zero = builder.create(loc, 0, 64); - auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( - loc, builder, strTy, arg, zero, genTy); + stVal = builder.create(loc, stVal.getType(), - stVal, quakeVal, idx); - extraBytes = - builder.create(loc, extraBytes, recursiveSize); - hasTrailingData = true; - continue; + stVal, arg, idx); } - if (auto ptrTy = dyn_cast(inTy)) { - if (isa(ptrTy.getElementType())) { - // Special case: if the argument is a `cudaq::state*`, then just pass - // the pointer. We can do that in this case because the synthesis step - // (which will receive the argument data) is assumed to run in the - // same memory space. - Value argPtr = builder.create(loc, inTy, arg); - stVal = builder.create(loc, stVal.getType(), - stVal, argPtr, idx); + + // Compute the struct size without the trailing bytes, structSize, and + // with the trailing bytes, extendedStructSize. + Value structSize = + builder.create(loc, i64Ty, structTy); + extendedStructSize = + builder.create(loc, structSize, extraBytes); + + // Allocate our struct to save the argument to. + auto buff = + builder.create(loc, i8Ty, extendedStructSize); + + temp = builder.create(loc, structPtrTy, buff); + + // Store the arguments to the argument section. + builder.create(loc, stVal, temp); + + auto structPtrArrTy = + cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(structTy)); + temp = builder.create(loc, structPtrArrTy, buff); + + // Append the vector data to the end of the struct. + if (hasTrailingData) { + Value vecToBuffer = builder.create( + loc, ptrI8Ty, buff, SmallVector{structSize}); + // Ignore any hidden `this` argument. 
+ for (auto inp : llvm::enumerate(blockValues)) { + Value arg = inp.value(); + Type inTy = arg.getType(); + std::int32_t idx = inp.index(); + Type quakeTy = devFuncTy.getInput(idx); + if (auto stdvecTy = dyn_cast(quakeTy)) { + auto bytes = builder.create( + loc, builder.getI64Type(), stVal, idx); + assert(stdvecTy == devFuncTy.getInput(idx)); + auto ptrInTy = cast(inTy); + vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, + vecToBuffer, ptrInTy); + if (stdvecTy.getElementType() == builder.getI1Type()) { + auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type()); + auto heapPtr = builder.create( + loc, cudaq::cc::PointerType::get(ptrI1Ty), arg, + ArrayRef{0}); + auto loadHeapPtr = + builder.create(loc, heapPtr); + Value heapCast = builder.create( + loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr); + builder.create(loc, std::nullopt, "free", + ArrayRef{heapCast}); + } + } else if (auto strTy = dyn_cast(quakeTy)) { + if (cudaq::cc::isDynamicType(strTy)) + vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg, + temp, vecToBuffer); + } } - continue; } - - stVal = builder.create(loc, stVal.getType(), - stVal, arg, idx); + Value loadThunk = + builder.create(loc, thunkTy, thunkFunc.getName()); + castLoadThunk = + builder.create(loc, ptrI8Ty, loadThunk); + castTemp = builder.create(loc, ptrI8Ty, temp); + resultOffset = genComputeReturnOffset(loc, builder, devFuncTy, structTy); } - // Compute the struct size without the trailing bytes, structSize, and with - // the trailing bytes, extendedStructSize. - auto nullSt = builder.create(loc, structPtrTy, zero); - Value structSize = - builder.create(loc, i64Ty, structTy); - Value extendedStructSize = - builder.create(loc, structSize, extraBytes); - - // Allocate our struct to save the argument to. 
- auto i8Ty = builder.getI8Type(); - auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - auto buff = - builder.create(loc, i8Ty, extendedStructSize); - - Value temp = builder.create(loc, structPtrTy, buff); - - // Store the arguments to the argument section. - builder.create(loc, stVal, temp); - - auto structPtrArrTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(structTy)); - temp = builder.create(loc, structPtrArrTy, buff); - - // Append the vector data to the end of the struct. - if (hasTrailingData) { - Value vecToBuffer = builder.create( - loc, ptrI8Ty, buff, SmallVector{structSize}); - // Ignore any hidden `this` argument. - for (auto inp : llvm::enumerate(blockValues)) { - Value arg = inp.value(); - Type inTy = arg.getType(); - std::int32_t idx = inp.index(); - Type quakeTy = funcTy.getInput(idx); - if (auto stdvecTy = dyn_cast(quakeTy)) { - auto bytes = builder.create( - loc, builder.getI64Type(), stVal, idx); - assert(stdvecTy == funcTy.getInput(idx)); - auto ptrInTy = cast(inTy); - vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, - vecToBuffer, ptrInTy); - if (stdvecTy.getElementType() == builder.getI1Type()) { - auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type()); - auto heapPtr = builder.create( - loc, cudaq::cc::PointerType::get(ptrI1Ty), arg, - ArrayRef{0}); - auto loadHeapPtr = builder.create(loc, heapPtr); - Value heapCast = builder.create( - loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{heapCast}); - } - } else if (auto strTy = dyn_cast(quakeTy)) { - if (cudaq::cc::isDynamicType(strTy)) - vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg, - temp, vecToBuffer); + Value vecArgPtrs; + if (isCodegenArgumentGather(codegenKind)) { + // 1) Allocate and initialize a std::vector object. 
+ const unsigned count = + cudaq::cc::numberOfHiddenArgs(addThisPtr, hiddenSRet); + auto stdVec = builder.create( + loc, cudaq::opt::factory::stlVectorType(ptrI8Ty)); + auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, ptrI8Ty, count); + Value buffer = builder.create(loc, arrPtrTy); + auto i64Ty = builder.getI64Type(); + auto buffSize = builder.create(loc, i64Ty, arrPtrTy); + auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); + auto cast1 = builder.create(loc, ptrPtrTy, buffer); + auto ptr3Ty = cudaq::cc::PointerType::get(ptrPtrTy); + auto stdVec0 = builder.create(loc, ptr3Ty, stdVec); + builder.create(loc, cast1, stdVec0); + auto cast2 = builder.create(loc, i64Ty, buffer); + auto endBuff = builder.create(loc, cast2, buffSize); + auto cast3 = builder.create(loc, ptrPtrTy, endBuff); + auto stdVec1 = builder.create( + loc, ptr3Ty, stdVec, ArrayRef{1}); + builder.create(loc, cast3, stdVec1); + auto stdVec2 = builder.create( + loc, ptr3Ty, stdVec, ArrayRef{2}); + builder.create(loc, cast3, stdVec2); + + // 2) Iterate over the arguments passed in and populate the vector. + SmallVector blockArgs{dropAnyHiddenArguments( + hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; + for (auto iter : llvm::enumerate(blockArgs)) { + std::int32_t i = iter.index(); + auto pos = builder.create( + loc, ptrPtrTy, buffer, ArrayRef{i}); + auto blkArg = iter.value(); + if (isa(blkArg.getType())) { + auto castArg = + builder.create(loc, ptrI8Ty, blkArg); + builder.create(loc, castArg, pos); + continue; } + auto temp = builder.create(loc, blkArg.getType()); + builder.create(loc, blkArg, temp); + auto castTemp = builder.create(loc, ptrI8Ty, temp); + builder.create(loc, castTemp, pos); } + vecArgPtrs = builder.create(loc, ptrI8Ty, stdVec); } // Prepare to call the `launchKernel` runtime library entry point. 
Value loadKernName = builder.create( loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), kernelNameObj.getSymName()); - Value loadThunk = - builder.create(loc, thunkTy, thunk.getName()); auto castLoadKernName = builder.create(loc, ptrI8Ty, loadKernName); - auto castLoadThunk = - builder.create(loc, ptrI8Ty, loadThunk); - auto castTemp = builder.create(loc, ptrI8Ty, temp); - - auto resultOffset = - genComputeReturnOffset(loc, builder, funcTy, structTy, nullSt); // Generate the call to `launchKernel`. - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelFuncName, - ArrayRef{castLoadKernName, castLoadThunk, castTemp, - extendedStructSize, resultOffset}); - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy); + switch (codegenKind) { + case 0: { + assert(vecArgPtrs && "vector must be initialized"); + builder.create( + loc, std::nullopt, cudaq::runtime::launchKernelHybridFuncName, + ArrayRef{castLoadKernName, castLoadThunk, castTemp, + extendedStructSize, resultOffset, vecArgPtrs}); + } break; + case 1: { + builder.create( + loc, std::nullopt, cudaq::runtime::launchKernelFuncName, + ArrayRef{castLoadKernName, castLoadThunk, castTemp, + extendedStructSize, resultOffset}); + } break; + case 2: { + assert(vecArgPtrs && "vector must be initialized"); + builder.create( + loc, std::nullopt, cudaq::runtime::launchKernelStreamlinedFuncName, + ArrayRef{castLoadKernName, vecArgPtrs}); + // For this codegen kind, we drop any results on the floor and return + // random data in registers and/or off the stack. This maintains parity + // with any pre-existing kernel launchers. 
+ SmallVector garbage; + for (auto ty : hostFunc.getFunctionType().getResults()) + garbage.push_back(builder.create(loc, ty)); + builder.create(loc, garbage); + return; + } + default: + hostFunc.emitOpError("codegen kind is invalid"); + return; + } // If and only if this kernel returns a value, unpack and load the // result value(s) from the struct returned by `launchKernel` and return // them to our caller. SmallVector results; - const bool multiResult = funcTy.getResults().size() > 1; - for (auto res : llvm::enumerate(funcTy.getResults())) { + const bool multiResult = devFuncTy.getResults().size() > 1; + for (auto res : llvm::enumerate(devFuncTy.getResults())) { int off = res.index() + offset; if (auto vecTy = dyn_cast(res.value())) { auto eleTy = vecTy.getElementType(); @@ -1352,7 +1436,7 @@ class GenerateKernelExecution auto vecLen = builder.create(loc, gep1); if (vecTy.getElementType() == builder.getI1Type()) { genStdvecBoolFromInitList(loc, builder, - rewriteEntryBlock->getArguments().front(), + hostFuncEntryBlock->getArguments().front(), dataPtr, vecLen); } else { cudaq::IRBuilder irBuilder(builder); @@ -1362,7 +1446,7 @@ class GenerateKernelExecution return; } genStdvecTFromInitList(loc, builder, - rewriteEntryBlock->getArguments().front(), + hostFuncEntryBlock->getArguments().front(), dataPtr, tSize, vecLen); } offset++; @@ -1378,11 +1462,11 @@ class GenerateKernelExecution if (multiResult) return builder.create( loc, cudaq::cc::PointerType::get(res.value()), - rewriteEntryBlock->getArguments().front(), + hostFuncEntryBlock->getArguments().front(), SmallVector{off}); return builder.create( loc, cudaq::cc::PointerType::get(res.value()), - rewriteEntryBlock->getArguments().front()); + hostFuncEntryBlock->getArguments().front()); }(); builder.create(loc, loadVal, sretPtr); } else { @@ -1393,91 +1477,6 @@ class GenerateKernelExecution builder.create(loc, results); } - void genNewHostEntryPoint2(Location loc, OpBuilder &builder, - FunctionType devFuncTy, - 
LLVM::GlobalOp kernelNameObj, - func::FuncOp hostFunc, bool addThisPtr) { - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(devFuncTy); - const unsigned count = - cudaq::cc::numberOfHiddenArgs(addThisPtr, hiddenSRet); - auto *ctx = builder.getContext(); - auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type()); - - // 0) Pointer our builder into the entry block of the function. - Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); - - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(hostFuncEntryBlock); - - // 1) Allocate and initialize a std::vector object. - auto stdVec = builder.create( - loc, cudaq::opt::factory::stlVectorType(i8PtrTy)); - auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, i8PtrTy, count); - Value buffer = builder.create(loc, arrPtrTy); - auto i64Ty = builder.getI64Type(); - auto buffSize = builder.create(loc, i64Ty, arrPtrTy); - auto ptrPtrTy = cudaq::cc::PointerType::get(i8PtrTy); - auto cast1 = builder.create(loc, ptrPtrTy, buffer); - auto ptr3Ty = cudaq::cc::PointerType::get(ptrPtrTy); - auto stdVec0 = builder.create(loc, ptr3Ty, stdVec); - builder.create(loc, cast1, stdVec0); - auto cast2 = builder.create(loc, i64Ty, buffer); - auto endBuff = builder.create(loc, cast2, buffSize); - auto cast3 = builder.create(loc, ptrPtrTy, endBuff); - auto stdVec1 = builder.create( - loc, ptr3Ty, stdVec, ArrayRef{1}); - builder.create(loc, cast3, stdVec1); - auto stdVec2 = builder.create( - loc, ptr3Ty, stdVec, ArrayRef{2}); - builder.create(loc, cast3, stdVec2); - auto zero = builder.create(loc, 0, 64); - auto nullPtr = builder.create(loc, i8PtrTy, zero); - - // 2) Iterate over the arguments passed in and populate the vector. 
- SmallVector blockArgs{dropAnyHiddenArguments( - hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; - for (auto iter : llvm::enumerate(blockArgs)) { - std::int32_t i = iter.index(); - auto pos = builder.create( - loc, ptrPtrTy, buffer, ArrayRef{i}); - auto blkArg = iter.value(); - if (isa(blkArg.getType())) { - auto castArg = builder.create(loc, i8PtrTy, blkArg); - builder.create(loc, castArg, pos); - continue; - } - auto temp = builder.create(loc, blkArg.getType()); - builder.create(loc, blkArg, temp); - auto castTemp = builder.create(loc, i8PtrTy, temp); - builder.create(loc, castTemp, pos); - } - - auto resultBuffer = builder.create(loc, i8PtrTy); - builder.create(loc, nullPtr, resultBuffer); - auto castResultBuffer = - builder.create(loc, i8PtrTy, resultBuffer); - auto castStdvec = builder.create(loc, i8PtrTy, stdVec); - Value loadKernName = builder.create( - loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), - kernelNameObj.getSymName()); - auto castKernelNameObj = - builder.create(loc, i8PtrTy, loadKernName); - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelVersion2FuncName, - ArrayRef{castKernelNameObj, castStdvec, castResultBuffer}); - - // FIXME: Drop any results on the floor for now and return random data left - // on the stack. (Maintains parity with existing kernel launch.) - if (hostFunc.getFunctionType().getResults().empty()) { - builder.create(loc); - return; - } - // There can only be 1 return type in C++, so this is safe. - Value garbage = builder.create( - loc, hostFunc.getFunctionType().getResult(0)); - builder.create(loc, garbage); - } - /// A kernel function that takes a quantum type argument (also known as a pure /// device kernel) cannot be called directly from C++ (classical) code. It /// must be called via other quantum code. 
@@ -1491,6 +1490,88 @@ class GenerateKernelExecution return true; } + LLVM::LLVMFuncOp registerKernelForExecution(Location loc, OpBuilder &builder, + const std::string &classNameStr, + LLVM::GlobalOp kernelNameObj, + func::FuncOp argsCreatorFunc, + StringRef mangledName) { + auto module = getOperation(); + auto *ctx = builder.getContext(); + auto ptrType = cudaq::cc::PointerType::get(builder.getI8Type()); + auto initFun = builder.create( + loc, classNameStr + ".kernelRegFunc", + LLVM::LLVMFunctionType::get(cudaq::opt::factory::getVoidType(ctx), {})); + OpBuilder::InsertionGuard guard(builder); + auto *initFunEntry = initFun.addEntryBlock(); + builder.setInsertionPointToStart(initFunEntry); + auto kernRef = builder.create( + loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), + kernelNameObj.getSymName()); + auto castKernRef = builder.create(loc, ptrType, kernRef); + builder.create(loc, std::nullopt, CudaqRegisterKernelName, + ValueRange{castKernRef}); + + if (isCodegenPackedData(codegenKind)) { + // Register the argsCreator too + auto ptrPtrType = cudaq::cc::PointerType::get(ptrType); + auto argsCreatorFuncType = FunctionType::get( + ctx, {ptrPtrType, ptrPtrType}, {builder.getI64Type()}); + Value loadArgsCreator = builder.create( + loc, argsCreatorFuncType, argsCreatorFunc.getName()); + auto castLoadArgsCreator = + builder.create(loc, ptrType, loadArgsCreator); + builder.create( + loc, std::nullopt, CudaqRegisterArgsCreator, + ValueRange{castKernRef, castLoadArgsCreator}); + } + + // Check if this is a lambda mangled name + auto demangledPtr = abi::__cxa_demangle(mangledName.str().c_str(), nullptr, + nullptr, nullptr); + if (demangledPtr) { + std::string demangledName(demangledPtr); + demangledName = + std::regex_replace(demangledName, std::regex("::operator()(.*)"), ""); + if (demangledName.find("$_") != std::string::npos) { + auto insertPoint = builder.saveInsertionPoint(); + builder.setInsertionPointToStart(module.getBody()); + + // Create the 
function if it doesn't already exist. + if (!module.lookupSymbol(CudaqRegisterLambdaName)) + builder.create( + module.getLoc(), CudaqRegisterLambdaName, + LLVM::LLVMFunctionType::get( + cudaq::opt::factory::getVoidType(ctx), + {cudaq::opt::factory::getPointerType(ctx), + cudaq::opt::factory::getPointerType(ctx)})); + + // Create this global name, it is unique for any lambda + // bc classNameStr contains the parentFunc + varName + auto lambdaName = builder.create( + loc, + cudaq::opt::factory::getStringType(ctx, demangledName.size() + 1), + /*isConstant=*/true, LLVM::Linkage::External, + classNameStr + ".lambdaName", + builder.getStringAttr(demangledName + '\0'), /*alignment=*/0); + + builder.restoreInsertionPoint(insertPoint); + auto lambdaRef = builder.create( + loc, cudaq::opt::factory::getPointerType(lambdaName.getType()), + lambdaName.getSymName()); + + auto castLambdaRef = builder.create( + loc, cudaq::opt::factory::getPointerType(ctx), lambdaRef); + auto castKernelRef = builder.create( + loc, cudaq::opt::factory::getPointerType(ctx), castKernRef); + builder.create(loc, std::nullopt, CudaqRegisterLambdaName, + ValueRange{castLambdaRef, castKernelRef}); + } + } + + builder.create(loc, ValueRange{}); + return initFun; + } + void runOnOperation() override { auto module = getOperation(); DataLayoutAnalysis dla(module); // caches module's data layout information. 
@@ -1508,26 +1589,40 @@ class GenerateKernelExecution if (!mangledNameMap || mangledNameMap.empty()) return; auto irBuilder = cudaq::IRBuilder::atBlockEnd(module.getBody()); - if (altLaunchVersion == 1) + switch (codegenKind) { + case 0: + if (failed(irBuilder.loadIntrinsic( + module, cudaq::runtime::launchKernelHybridFuncName))) { + module.emitError("could not load altLaunchKernel intrinsic."); + return; + } + break; + case 1: if (failed(irBuilder.loadIntrinsic( module, cudaq::runtime::launchKernelFuncName))) { module.emitError("could not load altLaunchKernel intrinsic."); return; } - if (altLaunchVersion == 2) + break; + case 2: if (failed(irBuilder.loadIntrinsic( - module, cudaq::runtime::launchKernelVersion2FuncName))) { + module, cudaq::runtime::launchKernelStreamlinedFuncName))) { module.emitError("could not load altLaunchKernel intrinsic."); return; } + break; + default: + module.emitError("invalid codegen kind value."); + return; + } auto loc = module.getLoc(); auto ptrType = cudaq::cc::PointerType::get(builder.getI8Type()); auto regKern = builder.create( - loc, cudaqRegisterKernelName, FunctionType::get(ctx, {ptrType}, {})); + loc, CudaqRegisterKernelName, FunctionType::get(ctx, {ptrType}, {})); regKern.setPrivate(); auto regArgs = builder.create( - loc, cudaqRegisterArgsCreator, + loc, CudaqRegisterArgsCreator, FunctionType::get(ctx, {ptrType, ptrType}, {})); regArgs.setPrivate(); @@ -1622,7 +1717,7 @@ class GenerateKernelExecution func::FuncOp thunk; func::FuncOp argsCreatorFunc; - if (altLaunchVersion == 1) { + if (isCodegenPackedData(codegenKind)) { // Generate the function that computes the return offset. genReturnOffsetFunction(loc, builder, funcTy, structTy, classNameStr); @@ -1652,94 +1747,15 @@ class GenerateKernelExecution // Generate a new mangled function on the host side to call the // callback function. 
- if (hostEntryNeeded) { - if (altLaunchVersion == 1) - genNewHostEntryPoint1(loc, builder, funcTy, structTy, kernelNameObj, - thunk, hostFunc, hasThisPtr); - else - genNewHostEntryPoint2(loc, builder, funcTy, kernelNameObj, hostFunc, - hasThisPtr); - } + if (hostEntryNeeded) + genNewHostEntryPoint(loc, builder, funcTy, kernelNameObj, hostFunc, + hasThisPtr, structTy, thunk); // Generate a function at startup to register this kernel as having // been processed for kernel execution. - auto initFun = builder.create( - loc, classNameStr + ".kernelRegFunc", - LLVM::LLVMFunctionType::get(cudaq::opt::factory::getVoidType(ctx), - {})); - { - OpBuilder::InsertionGuard guard(builder); - auto *initFunEntry = initFun.addEntryBlock(); - builder.setInsertionPointToStart(initFunEntry); - auto kernRef = builder.create( - loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), - kernelNameObj.getSymName()); - auto castKernRef = - builder.create(loc, ptrType, kernRef); - builder.create(loc, std::nullopt, cudaqRegisterKernelName, - ValueRange{castKernRef}); - - if (altLaunchVersion == 1) { - // Register the argsCreator too - auto ptrPtrType = cudaq::cc::PointerType::get(ptrType); - auto argsCreatorFuncType = FunctionType::get( - ctx, {ptrPtrType, ptrPtrType}, {builder.getI64Type()}); - Value loadArgsCreator = builder.create( - loc, argsCreatorFuncType, argsCreatorFunc.getName()); - auto castLoadArgsCreator = builder.create( - loc, ptrType, loadArgsCreator); - builder.create( - loc, std::nullopt, cudaqRegisterArgsCreator, - ValueRange{castKernRef, castLoadArgsCreator}); - } - - // Check if this is a lambda mangled name - auto demangledPtr = abi::__cxa_demangle(mangledName.str().c_str(), - nullptr, nullptr, nullptr); - if (demangledPtr) { - std::string demangledName(demangledPtr); - demangledName = std::regex_replace( - demangledName, std::regex("::operator()(.*)"), ""); - if (demangledName.find("$_") != std::string::npos) { - auto insertPoint = 
builder.saveInsertionPoint(); - builder.setInsertionPointToStart(module.getBody()); - - // Create the function if it doesn't already exist. - if (!module.lookupSymbol(cudaqRegisterLambdaName)) - builder.create( - module.getLoc(), cudaqRegisterLambdaName, - LLVM::LLVMFunctionType::get( - cudaq::opt::factory::getVoidType(ctx), - {cudaq::opt::factory::getPointerType(ctx), - cudaq::opt::factory::getPointerType(ctx)})); - - // Create this global name, it is unique for any lambda - // bc classNameStr contains the parentFunc + varName - auto lambdaName = builder.create( - loc, - cudaq::opt::factory::getStringType(ctx, - demangledName.size() + 1), - /*isConstant=*/true, LLVM::Linkage::External, - classNameStr + ".lambdaName", - builder.getStringAttr(demangledName + '\0'), /*alignment=*/0); - - builder.restoreInsertionPoint(insertPoint); - auto lambdaRef = builder.create( - loc, cudaq::opt::factory::getPointerType(lambdaName.getType()), - lambdaName.getSymName()); - - auto castLambdaRef = builder.create( - loc, cudaq::opt::factory::getPointerType(ctx), lambdaRef); - auto castKernelRef = builder.create( - loc, cudaq::opt::factory::getPointerType(ctx), castKernRef); - builder.create( - loc, std::nullopt, cudaqRegisterLambdaName, - ValueRange{castLambdaRef, castKernelRef}); - } - } - - builder.create(loc, ValueRange{}); - } + auto initFun = + registerKernelForExecution(loc, builder, classNameStr, kernelNameObj, + argsCreatorFunc, mangledName); // Create a global with a default ctor to be run at program startup. 
// The ctor will execute the above function, which will register this diff --git a/python/tests/builder/test_qalloc_init.py b/python/tests/builder/test_qalloc_init.py index e0cc626f02..623c383f3c 100644 --- a/python/tests/builder/test_qalloc_init.py +++ b/python/tests/builder/test_qalloc_init.py @@ -171,6 +171,25 @@ def test_kernel_complex_params_rotate_f64(): assert '10' in counts +@skipIfNvidiaFP64NotInstalled +def test_kernel_complex_force_kron(): + cudaq.reset_target() + cudaq.set_target('nvidia-fp64') + + c = [0. + 0j] * 1024 + c[1023] = 1j + + kernel, vec = cudaq.make_kernel(list[complex]) + p = kernel.qalloc(1) + q = kernel.qalloc(vec) + kernel.mz(p) + kernel.mz(q) + + counts = cudaq.sample(kernel, c) + assert len(counts) == 1 + assert '01111111111' in counts + + @skipIfNvidiaNotInstalled def test_kernel_complex_params_rotate_f32(): cudaq.reset_target() diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp index f39cf942a5..e76a24b5a0 100644 --- a/runtime/common/ArgumentConversion.cpp +++ b/runtime/common/ArgumentConversion.cpp @@ -340,8 +340,10 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector &arguments) { FunctionType fromFuncTy = fun.getFunctionType(); for (auto iter : llvm::enumerate(llvm::zip(fromFuncTy.getInputs(), arguments))) { - Type argTy = std::get<0>(iter.value()); void *argPtr = std::get<1>(iter.value()); + if (!argPtr) + continue; + Type argTy = std::get<0>(iter.value()); unsigned i = iter.index(); auto buildSubst = [&, i = i](Ts &&...ts) { builder.setInsertionPointToEnd(substModule.getBody()); @@ -422,3 +424,33 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector &arguments) { substitutions.emplace_back(std::move(subst)); } } + +void cudaq::opt::ArgumentConverter::gen( + const std::vector &arguments, + const std::unordered_set &exclusions) { + std::vector partialArgs; + for (auto iter : llvm::enumerate(arguments)) { + if (exclusions.contains(iter.index())) { + 
partialArgs.push_back(nullptr); + continue; + } + partialArgs.push_back(iter.value()); + } + gen(partialArgs); +} + +void cudaq::opt::ArgumentConverter::gen_drop_front( + const std::vector &arguments, unsigned numDrop) { + // If we're dropping all the arguments, we're done. + if (numDrop >= arguments.size()) + return; + std::vector partialArgs; + for (void *arg : arguments) { + if (numDrop--) { + partialArgs.push_back(nullptr); + continue; + } + partialArgs.push_back(arg); + } + gen(partialArgs); +} diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h index 3251e0d304..1e1efb9347 100644 --- a/runtime/common/ArgumentConversion.h +++ b/runtime/common/ArgumentConversion.h @@ -13,6 +13,7 @@ #include "cudaq/qis/state.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Types.h" +#include namespace cudaq::opt { @@ -53,6 +54,15 @@ class ArgumentConverter { /// The arguments are those presented to the kernel, kernelName. void gen(const std::vector &arguments); + /// Generate a substitution ModuleOp but include only the arguments that do + /// not appear in the set of \p exclusions. + void gen(const std::vector &arguments, + const std::unordered_set &exclusions); + + /// Generate a substitution ModuleOp but drop the first \p numDrop arguments + /// and thereby exclude them from the substitutions. + void gen_drop_front(const std::vector &arguments, unsigned numDrop); + /// Get the list of substitutions that were generated by `gen()`. 
mlir::SmallVector &getSubstitutions() { return substitutions; diff --git a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu index ed034822d7..4693eefd36 100644 --- a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu +++ b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu @@ -95,7 +95,7 @@ void kronprod(uint32_t n_blocks, int32_t threads_per_block, void *arr0) { cudaKronprod<<>>( tsize1, reinterpret_cast(arr1), - (1UL << tsize2), reinterpret_cast(arr2), + tsize2, reinterpret_cast(arr2), reinterpret_cast(arr0)); } diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index b0287073a7..4463b01c71 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -7,7 +7,8 @@ // ========================================================================== // // RUN: cudaq-opt --kernel-execution %s | FileCheck %s -// RUN: cudaq-opt --kernel-execution=alt-launch=2 %s | FileCheck --check-prefix=ALT2 %s +// RUN: cudaq-opt --kernel-execution=codegen=2 %s | FileCheck --check-prefix=STREAM %s +// RUN: cudaq-opt --kernel-execution=codegen=0 %s | FileCheck --check-prefix=HYBRID %s module attributes {quake.mangled_name_map = { __nvqpp__mlirgen__ghz = "_ZN3ghzclEi"}} { @@ -86,23 +87,21 @@ module attributes {quake.mangled_name_map = { // CHECK-LABEL: func.func @_ZN3ghzclEi( // CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 +// CHECK-DAG: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (i64) -> !cc.ptr> // CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 // CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : i64 
// CHECK: %[[VAL_9:.*]] = cc.alloca i8[%[[VAL_8]] : i64] // CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr> // CHECK: cc.store %[[VAL_4]], %[[VAL_10]] : !cc.ptr> // CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> // CHECK: %[[VAL_13:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_15:.*]] = cc.func_ptr %[[VAL_13]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_17:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_18:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_8]], %[[VAL_18]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_11]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr // CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr @@ -131,8 +130,8 @@ module attributes {quake.mangled_name_map = { // CHECK: } // CHECK-LABEL: func.func @ghz.argsCreator( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { // CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> // CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> // CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 @@ -160,35 +159,72 @@ module attributes {quake.mangled_name_map = { // CHECK: } -// ALT2-LABEL: func.func @_ZN3ghzclEi( -// ALT2-SAME: %[[VAL_0:.*]]: !cc.ptr, 
%[[VAL_1:.*]]: i32) -> f64 { -// ALT2: %[[VAL_2:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// ALT2: %[[VAL_3:.*]] = cc.alloca !cc.array x 1> -// ALT2: %[[VAL_4:.*]] = cc.sizeof !cc.array x 1> : i64 -// ALT2: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> !cc.ptr> -// ALT2: %[[VAL_6:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// ALT2: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr>> -// ALT2: %[[VAL_7:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> i64 -// ALT2: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_4]] : i64 -// ALT2: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (i64) -> !cc.ptr> -// ALT2: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// ALT2: cc.store %[[VAL_9]], %[[VAL_10]] : !cc.ptr>> -// ALT2: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// ALT2: cc.store %[[VAL_9]], %[[VAL_11]] : !cc.ptr>> -// ALT2: %[[VAL_12:.*]] = arith.constant 0 : i64 -// ALT2: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (i64) -> !cc.ptr -// ALT2: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_3]][0] : (!cc.ptr x 1>>) -> !cc.ptr> -// ALT2: %[[VAL_15:.*]] = cc.alloca i32 -// ALT2: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr -// ALT2: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr -// ALT2: cc.store %[[VAL_16]], %[[VAL_14]] : !cc.ptr> -// ALT2: %[[VAL_17:.*]] = cc.alloca !cc.ptr -// ALT2: cc.store %[[VAL_13]], %[[VAL_17]] : !cc.ptr> -// ALT2: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr>) -> !cc.ptr -// ALT2: %[[VAL_19:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// ALT2: %[[VAL_20:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// ALT2: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!llvm.ptr>) -> !cc.ptr -// ALT2: call @altLaunchKernelUsingLocalJIT(%[[VAL_21]], %[[VAL_19]], %[[VAL_18]]) : (!cc.ptr, !cc.ptr, !cc.ptr) -> () -// ALT2: %[[VAL_22:.*]] = cc.undef f64 -// ALT2: return %[[VAL_22]] : f64 -// ALT2: 
} +// STREAM-LABEL: func.func @_ZN3ghzclEi( +// STREAM-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { +// STREAM: %[[VAL_2:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// STREAM: %[[VAL_3:.*]] = cc.alloca !cc.array x 1> +// STREAM: %[[VAL_4:.*]] = cc.sizeof !cc.array x 1> : i64 +// STREAM: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAM: %[[VAL_6:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAM: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr>> +// STREAM: %[[VAL_7:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> i64 +// STREAM: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_4]] : i64 +// STREAM: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (i64) -> !cc.ptr> +// STREAM: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAM: cc.store %[[VAL_9]], %[[VAL_10]] : !cc.ptr>> +// STREAM: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAM: cc.store %[[VAL_9]], %[[VAL_11]] : !cc.ptr>> +// STREAM: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_3]][0] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAM: %[[VAL_15:.*]] = cc.alloca i32 +// STREAM: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr +// STREAM: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr +// STREAM: cc.store %[[VAL_16]], %[[VAL_14]] : !cc.ptr> +// STREAM: %[[VAL_19:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// STREAM: %[[VAL_20:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// STREAM: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!llvm.ptr>) -> !cc.ptr +// STREAM: call @streamlinedLaunchKernel(%[[VAL_21]], %[[VAL_19]]) : (!cc.ptr, !cc.ptr) -> () +// STREAM: %[[VAL_22:.*]] = cc.undef f64 +// STREAM: return %[[VAL_22]] : f64 +// STREAM: } + +// HYBRID-LABEL: func.func @_ZN3ghzclEi( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { +// HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> +// 
HYBRID: %[[VAL_3:.*]] = arith.constant 0 : i64 +// HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> +// HYBRID: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_7:.*]] = arith.addi %[[VAL_6]], %[[VAL_3]] : i64 +// HYBRID: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_7]] : i64] +// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr> +// HYBRID: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr> +// HYBRID: %[[VAL_10:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr x ?>> +// HYBRID: %[[VAL_11:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_12:.*]] = cc.func_ptr %[[VAL_11]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// HYBRID: %[[VAL_13:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr x ?>>) -> !cc.ptr +// HYBRID: %[[VAL_15:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// HYBRID: %[[VAL_16:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// HYBRID: %[[VAL_17:.*]] = cc.alloca !cc.array x 1> +// HYBRID: %[[VAL_18:.*]] = cc.sizeof !cc.array x 1> : i64 +// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_20:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_19]], %[[VAL_20]] : !cc.ptr>> +// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> i64 +// HYBRID: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_18]] : i64 +// HYBRID: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (i64) -> !cc.ptr> +// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_16]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_23]], %[[VAL_24]] : !cc.ptr>> +// HYBRID: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_16]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_23]], %[[VAL_25]] : !cc.ptr>> +// HYBRID: %[[VAL_26:.*]] = cc.compute_ptr %[[VAL_17]][0] : (!cc.ptr x 1>>) -> 
!cc.ptr> +// HYBRID: %[[VAL_27:.*]] = cc.alloca i32 +// HYBRID: cc.store %[[VAL_1]], %[[VAL_27]] : !cc.ptr +// HYBRID: %[[VAL_28:.*]] = cc.cast %[[VAL_27]] : (!cc.ptr) -> !cc.ptr +// HYBRID: cc.store %[[VAL_28]], %[[VAL_26]] : !cc.ptr> +// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// HYBRID: %[[VAL_30:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// HYBRID: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!llvm.ptr>) -> !cc.ptr +// HYBRID: call @hybridLaunchKernel(%[[VAL_31]], %[[VAL_12]], %[[VAL_13]], %[[VAL_7]], %[[VAL_15]], %[[VAL_29]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> () +// HYBRID: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_10]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr +// HYBRID: %[[VAL_33:.*]] = cc.load %[[VAL_32]] : !cc.ptr +// HYBRID: return %[[VAL_33]] : f64 +// HYBRID: } diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke index e5b8a7f24c..fa3a8a5492 100644 --- a/test/Quake/kernel_exec-2.qke +++ b/test/Quake/kernel_exec-2.qke @@ -65,12 +65,12 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { // CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_21]], %[[VAL_26]], %[[VAL_22]], %[[VAL_23]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () // CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_21]] : // CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_90]][%[[VAL_22]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> // CHECK: %[[VAL_29:.*]] = constant @function_hawaiian.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_31:.*]] = cc.func_ptr %[[VAL_29]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr x ?>>) -> !cc.ptr // CHECK: %[[VAL_33:.*]] = arith.constant 2147483647 : i64 +// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> +// CHECK: 
%[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: return // CHECK: } diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke index 79f090bb4c..3394ada1a2 100644 --- a/test/Quake/return_vector.qke +++ b/test/Quake/return_vector.qke @@ -46,20 +46,17 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 4 : i64 // CHECK: %[[VAL_4:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> // CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_5]] : (i64) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 // CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] // CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr, i64}>>) -> i64 +// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_0.kernelName : 
!llvm.ptr> +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr> @@ -113,20 +110,17 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 8 : i64 // CHECK: %[[VAL_4:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 // CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> // CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_5]] : (i64) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 // CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] // CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: %[[VAL_14:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr // CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr, i64}>>) -> i64 +// CHECK: %[[VAL_17:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_12:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> +// 
CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!llvm.ptr>) -> !cc.ptr // CHECK: call @altLaunchKernel(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_9]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> () // CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1, 0] : (!cc.ptr, i64}>}>>) -> !cc.ptr> diff --git a/unittests/integration/get_state_tester.cpp b/unittests/integration/get_state_tester.cpp index 00f3ca1dec..452c55c8bb 100644 --- a/unittests/integration/get_state_tester.cpp +++ b/unittests/integration/get_state_tester.cpp @@ -165,3 +165,22 @@ CUDAQ_TEST(GetStateTester, checkOverlapFromHostVector) { EXPECT_NEAR(1.0, state.overlap(hostState).real(), 1e-3); } #endif + +CUDAQ_TEST(GetStateTester, checkKron) { + auto force_kron = [](std::vector> vec) __qpu__ { + cudaq::qubit a; + cudaq::qvector qvec(vec); + }; + // Construct a 6-qubit |111111> state + const int num_qubits_input_state = 6; + std::vector> hostStateData( + 1 << num_qubits_input_state); + hostStateData[hostStateData.size() - 1] = 1.0; + + auto counts = cudaq::sample(force_kron, hostStateData); + + // Expect a single state with a deterministic outcome + EXPECT_EQ(counts.size(), 1); + EXPECT_EQ(counts.begin()->first, + "0" + std::string(num_qubits_input_state, '1')); +}