Skip to content

Commit

Permalink
Fixing execution region result placement.
Browse files Browse the repository at this point in the history
  • Loading branch information
benvanik committed Jan 31, 2025
1 parent 0159762 commit ae9e5a7
Show file tree
Hide file tree
Showing 8 changed files with 179 additions and 97 deletions.
3 changes: 2 additions & 1 deletion compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -1507,8 +1507,9 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [
DeclareOpInterfaceMethods<Util_HoistableOpInterface>,
Util_ShapeAwareOp,
]> {
let summary = [{}];
let summary = [{indicates a value that must have a specific affinity}];
let description = [{
DO NOT SUBMIT
}];

let arguments = (ins
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ util.global private @device : !hal.device
// CHECK-LABEL: @tensorBarrierDispatch
// CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index)
util.func public @tensorBarrierDispatch(%input: tensor<?x128xi8>, %dim0: index) -> tensor<?x128xi8> {
// CHECK: %[[BARRIER:.+]] = stream.async.barrier %[[INPUT]] : !stream.resource<*>{%[[DIM0]]} -> !stream.resource<*>
// CHECK: %[[BARRIER:.+]] = stream.async.barrier on(#hal.device.affinity<@device>) %[[INPUT]] : !stream.resource<*>{%[[DIM0]]}
%barrier = flow.tensor.barrier %input : tensor<?x128xi8>{%dim0} on #hal.device.affinity<@device>
// CHECK: %[[SIZE:.+]] = stream.tensor.sizeof on(#hal.device.affinity<@device>) tensor<?x128xi8>{%arg2} : index
// CHECK: %[[RESULT:.+]] = stream.tensor.dispatch on(#hal.device.affinity<@device>) @ex::@entry(%[[BARRIER]])
Expand Down Expand Up @@ -170,7 +170,7 @@ util.global private @device : !hal.device
// CHECK-LABEL: @tensorBarrier
// CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM0:.+]]: index)
util.func public @tensorBarrier(%input: tensor<?x128xi8>, %dim0: index) -> tensor<?x128xi8> {
// CHECK: %[[TRANSFER:.+]] = stream.async.barrier %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} -> !stream.resource<*>
// CHECK: %[[TRANSFER:.+]] = stream.async.barrier on(#hal.device.affinity<@device>) %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} -> !stream.resource<*>
%transfer = flow.tensor.barrier %input : tensor<?x128xi8>{%dim0} on #hal.device.affinity<@device>
// CHECK: util.return %[[TRANSFER]], %[[INPUT_SIZE]]
util.return %transfer : tensor<?x128xi8>
Expand Down
13 changes: 13 additions & 0 deletions compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2469,6 +2469,19 @@ bool AsyncBarrierOp::isMetadata() { return true; }

LogicalResult AsyncBarrierOp::verify() { return success(); }

// A barrier forwards its source storage through to its single result, so the
// tied base value is whatever the source value is itself tied to.
Value AsyncBarrierOp::getTiedResult(unsigned resultIndex) {
  auto sourceValue = getSource();
  return IREE::Util::TiedOpInterface::findTiedBaseValue(sourceValue);
}

// The op has exactly one result and it is always tied to operand 0 (source).
::std::optional<unsigned>
AsyncBarrierOp::getTiedResultOperandIndex(unsigned resultIndex) {
  constexpr unsigned kSourceOperandIndex = 0;
  return kSourceOperandIndex;
}

// One entry per result: the single result maps to operand 0 (source).
SmallVector<int64_t> AsyncBarrierOp::getTiedResultOperandIndices() {
  SmallVector<int64_t> indices;
  indices.push_back(0); // source
  return indices;
}

//===----------------------------------------------------------------------===//
// stream.async.transfer
//===----------------------------------------------------------------------===//
Expand Down
16 changes: 12 additions & 4 deletions compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -2291,15 +2291,25 @@ def Stream_AsyncCollectiveOp : Stream_Op<"async.collective", [
}

def Stream_AsyncBarrierOp : Stream_Op<"async.barrier", [
AllTypesMatch<["source", "result"]>,
Stream_AffinityOp,
Stream_AsyncPhaseOp,
DeclareOpInterfaceMethods<Stream_StreamableOp, [
"isMetadata",
]>,
Util_SizeAwareOp,
DeclareOpInterfaceMethods<Util_TiedOpInterface, [
"getTiedResult",
"getTiedResultOperandIndex",
"getTiedResultOperandIndices",
]>,
]> {
let summary = [{ }];
let summary = [{indicates a value that must have a specific affinity}];
let description = [{
DO NOT SUBMIT
tie to force in-place
do operand and result need same affinity?
no execution, so what does it mean?
}];

let arguments = (ins
Expand All @@ -2318,11 +2328,9 @@ def Stream_AsyncBarrierOp : Stream_Op<"async.barrier", [
);

let assemblyFormat = [{
(`on` `(` $affinity^ `)`)?
$source `:` type($source)
`` `{` $size `}`
(`from` `(` $affinity^ `)`)?
`->`
type($result)
attr-dict-with-keyword
}];

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -694,10 +694,14 @@ static LogicalResult applyAsyncTransferOp(IREE::Stream::AsyncTransferOp asyncOp,
};
auto currentAffinityAttr =
IREE::Stream::AffinityAttr::lookupOrDefault(asyncOp);
bool transferIn = asyncOp.getSourceAffinityAttr() != currentAffinityAttr ||
isStaging(asyncOp.getSource());
bool transferOut = asyncOp.getResultAffinityAttr() != currentAffinityAttr ||
isStaging(asyncOp.getResult());
auto sourceAffinityAttr = asyncOp.getSourceAffinityAttr();
auto resultAffinityAttr = asyncOp.getResultAffinityAttr();
bool transferIn =
(sourceAffinityAttr && sourceAffinityAttr != currentAffinityAttr) ||
isStaging(asyncOp.getSource());
bool transferOut =
(resultAffinityAttr && resultAffinityAttr != currentAffinityAttr) ||
isStaging(asyncOp.getResult());

auto sourceRange = scope.lookupResourceRange(asyncOp.getSource());
auto targetRange = scope.lookupResourceRange(asyncOp.getResult());
Expand Down Expand Up @@ -1274,35 +1278,47 @@ struct ResultReservationSet {
};

// Parameters for the result allocations to be made for a single placement
// affinity.
struct ResultAllocation {
  // Affinity for the allocations.
  IREE::Stream::AffinityAttr affinityAttr;
  // Reservations bucketed by lifetime.
  SmallVector<ResultReservationSet> reservationSets;
};

// A map of allocation placement affinities to the alloc reservations requested.
using ResultAllocationMap =
llvm::MapVector<IREE::Stream::AffinityAttr, SmallVector<ResultReservation>>;

// Produces parameters for one or more result allocations composed of an ordered
// set of |reservations| with matching lifetimes.
static ResultAllocation
reserveResultAllocation(ArrayRef<ResultReservation> reservations) {
// We want deterministic ordering of the allocations for each lifetime type
// so we build them all here and then just nuke the ones we don't end up
// using.
SmallVector<ResultReservationSet> sets(
IREE::Stream::getMaxEnumValForLifetime() + 1);
for (auto &reservation : reservations) {
auto &set =
sets[static_cast<unsigned>(reservation.resultType.getLifetime())];
set.reservationLocs.push_back(reservation.loc);
set.reservationTypes.push_back(reservation.resultType);
set.reservationSizes.push_back(reservation.resultSize);
set.reservations.push_back(std::move(reservation));
}
// set of |reservations| with matching lifetimes. Allocations will be bucketed
// both by their allocation affinity (where they should be placed) and their
// lifetime (how long they're expected to live).
static std::vector<ResultAllocation>
reserveResultAllocations(ResultAllocationMap &reservationMap) {
  std::vector<ResultAllocation> result;
  for (auto &[affinityAttr, reservations] : reservationMap) {
    // We want deterministic ordering of the allocations for each lifetime type
    // so we build them all here and then just nuke the ones we don't end up
    // using.
    SmallVector<ResultReservationSet> sets(
        IREE::Stream::getMaxEnumValForLifetime() + 1);
    for (auto &reservation : reservations) {
      auto &set =
          sets[static_cast<unsigned>(reservation.resultType.getLifetime())];
      set.reservationLocs.push_back(reservation.loc);
      set.reservationTypes.push_back(reservation.resultType);
      set.reservationSizes.push_back(reservation.resultSize);
      set.reservations.push_back(std::move(reservation));
    }

    // Remove unused sets. This does a bunch of moves and is really bad but eh.
    // Iterating backwards keeps earlier indices valid across erasures.
    for (int i = sets.size() - 1; i >= 0; --i) {
      if (sets[i].reservations.empty()) {
        sets.erase(sets.begin() + i);
      }
    }
    // Move the per-affinity buckets into the result to avoid copying the
    // reservation vectors.
    result.push_back(ResultAllocation{affinityAttr, std::move(sets)});
  }
  return result;
}

//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -1541,7 +1557,7 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) {
auto resourceRange = ResourceRange(arg, operandSize);
scope.mapResourceRange(arg, resourceRange, asmState.get());
}
SmallVector<ResultReservation> resultReservations;
ResultAllocationMap resultReservations;
for (auto [result, resultSize] :
llvm::zip_equal(executeOp.getResults(), executeOp.getResultSizes())) {
auto resultType = llvm::cast<IREE::Stream::ResourceType>(result.getType());
Expand Down Expand Up @@ -1623,6 +1639,18 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) {
continue;
}

// DO NOT SUBMIT
// have to get consumer affinity
// may be able to get from a transfer
// could check allocation compatibility with execute op and place there
IREE::Stream::AffinityAttr allocationAffinity;
if (auto transferOp = dyn_cast_if_present<IREE::Stream::AsyncTransferOp>(
yieldValue.getDefiningOp())) {
allocationAffinity = transferOp.getResultAffinityAttr();
} else {
allocationAffinity = executeOp.getAffinityAttr();
}

// Queue up the allocation for packing.
ResultReservation resultReservation = {
definingOp->getLoc(), result, resultType, resultSize, yieldValue,
Expand All @@ -1633,54 +1661,56 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) {
resultReservation.result.printAsOperand(llvm::dbgs(), asmState);
llvm::dbgs() << "\n";
});
resultReservations.push_back(resultReservation);
resultReservations[allocationAffinity].push_back(resultReservation);
}
auto resultAllocation = reserveResultAllocation(resultReservations);
for (auto &reservationSet : resultAllocation.reservationSets) {
// Allocate and tie an operand to the result.
auto timepointType = externalBuilder.getType<IREE::Stream::TimepointType>();
auto [allocaOp, suballocations] =
IREE::Stream::ResourceAllocaOp::createSuballocations(
timepointType, reservationSet.reservationTypes.front(),
reservationSet.reservationLocs, reservationSet.reservationSizes,
executeOp.getAwaitTimepoint(), executeOp.getAffinityAttr(),
externalBuilder);
newAwaitTimepoints.push_back(allocaOp.getResultTimepoint());

auto asmState = getRootAsmState(executeOp->getParentOp());
LLVM_DEBUG({
llvm::dbgs() << " + alloc for result reservation set: ";
allocaOp.print(llvm::dbgs(), *asmState);
llvm::dbgs() << ":\n";
});

for (auto [reservation, suballocation] :
llvm::zip_equal(reservationSet.reservations, suballocations)) {
newOperands.push_back(suballocation);
newOperandSizes.push_back(reservation.resultSize);
resultReplacements.push_back(
std::make_pair(reservation.result, suballocation));

// Insert entry arg for the new operand tied all the way to the yield.
auto arg =
entryBlock.addArgument(reservation.resultType, reservation.loc);
for (auto &resultAllocation : reserveResultAllocations(resultReservations)) {
for (auto &reservationSet : resultAllocation.reservationSets) {
// Allocate and tie an operand to the result.
auto timepointType =
externalBuilder.getType<IREE::Stream::TimepointType>();
auto [allocaOp, suballocations] =
IREE::Stream::ResourceAllocaOp::createSuballocations(
timepointType, reservationSet.reservationTypes.front(),
reservationSet.reservationLocs, reservationSet.reservationSizes,
executeOp.getAwaitTimepoint(), resultAllocation.affinityAttr,
externalBuilder);
newAwaitTimepoints.push_back(allocaOp.getResultTimepoint());

auto asmState = getRootAsmState(executeOp->getParentOp());
LLVM_DEBUG({
llvm::dbgs() << " + adding entry arg for reservation ";
reservation.result.printAsOperand(llvm::dbgs(), *asmState);
llvm::dbgs() << "{";
reservation.resultSize.printAsOperand(llvm::dbgs(), *asmState);
llvm::dbgs() << "} from ";
reservation.yieldValue.printAsOperand(llvm::dbgs(), *asmState);
llvm::dbgs() << " as ";
arg.printAsOperand(llvm::dbgs(), *asmState);
llvm::dbgs() << "\n";
llvm::dbgs() << " + alloc for result reservation set: ";
allocaOp.print(llvm::dbgs(), *asmState);
llvm::dbgs() << ":\n";
});

// Map into scope, updating all aliases.
auto resourceRange = ResourceRange(arg, reservation.resultSize);
scope.mapResourceRange(reservation.yieldValue, resourceRange,
asmState.get());
for (auto [reservation, suballocation] :
llvm::zip_equal(reservationSet.reservations, suballocations)) {
newOperands.push_back(suballocation);
newOperandSizes.push_back(reservation.resultSize);
resultReplacements.push_back(
std::make_pair(reservation.result, suballocation));

// Insert entry arg for the new operand tied all the way to the yield.
auto arg =
entryBlock.addArgument(reservation.resultType, reservation.loc);

LLVM_DEBUG({
llvm::dbgs() << " + adding entry arg for reservation ";
reservation.result.printAsOperand(llvm::dbgs(), *asmState);
llvm::dbgs() << "{";
reservation.resultSize.printAsOperand(llvm::dbgs(), *asmState);
llvm::dbgs() << "} from ";
reservation.yieldValue.printAsOperand(llvm::dbgs(), *asmState);
llvm::dbgs() << " as ";
arg.printAsOperand(llvm::dbgs(), *asmState);
llvm::dbgs() << "\n";
});

// Map into scope, updating all aliases.
auto resourceRange = ResourceRange(arg, reservation.resultSize);
scope.mapResourceRange(reservation.yieldValue, resourceRange,
asmState.get());
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,15 @@ struct ExecutePartitionBuilder {
// If the op has the same affinity as the partition region we can strip it.
// Note that some ops may have affinities that are more specific and we
// want to preserve those as long as possible.
if (auto affinityOp =
dyn_cast<IREE::Stream::AffinityOpInterface>(clonedOp)) {
if (auto transferOp = dyn_cast<IREE::Stream::AsyncTransferOp>(clonedOp)) {
if (transferOp.getSourceAffinityAttr() == partition->affinity) {
transferOp.setSourceAffinityAttr(nullptr);
}
if (transferOp.getResultAffinityAttr() == partition->affinity) {
transferOp.setResultAffinityAttr(nullptr);
}
} else if (auto affinityOp =
dyn_cast<IREE::Stream::AffinityOpInterface>(clonedOp)) {
if (affinityOp.getAffinityAttr() == partition->affinity) {
affinityOp.setAffinityAttr(nullptr);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,28 @@ util.func public @applyAsyncTransferOp(%operand: !stream.resource<transient>, %s

// -----

// CHECK-LABEL: @applyAsyncTransferMultiScopeOp
// CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
util.func public @applyAsyncTransferMultiScopeOp(%operand: !stream.resource<transient>, %size: index) {
// CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device>) : !stream.resource<transient>{%[[SIZE]]}
// CHECK: stream.cmd.execute on(#hal.device.affinity<@execution_device>) await(%[[ALLOCA_TIMEPOINT]])
// CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource<transient>{%[[SIZE]]},
// CHECK-SAME: %[[ALLOCA]] as %[[ALLOCA_CAPTURE:.+]]: !stream.resource<transient>{%[[SIZE]]})
%result, %result_timepoint = stream.async.execute on(#hal.device.affinity<@execution_device>) with(%operand as %capture: !stream.resource<transient>{%size}) -> !stream.resource<transient>{%size} {
// CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_CAPTURE]][%c0], %[[SIZE]]
// CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]} -> !stream.resource<transient>{%[[SIZE]]}
// CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device>) %[[ALLOCA_CAPTURE]][%c0 for %[[SIZE]]]
// CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]}
%0 = stream.async.transfer %capture : !stream.resource<transient>{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device>) !stream.resource<transient>{%size}
stream.yield %0 : !stream.resource<transient>{%size}
} => !stream.timepoint
// CHECK: util.optimization_barrier %[[ALLOCA]]
util.optimization_barrier %result : !stream.resource<transient>
util.return
}

// -----

// CHECK-LABEL: @applyAsyncDispatchOp
// CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[END:.+]]: index, %[[LENGTH:.+]]: index)
util.func public @applyAsyncDispatchOp(%operand: !stream.resource<transient>, %size: index, %offset: index, %end: index, %length: index) {
Expand Down
Loading

0 comments on commit ae9e5a7

Please sign in to comment.