From 19422afc3061ddb3a59b307070043c879caf21cb Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Thu, 4 Apr 2024 10:37:31 -0700
Subject: [PATCH] [XLA:GPU] Add TiledHloInstruction.

A graph of TiledHloInstruction represents an HLO graph with associated concrete tile sizes. In follow-up changes I'll add code to build the graph from SymbolicTiledHloInstruction and use the tiled graph for the Cost Model and Triton codegen.

PiperOrigin-RevId: 621903701
---
 xla/service/gpu/model/BUILD                   |  33 ++++
 .../gpu/model/tiled_hlo_instruction.cc        | 118 +++++++++++++++
 xla/service/gpu/model/tiled_hlo_instruction.h | 136 +++++++++++++++++
 .../gpu/model/tiled_hlo_instruction_test.cc   | 142 ++++++++++++++++++
 4 files changed, 429 insertions(+)
 create mode 100644 xla/service/gpu/model/tiled_hlo_instruction.cc
 create mode 100644 xla/service/gpu/model/tiled_hlo_instruction.h
 create mode 100644 xla/service/gpu/model/tiled_hlo_instruction_test.cc
diff --git a/xla/service/gpu/model/BUILD b/xla/service/gpu/model/BUILD
index e5f9fe6b2dc28..8d60e89d28b2d 100644
--- a/xla/service/gpu/model/BUILD
+++ b/xla/service/gpu/model/BUILD
@@ -590,6 +590,39 @@ xla_cc_test(
     ],
 )
 
+cc_library(
+    name = "tiled_hlo_instruction",
+    srcs = ["tiled_hlo_instruction.cc"],
+    hdrs = ["tiled_hlo_instruction.h"],
+    deps = [
+        ":indexing_map",
+        "//xla:util",
+        "//xla/hlo/ir:hlo",
+        "@com_google_absl//absl/hash",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+xla_cc_test(
+    name = "tiled_hlo_instruction_test",
+    srcs = ["tiled_hlo_instruction_test.cc"],
+    deps = [
+        ":indexing_map",
+        ":indexing_test_utils",
+        ":tiled_hlo_instruction",
+        "//xla:shape_util",
+        "//xla/hlo/ir:hlo",
+        "//xla/tests:hlo_test_base",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//mlir:IR",
+        "@tsl//tsl/platform:statusor",
+    ],
+)
+
 cc_library(
     name = "symbolic_tile_analysis",
     srcs = ["symbolic_tile_analysis.cc"],
diff --git a/xla/service/gpu/model/tiled_hlo_instruction.cc b/xla/service/gpu/model/tiled_hlo_instruction.cc
new file mode 100644
index 0000000000000..7c0698f2e40c0
--- /dev/null
+++ b/xla/service/gpu/model/tiled_hlo_instruction.cc
@@ -0,0 +1,118 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/model/tiled_hlo_instruction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/hash/hash.h"
+#include "absl/memory/memory.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/service/gpu/model/indexing_map.h"
+#include "xla/util.h"
+
+namespace xla {
+namespace gpu {
+
+size_t TiledHloInstruction::PtrHash::operator()(
+    const TiledHloInstruction* tiled_hlo) const {
+  return absl::HashOf(*tiled_hlo);  // Hash the pointee, not the address.
+}
+
+bool TiledHloInstruction::PtrEqual::operator()(
+    const TiledHloInstruction* lhs, const TiledHloInstruction* rhs) const {
+  return *lhs == *rhs;  // Compare the pointees, not the addresses.
+}
+
+bool operator==(const TiledHloInstruction& lhs,
+                const TiledHloInstruction& rhs) {  // Ignores operands().
+  return lhs.hlo() == rhs.hlo() && lhs.tile_sizes() == rhs.tile_sizes() &&
+         lhs.tile_strides() == rhs.tile_strides() &&
+         lhs.block_id_to_tile_offsets_indexing() ==
+             rhs.block_id_to_tile_offsets_indexing();
+}
+
+bool operator!=(const TiledHloInstruction& lhs,
+                const TiledHloInstruction& rhs) {
+  return !(lhs == rhs);  // Exact negation of operator==.
+}
+
+/*static*/
+absl::StatusOr<std::unique_ptr<TiledHloInstruction>>
+TiledHloInstruction::Create(const HloInstruction* hlo,
+                            std::vector<int64_t> tile_sizes,
+                            std::vector<int64_t> tile_strides,
+                            IndexingMap block_id_to_tile_offsets_indexing) {
+  int rank = hlo->shape().rank();  // All per-dimension inputs must match it.
+
+  if (tile_sizes.size() != rank) {
+    return absl::InvalidArgumentError(
+        absl::StrCat("Number of tile sizes must be equal to the rank of the "
+                     "hlo shape. tile_sizes = ",
+                     tile_sizes.size(), ", hlo = ", hlo->ToString()));
+  }
+
+  if (tile_strides.size() != rank) {
+    return absl::InvalidArgumentError(
+        absl::StrCat("Number of tile strides must be equal to the rank of the "
+                     "hlo shape. tile_strides = ",
+                     tile_strides.size(), ", hlo = ", hlo->ToString()));
+  }
+
+  if (block_id_to_tile_offsets_indexing.GetDimensionCount() != 1 ||
+      block_id_to_tile_offsets_indexing.GetSymbolCount() != 0) {
+    return absl::InvalidArgumentError(absl::StrCat(
+        "block_id_to_tile_offsets_indexing must have 1 dim and 0 symbols. "
+        "block_id_to_tile_offsets_indexing = ",
+        block_id_to_tile_offsets_indexing.ToString()));
+  }
+
+  if (block_id_to_tile_offsets_indexing.GetAffineMap().getNumResults() !=
+      rank) {
+    return absl::InvalidArgumentError(absl::StrCat(
+        "block_id_to_tile_offsets_indexing must have the same number of "
+        "results as the rank of the hlo shape. "
+        "block_id_to_tile_offsets_indexing = ",
+        block_id_to_tile_offsets_indexing.ToString(),
+        ", hlo = ", hlo->ToString()));
+  }
+
+  return absl::WrapUnique(new TiledHloInstruction(  // Ctor is private.
+      hlo, std::move(tile_sizes), std::move(tile_strides),
+      std::move(block_id_to_tile_offsets_indexing)));
+}
+
+std::string TiledHloInstruction::ToString() const {  // One field per line.
+  std::stringstream out;
+  out << "hlo: " << hlo_->ToString() << "\n"
+      << "tile_sizes: {" << absl::StrJoin(tile_sizes_, ", ") << "}\n"
+      << "tile_strides: {" << absl::StrJoin(tile_strides_, ", ") << "}\n"
+      << "block_id_to_tile_offsets_indexing: "
+      << block_id_to_tile_offsets_indexing_;
+  return out.str();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/xla/service/gpu/model/tiled_hlo_instruction.h b/xla/service/gpu/model/tiled_hlo_instruction.h
new file mode 100644
index 0000000000000..045cfd27dd8aa
--- /dev/null
+++ b/xla/service/gpu/model/tiled_hlo_instruction.h
@@ -0,0 +1,136 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_GPU_MODEL_TILED_HLO_INSTRUCTION_H_
+#define XLA_SERVICE_GPU_MODEL_TILED_HLO_INSTRUCTION_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/status/statusor.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/service/gpu/model/indexing_map.h"
+
+namespace xla {
+namespace gpu {
+
+// A wrapper around HloInstruction that represents a tiled HLO instruction.
+//
+// The class contains information required to emit this instruction in
+// block-level codegen. Tile sizes and strides are constants and do not depend
+// on the block id. Tile offsets are computed using an indexing map of form:
+// `(block_id) -> (tile_offset0, tile_offset1, ...)`.
+class TiledHloInstruction {
+ public:
+  // PtrHash and PtrEqual are helper classes to use in hash maps and sets that
+  // compare values behind the pointers. For example,
+  // absl::flat_hash_set<TiledHloInstruction*, PtrHash, PtrEqual> hlo_set;
+  struct PtrHash {
+    size_t operator()(const TiledHloInstruction* tiled_hlo) const;
+  };
+
+  struct PtrEqual {
+    bool operator()(const TiledHloInstruction* lhs,
+                    const TiledHloInstruction* rhs) const;
+  };
+
+  // Creates an instance of TiledHloInstruction. Returns an error if any of the
+  // following preconditions is not met:
+  // * Number of tile sizes and tile strides should match HLO shape rank.
+  // * Number of results of `block_id_to_tile_offsets_indexing` should match HLO
+  //   shape rank.
+  // * `block_id_to_tile_offsets_indexing` should have only 1 dimension and 0
+  //   symbols.
+  static absl::StatusOr<std::unique_ptr<TiledHloInstruction>> Create(
+      const HloInstruction* hlo, std::vector<int64_t> tile_sizes,
+      std::vector<int64_t> tile_strides,
+      IndexingMap block_id_to_tile_offsets_indexing);
+
+  // Returns the original HLO instruction.
+  const HloInstruction* hlo() const { return hlo_; }
+
+  // Returns the tile sizes. The number of tile sizes is equal to the rank of
+  // the output shape.
+  const std::vector<int64_t>& tile_sizes() const { return tile_sizes_; }
+
+  // Returns the tile strides. The number of tile strides is equal to the rank
+  // of the output shape.
+  const std::vector<int64_t>& tile_strides() const { return tile_strides_; }
+
+  // Returns the indexing map from block_id to tile offsets. The map has a form
+  // of `(block_id) -> (tile_offset0, tile_offset1, ...)`. The number of tile
+  // offsets is equal to the rank of the output shape.
+  const IndexingMap& block_id_to_tile_offsets_indexing() const {
+    return block_id_to_tile_offsets_indexing_;
+  }
+
+  const TiledHloInstruction* operand(int64_t operand_id) const {
+    return operands_[operand_id];  // No bounds check on operand_id.
+  }
+
+  const std::vector<TiledHloInstruction*>& operands() const {
+    return operands_;
+  }
+
+  void AppendOperand(TiledHloInstruction* operand) {
+    operands_.push_back(operand);  // Stored as a raw pointer; not owned.
+  }
+
+  std::string ToString() const;  // Multi-line, human-readable description.
+
+ private:
+  TiledHloInstruction(const HloInstruction* hlo,
+                      std::vector<int64_t> tile_sizes,
+                      std::vector<int64_t> tile_strides,
+                      IndexingMap block_id_to_tile_offsets_indexing)
+      : hlo_(hlo),
+        tile_sizes_(std::move(tile_sizes)),
+        tile_strides_(std::move(tile_strides)),
+        block_id_to_tile_offsets_indexing_(
+            std::move(block_id_to_tile_offsets_indexing)) {}
+
+  // Pointer to the original HLO instruction. Not owned.
+  const HloInstruction* hlo_;
+
+  // Tile sizes and strides.
+  std::vector<int64_t> tile_sizes_;
+  std::vector<int64_t> tile_strides_;
+
+  // Indexing map from block_id to tile offsets.
+  IndexingMap block_id_to_tile_offsets_indexing_;
+
+  // Operands of the instruction in the tiled computation graph. Not owned.
+  std::vector<TiledHloInstruction*> operands_;  // Excluded from ==/hash.
+};
+
+bool operator==(const TiledHloInstruction& lhs, const TiledHloInstruction& rhs);
+bool operator!=(const TiledHloInstruction& lhs, const TiledHloInstruction& rhs);
+
+template <typename H>
+H AbslHashValue(H h, const TiledHloInstruction& tiled_hlo_instruction) {
+  // Hashes exactly the fields compared by operator==; operands are skipped.
+  const TiledHloInstruction& t = tiled_hlo_instruction;
+  return H::combine(std::move(h), t.hlo(), t.tile_sizes(), t.tile_strides(),
+                    t.block_id_to_tile_offsets_indexing());
+}
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // XLA_SERVICE_GPU_MODEL_TILED_HLO_INSTRUCTION_H_
diff --git a/xla/service/gpu/model/tiled_hlo_instruction_test.cc b/xla/service/gpu/model/tiled_hlo_instruction_test.cc
new file mode 100644
index 0000000000000..dc2db9b3d96bd
--- /dev/null
+++ b/xla/service/gpu/model/tiled_hlo_instruction_test.cc
@@ -0,0 +1,142 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/model/tiled_hlo_instruction.h"
+
+#include <memory>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/container/flat_hash_set.h"
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/service/gpu/model/indexing_map.h"
+#include "xla/service/gpu/model/indexing_test_utils.h"
+#include "xla/shape_util.h"
+#include "xla/tests/hlo_test_base.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class TiledHloInstructionTest : public HloTestBase {
+ public:
+  mlir::MLIRContext mlir_context_;  // Context used to parse affine maps.
+};
+
+TEST_F(TiledHloInstructionTest, PtrHashAndPtrEqualWorkCorrectly) {
+  std::unique_ptr<HloInstruction> hlo = HloInstruction::CreateParameter(
+      /*parameter_number=*/0,
+      ShapeUtil::MakeShape(PrimitiveType::F32, {32, 64}), "p0");
+
+  IndexingMap block_id_to_tile_offsets_indexing = IndexingMap::FromTensorSizes(
+      ParseAffineMap("(d0) -> (d0 floordiv 16, (d0 mod 16) * 16)",
+                     &mlir_context_),
+      /*dim_upper_bounds=*/{8},
+      /*symbol_upper_bounds=*/{});
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<TiledHloInstruction> tiled_hlo1,
+      TiledHloInstruction::Create(hlo.get(), /*tile_sizes=*/{16, 16},
+                                  /*tile_strides=*/{1, 1},
+                                  block_id_to_tile_offsets_indexing));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<TiledHloInstruction> tiled_hlo2,
+      TiledHloInstruction::Create(hlo.get(), /*tile_sizes=*/{16, 16},
+                                  /*tile_strides=*/{1, 1},
+                                  block_id_to_tile_offsets_indexing));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<TiledHloInstruction> tiled_hlo3,
+      TiledHloInstruction::Create(hlo.get(), /*tile_sizes=*/{16, 32},
+                                  /*tile_strides=*/{1, 1},
+                                  block_id_to_tile_offsets_indexing));
+
+  EXPECT_EQ(*tiled_hlo1, *tiled_hlo2);  // Same hlo, tiles and indexing.
+  EXPECT_NE(*tiled_hlo1, *tiled_hlo3);  // Tile sizes differ.
+
+  absl::flat_hash_set<TiledHloInstruction*, TiledHloInstruction::PtrHash,
+                      TiledHloInstruction::PtrEqual>
+      tiled_hlo_set = {tiled_hlo1.get(), tiled_hlo2.get(), tiled_hlo3.get()};
+  EXPECT_EQ(tiled_hlo_set.size(), 2);  // tiled_hlo2 duplicates tiled_hlo1.
+}
+
+TEST_F(TiledHloInstructionTest, TileSizesAndStridesShouldMatchHloShapeRank) {
+  std::unique_ptr<HloInstruction> hlo = HloInstruction::CreateParameter(
+      /*parameter_number=*/0,
+      ShapeUtil::MakeShape(PrimitiveType::F32, {32, 64}), "p0");
+
+  IndexingMap block_id_to_tile_offsets_indexing = IndexingMap::FromTensorSizes(
+      ParseAffineMap("(d0) -> (d0 floordiv 16, (d0 mod 16) * 16)",
+                     &mlir_context_),
+      /*dim_upper_bounds=*/{8},
+      /*symbol_upper_bounds=*/{});
+
+  EXPECT_THAT(
+      TiledHloInstruction::Create(hlo.get(), /*tile_sizes=*/{16},  // rank 2
+                                  /*tile_strides=*/{1, 1},
+                                  block_id_to_tile_offsets_indexing)
+          .status()
+          .message(),
+      ::testing::HasSubstr("Number of tile sizes must be equal to the rank"));
+
+  EXPECT_THAT(
+      TiledHloInstruction::Create(hlo.get(), /*tile_sizes=*/{16, 16},
+                                  /*tile_strides=*/{1, 1, 1},  // rank 2
+                                  block_id_to_tile_offsets_indexing)
+          .status()
+          .message(),
+      ::testing::HasSubstr("Number of tile strides must be equal to the rank"));
+}
+
+TEST_F(TiledHloInstructionTest,
+       ShouldReturnErrorIfBlockIdToTileOffsetsIndexingIsInvalid) {
+  std::unique_ptr<HloInstruction> hlo = HloInstruction::CreateParameter(
+      /*parameter_number=*/0,
+      ShapeUtil::MakeShape(PrimitiveType::F32, {32, 64}), "p0");
+
+  IndexingMap block_id_to_tile_offsets_indexing1 = IndexingMap::FromTensorSizes(
+      ParseAffineMap("(d0) -> (d0 floordiv 16)", &mlir_context_),  // 1 result
+      /*dim_upper_bounds=*/{8},
+      /*symbol_upper_bounds=*/{});
+
+  EXPECT_THAT(
+      TiledHloInstruction::Create(hlo.get(), /*tile_sizes=*/{16, 16},
+                                  /*tile_strides=*/{1, 1},
+                                  block_id_to_tile_offsets_indexing1)
+          .status()
+          .message(),
+      ::testing::HasSubstr(
+          "must have the same number of results as the rank of the hlo shape"));
+
+  IndexingMap block_id_to_tile_offsets_indexing2 = IndexingMap::FromTensorSizes(
+      ParseAffineMap("(d0)[s0] -> (d0 + s0, d0 floordiv 16)", &mlir_context_),
+      /*dim_upper_bounds=*/{8},
+      /*symbol_upper_bounds=*/{8});  // One symbol: rejected by Create.
+
+  EXPECT_THAT(TiledHloInstruction::Create(hlo.get(), /*tile_sizes=*/{16, 16},
+                                          /*tile_strides=*/{1, 1},
+                                          block_id_to_tile_offsets_indexing2)
+                  .status()
+                  .message(),
+              ::testing::HasSubstr("must have 1 dim and 0 symbols"));
+}
+
+}  // namespace
+
+}  // namespace gpu
+}  // namespace xla