[OpOptimization/Conv2dOp] Add RVV Dialect Benchmark #132

Open · wants to merge 25 commits into base: main

Commits (25)
df62750
update benchmark Conv2DNhwcHwcfOp and Conv2d
FloatingcloudKnight Jul 3, 2024
9fd9224
Merge branch 'main' of github.com:buddy-compiler/buddy-benchmark
FloatingcloudKnight Jul 3, 2024
8cade53
delete conv2d-rvv benchmark
FloatingcloudKnight Jul 3, 2024
2f73a7e
[DeepLearning/Ops] Add Conv2DNchwFchwOp benchmark
FloatingcloudKnight Jul 3, 2024
cbb138a
Revert "[DeepLearning/Ops] Add Conv2DNchwFchwOp benchmark"
FloatingcloudKnight Jul 3, 2024
bbd0134
Revert "delete conv2d-rvv benchmark"
FloatingcloudKnight Jul 3, 2024
8f46266
[DeepLearning/Ops] Add Conv2DNhwcHwcfOp benchmark
FloatingcloudKnight Jul 3, 2024
83432fa
[DeepLearning/Ops] Add Conv2DNhwcHwcfOp benchmark
FloatingcloudKnight Jul 3, 2024
7b51c1b
update some details
FloatingcloudKnight Jul 4, 2024
beda28a
[DeepLearning/Ops] Add Conv2DNhwcHwcfOp benchmark
FloatingcloudKnight Jul 4, 2024
b34568c
[DeepLearning/Ops] Add Conv2DNhwcHwcfOp benchmark
FloatingcloudKnight Jul 4, 2024
d6af74c
[DeepLearning/Ops] Add Conv2DNhwcHwcfOp benchmark
FloatingcloudKnight Jul 4, 2024
2631534
add pooling test
FloatingcloudKnight Jul 5, 2024
df7f236
Merge branch 'main' of https://github.com/FloatingcloudKnight/buddy-b…
FloatingcloudKnight Jul 5, 2024
c4ac568
[DeepLearning/Ops] Add PoolingNhwcSumOp benchmark
FloatingcloudKnight Jul 5, 2024
361a1d8
add conv2d benchmark
FloatingcloudKnight Jul 11, 2024
8ff9510
Merge branch 'main' of https://github.com/FloatingcloudKnight/buddy-b…
FloatingcloudKnight Jul 11, 2024
bae9cbf
add conv2d rvv benchmark
FloatingcloudKnight Jul 11, 2024
f63656c
Merge branch 'main' of https://github.com/FloatingcloudKnight/buddy-b…
FloatingcloudKnight Jul 11, 2024
1970cab
fix error in OpOptimization.
FloatingcloudKnight Jul 11, 2024
d46b736
Merge branch 'main' of https://github.com/FloatingcloudKnight/buddy-b…
FloatingcloudKnight Jul 11, 2024
ff77678
update some settings
FloatingcloudKnight Jul 15, 2024
6120cca
update some setings
FloatingcloudKnight Jul 15, 2024
d2acfca
update rvv mlir
FloatingcloudKnight Jul 22, 2024
53a1597
[OpOptimization/Conv2D] update rvv benchmark
FloatingcloudKnight Aug 14, 2024
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -91,7 +91,7 @@ target_link_libraries(GoogleBenchmark INTERFACE Threads::Threads)
# Find OpenCV
#-------------------------------------------------------------------------------

-if(DEFINED IMAGE_PROCESSING_BENCHMARKS OR OP_OPTIMIZATION_BENCHMARKS)
+if(DEFINED IMAGE_PROCESSING_BENCHMARKS)
find_package(OpenCV REQUIRED CONFIG)
include_directories(${OpenCV_INCLUDE_DIRS})
endif()
1 change: 1 addition & 0 deletions benchmarks/OpOptimization/CMakeLists.txt
@@ -1,2 +1,3 @@
add_subdirectory(Conv2dNchwFchw)
add_subdirectory(MatMul)
+add_subdirectory(Conv2dOp)
66 changes: 66 additions & 0 deletions benchmarks/OpOptimization/Conv2dOp/CMakeLists.txt
@@ -0,0 +1,66 @@
if (CROSS_COMPILE_RVV)
set(RISCV_GNU_TOOLCHAIN ${BUDDY_MLIR_BUILD_DIR}/thirdparty/riscv-gnu-toolchain)
set(RISCV_GNU_TOOLCHAIN_SYSROOT ${RISCV_GNU_TOOLCHAIN}/sysroot)
set(BUDDY_OPT_TRIPLE riscv64)
set(BUDDY_OPT_ATTR +v,+m)
endif()

add_custom_command(OUTPUT conv2d_scalar.o
COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/OpOptimization/Conv2dOp/Conv2D.mlir |
sed 's/@conv_2d/@conv_2d_scalar/' |
${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
-convert-linalg-to-loops
-lower-affine
-arith-bufferize
-convert-scf-to-cf
-convert-vector-to-llvm
-convert-arith-to-llvm
-finalize-memref-to-llvm
-llvm-request-c-wrappers
-convert-func-to-llvm
-reconcile-unrealized-casts |
${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE}
-mattr=${BUDDY_OPT_ATTR} --filetype=obj
-o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/Conv2dOp/conv2d_scalar.o
)
add_library(Conv2DScalar STATIC conv2d_scalar.o)
set_target_properties(Conv2DScalar PROPERTIES LINKER_LANGUAGE CXX)

add_custom_command(OUTPUT conv2d_rvv.o
COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/OpOptimization/Conv2dOp/Conv2DRVV.mlir |
sed 's/@conv_2d/@conv_2d_rvv/' |
${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
-lower-affine
-convert-scf-to-cf
-convert-math-to-llvm
-lower-vector-exp
-lower-rvv
-convert-vector-to-llvm
-finalize-memref-to-llvm
-llvm-request-c-wrappers
-convert-func-to-llvm
-reconcile-unrealized-casts |
${BUDDY_MLIR_BUILD_DIR}/bin/buddy-translate --buddy-to-llvmir |
${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE}
-mattr=${BUDDY_OPT_ATTR} --filetype=obj
-o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/Conv2dOp/conv2d_rvv.o
)
add_library(Conv2DRVV STATIC conv2d_rvv.o)
set_target_properties(Conv2DRVV PROPERTIES LINKER_LANGUAGE CXX)

add_executable(conv2d-benchmark
Conv2DBenchmark.cpp
)

set_target_properties(conv2d-benchmark PROPERTIES
LINK_FLAGS "-static"
)

set(BenchmarkTool GoogleBenchmark)

target_link_libraries(conv2d-benchmark
${BenchmarkTool}
Conv2DScalar
Conv2DRVV
)
50 changes: 50 additions & 0 deletions benchmarks/OpOptimization/Conv2dOp/Conv2D.mlir
@@ -0,0 +1,50 @@
#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 ceildiv 32)>
module{
func.func @conv_2d(%arg0: memref<?x?xi32>, %arg1: memref<?x?xi32>, %arg2: memref<?x?xi32>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%cst = arith.constant 0 : i32
%0 = vector.splat %cst : vector<32xi32>
%dim = memref.dim %arg1, %c0 : memref<?x?xi32>
%dim_0 = memref.dim %arg1, %c1 : memref<?x?xi32>
%dim_1 = memref.dim %arg2, %c0 : memref<?x?xi32>
%dim_2 = memref.dim %arg2, %c1 : memref<?x?xi32>
affine.for %arg3 = #map(%c0) to #map(%dim_1) {
affine.for %arg4 = #map(%c0) to #map(%dim) {
affine.for %arg5 = #map(%c0) to #map(%dim_0) {
affine.for %arg6 = #map(%c0) to #map1(%dim_2) {
%1 = memref.load %arg1[%arg4, %arg5] : memref<?x?xi32>
%2 = arith.index_cast %c0 : index to i32
%4 = arith.cmpi sge, %1, %2 : i32
scf.if %4 {
%5 = vector.broadcast %1 : i32 to vector<32xi32>
%6 = arith.muli %arg6, %c32 : index
%7 = arith.subi %dim_2, %6 : index
%8 = arith.cmpi sge, %7, %c32 : index
scf.if %8 {
%9 = affine.vector_load %arg0[%arg3 + %arg4, %arg5 + %arg6 * 32] : memref<?x?xi32>, vector<32xi32>
%10 = affine.vector_load %arg2[%arg3, %arg6 * 32] : memref<?x?xi32>, vector<32xi32>
%11 = arith.muli %9, %5 : vector<32xi32>
%12 = arith.addi %10, %11 : vector<32xi32>
affine.vector_store %12, %arg2[%arg3, %arg6 * 32] : memref<?x?xi32>, vector<32xi32>
} else {
%9 = vector.create_mask %7 : vector<32xi1>
%10 = arith.addi %arg3, %arg4 : index
%11 = arith.muli %arg6, %c32 : index
%12 = arith.addi %arg5, %11 : index
%13 = vector.maskedload %arg0[%10, %12], %9, %0 : memref<?x?xi32>, vector<32xi1>, vector<32xi32> into vector<32xi32>
%14 = vector.maskedload %arg2[%arg3, %11], %9, %0 : memref<?x?xi32>, vector<32xi1>, vector<32xi32> into vector<32xi32>
%15 = arith.muli %13, %5 : vector<32xi32>
%16 = arith.addi %14, %15 : vector<32xi32>
vector.maskedstore %arg2[%arg3, %11], %9, %16 : memref<?x?xi32>, vector<32xi1>, vector<32xi32>
}
}
}
}
}
}
return
}
}
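
Note: the kernel above is a valid (no-padding) 2D convolution over i32 data; the innermost output-column loop is tiled by 32 and the final partial tile is handled with vector.create_mask / maskedload / maskedstore. A plain C++ sketch of the same traversal order follows; it is illustrative only — the container names and the skip-negative-weight guard simply mirror the MLIR and are not part of this patch.

// Sketch only: a plain-C++ rendering of the traversal in Conv2D.mlir
// (output rows -> kernel rows -> kernel cols -> output-column tiles of 32).
#include <algorithm>
#include <cstdint>
#include <vector>

void conv2dScalarSketch(const std::vector<std::vector<int32_t>> &input,
                        const std::vector<std::vector<int32_t>> &kernel,
                        std::vector<std::vector<int32_t>> &output) {
  const int kernelRows = static_cast<int>(kernel.size());
  const int kernelCols = static_cast<int>(kernel[0].size());
  const int outputRows = static_cast<int>(output.size());
  const int outputCols = static_cast<int>(output[0].size());
  constexpr int Tile = 32; // matches the vector<32xi32> width in the MLIR

  for (int i = 0; i < outputRows; ++i)
    for (int kr = 0; kr < kernelRows; ++kr)
      for (int kc = 0; kc < kernelCols; ++kc) {
        int32_t w = kernel[kr][kc];
        if (w < 0)
          continue; // mirrors the `scf.if %4` guard (weight >= 0 only)
        for (int t = 0; t < outputCols; t += Tile) {
          int lanes = std::min(Tile, outputCols - t); // masked tail tile
          for (int j = t; j < t + lanes; ++j)
            output[i][j] += input[i + kr][kc + j] * w;
        }
      }
}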
133 changes: 133 additions & 0 deletions benchmarks/OpOptimization/Conv2dOp/Conv2DBenchmark.cpp
@@ -0,0 +1,133 @@
//===- Conv2DBenchmark.cpp ------------------------------------------------===//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//
//
// This file implements the benchmark for the Conv2D operation.
//
//===----------------------------------------------------------------------===//

#include <benchmark/benchmark.h>
#include <buddy/Core/Container.h>
#include <iostream>
#include <random>

// Define target layout.
#define INPUT_R 16
#define INPUT_C 16
#define KERNEL_R 4
#define KERNEL_C 4
#define OUTPUT_R (INPUT_R - KERNEL_R + 1)
#define OUTPUT_C (INPUT_C - KERNEL_C + 1)

// Helper functions and variables.
namespace {
const std::string PASS = "\033[32mPASS\033[0m";
const std::string FAIL = "\033[31mFAIL\033[0m";

bool areArraysEqual(int array1[], int array2[], int size) {
for (int i = 0; i < size; ++i) {
if (array1[i] != array2[i]) {
return false;
}
}
return true;
}
} // namespace

namespace {
// Declare the C interface.
extern "C" {
void _mlir_ciface_conv_2d_scalar(MemRef<int, 2> *input, MemRef<int, 2> *filter,
MemRef<int, 2> *output);
void _mlir_ciface_conv_2d_rvv(MemRef<int, 2> *input, MemRef<int, 2> *filter,
MemRef<int, 2> *output);
}

#define DEFINE_BENCHMARK(name, func) \
void BM_CONV2D_##name(benchmark::State &state) { \
intptr_t sizesInput[2] = {INPUT_R, INPUT_C}; \
intptr_t sizesKernel[2] = {KERNEL_R, KERNEL_C}; \
intptr_t sizesOutput[2] = {OUTPUT_R, OUTPUT_C}; \
MemRef<int, 2> input(sizesInput, 1); \
MemRef<int, 2> filter(sizesKernel, 1); \
MemRef<int, 2> output(sizesOutput, 0); \
for (auto _ : state) { \
func(&input, &filter, &output); \
} \
}

DEFINE_BENCHMARK(SCALAR, _mlir_ciface_conv_2d_scalar)
DEFINE_BENCHMARK(RVV, _mlir_ciface_conv_2d_rvv)
} // namespace

BENCHMARK(BM_CONV2D_SCALAR)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_CONV2D_RVV)->Unit(benchmark::kMillisecond);

void verification() {
// Set the random number generator.
std::random_device rd;
std::mt19937 generator(rd());
std::uniform_int_distribution<int> distribution(1, 100);

// Set the layout sizes of input and output memref container.
intptr_t sizesInput[2] = {INPUT_R, INPUT_C};
intptr_t sizesKernel[2] = {KERNEL_R, KERNEL_C};
intptr_t sizesOutput[2] = {OUTPUT_R, OUTPUT_C};

// Generate input memref container with random numbers.
const int inputSize = INPUT_R * INPUT_C;
int inputRand[inputSize];
for (int i = 0; i < inputSize; ++i) {
inputRand[i] = distribution(generator);
}
MemRef<int, 2> inputMemRef(inputRand, sizesInput);

// Generate kernel memref container with random numbers.
const int kernelSize = KERNEL_R * KERNEL_C;
int kernelRand[kernelSize];
for (int i = 0; i < kernelSize; ++i) {
kernelRand[i] = distribution(generator);
}
MemRef<int, 2> kernelMemRef(kernelRand, sizesKernel);

// Generate a result using a scalar method for comparison during verification.
const int outputSize = OUTPUT_R * OUTPUT_C;
MemRef<int, 2> outputScalar(sizesOutput, 0);
MemRef<int, 2> outputRVV(sizesOutput, 0);
_mlir_ciface_conv_2d_scalar(&inputMemRef, &kernelMemRef, &outputScalar);
_mlir_ciface_conv_2d_rvv(&inputMemRef, &kernelMemRef, &outputRVV);
auto resultScalar = outputScalar.getData();
auto resultRVV = outputRVV.getData();

// Print the verification result.
std::cout << "-----------------------------------------------------------"
<< std::endl;
std::cout << "Correctness Verification:" << std::endl;
std::cout << "Transform case: "
<< (areArraysEqual(resultScalar, resultRVV, outputSize) ? PASS
: FAIL)
<< std::endl;
std::cout << "-----------------------------------------------------------"
<< std::endl;
}

int main(int argc, char **argv) {
// Run benchmark.
::benchmark::Initialize(&argc, argv);
::benchmark::RunSpecifiedBenchmarks();
// Run correctness verification.
verification();
return 0;
}
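
The verification above only checks that the scalar and RVV kernels agree with each other, so an identical bug in both lowerings would still print PASS. One possible extension, sketched below, is to also compare against a naive C++ reference. This is a sketch only: it assumes the row-major layout that the existing getData() usage already relies on, and the helper name conv2dReference is hypothetical, not part of this patch.

// Sketch of an additional check against a naive C++ reference.
// Uses the INPUT_/KERNEL_/OUTPUT_ macros defined earlier in this file.
void conv2dReference(const int *input, const int *kernel, int *output) {
  for (int i = 0; i < OUTPUT_R; ++i)
    for (int j = 0; j < OUTPUT_C; ++j) {
      int acc = 0;
      for (int kr = 0; kr < KERNEL_R; ++kr)
        for (int kc = 0; kc < KERNEL_C; ++kc)
          acc += input[(i + kr) * INPUT_C + (j + kc)] *
                 kernel[kr * KERNEL_C + kc];
      output[i * OUTPUT_C + j] = acc;
    }
}

// Possible use inside verification(), after the existing kernel calls:
//   int outputRef[OUTPUT_R * OUTPUT_C] = {};
//   conv2dReference(inputRand, kernelRand, outputRef);
//   bool ok = areArraysEqual(resultScalar, outputRef, outputSize);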
56 changes: 56 additions & 0 deletions benchmarks/OpOptimization/Conv2dOp/Conv2DRVV.mlir
@@ -0,0 +1,56 @@
module{
func.func @conv_2d(%arg0: memref<?x?xi32>, %arg1: memref<?x?xi32>, %arg2: memref<?x?xi32>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%sew = arith.constant 2 : index
%dim = memref.dim %arg1, %c0 : memref<?x?xi32>
%dim_0 = memref.dim %arg1, %c1 : memref<?x?xi32>
%dim_1 = memref.dim %arg2, %c0 : memref<?x?xi32>
%dim_2 = memref.dim %arg2, %c1 : memref<?x?xi32>

affine.for %tmp0 = %c0 to %dim_1 {
%tmpAVL, %tmpIdx = scf.while (%avl = %dim_2, %idx = %c0) : (index, index) -> (index, index) {
// If avl greater than zero.
%cond = arith.cmpi sgt, %avl, %c0 : index
// Pass avl, idx to the after region.
scf.condition(%cond) %avl, %idx : index, index
} do {
^bb0(%avl : index, %idx : index):
%vl = rvv.setvl %avl, %sew, %c1 : index
%vl_i32 = arith.index_cast %vl : index to i32
%mask = vector.create_mask %vl : vector<[8]xi1>
%c_vector = vector_exp.predication %mask, %vl_i32 : vector<[8]xi1>, i32 {
%ele = vector.load %arg2[%tmp0, %idx] : memref<?x?xi32>, vector<[8]xi32>
vector.yield %ele : vector<[8]xi32>
} : vector<[8]xi32>
%tmpvector = affine.for %tmp1 = %c0 to %dim iter_args(%vector_iter0 = %c_vector) -> (vector<[8]xi32>) {
%vector_next = affine.for %tmp2 = %c0 to %dim_0 iter_args(%vector_iter1 = %vector_iter0) -> (vector<[8]xi32>) {
%0 = affine.load %arg1[%tmp1, %tmp2] : memref<?x?xi32>
%1 = arith.addi %tmp0, %tmp1 : index
%2 = arith.addi %idx, %tmp2 : index
%input_vector = vector_exp.predication %mask, %vl_i32 : vector<[8]xi1>, i32 {
%ele = vector.load %arg0[%1, %2] : memref<?x?xi32>, vector<[8]xi32>
vector.yield %ele : vector<[8]xi32>
} : vector<[8]xi32>

%3 = rvv.mul %input_vector, %0, %vl : vector<[8]xi32>, i32, index
%output = rvv.add %3, %vector_iter1, %vl : vector<[8]xi32>, vector<[8]xi32>, index

affine.yield %output: vector<[8]xi32>
}
affine.yield %vector_next : vector<[8]xi32>
}
vector_exp.predication %mask, %vl_i32 : vector<[8]xi1>, i32 {
vector.store %tmpvector, %arg2[%tmp0, %idx] : memref<?x?xi32>, vector<[8]xi32>
vector.yield
} : () -> ()

// Update idx and avl.
%new_idx = arith.addi %idx, %vl : index
%new_avl = arith.subi %avl, %vl : index
scf.yield %new_avl, %new_idx : index, index
}
}
return
}
}
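
The scf.while / rvv.setvl structure above is the standard RVV strip-mining pattern: each trip requests a vector length vl no larger than the remaining application vector length avl, processes vl output columns under a mask, then advances idx by vl and shrinks avl by vl. A scalar C++ sketch of that control flow is given below; setvl is a stand-in that models rvv.setvl as min(avl, VLMAX) and is not a real intrinsic.

// Control-flow sketch of the strip-mined loop in Conv2DRVV.mlir.
#include <algorithm>
#include <cstddef>
#include <cstdint>

static std::size_t setvl(std::size_t avl, std::size_t vlmax) {
  return std::min(avl, vlmax); // simplified model of rvv.setvl
}

void conv2dRvvSketch(const int32_t *input, const int32_t *kernel,
                     int32_t *output, std::size_t inCols, std::size_t outRows,
                     std::size_t outCols, std::size_t kRows, std::size_t kCols,
                     std::size_t vlmax) {
  for (std::size_t row = 0; row < outRows; ++row) {
    std::size_t avl = outCols, idx = 0;
    while (avl > 0) {                      // scf.while (%avl, %idx)
      std::size_t vl = setvl(avl, vlmax);  // rvv.setvl
      for (std::size_t kr = 0; kr < kRows; ++kr)
        for (std::size_t kc = 0; kc < kCols; ++kc) {
          int32_t w = kernel[kr * kCols + kc];
          for (std::size_t lane = 0; lane < vl; ++lane) // masked vector body
            output[row * outCols + idx + lane] +=
                input[(row + kr) * inCols + (kc + idx + lane)] * w;
        }
      idx += vl;                           // advance by the granted vl
      avl -= vl;
    }
  }
}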