diff --git a/benchmarks/DeepLearning/Models/Bert/CMakeLists.txt b/benchmarks/DeepLearning/Models/Bert/CMakeLists.txt index bc37f5fb..cd19b6f2 100644 --- a/benchmarks/DeepLearning/Models/Bert/CMakeLists.txt +++ b/benchmarks/DeepLearning/Models/Bert/CMakeLists.txt @@ -7,161 +7,231 @@ add_custom_command( COMMENT "Generating forward.mlir, subgraph0.mlir" ) +add_executable(dl-model-bert-benchmark + GoogleBenchmarkMain.cpp +) + +target_link_libraries(dl-model-bert-benchmark + GoogleBenchmark +) + +# CMAKE_C_FLAGS is set when configuring cmake. +separate_arguments(CLANG_FLAGS_LIST UNIX_COMMAND "${CMAKE_C_FLAGS}") + +################################################################################ +# +# Build scalar target. +# +################################################################################ add_custom_command( - OUTPUT forward_auto_vectorization.o - COMMAND - cat ${CMAKE_CURRENT_SOURCE_DIR}/forward.mlir | - sed -e {s/@forward/@forward_auto_vectorization/} - -e {s/@subgraph0/@subgraph0_auto_vectorization/} | - ${LLVM_MLIR_BINARY_DIR}/mlir-opt - -pass-pipeline - "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), \ - empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, \ - func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" | - ${LLVM_MLIR_BINARY_DIR}/mlir-opt - -pass-pipeline - "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \ - eliminate-empty-tensors, func.func(llvm-request-c-wrappers), \ - convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, \ - convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, \ - convert-func-to-llvm, reconcile-unrealized-casts)" | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llc -O3 - -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} -filetype=obj - -o ${CMAKE_CURRENT_BINARY_DIR}/forward_auto_vectorization.o + OUTPUT forward_scalar.o + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/forward.mlir | + sed -e {s/@forward/@forward_scalar/} | + sed -e {s/@subgraph0/@subgraph0_scalar/} | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -expand-strided-metadata + -finalize-memref-to-llvm + -llvm-request-c-wrappers + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o forward_scalar.ll + COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} forward_scalar.ll + -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/forward_scalar.o DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/forward.mlir - COMMENT "Building forward_auto_vectorization.o" + COMMENT "Building forward_scalar.o" VERBATIM) add_custom_command( - OUTPUT subgraph0_auto_vectorization.o - COMMAND - cat ${CMAKE_CURRENT_SOURCE_DIR}/subgraph0.mlir | - sed -e {s/@subgraph0/@subgraph0_auto_vectorization/} | - ${BUDDY_MLIR_BINARY_DIR}/buddy-opt - -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, func-bufferize-dynamic-offset, arith-bufferize, func.func(linalg-bufferize, tensor-bufferize))" | - ${BUDDY_MLIR_BINARY_DIR}/buddy-opt - -convert-elementwise-to-linalg - -func-bufferize-dynamic-offset - -arith-bufferize - -func-bufferize - -tensor-bufferize - -linalg-bufferize - -finalizing-bufferize - -convert-linalg-to-loops - -lower-affine - -convert-scf-to-cf - -llvm-request-c-wrappers - -convert-math-to-llvm - -convert-math-to-libm - -convert-arith-to-llvm - 
-convert-func-to-llvm - -expand-strided-metadata - -finalize-memref-to-llvm - -reconcile-unrealized-casts | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llc -O3 - -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} -filetype=obj - -o ${CMAKE_CURRENT_BINARY_DIR}/subgraph0_auto_vectorization.o + OUTPUT subgraph0_scalar.o + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/subgraph0.mlir | + sed -e {s/@subgraph0/@subgraph0_scalar/} | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -pass-pipeline + "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | + ${BUDDY_MLIR_BINARY_DIR}/buddy-opt + -convert-elementwise-to-linalg + -func-bufferize-dynamic-offset + -arith-bufferize + -linalg-bufferize + -tensor-bufferize + -convert-math-to-llvm + -convert-math-to-libm + -one-shot-bufferize + -convert-linalg-to-affine-loops + -lower-affine + -func-bufferize + -tensor-bufferize + -arith-bufferize + -buffer-deallocation + -finalizing-bufferize + -convert-vector-to-scf + -expand-strided-metadata + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-scf-to-cf + -llvm-request-c-wrappers + -convert-arith-to-llvm + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o subgraph0_scalar.ll + COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} subgraph0_scalar.ll + -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/subgraph0_scalar.o DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/subgraph0.mlir - ${BUDDY_MLIR_BINARY_DIR}/buddy-opt - COMMENT "Building subgraph0_auto_vectorization.o" + COMMENT "Building subgraph0_scalar.o" VERBATIM) +add_library(bert_scalar STATIC subgraph0_scalar.o forward_scalar.o) +set_target_properties(bert_scalar PROPERTIES LINKER_LANGUAGE CXX) +target_link_libraries(dl-model-bert-benchmark + bert_scalar + ${BUDDY_LIB_DIR}/libStaticMLIRCRunnerUtils.a +) - add_custom_command( - OUTPUT forward_buddy_vectorization.o - COMMAND - cat ${CMAKE_CURRENT_SOURCE_DIR}/forward.mlir | - sed -e {s/@forward/@forward_buddy_vectorization/} - -e {s/@subgraph0/@subgraph0_buddy_vectorization/} | - ${LLVM_MLIR_BINARY_DIR}/mlir-opt - -pass-pipeline - "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), \ - empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, \ - func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" | - ${LLVM_MLIR_BINARY_DIR}/mlir-opt - -pass-pipeline - "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \ - eliminate-empty-tensors, func.func(llvm-request-c-wrappers), \ - convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, \ - convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, \ - convert-func-to-llvm, reconcile-unrealized-casts)" | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llc -O3 - -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} -filetype=obj - -o ${CMAKE_CURRENT_BINARY_DIR}/forward_buddy_vectorization.o - DEPENDS - ${CMAKE_CURRENT_SOURCE_DIR}/forward.mlir - COMMENT "Building forward_buddy_vectorization.o" - VERBATIM) +################################################################################ +# +# Build matmul/batch_matmul optimization target. 
+# +################################################################################ +add_custom_command( + OUTPUT forward_opt.o + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/forward.mlir | + sed -e {s/@forward/@forward_opt/} | + sed -e {s/@subgraph0/@subgraph0_opt/} | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -expand-strided-metadata + -finalize-memref-to-llvm + -llvm-request-c-wrappers + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o forward_opt.ll + COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} forward_opt.ll + -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/forward_opt.o + DEPENDS + ${CMAKE_CURRENT_SOURCE_DIR}/forward.mlir + COMMENT "Building forward_opt.o" + VERBATIM) add_custom_command( - OUTPUT subgraph0_buddy_vectorization.o - COMMAND - cat ${CMAKE_CURRENT_SOURCE_DIR}/subgraph0.mlir | - sed -e {s/@subgraph0/@subgraph0_buddy_vectorization/} | - ${BUDDY_MLIR_BINARY_DIR}/buddy-opt - -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, func-bufferize-dynamic-offset, arith-bufferize, func.func(linalg-bufferize, tensor-bufferize))" | - ${BUDDY_MLIR_BINARY_DIR}/buddy-opt - -convert-elementwise-to-linalg - -func-bufferize-dynamic-offset - -arith-bufferize - -func-bufferize - -tensor-bufferize - -linalg-bufferize - -finalizing-bufferize - -batchmatmul-optimize - -convert-linalg-to-affine-loops - -lower-affine - -convert-vector-to-scf - -convert-scf-to-cf - -llvm-request-c-wrappers - -convert-vector-to-llvm - -convert-math-to-llvm - -convert-math-to-libm - -convert-arith-to-llvm - -convert-func-to-llvm - -expand-strided-metadata - -finalize-memref-to-llvm - -reconcile-unrealized-casts | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llc -O3 - -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} -filetype=obj - -o ${CMAKE_CURRENT_BINARY_DIR}/subgraph0_buddy_vectorization.o + OUTPUT subgraph0_opt.o + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/subgraph0.mlir | + sed -e {s/@subgraph0/@subgraph0_opt/} | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -pass-pipeline + "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | + ${BUDDY_MLIR_BINARY_DIR}/buddy-opt + -convert-elementwise-to-linalg + -func-bufferize-dynamic-offset + -arith-bufferize + -linalg-bufferize + -tensor-bufferize + -convert-math-to-llvm + -convert-math-to-libm + -one-shot-bufferize + -matmul-parallel-vectorization-optimize # matmul optimization + -batchmatmul-optimize # batchmatmul optimization + -convert-linalg-to-affine-loops + -lower-affine + -func-bufferize + -tensor-bufferize + -arith-bufferize + -buffer-deallocation + -finalizing-bufferize + -convert-vector-to-scf + -expand-strided-metadata + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-scf-to-cf + -llvm-request-c-wrappers + -convert-arith-to-llvm + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o subgraph0_opt.ll + COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} subgraph0_opt.ll + -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/subgraph0_opt.o DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/subgraph0.mlir - ${BUDDY_MLIR_BINARY_DIR}/buddy-opt - COMMENT "Building subgraph0_buddy_vectorization.o" + COMMENT "Building subgraph0_opt.o" VERBATIM) - - -add_library(BERT_AUTO_VECTORIZATION 
subgraph0_auto_vectorization.o forward_auto_vectorization.o) -set_target_properties(BERT_AUTO_VECTORIZATION PROPERTIES LINKER_LANGUAGE CXX) - -add_library(BERT_BUDDY_VECTORIZATION STATIC subgraph0_buddy_vectorization.o forward_buddy_vectorization.o) -set_target_properties(BERT_BUDDY_VECTORIZATION PROPERTIES LINKER_LANGUAGE CXX) - -add_executable(dl-model-bert-benchmark - GoogleBenchmarkMain.cpp -) - -set_target_properties(dl-model-bert-benchmark PROPERTIES - LINK_FLAGS "-static" +add_library(bert_opt STATIC subgraph0_opt.o forward_opt.o) +set_target_properties(bert_opt PROPERTIES LINKER_LANGUAGE CXX) +target_link_libraries(dl-model-bert-benchmark + bert_opt + ${BUDDY_LIB_DIR}/libStaticMLIRCRunnerUtils.a ) -set(BenchmarkTool GoogleBenchmark) - -if(CROSS_COMPILE_RVV) - set(BUDDY_LIB_DIR ${BUDDY_MLIR_CROSS_LIB_DIR}) -else() - set(BUDDY_LIB_DIR ${BUDDY_MLIR_LIB_DIR}) -endif() +################################################################################ +# +# Build matmul/batch_matmul optimization target with openmp. +# +################################################################################ +add_custom_command( + OUTPUT forward_opt_omp.o + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/forward.mlir | + sed -e {s/@forward/@forward_opt_omp/} | + sed -e {s/@subgraph0/@subgraph0_opt_omp/} | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -expand-strided-metadata + -finalize-memref-to-llvm + -llvm-request-c-wrappers + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o forward_opt_omp.ll + COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} forward_opt_omp.ll + -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/forward_opt_omp.o + DEPENDS + ${CMAKE_CURRENT_SOURCE_DIR}/forward.mlir + COMMENT "Building forward_opt_omp.o" + VERBATIM) +add_custom_command( + OUTPUT subgraph0_opt_omp.o + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/subgraph0.mlir | + sed -e {s/@subgraph0/@subgraph0_opt_omp/} | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -pass-pipeline + "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | + ${BUDDY_MLIR_BINARY_DIR}/buddy-opt + -convert-elementwise-to-linalg + -func-bufferize-dynamic-offset + -arith-bufferize + -linalg-bufferize + -tensor-bufferize + -convert-math-to-llvm + -convert-math-to-libm + -one-shot-bufferize + -matmul-parallel-vectorization-optimize # matmul optimization + -batchmatmul-optimize # batchmatmul optimization + -convert-linalg-to-affine-loops + -lower-affine + -convert-scf-to-openmp # openmp support + -func-bufferize + -tensor-bufferize + -arith-bufferize + -buffer-deallocation + -finalizing-bufferize + -convert-vector-to-scf + -expand-strided-metadata + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-scf-to-cf + -llvm-request-c-wrappers + -convert-openmp-to-llvm # openmp support + -convert-arith-to-llvm + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o subgraph0_opt_omp.ll + COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} subgraph0_opt_omp.ll + -fopenmp -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/subgraph0_opt_omp.o + DEPENDS + ${CMAKE_CURRENT_SOURCE_DIR}/subgraph0.mlir + COMMENT "Building subgraph0_opt_omp.o" + VERBATIM) +add_library(bert_opt_omp subgraph0_opt_omp.o forward_opt_omp.o) +set_target_properties(bert_opt_omp PROPERTIES LINKER_LANGUAGE CXX) target_link_libraries(dl-model-bert-benchmark - ${BenchmarkTool} - BERT_AUTO_VECTORIZATION - 
BERT_BUDDY_VECTORIZATION
+  ${BUDDY_LIB_DIR}/libomp.so
+  bert_opt_omp
   ${BUDDY_LIB_DIR}/libStaticMLIRCRunnerUtils.a
 )
diff --git a/benchmarks/DeepLearning/Models/Bert/GoogleBenchmarkMain.cpp b/benchmarks/DeepLearning/Models/Bert/GoogleBenchmarkMain.cpp
index bc406034..4d48f147 100644
--- a/benchmarks/DeepLearning/Models/Bert/GoogleBenchmarkMain.cpp
+++ b/benchmarks/DeepLearning/Models/Bert/GoogleBenchmarkMain.cpp
@@ -50,14 +50,14 @@ namespace {
 // Declare the BERT C interface.
 extern "C" {
-void _mlir_ciface_forward_auto_vectorization(MemRef *output,
+void _mlir_ciface_forward_scalar(MemRef *output,
                                  MemRef *input0,
                                  MemRef *input1,
                                  MemRef *input2,
                                  MemRef *input3,
                                  MemRef *input4);
-void _mlir_ciface_forward_buddy_vectorization(MemRef *output,
+void _mlir_ciface_forward_opt(MemRef *output,
                               MemRef *input0,
                               MemRef *input1,
                               MemRef *input2,
@@ -82,11 +82,11 @@ void DL_MODEL_BERT(benchmark::State &state, Func func) {
 } // namespace
 
 // Register benchmarking function with different arguments.
-BENCHMARK_CAPTURE(DL_MODEL_BERT, Auto_Vectorization,
-                  _mlir_ciface_forward_auto_vectorization)
+BENCHMARK_CAPTURE(DL_MODEL_BERT, scalar,
+                  _mlir_ciface_forward_scalar)
     ->Unit(benchmark::kMillisecond);
-BENCHMARK_CAPTURE(DL_MODEL_BERT, Buddy_Vectorization,
-                  _mlir_ciface_forward_buddy_vectorization)
+BENCHMARK_CAPTURE(DL_MODEL_BERT, opt,
+                  _mlir_ciface_forward_opt)
     ->Unit(benchmark::kMillisecond);
 
 /// Correctness Verification
@@ -102,8 +102,8 @@ void verification() {
   MemRef input4({1, MaxTokenLength}, 6);
 
   // Call the forward functions of the model.
-  _mlir_ciface_forward_auto_vectorization(&outputAutoVectorization, &input0, &input1, &input2, &input3, &input4);
-  _mlir_ciface_forward_buddy_vectorization(&outputBuddyVectorization, &input0, &input1, &input2, &input3, &input4);
+  _mlir_ciface_forward_scalar(&outputAutoVectorization, &input0, &input1, &input2, &input3, &input4);
+  _mlir_ciface_forward_opt(&outputBuddyVectorization, &input0, &input1, &input2, &input3, &input4);
 
   auto resultAutoVectorization = outputAutoVectorization.getData();
   auto resultBuddyVectorization = outputBuddyVectorization.getData();
diff --git a/benchmarks/DeepLearning/Models/TinyLlama/CMakeLists.txt b/benchmarks/DeepLearning/Models/TinyLlama/CMakeLists.txt
index 0eeb4666..8503ee60 100644
--- a/benchmarks/DeepLearning/Models/TinyLlama/CMakeLists.txt
+++ b/benchmarks/DeepLearning/Models/TinyLlama/CMakeLists.txt
@@ -10,13 +10,15 @@ add_custom_command(
 add_executable(dl-model-tinyllama-benchmark
   Main.cpp
 )
-set_target_properties(dl-model-tinyllama-benchmark PROPERTIES
-  LINK_FLAGS "-static"
-)
+
 target_link_libraries(dl-model-tinyllama-benchmark
   GoogleBenchmark
 )
 
+# Set up OpenMP
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+
 # CMAKE_C_FLAGS is set when configuring cmake.
separate_arguments(CLANG_FLAGS_LIST UNIX_COMMAND "${CMAKE_C_FLAGS}")
@@ -122,8 +124,8 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize
-            -matmul-parallel-vectorization-optimize # new added
-            -batchmatmul-optimize # new added
+            -matmul-parallel-vectorization-optimize # matmul optimization
+            -batchmatmul-optimize # batchmatmul optimization
             -convert-linalg-to-affine-loops
             -lower-affine
             -func-bufferize
@@ -155,3 +157,76 @@ target_link_libraries(dl-model-tinyllama-benchmark
   tinyllama_matmul_opt
   ${BUDDY_LIB_DIR}/libStaticMLIRCRunnerUtils.a
 )
+
+################################################################################
+#
+# Build matmul/batch_matmul optimization target with openmp.
+#
+################################################################################
+add_custom_command(
+  OUTPUT forward_matmul_opt_omp.o
+  COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/forward.mlir |
+          sed -e {s/@forward/@forward_matmul_opt_omp/} |
+          sed -e {s/@subgraph0/@subgraph0_matmul_opt_omp/} |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+            -expand-strided-metadata
+            -finalize-memref-to-llvm
+            -llvm-request-c-wrappers
+            -convert-func-to-llvm
+            -reconcile-unrealized-casts |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o forward_matmul_opt_omp.ll
+  COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} forward_matmul_opt_omp.ll
+          -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/forward_matmul_opt_omp.o
+  DEPENDS
+    ${CMAKE_CURRENT_SOURCE_DIR}/forward.mlir
+  COMMENT "Building forward_matmul_opt_omp.o"
+  VERBATIM)
+
+add_custom_command(
+  OUTPUT subgraph0_matmul_opt_omp.o
+  COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/subgraph0.mlir |
+          sed -e {s/@subgraph0/@subgraph0_matmul_opt_omp/} |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+            -pass-pipeline
+            "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
+          ${BUDDY_MLIR_BINARY_DIR}/buddy-opt
+            -eliminate-empty-tensors
+            -empty-tensor-to-alloc-tensor
+            -one-shot-bufferize
+            -matmul-parallel-vectorization-optimize # matmul optimization
+            -batchmatmul-optimize # batchmatmul optimization
+            -convert-linalg-to-affine-loops
+            -lower-affine
+            -convert-scf-to-openmp # openmp support
+            -func-bufferize
+            -tensor-bufferize
+            -arith-bufferize
+            -buffer-deallocation
+            -finalizing-bufferize
+            -convert-vector-to-scf
+            -expand-strided-metadata
+            -convert-vector-to-llvm
+            -finalize-memref-to-llvm
+            -convert-scf-to-cf
+            -llvm-request-c-wrappers
+            -convert-openmp-to-llvm # openmp support
+            -convert-arith-to-llvm
+            -convert-math-to-llvm
+            -convert-math-to-libm
+            -convert-func-to-llvm
+            -reconcile-unrealized-casts |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o subgraph0_matmul_opt_omp.ll
+  COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} subgraph0_matmul_opt_omp.ll
+          -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/subgraph0_matmul_opt_omp.o
+  DEPENDS
+    ${CMAKE_CURRENT_SOURCE_DIR}/subgraph0.mlir
+  COMMENT "Building subgraph0_matmul_opt_omp.o"
+  VERBATIM)
+add_library(tinyllama_matmul_opt_omp subgraph0_matmul_opt_omp.o forward_matmul_opt_omp.o)
+set_target_properties(tinyllama_matmul_opt_omp PROPERTIES LINKER_LANGUAGE CXX)
+
+target_link_libraries(dl-model-tinyllama-benchmark
+  ${BUDDY_LIB_DIR}/libomp.so
+  tinyllama_matmul_opt_omp
+  ${BUDDY_LIB_DIR}/libStaticMLIRCRunnerUtils.a
+)
diff --git a/benchmarks/DeepLearning/Models/TinyLlama/Main.cpp b/benchmarks/DeepLearning/Models/TinyLlama/Main.cpp
index 00c1da0a..59cff327 100644
---
a/benchmarks/DeepLearning/Models/TinyLlama/Main.cpp
+++ b/benchmarks/DeepLearning/Models/TinyLlama/Main.cpp
@@ -74,6 +74,9 @@ void _mlir_ciface_forward_scalar(MemRef *a, MemRef *b, MemRef *c);
 void _mlir_ciface_forward_matmul_opt(MemRef *a, MemRef *b,
                                      MemRef *c);
+void _mlir_ciface_forward_matmul_opt_omp(MemRef *a,
+                                         MemRef *b,
+                                         MemRef *c);
 /// [Step 1] Add function of your new method.
 }
 BENCHMARK_CAPTURE(DL_MODEL_TINYLLAMA, scalar, _mlir_ciface_forward_scalar)
@@ -81,6 +84,9 @@ BENCHMARK_CAPTURE(DL_MODEL_TINYLLAMA, scalar, _mlir_ciface_forward_scalar)
 BENCHMARK_CAPTURE(DL_MODEL_TINYLLAMA, matmul_opt, _mlir_ciface_forward_matmul_opt)
     ->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(DL_MODEL_TINYLLAMA, matmul_opt_omp,
+                  _mlir_ciface_forward_matmul_opt_omp)
+    ->Unit(benchmark::kMillisecond);
 /// [Step 2] Call GoogleBenchmark function to run your new method.
 
 // -----------------------------------------------------------------------------
@@ -102,7 +108,9 @@ int main(int argc, char **argv) {
   float *outputExpected = resultContainer[0].getData();
   MLIRVerification(outputExpected, _mlir_ciface_forward_matmul_opt,
-                   "matmul_opt");
-  /// [Step 3] Add your new method for verification.
-  return 0;
+                   "matmul_opt");
+  MLIRVerification(outputExpected, _mlir_ciface_forward_matmul_opt_omp,
+                   "matmul_opt_omp");
+  /// [Step 3] Add your new method for verification.
+  return 0;
 }
diff --git a/benchmarks/DeepLearning/Ops/BatchMatMulOp/BatchMatMulBroadcast.mlir b/benchmarks/DeepLearning/Ops/BatchMatMulOp/BatchMatMulBroadcast.mlir
new file mode 100644
index 00000000..3488eab8
--- /dev/null
+++ b/benchmarks/DeepLearning/Ops/BatchMatMulOp/BatchMatMulBroadcast.mlir
@@ -0,0 +1,51 @@
+// The MLIR prototype of batchmatmul-optimize in buddy-opt.
+
+#map = affine_map<(d0) -> (d0 ceildiv 16)>
+#tail_len_map = affine_map<(d0) -> (d0 mod 16)>
+#if_set = affine_set<(d0)[s0] : (s0 - d0 * 16 >= 16)>
+#b_col_idx_tail_map = affine_map<(d0) -> (d0 * 16)>
+
+func.func @batch_matmul(%a : memref<?x?x?xf32>, %b : memref<?x?x?xf32>, %c : memref<?x?x?xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %step = arith.constant 16 : index
+  %c0_f32 = arith.constant 0.0 : f32
+  %c0_f32_vec = vector.splat %c0_f32 : vector<16xf32>
+
+  %a_row = memref.dim %a, %c1 : memref<?x?x?xf32>
+  %a_col = memref.dim %a, %c2 : memref<?x?x?xf32>
+  %b_row = memref.dim %b, %c1 : memref<?x?x?xf32>
+  %b_col = memref.dim %b, %c2 : memref<?x?x?xf32>
+  %batch = memref.dim %a, %c0 : memref<?x?x?xf32>
+
+  %tail_len = affine.apply #tail_len_map(%b_col)
+  %mask_vec = vector.create_mask %tail_len : vector<16xi1>
+
+  // affine.parallel can be lowered to the omp dialect, which enables batch-level parallelization.
+  affine.parallel (%batch_idx) = (0) to (%batch) {
+    // Explicit prefetch, about 5% faster on X86.
+    affine.prefetch %a[%batch_idx, %a_row, %a_col], read, locality<3>, data : memref<?x?x?xf32>
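+    // Reduction loop over B's rows (the shared K dimension): each scalar
+    // A[batch, i, k] is broadcast to 16 lanes and FMA'd with a 16-wide slice
+    // of B[batch, k, j .. j+15], accumulating into C[batch, i, j .. j+15].
+    // Full 16-lane tiles take the affine.vector_store path; the ragged tail
+    // (b_col mod 16 lanes) is written with vector.maskedstore via %mask_vec.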
+    affine.for %b_row_idx = 0 to %b_row {
+      affine.for %b_col_idx = 0 to #map(%b_col) {
+        %b_vec = affine.vector_load %b[%batch_idx, %b_row_idx, %b_col_idx * 16] : memref<?x?x?xf32>, vector<16xf32>
+        %b_col_idx_tail = affine.apply #b_col_idx_tail_map(%b_col_idx)
+        affine.for %a_row_idx = 0 to %a_row {
+          %a_ele = affine.load %a[%batch_idx, %a_row_idx, %b_row_idx] : memref<?x?x?xf32>
+          %a_vec = vector.broadcast %a_ele : f32 to vector<16xf32>
+          %c_vec = affine.vector_load %c[%batch_idx, %a_row_idx, %b_col_idx * 16] : memref<?x?x?xf32>, vector<16xf32>
+          %result_vec = vector.fma %a_vec, %b_vec, %c_vec : vector<16xf32>
+          affine.if #if_set(%b_col_idx)[%b_col] {
+            affine.vector_store %result_vec, %c[%batch_idx, %a_row_idx, %b_col_idx * 16] : memref<?x?x?xf32>, vector<16xf32>
+          } else {
+            vector.maskedstore %c[%batch_idx, %a_row_idx, %b_col_idx_tail], %mask_vec, %result_vec : memref<?x?x?xf32>, vector<16xi1>, vector<16xf32>
+          }
+        }
+      }
+    }
+  }
+  return
+}
diff --git a/benchmarks/DeepLearning/Ops/BatchMatMulOp/CMakeLists.txt b/benchmarks/DeepLearning/Ops/BatchMatMulOp/CMakeLists.txt
index 67915a4a..273eea20 100644
--- a/benchmarks/DeepLearning/Ops/BatchMatMulOp/CMakeLists.txt
+++ b/benchmarks/DeepLearning/Ops/BatchMatMulOp/CMakeLists.txt
@@ -3,6 +3,10 @@ add_executable(dl-op-linalg-batch-matmul-benchmark
 )
 target_link_libraries(dl-op-linalg-batch-matmul-benchmark GoogleBenchmark)
 
+# Set up OpenMP
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+
 # CMAKE_C_FLAGS is set when configuring CMake
 separate_arguments(CLANG_FLAGS_LIST UNIX_COMMAND "${CMAKE_C_FLAGS}")
 
@@ -130,3 +134,54 @@ set_target_properties(batch_matmul_scf PROPERTIES LINKER_LANGUAGE CXX)
 target_link_libraries(dl-op-linalg-batch-matmul-benchmark
   batch_matmul_scf
 )
+
+# BatchMatMul Broadcast
+add_custom_command(OUTPUT batch_matmul_broadcast.o
+  COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/DeepLearning/Ops/BatchMatMulOp/BatchMatMulBroadcast.mlir |
+          sed 's/@batch_matmul/@batch_matmul_broadcast/' |
+          ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
+            -expand-strided-metadata
+            -affine-super-vectorize
+            -lower-affine
+            -convert-vector-to-llvm
+            -finalize-memref-to-llvm
+            -convert-scf-to-cf
+            -llvm-request-c-wrappers
+            -convert-func-to-llvm
+            -reconcile-unrealized-casts |
+          ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-translate --buddy-to-llvmir -o batch_matmul_broadcast.ll
+  COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} batch_matmul_broadcast.ll
+          -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/batch_matmul_broadcast.o
+)
+add_library(batch_matmul_broadcast STATIC batch_matmul_broadcast.o)
+set_target_properties(batch_matmul_broadcast PROPERTIES LINKER_LANGUAGE CXX)
+target_link_libraries(dl-op-linalg-batch-matmul-benchmark
+  batch_matmul_broadcast
+)
+
+# BatchMatMul Broadcast OpenMP
+add_custom_command(OUTPUT batch_matmul_broadcast_omp.o
+  COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/DeepLearning/Ops/BatchMatMulOp/BatchMatMulBroadcast.mlir |
+          sed 's/@batch_matmul/@batch_matmul_broadcast_omp/' |
+          ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
+            -expand-strided-metadata
+            -affine-super-vectorize
+            -lower-affine
+            -convert-scf-to-openmp
+            -convert-vector-to-llvm
+            -finalize-memref-to-llvm
+            -convert-scf-to-cf
+            -llvm-request-c-wrappers
+            -convert-openmp-to-llvm
+            -convert-func-to-llvm
+            -reconcile-unrealized-casts |
+          ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-translate --buddy-to-llvmir -o batch_matmul_broadcast_omp.ll
+  COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} batch_matmul_broadcast_omp.ll
+          -c -save-temps -o
${CMAKE_CURRENT_BINARY_DIR}/batch_matmul_broadcast_omp.o +) +add_library(batch_matmul_broadcast_omp STATIC batch_matmul_broadcast_omp.o) +set_target_properties(batch_matmul_broadcast_omp PROPERTIES LINKER_LANGUAGE CXX) +target_link_libraries(dl-op-linalg-batch-matmul-benchmark + ${BUDDY_LIB_DIR}/libomp.so + batch_matmul_broadcast_omp +) diff --git a/benchmarks/DeepLearning/Ops/BatchMatMulOp/Main.cpp b/benchmarks/DeepLearning/Ops/BatchMatMulOp/Main.cpp index d0a0dd9f..f29ee910 100644 --- a/benchmarks/DeepLearning/Ops/BatchMatMulOp/Main.cpp +++ b/benchmarks/DeepLearning/Ops/BatchMatMulOp/Main.cpp @@ -26,11 +26,11 @@ // Benchmark Configuration. You can change the number here as needed. // ----------------------------------------------------------------------------- -#define BATCH_SIZE 3 -#define _SIZE_M 128 -#define _SIZE_N 128 -#define _SIZE_K 128 -#define _NUM_ITER 5 +#define BATCH_SIZE 128 +#define _SIZE_M 256 +#define _SIZE_N 256 +#define _SIZE_K 256 +#define _NUM_ITER 10 // ----------------------------------------------------------------------------- // Global Variables and Functions. No need to change the code here. @@ -83,12 +83,19 @@ void _mlir_ciface_batch_matmul_tile(MemRef *A, MemRef *B, MemRef *C); void _mlir_ciface_batch_matmul_scf(MemRef *A, MemRef *B, MemRef *C); +void _mlir_ciface_batch_matmul_broadcast(MemRef *A, + MemRef *B, + MemRef *C); +void _mlir_ciface_batch_matmul_broadcast_omp(MemRef *A, + MemRef *B, + MemRef *C); /// [Step 1] Add function of your new method. } -BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL, Scalar, _mlir_ciface_batch_matmul_scalar) - ->Unit(benchmark::kMillisecond) - ->Iterations(_NUM_ITER); +// BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL, Scalar, +// _mlir_ciface_batch_matmul_scalar) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(_NUM_ITER); BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL, AutoVectorization, _mlir_ciface_batch_matmul_auto_vectorization) ->Unit(benchmark::kMillisecond) @@ -103,6 +110,14 @@ BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL, Tile, _mlir_ciface_batch_matmul_tile) BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL, SCF, _mlir_ciface_batch_matmul_scf) ->Unit(benchmark::kMillisecond) ->Iterations(_NUM_ITER); +BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL, BROADCAST, + _mlir_ciface_batch_matmul_broadcast) + ->Unit(benchmark::kMillisecond) + ->Iterations(_NUM_ITER); +BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL, BROADCAST_OMP, + _mlir_ciface_batch_matmul_broadcast_omp) + ->Unit(benchmark::kMillisecond) + ->Iterations(_NUM_ITER); /// [Step 2] Call GoogleBenchmark function to run your new method. // ----------------------------------------------------------------------------- @@ -129,6 +144,10 @@ int main(int argc, char **argv) { MLIRVerification(outputExpected, _mlir_ciface_batch_matmul_tile, "Tile"); MLIRVerification(outputExpected, _mlir_ciface_batch_matmul_scf, "SCF"); + MLIRVerification(outputExpected, _mlir_ciface_batch_matmul_broadcast, + "BROADCAST"); + MLIRVerification(outputExpected, _mlir_ciface_batch_matmul_broadcast_omp, + "BROADCAST_OMP"); /// [Step 3] Add your new method for verification. 
delete[] input1; diff --git a/benchmarks/DeepLearning/Ops/MatMulOp/CMakeLists.txt b/benchmarks/DeepLearning/Ops/MatMulOp/CMakeLists.txt index 9d433239..890637fa 100644 --- a/benchmarks/DeepLearning/Ops/MatMulOp/CMakeLists.txt +++ b/benchmarks/DeepLearning/Ops/MatMulOp/CMakeLists.txt @@ -77,4 +77,29 @@ target_link_libraries(dl-op-linalg-matmul-benchmark matmul_tile ) +add_custom_command(OUTPUT matmul_tile_omp.o + COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/DeepLearning/Ops/MatMulOp/matmul.mlir | + sed 's/@matmul/@matmul_tile_omp/' | + ${BUDDY_MLIR_BINARY_DIR}/buddy-opt + -matmul-optimize="vec-size=16;kernel-m=2;kernel-n=4" + -convert-linalg-to-loops + -expand-strided-metadata + -lower-affine + -convert-scf-to-cf + -convert-vector-to-llvm + -finalize-memref-to-llvm + -convert-arith-to-llvm + -llvm-request-c-wrappers + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir -o matmul_tile_omp.ll + COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} matmul_tile_omp.ll + -fopenmp -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/matmul_tile_omp.o +) +add_library(matmul_tile_omp STATIC matmul_tile_omp.o) +set_target_properties(matmul_tile_omp PROPERTIES LINKER_LANGUAGE CXX) +target_link_libraries(dl-op-linalg-matmul-benchmark + matmul_tile_omp +) + # Build the target for your new method here. diff --git a/benchmarks/DeepLearning/README.md b/benchmarks/DeepLearning/README.md index 058c9602..464d43b7 100644 --- a/benchmarks/DeepLearning/README.md +++ b/benchmarks/DeepLearning/README.md @@ -65,7 +65,7 @@ Make sure that the PYTHONPATH variable includes the directory of LLVM/MLIR pytho ```bash $ cd buddy-mlir/build $ export BUDDY_MLIR_BUILD_DIR=$PWD -$ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build +$ export LLVM_MLIR_BUILD_DIR=${BUDDY_MLIR_BUILD_DIR}/../llvm/build $ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} ``` @@ -107,10 +107,10 @@ Follow the relevant [documentation](https://github.com/buddy-compiler/buddy-mlir ```bash $ cd buddy-mlir/build $ export BUDDY_MLIR_BUILD_DIR=$PWD -$ export LLVM_MLIR_BUILD_DIR=${BUDDY_MLIR_BUILD_DIR}/../llvm/build/ +$ export LLVM_MLIR_BUILD_DIR=${BUDDY_MLIR_BUILD_DIR}/../llvm/build +$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} $ export BUDDY_MLIR_BUILD_CROSS_DIR=${BUDDY_MLIR_BUILD_DIR}/../build-cross-rv $ export RISCV_GNU_TOOLCHAIN=${BUDDY_MLIR_BUILD_DIR}/thirdparty/riscv-gnu-toolchain -$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} ``` 2. Build the benchmark for the target platform: