diff --git a/sw/blas/.gitignore b/sw/blas/.gitignore
new file mode 100644
index 000000000..2ff975f29
--- /dev/null
+++ b/sw/blas/.gitignore
@@ -0,0 +1 @@
+**/data/data.h
diff --git a/sw/blas/axpy/.gitignore b/sw/blas/axpy/.gitignore
deleted file mode 100644
index f5ac16baa..000000000
--- a/sw/blas/axpy/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
diff --git a/sw/blas/blas.h b/sw/blas/blas.h
index a7910d25e..33c29e175 100644
--- a/sw/blas/blas.h
+++ b/sw/blas/blas.h
@@ -5,4 +5,5 @@
 #pragma once
 
 #include "axpy/src/axpy.h"
-#include "gemm/src/gemm.h"
\ No newline at end of file
+#include "dot/src/dot.h"
+#include "gemm/src/gemm.h"
diff --git a/sw/blas/dot/Makefile b/sw/blas/dot/Makefile
new file mode 100644
index 000000000..cee16bb92
--- /dev/null
+++ b/sw/blas/dot/Makefile
@@ -0,0 +1,35 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+# Usage of absolute paths is required to externally include this Makefile
+MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+DATA_DIR := $(realpath $(MK_DIR)/data)
+SRC_DIR := $(realpath $(MK_DIR)/src)
+
+DATA_CFG ?= $(DATA_DIR)/params.json
+SECTION ?=
+
+APP ?= dot
+SRCS ?= $(realpath $(SRC_DIR)/main.c)
+# DATA_H is defined further down; `?=` defers expansion until INCDIRS is used
+INCDIRS ?= $(dir $(DATA_H)) $(SRC_DIR)
+
+DATAGEN_PY = $(MK_DIR)/scripts/datagen.py
+DATA_H ?= $(DATA_DIR)/data.h
+
+# Create the directory holding the generated header on demand
+$(dir $(DATA_H)):
+	mkdir -p $@
+
+# Generate the data header. The directory is an order-only prerequisite, so
+# changes to its timestamp do not trigger a regeneration.
+$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) | $(dir $(DATA_H))
+	$< -c $(DATA_CFG) --section="$(SECTION)" > $@
+
+.PHONY: clean-data clean
+
+clean-data:
+	rm -f $(DATA_H)
+
+clean: clean-data
diff --git a/sw/blas/dot/data/params.json b/sw/blas/dot/data/params.json
new file mode 100644
index 000000000..329a03b50
--- /dev/null
+++ b/sw/blas/dot/data/params.json
@@ -0,0 +1,7 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    n: 4096
+}
diff --git a/sw/blas/dot/scripts/datagen.py b/sw/blas/dot/scripts/datagen.py
new file mode 100755
index 000000000..01f36899e
--- /dev/null
+++ b/sw/blas/dot/scripts/datagen.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import os
+import sys
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+from data_utils import format_scalar_definition, format_array_definition, \
+    format_scalar_declaration, format_ifdef_wrapper, DataGen  # noqa: E402
+
+
+class DotDataGen(DataGen):
+    """Generate the input data header for the dot product kernel."""
+
+    MIN = -1000
+    MAX = +1000
+    # AXI splits bursts crossing 4KB address boundaries. To minimize
+    # the occurrence of these splits the data should be aligned to 4KB
+    BURST_ALIGNMENT = 4096
+    # n must be a multiple of the number of compute cores times the
+    # unrolling factor of the kernel
+    NUM_CORES = 8
+    UNROLL = 4
+
+    def golden_model(self, x, y):
+        """Return the reference dot product of vectors `x` and `y`."""
+        return np.dot(x, y)
+
+    def emit_header(self, **kwargs):
+        """Return the contents of the data header as a string.
+
+        Expects the keyword arguments `n` (vector length) and `section`
+        (forwarded to the formatting helpers).
+
+        Raises:
+            ValueError: if `n` is not a multiple of NUM_CORES * UNROLL.
+        """
+        n = kwargs['n']
+        section = kwargs['section']
+
+        # Validate before generating any data. An explicit exception is
+        # used instead of `assert`, which is stripped under `python -O`.
+        if n % (self.NUM_CORES * self.UNROLL) != 0:
+            raise ValueError("n must be an integer multiple of the number of cores "
+                             "times the unrolling factor")
+
+        header = [super().emit_header()]
+
+        x = np.random.uniform(self.MIN, self.MAX, n)
+        y = np.random.uniform(self.MIN, self.MAX, n)
+        g = self.golden_model(x, y)
+
+        header += [format_scalar_definition('const uint32_t', 'n', n)]
+        header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT,
+                                           section=section)]
+        header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT,
+                                           section=section)]
+        header += [format_scalar_declaration('double', 'result', alignment=self.BURST_ALIGNMENT,
+                                             section=section)]
+        # The golden result is passed through the BIST ifdef wrapper
+        result_def = format_scalar_definition('double', 'g', g)
+        header += [format_ifdef_wrapper('BIST', result_def)]
+
+        return '\n\n'.join(header)
+
+
+if __name__ == '__main__':
+    sys.exit(DotDataGen().main())
diff --git
a/sw/blas/dot/scripts/verify.py b/sw/blas/dot/scripts/verify.py
new file mode 100755
index 000000000..9d61ff466
--- /dev/null
+++ b/sw/blas/dot/scripts/verify.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+from pathlib import Path
+from datagen import DotDataGen
+
+sys.path.append(str(Path(__file__).parent / '../../../../util/sim/'))
+from verif_utils import Verifier  # noqa: E402
+
+
+class DotVerifier(Verifier):
+    """Verify the dot product kernel against the datagen golden model."""
+
+    OUTPUT_UIDS = ['result']
+
+    def get_actual_results(self):
+        """Return the value of the `result` symbol, read as a double."""
+        return self.get_output_from_symbol('result', 'double')
+
+    def get_expected_results(self):
+        """Recompute the expected result from the `x` and `y` input symbols."""
+        x = self.get_input_from_symbol('x', 'double')
+        y = self.get_input_from_symbol('y', 'double')
+        return DotDataGen().golden_model(x, y)
+
+    def check_results(self, *args):
+        """Compare the results using a relative tolerance of 1e-10."""
+        return super().check_results(*args, rtol=1e-10)
+
+
+if __name__ == "__main__":
+    sys.exit(DotVerifier().main())
diff --git a/sw/blas/dot/src/dot.h b/sw/blas/dot/src/dot.h
new file mode 100644
index 000000000..a8a81561c
--- /dev/null
+++ b/sw/blas/dot/src/dot.h
@@ -0,0 +1,164 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "snrt.h"
+
+// Single-core dot product of the n-element vectors x and y, streamed through
+// the SSRs mapped to ft0 and ft1 and accumulated in a single register.
+// The result is stored to output[0]. n must be greater than zero.
+static inline void dot_seq(uint32_t n, double *x, double *y, double *output) {
+    // Start of SSR region.
+    register volatile double ft0 asm("ft0");
+    register volatile double ft1 asm("ft1");
+    asm volatile("" : "=f"(ft0), "=f"(ft1));
+
+    snrt_ssr_loop_1d(SNRT_SSR_DM0, n, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM1, n, sizeof(double));
+
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, y);
+
+    register volatile double res_ssr asm("fs0") = 0;
+
+    snrt_ssr_enable();
+
+    const register uint32_t Nm1 asm("t0") = n - 1;
+    asm volatile(
+        "frep.o %[n_frep], 1, 0, 0 \n"
+        "fmadd.d %0, ft0, ft1, %0"
+        : "=f"(res_ssr) /* output operands */
+        : "f"(ft0), "f"(ft1), "0"(res_ssr),
+          [ n_frep ] "r"(Nm1) /* input operands */
+        :);
+
+    // End of SSR region.
+    snrt_fpu_fence();
+    snrt_ssr_disable();
+    asm volatile("" : : "f"(ft0), "f"(ft1));
+    output[0] = res_ssr;
+}
+
+// Single-core dot product of the n-element vectors x and y, using four
+// independent accumulators that are summed once the loop has completed.
+// The frep loop executes n / 4 iterations of four fmadd.d each, so n must be
+// a non-zero multiple of 4. The result is stored to output[0].
+static inline void dot_seq_4_acc(uint32_t n, double *x, double *y,
+                                 double *output) {
+    // Start of SSR region.
+    register volatile double ft0 asm("ft0");
+    register volatile double ft1 asm("ft1");
+    asm volatile("" : "=f"(ft0), "=f"(ft1));
+
+    snrt_ssr_loop_1d(SNRT_SSR_DM0, n, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM1, n, sizeof(double));
+
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, y);
+
+    register volatile double res_ssr_0 asm("fs0") = 0;
+    register volatile double res_ssr_1 asm("fs1") = 0;
+    register volatile double res_ssr_2 asm("fs2") = 0;
+    register volatile double res_ssr_3 asm("fs3") = 0;
+
+    snrt_ssr_enable();
+
+    const register uint32_t Nm1 asm("t0") = (n >> 2) - 1;
+    asm volatile(
+        "frep.o %[n_frep], 4, 0, 0 \n"
+        "fmadd.d %0, ft0, ft1, %0 \n"
+        "fmadd.d %1, ft0, ft1, %1 \n"
+        "fmadd.d %2, ft0, ft1, %2 \n"
+        "fmadd.d %3, ft0, ft1, %3"
+        : "=f"(res_ssr_0), "=f"(res_ssr_1), "=f"(res_ssr_2),
+          "=f"(res_ssr_3) /* output operands */
+        : "f"(ft0), "f"(ft1), "0"(res_ssr_0), "1"(res_ssr_1), "2"(res_ssr_2),
+          "3"(res_ssr_3), [ n_frep ] "r"(Nm1) /* input operands */
+        :);
+
+    // End of SSR region.
+    snrt_fpu_fence();
+    snrt_ssr_disable();
+
+    // Sum the four accumulators. res_ssr_0 and res_ssr_2 are both read and
+    // written by the fadd.d instructions, hence the "+f" constraints.
+    asm volatile(
+        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_1] \n"
+        "fadd.d %[res_ssr_2], %[res_ssr_2], %[res_ssr_3] \n"
+        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_2]"
+        : [ res_ssr_0 ] "+f"(res_ssr_0),
+          [ res_ssr_2 ] "+f"(res_ssr_2) /* input/output operands */
+        : [ res_ssr_1 ] "f"(res_ssr_1),
+          [ res_ssr_3 ] "f"(res_ssr_3) /* input operands */
+        :);
+
+    asm volatile("" : : "f"(ft0), "f"(ft1));
+    output[0] = res_ssr_0;
+}
+
+// Cluster-parallel dot product of the n-element vectors x and y.
+// The DM core copies both vectors into TCDM, every compute core reduces a
+// contiguous slice of n / snrt_cluster_compute_core_num() elements with
+// dot_seq_4_acc(), and core 0 sums the per-core partial results (unless
+// _DOTP_EXCLUDE_FINAL_SYNC_ is defined) before the DM core stores the value
+// to *result. Calls the cluster hardware barrier, so all cores of the
+// cluster must call this function.
+static inline void dot(uint32_t n, double *x, double *y, double *result) {
+    double *local_x, *local_y, *partial_sums;
+
+    uint32_t start_cycle, end_cycle;
+
+    // Allocate space in TCDM
+    local_x = (double *)snrt_l1_next();
+    local_y = local_x + n;
+    partial_sums = local_y + n;
+
+    // Copy data in TCDM
+    if (snrt_is_dm_core()) {
+        size_t size = n * sizeof(double);
+        snrt_dma_start_1d(local_x, x, size);
+        snrt_dma_start_1d(local_y, y, size);
+        snrt_dma_wait_all();
+    }
+
+    // Calculate size and pointers for each core
+    int core_idx = snrt_cluster_core_idx();
+    int frac_core = n / snrt_cluster_compute_core_num();
+    int offset_core = core_idx * frac_core;
+    local_x += offset_core;
+    local_y += offset_core;
+
+    snrt_cluster_hw_barrier();
+
+    start_cycle = snrt_mcycle();
+
+    // Compute partial sums
+    if (snrt_is_compute_core()) {
+        dot_seq_4_acc(frac_core, local_x, local_y, &partial_sums[core_idx]);
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Reduce partial sums on core 0
+#ifndef _DOTP_EXCLUDE_FINAL_SYNC_
+    if (snrt_cluster_core_idx() == 0) {
+        for (uint32_t i = 1; i < snrt_cluster_compute_core_num(); i++) {
+            partial_sums[0] += partial_sums[i];
+        }
+        snrt_fpu_fence();
+    }
+#endif
+
+    end_cycle = snrt_mcycle();
+
+    snrt_cluster_hw_barrier();
+
+    // Copy data out of TCDM
+    if (snrt_is_dm_core()) {
+        *result = partial_sums[0];
+    }
+
+    snrt_cluster_hw_barrier();
+}
diff --git a/sw/blas/dot/src/main.c b/sw/blas/dot/src/main.c
new file mode 100644
index 000000000..44620c210
--- /dev/null
+++
b/sw/blas/dot/src/main.c
@@ -0,0 +1,33 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "snrt.h"
+
+#include "data.h"
+#include "dot.h"
+
+int main() {
+    dot(n, x, y, &result);
+
+// TODO: currently only works for single cluster otherwise need to
+//       synchronize all cores here
+#ifdef BIST
+    uint32_t nerr = 1;
+
+    // Check computation is correct. The kernel accumulates per core and per
+    // accumulator, so its rounding generally differs from the golden model;
+    // compare with a relative tolerance (same value as in verify.py).
+    if (snrt_global_core_idx() == 0) {
+        double err = result - g;
+        double ref = g;
+        if (err < 0) err = -err;
+        if (ref < 0) ref = -ref;
+        if (err <= 1e-10 * ref) nerr--;
+        return nerr;
+    }
+
+#endif
+
+    return 0;
+}
diff --git a/sw/blas/gemm/.gitignore b/sw/blas/gemm/.gitignore
deleted file mode 100644
index f5ac16baa..000000000
--- a/sw/blas/gemm/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk
index 1415fcb4e..28ab6668d 100644
--- a/target/snitch_cluster/sw.mk
+++ b/target/snitch_cluster/sw.mk
@@ -42,6 +42,7 @@ APPS = sw/apps/lto
 APPS += sw/apps/nop
 APPS += sw/apps/blas/axpy
 APPS += sw/apps/blas/gemm
+APPS += sw/apps/blas/dot
 APPS += sw/apps/dnn/batchnorm
 APPS += sw/apps/dnn/conv2d
 APPS += sw/apps/dnn/fusedconv
diff --git a/target/snitch_cluster/sw/apps/blas/dot/Makefile b/target/snitch_cluster/sw/apps/blas/dot/Makefile
new file mode 100644
index 000000000..41e58a534
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/blas/dot/Makefile
@@ -0,0 +1,10 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Matteo Perotti
+
+include ../../../../../../sw/blas/dot/Makefile
+include ../../common.mk
+
+$(DEP): $(DATA_H)
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index e5f07c731..3842e3e70 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -78,6 +78,8 @@ runs:
     cmd: [../../../sw/blas/axpy/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/blas/gemm/build/gemm.elf
     cmd: [../../../sw/blas/gemm/scripts/verify.py, "${sim_bin}", "${elf}"]
+  - elf: apps/blas/dot/build/dot.elf
+    cmd: [../../../sw/blas/dot/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/dnn/batchnorm/build/batchnorm.elf
   - elf: apps/dnn/maxpool/build/maxpool.elf
   # - elf: apps/dnn/conv2d/build/conv2d.elf # Fails with wrong results
diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py
index 9763d416f..2ed621b34 100644
--- a/util/sim/data_utils.py
+++ b/util/sim/data_utils.py
@@ -144,6 +144,18 @@ def format_scalar_definition(dtype, uid, scalar):
     return s
 
 
+def format_scalar_declaration(dtype, uid, alignment=None, section=None):
+    """Format a declaration (without initializer) of the scalar `uid`.
+
+    The variable attributes derived from `alignment` and `section`, if any,
+    are inserted between the identifier and the terminating semicolon.
+    """
+    attributes = _variable_attributes(alignment, section)
+    declaration = f'{_alias_dtype(dtype)} {uid}'
+    terminator = f' {attributes};' if attributes else ';'
+    return declaration + terminator
+
+
 def format_array_initializer(dtype, array):
     s = '{\n'
     array = flatten(array)