sw: Add DOTP benchmark (#151)

Co-authored-by: Luca Colagrande <luca.colagrande3@gmail.com>
pulp-platform · Jun 18, 2024 · dc4a6a6 · dc4a6a6
1 parent 06dcb96
commit dc4a6a6
Show file tree

Hide file tree

Showing 14 changed files with 319 additions and 3 deletions.
diff --git a/sw/blas/.gitignore b/sw/blas/.gitignore
@@ -0,0 +1 @@
+**/data/data.h
diff --git a/sw/blas/axpy/.gitignore b/sw/blas/axpy/.gitignore
diff --git a/sw/blas/blas.h b/sw/blas/blas.h
@@ -5,4 +5,5 @@
 #pragma once
 
 #include "axpy/src/axpy.h"
-#include "gemm/src/gemm.h"
+#include "dot/src/dot.h"
+#include "gemm/src/gemm.h"
diff --git a/sw/blas/dot/Makefile b/sw/blas/dot/Makefile
@@ -0,0 +1,31 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+# Usage of absolute paths is required to externally include this Makefile
+MK_DIR   := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+DATA_DIR := $(realpath $(MK_DIR)/data)
+SRC_DIR  := $(realpath $(MK_DIR)/src)
+
+# Data generation knobs; may be overridden by an including Makefile or on
+# the command line
+DATA_CFG ?= $(DATA_DIR)/params.json
+SECTION  ?=
+
+APP     ?= dot
+SRCS    ?= $(realpath $(SRC_DIR)/main.c)
+# `?=` is recursively expanded, so DATA_H (assigned further below) is only
+# resolved when INCDIRS is actually used
+INCDIRS ?= $(dir $(DATA_H)) $(SRC_DIR)
+
+DATAGEN_PY = $(MK_DIR)/scripts/datagen.py
+DATA_H    ?= $(DATA_DIR)/data.h
+
+# Output directory of the generated header (order-only prerequisite below)
+$(dir $(DATA_H)):
+	mkdir -p $@
+
+# Generate the data header. The output is first written to a temporary file
+# and only renamed on success: redirecting straight into $@ would leave an
+# empty but up-to-date looking header behind whenever the generator fails.
+$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) | $(dir $(DATA_H))
+	$< -c $(DATA_CFG) --section="$(SECTION)" > $@.tmp
+	mv -f $@.tmp $@
+
+.PHONY: clean-data clean
+
+# Remove the generated header and any leftover of an interrupted generation
+clean-data:
+	rm -f $(DATA_H) $(DATA_H).tmp
+
+clean: clean-data
diff --git a/sw/blas/dot/data/params.json b/sw/blas/dot/data/params.json
@@ -0,0 +1,7 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    // Length of the x and y vectors. datagen.py asserts that this is an
+    // integer multiple of 32 (8 * 4).
+    n: 4096
+}
diff --git a/sw/blas/dot/scripts/datagen.py b/sw/blas/dot/scripts/datagen.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import os
+import sys
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+from data_utils import format_scalar_definition, format_array_definition, \
+                       format_scalar_declaration, format_ifdef_wrapper, DataGen  # noqa: E402
+
+
+class DotDataGen(DataGen):
+
+    MIN = -1000
+    MAX = +1000
+    # AXI splits bursts crossing 4KB address boundaries. To minimize
+    # the occurrence of these splits the data should be aligned to 4KB
+    BURST_ALIGNMENT = 4096
+
+    def golden_model(self, x, y):
+        return np.dot(x, y)
+
+    def emit_header(self, **kwargs):
+        header = [super().emit_header()]
+
+        n = kwargs['n']
+        x = np.random.uniform(self.MIN, self.MAX, n)
+        y = np.random.uniform(self.MIN, self.MAX, n)
+        g = self.golden_model(x, y)
+
+        assert (n % (8 * 4)) == 0, "n must be an integer multiple of the number of cores times " \
+                                   "the unrolling factor"
+
+        header += [format_scalar_definition('const uint32_t', 'n', n)]
+        header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT,
+                                           section=kwargs['section'])]
+        header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT,
+                                           section=kwargs['section'])]
+        header += [format_scalar_declaration('double', 'result', alignment=self.BURST_ALIGNMENT,
+                                             section=kwargs['section'])]
+        result_def = format_scalar_definition('double', 'g', g)
+        header += [format_ifdef_wrapper('BIST', result_def)]
+        header = '\n\n'.join(header)
+
+        return header
+
+
+if __name__ == '__main__':
+    sys.exit(DotDataGen().main())
diff --git a/sw/blas/dot/scripts/verify.py b/sw/blas/dot/scripts/verify.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+from pathlib import Path
+from datagen import DotDataGen
+
+sys.path.append(str(Path(__file__).parent / '../../../../util/sim/'))
+from verif_utils import Verifier  # noqa: E402
+
+
+class DotVerifier(Verifier):
+    """Verifier for the DOT benchmark.
+
+    Compares the `result` symbol against the value that DotDataGen's golden
+    model computes from the `x` and `y` symbols.
+    """
+
+    # NOTE(review): presumably consumed by the Verifier base class (not
+    # visible here) to know which symbols are outputs - confirm.
+    OUTPUT_UIDS = ['result']
+
+    def get_actual_results(self):
+        """Return the `result` output symbol, read as `double`."""
+        return self.get_output_from_symbol('result', 'double')
+
+    def get_expected_results(self):
+        """Return the golden model's dot product of the `x` and `y` input symbols."""
+        x = self.get_input_from_symbol('x', 'double')
+        y = self.get_input_from_symbol('y', 'double')
+        return DotDataGen().golden_model(x, y)
+
+    def check_results(self, *args):
+        """Forward to the base-class check, passing `rtol=1e-10`."""
+        return super().check_results(*args, rtol=1e-10)
+
+
+if __name__ == "__main__":
+    sys.exit(DotVerifier().main())
diff --git a/sw/blas/dot/src/dot.h b/sw/blas/dot/src/dot.h
@@ -0,0 +1,145 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "snrt.h"
+
+// Single-accumulator dot product of the n-element vectors x and y; the
+// result is stored to output[0].
+//
+// x is streamed through SSR data mover 0 and y through data mover 1, and a
+// single fmadd.d is repeated under FREP with fs0 as accumulator.
+//
+// Precondition: n >= 1. The FREP count register is loaded with n - 1, which
+// wraps around for n == 0.
+//
+// Declared `static inline` (like `dot` in this header): a plain C99 `inline`
+// definition in a header provides no external definition, so any call that
+// is not inlined would fail to link.
+static inline void dot_seq(uint32_t n, double *x, double *y, double *output) {
+    // Start of SSR region.
+    // Bind ft0/ft1 and mark them as written so the compiler does not keep
+    // its own values in them across the region.
+    register volatile double ft0 asm("ft0");
+    register volatile double ft1 asm("ft1");
+    asm volatile("" : "=f"(ft0), "=f"(ft1));
+
+    // Both streams walk n contiguous doubles
+    snrt_ssr_loop_1d(SNRT_SSR_DM0, n, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM1, n, sizeof(double));
+
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, y);
+
+    // Accumulator, pinned to fs0
+    register volatile double res_ssr asm("fs0") = 0;
+
+    snrt_ssr_enable();
+
+    // FREP is given the iteration count minus one
+    const register uint32_t Nm1 asm("t0") = n - 1;
+    asm volatile(
+        "frep.o %[n_frep], 1, 0, 0 \n"
+        "fmadd.d %0, ft0, ft1, %0"
+        : "=f"(res_ssr) /* output operands */
+        : "f"(ft0), "f"(ft1), "0"(res_ssr),
+          [ n_frep ] "r"(Nm1) /* input operands */
+        :);
+
+    // End of SSR region.
+    // Wait for the FPU before disabling the streams and reading the result
+    snrt_fpu_fence();
+    snrt_ssr_disable();
+    asm volatile("" : : "f"(ft0), "f"(ft1));
+    output[0] = res_ssr;
+}
+
+// Dot product of the n-element vectors x and y using four independent
+// accumulators (fs0-fs3); the result is stored to output[0].
+//
+// x is streamed through SSR data mover 0 and y through data mover 1. The
+// FREP body holds four fmadd.d instructions, one per accumulator, and is
+// repeated n / 4 times; the four partial sums are added up afterwards.
+//
+// Preconditions: n >= 4 and n is a multiple of 4. The FREP count register is
+// loaded with (n >> 2) - 1, which wraps around for n < 4, and the streams
+// are configured for n elements while only 4 * (n >> 2) are consumed.
+//
+// Declared `static inline` (like `dot` in this header): a plain C99 `inline`
+// definition in a header provides no external definition, so any call that
+// is not inlined would fail to link.
+static inline void dot_seq_4_acc(uint32_t n, double *x, double *y,
+                                 double *output) {
+    // Start of SSR region.
+    // Bind ft0/ft1 and mark them as written so the compiler does not keep
+    // its own values in them across the region.
+    register volatile double ft0 asm("ft0");
+    register volatile double ft1 asm("ft1");
+    asm volatile("" : "=f"(ft0), "=f"(ft1));
+
+    // Both streams walk n contiguous doubles
+    snrt_ssr_loop_1d(SNRT_SSR_DM0, n, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM1, n, sizeof(double));
+
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, y);
+
+    // Four accumulators, pinned to fs0-fs3
+    register volatile double res_ssr_0 asm("fs0") = 0;
+    register volatile double res_ssr_1 asm("fs1") = 0;
+    register volatile double res_ssr_2 asm("fs2") = 0;
+    register volatile double res_ssr_3 asm("fs3") = 0;
+
+    snrt_ssr_enable();
+
+    // FREP is given the iteration count minus one; each iteration consumes
+    // four elements of each stream
+    const register uint32_t Nm1 asm("t0") = (n >> 2) - 1;
+    asm volatile(
+        "frep.o %[n_frep], 4, 0, 0 \n"
+        "fmadd.d %0, ft0, ft1, %0 \n"
+        "fmadd.d %1, ft0, ft1, %1 \n"
+        "fmadd.d %2, ft0, ft1, %2 \n"
+        "fmadd.d %3, ft0, ft1, %3"
+        : "=f"(res_ssr_0), "=f"(res_ssr_1), "=f"(res_ssr_2),
+          "=f"(res_ssr_3) /* output operands */
+        : "f"(ft0), "f"(ft1), "0"(res_ssr_0), "1"(res_ssr_1), "2"(res_ssr_2),
+          "3"(res_ssr_3), [ n_frep ] "r"(Nm1) /* input operands */
+        :);
+
+    // End of SSR region.
+    // Wait for the FPU before disabling the streams and reading the sums
+    snrt_fpu_fence();
+    snrt_ssr_disable();
+
+    // Reduce the four partial sums into res_ssr_0. res_ssr_0 and res_ssr_2
+    // are both read and written by these instructions, hence the "+f"
+    // (read-write) constraint rather than a write-only "=f".
+    asm volatile(
+        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_1] \n"
+        "fadd.d %[res_ssr_2], %[res_ssr_2], %[res_ssr_3] \n"
+        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_2]"
+        : [ res_ssr_0 ] "+f"(res_ssr_0),
+          [ res_ssr_2 ] "+f"(res_ssr_2) /* read-write operands */
+        : [ res_ssr_1 ] "f"(res_ssr_1),
+          [ res_ssr_3 ] "f"(res_ssr_3) /* input operands */
+        :);
+
+    asm volatile("" : : "f"(ft0), "f"(ft1));
+    output[0] = res_ssr_0;
+}
+
+// Cluster-parallel dot product of the n-element vectors x and y.
+//
+// The DM core copies x and y into L1 via DMA, every compute core runs
+// dot_seq_4_acc on its own contiguous slice, and the core with cluster index
+// 0 adds up the per-core partial sums. The DM core finally stores the sum to
+// *result.
+//
+// Each compute core processes n / snrt_cluster_compute_core_num() elements
+// (integer division), so any remainder of that division is not processed.
+// The per-core slice length is further subject to the preconditions of
+// dot_seq_4_acc.
+static inline void dot(uint32_t n, double *x, double *y, double *result) {
+    double *local_x, *local_y, *partial_sums;
+
+    // Written below but not read again in this function.
+    // NOTE(review): presumably the snrt_mcycle() calls only serve to delimit
+    // the compute region for profiling - confirm.
+    uint32_t start_cycle, end_cycle;
+
+    // Allocate space in TCDM
+    // Layout from the address returned by snrt_l1_next():
+    // x (n doubles), y (n doubles), then one partial sum per core index
+    local_x = (double *)snrt_l1_next();
+    local_y = local_x + n;
+    partial_sums = local_y + n;
+
+    // Copy data in TCDM
+    if (snrt_is_dm_core()) {
+        size_t size = n * sizeof(double);
+        snrt_dma_start_1d(local_x, x, size);
+        snrt_dma_start_1d(local_y, y, size);
+        snrt_dma_wait_all();
+    }
+
+    // Calculate size and pointers for each core
+    int core_idx = snrt_cluster_core_idx();
+    int frac_core = n / snrt_cluster_compute_core_num();
+    int offset_core = core_idx * frac_core;
+    local_x += offset_core;
+    local_y += offset_core;
+
+    // The compute cores must not start before the DM core's transfers above
+    // have completed
+    snrt_cluster_hw_barrier();
+
+    start_cycle = snrt_mcycle();
+
+    // Compute partial sums
+    if (snrt_is_compute_core()) {
+        dot_seq_4_acc(frac_core, local_x, local_y, &partial_sums[core_idx]);
+    }
+
+    // All partial sums must be written before they are reduced
+    snrt_cluster_hw_barrier();
+
+    // Reduce partial sums on core 0
+    // (skipped entirely when _DOTP_EXCLUDE_FINAL_SYNC_ is defined, in which
+    // case partial_sums[0] only holds the partial sum of core index 0)
+#ifndef _DOTP_EXCLUDE_FINAL_SYNC_
+    if (snrt_cluster_core_idx() == 0) {
+        for (uint32_t i = 1; i < snrt_cluster_compute_core_num(); i++) {
+            partial_sums[0] += partial_sums[i];
+        }
+        snrt_fpu_fence();
+    }
+#endif
+
+    end_cycle = snrt_mcycle();
+
+    // The DM core may only read partial_sums[0] once the reduction is done
+    snrt_cluster_hw_barrier();
+
+    // Copy data out of TCDM
+    if (snrt_is_dm_core()) {
+        *result = partial_sums[0];
+    }
+
+    snrt_cluster_hw_barrier();
+}
diff --git a/sw/blas/dot/src/main.c b/sw/blas/dot/src/main.c
@@ -0,0 +1,27 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "snrt.h"
+
+#include "data.h"
+#include "dot.h"
+
+// Runs the DOT kernel on the generated data. When BIST is defined, the core
+// with global index 0 additionally checks `result` against the golden value
+// `g` and returns the number of errors (0 or 1); all other paths return 0.
+int main() {
+    dot(n, x, y, &result);
+
+// TODO: currently only works for single cluster otherwise need to
+//       synchronize all cores here
+#ifdef BIST
+    uint32_t nerr = 1;
+
+    // Check computation is correct
+    if (snrt_global_core_idx() == 0) {
+        // Compare with a relative tolerance instead of exact equality: the
+        // kernel accumulates in a different order than the golden model, so
+        // the two doubles generally do not match bit for bit. The tolerance
+        // mirrors the rtol=1e-10 used by scripts/verify.py. Absolute values
+        // are computed by hand to avoid depending on a math library.
+        double abs_err = result - g;
+        if (abs_err < 0) abs_err = -abs_err;
+        double abs_ref = (g < 0) ? -g : g;
+        if (abs_err <= 1e-10 * abs_ref) nerr--;
+        return nerr;
+    }
+
+#endif
+
+    return 0;
+}
diff --git a/sw/blas/gemm/.gitignore b/sw/blas/gemm/.gitignore
diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk
@@ -42,6 +42,7 @@ APPS  = sw/apps/lto
 APPS += sw/apps/nop
 APPS += sw/apps/blas/axpy
 APPS += sw/apps/blas/gemm
+APPS += sw/apps/blas/dot
 APPS += sw/apps/dnn/batchnorm
 APPS += sw/apps/dnn/conv2d
 APPS += sw/apps/dnn/fusedconv

diff --git a/target/snitch_cluster/sw/apps/blas/dot/Makefile b/target/snitch_cluster/sw/apps/blas/dot/Makefile
@@ -0,0 +1,10 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Matteo Perotti <mperotti@iis.ee.ethz.ch>
+
+# Application definitions (APP, SRCS, INCDIRS) and the rule generating
+# DATA_H come from the shared DOT Makefile
+include ../../../../../../sw/blas/dot/Makefile
+include ../../common.mk
+
+# Generate the data header before the dependency file is built.
+# NOTE(review): DEP is not defined in this file or in the DOT Makefile;
+# presumably it is provided by common.mk - confirm.
+$(DEP): $(DATA_H)
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
@@ -78,6 +78,8 @@ runs:
     cmd: [../../../sw/blas/axpy/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/blas/gemm/build/gemm.elf
     cmd: [../../../sw/blas/gemm/scripts/verify.py, "${sim_bin}", "${elf}"]
+  - elf: apps/blas/dot/build/dot.elf
+    cmd: [../../../sw/blas/dot/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/dnn/batchnorm/build/batchnorm.elf
   - elf: apps/dnn/maxpool/build/maxpool.elf
   # - elf: apps/dnn/conv2d/build/conv2d.elf # Fails with wrong results

diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py
@@ -144,6 +144,16 @@ def format_scalar_definition(dtype, uid, scalar):
     return s
 
 
+def format_scalar_declaration(dtype, uid, alignment=None, section=None):
+    attributes = _variable_attributes(alignment, section)
+    s = f'{_alias_dtype(dtype)} {uid}'
+    if attributes:
+        s += f' {attributes};'
+    else:
+        s += ';'
+    return s
+
+
 def format_array_initializer(dtype, array):
     s = '{\n'
     array = flatten(array)