Skip to content

Commit

Permalink
sw: Add DOTP benchmark (#151)
Browse files Browse the repository at this point in the history
Co-authored-by: Luca Colagrande <luca.colagrande3@gmail.com>
  • Loading branch information
mp-17 and colluca authored Jun 18, 2024
1 parent 06dcb96 commit dc4a6a6
Show file tree
Hide file tree
Showing 14 changed files with 319 additions and 3 deletions.
1 change: 1 addition & 0 deletions sw/blas/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
**/data/data.h
1 change: 0 additions & 1 deletion sw/blas/axpy/.gitignore

This file was deleted.

3 changes: 2 additions & 1 deletion sw/blas/blas.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
#pragma once

#include "axpy/src/axpy.h"
#include "gemm/src/gemm.h"
#include "dot/src/dot.h"
#include "gemm/src/gemm.h"
31 changes: 31 additions & 0 deletions sw/blas/dot/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2024 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

# Usage of absolute paths is required to externally include this Makefile
MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
DATA_DIR := $(realpath $(MK_DIR)/data)
SRC_DIR := $(realpath $(MK_DIR)/src)

# Data-generation knobs, overridable from the command line or the including
# Makefile: JSON configuration passed to the generator via -c, and the value
# forwarded through --section.
DATA_CFG ?= $(DATA_DIR)/params.json
SECTION ?=

# Application name, sources and include directories. INCDIRS refers to DATA_H,
# which is only assigned further down: this works because `?=` defines a
# recursively-expanded variable, so DATA_H is resolved when INCDIRS is used.
APP ?= dot
SRCS ?= $(realpath $(SRC_DIR)/main.c)
INCDIRS ?= $(dir $(DATA_H)) $(SRC_DIR)

# Generator script and the header it produces
DATAGEN_PY = $(MK_DIR)/scripts/datagen.py
DATA_H ?= $(DATA_DIR)/data.h

# Output directory of the generated header (order-only prerequisite below, so
# its timestamp never triggers a regeneration)
$(dir $(DATA_H)):
	mkdir -p $@

# Generate the data header. The generator's output is first written to a
# temporary file and renamed only on success; redirecting straight into $@
# would leave a truncated, newer-than-its-prerequisites header behind whenever
# the generator fails, and subsequent builds would silently use it.
$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) | $(dir $(DATA_H))
	$< -c $(DATA_CFG) --section="$(SECTION)" > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@

.PHONY: clean-data clean

# Remove the generated header (and any stale temporary from an aborted run)
clean-data:
	rm -f $(DATA_H) $(DATA_H).tmp

clean: clean-data
7 changes: 7 additions & 0 deletions sw/blas/dot/data/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Copyright 2024 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

{
n: 4096
}
52 changes: 52 additions & 0 deletions sw/blas/dot/scripts/datagen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# Copyright 2024 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

import numpy as np
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
from data_utils import format_scalar_definition, format_array_definition, \
format_scalar_declaration, format_ifdef_wrapper, DataGen # noqa: E402


class DotDataGen(DataGen):
    """Data generator for the DOT kernel.

    Emits a C header with the vector length ``n``, two random input vectors
    ``x`` and ``y``, a declaration of the ``result`` variable and, wrapped in
    a ``BIST`` ifdef, the golden result ``g``.
    """

    # Bounds of the uniform distribution the input vectors are drawn from
    MIN = -1000
    MAX = +1000
    # AXI splits bursts crossing 4KB address boundaries. To minimize
    # the occurrence of these splits the data should be aligned to 4KB
    BURST_ALIGNMENT = 4096

    def golden_model(self, x, y):
        """Return the reference dot product of ``x`` and ``y``."""
        return np.dot(x, y)

    def emit_header(self, **kwargs):
        """Build and return the contents of the data header as a string.

        Reads ``n`` (vector length) and ``section`` from ``kwargs``.
        """
        parts = [super().emit_header()]

        length = kwargs['n']
        vec_x = np.random.uniform(self.MIN, self.MAX, length)
        vec_y = np.random.uniform(self.MIN, self.MAX, length)
        golden = self.golden_model(vec_x, vec_y)

        # 8 * 4: number of cores times the unrolling factor
        if length % (8 * 4) != 0:
            raise AssertionError("n must be an integer multiple of the number of cores times "
                                 "the unrolling factor")

        section = kwargs['section']
        parts.append(format_scalar_definition('const uint32_t', 'n', length))
        # Inputs and output share the same alignment and section
        parts.append(format_array_definition('double', 'x', vec_x,
                                             alignment=self.BURST_ALIGNMENT,
                                             section=section))
        parts.append(format_array_definition('double', 'y', vec_y,
                                             alignment=self.BURST_ALIGNMENT,
                                             section=section))
        parts.append(format_scalar_declaration('double', 'result',
                                               alignment=self.BURST_ALIGNMENT,
                                               section=section))
        # The golden value is only compiled in when BIST is defined
        golden_def = format_scalar_definition('double', 'g', golden)
        parts.append(format_ifdef_wrapper('BIST', golden_def))

        return '\n\n'.join(parts)


if __name__ == '__main__':
sys.exit(DotDataGen().main())
31 changes: 31 additions & 0 deletions sw/blas/dot/scripts/verify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
# Copyright 2024 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

import sys
from pathlib import Path
from datagen import DotDataGen

sys.path.append(str(Path(__file__).parent / '../../../../util/sim/'))
from verif_utils import Verifier # noqa: E402


class DotVerifier(Verifier):
    """Verifier for the DOT kernel.

    Compares the ``result`` symbol against the golden model evaluated on the
    ``x`` and ``y`` input symbols.
    """

    OUTPUT_UIDS = ['result']

    def get_actual_results(self):
        """Return the value of the ``result`` symbol, read as ``double``."""
        return self.get_output_from_symbol('result', 'double')

    def get_expected_results(self):
        """Return the golden dot product of the ``x`` and ``y`` input symbols."""
        operands = [self.get_input_from_symbol(uid, 'double') for uid in ('x', 'y')]
        return DotDataGen().golden_model(*operands)

    def check_results(self, *args):
        """Delegate to the base-class check with a relative tolerance of 1e-10."""
        return super().check_results(*args, rtol=1e-10)


if __name__ == "__main__":
sys.exit(DotVerifier().main())
145 changes: 145 additions & 0 deletions sw/blas/dot/src/dot.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// Copyright 2024 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#include "snrt.h"

/**
 * Single-accumulator dot product: accumulates x[i] * y[i] over n elements
 * and stores the sum in output[0].
 *
 * Declared `static inline` (like `dot` in this header): a plain `inline`
 * definition in a header provides no external definition under C99 inline
 * semantics, so any call the compiler chooses not to inline would fail to
 * link.
 *
 * @param n      Number of elements. Must be >= 1: the repetition count is
 *               computed as `n - 1` in unsigned arithmetic and wraps around
 *               for n == 0.
 * @param x      First input vector, streamed through SNRT_SSR_DM0.
 * @param y      Second input vector, streamed through SNRT_SSR_DM1.
 * @param output Destination of the scalar result (only output[0] is written).
 */
static inline void dot_seq(uint32_t n, double *x, double *y, double *output) {
    // Start of SSR region.
    // Pin ft0/ft1 and mark them as written so the compiler does not use them
    // for anything else while the SSR region is active.
    register volatile double ft0 asm("ft0");
    register volatile double ft1 asm("ft1");
    asm volatile("" : "=f"(ft0), "=f"(ft1));

    // Both streams walk n contiguous doubles
    snrt_ssr_loop_1d(SNRT_SSR_DM0, n, sizeof(double));
    snrt_ssr_loop_1d(SNRT_SSR_DM1, n, sizeof(double));

    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x);
    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, y);

    // Accumulator, pinned to fs0
    register volatile double res_ssr asm("fs0") = 0;

    snrt_ssr_enable();

    // The repetition count handed to frep is n - 1
    const register uint32_t Nm1 asm("t0") = n - 1;
    asm volatile(
        "frep.o %[n_frep], 1, 0, 0 \n"
        "fmadd.d %0, ft0, ft1, %0"
        : "=f"(res_ssr) /* output operands */
        : "f"(ft0), "f"(ft1), "0"(res_ssr),
          [ n_frep ] "r"(Nm1) /* input operands */
        :);

    // End of SSR region.
    snrt_fpu_fence();
    snrt_ssr_disable();
    // Keep ft0/ft1 live up to this point
    asm volatile("" : : "f"(ft0), "f"(ft1));
    output[0] = res_ssr;
}

/**
 * Dot product with four independent accumulators: the loop body is unrolled
 * by four, each fmadd.d accumulating into its own register, and the four
 * partial sums are added together at the end. The sum is stored in output[0].
 *
 * Declared `static inline` (like `dot` in this header): a plain `inline`
 * definition in a header provides no external definition under C99 inline
 * semantics, so any call the compiler chooses not to inline would fail to
 * link.
 *
 * @param n      Number of elements. The repetition count is computed as
 *               `(n >> 2) - 1` in unsigned arithmetic, so n must be >= 4
 *               (it wraps around otherwise), and only 4 * (n >> 2) products
 *               are accumulated, so n should be a multiple of 4.
 * @param x      First input vector, streamed through SNRT_SSR_DM0.
 * @param y      Second input vector, streamed through SNRT_SSR_DM1.
 * @param output Destination of the scalar result (only output[0] is written).
 */
static inline void dot_seq_4_acc(uint32_t n, double *x, double *y,
                                 double *output) {
    // Start of SSR region.
    // Pin ft0/ft1 and mark them as written so the compiler does not use them
    // for anything else while the SSR region is active.
    register volatile double ft0 asm("ft0");
    register volatile double ft1 asm("ft1");
    asm volatile("" : "=f"(ft0), "=f"(ft1));

    // Both streams walk n contiguous doubles
    snrt_ssr_loop_1d(SNRT_SSR_DM0, n, sizeof(double));
    snrt_ssr_loop_1d(SNRT_SSR_DM1, n, sizeof(double));

    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x);
    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, y);

    // Four accumulators, pinned to fs0..fs3
    register volatile double res_ssr_0 asm("fs0") = 0;
    register volatile double res_ssr_1 asm("fs1") = 0;
    register volatile double res_ssr_2 asm("fs2") = 0;
    register volatile double res_ssr_3 asm("fs3") = 0;

    snrt_ssr_enable();

    // Four products per repetition: the count handed to frep is n / 4 - 1
    const register uint32_t Nm1 asm("t0") = (n >> 2) - 1;
    asm volatile(
        "frep.o %[n_frep], 4, 0, 0 \n"
        "fmadd.d %0, ft0, ft1, %0 \n"
        "fmadd.d %1, ft0, ft1, %1 \n"
        "fmadd.d %2, ft0, ft1, %2 \n"
        "fmadd.d %3, ft0, ft1, %3"
        : "=f"(res_ssr_0), "=f"(res_ssr_1), "=f"(res_ssr_2),
          "=f"(res_ssr_3) /* output operands */
        : "f"(ft0), "f"(ft1), "0"(res_ssr_0), "1"(res_ssr_1), "2"(res_ssr_2),
          "3"(res_ssr_3), [ n_frep ] "r"(Nm1) /* input operands */
        :);

    // End of SSR region.
    snrt_fpu_fence();
    snrt_ssr_disable();

    // Reduce the four accumulators into res_ssr_0. res_ssr_0 and res_ssr_2
    // are both read and written by these instructions, hence the "+f"
    // (read-write) constraint: "=f" would tell the compiler their previous
    // values are dead on entry to the asm statement.
    asm volatile(
        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_1] \n"
        "fadd.d %[res_ssr_2], %[res_ssr_2], %[res_ssr_3] \n"
        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_2]"
        : [ res_ssr_0 ] "+f"(res_ssr_0),
          [ res_ssr_2 ] "+f"(res_ssr_2) /* read-write operands */
        : [ res_ssr_1 ] "f"(res_ssr_1),
          [ res_ssr_3 ] "f"(res_ssr_3) /* input operands */
        :);

    // Keep ft0/ft1 live up to this point
    asm volatile("" : : "f"(ft0), "f"(ft1));
    output[0] = res_ssr_0;
}

/**
 * Cluster-level dot product of two n-element vectors.
 *
 * The inputs are copied to L1 by the DM core, every compute core reduces its
 * own contiguous slice with dot_seq_4_acc(), and core 0 adds up the per-core
 * partial sums (unless _DOTP_EXCLUDE_FINAL_SYNC_ is defined). The DM core
 * finally writes the value to *result.
 *
 * @param n      Number of elements in x and y. Each compute core processes
 *               n / snrt_cluster_compute_core_num() elements (integer
 *               division).
 * @param x      First input vector.
 * @param y      Second input vector.
 * @param result Destination of the scalar result.
 */
static inline void dot(uint32_t n, double *x, double *y, double *result) {
    uint32_t start_cycle, end_cycle;

    // L1 layout, starting at the next free L1 address:
    // [ copy of x (n) | copy of y (n) | one partial sum per core ]
    double *x_l1 = (double *)snrt_l1_next();
    double *y_l1 = x_l1 + n;
    double *partial_sums = y_l1 + n;

    // The DM core brings both input vectors into L1
    if (snrt_is_dm_core()) {
        size_t n_bytes = n * sizeof(double);
        snrt_dma_start_1d(x_l1, x, n_bytes);
        snrt_dma_start_1d(y_l1, y, n_bytes);
        snrt_dma_wait_all();
    }

    // Slice of the vectors assigned to the calling core
    int core_idx = snrt_cluster_core_idx();
    int elems_per_core = n / snrt_cluster_compute_core_num();
    int first_elem = core_idx * elems_per_core;
    x_l1 += first_elem;
    y_l1 += first_elem;

    // Wait for the input copy to complete on all cores
    snrt_cluster_hw_barrier();

    start_cycle = snrt_mcycle();

    // Every compute core writes its partial sum to its own slot
    if (snrt_is_compute_core()) {
        dot_seq_4_acc(elems_per_core, x_l1, y_l1, &partial_sums[core_idx]);
    }

    snrt_cluster_hw_barrier();

    // Core 0 folds all partial sums into slot 0
#ifndef _DOTP_EXCLUDE_FINAL_SYNC_
    if (snrt_cluster_core_idx() == 0) {
        for (uint32_t i = 1; i < snrt_cluster_compute_core_num(); i++) {
            partial_sums[0] += partial_sums[i];
        }
        snrt_fpu_fence();
    }
#endif

    end_cycle = snrt_mcycle();

    snrt_cluster_hw_barrier();

    // The DM core copies the reduced value out of L1
    if (snrt_is_dm_core()) {
        *result = partial_sums[0];
    }

    snrt_cluster_hw_barrier();
}
27 changes: 27 additions & 0 deletions sw/blas/dot/src/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#include "snrt.h"

#include "data.h"
#include "dot.h"

/**
 * Runs the DOT kernel on the generated data. When BIST is defined, global
 * core 0 additionally compares the result against the golden value `g` and
 * returns the number of errors (0 or 1); all other cores return 0.
 */
int main() {
    dot(n, x, y, &result);

    // TODO: currently only works for single cluster otherwise need to
    // synchronize all cores here
#ifdef BIST
    uint32_t nerr = 1;

    // Check computation is correct
    if (snrt_global_core_idx() == 0) {
        // The kernel accumulates in a different order than the golden model
        // (four accumulators per core, then a cross-core reduction), so the
        // two values are not expected to match bit for bit. Compare with the
        // same relative tolerance used by scripts/verify.py (rtol = 1e-10).
        // Absolute values are computed by hand to avoid depending on math.h.
        double abs_err = result - g;
        if (abs_err < 0) abs_err = -abs_err;
        double abs_ref = (g < 0) ? -g : g;
        if (abs_err <= 1e-10 * abs_ref) nerr--;
        return nerr;
    }

#endif

    return 0;
}
1 change: 0 additions & 1 deletion sw/blas/gemm/.gitignore

This file was deleted.

1 change: 1 addition & 0 deletions target/snitch_cluster/sw.mk
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ APPS = sw/apps/lto
APPS += sw/apps/nop
APPS += sw/apps/blas/axpy
APPS += sw/apps/blas/gemm
APPS += sw/apps/blas/dot
APPS += sw/apps/dnn/batchnorm
APPS += sw/apps/dnn/conv2d
APPS += sw/apps/dnn/fusedconv
Expand Down
10 changes: 10 additions & 0 deletions target/snitch_cluster/sw/apps/blas/dot/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright 2024 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Matteo Perotti <mperotti@iis.ee.ethz.ch>

# Shared DOT kernel Makefile: sets APP, SRCS, INCDIRS and DATA_H and provides
# the rule that generates $(DATA_H).
include ../../../../../../sw/blas/dot/Makefile
# NOTE(review): common.mk is not visible from here; it presumably consumes the
# variables set above and defines $(DEP) — confirm.
include ../../common.mk

# The generated data header must exist before $(DEP) is built.
$(DEP): $(DATA_H)
2 changes: 2 additions & 0 deletions target/snitch_cluster/sw/run.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ runs:
cmd: [../../../sw/blas/axpy/scripts/verify.py, "${sim_bin}", "${elf}"]
- elf: apps/blas/gemm/build/gemm.elf
cmd: [../../../sw/blas/gemm/scripts/verify.py, "${sim_bin}", "${elf}"]
- elf: apps/blas/dot/build/dot.elf
cmd: [../../../sw/blas/dot/scripts/verify.py, "${sim_bin}", "${elf}"]
- elf: apps/dnn/batchnorm/build/batchnorm.elf
- elf: apps/dnn/maxpool/build/maxpool.elf
# - elf: apps/dnn/conv2d/build/conv2d.elf # Fails with wrong results
Expand Down
10 changes: 10 additions & 0 deletions util/sim/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,16 @@ def format_scalar_definition(dtype, uid, scalar):
return s


def format_scalar_declaration(dtype, uid, alignment=None, section=None):
    """Format the declaration (without initializer) of a scalar variable.

    Args:
        dtype: Data type of the variable, passed through `_alias_dtype`.
        uid: Name of the variable.
        alignment: Forwarded to `_variable_attributes`.
        section: Forwarded to `_variable_attributes`.

    Returns:
        The declaration as a string terminated by a semicolon, with the
        variable attributes (if any) placed after the variable name.
    """
    attrs = _variable_attributes(alignment, section)
    declaration = f'{_alias_dtype(dtype)} {uid}'
    # Attributes, when present, are separated from the name by a single space
    terminator = f' {attrs};' if attrs else ';'
    return declaration + terminator


def format_array_initializer(dtype, array):
s = '{\n'
array = flatten(array)
Expand Down

0 comments on commit dc4a6a6

Please sign in to comment.