Skip to content

Commit

Permalink
hw: Add multi-layer omega TCDM interconnects (#94)
Browse files Browse the repository at this point in the history
* hw: Add multi-level Omega networks to TCDM interconnect

* ci: Test OmegaNet TCDM interconnect

* hw: Sensible defaults for omega TCDM, clarify TODOs

---------

Co-authored-by: Luca Colagrande <luca.colagrande3@gmail.com>
  • Loading branch information
paulsc96 and colluca authored Feb 16, 2024
1 parent 2f532e9 commit 98ac9f1
Show file tree
Hide file tree
Showing 6 changed files with 285 additions and 26 deletions.
8 changes: 8 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,11 @@ snitch-cluster-fdiv-vsim:
- make CFG_OVERRIDE=cfg/fdiv.hjson sw
- make bin/snitch_cluster.vsim
- ./run.py sw/fdiv.yaml --simulator vsim -j --run-dir runs/vsim

# Test OmegaNet TCDM interconnect
# Same flow as the other vsim jobs, but with the cluster configuration
# overridden by cfg/omega.hjson (which selects the OmegaNet TCDM topology).
snitch-cluster-omega-vsim:
script:
- cd target/snitch_cluster
# Apply the OmegaNet configuration override while making the `sw` target
- make CFG_OVERRIDE=cfg/omega.hjson sw
- make bin/snitch_cluster.vsim
# Run the tests listed in sw/run.yaml on the vsim simulator
- ./run.py sw/run.yaml --simulator vsim -j --run-dir runs/vsim
23 changes: 23 additions & 0 deletions docs/schema/snitch_cluster.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,29 @@
16,
32
]
},
"topology": {
"type": "string",
"description": "Network topology used for TCDM interconnect.",
"enum": ["LogarithmicInterconnect", "OmegaNet"],
"default": "LogarithmicInterconnect"
},
"radix": {
"type": "number",
"description": "Radix of switches in switch-based TCDM interconnect topologies (ignored for logarithmic interconnect).",
"enum": [2, 4],
"default": 2
},
"num_switch_nets": {
"type": "number",
"description": "Number of parallel networks in switch-based TCDM interconnect topologies (ignored for logarithmic interconnect).",
"minimum": 1,
"default": 4
},
"switch_lfsr_arbiter": {
"type": "boolean",
"description": "Whether to use pseudorandom (LFSR-generated) arbitration in switch-based TCDM interconnect topologies instead of pseudo-round-robin (ignored for logarithmic interconnect).",
"default": false
}
}
},
Expand Down
8 changes: 7 additions & 1 deletion hw/snitch_cluster/src/snitch_cluster.sv
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,10 @@ module snitch_cluster
/// Radix of the individual switch points of the network.
/// Currently supported are `32'd2` and `32'd4`.
parameter int unsigned Radix = 32'd2,
/// Number of parallel networks for switch-based TCDM interconnect.
parameter int unsigned NumSwitchNets = 32'd2,
/// Whether to use an LFSR to arbitrate switch-based TCDM networks.
parameter bit SwitchLfsrArbiter = 1'b0,
/// ## Timing Tuning Parameters
/// Insert Pipeline registers into off-loading path (request)
parameter bit RegisterOffloadReq = 1'b0,
Expand Down Expand Up @@ -798,7 +802,9 @@ module snitch_cluster
.user_t (tcdm_user_t),
.MemoryResponseLatency (1 + RegisterTCDMCuts),
.Radix (Radix),
.Topology (Topology)
.Topology (Topology),
.NumSwitchNets (NumSwitchNets),
.SwitchLfsrArbiter (SwitchLfsrArbiter)
) i_tcdm_interconnect (
.clk_i,
.rst_ni,
Expand Down
6 changes: 4 additions & 2 deletions hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,10 @@ module ${cfg['name']}_wrapper (
.SsrCfgs (${cfg['pkg_name']}::SsrCfgs),
.NumSequencerInstr (NumSequencerInstr),
.Hive (${cfg['pkg_name']}::Hive),
.Topology (snitch_pkg::LogarithmicInterconnect),
.Radix (2),
.Topology (snitch_pkg::${cfg['tcdm']['topology']}),
.Radix (${int(cfg['tcdm']['radix'])}),
.NumSwitchNets (${int(cfg['tcdm']['num_switch_nets'])}),
.SwitchLfsrArbiter (${int(cfg['tcdm']['switch_lfsr_arbiter'])}),
.RegisterOffloadReq (${int(cfg['timing']['register_offload_req'])}),
.RegisterOffloadRsp (${int(cfg['timing']['register_offload_rsp'])}),
.RegisterCoreReq (${int(cfg['timing']['register_core_req'])}),
Expand Down
138 changes: 115 additions & 23 deletions hw/snitch_cluster/src/snitch_tcdm_interconnect.sv
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ module snitch_tcdm_interconnect #(
/// Radix of the individual switch points of the network.
/// Currently supported are `32'd2` and `32'd4`.
parameter int unsigned Radix = 32'd2,
/// Number of parallel networks for switch-based interconnects.
parameter int unsigned NumSwitchNets = 32'd2,
/// Whether to use an LFSR to arbitrate switch-based networks.
parameter bit SwitchLfsrArbiter = 1'b0,
/// Payload type of the data request ports.
parameter type tcdm_req_t = logic,
/// Payload type of the data response ports.
Expand Down Expand Up @@ -133,29 +137,117 @@ module snitch_tcdm_interconnect #(
.ready_i ( mem_q_ready_flat )
);
end else if (Topology == snitch_pkg::OmegaNet) begin : gen_omega_net
  // Switch-based TCDM interconnect built from two stages:
  //   1. `NumSwitchNets` parallel Omega networks. Network `i` serves the
  //      contiguous slice of `NumInpPerNet` request ports starting at
  //      `i * NumInpPerNet` and can reach all `NumOut` outputs.
  //   2. One `rr_arb_tree` per output, merging the `NumSwitchNets` network
  //      outputs that target it into the single memory request port.
  localparam int unsigned NumInpPerNet = cf_math_pkg::ceil_div(NumInp, NumSwitchNets);

  // Intermediate request signals for Omega-to-Xbar interface
  mem_req_chan_t [NumSwitchNets-1:0][NumOut-1:0] oout_data;
  logic          [NumSwitchNets-1:0][NumOut-1:0] oout_valid, oout_ready;

  // Arbitration for Omega and Xbar stages, respectively
  logic [cf_math_pkg::idx_width(NumOut)-1:0]        rr1;
  logic [cf_math_pkg::idx_width(NumSwitchNets)-1:0] rr2;

  // Use pseudorandom arbitration if desired. For reference, see:
  // https://github.com/pulp-platform/cluster_interconnect/blob/master/rtl/tcdm_interconnect/tcdm_interconnect.sv
  // NOTE: the generate labels below spell "lsfr" (sic); they are kept as-is
  // so that existing hierarchical paths remain valid.
  if (SwitchLfsrArbiter) begin : gen_omega_lsfr
    // Number of low LFSR bits consumed by the second-stage arbiters.
    // This is zero when there is only a single network.
    localparam int unsigned NetSelWidth = $clog2(NumSwitchNets);

    logic [cf_math_pkg::idx_width(NumInp)-1:0] rr;
    lfsr #(
      .LfsrWidth    ( 64                             ),
      // Match the declared width of `rr` (identical to `$clog2(NumInp)` for
      // `NumInp > 1`, but still a legal non-zero width for `NumInp == 1`).
      .OutWidth     ( cf_math_pkg::idx_width(NumInp) ),
      .CipherLayers ( 3                              ),
      .CipherReg    ( 1'b1                           )
    ) i_lfsr (
      .clk_i,
      .rst_ni,
      // Advance the LFSR whenever at least one request port handshakes.
      .en_i         ( |(req_q_valid_flat & rsp_q_ready_flat) ),
      .out_o        ( rr                                     )
    );
    // The upper bits of `rr1` are truncated iff not needed in Butterfly networks.
    assign rr1 = rr[$high(rr):NetSelWidth];
    // With a single network there is nothing to select between and the part
    // select `rr[NetSelWidth-1:0]` would degenerate to the illegal `rr[-1:0]`,
    // so tie the (1-bit wide) priority off instead.
    if (NetSelWidth > 0) begin : gen_rr2_lfsr
      assign rr2 = rr[NetSelWidth-1:0];
    end else begin : gen_rr2_tied
      assign rr2 = '0;
    end
  end else begin : gen_no_omega_lsfr
    assign rr1 = '0;
    assign rr2 = '0;
  end

  // Work around enum incompatibility and expand signals for part selects
  typedef logic [$bits(mem_req_chan_t)-1:0] data_t;
  typedef data_t [NumSwitchNets-1:0][NumInpPerNet-1:0] flat_data_t;
  typedef logic  [NumSwitchNets-1:0][NumInpPerNet-1:0] flat_hs_t;

  flat_data_t data_in;
  select_t [NumSwitchNets-1:0][NumInpPerNet-1:0] in_sel;
  flat_hs_t in_valid, in_ready;

  // If `NumInp` is not a multiple of `NumSwitchNets`, the two-dimensional
  // views are wider than the flat request vectors (assumed to be `NumInp`
  // entries wide -- their declarations are outside this block). The
  // assignments then zero-extend, which ties off the surplus upper ports
  // (valid = 0), and the surplus `in_ready` bits are dropped.
  assign data_in          = in_req;
  assign in_sel           = bank_select;
  assign in_valid         = req_q_valid_flat;
  assign rsp_q_ready_flat = in_ready;

  // Generate Omega networks (first stage)
  // TODO: Ideally, we should balance the tie-off of unused ports across networks,
  // minimizing request imbalance and maximizing performance.
  for (genvar i = 0; i < NumSwitchNets; ++i) begin : gen_omega_nets
    data_t [NumOut-1:0] data_out;
    assign oout_data[i] = data_out;
    stream_omega_net #(
      .NumInp    ( NumInpPerNet      ),
      .NumOut    ( NumOut            ),
      .payload_t ( data_t            ),
      .ExtPrio   ( SwitchLfsrArbiter ),
      .SpillReg  ( 1'b0              ),
      .AxiVldRdy ( 1'b1              ),
      .LockIn    ( 1'b1              ),
      .Radix     ( Radix             )
    ) i_stream_omega_net (
      .clk_i,
      .rst_ni,
      .flush_i ( 1'b0          ),
      // TODO: switch-level arbitration currently unconnected inside `stream_omega_net`
      .rr_i    ( /*rr1*/ '0    ),
      .sel_i   ( in_sel[i]     ),
      .data_i  ( data_in[i]    ),
      .valid_i ( in_valid[i]   ),
      .ready_o ( in_ready[i]   ),
      .data_o  ( data_out      ),
      .valid_o ( oout_valid[i] ),
      .ready_i ( oout_ready[i] ),
      .idx_o   (               )
    );
  end

  // Generate per-output multiplexers (second stage)
  for (genvar i = 0; i < NumOut; ++i) begin : gen_out_arbs
    mem_req_chan_t [NumSwitchNets-1:0] rrin_data;
    logic          [NumSwitchNets-1:0] rrin_valid, rrin_ready;

    // Bundle Omega net request channels for this bank
    for (genvar k = 0; k < NumSwitchNets; ++k) begin : gen_rrin_in
      assign rrin_data[k]     = oout_data[k][i];
      assign rrin_valid[k]    = oout_valid[k][i];
      assign oout_ready[k][i] = rrin_ready[k];
    end

    rr_arb_tree #(
      .NumIn     ( NumSwitchNets     ),
      .DataType  ( mem_req_chan_t    ),
      .ExtPrio   ( SwitchLfsrArbiter ),
      .AxiVldRdy ( 1'b1              ),
      .LockIn    ( 1'b1              )
    ) i_rr_arb_tree (
      .clk_i,
      .rst_ni,
      .flush_i ( 1'b0                ),
      .rr_i    ( rr2                 ),
      .data_i  ( rrin_data           ),
      .req_i   ( rrin_valid          ),
      .gnt_o   ( rrin_ready          ),
      .data_o  ( out_req[i]          ),
      .req_o   ( mem_q_valid_flat[i] ),
      .gnt_i   ( mem_q_ready_flat[i] ),
      .idx_o   (                     )
    );
  end
end

// -------------
Expand Down
128 changes: 128 additions & 0 deletions target/snitch_cluster/cfg/omega.hjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Cluster configuration for a simple testbench system using the OmegaNet
// TCDM interconnect topology.
{
nr_s1_quadrant: 1,
s1_quadrant: {
nr_clusters: 1,
},

cluster: {
boot_addr: 4096, // 0x1000
cluster_base_addr: 268435456, // 0x1000_0000
cluster_base_offset: 0, // 0x0
cluster_base_hartid: 0,
addr_width: 48,
data_width: 64,
tcdm: {
size: 128,
banks: 32,
topology: OmegaNet
},
cluster_periph_size: 64, // kB
zero_mem_size: 64, // kB
dma_data_width: 512,
dma_axi_req_fifo_depth: 3,
dma_req_fifo_depth: 3,
// Timing parameters
timing: {
lat_comp_fp32: 3,
lat_comp_fp64: 3,
lat_comp_fp16: 2,
lat_comp_fp16_alt: 2,
lat_comp_fp8: 1,
lat_comp_fp8_alt: 1,
lat_noncomp: 1,
lat_conv: 2,
lat_sdotp: 3,
fpu_pipe_config: "BEFORE"
narrow_xbar_latency: "CUT_ALL_PORTS",
wide_xbar_latency: "CUT_ALL_PORTS",
// Isolate the core.
register_core_req: true,
register_core_rsp: true,
register_offload_req: true,
register_offload_rsp: true
},
hives: [
// Hive 0
{
icache: {
size: 8, // total instruction cache size in kByte
sets: 2, // number of ways
cacheline: 256 // word size in bits
},
cores: [
{ $ref: "#/compute_core_template" },
{ $ref: "#/compute_core_template" },
{ $ref: "#/compute_core_template" },
{ $ref: "#/compute_core_template" },
{ $ref: "#/compute_core_template" },
{ $ref: "#/compute_core_template" },
{ $ref: "#/compute_core_template" },
{ $ref: "#/compute_core_template" },
{ $ref: "#/dma_core_template" },
]
}
]
},
dram: {
// 0x8000_0000
address: 2147483648,
// 0x8000_0000
length: 2147483648
},
peripherals: {
clint: {
// 0xffff_0000
address: 4294901760,
// 0x0000_1000
length: 4096
},
},
// Templates.
compute_core_template: {
isa: "rv32imafd",
xssr: true,
xfrep: true,
xdma: false,
xf16: true,
xf16alt: true,
xf8: true,
xf8alt: true,
xfdotp: true,
xfvec: true,
num_int_outstanding_loads: 1,
num_int_outstanding_mem: 4,
num_fp_outstanding_loads: 4,
num_fp_outstanding_mem: 4,
num_sequencer_instructions: 16,
num_dtlb_entries: 1,
num_itlb_entries: 1,
// Enable division/square root unit
// Xdiv_sqrt: true,
},
dma_core_template: {
isa: "rv32imafd",
// Xdiv_sqrt: true,
# isa: "rv32ema",
xdma: true
xssr: false
xfrep: false
xf16: false,
xf16alt: false,
xf8: false,
xf8alt: false,
xfdotp: false,
xfvec: false,
num_int_outstanding_loads: 1,
num_int_outstanding_mem: 4,
num_fp_outstanding_loads: 4,
num_fp_outstanding_mem: 4,
num_sequencer_instructions: 16,
num_dtlb_entries: 1,
num_itlb_entries: 1,
}
}

0 comments on commit 98ac9f1

Please sign in to comment.