Skip to content

Commit

Permalink
Updated Performance test (#844)
Browse files Browse the repository at this point in the history
  • Loading branch information
stnolting committed Mar 22, 2024
2 parents c960724 + f297cd4 commit 3d7012b
Show file tree
Hide file tree
Showing 16 changed files with 4,072 additions and 16 deletions.
3 changes: 2 additions & 1 deletion sim/simple/ghdl.run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ if [ -z "$1" ]
then
GHDL_RUN_ARGS="${@:---stop-time=10ms}"
else
GHDL_RUN_ARGS=$1
# Lets pass down all the parameters to GHDL instead of just 1
GHDL_RUN_ARGS=$@
fi

echo "Using simulation run arguments: $GHDL_RUN_ARGS";
Expand Down
3 changes: 2 additions & 1 deletion sim/simple/ghdl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ set -e
cd $(dirname "$0")

./ghdl.setup.sh
./ghdl.run.sh $1
# We want to be able to pass down more than 1 parameter to GHDL
./ghdl.run.sh $@
53 changes: 41 additions & 12 deletions sim/simple/neorv32_tb.simple.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -48,33 +48,62 @@ use neorv32.neorv32_application_image.all; -- this file is generated by the imag
use std.textio.all;

entity neorv32_tb_simple is
generic (
PERFORMANCE_OPTION : natural := 0 -- Set core options for performance measurements
);
end neorv32_tb_simple;

architecture neorv32_tb_simple_rtl of neorv32_tb_simple is

-- advanced configuration --
constant num_configs_c : natural := 3; -- number of pre-defined configurations

-- helpers --
type bool_t is array (0 to num_configs_c-1) of boolean;
type natural_t is array (0 to num_configs_c-1) of natural;
type performance_options_type_t is record
fast_mul_en_c : bool_t;
fast_shift_en_c : bool_t;
imem_size_c : natural_t;
icache_en_c : bool_t;
icache_block_size_c : natural_t;
dcache_en_c : bool_t;
dcache_block_size_c : natural_t;
end record;


-- User Configuration ---------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
-- core performance options --
constant performance_options_c : performance_options_type_t := (
-- default fast core area core
fast_mul_en_c => ( true, true, false), -- Fast multiplication, more area
fast_shift_en_c => ( true, true, false), -- Fast shifting, more area
imem_size_c => ( 32*1024, 128*1024, 128*1024), -- Instruction memory size min. 128kB for performance tests
icache_en_c => ( true, false, false), -- I$ disabled for performance tests
icache_block_size_c => ( 32, 32, 32), -- I$ block size
dcache_en_c => ( true, false, false), -- D$ disabled for performance tests
dcache_block_size_c => ( 32, 32, 32) -- D$ block size
);

-- general --
constant int_imem_c : boolean := true; -- true: use proc-internal IMEM, false: use external simulated IMEM (ext. mem A)
constant int_dmem_c : boolean := true; -- true: use proc-internal DMEM, false: use external simulated DMEM (ext. mem B)
constant imem_size_c : natural := 32*1024; -- size in bytes of processor-internal IMEM / external mem A
constant dmem_size_c : natural := 8*1024; -- size in bytes of processor-internal DMEM / external mem B
constant f_clock_c : natural := 100000000; -- main clock in Hz
constant baud0_rate_c : natural := 19200; -- simulation UART0 (primary UART) baud rate
constant baud1_rate_c : natural := 19200; -- simulation UART1 (secondary UART) baud rate
constant icache_en_c : boolean := true; -- implement i-cache
constant icache_block_size_c : natural := 32; -- i-cache block size in bytes
-- simulated external Wishbone memory A (can be used as external IMEM) --
constant ext_mem_a_base_addr_c : std_ulogic_vector(31 downto 0) := x"00000000"; -- wishbone memory base address (external IMEM base)
constant ext_mem_a_size_c : natural := imem_size_c; -- wishbone memory size in bytes
constant ext_mem_a_size_c : natural := performance_options_c.imem_size_c(PERFORMANCE_OPTION); -- wishbone memory size in bytes
constant ext_mem_a_latency_c : natural := 8; -- latency in clock cycles (min 1, max 255), plus 1 cycle initial delay
-- simulated external Wishbone memory B (can be used as external DMEM) --
constant ext_mem_b_base_addr_c : std_ulogic_vector(31 downto 0) := x"80000000"; -- wishbone memory base address (external DMEM base)
constant ext_mem_b_size_c : natural := dmem_size_c; -- wishbone memory size in bytes
constant ext_mem_b_latency_c : natural := 8; -- latency in clock cycles (min 1, max 255), plus 1 cycle initial delay
-- simulated external Wishbone memory C (can be used to simulate external IO access) --
constant ext_mem_c_base_addr_c : std_ulogic_vector(31 downto 0) := x"F0000000"; -- wishbone memory base address (default begin of EXTERNAL IO area)
constant ext_mem_c_size_c : natural := icache_block_size_c/2; -- wishbone memory size in bytes, should be smaller than an iCACHE block
constant ext_mem_c_size_c : natural := performance_options_c.icache_block_size_c(PERFORMANCE_OPTION)/2; -- wishbone memory size in bytes, should be smaller than an iCACHE block
constant ext_mem_c_latency_c : natural := 128; -- latency in clock cycles (min 1, max 255), plus 1 cycle initial delay
-- simulation interrupt trigger --
constant irq_trigger_base_addr_c : std_ulogic_vector(31 downto 0) := x"FF000000";
Expand Down Expand Up @@ -182,8 +211,8 @@ begin
CPU_EXTENSION_RISCV_Zmmul => false, -- implement multiply-only M sub-extension?
CPU_EXTENSION_RISCV_Zxcfu => true, -- implement custom (instr.) functions unit?
-- Extension Options --
FAST_MUL_EN => true, -- use DSPs for M extension's multiplier
FAST_SHIFT_EN => true, -- use barrel shifter for shift operations
FAST_MUL_EN => performance_options_c.fast_mul_en_c(PERFORMANCE_OPTION), -- use DSPs for M extension's multiplier
FAST_SHIFT_EN => performance_options_c.fast_shift_en_c(PERFORMANCE_OPTION), -- use barrel shifter for shift operations
REGFILE_HW_RST => false, -- no hardware reset
-- Physical Memory Protection (PMP) --
PMP_NUM_REGIONS => 5, -- number of regions (0..16)
Expand All @@ -197,18 +226,18 @@ begin
AMO_RVS_GRANULARITY => 4, -- size in bytes, has to be a power of 2, min 4
-- Internal Instruction memory --
MEM_INT_IMEM_EN => int_imem_c , -- implement processor-internal instruction memory
MEM_INT_IMEM_SIZE => imem_size_c, -- size of processor-internal instruction memory in bytes
MEM_INT_IMEM_SIZE => performance_options_c.imem_size_c(PERFORMANCE_OPTION), -- size of processor-internal instruction memory in bytes
-- Internal Data memory --
MEM_INT_DMEM_EN => int_dmem_c, -- implement processor-internal data memory
MEM_INT_DMEM_SIZE => dmem_size_c, -- size of processor-internal data memory in bytes
-- Internal Cache memory --
ICACHE_EN => icache_en_c, -- implement instruction cache
ICACHE_EN => performance_options_c.icache_en_c(PERFORMANCE_OPTION), -- implement instruction cache
ICACHE_NUM_BLOCKS => 64, -- i-cache: number of blocks (min 2), has to be a power of 2
ICACHE_BLOCK_SIZE => icache_block_size_c, -- i-cache: block size in bytes (min 4), has to be a power of 2
ICACHE_BLOCK_SIZE => performance_options_c.icache_block_size_c(PERFORMANCE_OPTION), -- i-cache: block size in bytes (min 4), has to be a power of 2
-- Internal Data Cache (dCACHE) --
DCACHE_EN => true, -- implement data cache
DCACHE_EN => performance_options_c.dcache_en_c(PERFORMANCE_OPTION), -- implement data cache
DCACHE_NUM_BLOCKS => 32, -- d-cache: number of blocks (min 1), has to be a power of 2
DCACHE_BLOCK_SIZE => 32, -- d-cache: block size in bytes (min 4), has to be a power of 2
DCACHE_BLOCK_SIZE => performance_options_c.dcache_block_size_c(PERFORMANCE_OPTION), -- d-cache: block size in bytes (min 4), has to be a power of 2
-- External bus interface --
XBUS_EN => true, -- implement external memory bus interface?
XBUS_TIMEOUT => 256, -- cycles after a pending bus access auto-terminates (0 = disabled)
Expand Down
4 changes: 2 additions & 2 deletions sw/common/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ NEORV32_EXG_PATH = $(NEORV32_HOME)/sw/image_gen
# Path to NEORV32 core rtl folder
NEORV32_RTL_PATH = $(NEORV32_LOCAL_RTL)/core
# Path to NEORV32 sim folder
NEORV32_SIM_PATH = $(NEORV32_HOME)/sim
NEORV32_SIM_PATH = $(NEORV32_HOME)/sim/simple
# Marker file to check for NEORV32 home folder
NEORV32_HOME_MARKER = $(NEORV32_INC_PATH)/neorv32.h

Expand Down Expand Up @@ -298,7 +298,7 @@ endif
# -----------------------------------------------------------------------------
sim: $(APP_IMG) install
@echo "Simulating processor using simple testbench..."
@sh $(NEORV32_SIM_PATH)/simple/ghdl.sh $(GHDL_RUN_FLAGS)
@sh $(NEORV32_SIM_PATH)/ghdl.sh $(GHDL_RUN_FLAGS)


# -----------------------------------------------------------------------------
Expand Down
107 changes: 107 additions & 0 deletions sw/example/performance_tests/I/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# NEORV32 I performance measurement test

This code piece allows the measurement of the number of cycles of various I instructions.
The possible instructions to test are:
arith : add, addi, sub, lui, auipc
shift : sll, slli, srl, srli, sra, srai
logic : xor, xori, or, ori, and, andi
comp : slt, slti, sltu, sltiu
sync : fence, fence.i
branch: beq, bne, blt, bgem bltu, bgeu
jump : jal, jalr
load : lb, lh, lbu, lhu, lw
store : sb, sh, sw
CSR : csrrw, csrrs, csrrc, csrrw, csrrsi, csrrci

The number of instructions run can be tuned by setting the following command line parameters:
`USER_FLAGS+=-DinstLoop=1` This tunes the number loops run, default 1
`USER_FLAGS+=-DinstCalls=256` This tunes the number of instructions called per inner loop, default 256.
The instCalls variable impacts memory, as each instruction instance takes up memory.

The limit the performance image size which instructions that can be tested can be controlled the following comand line parameters. The name of the parameter matches the list of instruction groups above:
`USER_FLAGS+=-Drv32I_arith`
`USER_FLAGS+=-Drv32I_shift`
`USER_FLAGS+=-Drv32I_logic`
`USER_FLAGS+=-Drv32I_comp`
`USER_FLAGS+=-Drv32I_load`
`USER_FLAGS+=-Drv32I_store`
`USER_FLAGS+=-Drv32I_branch_beq`
`USER_FLAGS+=-Drv32I_branch_bne`
`USER_FLAGS+=-Drv32I_branch_blt`
`USER_FLAGS+=-Drv32I_branch_bge`
`USER_FLAGS+=-Drv32I_branch_bltu`
`USER_FLAGS+=-Drv32I_branch_bgeu`
`USER_FLAGS+=-Drv32I_jump`
`USER_FLAGS+=-Drv32I_sync`
`USER_FLAGS+=-Drv32I_env` This is ecall and ebreak. The test is currently not implemented.
`USER_FLAGS+=-Drv32I_csr`
`USER_FLAGS+=-Drv32I_mret`
`USER_FLAGS+=-Drv32I_all` Run all instruction tests, the image will be large

For the branch instructions 3 numbers are provided:
- No branch: The branch is not taken
- Branch forward: The branch is taken and the target is ahead of the branch instruction.
- Branch backward: The branch is taken and the target is behind the branch instruction. This will trigger the default branch predictor.

For the `JALR` instruction there is an additional parameter:
`USER_FLAGS+=-Drv32I_jalr_auipc_cycles` This set the number of cycles AUIPC takes, default is 2. This is used to offset the JALR cycle count as we need to use an AUIPC instruction in conjunction with JALR

For the `MRET` instruction there is an additional parameter:
`USER_FLAGS+=-Drv32I_mret_jal_csrw_cycles` This set the number of cycles that a JAL and CSRW instruction takes, default is 11. This is used to offset the MRET cycle count as we need set MTVEC and JAL to the MRET instruction during the measurement.

For less verbose output `USER_FLAGS+=-DSILENT_MODE=1` can be applied

## Compiler warning!!
The built in assembly assumes that C (compressed instruction) extension is not applied. If C is used the NOPs required for branch instructions to function will be the wrong size.

## Example compile and run
This will run the Arith instruction suite

```
make USER_FLAGS+=-DRUN_CHECK USER_FLAGS+=-DUART0_SIM_MODE USER_FLAGS+=-Drv32I_arith clean_all exe
make sim
```

## Exemplary Test Output

```
<<< I performance test >>>
perform: for (i=0;i<1,i++) {256 instructions}
add tot. 1058 cyc
total 1058 cyc
add rd,rs1,rs2 inst. 4 cyc
addi tot. 1058 cyc
total 2116 cyc
addi rd,rs1,imm inst. 4 cyc
sub tot. 1058 cyc
total 3174 cyc
sub rd,rs1,rs2 inst. 4 cyc
lui tot. 1058 cyc
total 4232 cyc
lui rd,imm inst. 4 cyc
auipc tot. 1058 cyc
total 5290 cyc
auipc rd,imm inst. 4 cyc
instructions tested: 5
total 5290 cycles
avg. inst. execute cyles 4.132
```
Loading

0 comments on commit 3d7012b

Please sign in to comment.