Skip to content

Commit

Permalink
CPU RTL optimization (#857)
Browse files Browse the repository at this point in the history
  • Loading branch information
stnolting committed Mar 23, 2024
2 parents f5ae2b6 + f97927e commit 1fd901c
Show file tree
Hide file tree
Showing 10 changed files with 211 additions and 216 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ mimpid = 0x01040312 -> Version 01.04.03.12 -> v1.4.3.12

| Date | Version | Comment | Link |
|:----:|:-------:|:--------|:----:|
| 23.03.2024 | 1.9.7.1 | CPU hardware optimization (reduced hardware footprint, shortened critical path) | [#857](https://github.com/stnolting/neorv32/pull/857) |
| 22.03.2024 | [**:rocket:1.9.7**](https://github.com/stnolting/neorv32/releases/tag/v1.9.7) | **New release** | |
| 18.03.2024 | 1.9.6.9 | :sparkles: update CFU example: now implementing the Extended Tiny Encryption Algorithm (XTEA) | [#855](https://github.com/stnolting/neorv32/pull/855) |
| 16.03.2024 | 1.9.6.8 | rework cache system: L1 + L2 caches, all based on the generic cache component | [#853](https://github.com/stnolting/neorv32/pull/853) |
Expand Down
1 change: 1 addition & 0 deletions rtl/core/neorv32_cache.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,7 @@ begin
bus_req_o.ben <= (others => '1'); -- full-word writes only
bus_req_o.src <= '0'; -- cache accesses are always "data" accesses
bus_req_o.priv <= '0'; -- cache accesses are always "unprivileged" accesses
bus_req_o.rvso <= '0'; -- cache accesses can never be a reservation set operation

-- fsm --
case state is
Expand Down
24 changes: 12 additions & 12 deletions rtl/core/neorv32_cpu.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,9 @@ architecture neorv32_cpu_rtl of neorv32_cpu is
-- local signals --
signal ctrl : ctrl_bus_t; -- main control bus
signal imm : std_ulogic_vector(XLEN-1 downto 0); -- immediate
signal rs1, rs2 : std_ulogic_vector(XLEN-1 downto 0); -- source register 1,2
signal rs3, rs4 : std_ulogic_vector(XLEN-1 downto 0); -- source register 3,4
signal rf_wdata : std_ulogic_vector(XLEN-1 downto 0); -- register file write data
signal rs1, rs2 : std_ulogic_vector(XLEN-1 downto 0); -- source registers 1 and 2
signal rs3, rs4 : std_ulogic_vector(XLEN-1 downto 0); -- source registers 3 and 4 (optional)
signal alu_res : std_ulogic_vector(XLEN-1 downto 0); -- alu result
signal alu_add : std_ulogic_vector(XLEN-1 downto 0); -- alu address result
signal alu_cmp : std_ulogic_vector(1 downto 0); -- comparator result
Expand Down Expand Up @@ -266,18 +267,17 @@ begin
clk_i => clk_i, -- global clock, rising edge
rstn_i => rstn_i, -- global reset, low-active, async
ctrl_i => ctrl, -- main control bus
-- data input --
alu_i => alu_res, -- ALU result
mem_i => mem_rdata, -- memory read data
csr_i => csr_rdata, -- CSR read data
ret_i => link_pc, -- return address
-- data output --
rs1_o => rs1, -- rs1
rs2_o => rs2, -- rs2
rs3_o => rs3, -- rs3
rs4_o => rs4 -- rs4
-- operands --
rd_i => rf_wdata, -- destination operand rd
rs1_o => rs1, -- source operand rs1
rs2_o => rs2, -- source operand rs2
rs3_o => rs3, -- source operand rs3
rs4_o => rs4 -- source operand rs4
);

-- all buses are zero unless there is an according operation --
rf_wdata <= alu_res or mem_rdata or csr_rdata or link_pc;


-- ALU (Arithmetic/Logic Unit) and ALU Co-Processors --------------------------------------
-- -------------------------------------------------------------------------------------------
Expand Down
10 changes: 5 additions & 5 deletions rtl/core/neorv32_cpu_alu.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ begin
opa_x <= (opa(opa'left) and (not ctrl_i.alu_unsigned)) & opa; -- sign-extend
opb_x <= (opb(opb'left) and (not ctrl_i.alu_unsigned)) & opb; -- sign-extend

addsub_res <= std_ulogic_vector(unsigned(opa_x) - unsigned(opb_x)) when (ctrl_i.alu_op(0) = '1') else
addsub_res <= std_ulogic_vector(unsigned(opa_x) - unsigned(opb_x)) when (ctrl_i.alu_sub = '1') else
std_ulogic_vector(unsigned(opa_x) + unsigned(opb_x));

add_o <= addsub_res(XLEN-1 downto 0); -- direct output of adder result
Expand All @@ -142,17 +142,17 @@ begin
-- -------------------------------------------------------------------------------------------
alu_core: process(ctrl_i, addsub_res, cp_res, rs1_i, opb)
begin
res_o <= (others => '0');
case ctrl_i.alu_op is
when alu_op_zero_c => res_o <= (others => '0');
when alu_op_add_c => res_o <= addsub_res(XLEN-1 downto 0);
when alu_op_sub_c => res_o <= addsub_res(XLEN-1 downto 0);
when alu_op_cp_c => res_o <= cp_res;
when alu_op_slt_c => res_o(XLEN-1 downto 1) <= (others => '0');
res_o(0) <= addsub_res(addsub_res'left); -- carry/borrow
when alu_op_slt_c => res_o(0) <= addsub_res(addsub_res'left); -- carry/borrow
when alu_op_movb_c => res_o <= opb;
when alu_op_xor_c => res_o <= opb xor rs1_i;
when alu_op_or_c => res_o <= opb or rs1_i;
when alu_op_and_c => res_o <= opb and rs1_i;
when others => res_o <= addsub_res(XLEN-1 downto 0); -- don't care
when others => res_o <= (others => '0');
end case;
end process alu_core;

Expand Down
55 changes: 23 additions & 32 deletions rtl/core/neorv32_cpu_control.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,7 @@ begin

-- PC output --
curr_pc_o <= execute_engine.pc(XLEN-1 downto 1) & '0'; -- current PC
link_pc_o <= execute_engine.link_pc(XLEN-1 downto 1) & '0'; -- return address
link_pc_o <= (execute_engine.link_pc(XLEN-1 downto 1) & '0') when (execute_engine.state = BRANCHED) else (others => '0'); -- return address


-- Decoding Helper Logic ------------------------------------------------------------------
Expand Down Expand Up @@ -814,7 +814,7 @@ begin
csr.we_nxt <= '0';
csr.re_nxt <= '0';
--
ctrl_nxt <= ctrl_bus_zero_c; -- all zero/off by default, default ALU operation = ADD, default RF input = ALU
ctrl_nxt <= ctrl_bus_zero_c; -- all zero/off by default (default ALU operation = ZERO, adder.out = ADD)

-- ALU sign control --
if (execute_engine.ir(instr_opcode_lsb_c+4) = '1') then -- ALU ops
Expand Down Expand Up @@ -890,24 +890,23 @@ begin

-- register/immediate ALU operation --
when opcode_alu_c | opcode_alui_c =>

-- ALU core operation --
case execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c) is -- actual ALU operation (re-coding)
when funct3_subadd_c => -- ADD(I), SUB
if ((execute_engine.ir(instr_opcode_msb_c-1) = '1') and (execute_engine.ir(instr_funct7_msb_c-1) = '1')) then
ctrl_nxt.alu_op <= alu_op_sub_c; -- SUB if not an immediate op and funct7.6 set
else
ctrl_nxt.alu_op <= alu_op_add_c;
end if;
when funct3_slt_c | funct3_sltu_c => -- SLT(I), SLTU(I)
ctrl_nxt.alu_op <= alu_op_slt_c;
when funct3_xor_c => -- XOR(I)
ctrl_nxt.alu_op <= alu_op_xor_c;
when funct3_or_c => -- OR(I)
ctrl_nxt.alu_op <= alu_op_or_c;
when others => -- AND(I) or multi-cycle / co-processor operation
ctrl_nxt.alu_op <= alu_op_and_c;
case execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c) is -- operation re-coding
when funct3_subadd_c => ctrl_nxt.alu_op <= alu_op_add_c; -- ADD(I), SUB
when funct3_slt_c | funct3_sltu_c => ctrl_nxt.alu_op <= alu_op_slt_c; -- SLT(I), SLTU(I)
when funct3_xor_c => ctrl_nxt.alu_op <= alu_op_xor_c; -- XOR(I)
when funct3_or_c => ctrl_nxt.alu_op <= alu_op_or_c; -- OR(I)
when others => ctrl_nxt.alu_op <= alu_op_and_c; -- AND(I) or multi-cycle / co-processor operation (shifts)
end case;

-- addition/subtraction control --
if (execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c+1) = funct3_slt_c(2 downto 1)) or -- SLT(I), SLTU(I)
((execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c) = funct3_subadd_c) and
(execute_engine.ir(instr_opcode_msb_c-1) = '1') and (execute_engine.ir(instr_funct7_msb_c-1) = '1')) then
ctrl_nxt.alu_sub <= '1';
end if;

-- EXT: co-processor MULDIV operation (multi-cycle) --
if ((CPU_EXTENSION_RISCV_M = true) and (execute_engine.ir(instr_opcode_lsb_c+5) = opcode_alu_c(5)) and
((decode_aux.is_m_mul = '1') or (decode_aux.is_m_div = '1'))) or -- MUL/DIV
Expand Down Expand Up @@ -995,7 +994,6 @@ begin

when BRANCH => -- update next_PC on taken branches and jumps
-- ------------------------------------------------------------
ctrl_nxt.rf_mux <= rf_mux_ret_c; -- return address = link PC
ctrl_nxt.rf_wb_en <= execute_engine.ir(instr_opcode_lsb_c+2); -- save return address if link operation (will not happen if misaligned)
if (trap_ctrl.exc_buf(exc_illegal_c) = '0') and (execute_engine.branch_taken = '1') then -- valid taken branch
fetch_engine.reset <= '1'; -- reset instruction fetch to restart at modified PC
Expand All @@ -1008,10 +1006,7 @@ begin
-- ------------------------------------------------------------
execute_engine.state_nxt <= DISPATCH;
-- house keeping: use this state also to (re-)initialize the register file's x0/zero register --
if (REGFILE_HW_RST = false) then -- x0 does not provide a dedicated hardware reset
ctrl_nxt.rf_mux <= rf_mux_csr_c; -- this will return 0 since csr.re_nxt is zero
ctrl_nxt.rf_zero_we <= '1'; -- force write access to x0
end if;
ctrl_nxt.rf_zero_we <= not bool_to_ulogic_f(REGFILE_HW_RST); -- force write access to x0 if it is a physical register

when MEM_REQ => -- trigger memory request
-- ------------------------------------------------------------
Expand All @@ -1024,7 +1019,6 @@ begin

when MEM_WAIT => -- wait for bus transaction to finish
-- ------------------------------------------------------------
ctrl_nxt.rf_mux <= rf_mux_mem_c; -- RF input = memory read data
if (lsu_wait_i = '0') or -- bus system has completed the transaction
(trap_ctrl.exc_buf(exc_saccess_c) = '1') or (trap_ctrl.exc_buf(exc_laccess_c) = '1') or -- access exception
(trap_ctrl.exc_buf(exc_salign_c) = '1') or (trap_ctrl.exc_buf(exc_lalign_c) = '1') then -- alignment exception
Expand All @@ -1044,7 +1038,6 @@ begin
when others => -- SYSTEM - system environment operation; no effect if illegal instruction
-- ------------------------------------------------------------
execute_engine.state_nxt <= DISPATCH; -- default
ctrl_nxt.rf_mux <= rf_mux_csr_c; -- CSR read data
if (execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c) = funct3_env_c) and -- ENVIRONMENT
(trap_ctrl.exc_buf(exc_illegal_c) = '0') then -- not an illegal instruction
case execute_engine.ir(instr_funct12_msb_c downto instr_funct12_lsb_c) is
Expand Down Expand Up @@ -1094,13 +1087,12 @@ begin
(not trap_ctrl.exc_buf(exc_iaccess_c)) and (not trap_ctrl.exc_buf(exc_saccess_c)) and (not trap_ctrl.exc_buf(exc_laccess_c));
ctrl_o.rf_rs1 <= execute_engine.ir(instr_rs1_msb_c downto instr_rs1_lsb_c);
ctrl_o.rf_rs2 <= execute_engine.ir(instr_rs2_msb_c downto instr_rs2_lsb_c);
ctrl_o.rf_rs3 <= execute_engine.ir(instr_rs3_msb_c downto instr_rs3_lsb_c);
ctrl_o.rf_rd <= execute_engine.ir(instr_rd_msb_c downto instr_rd_lsb_c);
ctrl_o.rf_mux <= ctrl.rf_mux;
ctrl_o.rf_rd <= execute_engine.ir(instr_rd_msb_c downto instr_rd_lsb_c);
ctrl_o.rf_zero_we <= ctrl.rf_zero_we;

-- alu --
ctrl_o.alu_op <= ctrl.alu_op;
ctrl_o.alu_sub <= ctrl.alu_sub;
ctrl_o.alu_opa_mux <= ctrl.alu_opa_mux;
ctrl_o.alu_opb_mux <= ctrl.alu_opb_mux;
ctrl_o.alu_unsigned <= ctrl.alu_unsigned;
Expand All @@ -1114,9 +1106,9 @@ begin
ctrl_o.lsu_priv <= csr.mstatus_mpp when (csr.mstatus_mprv = '1') else csr.privilege_eff; -- effective privilege level for loads/stores in M-mode

-- instruction word bit fields --
ctrl_o.ir_funct3 <= execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c);
ctrl_o.ir_funct3 <= execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c);
ctrl_o.ir_funct12 <= execute_engine.ir(instr_funct12_msb_c downto instr_funct12_lsb_c);
ctrl_o.ir_opcode <= execute_engine.ir(instr_opcode_msb_c downto instr_opcode_lsb_c);
ctrl_o.ir_opcode <= execute_engine.ir(instr_opcode_msb_c downto instr_opcode_lsb_c);

-- cpu status --
ctrl_o.cpu_priv <= csr.privilege_eff;
Expand Down Expand Up @@ -1537,7 +1529,7 @@ begin
elsif (trap_ctrl.irq_buf(irq_msi_irq_c) = '1') then trap_ctrl.cause <= trap_msi_c; -- machine software interrupt (MSI)
elsif (trap_ctrl.irq_buf(irq_mti_irq_c) = '1') then trap_ctrl.cause <= trap_mti_c; -- machine timer interrupt (MTI)
--
else trap_ctrl.cause <= trap_mti_c; end if; -- don't care
else trap_ctrl.cause <= (others => '0'); end if;
end if;
end process trap_priority;

Expand All @@ -1550,8 +1542,7 @@ begin
trap_ctrl.env_pending <= '0';
elsif rising_edge(clk_i) then
if (trap_ctrl.env_pending = '0') then -- no pending trap environment yet
-- trigger IRQ only in EXECUTE state --
if (trap_ctrl.exc_fire = '1') or ((trap_ctrl.irq_fire = '1') and (execute_engine.state = EXECUTE)) then
if (trap_ctrl.exc_fire = '1') or ((trap_ctrl.irq_fire = '1') and (execute_engine.state = EXECUTE)) then -- trigger IRQ only in EXECUTE state
trap_ctrl.env_pending <= '1'; -- now execute engine can start trap handling
end if;
elsif (trap_ctrl.env_enter = '1') then -- start of trap environment acknowledged by execute engine
Expand Down
2 changes: 2 additions & 0 deletions rtl/core/neorv32_cpu_lsu.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ begin
when others => -- word
rdata_o(XLEN-1 downto 0) <= bus_rsp_i.data(XLEN-1 downto 0);
end case;
else
rdata_o <= (others => '0'); -- output zero if there is no memory access
end if;
end if;
end process mem_di_reg;
Expand Down
57 changes: 22 additions & 35 deletions rtl/core/neorv32_cpu_regfile.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,12 @@ entity neorv32_cpu_regfile is
clk_i : in std_ulogic; -- global clock, rising edge
rstn_i : in std_ulogic; -- global reset, low-active, async
ctrl_i : in ctrl_bus_t; -- main control bus
-- data input --
alu_i : in std_ulogic_vector(XLEN-1 downto 0); -- ALU result
mem_i : in std_ulogic_vector(XLEN-1 downto 0); -- memory read data
csr_i : in std_ulogic_vector(XLEN-1 downto 0); -- CSR read data
ret_i : in std_ulogic_vector(XLEN-1 downto 0); -- link PC
-- data output --
rs1_o : out std_ulogic_vector(XLEN-1 downto 0); -- rs1
rs2_o : out std_ulogic_vector(XLEN-1 downto 0); -- rs2
rs3_o : out std_ulogic_vector(XLEN-1 downto 0); -- rs3
rs4_o : out std_ulogic_vector(XLEN-1 downto 0) -- rs4
-- operands --
rd_i : in std_ulogic_vector(XLEN-1 downto 0); -- destination operand rd
rs1_o : out std_ulogic_vector(XLEN-1 downto 0); -- source operand rs1
rs2_o : out std_ulogic_vector(XLEN-1 downto 0); -- source operand rs2
rs3_o : out std_ulogic_vector(XLEN-1 downto 0); -- source operand rs3
rs4_o : out std_ulogic_vector(XLEN-1 downto 0) -- source operand rs4
);
end neorv32_cpu_regfile;

Expand All @@ -82,30 +78,17 @@ architecture neorv32_cpu_regfile_rtl of neorv32_cpu_regfile is
signal reg_file : reg_file_t;

-- access --
signal rf_wdata : std_ulogic_vector(XLEN-1 downto 0); -- write-back data
signal rf_we : std_ulogic; -- write enable
signal rf_we_sel : std_ulogic_vector((2**addr_bits_c)-1 downto 0); -- one-hot write enable
signal rd_zero : std_ulogic; -- writing to x0?
signal opa_addr : std_ulogic_vector(4 downto 0); -- rs1/rd address
signal rd_addr : std_ulogic_vector(4 downto 0); -- rd address
signal rs3_addr : std_ulogic_vector(4 downto 0); -- rs3 address
signal rs4_addr : std_ulogic_vector(4 downto 0); -- rs4 address

begin

-- Data Write-Back Select -----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
wb_select: process(ctrl_i, alu_i, mem_i, csr_i, ret_i)
begin
case ctrl_i.rf_mux is
when rf_mux_alu_c => rf_wdata <= alu_i; -- ALU result
when rf_mux_mem_c => rf_wdata <= mem_i; -- memory read data
when rf_mux_csr_c => rf_wdata <= csr_i; -- CSR read data
when rf_mux_ret_c => rf_wdata <= ret_i; -- link PC (return address)
when others => rf_wdata <= alu_i; -- don't care
end case;
end process wb_select;


-- FPGA Register File (no hardware reset) -------------------------------------------------
-- FPGA-Style Register File (BlockRAM, no hardware reset at all) --------------------------
-- -------------------------------------------------------------------------------------------
register_file_fpga:
if not RST_EN generate
Expand All @@ -125,7 +108,7 @@ begin
begin
if rising_edge(clk_i) then
if (rf_we = '1') then
reg_file(to_integer(unsigned(opa_addr(addr_bits_c-1 downto 0)))) <= rf_wdata;
reg_file(to_integer(unsigned(opa_addr(addr_bits_c-1 downto 0)))) <= rd_i;
end if;
rs1_o <= reg_file(to_integer(unsigned(opa_addr(addr_bits_c-1 downto 0))));
rs2_o <= reg_file(to_integer(unsigned(ctrl_i.rf_rs2(addr_bits_c-1 downto 0))));
Expand All @@ -135,18 +118,19 @@ begin
end generate;


-- ASIC Register File (full hardware reset) -----------------------------------------------
-- ASIC-Style Register File (individual FFs, full hardware reset) -------------------------
-- -------------------------------------------------------------------------------------------
register_file_asic:
if RST_EN generate

-- "write" to x0 if no write access --
rd_addr <= ctrl_i.rf_rd(addr_bits_c-1 downto 0) when (ctrl_i.rf_wb_en = '1') else (others => '0');

-- write enable decoder --
we_decode: process(ctrl_i)
we_decode: process(rd_addr)
begin
rf_we_sel <= (others => '0');
if (ctrl_i.rf_wb_en = '1') then
rf_we_sel(to_integer(unsigned(ctrl_i.rf_rd(addr_bits_c-1 downto 0)))) <= '1';
end if;
rf_we_sel(to_integer(unsigned(rd_addr(addr_bits_c-1 downto 0)))) <= '1';
end process we_decode;

-- individual registers --
Expand All @@ -158,14 +142,16 @@ begin
reg_file(i) <= (others => '0');
elsif rising_edge(clk_i) then
if (rf_we_sel(i) = '1') then
reg_file(i) <= rf_wdata;
reg_file(i) <= rd_i;
end if;
end if;
end process register_file;
end generate;

reg_file(0) <= (others => '0'); -- x0 is hardwired to zero
-- x0 is hardwired to zero --
reg_file(0) <= (others => '0');

-- synchronous read --
rf_read: process(clk_i)
begin
if rising_edge(clk_i) then
Expand All @@ -184,9 +170,10 @@ begin
rs3_read: process(clk_i)
begin
if rising_edge(clk_i) then
rs3_o <= reg_file(to_integer(unsigned(ctrl_i.rf_rs3(addr_bits_c-1 downto 0))));
rs3_o <= reg_file(to_integer(unsigned(rs3_addr(addr_bits_c-1 downto 0))));
end if;
end process rs3_read;
rs3_addr <= ctrl_i.ir_funct12(11 downto 7); -- RISC-V compliant
end generate;

rs3_disable:
Expand Down
Loading

0 comments on commit 1fd901c

Please sign in to comment.