Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CPU RTL optimization #857

Merged
merged 7 commits into from
Mar 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ mimpid = 0x01040312 -> Version 01.04.03.12 -> v1.4.3.12

| Date | Version | Comment | Link |
|:----:|:-------:|:--------|:----:|
| 23.03.2024 | 1.9.7.1 | CPU hardware optimization (reduced hardware footprint, shortened critical path) | [#857](https://github.com/stnolting/neorv32/pull/857) |
| 22.03.2024 | [**:rocket:1.9.7**](https://github.com/stnolting/neorv32/releases/tag/v1.9.7) | **New release** | |
| 18.03.2024 | 1.9.6.9 | :sparkles: update CFU example: now implementing the Extended Tiny Encryption Algorithm (XTEA) | [#855](https://github.com/stnolting/neorv32/pull/855) |
| 16.03.2024 | 1.9.6.8 | rework cache system: L1 + L2 caches, all based on the generic cache component | [#853](https://github.com/stnolting/neorv32/pull/853) |
Expand Down
1 change: 1 addition & 0 deletions rtl/core/neorv32_cache.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,7 @@ begin
bus_req_o.ben <= (others => '1'); -- full-word writes only
bus_req_o.src <= '0'; -- cache accesses are always "data" accesses
bus_req_o.priv <= '0'; -- cache accesses are always "unprivileged" accesses
bus_req_o.rvso <= '0'; -- cache accesses can never be a reservation set operation

-- fsm --
case state is
Expand Down
24 changes: 12 additions & 12 deletions rtl/core/neorv32_cpu.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,9 @@ architecture neorv32_cpu_rtl of neorv32_cpu is
-- local signals --
signal ctrl : ctrl_bus_t; -- main control bus
signal imm : std_ulogic_vector(XLEN-1 downto 0); -- immediate
signal rs1, rs2 : std_ulogic_vector(XLEN-1 downto 0); -- source register 1,2
signal rs3, rs4 : std_ulogic_vector(XLEN-1 downto 0); -- source register 3,4
signal rf_wdata : std_ulogic_vector(XLEN-1 downto 0); -- register file write data
signal rs1, rs2 : std_ulogic_vector(XLEN-1 downto 0); -- source registers 1 and 2
signal rs3, rs4 : std_ulogic_vector(XLEN-1 downto 0); -- source registers 3 and 4 (optional)
signal alu_res : std_ulogic_vector(XLEN-1 downto 0); -- alu result
signal alu_add : std_ulogic_vector(XLEN-1 downto 0); -- alu address result
signal alu_cmp : std_ulogic_vector(1 downto 0); -- comparator result
Expand Down Expand Up @@ -266,18 +267,17 @@ begin
clk_i => clk_i, -- global clock, rising edge
rstn_i => rstn_i, -- global reset, low-active, async
ctrl_i => ctrl, -- main control bus
-- data input --
alu_i => alu_res, -- ALU result
mem_i => mem_rdata, -- memory read data
csr_i => csr_rdata, -- CSR read data
ret_i => link_pc, -- return address
-- data output --
rs1_o => rs1, -- rs1
rs2_o => rs2, -- rs2
rs3_o => rs3, -- rs3
rs4_o => rs4 -- rs4
-- operands --
rd_i => rf_wdata, -- destination operand rd
rs1_o => rs1, -- source operand rs1
rs2_o => rs2, -- source operand rs2
rs3_o => rs3, -- source operand rs3
rs4_o => rs4 -- source operand rs4
);

-- all buses are zero unless there is an according operation --
rf_wdata <= alu_res or mem_rdata or csr_rdata or link_pc;


-- ALU (Arithmetic/Logic Unit) and ALU Co-Processors --------------------------------------
-- -------------------------------------------------------------------------------------------
Expand Down
10 changes: 5 additions & 5 deletions rtl/core/neorv32_cpu_alu.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ begin
opa_x <= (opa(opa'left) and (not ctrl_i.alu_unsigned)) & opa; -- sign-extend
opb_x <= (opb(opb'left) and (not ctrl_i.alu_unsigned)) & opb; -- sign-extend

addsub_res <= std_ulogic_vector(unsigned(opa_x) - unsigned(opb_x)) when (ctrl_i.alu_op(0) = '1') else
addsub_res <= std_ulogic_vector(unsigned(opa_x) - unsigned(opb_x)) when (ctrl_i.alu_sub = '1') else
std_ulogic_vector(unsigned(opa_x) + unsigned(opb_x));

add_o <= addsub_res(XLEN-1 downto 0); -- direct output of adder result
Expand All @@ -142,17 +142,17 @@ begin
-- -------------------------------------------------------------------------------------------
alu_core: process(ctrl_i, addsub_res, cp_res, rs1_i, opb)
begin
res_o <= (others => '0');
case ctrl_i.alu_op is
when alu_op_zero_c => res_o <= (others => '0');
when alu_op_add_c => res_o <= addsub_res(XLEN-1 downto 0);
when alu_op_sub_c => res_o <= addsub_res(XLEN-1 downto 0);
when alu_op_cp_c => res_o <= cp_res;
when alu_op_slt_c => res_o(XLEN-1 downto 1) <= (others => '0');
res_o(0) <= addsub_res(addsub_res'left); -- carry/borrow
when alu_op_slt_c => res_o(0) <= addsub_res(addsub_res'left); -- carry/borrow
when alu_op_movb_c => res_o <= opb;
when alu_op_xor_c => res_o <= opb xor rs1_i;
when alu_op_or_c => res_o <= opb or rs1_i;
when alu_op_and_c => res_o <= opb and rs1_i;
when others => res_o <= addsub_res(XLEN-1 downto 0); -- don't care
when others => res_o <= (others => '0');
end case;
end process alu_core;

Expand Down
55 changes: 23 additions & 32 deletions rtl/core/neorv32_cpu_control.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,7 @@ begin

-- PC output --
curr_pc_o <= execute_engine.pc(XLEN-1 downto 1) & '0'; -- current PC
link_pc_o <= execute_engine.link_pc(XLEN-1 downto 1) & '0'; -- return address
link_pc_o <= (execute_engine.link_pc(XLEN-1 downto 1) & '0') when (execute_engine.state = BRANCHED) else (others => '0'); -- return address


-- Decoding Helper Logic ------------------------------------------------------------------
Expand Down Expand Up @@ -814,7 +814,7 @@ begin
csr.we_nxt <= '0';
csr.re_nxt <= '0';
--
ctrl_nxt <= ctrl_bus_zero_c; -- all zero/off by default, default ALU operation = ADD, default RF input = ALU
ctrl_nxt <= ctrl_bus_zero_c; -- all zero/off by default (default ALU operation = ZERO, adder.out = ADD)

-- ALU sign control --
if (execute_engine.ir(instr_opcode_lsb_c+4) = '1') then -- ALU ops
Expand Down Expand Up @@ -890,24 +890,23 @@ begin

-- register/immediate ALU operation --
when opcode_alu_c | opcode_alui_c =>

-- ALU core operation --
case execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c) is -- actual ALU operation (re-coding)
when funct3_subadd_c => -- ADD(I), SUB
if ((execute_engine.ir(instr_opcode_msb_c-1) = '1') and (execute_engine.ir(instr_funct7_msb_c-1) = '1')) then
ctrl_nxt.alu_op <= alu_op_sub_c; -- SUB if not an immediate op and funct7.6 set
else
ctrl_nxt.alu_op <= alu_op_add_c;
end if;
when funct3_slt_c | funct3_sltu_c => -- SLT(I), SLTU(I)
ctrl_nxt.alu_op <= alu_op_slt_c;
when funct3_xor_c => -- XOR(I)
ctrl_nxt.alu_op <= alu_op_xor_c;
when funct3_or_c => -- OR(I)
ctrl_nxt.alu_op <= alu_op_or_c;
when others => -- AND(I) or multi-cycle / co-processor operation
ctrl_nxt.alu_op <= alu_op_and_c;
case execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c) is -- operation re-coding
when funct3_subadd_c => ctrl_nxt.alu_op <= alu_op_add_c; -- ADD(I), SUB
when funct3_slt_c | funct3_sltu_c => ctrl_nxt.alu_op <= alu_op_slt_c; -- SLT(I), SLTU(I)
when funct3_xor_c => ctrl_nxt.alu_op <= alu_op_xor_c; -- XOR(I)
when funct3_or_c => ctrl_nxt.alu_op <= alu_op_or_c; -- OR(I)
when others => ctrl_nxt.alu_op <= alu_op_and_c; -- AND(I) or multi-cycle / co-processor operation (shifts)
end case;

-- addition/subtraction control --
if (execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c+1) = funct3_slt_c(2 downto 1)) or -- SLT(I), SLTU(I)
((execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c) = funct3_subadd_c) and
(execute_engine.ir(instr_opcode_msb_c-1) = '1') and (execute_engine.ir(instr_funct7_msb_c-1) = '1')) then
ctrl_nxt.alu_sub <= '1';
end if;

-- EXT: co-processor MULDIV operation (multi-cycle) --
if ((CPU_EXTENSION_RISCV_M = true) and (execute_engine.ir(instr_opcode_lsb_c+5) = opcode_alu_c(5)) and
((decode_aux.is_m_mul = '1') or (decode_aux.is_m_div = '1'))) or -- MUL/DIV
Expand Down Expand Up @@ -995,7 +994,6 @@ begin

when BRANCH => -- update next_PC on taken branches and jumps
-- ------------------------------------------------------------
ctrl_nxt.rf_mux <= rf_mux_ret_c; -- return address = link PC
ctrl_nxt.rf_wb_en <= execute_engine.ir(instr_opcode_lsb_c+2); -- save return address if link operation (will not happen if misaligned)
if (trap_ctrl.exc_buf(exc_illegal_c) = '0') and (execute_engine.branch_taken = '1') then -- valid taken branch
fetch_engine.reset <= '1'; -- reset instruction fetch to restart at modified PC
Expand All @@ -1008,10 +1006,7 @@ begin
-- ------------------------------------------------------------
execute_engine.state_nxt <= DISPATCH;
-- house keeping: use this state also to (re-)initialize the register file's x0/zero register --
if (REGFILE_HW_RST = false) then -- x0 does not provide a dedicated hardware reset
ctrl_nxt.rf_mux <= rf_mux_csr_c; -- this will return 0 since csr.re_nxt is zero
ctrl_nxt.rf_zero_we <= '1'; -- force write access to x0
end if;
ctrl_nxt.rf_zero_we <= not bool_to_ulogic_f(REGFILE_HW_RST); -- force write access to x0 if it is a physical register

when MEM_REQ => -- trigger memory request
-- ------------------------------------------------------------
Expand All @@ -1024,7 +1019,6 @@ begin

when MEM_WAIT => -- wait for bus transaction to finish
-- ------------------------------------------------------------
ctrl_nxt.rf_mux <= rf_mux_mem_c; -- RF input = memory read data
if (lsu_wait_i = '0') or -- bus system has completed the transaction
(trap_ctrl.exc_buf(exc_saccess_c) = '1') or (trap_ctrl.exc_buf(exc_laccess_c) = '1') or -- access exception
(trap_ctrl.exc_buf(exc_salign_c) = '1') or (trap_ctrl.exc_buf(exc_lalign_c) = '1') then -- alignment exception
Expand All @@ -1044,7 +1038,6 @@ begin
when others => -- SYSTEM - system environment operation; no effect if illegal instruction
-- ------------------------------------------------------------
execute_engine.state_nxt <= DISPATCH; -- default
ctrl_nxt.rf_mux <= rf_mux_csr_c; -- CSR read data
if (execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c) = funct3_env_c) and -- ENVIRONMENT
(trap_ctrl.exc_buf(exc_illegal_c) = '0') then -- not an illegal instruction
case execute_engine.ir(instr_funct12_msb_c downto instr_funct12_lsb_c) is
Expand Down Expand Up @@ -1094,13 +1087,12 @@ begin
(not trap_ctrl.exc_buf(exc_iaccess_c)) and (not trap_ctrl.exc_buf(exc_saccess_c)) and (not trap_ctrl.exc_buf(exc_laccess_c));
ctrl_o.rf_rs1 <= execute_engine.ir(instr_rs1_msb_c downto instr_rs1_lsb_c);
ctrl_o.rf_rs2 <= execute_engine.ir(instr_rs2_msb_c downto instr_rs2_lsb_c);
ctrl_o.rf_rs3 <= execute_engine.ir(instr_rs3_msb_c downto instr_rs3_lsb_c);
ctrl_o.rf_rd <= execute_engine.ir(instr_rd_msb_c downto instr_rd_lsb_c);
ctrl_o.rf_mux <= ctrl.rf_mux;
ctrl_o.rf_rd <= execute_engine.ir(instr_rd_msb_c downto instr_rd_lsb_c);
ctrl_o.rf_zero_we <= ctrl.rf_zero_we;

-- alu --
ctrl_o.alu_op <= ctrl.alu_op;
ctrl_o.alu_sub <= ctrl.alu_sub;
ctrl_o.alu_opa_mux <= ctrl.alu_opa_mux;
ctrl_o.alu_opb_mux <= ctrl.alu_opb_mux;
ctrl_o.alu_unsigned <= ctrl.alu_unsigned;
Expand All @@ -1114,9 +1106,9 @@ begin
ctrl_o.lsu_priv <= csr.mstatus_mpp when (csr.mstatus_mprv = '1') else csr.privilege_eff; -- effective privilege level for loads/stores in M-mode

-- instruction word bit fields --
ctrl_o.ir_funct3 <= execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c);
ctrl_o.ir_funct3 <= execute_engine.ir(instr_funct3_msb_c downto instr_funct3_lsb_c);
ctrl_o.ir_funct12 <= execute_engine.ir(instr_funct12_msb_c downto instr_funct12_lsb_c);
ctrl_o.ir_opcode <= execute_engine.ir(instr_opcode_msb_c downto instr_opcode_lsb_c);
ctrl_o.ir_opcode <= execute_engine.ir(instr_opcode_msb_c downto instr_opcode_lsb_c);

-- cpu status --
ctrl_o.cpu_priv <= csr.privilege_eff;
Expand Down Expand Up @@ -1537,7 +1529,7 @@ begin
elsif (trap_ctrl.irq_buf(irq_msi_irq_c) = '1') then trap_ctrl.cause <= trap_msi_c; -- machine software interrupt (MSI)
elsif (trap_ctrl.irq_buf(irq_mti_irq_c) = '1') then trap_ctrl.cause <= trap_mti_c; -- machine timer interrupt (MTI)
--
else trap_ctrl.cause <= trap_mti_c; end if; -- don't care
else trap_ctrl.cause <= (others => '0'); end if;
end if;
end process trap_priority;

Expand All @@ -1550,8 +1542,7 @@ begin
trap_ctrl.env_pending <= '0';
elsif rising_edge(clk_i) then
if (trap_ctrl.env_pending = '0') then -- no pending trap environment yet
-- trigger IRQ only in EXECUTE state --
if (trap_ctrl.exc_fire = '1') or ((trap_ctrl.irq_fire = '1') and (execute_engine.state = EXECUTE)) then
if (trap_ctrl.exc_fire = '1') or ((trap_ctrl.irq_fire = '1') and (execute_engine.state = EXECUTE)) then -- trigger IRQ only in EXECUTE state
trap_ctrl.env_pending <= '1'; -- now execute engine can start trap handling
end if;
elsif (trap_ctrl.env_enter = '1') then -- start of trap environment acknowledged by execute engine
Expand Down
2 changes: 2 additions & 0 deletions rtl/core/neorv32_cpu_lsu.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ begin
when others => -- word
rdata_o(XLEN-1 downto 0) <= bus_rsp_i.data(XLEN-1 downto 0);
end case;
else
rdata_o <= (others => '0'); -- output zero if there is no memory access
end if;
end if;
end process mem_di_reg;
Expand Down
57 changes: 22 additions & 35 deletions rtl/core/neorv32_cpu_regfile.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,12 @@ entity neorv32_cpu_regfile is
clk_i : in std_ulogic; -- global clock, rising edge
rstn_i : in std_ulogic; -- global reset, low-active, async
ctrl_i : in ctrl_bus_t; -- main control bus
-- data input --
alu_i : in std_ulogic_vector(XLEN-1 downto 0); -- ALU result
mem_i : in std_ulogic_vector(XLEN-1 downto 0); -- memory read data
csr_i : in std_ulogic_vector(XLEN-1 downto 0); -- CSR read data
ret_i : in std_ulogic_vector(XLEN-1 downto 0); -- link PC
-- data output --
rs1_o : out std_ulogic_vector(XLEN-1 downto 0); -- rs1
rs2_o : out std_ulogic_vector(XLEN-1 downto 0); -- rs2
rs3_o : out std_ulogic_vector(XLEN-1 downto 0); -- rs3
rs4_o : out std_ulogic_vector(XLEN-1 downto 0) -- rs4
-- operands --
rd_i : in std_ulogic_vector(XLEN-1 downto 0); -- destination operand rd
rs1_o : out std_ulogic_vector(XLEN-1 downto 0); -- source operand rs1
rs2_o : out std_ulogic_vector(XLEN-1 downto 0); -- source operand rs2
rs3_o : out std_ulogic_vector(XLEN-1 downto 0); -- source operand rs3
rs4_o : out std_ulogic_vector(XLEN-1 downto 0) -- source operand rs4
);
end neorv32_cpu_regfile;

Expand All @@ -82,30 +78,17 @@ architecture neorv32_cpu_regfile_rtl of neorv32_cpu_regfile is
signal reg_file : reg_file_t;

-- access --
signal rf_wdata : std_ulogic_vector(XLEN-1 downto 0); -- write-back data
signal rf_we : std_ulogic; -- write enable
signal rf_we_sel : std_ulogic_vector((2**addr_bits_c)-1 downto 0); -- one-hot write enable
signal rd_zero : std_ulogic; -- writing to x0?
signal opa_addr : std_ulogic_vector(4 downto 0); -- rs1/rd address
signal rd_addr : std_ulogic_vector(4 downto 0); -- rd address
signal rs3_addr : std_ulogic_vector(4 downto 0); -- rs3 address
signal rs4_addr : std_ulogic_vector(4 downto 0); -- rs4 address

begin

-- Data Write-Back Select -----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
wb_select: process(ctrl_i, alu_i, mem_i, csr_i, ret_i)
begin
case ctrl_i.rf_mux is
when rf_mux_alu_c => rf_wdata <= alu_i; -- ALU result
when rf_mux_mem_c => rf_wdata <= mem_i; -- memory read data
when rf_mux_csr_c => rf_wdata <= csr_i; -- CSR read data
when rf_mux_ret_c => rf_wdata <= ret_i; -- link PC (return address)
when others => rf_wdata <= alu_i; -- don't care
end case;
end process wb_select;


-- FPGA Register File (no hardware reset) -------------------------------------------------
-- FPGA-Style Register File (BlockRAM, no hardware reset at all) --------------------------
-- -------------------------------------------------------------------------------------------
register_file_fpga:
if not RST_EN generate
Expand All @@ -125,7 +108,7 @@ begin
begin
if rising_edge(clk_i) then
if (rf_we = '1') then
reg_file(to_integer(unsigned(opa_addr(addr_bits_c-1 downto 0)))) <= rf_wdata;
reg_file(to_integer(unsigned(opa_addr(addr_bits_c-1 downto 0)))) <= rd_i;
end if;
rs1_o <= reg_file(to_integer(unsigned(opa_addr(addr_bits_c-1 downto 0))));
rs2_o <= reg_file(to_integer(unsigned(ctrl_i.rf_rs2(addr_bits_c-1 downto 0))));
Expand All @@ -135,18 +118,19 @@ begin
end generate;


-- ASIC Register File (full hardware reset) -----------------------------------------------
-- ASIC-Style Register File (individual FFs, full hardware reset) -------------------------
-- -------------------------------------------------------------------------------------------
register_file_asic:
if RST_EN generate

-- "write" to x0 if no write access --
rd_addr <= ctrl_i.rf_rd(addr_bits_c-1 downto 0) when (ctrl_i.rf_wb_en = '1') else (others => '0');

-- write enable decoder --
we_decode: process(ctrl_i)
we_decode: process(rd_addr)
begin
rf_we_sel <= (others => '0');
if (ctrl_i.rf_wb_en = '1') then
rf_we_sel(to_integer(unsigned(ctrl_i.rf_rd(addr_bits_c-1 downto 0)))) <= '1';
end if;
rf_we_sel(to_integer(unsigned(rd_addr(addr_bits_c-1 downto 0)))) <= '1';
end process we_decode;

-- individual registers --
Expand All @@ -158,14 +142,16 @@ begin
reg_file(i) <= (others => '0');
elsif rising_edge(clk_i) then
if (rf_we_sel(i) = '1') then
reg_file(i) <= rf_wdata;
reg_file(i) <= rd_i;
end if;
end if;
end process register_file;
end generate;

reg_file(0) <= (others => '0'); -- x0 is hardwired to zero
-- x0 is hardwired to zero --
reg_file(0) <= (others => '0');

-- synchronous read --
rf_read: process(clk_i)
begin
if rising_edge(clk_i) then
Expand All @@ -184,9 +170,10 @@ begin
rs3_read: process(clk_i)
begin
if rising_edge(clk_i) then
rs3_o <= reg_file(to_integer(unsigned(ctrl_i.rf_rs3(addr_bits_c-1 downto 0))));
rs3_o <= reg_file(to_integer(unsigned(rs3_addr(addr_bits_c-1 downto 0))));
end if;
end process rs3_read;
rs3_addr <= ctrl_i.ir_funct12(11 downto 7); -- RISC-V compliant
end generate;

rs3_disable:
Expand Down
Loading