Skip to content

Commit

Permalink
[rtl] cleanup & rework/optimize CPU branch system (#735)
Browse files: browse the repository at this point in the history
  • Loading branch information
stnolting committed Nov 23, 2023
2 parents 8dfd2d9 + 987fbb7 commit 617c8bb
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 57 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ mimpid = 0x01040312 -> Version 01.04.03.12 -> v1.4.3.12

| Date (*dd.mm.yyyy*) | Version | Comment |
|:-------------------:|:-------:|:--------|
| 23.11.2023 | 1.9.1.5 | clean-up & rework CPU branch logic; [#735](https://github.com/stnolting/neorv32/pull/735) |
| 21.11.2023 | 1.9.1.4 | :bug: fix bug in handling of "misaligned instruction exception"; [#734](https://github.com/stnolting/neorv32/pull/734) |
| 20.11.2023 | 1.9.1.3 | :bug: fix wiring of FPU exception flags; [#733](https://github.com/stnolting/neorv32/pull/733) |
| 18.11.2023 | 1.9.1.2 | add XIP clock divider to fine-tune SPI frequency; [#731](https://github.com/stnolting/neorv32/pull/731) |
Expand Down
6 changes: 3 additions & 3 deletions rtl/core/neorv32_cpu.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ architecture neorv32_cpu_rtl of neorv32_cpu is
signal be_store : std_ulogic; -- bus error on store data access
signal fetch_pc : std_ulogic_vector(XLEN-1 downto 0); -- pc for instruction fetch
signal curr_pc : std_ulogic_vector(XLEN-1 downto 0); -- current pc (for currently executed instruction)
signal next_pc : std_ulogic_vector(XLEN-1 downto 0); -- next pc (for next executed instruction)
signal link_pc : std_ulogic_vector(XLEN-1 downto 0); -- link pc (return address)
signal pmp_ex_fault : std_ulogic; -- PMP instruction fetch fault
signal pmp_rw_fault : std_ulogic; -- PMP read/write access fault

Expand Down Expand Up @@ -217,7 +217,7 @@ begin
imm_o => imm, -- immediate
fetch_pc_o => fetch_pc, -- instruction fetch address
curr_pc_o => curr_pc, -- current PC (corresponding to current instruction)
next_pc_o => next_pc, -- next PC (corresponding to next instruction)
link_pc_o => link_pc, -- link PC (return address)
csr_rdata_o => csr_rdata, -- CSR read data
-- external CSR interface --
xcsr_we_o => xcsr_we, -- global write enable
Expand Down Expand Up @@ -268,7 +268,7 @@ begin
alu_i => alu_res, -- ALU result
mem_i => mem_rdata, -- memory read data
csr_i => csr_rdata, -- CSR read data
npc_i => next_pc, -- next PC
ret_i => link_pc, -- return address
-- data output --
rs1_o => rs1, -- rs1
rs2_o => rs2, -- rs2
Expand Down
106 changes: 57 additions & 49 deletions rtl/core/neorv32_cpu_control.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ entity neorv32_cpu_control is
imm_o : out std_ulogic_vector(XLEN-1 downto 0); -- immediate
fetch_pc_o : out std_ulogic_vector(XLEN-1 downto 0); -- instruction fetch address
curr_pc_o : out std_ulogic_vector(XLEN-1 downto 0); -- current PC (corresponding to current instruction)
next_pc_o : out std_ulogic_vector(XLEN-1 downto 0); -- next PC (corresponding to next instruction)
link_pc_o : out std_ulogic_vector(XLEN-1 downto 0); -- link PC (return address)
csr_rdata_o : out std_ulogic_vector(XLEN-1 downto 0); -- CSR read data
-- external CSR interface --
xcsr_we_o : out std_ulogic; -- global write enable
Expand Down Expand Up @@ -137,7 +137,7 @@ architecture neorv32_cpu_control_rtl of neorv32_cpu_control is
state_prev : fetch_engine_state_t;
restart : std_ulogic;
unaligned : std_ulogic;
pc : std_ulogic_vector(XLEN-1 downto 2); -- word-aligned
pc : std_ulogic_vector(XLEN-1 downto 0);
reset : std_ulogic;
resp : std_ulogic; -- bus response
end record;
Expand Down Expand Up @@ -201,6 +201,7 @@ architecture neorv32_cpu_control_rtl of neorv32_cpu_control is
pc_we : std_ulogic; -- PC update enabled
next_pc : std_ulogic_vector(XLEN-1 downto 0); -- next PC, corresponding to next instruction to be executed
next_pc_inc : std_ulogic_vector(XLEN-1 downto 0); -- increment to get next PC
link_pc : std_ulogic_vector(XLEN-1 downto 0); -- next PC for linking (return address)
end record;
signal execute_engine : execute_engine_t;

Expand Down Expand Up @@ -361,7 +362,7 @@ begin
fetch_engine.state_prev <= IF_RESTART;
fetch_engine.restart <= '1'; -- set to reset IPB
fetch_engine.unaligned <= '0';
fetch_engine.pc <= CPU_BOOT_ADDR(XLEN-1 downto 2); -- 32-bit aligned boot address
fetch_engine.pc <= CPU_BOOT_ADDR(XLEN-1 downto 2) & "00"; -- 32-bit aligned boot address
elsif rising_edge(clk_i) then
-- previous state (for HPMs only) --
fetch_engine.state_prev <= fetch_engine.state;
Expand All @@ -378,8 +379,8 @@ begin

when IF_RESTART => -- set new fetch start address
-- ------------------------------------------------------------
fetch_engine.pc <= execute_engine.pc(XLEN-1 downto 2); -- initialize with logical PC, word aligned
fetch_engine.unaligned <= execute_engine.pc(1);
fetch_engine.pc <= execute_engine.next_pc(XLEN-1 downto 2) & "00"; -- initialize with logical PC, word aligned
fetch_engine.unaligned <= execute_engine.next_pc(1);
fetch_engine.state <= IF_REQUEST;

when IF_REQUEST => -- request new 32-bit-aligned instruction word
Expand All @@ -391,7 +392,7 @@ begin
when IF_PENDING => -- wait for bus response and write instruction data to prefetch buffer
-- ------------------------------------------------------------
if (fetch_engine.resp = '1') then -- wait for bus response
fetch_engine.pc <= std_ulogic_vector(unsigned(fetch_engine.pc) + 1); -- next word
fetch_engine.pc <= std_ulogic_vector(unsigned(fetch_engine.pc) + 4); -- next word
fetch_engine.unaligned <= '0';
if (fetch_engine.restart = '1') or (fetch_engine.reset = '1') then -- restart request (fast) due to branch
fetch_engine.state <= IF_RESTART;
Expand All @@ -409,8 +410,8 @@ begin
end process fetch_engine_fsm;

-- PC output for instruction fetch --
bus_req_o.addr <= fetch_engine.pc & "00"; -- word aligned
fetch_pc_o <= fetch_engine.pc & "00"; -- word aligned
bus_req_o.addr <= fetch_engine.pc; -- word aligned
fetch_pc_o <= fetch_engine.pc; -- word aligned

-- instruction fetch (read) request if IPB not full --
bus_req_o.stb <= '1' when (fetch_engine.state = IF_REQUEST) and (ipb.free = "11") else '0';
Expand Down Expand Up @@ -504,7 +505,7 @@ begin
issue_engine.align <= '0'; -- start aligned after reset
elsif rising_edge(clk_i) then
if (fetch_engine.restart = '1') then
issue_engine.align <= execute_engine.pc(1); -- branch to unaligned address?
issue_engine.align <= execute_engine.next_pc(1); -- branch to unaligned address?
elsif (issue_engine.ack = '1') then
issue_engine.align <= (issue_engine.align and (not issue_engine.align_clr)) or issue_engine.align_set; -- "RS" flip-flop
end if;
Expand Down Expand Up @@ -606,10 +607,14 @@ begin
-- -------------------------------------------------------------------------------------------
branch_check: process(execute_engine.ir, cmp_i)
begin
if (execute_engine.ir(instr_funct3_msb_c) = '0') then -- beq / bne
execute_engine.branch_taken <= cmp_i(cmp_equal_c) xor execute_engine.ir(instr_funct3_lsb_c);
else -- blt(u) / bge(u)
execute_engine.branch_taken <= cmp_i(cmp_less_c) xor execute_engine.ir(instr_funct3_lsb_c);
if (execute_engine.ir(instr_opcode_lsb_c+2) = '0') then -- conditional branch
if (execute_engine.ir(instr_funct3_msb_c) = '0') then -- beq / bne
execute_engine.branch_taken <= cmp_i(cmp_equal_c) xor execute_engine.ir(instr_funct3_lsb_c);
else -- blt(u) / bge(u)
execute_engine.branch_taken <= cmp_i(cmp_less_c) xor execute_engine.ir(instr_funct3_lsb_c);
end if;
else -- unconditional branch
execute_engine.branch_taken <= '1';
end if;
end process branch_check;

Expand All @@ -627,30 +632,26 @@ begin
execute_engine.is_ci <= '0';
execute_engine.pc <= CPU_BOOT_ADDR(XLEN-1 downto 2) & "00"; -- 32-bit aligned boot address
execute_engine.next_pc <= CPU_BOOT_ADDR(XLEN-1 downto 2) & "00"; -- 32-bit aligned boot address
execute_engine.link_pc <= CPU_BOOT_ADDR(XLEN-1 downto 2) & "00"; -- 32-bit aligned boot address
elsif rising_edge(clk_i) then
-- control bus --
ctrl <= ctrl_nxt;

-- execute engine arbiter --
execute_engine.state <= execute_engine.state_nxt;
execute_engine.state_prev <= execute_engine.state; -- for HPMs only
execute_engine.state_prev2 <= execute_engine.state_prev; -- for HPMs only
execute_engine.state_prev <= execute_engine.state;
execute_engine.state_prev2 <= execute_engine.state_prev;
execute_engine.ir <= execute_engine.ir_nxt;
execute_engine.is_ci <= execute_engine.is_ci_nxt;

-- program counter (PC) --
-- current PC: address of instruction being executed --
if (execute_engine.pc_we = '1') then
if (execute_engine.state = BRANCH) then -- jump/taken-branch
if (alu_add_i(1) = '0') or (CPU_EXTENSION_RISCV_C = true) then -- update only if not misaligned
execute_engine.pc <= alu_add_i(XLEN-1 downto 1) & '0';
end if;
else -- new/next instruction address (address will always be properly aligned)
execute_engine.pc <= execute_engine.next_pc(XLEN-1 downto 1) & '0';
end if;
execute_engine.pc <= execute_engine.next_pc(XLEN-1 downto 1) & '0';
end if;

-- next PC --
-- next PC: address of next logical instruction --
case execute_engine.state is

when TRAP_ENTER => -- starting trap environment
if (trap_ctrl.cause(5) = '1') and (CPU_EXTENSION_RISCV_Sdext = true) then -- debug mode (re-)entry
execute_engine.next_pc <= CPU_DEBUG_PARK_ADDR(XLEN-1 downto 2) & "00"; -- debug mode enter; start at "parking loop" <normal_entry>
Expand All @@ -663,33 +664,46 @@ begin
execute_engine.next_pc <= csr.mtvec(XLEN-1 downto 2) & "00"; -- pc = mtvec
end if;
end if;

when TRAP_EXIT => -- leaving trap environment
if (debug_ctrl.running = '1') and (CPU_EXTENSION_RISCV_Sdext = true) then -- debug mode exit
execute_engine.next_pc <= csr.dpc(XLEN-1 downto 1) & '0';
else -- normal end of trap
execute_engine.next_pc <= csr.mepc(XLEN-1 downto 1) & '0';
end if;
when BRANCHED => -- control flow transfer
execute_engine.next_pc <= execute_engine.pc(XLEN-1 downto 1) & '0'; -- branch/jump destination

when BRANCH => -- control flow transfer
if (trap_ctrl.exc_buf(exc_illegal_c) = '0') and (execute_engine.branch_taken = '1') then -- valid taken branch
execute_engine.next_pc <= alu_add_i(XLEN-1 downto 1) & '0';
end if;

when EXECUTE => -- linear increment
execute_engine.next_pc <= std_ulogic_vector(unsigned(execute_engine.pc) + unsigned(execute_engine.next_pc_inc)); -- next linear PC
when others =>
execute_engine.next_pc <= std_ulogic_vector(unsigned(execute_engine.pc) + unsigned(execute_engine.next_pc_inc));

when others => -- no update
NULL;

end case;

-- link PC: return address --
if (execute_engine.state = BRANCH) then
execute_engine.link_pc <= execute_engine.next_pc(XLEN-1 downto 1) & '0';
end if;
end if;
end process execute_engine_fsm_sync;

-- check if branch destination is misaligned --
trap_ctrl.instr_ma <= '1' when (execute_engine.state = BRANCH) and (execute_engine.pc_we = '1') and
(alu_add_i(1) = '1') and (CPU_EXTENSION_RISCV_C = false) else '0';
trap_ctrl.instr_ma <= '1' when (execute_engine.state = BRANCH) and (trap_ctrl.exc_buf(exc_illegal_c) = '0') and -- valid branch instruction
(execute_engine.branch_taken = '1') and -- branch is taken
(alu_add_i(1) = '1') and (CPU_EXTENSION_RISCV_C = false) else '0'; -- misaligned destination

-- PC increment for next LINEAR instruction (+2 for compressed instr., +4 otherwise) --
execute_engine.next_pc_inc(XLEN-1 downto 4) <= (others => '0');
execute_engine.next_pc_inc(3 downto 0) <= x"4" when ((execute_engine.is_ci = '0') or (CPU_EXTENSION_RISCV_C = false)) else x"2";

-- PC output --
curr_pc_o <= execute_engine.pc(XLEN-1 downto 1) & '0'; -- current PC
next_pc_o <= execute_engine.next_pc(XLEN-1 downto 1) & '0'; -- next PC
link_pc_o <= execute_engine.link_pc(XLEN-1 downto 1) & '0'; -- return address


-- Decoding Helper Logic ------------------------------------------------------------------
Expand Down Expand Up @@ -853,17 +867,15 @@ begin

when DISPATCH => -- Wait for ISSUE ENGINE to emit valid instruction word
-- ------------------------------------------------------------
if (issue_engine.valid(0) = '1') or (issue_engine.valid(1) = '1') then -- new instruction word available
if (trap_ctrl.env_pending = '1') or (trap_ctrl.exc_fire = '1') then -- pending trap or pending exception (fast)
execute_engine.state_nxt <= TRAP_ENTER;
else -- normal execution
issue_engine.ack <= '1';
trap_ctrl.instr_be <= issue_engine.data(33); -- bus access fault during instruction fetch
execute_engine.is_ci_nxt <= issue_engine.data(32); -- this is a de-compressed instruction
execute_engine.ir_nxt <= issue_engine.data(31 downto 0); -- instruction word
execute_engine.pc_we <= '1'; -- pc <= next_pc
execute_engine.state_nxt <= EXECUTE;
end if;
if (trap_ctrl.env_pending = '1') or (trap_ctrl.exc_fire = '1') then -- pending trap or pending exception (fast)
execute_engine.state_nxt <= TRAP_ENTER;
elsif (issue_engine.valid(0) = '1') or (issue_engine.valid(1) = '1') then -- new instruction word available
issue_engine.ack <= '1';
trap_ctrl.instr_be <= issue_engine.data(33); -- bus access fault during instruction fetch
execute_engine.is_ci_nxt <= issue_engine.data(32); -- this is a de-compressed instruction
execute_engine.ir_nxt <= issue_engine.data(31 downto 0); -- instruction word
execute_engine.pc_we <= '1'; -- pc <= next_pc
execute_engine.state_nxt <= EXECUTE;
end if;

when TRAP_ENTER => -- Enter trap environment and jump to trap vector
Expand All @@ -881,7 +893,6 @@ begin
when RESTART => -- reset and restart instruction fetch at <next_pc>
-- ------------------------------------------------------------
fetch_engine.reset <= '1';
execute_engine.pc_we <= '1';
execute_engine.state_nxt <= BRANCHED;

when EXECUTE => -- Decode and execute instruction (control will be here for exactly 1 cycle in any case)
Expand Down Expand Up @@ -990,14 +1001,11 @@ begin
execute_engine.state_nxt <= RESTART; -- reset instruction fetch + IPB (only required for fence.i)
end if;

when BRANCH => -- update PC on taken branches and jumps
when BRANCH => -- update next_PC on taken branches and jumps
-- ------------------------------------------------------------
ctrl_nxt.rf_mux <= rf_mux_npc_c; -- return address = next PC
ctrl_nxt.rf_mux <= rf_mux_ret_c; -- return address = link PC
ctrl_nxt.rf_wb_en <= execute_engine.ir(instr_opcode_lsb_c+2); -- save return address if link operation (will not happen if misaligned)
if (trap_ctrl.exc_buf(exc_illegal_c) = '0') then -- update only if not illegal instruction
execute_engine.pc_we <= '1'; -- update PC with branch DST; will be overridden in DISPATCH if branch not taken
end if;
if (execute_engine.ir(instr_opcode_lsb_c+2) = '1') or (execute_engine.branch_taken = '1') then -- JAL[R] or taken branch
if (trap_ctrl.exc_buf(exc_illegal_c) = '0') and (execute_engine.branch_taken = '1') then -- valid taken branch
fetch_engine.reset <= '1'; -- reset instruction fetch to restart at modified PC
execute_engine.state_nxt <= BRANCHED;
else
Expand Down
6 changes: 3 additions & 3 deletions rtl/core/neorv32_cpu_regfile.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ entity neorv32_cpu_regfile is
alu_i : in std_ulogic_vector(XLEN-1 downto 0); -- ALU result
mem_i : in std_ulogic_vector(XLEN-1 downto 0); -- memory read data
csr_i : in std_ulogic_vector(XLEN-1 downto 0); -- CSR read data
npc_i : in std_ulogic_vector(XLEN-1 downto 0); -- next PC
ret_i : in std_ulogic_vector(XLEN-1 downto 0); -- link PC
-- data output --
rs1_o : out std_ulogic_vector(XLEN-1 downto 0); -- rs1
rs2_o : out std_ulogic_vector(XLEN-1 downto 0); -- rs2
Expand Down Expand Up @@ -96,13 +96,13 @@ begin

-- Data Write-Back Select -----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
wb_select: process(ctrl_i, alu_i, mem_i, csr_i, npc_i)
wb_select: process(ctrl_i, alu_i, mem_i, csr_i, ret_i)
begin
case ctrl_i.rf_mux is
when rf_mux_alu_c => rf_wdata <= alu_i; -- ALU result
when rf_mux_mem_c => rf_wdata <= mem_i; -- memory read data
when rf_mux_csr_c => rf_wdata <= csr_i; -- CSR read data
when rf_mux_npc_c => rf_wdata <= npc_i; -- next PC (return/link address)
when rf_mux_ret_c => rf_wdata <= ret_i; -- link PC (return address)
when others => rf_wdata <= alu_i; -- don't care
end case;
end process wb_select;
Expand Down
4 changes: 2 additions & 2 deletions rtl/core/neorv32_package.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ package neorv32_package is

-- Architecture Constants -----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01090104"; -- hardware version
constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01090105"; -- hardware version
constant archid_c : natural := 19; -- official RISC-V architecture ID
constant XLEN : natural := 32; -- native data path width, do not change!

Expand Down Expand Up @@ -601,7 +601,7 @@ package neorv32_package is
constant rf_mux_alu_c : std_ulogic_vector(1 downto 0) := "00"; -- register file <= alu result
constant rf_mux_mem_c : std_ulogic_vector(1 downto 0) := "01"; -- register file <= memory read data
constant rf_mux_csr_c : std_ulogic_vector(1 downto 0) := "10"; -- register file <= CSR read data
constant rf_mux_npc_c : std_ulogic_vector(1 downto 0) := "11"; -- register file <= next-PC (for branch-and-link)
constant rf_mux_ret_c : std_ulogic_vector(1 downto 0) := "11"; -- register file <= link-PC (return address)

-- Trap ID Codes --------------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
Expand Down

0 comments on commit 617c8bb

Please sign in to comment.