Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[rtl] cleanup & rework/optimize CPU branch system #735

Merged
merged 3 commits into from
Nov 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ mimpid = 0x01040312 -> Version 01.04.03.12 -> v1.4.3.12

| Date (*dd.mm.yyyy*) | Version | Comment |
|:-------------------:|:-------:|:--------|
| 23.11.2023 | 1.9.1.5 | clean-up & rework CPU branch logic; [#735](https://github.com/stnolting/neorv32/pull/735) |
| 21.11.2023 | 1.9.1.4 | :bug: fix bug in handling of "misaligned instruction exception"; [#734](https://github.com/stnolting/neorv32/pull/734) |
| 20.11.2023 | 1.9.1.3 | :bug: fix wiring of FPU exception flags; [#733](https://github.com/stnolting/neorv32/pull/733) |
| 18.11.2023 | 1.9.1.2 | add XIP clock divider to fine-tune SPI frequency; [#731](https://github.com/stnolting/neorv32/pull/731) |
Expand Down
6 changes: 3 additions & 3 deletions rtl/core/neorv32_cpu.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ architecture neorv32_cpu_rtl of neorv32_cpu is
signal be_store : std_ulogic; -- bus error on store data access
signal fetch_pc : std_ulogic_vector(XLEN-1 downto 0); -- pc for instruction fetch
signal curr_pc : std_ulogic_vector(XLEN-1 downto 0); -- current pc (for currently executed instruction)
signal next_pc : std_ulogic_vector(XLEN-1 downto 0); -- next pc (for next executed instruction)
signal link_pc : std_ulogic_vector(XLEN-1 downto 0); -- link pc (return address)
signal pmp_ex_fault : std_ulogic; -- PMP instruction fetch fault
signal pmp_rw_fault : std_ulogic; -- PMP read/write access fault

Expand Down Expand Up @@ -217,7 +217,7 @@ begin
imm_o => imm, -- immediate
fetch_pc_o => fetch_pc, -- instruction fetch address
curr_pc_o => curr_pc, -- current PC (corresponding to current instruction)
next_pc_o => next_pc, -- next PC (corresponding to next instruction)
link_pc_o => link_pc, -- link PC (return address)
csr_rdata_o => csr_rdata, -- CSR read data
-- external CSR interface --
xcsr_we_o => xcsr_we, -- global write enable
Expand Down Expand Up @@ -268,7 +268,7 @@ begin
alu_i => alu_res, -- ALU result
mem_i => mem_rdata, -- memory read data
csr_i => csr_rdata, -- CSR read data
npc_i => next_pc, -- next PC
ret_i => link_pc, -- return address
-- data output --
rs1_o => rs1, -- rs1
rs2_o => rs2, -- rs2
Expand Down
106 changes: 57 additions & 49 deletions rtl/core/neorv32_cpu_control.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ entity neorv32_cpu_control is
imm_o : out std_ulogic_vector(XLEN-1 downto 0); -- immediate
fetch_pc_o : out std_ulogic_vector(XLEN-1 downto 0); -- instruction fetch address
curr_pc_o : out std_ulogic_vector(XLEN-1 downto 0); -- current PC (corresponding to current instruction)
next_pc_o : out std_ulogic_vector(XLEN-1 downto 0); -- next PC (corresponding to next instruction)
link_pc_o : out std_ulogic_vector(XLEN-1 downto 0); -- link PC (return address)
csr_rdata_o : out std_ulogic_vector(XLEN-1 downto 0); -- CSR read data
-- external CSR interface --
xcsr_we_o : out std_ulogic; -- global write enable
Expand Down Expand Up @@ -137,7 +137,7 @@ architecture neorv32_cpu_control_rtl of neorv32_cpu_control is
state_prev : fetch_engine_state_t;
restart : std_ulogic;
unaligned : std_ulogic;
pc : std_ulogic_vector(XLEN-1 downto 2); -- word-aligned
pc : std_ulogic_vector(XLEN-1 downto 0);
reset : std_ulogic;
resp : std_ulogic; -- bus response
end record;
Expand Down Expand Up @@ -201,6 +201,7 @@ architecture neorv32_cpu_control_rtl of neorv32_cpu_control is
pc_we : std_ulogic; -- PC update enabled
next_pc : std_ulogic_vector(XLEN-1 downto 0); -- next PC, corresponding to next instruction to be executed
next_pc_inc : std_ulogic_vector(XLEN-1 downto 0); -- increment to get next PC
link_pc : std_ulogic_vector(XLEN-1 downto 0); -- next PC for linking (return address)
end record;
signal execute_engine : execute_engine_t;

Expand Down Expand Up @@ -361,7 +362,7 @@ begin
fetch_engine.state_prev <= IF_RESTART;
fetch_engine.restart <= '1'; -- set to reset IPB
fetch_engine.unaligned <= '0';
fetch_engine.pc <= CPU_BOOT_ADDR(XLEN-1 downto 2); -- 32-bit aligned boot address
fetch_engine.pc <= CPU_BOOT_ADDR(XLEN-1 downto 2) & "00"; -- 32-bit aligned boot address
elsif rising_edge(clk_i) then
-- previous state (for HPMs only) --
fetch_engine.state_prev <= fetch_engine.state;
Expand All @@ -378,8 +379,8 @@ begin

when IF_RESTART => -- set new fetch start address
-- ------------------------------------------------------------
fetch_engine.pc <= execute_engine.pc(XLEN-1 downto 2); -- initialize with logical PC, word aligned
fetch_engine.unaligned <= execute_engine.pc(1);
fetch_engine.pc <= execute_engine.next_pc(XLEN-1 downto 2) & "00"; -- initialize with logical PC, word aligned
fetch_engine.unaligned <= execute_engine.next_pc(1);
fetch_engine.state <= IF_REQUEST;

when IF_REQUEST => -- request new 32-bit-aligned instruction word
Expand All @@ -391,7 +392,7 @@ begin
when IF_PENDING => -- wait for bus response and write instruction data to prefetch buffer
-- ------------------------------------------------------------
if (fetch_engine.resp = '1') then -- wait for bus response
fetch_engine.pc <= std_ulogic_vector(unsigned(fetch_engine.pc) + 1); -- next word
fetch_engine.pc <= std_ulogic_vector(unsigned(fetch_engine.pc) + 4); -- next word
fetch_engine.unaligned <= '0';
if (fetch_engine.restart = '1') or (fetch_engine.reset = '1') then -- restart request (fast) due to branch
fetch_engine.state <= IF_RESTART;
Expand All @@ -409,8 +410,8 @@ begin
end process fetch_engine_fsm;

-- PC output for instruction fetch --
bus_req_o.addr <= fetch_engine.pc & "00"; -- word aligned
fetch_pc_o <= fetch_engine.pc & "00"; -- word aligned
bus_req_o.addr <= fetch_engine.pc; -- word aligned
fetch_pc_o <= fetch_engine.pc; -- word aligned

-- instruction fetch (read) request if IPB not full --
bus_req_o.stb <= '1' when (fetch_engine.state = IF_REQUEST) and (ipb.free = "11") else '0';
Expand Down Expand Up @@ -504,7 +505,7 @@ begin
issue_engine.align <= '0'; -- start aligned after reset
elsif rising_edge(clk_i) then
if (fetch_engine.restart = '1') then
issue_engine.align <= execute_engine.pc(1); -- branch to unaligned address?
issue_engine.align <= execute_engine.next_pc(1); -- branch to unaligned address?
elsif (issue_engine.ack = '1') then
issue_engine.align <= (issue_engine.align and (not issue_engine.align_clr)) or issue_engine.align_set; -- "RS" flip-flop
end if;
Expand Down Expand Up @@ -606,10 +607,14 @@ begin
-- -------------------------------------------------------------------------------------------
branch_check: process(execute_engine.ir, cmp_i)
begin
if (execute_engine.ir(instr_funct3_msb_c) = '0') then -- beq / bne
execute_engine.branch_taken <= cmp_i(cmp_equal_c) xor execute_engine.ir(instr_funct3_lsb_c);
else -- blt(u) / bge(u)
execute_engine.branch_taken <= cmp_i(cmp_less_c) xor execute_engine.ir(instr_funct3_lsb_c);
if (execute_engine.ir(instr_opcode_lsb_c+2) = '0') then -- conditional branch
if (execute_engine.ir(instr_funct3_msb_c) = '0') then -- beq / bne
execute_engine.branch_taken <= cmp_i(cmp_equal_c) xor execute_engine.ir(instr_funct3_lsb_c);
else -- blt(u) / bge(u)
execute_engine.branch_taken <= cmp_i(cmp_less_c) xor execute_engine.ir(instr_funct3_lsb_c);
end if;
else -- unconditional branch
execute_engine.branch_taken <= '1';
end if;
end process branch_check;

Expand All @@ -627,30 +632,26 @@ begin
execute_engine.is_ci <= '0';
execute_engine.pc <= CPU_BOOT_ADDR(XLEN-1 downto 2) & "00"; -- 32-bit aligned boot address
execute_engine.next_pc <= CPU_BOOT_ADDR(XLEN-1 downto 2) & "00"; -- 32-bit aligned boot address
execute_engine.link_pc <= CPU_BOOT_ADDR(XLEN-1 downto 2) & "00"; -- 32-bit aligned boot address
elsif rising_edge(clk_i) then
-- control bus --
ctrl <= ctrl_nxt;

-- execute engine arbiter --
execute_engine.state <= execute_engine.state_nxt;
execute_engine.state_prev <= execute_engine.state; -- for HPMs only
execute_engine.state_prev2 <= execute_engine.state_prev; -- for HPMs only
execute_engine.state_prev <= execute_engine.state;
execute_engine.state_prev2 <= execute_engine.state_prev;
execute_engine.ir <= execute_engine.ir_nxt;
execute_engine.is_ci <= execute_engine.is_ci_nxt;

-- program counter (PC) --
-- current PC: address of instruction being executed --
if (execute_engine.pc_we = '1') then
if (execute_engine.state = BRANCH) then -- jump/taken-branch
if (alu_add_i(1) = '0') or (CPU_EXTENSION_RISCV_C = true) then -- update only if not misaligned
execute_engine.pc <= alu_add_i(XLEN-1 downto 1) & '0';
end if;
else -- new/next instruction address (address will always be properly aligned)
execute_engine.pc <= execute_engine.next_pc(XLEN-1 downto 1) & '0';
end if;
execute_engine.pc <= execute_engine.next_pc(XLEN-1 downto 1) & '0';
end if;

-- next PC --
-- next PC: address of next logical instruction --
case execute_engine.state is

when TRAP_ENTER => -- starting trap environment
if (trap_ctrl.cause(5) = '1') and (CPU_EXTENSION_RISCV_Sdext = true) then -- debug mode (re-)entry
execute_engine.next_pc <= CPU_DEBUG_PARK_ADDR(XLEN-1 downto 2) & "00"; -- debug mode enter; start at "parking loop" <normal_entry>
Expand All @@ -663,33 +664,46 @@ begin
execute_engine.next_pc <= csr.mtvec(XLEN-1 downto 2) & "00"; -- pc = mtvec
end if;
end if;

when TRAP_EXIT => -- leaving trap environment
if (debug_ctrl.running = '1') and (CPU_EXTENSION_RISCV_Sdext = true) then -- debug mode exit
execute_engine.next_pc <= csr.dpc(XLEN-1 downto 1) & '0';
else -- normal end of trap
execute_engine.next_pc <= csr.mepc(XLEN-1 downto 1) & '0';
end if;
when BRANCHED => -- control flow transfer
execute_engine.next_pc <= execute_engine.pc(XLEN-1 downto 1) & '0'; -- branch/jump destination

when BRANCH => -- control flow transfer
if (trap_ctrl.exc_buf(exc_illegal_c) = '0') and (execute_engine.branch_taken = '1') then -- valid taken branch
execute_engine.next_pc <= alu_add_i(XLEN-1 downto 1) & '0';
end if;

when EXECUTE => -- linear increment
execute_engine.next_pc <= std_ulogic_vector(unsigned(execute_engine.pc) + unsigned(execute_engine.next_pc_inc)); -- next linear PC
when others =>
execute_engine.next_pc <= std_ulogic_vector(unsigned(execute_engine.pc) + unsigned(execute_engine.next_pc_inc));

when others => -- no update
NULL;

end case;

-- link PC: return address --
if (execute_engine.state = BRANCH) then
execute_engine.link_pc <= execute_engine.next_pc(XLEN-1 downto 1) & '0';
end if;
end if;
end process execute_engine_fsm_sync;

-- check if branch destination is misaligned --
trap_ctrl.instr_ma <= '1' when (execute_engine.state = BRANCH) and (execute_engine.pc_we = '1') and
(alu_add_i(1) = '1') and (CPU_EXTENSION_RISCV_C = false) else '0';
trap_ctrl.instr_ma <= '1' when (execute_engine.state = BRANCH) and (trap_ctrl.exc_buf(exc_illegal_c) = '0') and -- valid branch instruction
(execute_engine.branch_taken = '1') and -- branch is taken
(alu_add_i(1) = '1') and (CPU_EXTENSION_RISCV_C = false) else '0'; -- misaligned destination

-- PC increment for next LINEAR instruction (+2 for compressed instr., +4 otherwise) --
execute_engine.next_pc_inc(XLEN-1 downto 4) <= (others => '0');
execute_engine.next_pc_inc(3 downto 0) <= x"4" when ((execute_engine.is_ci = '0') or (CPU_EXTENSION_RISCV_C = false)) else x"2";

-- PC output --
curr_pc_o <= execute_engine.pc(XLEN-1 downto 1) & '0'; -- current PC
next_pc_o <= execute_engine.next_pc(XLEN-1 downto 1) & '0'; -- next PC
link_pc_o <= execute_engine.link_pc(XLEN-1 downto 1) & '0'; -- return address


-- Decoding Helper Logic ------------------------------------------------------------------
Expand Down Expand Up @@ -853,17 +867,15 @@ begin

when DISPATCH => -- Wait for ISSUE ENGINE to emit valid instruction word
-- ------------------------------------------------------------
if (issue_engine.valid(0) = '1') or (issue_engine.valid(1) = '1') then -- new instruction word available
if (trap_ctrl.env_pending = '1') or (trap_ctrl.exc_fire = '1') then -- pending trap or pending exception (fast)
execute_engine.state_nxt <= TRAP_ENTER;
else -- normal execution
issue_engine.ack <= '1';
trap_ctrl.instr_be <= issue_engine.data(33); -- bus access fault during instruction fetch
execute_engine.is_ci_nxt <= issue_engine.data(32); -- this is a de-compressed instruction
execute_engine.ir_nxt <= issue_engine.data(31 downto 0); -- instruction word
execute_engine.pc_we <= '1'; -- pc <= next_pc
execute_engine.state_nxt <= EXECUTE;
end if;
if (trap_ctrl.env_pending = '1') or (trap_ctrl.exc_fire = '1') then -- pending trap or pending exception (fast)
execute_engine.state_nxt <= TRAP_ENTER;
elsif (issue_engine.valid(0) = '1') or (issue_engine.valid(1) = '1') then -- new instruction word available
issue_engine.ack <= '1';
trap_ctrl.instr_be <= issue_engine.data(33); -- bus access fault during instruction fetch
execute_engine.is_ci_nxt <= issue_engine.data(32); -- this is a de-compressed instruction
execute_engine.ir_nxt <= issue_engine.data(31 downto 0); -- instruction word
execute_engine.pc_we <= '1'; -- pc <= next_pc
execute_engine.state_nxt <= EXECUTE;
end if;

when TRAP_ENTER => -- Enter trap environment and jump to trap vector
Expand All @@ -881,7 +893,6 @@ begin
when RESTART => -- reset and restart instruction fetch at <next_pc>
-- ------------------------------------------------------------
fetch_engine.reset <= '1';
execute_engine.pc_we <= '1';
execute_engine.state_nxt <= BRANCHED;

when EXECUTE => -- Decode and execute instruction (control will be here for exactly 1 cycle in any case)
Expand Down Expand Up @@ -990,14 +1001,11 @@ begin
execute_engine.state_nxt <= RESTART; -- reset instruction fetch + IPB (only required for fence.i)
end if;

when BRANCH => -- update PC on taken branches and jumps
when BRANCH => -- update next_PC on taken branches and jumps
-- ------------------------------------------------------------
ctrl_nxt.rf_mux <= rf_mux_npc_c; -- return address = next PC
ctrl_nxt.rf_mux <= rf_mux_ret_c; -- return address = link PC
ctrl_nxt.rf_wb_en <= execute_engine.ir(instr_opcode_lsb_c+2); -- save return address if link operation (will not happen if misaligned)
if (trap_ctrl.exc_buf(exc_illegal_c) = '0') then -- update only if not illegal instruction
execute_engine.pc_we <= '1'; -- update PC with branch DST; will be overridden in DISPATCH if branch not taken
end if;
if (execute_engine.ir(instr_opcode_lsb_c+2) = '1') or (execute_engine.branch_taken = '1') then -- JAL[R] or taken branch
if (trap_ctrl.exc_buf(exc_illegal_c) = '0') and (execute_engine.branch_taken = '1') then -- valid taken branch
fetch_engine.reset <= '1'; -- reset instruction fetch to restart at modified PC
execute_engine.state_nxt <= BRANCHED;
else
Expand Down
6 changes: 3 additions & 3 deletions rtl/core/neorv32_cpu_regfile.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ entity neorv32_cpu_regfile is
alu_i : in std_ulogic_vector(XLEN-1 downto 0); -- ALU result
mem_i : in std_ulogic_vector(XLEN-1 downto 0); -- memory read data
csr_i : in std_ulogic_vector(XLEN-1 downto 0); -- CSR read data
npc_i : in std_ulogic_vector(XLEN-1 downto 0); -- next PC
ret_i : in std_ulogic_vector(XLEN-1 downto 0); -- link PC
-- data output --
rs1_o : out std_ulogic_vector(XLEN-1 downto 0); -- rs1
rs2_o : out std_ulogic_vector(XLEN-1 downto 0); -- rs2
Expand Down Expand Up @@ -96,13 +96,13 @@ begin

-- Data Write-Back Select -----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
wb_select: process(ctrl_i, alu_i, mem_i, csr_i, npc_i)
wb_select: process(ctrl_i, alu_i, mem_i, csr_i, ret_i)
begin
case ctrl_i.rf_mux is
when rf_mux_alu_c => rf_wdata <= alu_i; -- ALU result
when rf_mux_mem_c => rf_wdata <= mem_i; -- memory read data
when rf_mux_csr_c => rf_wdata <= csr_i; -- CSR read data
when rf_mux_npc_c => rf_wdata <= npc_i; -- next PC (return/link address)
when rf_mux_ret_c => rf_wdata <= ret_i; -- link PC (return address)
when others => rf_wdata <= alu_i; -- don't care
end case;
end process wb_select;
Expand Down
4 changes: 2 additions & 2 deletions rtl/core/neorv32_package.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ package neorv32_package is

-- Architecture Constants -----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01090104"; -- hardware version
constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01090105"; -- hardware version
constant archid_c : natural := 19; -- official RISC-V architecture ID
constant XLEN : natural := 32; -- native data path width, do not change!

Expand Down Expand Up @@ -601,7 +601,7 @@ package neorv32_package is
constant rf_mux_alu_c : std_ulogic_vector(1 downto 0) := "00"; -- register file <= alu result
constant rf_mux_mem_c : std_ulogic_vector(1 downto 0) := "01"; -- register file <= memory read data
constant rf_mux_csr_c : std_ulogic_vector(1 downto 0) := "10"; -- register file <= CSR read data
constant rf_mux_npc_c : std_ulogic_vector(1 downto 0) := "11"; -- register file <= next-PC (for branch-and-link)
constant rf_mux_ret_c : std_ulogic_vector(1 downto 0) := "11"; -- register file <= link-PC (return address)

-- Trap ID Codes --------------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
Expand Down