Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster mul/div operations #240

Merged
merged 3 commits into from
Dec 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ defined by the `hw_version_c` constant in the main VHDL package file [`rtl/core/

| Date (*dd.mm.yyyy*) | Version | Comment |
|:----------:|:-------:|:--------|
| 14.12.2021 | 1.6.4.9 | optimized CPU's multiplication/division co-processor: divisions are 1 cycle faster, fast-multiplications (when using DSPs) are 1 cycle faster, slightly less resource utilization, see [PR #240](https://github.com/stnolting/neorv32/pull/240) |
| 11.12.2021 | 1.6.4.8 | watchdog: added new _DBEN_ and _HALF_ flags to control register (enable WDT during debugging, check timeout counter level), see [PR #239](https://github.com/stnolting/neorv32/pull/239) |
| 10.12.2021 | 1.6.4.7 | optimized CPU's multiplication/division co-processor: all mul/div operations are 1 cycle faster + slightly less resource utilization, see [PR #238](https://github.com/stnolting/neorv32/pull/238) |
| 08.12.2021 | 1.6.4.6 | :warning: reworked **Fast Interrupt Requests (FIRQ)** system, see [PR #236](https://github.com/stnolting/neorv32/pull/236) |
Expand Down
4 changes: 2 additions & 2 deletions docs/datasheet/cpu.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -781,8 +781,8 @@ configurations are presented in <<_cpu_performance>>.
| Memory access | `I/E` | `lb` `lh` `lw` `lbu` `lhu` `sb` `sh` `sw` | 4 + ML
| Memory access | `C` | `c.lw` `c.sw` `c.lwsp` `c.swsp` | 4 + ML
| Memory access | `A` | `lr.w` `sc.w` | 4 + ML
| Multiplication | `M` | `mul` `mulh` `mulhsu` `mulhu` | 3+32+2; FAST_MULfootnote:[DSP-based multiplication; enabled via `FAST_MUL_EN`.]: 6
| Division | `M` | `div` `divu` `rem` `remu` | 3+32+2
| Multiplication | `M` | `mul` `mulh` `mulhsu` `mulhu` | 2+32+2; FAST_MULfootnote:[DSP-based multiplication; enabled via `FAST_MUL_EN`.]: 4
| Division | `M` | `div` `divu` `rem` `remu` | 2+32+2
| CSR access | `Zicsr` | `csrrw` `csrrs` `csrrc` `csrrwi` `csrrsi` `csrrci` | 4
| System | `I/E`+`Zicsr` | `ecall` `ebreak` | 4
| System | `I/E` | `fence` | 3
Expand Down
80 changes: 39 additions & 41 deletions rtl/core/neorv32_cpu_cp_muldiv.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ architecture neorv32_cpu_cp_muldiv_rtl of neorv32_cpu_cp_muldiv is
signal start_div : std_ulogic;
signal start_mul : std_ulogic;
signal operation : std_ulogic;
signal div_opx : std_ulogic_vector(data_width_c-1 downto 0);
signal div_opy : std_ulogic_vector(data_width_c-1 downto 0);
signal rs1_is_signed : std_ulogic;
signal rs2_is_signed : std_ulogic;
Expand All @@ -110,7 +109,6 @@ architecture neorv32_cpu_cp_muldiv_rtl of neorv32_cpu_cp_muldiv is
signal mul_p_sext : std_ulogic;
signal mul_op_x : signed(32 downto 0); -- for using DSPs
signal mul_op_y : signed(32 downto 0); -- for using DSPs
signal mul_buf_ff : signed(65 downto 0); -- for using DSPs

begin

Expand All @@ -120,7 +118,6 @@ begin
begin
if (rstn_i = '0') then
state <= IDLE;
div_opx <= (others => def_rst_val_c);
div_opy <= (others => def_rst_val_c);
cnt <= (others => def_rst_val_c);
cp_op_ff <= (others => def_rst_val_c);
Expand All @@ -140,50 +137,44 @@ begin

when IDLE =>
cp_op_ff <= cp_op;
cnt <= "11110";
if (start_i = '1') then
if (operation = '1') and (DIVISION_EN = true) then -- division
cnt <= "11111";
state <= DIV_PREPROCESS;
else
cnt <= "11110";
state <= PROCESSING;
start_div <= '1';
state <= DIV_PREPROCESS;
else -- multiplication
if (FAST_MUL_EN = true) then
valid_o <= '1';
state <= FINALIZE;
else
state <= PROCESSING;
end if;
end if;
end if;

when DIV_PREPROCESS =>
if (DIVISION_EN = true) then
-- check relevant input signs --
if (cp_op = cp_op_div_c) then -- result sign compensation for div?
div_res_corr <= rs1_i(rs1_i'left) xor rs2_i(rs2_i'left);
elsif (cp_op = cp_op_rem_c) then -- result sign compensation for rem?
div_res_corr <= rs1_i(rs1_i'left);
else
div_res_corr <= '0';
end if;
-- divide by zero? --
opy_is_zero <= not or_reduce_f(rs2_i); -- set if rs2 = 0
-- abs(rs1) --
if ((rs1_i(rs1_i'left) and rs1_is_signed) = '1') then -- signed division?
div_opx <= std_ulogic_vector(0 - unsigned(rs1_i)); -- make positive
else
div_opx <= rs1_i;
end if;
-- abs(rs2) --
if ((rs2_i(rs2_i'left) and rs2_is_signed) = '1') then -- signed division?
div_opy <= std_ulogic_vector(0 - unsigned(rs2_i)); -- make positive
else
div_opy <= rs2_i;
end if;
--
start_div <= '1';
state <= PROCESSING;
-- check relevant input signs --
if (cp_op = cp_op_div_c) then -- result sign compensation for div?
div_res_corr <= rs1_i(rs1_i'left) xor rs2_i(rs2_i'left);
elsif (cp_op = cp_op_rem_c) then -- result sign compensation for rem?
div_res_corr <= rs1_i(rs1_i'left);
else
state <= IDLE;
div_res_corr <= '0';
end if;
-- divide by zero? --
opy_is_zero <= not or_reduce_f(rs2_i); -- set if rs2 = 0
-- abs(rs2) --
if ((rs2_i(rs2_i'left) and rs2_is_signed) = '1') then -- signed division?
div_opy <= std_ulogic_vector(0 - unsigned(rs2_i)); -- make positive
else
div_opy <= rs2_i;
end if;
--
state <= PROCESSING;

when PROCESSING =>
cnt <= std_ulogic_vector(unsigned(cnt) - 1);
if (cnt = "00000") or ((FAST_MUL_EN = true) and (operation = '0')) then
if (cnt = "00000") then
valid_o <= '1';
state <= FINALIZE;
end if;
Expand Down Expand Up @@ -235,18 +226,21 @@ begin
end process multiplier_core;
end generate;

-- parallel multiplication --
-- parallel multiplication (using DSP blocks) --
-- Parallel multiplier, elaborated only when the FAST_MUL_EN generic is true.
-- A clocked process (no reset branch) latches the two operands and registers
-- the lower 64 bits of their product into mul_product.
-- NOTE(review): this listing appears to come from a diff view and shows both the
-- mul_buf_ff-based and the tmp_v-based product statements - confirm which pair is live.
multiplier_core_dsp:
if (FAST_MUL_EN = true) generate
multiplier_core: process(clk_i)
-- full-width product of the two 33-bit signed operands
variable tmp_v : signed(65 downto 0);
begin
if rising_edge(clk_i) then
-- latch operands only when a multiplication is started
if (start_mul = '1') then
-- extend each operand by one bit: the extra MSB is the operand's own MSB when the
-- corresponding rsX_is_signed flag is set, otherwise '0'
mul_op_x <= signed((rs1_i(rs1_i'left) and rs1_is_signed) & rs1_i);
mul_op_y <= signed((rs2_i(rs2_i'left) and rs2_is_signed) & rs2_i);
end if;
-- product via an intermediate register (mul_buf_ff), then into mul_product
mul_buf_ff <= mul_op_x * mul_op_y;
mul_product <= std_ulogic_vector(mul_buf_ff(63 downto 0)); -- let the register balancing do the magic here
-- product via a process variable: no intermediate register between multiply and mul_product
tmp_v := mul_op_x * mul_op_y;
-- as written, this later assignment is the one that takes effect for mul_product
mul_product <= std_ulogic_vector(tmp_v(63 downto 0));
--mul_buf_ff <= mul_op_x * mul_op_y;
--mul_product <= std_ulogic_vector(mul_buf_ff(63 downto 0)); -- let the register balancing do the magic here
end if;
end process multiplier_core;
end generate;
Expand Down Expand Up @@ -282,7 +276,11 @@ begin
remainder <= (others => def_rst_val_c);
elsif rising_edge(clk_i) then
if (start_div = '1') then -- start new division
quotient <= div_opx;
if ((rs1_i(rs1_i'left) and rs1_is_signed) = '1') then -- signed division?
quotient <= std_ulogic_vector(0 - unsigned(rs1_i)); -- make positive
else
quotient <= rs1_i;
end if;
remainder <= (others => '0');
elsif (state = PROCESSING) or (state = FINALIZE) then -- running?
quotient <= quotient(30 downto 0) & (not div_sub(32));
Expand Down Expand Up @@ -315,7 +313,7 @@ begin

-- Data Output ----------------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
operation_result: process(rstn_i, clk_i)
operation_result: process(out_en, cp_op_ff, mul_product, div_res, quotient, opy_is_zero, rs1_i, remainder)
begin
if (out_en = '1') then
case cp_op_ff is
Expand Down
2 changes: 1 addition & 1 deletion rtl/core/neorv32_package.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ package neorv32_package is
-- Architecture Constants (do not modify!) ------------------------------------------------
-- -------------------------------------------------------------------------------------------
constant data_width_c : natural := 32; -- native data path width - do not change!
constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01060408"; -- no touchy!
constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01060409"; -- no touchy!
constant archid_c : natural := 19; -- official NEORV32 architecture ID - hands off!

-- Check if we're inside the Matrix -------------------------------------------------------
Expand Down