Skip to content

Commit

Permalink
Merge pull request #240 from stnolting/faster_muldiv
Browse files Browse the repository at this point in the history
Faster mul/div operations
  • Loading branch information
stnolting committed Dec 14, 2021
2 parents fcfec71 + 170eff3 commit 8fc1795
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 44 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ defined by the `hw_version_c` constant in the main VHDL package file [`rtl/core/

| Date (*dd.mm.yyyy*) | Version | Comment |
|:----------:|:-------:|:--------|
| 14.12.2021 | 1.6.4.9 | optimized CPU's multiplication/division co-processor: divisions are 1 cycle faster, fast-multiplications (when using DSPs) are 1 cycle faster, slightly less resource utilization, see [PR #240](https://github.com/stnolting/neorv32/pull/240) |
| 11.12.2021 | 1.6.4.8 | watchdog: added new _DBEN_ and _HALF_ flags to control register (enable WDT during debugging, check timeout counter level), see [PR #239](https://github.com/stnolting/neorv32/pull/239) |
| 10.12.2021 | 1.6.4.7 | optimized CPU's multiplication/division co-processor: all mul/div operations are 1 cycle faster + slightly less resource utilization, see [PR #238](https://github.com/stnolting/neorv32/pull/238) |
| 08.12.2021 | 1.6.4.6 | :warning: reworked **Fast Interrupt Requests (FIRQ)** system, see [PR #236](https://github.com/stnolting/neorv32/pull/236) |
Expand Down
4 changes: 2 additions & 2 deletions docs/datasheet/cpu.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -781,8 +781,8 @@ configurations are presented in <<_cpu_performance>>.
| Memory access | `I/E` | `lb` `lh` `lw` `lbu` `lhu` `sb` `sh` `sw` | 4 + ML
| Memory access | `C` | `c.lw` `c.sw` `c.lwsp` `c.swsp` | 4 + ML
| Memory access | `A` | `lr.w` `sc.w` | 4 + ML
| Multiplication | `M` | `mul` `mulh` `mulhsu` `mulhu` | 3+32+2; FAST_MULfootnote:[DSP-based multiplication; enabled via `FAST_MUL_EN`.]: 6
| Division | `M` | `div` `divu` `rem` `remu` | 3+32+2
| Multiplication | `M` | `mul` `mulh` `mulhsu` `mulhu` | 2+32+2; FAST_MULfootnote:[DSP-based multiplication; enabled via `FAST_MUL_EN`.]: 4
| Division | `M` | `div` `divu` `rem` `remu` | 2+32+2
| CSR access | `Zicsr` | `csrrw` `csrrs` `csrrc` `csrrwi` `csrrsi` `csrrci` | 4
| System | `I/E`+`Zicsr` | `ecall` `ebreak` | 4
| System | `I/E` | `fence` | 3
Expand Down
80 changes: 39 additions & 41 deletions rtl/core/neorv32_cpu_cp_muldiv.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ architecture neorv32_cpu_cp_muldiv_rtl of neorv32_cpu_cp_muldiv is
signal start_div : std_ulogic;
signal start_mul : std_ulogic;
signal operation : std_ulogic;
signal div_opx : std_ulogic_vector(data_width_c-1 downto 0);
signal div_opy : std_ulogic_vector(data_width_c-1 downto 0);
signal rs1_is_signed : std_ulogic;
signal rs2_is_signed : std_ulogic;
Expand All @@ -110,7 +109,6 @@ architecture neorv32_cpu_cp_muldiv_rtl of neorv32_cpu_cp_muldiv is
signal mul_p_sext : std_ulogic;
signal mul_op_x : signed(32 downto 0); -- for using DSPs
signal mul_op_y : signed(32 downto 0); -- for using DSPs
signal mul_buf_ff : signed(65 downto 0); -- for using DSPs

begin

Expand All @@ -120,7 +118,6 @@ begin
begin
if (rstn_i = '0') then
state <= IDLE;
div_opx <= (others => def_rst_val_c);
div_opy <= (others => def_rst_val_c);
cnt <= (others => def_rst_val_c);
cp_op_ff <= (others => def_rst_val_c);
Expand All @@ -140,50 +137,44 @@ begin

when IDLE =>
cp_op_ff <= cp_op;
cnt <= "11110";
if (start_i = '1') then
if (operation = '1') and (DIVISION_EN = true) then -- division
cnt <= "11111";
state <= DIV_PREPROCESS;
else
cnt <= "11110";
state <= PROCESSING;
start_div <= '1';
state <= DIV_PREPROCESS;
else -- multiplication
if (FAST_MUL_EN = true) then
valid_o <= '1';
state <= FINALIZE;
else
state <= PROCESSING;
end if;
end if;
end if;

when DIV_PREPROCESS =>
if (DIVISION_EN = true) then
-- check relevant input signs --
if (cp_op = cp_op_div_c) then -- result sign compensation for div?
div_res_corr <= rs1_i(rs1_i'left) xor rs2_i(rs2_i'left);
elsif (cp_op = cp_op_rem_c) then -- result sign compensation for rem?
div_res_corr <= rs1_i(rs1_i'left);
else
div_res_corr <= '0';
end if;
-- divide by zero? --
opy_is_zero <= not or_reduce_f(rs2_i); -- set if rs2 = 0
-- abs(rs1) --
if ((rs1_i(rs1_i'left) and rs1_is_signed) = '1') then -- signed division?
div_opx <= std_ulogic_vector(0 - unsigned(rs1_i)); -- make positive
else
div_opx <= rs1_i;
end if;
-- abs(rs2) --
if ((rs2_i(rs2_i'left) and rs2_is_signed) = '1') then -- signed division?
div_opy <= std_ulogic_vector(0 - unsigned(rs2_i)); -- make positive
else
div_opy <= rs2_i;
end if;
--
start_div <= '1';
state <= PROCESSING;
-- check relevant input signs --
if (cp_op = cp_op_div_c) then -- result sign compensation for div?
div_res_corr <= rs1_i(rs1_i'left) xor rs2_i(rs2_i'left);
elsif (cp_op = cp_op_rem_c) then -- result sign compensation for rem?
div_res_corr <= rs1_i(rs1_i'left);
else
state <= IDLE;
div_res_corr <= '0';
end if;
-- divide by zero? --
opy_is_zero <= not or_reduce_f(rs2_i); -- set if rs2 = 0
-- abs(rs2) --
if ((rs2_i(rs2_i'left) and rs2_is_signed) = '1') then -- signed division?
div_opy <= std_ulogic_vector(0 - unsigned(rs2_i)); -- make positive
else
div_opy <= rs2_i;
end if;
--
state <= PROCESSING;

when PROCESSING =>
cnt <= std_ulogic_vector(unsigned(cnt) - 1);
if (cnt = "00000") or ((FAST_MUL_EN = true) and (operation = '0')) then
if (cnt = "00000") then
valid_o <= '1';
state <= FINALIZE;
end if;
Expand Down Expand Up @@ -235,18 +226,21 @@ begin
end process multiplier_core;
end generate;

-- parallel multiplication --
-- parallel multiplication (using DSP blocks) --
multiplier_core_dsp:
if (FAST_MUL_EN = true) generate
multiplier_core: process(clk_i)
variable tmp_v : signed(65 downto 0);
begin
if rising_edge(clk_i) then
if (start_mul = '1') then
mul_op_x <= signed((rs1_i(rs1_i'left) and rs1_is_signed) & rs1_i);
mul_op_y <= signed((rs2_i(rs2_i'left) and rs2_is_signed) & rs2_i);
end if;
mul_buf_ff <= mul_op_x * mul_op_y;
mul_product <= std_ulogic_vector(mul_buf_ff(63 downto 0)); -- let the register balancing do the magic here
tmp_v := mul_op_x * mul_op_y;
mul_product <= std_ulogic_vector(tmp_v(63 downto 0));
--mul_buf_ff <= mul_op_x * mul_op_y;
--mul_product <= std_ulogic_vector(mul_buf_ff(63 downto 0)); -- let the register balancing do the magic here
end if;
end process multiplier_core;
end generate;
Expand Down Expand Up @@ -282,7 +276,11 @@ begin
remainder <= (others => def_rst_val_c);
elsif rising_edge(clk_i) then
if (start_div = '1') then -- start new division
quotient <= div_opx;
if ((rs1_i(rs1_i'left) and rs1_is_signed) = '1') then -- signed division?
quotient <= std_ulogic_vector(0 - unsigned(rs1_i)); -- make positive
else
quotient <= rs1_i;
end if;
remainder <= (others => '0');
elsif (state = PROCESSING) or (state = FINALIZE) then -- running?
quotient <= quotient(30 downto 0) & (not div_sub(32));
Expand Down Expand Up @@ -315,7 +313,7 @@ begin

-- Data Output ----------------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
operation_result: process(rstn_i, clk_i)
operation_result: process(out_en, cp_op_ff, mul_product, div_res, quotient, opy_is_zero, rs1_i, remainder)
begin
if (out_en = '1') then
case cp_op_ff is
Expand Down
2 changes: 1 addition & 1 deletion rtl/core/neorv32_package.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ package neorv32_package is
-- Architecture Constants (do not modify!) ------------------------------------------------
-- -------------------------------------------------------------------------------------------
constant data_width_c : natural := 32; -- native data path width - do not change!
constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01060408"; -- no touchy!
constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01060409"; -- no touchy!
constant archid_c : natural := 19; -- official NEORV32 architecture ID - hands off!

-- Check if we're inside the Matrix -------------------------------------------------------
Expand Down

0 comments on commit 8fc1795

Please sign in to comment.