diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1462e9ec1..f634f0166 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@ defined by the `hw_version_c` constant in the main VHDL package file [`rtl/core/
 
 | Date (*dd.mm.yyyy*) | Version | Comment |
 |:----------:|:-------:|:--------|
+| 14.12.2021 | 1.6.4.9 | optimized CPU's multiplication/division co-processor: divisions are 1 cycle faster, fast-multiplications (when using DSPs) are 1 cycle faster, slightly less resource utilization, see [PR #240](https://github.com/stnolting/neorv32/pull/240) |
 | 11.12.2021 | 1.6.4.8 | watchdog: added new _DBEN_ and _HALF_ flags to control register (enable WDT during debugging, check timeout counter level), see [PR #239](https://github.com/stnolting/neorv32/pull/239) |
 | 10.12.2021 | 1.6.4.7 | optimized CPU's multiplication/division co-processor: all mul/div operations are 1 cycle faster + slightly less resource utilization, see [PR #238](https://github.com/stnolting/neorv32/pull/238) |
 | 08.12.2021 | 1.6.4.6 | :warning: reworked **Fast Interrupt Requests (FIRQ)** system, see [PR #236](https://github.com/stnolting/neorv32/pull/236) |
diff --git a/docs/datasheet/cpu.adoc b/docs/datasheet/cpu.adoc
index 80037fac3..e57462b9f 100644
--- a/docs/datasheet/cpu.adoc
+++ b/docs/datasheet/cpu.adoc
@@ -781,8 +781,8 @@ configurations are presented in <<_cpu_performance>>.
 | Memory access | `I/E` | `lb` `lh` `lw` `lbu` `lhu` `sb` `sh` `sw` | 4 + ML
 | Memory access | `C`   | `c.lw` `c.sw` `c.lwsp` `c.swsp`           | 4 + ML
 | Memory access | `A`   | `lr.w` `sc.w`                             | 4 + ML
-| Multiplication | `M`  | `mul` `mulh` `mulhsu` `mulhu` | 3+32+2; FAST_MULfootnote:[DSP-based multiplication; enabled via `FAST_MUL_EN`.]: 6
-| Division       | `M`  | `div` `divu` `rem` `remu`     | 3+32+2
+| Multiplication | `M`  | `mul` `mulh` `mulhsu` `mulhu` | 2+32+2; FAST_MULfootnote:[DSP-based multiplication; enabled via `FAST_MUL_EN`.]: 4
+| Division       | `M`  | `div` `divu` `rem` `remu`     | 2+32+2
 | CSR access | `Zicsr` | `csrrw` `csrrs` `csrrc` `csrrwi` `csrrsi` `csrrci` | 4
 | System | `I/E`+`Zicsr` | `ecall` `ebreak` | 4
 | System | `I/E` | `fence` | 3
diff --git a/rtl/core/neorv32_cpu_cp_muldiv.vhd b/rtl/core/neorv32_cpu_cp_muldiv.vhd
index 36ac9c550..4a3904e53 100644
--- a/rtl/core/neorv32_cpu_cp_muldiv.vhd
+++ b/rtl/core/neorv32_cpu_cp_muldiv.vhd
@@ -87,7 +87,6 @@ architecture neorv32_cpu_cp_muldiv_rtl of neorv32_cpu_cp_muldiv is
   signal start_div     : std_ulogic;
   signal start_mul     : std_ulogic;
   signal operation     : std_ulogic;
-  signal div_opx       : std_ulogic_vector(data_width_c-1 downto 0);
   signal div_opy       : std_ulogic_vector(data_width_c-1 downto 0);
   signal rs1_is_signed : std_ulogic;
   signal rs2_is_signed : std_ulogic;
@@ -110,7 +109,6 @@ architecture neorv32_cpu_cp_muldiv_rtl of neorv32_cpu_cp_muldiv is
   signal mul_p_sext     : std_ulogic;
   signal mul_op_x       : signed(32 downto 0); -- for using DSPs
   signal mul_op_y       : signed(32 downto 0); -- for using DSPs
-  signal mul_buf_ff     : signed(65 downto 0); -- for using DSPs
 
 begin
 
@@ -120,7 +118,6 @@ begin
   begin
     if (rstn_i = '0') then
       state        <= IDLE;
-      div_opx      <= (others => def_rst_val_c);
       div_opy      <= (others => def_rst_val_c);
       cnt          <= (others => def_rst_val_c);
       cp_op_ff     <= (others => def_rst_val_c);
@@ -140,50 +137,44 @@ begin
 
         when IDLE =>
           cp_op_ff <= cp_op;
+          cnt      <= "11110";
           if (start_i = '1') then
             if (operation = '1') and (DIVISION_EN = true) then -- division
-              cnt <= "11111";
-              state <= DIV_PREPROCESS;
-            else
-              cnt <= "11110";
-              state <= PROCESSING;
+              start_div <= '1';
+              state     <= DIV_PREPROCESS;
+            else -- multiplication
+              if (FAST_MUL_EN = true) then
+                valid_o <= '1';
+                state   <= FINALIZE;
+              else
+                state <= PROCESSING;
+              end if;
             end if;
           end if;
 
         when DIV_PREPROCESS =>
-          if (DIVISION_EN = true) then
-            -- check relevant input signs --
-            if (cp_op = cp_op_div_c) then -- result sign compensation for div?
-              div_res_corr <= rs1_i(rs1_i'left) xor rs2_i(rs2_i'left);
-            elsif (cp_op = cp_op_rem_c) then -- result sign compensation for rem?
-              div_res_corr <= rs1_i(rs1_i'left);
-            else
-              div_res_corr <= '0';
-            end if;
-            -- divide by zero? --
-            opy_is_zero <= not or_reduce_f(rs2_i); -- set if rs2 = 0
-            -- abs(rs1) --
-            if ((rs1_i(rs1_i'left) and rs1_is_signed) = '1') then -- signed division?
-              div_opx <= std_ulogic_vector(0 - unsigned(rs1_i)); -- make positive
-            else
-              div_opx <= rs1_i;
-            end if;
-            -- abs(rs2) --
-            if ((rs2_i(rs2_i'left) and rs2_is_signed) = '1') then -- signed division?
-              div_opy <= std_ulogic_vector(0 - unsigned(rs2_i)); -- make positive
-            else
-              div_opy <= rs2_i;
-            end if;
-            --
-            start_div <= '1';
-            state     <= PROCESSING;
+          -- check relevant input signs --
+          if (cp_op = cp_op_div_c) then -- result sign compensation for div?
+            div_res_corr <= rs1_i(rs1_i'left) xor rs2_i(rs2_i'left);
+          elsif (cp_op = cp_op_rem_c) then -- result sign compensation for rem?
+            div_res_corr <= rs1_i(rs1_i'left);
           else
-            state <= IDLE;
+            div_res_corr <= '0';
           end if;
+          -- divide by zero? --
+          opy_is_zero <= not or_reduce_f(rs2_i); -- set if rs2 = 0
+          -- abs(rs2) --
+          if ((rs2_i(rs2_i'left) and rs2_is_signed) = '1') then -- signed division?
+            div_opy <= std_ulogic_vector(0 - unsigned(rs2_i)); -- make positive
+          else
+            div_opy <= rs2_i;
+          end if;
+          --
+          state <= PROCESSING;
 
         when PROCESSING =>
           cnt <= std_ulogic_vector(unsigned(cnt) - 1);
-          if (cnt = "00000") or ((FAST_MUL_EN = true) and (operation = '0')) then
+          if (cnt = "00000") then
             valid_o <= '1';
             state   <= FINALIZE;
           end if;
@@ -235,18 +226,21 @@ begin
     end process multiplier_core;
   end generate;
 
-  -- parallel multiplication --
+  -- parallel multiplication (using DSP blocks) --
   multiplier_core_dsp:
   if (FAST_MUL_EN = true) generate
     multiplier_core: process(clk_i)
+      variable tmp_v : signed(65 downto 0);
     begin
       if rising_edge(clk_i) then
         if (start_mul = '1') then
           mul_op_x <= signed((rs1_i(rs1_i'left) and rs1_is_signed) & rs1_i);
           mul_op_y <= signed((rs2_i(rs2_i'left) and rs2_is_signed) & rs2_i);
         end if;
-        mul_buf_ff  <= mul_op_x * mul_op_y;
-        mul_product <= std_ulogic_vector(mul_buf_ff(63 downto 0)); -- let the register balancing do the magic here
+        tmp_v := mul_op_x * mul_op_y;
+        mul_product <= std_ulogic_vector(tmp_v(63 downto 0));
+        --mul_buf_ff  <= mul_op_x * mul_op_y;
+        --mul_product <= std_ulogic_vector(mul_buf_ff(63 downto 0)); -- let the register balancing do the magic here
       end if;
     end process multiplier_core;
   end generate;
@@ -282,7 +276,11 @@ begin
         remainder <= (others => def_rst_val_c);
       elsif rising_edge(clk_i) then
         if (start_div = '1') then -- start new division
-          quotient  <= div_opx;
+          if ((rs1_i(rs1_i'left) and rs1_is_signed) = '1') then -- signed division?
+            quotient <= std_ulogic_vector(0 - unsigned(rs1_i)); -- make positive
+          else
+            quotient <= rs1_i;
+          end if;
           remainder <= (others => '0');
         elsif (state = PROCESSING) or (state = FINALIZE) then -- running?
           quotient <= quotient(30 downto 0) & (not div_sub(32));
@@ -315,7 +313,7 @@ begin
 
   -- Data Output ----------------------------------------------------------------------------
   -- -------------------------------------------------------------------------------------------
-  operation_result: process(rstn_i, clk_i)
+  operation_result: process(out_en, cp_op_ff, mul_product, div_res, quotient, opy_is_zero, rs1_i, remainder)
   begin
     if (out_en = '1') then
       case cp_op_ff is
diff --git a/rtl/core/neorv32_package.vhd b/rtl/core/neorv32_package.vhd
index 0083c286a..176d099aa 100644
--- a/rtl/core/neorv32_package.vhd
+++ b/rtl/core/neorv32_package.vhd
@@ -64,7 +64,7 @@ package neorv32_package is
   -- Architecture Constants (do not modify!) ------------------------------------------------
   -- -------------------------------------------------------------------------------------------
   constant data_width_c : natural := 32; -- native data path width - do not change!
-  constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01060408"; -- no touchy!
+  constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01060409"; -- no touchy!
   constant archid_c     : natural := 19; -- official NEORV32 architecture ID - hands off!
 
   -- Check if we're inside the Matrix -------------------------------------------------------