diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle
index 415d408549b6..7d18b602eed5 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -820,6 +820,26 @@
             (dst_lo Reg (madd $I64 x_lo y_lo (zero_reg))))
         (value_regs dst_lo dst_hi)))
 
+;; Special cases where the upper bits are the sign- or zero-extension of the
+;; lower bits, so the calculation is much simpler: just a `umulh` or `smulh`
+;; instead of the chain of additions above.
+(rule (lower (has_type $I128 (imul (uextend x) (uextend y))))
+  (let (
+    (x Reg (put_in_reg_zext64 x))
+    (y Reg (put_in_reg_zext64 y))
+  )
+  (value_regs
+    (madd $I64 x y (zero_reg))
+    (umulh $I64 x y))))
+(rule (lower (has_type $I128 (imul (sextend x) (sextend y))))
+  (let (
+    (x Reg (put_in_reg_sext64 x))
+    (y Reg (put_in_reg_sext64 y))
+  )
+  (value_regs
+    (madd $I64 x y (zero_reg))
+    (smulh $I64 x y))))
+
 ;; Case for i8x16, i16x8, and i32x4.
 (rule -2 (lower (has_type (ty_vec128 ty @ (not_i64x2)) (imul x y)))
       (mul x y (vector_size ty)))
diff --git a/cranelift/filetests/filetests/isa/aarch64/i128.clif b/cranelift/filetests/filetests/isa/aarch64/i128.clif
new file mode 100644
index 000000000000..bee69e094474
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/aarch64/i128.clif
@@ -0,0 +1,48 @@
+test compile precise-output
+set enable_llvm_abi_extensions=true
+target aarch64
+
+function %mul_uextend_i64(i64, i64) -> i128 {
+block0(v0: i64, v1: i64):
+    v2 = uextend.i128 v0
+    v3 = uextend.i128 v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; VCode:
+; block0:
+;   madd x3, x0, x1, xzr
+;   umulh x1, x0, x1
+;   mov x0, x3
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   mul x3, x0, x1
+;   umulh x1, x0, x1
+;   mov x0, x3
+;   ret
+
+function %mul_sextend_i64(i64, i64) -> i128 {
+block0(v0: i64, v1: i64):
+    v2 = sextend.i128 v0
+    v3 = sextend.i128 v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; VCode:
+; block0:
+;   madd x3, x0, x1, xzr
+;   smulh x1, x0, x1
+;   mov x0, x3
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   mul x3, x0, x1
+;   smulh x1, x0, x1
+;   mov x0, x3
+;   ret
+
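
A note on why these two rules are sound: when both i128 operands are zero-extensions of 64-bit values, the full 128-bit product is exactly `umulh(x, y)` in the high 64 bits and a plain 64-bit multiply (the `madd ..., xzr`) in the low 64 bits; the signed case is the same with `smulh`, and the low half has the same bit pattern either way. Below is a minimal Rust sketch of that reference semantics, using `u128`/`i128` arithmetic as the oracle. The helper names `umul128` and `smul128` are ad-hoc for this illustration, not Cranelift APIs.

```rust
/// Reference model for the unsigned rule: the 128-bit product of two
/// zero-extended u64s splits into a low half (a plain wrapping 64-bit
/// multiply, what `madd x, y, xzr` computes) and a high half (what
/// AArch64's `umulh` computes).
fn umul128(x: u64, y: u64) -> (u64, u64) {
    let wide = (x as u128) * (y as u128); // cannot overflow u128
    (wide as u64, (wide >> 64) as u64) // (lo, hi) = (mul, umulh)
}

/// Signed analogue: the high half is what `smulh` computes; the low half
/// has the same bit pattern as the unsigned multiply, which is why the
/// lowering shares the same `madd ..., xzr` for it.
fn smul128(x: i64, y: i64) -> (u64, u64) {
    let wide = (x as i128) * (y as i128); // cannot overflow i128
    (wide as u64, ((wide >> 64) as i64) as u64) // (lo, hi) = (mul, smulh)
}

fn main() {
    // u64::MAX * 3 = 3 * 2^64 - 3 = 2 * 2^64 + (2^64 - 3)
    assert_eq!(umul128(u64::MAX, 3), (u64::MAX - 2, 2));
    // -2 * 3 = -6: low half is -6's bit pattern, high half sign-extends to all ones
    assert_eq!(smul128(-2, 3), ((-6i64) as u64, u64::MAX));
    println!("identities hold");
}
```

Incidentally, the VCode/disassembly mismatch in the expected test output is just an alias: `mul xd, xn, xm` is the architectural alias for `madd xd, xn, xm, xzr`, so the disassembler prints `mul` for the instruction VCode renders as `madd ..., xzr`.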