x64: Add shuffle specialization for palignr #5999

Merged · 2 commits · Mar 13, 2023
8 changes: 4 additions & 4 deletions cranelift/codegen/src/isa/x64/inst.isle
@@ -3231,14 +3231,14 @@
       dst))
 
 ;; Helper for creating `palignr` instructions.
-(decl x64_palignr (Xmm XmmMem u8 OperandSize) Xmm)
-(rule 0 (x64_palignr src1 src2 imm size)
+(decl x64_palignr (Xmm XmmMem u8) Xmm)
+(rule 0 (x64_palignr src1 src2 imm)
   (xmm_rm_r_imm (SseOpcode.Palignr)
                 src1
                 src2
                 imm
-                size))
-(rule 1 (x64_palignr src1 src2 imm size)
+                (OperandSize.Size32)))

Member: Why this change? Why always 32?

Member: I guess it was always passed 32, but I'm still a little confused why it was like that.

Member Author: @abrown do you perhaps remember why palignr was always generated with OperandSize.Size32? I'll admit I don't fully understand how OperandSize maps to instructions all the time, but I naively figured that it could be "constant folded" into the palignr constructor. A double-check would be good.

Member Author: Ah, I remember now: this has to do with the REX flags when encoding, where a 64-bit size forces the W bit to be set and otherwise leaves W unset. I believe the encoding of the palignr instruction forces this to "unset", so for palignr it should always be a non-64-bit size.

Contributor: Yeah, I think that sounds right. The OperandSize isn't my invention and I recall having to work around it--I think things would be more clear if we hid OperandSize completely or alternately surfaced it as REX.W, which is more direct.

+(rule 1 (x64_palignr src1 src2 imm)
   (if-let $true (has_avx))
   (xmm_rmr_imm_vex (AvxOpcode.Vpalignr) src1 src2 imm))

26 changes: 18 additions & 8 deletions cranelift/codegen/src/isa/x64/lower.isle
@@ -894,10 +894,10 @@
         (swiden_high (and (value_type (multi_lane 8 16))
                           y)))))
   (let ((x1 Xmm x)
-        (x2 Xmm (x64_palignr x1 x1 8 (OperandSize.Size32)))
+        (x2 Xmm (x64_palignr x1 x1 8))
         (x3 Xmm (x64_pmovsxbw x2))
         (y1 Xmm y)
-        (y2 Xmm (x64_palignr y1 y1 8 (OperandSize.Size32)))
+        (y2 Xmm (x64_palignr y1 y1 8))
         (y3 Xmm (x64_pmovsxbw y2)))
     (x64_pmullw x3 y3)))

@@ -962,10 +962,10 @@
         (uwiden_high (and (value_type (multi_lane 8 16))
                           y)))))
   (let ((x1 Xmm x)
-        (x2 Xmm (x64_palignr x1 x1 8 (OperandSize.Size32)))
+        (x2 Xmm (x64_palignr x1 x1 8))
         (x3 Xmm (x64_pmovzxbw x2))
         (y1 Xmm y)
-        (y2 Xmm (x64_palignr y1 y1 8 (OperandSize.Size32)))
+        (y2 Xmm (x64_palignr y1 y1 8))
         (y3 Xmm (x64_pmovzxbw y2)))
     (x64_pmullw x3 y3)))

@@ -3284,11 +3284,11 @@

 (rule (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16))))
   (let ((x Xmm val))
-    (x64_pmovsxbw (x64_palignr x x 8 (OperandSize.Size32)))))
+    (x64_pmovsxbw (x64_palignr x x 8))))
 
 (rule (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8))))
   (let ((x Xmm val))
-    (x64_pmovsxwd (x64_palignr x x 8 (OperandSize.Size32)))))
+    (x64_pmovsxwd (x64_palignr x x 8))))
 
 (rule (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4))))
   (x64_pmovsxdq (x64_pshufd val 0xEE)))
@@ -3308,11 +3308,11 @@

 (rule (lower (has_type $I16X8 (uwiden_high val @ (value_type $I8X16))))
   (let ((x Xmm val))
-    (x64_pmovzxbw (x64_palignr x x 8 (OperandSize.Size32)))))
+    (x64_pmovzxbw (x64_palignr x x 8))))
 
 (rule (lower (has_type $I32X4 (uwiden_high val @ (value_type $I16X8))))
   (let ((x Xmm val))
-    (x64_pmovzxwd (x64_palignr x x 8 (OperandSize.Size32)))))
+    (x64_pmovzxwd (x64_palignr x x 8))))
 
 (rule (lower (has_type $I64X2 (uwiden_high val @ (value_type $I32X4))))
   (x64_pmovzxdq (x64_pshufd val 0xEE)))
@@ -3561,6 +3561,16 @@

 ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; When the shuffle looks like "concatenate `a` and `b` and shift right by `n`
+;; bytes", that's a `palignr` instruction. Note that the order of operands is
+;; swapped in the instruction here: `palignr` uses its second operand as the
+;; low-order bytes and its first operand as the high-order bytes, so put `a`
+;; second.
+(rule 13 (lower (shuffle a b (palignr_imm_from_immediate n)))
+      (x64_palignr b a n))
+(decl palignr_imm_from_immediate (u8) Immediate)
+(extern extractor palignr_imm_from_immediate palignr_imm_from_immediate)

 ;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit
 ;; integers within one value, preserving the other four 16-bit integers in that
 ;; value (either the high or low half). The complicated logic is in the
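The operand swap described in the comment above is easy to sanity-check with a small model of the instruction. This is a minimal sketch assuming only the Intel semantics of `palignr` (the helper name `palignr_model` is illustrative, not code from this PR):

/// Models `palignr dst, src, $n` on 16-byte vectors: concatenate `dst`
/// (high bytes) with `src` (low bytes), shift the 32-byte value right by
/// `n` bytes, and keep the low 16 bytes.
fn palignr_model(dst: [u8; 16], src: [u8; 16], n: usize) -> [u8; 16] {
    let mut concat = [0u8; 32];
    concat[..16].copy_from_slice(&src);
    concat[16..].copy_from_slice(&dst);
    std::array::from_fn(|i| if i + n < 32 { concat[i + n] } else { 0 })
}

fn main() {
    // `a` holds lane indices 0..15 and `b` holds 16..31, mirroring how
    // `shuffle` numbers the bytes of its two inputs.
    let a: [u8; 16] = std::array::from_fn(|i| i as u8);
    let b: [u8; 16] = std::array::from_fn(|i| 16 + i as u8);
    // `shuffle a, b, [5, 6, ..., 20]` selects bytes 5..21 of `a ++ b`.
    let expected: [u8; 16] = std::array::from_fn(|i| 5 + i as u8);
    // Passing `b` as the high operand and `a` as the low operand matches
    // `(x64_palignr b a n)` in the rule above.
    assert_eq!(palignr_model(b, a, 5), expected);
}

The model also shows why a start index of 16 is valid: `palignr $16` selects exactly the 16 high-order bytes, i.e. all of `b`, which the `%palignr_16` test below exercises.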
10 changes: 10 additions & 0 deletions cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -1117,6 +1117,16 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
             None
         }
     }
+
+    fn palignr_imm_from_immediate(&mut self, imm: Immediate) -> Option<u8> {
+        let bytes = self.lower_ctx.get_immediate_data(imm).as_slice();
+
+        if bytes.windows(2).all(|a| a[0] + 1 == a[1]) {
+            Some(bytes[0])
+        } else {
+            None
+        }
+    }
 }

 impl IsleContext<'_, '_, MInst, X64Backend> {
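The extractor succeeds exactly when the shuffle immediate's sixteen lanes are the consecutive byte indices n, n+1, ..., n+15 of the concatenated inputs, and it returns the window start n as the `palignr` shift. A standalone restatement of that check (`is_palignr_window` is an illustrative name, not the actual extractor):

fn is_palignr_window(lanes: &[u8]) -> Option<u8> {
    // Consecutive lane indices select one contiguous byte window of the
    // concatenated inputs, which is exactly what `palignr` computes.
    // Assumes a non-empty slice; `shuffle` always supplies 16 lanes.
    lanes.windows(2).all(|w| w[0] + 1 == w[1]).then(|| lanes[0])
}

fn main() {
    let window: Vec<u8> = (5..21).collect(); // lanes [5, 6, ..., 20]
    assert_eq!(is_palignr_window(&window), Some(5)); // lowers to `palignr $5`
    assert_eq!(is_palignr_window(&[0, 1, 2, 4]), None); // gap, so no match
}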
151 changes: 148 additions & 3 deletions cranelift/filetests/filetests/isa/x64/shuffle.clif
@@ -196,7 +196,7 @@ function %not_single_pshufd(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
  v2 = bitcast.i8x16 little v0
  v3 = bitcast.i8x16 little v1
-  v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
+  v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 20 21 22 23 20 21 22 23]
  v5 = bitcast.i32x4 little v4
  return v5
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
-; shufps $78, %xmm0, %xmm1, %xmm0
+; shufps $94, %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
-; shufps $0x4e, %xmm1, %xmm0
+; shufps $0x5e, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -644,3 +644,148 @@
; popq %rbp
; retq

function %palignr_0(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $0, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %palignr_1(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $1, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $1, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %palignr_5(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $5, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $5, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %palignr_11(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $11, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $0xb, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %palignr_16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $16, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $0x10, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

25 changes: 25 additions & 0 deletions cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif
@@ -1948,3 +1948,28 @@ block0(v0: i32x4):
; popq %rbp
; retq

function %palignr_11(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpalignr $11, %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpalignr $0xb, %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

16 changes: 13 additions & 3 deletions cranelift/filetests/filetests/runtests/simd-shuffle.clif
@@ -73,7 +73,7 @@ block0(v0: i32x4, v1: i32x4):
v5 = bitcast.i32x4 little v4
return v5
}
-; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [4 2 3 1]
+; run: %pshufd_3120([1 2 3 4], [5 6 7 8]) == [4 2 3 1]

function %pshufd_7546(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
…
v5 = bitcast.i32x4 little v4
return v5
}
-; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [8 6 5 7]
+; run: %pshufd_7546([1 2 3 4], [5 6 7 8]) == [8 6 5 7]

function %not_pshufd(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
…
v5 = bitcast.i32x4 little v4
return v5
}
-; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [3 4 5 6]
+; run: %not_pshufd([1 2 3 4], [5 6 7 8]) == [3 4 5 6]

function %not_pshufd2(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = bitcast.i8x16 little v0
v3 = bitcast.i8x16 little v1
v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 20 21 22 23 20 21 22 23]
v5 = bitcast.i32x4 little v4
return v5
}
; run: %not_pshufd2([1 2 3 4], [5 6 7 8]) == [3 4 6 6]

function %punpckldq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):