
x64: Add `shuffle` specialization for `palignr` (#5999)

* x64: Add `shuffle` specialization for `palignr`

This commit adds specializations for the `palignr` instruction to the
x64 backend, covering a few more patterns of byte shuffles.
* Fix tests
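
For orientation, here is a rough sketch of the pattern being specialized. It is
not code from this patch, and the helper name `matches_palignr` is made up: a
16-byte shuffle immediate whose indices count up by one from some starting byte
`n` selects 16 contiguous bytes out of the two concatenated inputs, and `n`
becomes the `palignr` immediate.

// Illustrative only: the shape of shuffle immediate the new lowering rule
// recognizes. A mask that counts up by one from a starting byte `n` asks for
// 16 contiguous bytes of "a then b", which `palignr` produces directly.
fn matches_palignr(mask: &[u8; 16]) -> Option<u8> {
    if mask.windows(2).all(|w| w[0] + 1 == w[1]) {
        Some(mask[0])
    } else {
        None
    }
}

fn main() {
    // `shuffle v0, v1, [11 12 .. 26]` from the new tests matches with imm 11.
    let run: [u8; 16] = core::array::from_fn(|i| 11 + i as u8);
    assert_eq!(matches_palignr(&run), Some(11));

    // The updated `%not_single_pshufd` mask is not a consecutive run, so it
    // is rejected and keeps exercising the `shufps` lowering instead.
    let other = [8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 20, 21, 22, 23];
    assert_eq!(matches_palignr(&other), None);
}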
Alex Crichton (committed by GitHub), commit e2a6fe99c2
Changed files:
  1. cranelift/codegen/src/isa/x64/inst.isle (8 changed lines)
  2. cranelift/codegen/src/isa/x64/lower.isle (26 changed lines)
  3. cranelift/codegen/src/isa/x64/lower/isle.rs (10 changed lines)
  4. cranelift/filetests/filetests/isa/x64/shuffle.clif (151 changed lines)
  5. cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif (25 changed lines)
  6. cranelift/filetests/filetests/runtests/simd-shuffle.clif (16 changed lines)

cranelift/codegen/src/isa/x64/inst.isle (8 changed lines)

@@ -3231,14 +3231,14 @@
       dst))
 
 ;; Helper for creating `palignr` instructions.
-(decl x64_palignr (Xmm XmmMem u8 OperandSize) Xmm)
-(rule 0 (x64_palignr src1 src2 imm size)
+(decl x64_palignr (Xmm XmmMem u8) Xmm)
+(rule 0 (x64_palignr src1 src2 imm)
       (xmm_rm_r_imm (SseOpcode.Palignr)
                     src1
                     src2
                     imm
-                    size))
-(rule 1 (x64_palignr src1 src2 imm size)
+                    (OperandSize.Size32)))
+(rule 1 (x64_palignr src1 src2 imm)
   (if-let $true (has_avx))
   (xmm_rmr_imm_vex (AvxOpcode.Vpalignr) src1 src2 imm))

cranelift/codegen/src/isa/x64/lower.isle (26 changed lines)

@@ -894,10 +894,10 @@
                  (swiden_high (and (value_type (multi_lane 8 16))
                                    y)))))
   (let ((x1 Xmm x)
-        (x2 Xmm (x64_palignr x1 x1 8 (OperandSize.Size32)))
+        (x2 Xmm (x64_palignr x1 x1 8))
         (x3 Xmm (x64_pmovsxbw x2))
         (y1 Xmm y)
-        (y2 Xmm (x64_palignr y1 y1 8 (OperandSize.Size32)))
+        (y2 Xmm (x64_palignr y1 y1 8))
         (y3 Xmm (x64_pmovsxbw y2)))
     (x64_pmullw x3 y3)))
@@ -962,10 +962,10 @@
                  (uwiden_high (and (value_type (multi_lane 8 16))
                                    y)))))
   (let ((x1 Xmm x)
-        (x2 Xmm (x64_palignr x1 x1 8 (OperandSize.Size32)))
+        (x2 Xmm (x64_palignr x1 x1 8))
         (x3 Xmm (x64_pmovzxbw x2))
         (y1 Xmm y)
-        (y2 Xmm (x64_palignr y1 y1 8 (OperandSize.Size32)))
+        (y2 Xmm (x64_palignr y1 y1 8))
         (y3 Xmm (x64_pmovzxbw y2)))
     (x64_pmullw x3 y3)))
@@ -3284,11 +3284,11 @@
 (rule (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16))))
       (let ((x Xmm val))
-        (x64_pmovsxbw (x64_palignr x x 8 (OperandSize.Size32)))))
+        (x64_pmovsxbw (x64_palignr x x 8))))
 
 (rule (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8))))
       (let ((x Xmm val))
-        (x64_pmovsxwd (x64_palignr x x 8 (OperandSize.Size32)))))
+        (x64_pmovsxwd (x64_palignr x x 8))))
 
 (rule (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4))))
       (x64_pmovsxdq (x64_pshufd val 0xEE)))
@@ -3308,11 +3308,11 @@
 (rule (lower (has_type $I16X8 (uwiden_high val @ (value_type $I8X16))))
       (let ((x Xmm val))
-        (x64_pmovzxbw (x64_palignr x x 8 (OperandSize.Size32)))))
+        (x64_pmovzxbw (x64_palignr x x 8))))
 
 (rule (lower (has_type $I32X4 (uwiden_high val @ (value_type $I16X8))))
       (let ((x Xmm val))
-        (x64_pmovzxwd (x64_palignr x x 8 (OperandSize.Size32)))))
+        (x64_pmovzxwd (x64_palignr x x 8))))
 
 (rule (lower (has_type $I64X2 (uwiden_high val @ (value_type $I32X4))))
       (x64_pmovzxdq (x64_pshufd val 0xEE)))
@@ -3561,6 +3561,16 @@
 ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8
+;; bytes", that's a `palignr` instruction. Note that the order of operands are
+;; swapped in the instruction here. The `palignr` instruction uses the second
+;; operand as the low-order bytes and the first operand as high-order bytes,
+;; so put `a` second.
+(rule 13 (lower (shuffle a b (palignr_imm_from_immediate n)))
+      (x64_palignr b a n))
+
+(decl palignr_imm_from_immediate (u8) Immediate)
+(extern extractor palignr_imm_from_immediate palignr_imm_from_immediate)
+
 ;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit
 ;; integers within one value, preserving the other four 16-bit integers in that
 ;; value (either the high or low half). The complicated logic is in the
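
To see why `a` ends up as the second operand, here is a hedged model of the
byte-level semantics the rule relies on (hypothetical helper functions, not
Cranelift code): `palignr` concatenates its second operand as the low 16 bytes
and its first operand as the high 16 bytes, shifts the 32-byte value right by
the immediate, and keeps the low 16 bytes, so passing `b` first and `a` second
reproduces `shuffle a, b, [n, n+1, ..., n+15]`.

// Hypothetical model of `palignr first, second, imm`: the second operand
// supplies the low-order bytes and the first the high-order bytes of a
// 32-byte value that is shifted right by `imm` bytes.
fn palignr(first: [u8; 16], second: [u8; 16], imm: usize) -> [u8; 16] {
    let mut concat = [0u8; 32];
    concat[..16].copy_from_slice(&second); // low 16 bytes
    concat[16..].copy_from_slice(&first);  // high 16 bytes
    core::array::from_fn(|i| if i + imm < 32 { concat[i + imm] } else { 0 })
}

// CLIF `shuffle a, b, mask`: mask indices 0..16 pick bytes of `a`,
// indices 16..32 pick bytes of `b`.
fn shuffle(a: [u8; 16], b: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
    core::array::from_fn(|i| {
        let idx = mask[i] as usize;
        if idx < 16 { a[idx] } else { b[idx - 16] }
    })
}

fn main() {
    let a: [u8; 16] = core::array::from_fn(|i| i as u8);
    let b: [u8; 16] = core::array::from_fn(|i| 0x40 + i as u8);
    let n = 5u8;
    let mask: [u8; 16] = core::array::from_fn(|i| n + i as u8);
    // The operands are swapped: `b` goes first and `a` second, matching the
    // `(x64_palignr b a n)` in the rule above.
    assert_eq!(shuffle(a, b, mask), palignr(b, a, n as usize));
}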

cranelift/codegen/src/isa/x64/lower/isle.rs (10 changed lines)

@@ -1117,6 +1117,16 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
             None
         }
     }
 
+    fn palignr_imm_from_immediate(&mut self, imm: Immediate) -> Option<u8> {
+        let bytes = self.lower_ctx.get_immediate_data(imm).as_slice();
+
+        if bytes.windows(2).all(|a| a[0] + 1 == a[1]) {
+            Some(bytes[0])
+        } else {
+            None
+        }
+    }
 }
 
 impl IsleContext<'_, '_, MInst, X64Backend> {

cranelift/filetests/filetests/isa/x64/shuffle.clif (151 changed lines)

@@ -196,7 +196,7 @@ function %not_single_pshufd(i32x4, i32x4) -> i32x4 {
 block0(v0: i32x4, v1: i32x4):
   v2 = bitcast.i8x16 little v0
   v3 = bitcast.i8x16 little v1
-  v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
+  v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 20 21 22 23 20 21 22 23]
   v5 = bitcast.i32x4 little v4
   return v5
 }
@@ -205,7 +205,7 @@ block0(v0: i32x4, v1: i32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-;   shufps $78, %xmm0, %xmm1, %xmm0
+;   shufps $94, %xmm0, %xmm1, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   ret
@@ -215,7 +215,7 @@ block0(v0: i32x4, v1: i32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   shufps $0x4e, %xmm1, %xmm0
+;   shufps $0x5e, %xmm1, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
@@ -644,3 +644,148 @@ block0(v0: i8x16, v1: i8x16):
; popq %rbp
; retq
function %palignr_0(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $0, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %palignr_1(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $1, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $1, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %palignr_5(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $5, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $5, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %palignr_11(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $11, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $0xb, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %palignr_16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $16, %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm4
; movdqa %xmm1, %xmm0
; palignr $0x10, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif (25 changed lines)

@@ -1948,3 +1948,28 @@ block0(v0: i32x4):
; popq %rbp
; retq
function %palignr_11(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = shuffle v0, v1, [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpalignr $11, %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpalignr $0xb, %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

cranelift/filetests/filetests/runtests/simd-shuffle.clif (16 changed lines)

@@ -73,7 +73,7 @@ block0(v0: i32x4, v1: i32x4):
   v5 = bitcast.i32x4 little v4
   return v5
 }
-; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [4 2 3 1]
+; run: %pshufd_3120([1 2 3 4], [5 6 7 8]) == [4 2 3 1]
 
 function %pshufd_7546(i32x4, i32x4) -> i32x4 {
 block0(v0: i32x4, v1: i32x4):
@@ -83,7 +83,7 @@ block0(v0: i32x4, v1: i32x4):
   v5 = bitcast.i32x4 little v4
   return v5
 }
-; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [8 6 5 7]
+; run: %pshufd_7546([1 2 3 4], [5 6 7 8]) == [8 6 5 7]
 
 function %not_pshufd(i32x4, i32x4) -> i32x4 {
 block0(v0: i32x4, v1: i32x4):
@@ -93,7 +93,17 @@ block0(v0: i32x4, v1: i32x4):
   v5 = bitcast.i32x4 little v4
   return v5
 }
-; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [3 4 5 6]
+; run: %not_pshufd([1 2 3 4], [5 6 7 8]) == [3 4 5 6]
+
+function %not_pshufd2(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+  v2 = bitcast.i8x16 little v0
+  v3 = bitcast.i8x16 little v1
+  v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 20 21 22 23 20 21 22 23]
+  v5 = bitcast.i32x4 little v4
+  return v5
+}
+; run: %not_pshufd2([1 2 3 4], [5 6 7 8]) == [3 4 6 6]
 
 function %punpckldq(i32x4, i32x4) -> i32x4 {
 block0(v0: i32x4, v1: i32x4):
