diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index bcc63bfdbd..9b1b6ad8b2 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -3231,14 +3231,14 @@
            dst))
 
 ;; Helper for creating `palignr` instructions.
-(decl x64_palignr (Xmm XmmMem u8 OperandSize) Xmm)
-(rule 0 (x64_palignr src1 src2 imm size)
+(decl x64_palignr (Xmm XmmMem u8) Xmm)
+(rule 0 (x64_palignr src1 src2 imm)
   (xmm_rm_r_imm (SseOpcode.Palignr)
                 src1
                 src2
                 imm
-                size))
-(rule 1 (x64_palignr src1 src2 imm size)
+                (OperandSize.Size32)))
+(rule 1 (x64_palignr src1 src2 imm)
   (if-let $true (has_avx))
   (xmm_rmr_imm_vex (AvxOpcode.Vpalignr) src1 src2 imm))
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 5b4773f6cb..6a4f73620d 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -894,10 +894,10 @@
                     (swiden_high (and (value_type (multi_lane 8 16)) y)))))
   (let ((x1 Xmm x)
-        (x2 Xmm (x64_palignr x1 x1 8 (OperandSize.Size32)))
+        (x2 Xmm (x64_palignr x1 x1 8))
         (x3 Xmm (x64_pmovsxbw x2))
         (y1 Xmm y)
-        (y2 Xmm (x64_palignr y1 y1 8 (OperandSize.Size32)))
+        (y2 Xmm (x64_palignr y1 y1 8))
         (y3 Xmm (x64_pmovsxbw y2)))
     (x64_pmullw x3 y3)))
@@ -962,10 +962,10 @@
                     (uwiden_high (and (value_type (multi_lane 8 16)) y)))))
   (let ((x1 Xmm x)
-        (x2 Xmm (x64_palignr x1 x1 8 (OperandSize.Size32)))
+        (x2 Xmm (x64_palignr x1 x1 8))
         (x3 Xmm (x64_pmovzxbw x2))
         (y1 Xmm y)
-        (y2 Xmm (x64_palignr y1 y1 8 (OperandSize.Size32)))
+        (y2 Xmm (x64_palignr y1 y1 8))
         (y3 Xmm (x64_pmovzxbw y2)))
     (x64_pmullw x3 y3)))
@@ -3284,11 +3284,11 @@
 (rule (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16))))
   (let ((x Xmm val))
-    (x64_pmovsxbw (x64_palignr x x 8 (OperandSize.Size32)))))
+    (x64_pmovsxbw (x64_palignr x x 8))))
 
 (rule (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8))))
   (let ((x Xmm val))
-    (x64_pmovsxwd (x64_palignr x x 8 (OperandSize.Size32)))))
+    (x64_pmovsxwd (x64_palignr x x 8))))
 
 (rule (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4))))
   (x64_pmovsxdq (x64_pshufd val 0xEE)))
@@ -3308,11 +3308,11 @@
 (rule (lower (has_type $I16X8 (uwiden_high val @ (value_type $I8X16))))
   (let ((x Xmm val))
-    (x64_pmovzxbw (x64_palignr x x 8 (OperandSize.Size32)))))
+    (x64_pmovzxbw (x64_palignr x x 8))))
 
 (rule (lower (has_type $I32X4 (uwiden_high val @ (value_type $I16X8))))
   (let ((x Xmm val))
-    (x64_pmovzxwd (x64_palignr x x 8 (OperandSize.Size32)))))
+    (x64_pmovzxwd (x64_palignr x x 8))))
 
 (rule (lower (has_type $I64X2 (uwiden_high val @ (value_type $I32X4))))
   (x64_pmovzxdq (x64_pshufd val 0xEE)))
@@ -3561,6 +3561,16 @@
 
 ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8
+;; bits", that's a `palignr` instruction. Note that the order of operands is
+;; swapped in the instruction here. The `palignr` instruction uses the second
+;; operand as the low-order bytes and the first operand as high-order bytes,
+;; so put `a` second.
+(rule 13 (lower (shuffle a b (palignr_imm_from_immediate n)))
+  (x64_palignr b a n))
+(decl palignr_imm_from_immediate (u8) Immediate)
+(extern extractor palignr_imm_from_immediate palignr_imm_from_immediate)
+
 ;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit
 ;; integers within one value, preserving the other four 16-bit integers in that
 ;; value (either the high or low half).
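
To make the new rule concrete, here is a minimal scalar sketch (illustration only, not part of the patch; `shuffle_as_palignr` is a made-up name) of what the matched shuffle computes and why the operands end up swapped:

```rust
// For a mask [n, n+1, ..., n+15], `shuffle a, b` selects bytes n..n+16 of the
// 32-byte concatenation, where `a` supplies lane indices 0..16 and `b`
// supplies 16..32. `palignr` computes the same thing but takes its *second*
// operand as the low half, which is why the rule above emits
// `(x64_palignr b a n)`.
fn shuffle_as_palignr(a: [u8; 16], b: [u8; 16], n: usize) -> [u8; 16] {
    assert!(n <= 16);
    let concat: Vec<u8> = a.iter().chain(b.iter()).copied().collect();
    let mut out = [0u8; 16];
    out.copy_from_slice(&concat[n..n + 16]);
    out
}

fn main() {
    let a: [u8; 16] = core::array::from_fn(|i| i as u8); // lanes 0..16
    let b: [u8; 16] = core::array::from_fn(|i| 16 + i as u8); // lanes 16..32
    // Mirrors the `%palignr_11` filetest below: mask [11, 12, ..., 26].
    assert_eq!(shuffle_as_palignr(a, b, 11)[0], 11);
    assert_eq!(shuffle_as_palignr(a, b, 11)[15], 26);
}
```
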
The complicated logic is in the `palignr_imm_from_immediate` extractor, implemented in `cranelift/codegen/src/isa/x64/lower/isle.rs`:

diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index 815e40e351..bff7c42807 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -1117,6 +1117,16 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
             None
         }
     }
+
+    fn palignr_imm_from_immediate(&mut self, imm: Immediate) -> Option<u8> {
+        let bytes = self.lower_ctx.get_immediate_data(imm).as_slice();
+
+        if bytes.windows(2).all(|a| a[0] + 1 == a[1]) {
+            Some(bytes[0])
+        } else {
+            None
+        }
+    }
 }
 
 impl IsleContext<'_, '_, MInst, X64Backend> {
diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif
index ce1d1b4842..f8f9b613e0 100644
--- a/cranelift/filetests/filetests/isa/x64/shuffle.clif
+++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif
@@ -196,7 +196,7 @@ function %not_single_pshufd(i32x4, i32x4) -> i32x4 {
 block0(v0: i32x4, v1: i32x4):
     v2 = bitcast.i8x16 little v0
     v3 = bitcast.i8x16 little v1
-    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
+    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 20 21 22 23 20 21 22 23]
     v5 = bitcast.i32x4 little v4
     return v5
 }
@@ -205,7 +205,7 @@ block0(v0: i32x4, v1: i32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; shufps $78, %xmm0, %xmm1, %xmm0
+; shufps $94, %xmm0, %xmm1, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -215,7 +215,7 @@ block0(v0: i32x4, v1: i32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; shufps $0x4e, %xmm1, %xmm0
+; shufps $0x5e, %xmm1, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -644,3 +644,148 @@
 ; popq %rbp
 ; retq
 
+function %palignr_0(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm4
+; movdqa %xmm1, %xmm0
+; palignr $0, %xmm0, %xmm4, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movdqa %xmm0, %xmm4
+; movdqa %xmm1, %xmm0
+; palignr $0, %xmm4, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %palignr_1(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm4
+; movdqa %xmm1, %xmm0
+; palignr $1, %xmm0, %xmm4, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movdqa %xmm0, %xmm4
+; movdqa %xmm1, %xmm0
+; palignr $1, %xmm4, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %palignr_5(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm4
+; movdqa %xmm1, %xmm0
+; palignr $5, %xmm0, %xmm4, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movdqa %xmm0, %xmm4
+; movdqa %xmm1, %xmm0
+; palignr $5, %xmm4, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %palignr_11(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm4
+; movdqa %xmm1, %xmm0
+; palignr $11, %xmm0, %xmm4, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movdqa %xmm0, %xmm4
+; movdqa %xmm1, %xmm0
+; palignr $0xb, %xmm4, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %palignr_16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm4
+; movdqa %xmm1, %xmm0
+; palignr $16, %xmm0, %xmm4, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movdqa %xmm0, %xmm4
+; movdqa %xmm1, %xmm0
+; palignr $0x10, %xmm4, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
diff --git a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif
index ac04216046..c33650d1a8 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif
@@ -1948,3 +1948,28 @@ block0(v0: i32x4):
 ; popq %rbp
 ; retq
 
+function %palignr_11(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; vpalignr $11, %xmm1, %xmm0, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; vpalignr $0xb, %xmm0, %xmm1, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif
index f236a02055..2ef1671e22 100644
--- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif
+++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif
@@ -73,7 +73,7 @@ block0(v0: i32x4, v1: i32x4):
     v5 = bitcast.i32x4 little v4
     return v5
 }
-; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [4 2 3 1]
+; run: %pshufd_3120([1 2 3 4], [5 6 7 8]) == [4 2 3 1]
 
 function %pshufd_7546(i32x4, i32x4) -> i32x4 {
 block0(v0: i32x4, v1: i32x4):
@@ -83,7 +83,7 @@ block0(v0: i32x4, v1: i32x4):
     v5 = bitcast.i32x4 little v4
     return v5
 }
-; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [8 6 5 7]
+; run: %pshufd_7546([1 2 3 4], [5 6 7 8]) == [8 6 5 7]
 
 function %not_pshufd(i32x4, i32x4) -> i32x4 {
 block0(v0: i32x4, v1: i32x4):
@@ -93,7 +93,17 @@ block0(v0: i32x4, v1: i32x4):
     v5 = bitcast.i32x4 little v4
     return v5
 }
-; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [3 4 5 6]
+; run: %not_pshufd([1 2 3 4], [5 6 7 8]) == [3 4 5 6]
+
+function %not_pshufd2(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 20 21 22 23 20 21 22 23]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+; run: %not_pshufd2([1 2 3 4], [5 6 7 8]) == [3 4 6 6]
 
 function %punpckldq(i32x4, i32x4) -> i32x4 {
 block0(v0: i32x4, v1: i32x4):
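
As a cross-check on which of these shuffle masks actually hit the new rule, here is a small sketch (illustration only; `qualifies_for_palignr` is a made-up name mirroring the extractor above):

```rust
// A mask qualifies for `palignr` when its sixteen lane indices are
// consecutive; the `palignr` immediate is then the first index.
fn qualifies_for_palignr(mask: [u8; 16]) -> Option<u8> {
    if mask.windows(2).all(|w| w[0] + 1 == w[1]) {
        Some(mask[0])
    } else {
        None
    }
}

fn main() {
    // `%palignr_11` above: a consecutive run starting at 11 -> `palignr $11`.
    let palignr_11: [u8; 16] = core::array::from_fn(|i| 11 + i as u8);
    assert_eq!(qualifies_for_palignr(palignr_11), Some(11));

    // `%not_pshufd2` above repeats indices 20..24, so it is not consecutive
    // and falls through to the other shuffle lowerings.
    let not_pshufd2 = [8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 20, 21, 22, 23];
    assert_eq!(qualifies_for_palignr(not_pshufd2), None);
}
```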