From c5ddb4b803129acb6d8c1bdfe564c8fe0dd3e53b Mon Sep 17 00:00:00 2001
From: Sam Parker
Date: Mon, 25 Jul 2022 20:40:36 +0100
Subject: [PATCH] [AArch64] Port SIMD narrowing to ISLE (#4478)

* [AArch64] Port SIMD narrowing to ISLE

Fvdemote, snarrow, unarrow and uunarrow. Also refactor the aarch64
instruction descriptions to parameterize on ScalarSize instead of
using different opcodes. The zero_value pure constructor has been
introduced and used by the integer narrow operations; it replaces,
and extends, the compare zero patterns.

Copyright (c) 2022, Arm Limited.

* use short 'if' patterns
---
 cranelift/codegen/src/isa/aarch64/inst.isle   | 136 +++++----
 .../codegen/src/isa/aarch64/inst/args.rs      |  10 +
 .../codegen/src/isa/aarch64/inst/emit.rs      |  50 ++--
 .../src/isa/aarch64/inst/emit_tests.rs        |  81 +++++-
 cranelift/codegen/src/isa/aarch64/inst/mod.rs | 102 ++-----
 cranelift/codegen/src/isa/aarch64/lower.isle  | 101 ++++---
 .../codegen/src/isa/aarch64/lower/isle.rs     |  21 --
 .../src/isa/aarch64/lower_dynamic_neon.isle   |  42 +++
 .../codegen/src/isa/aarch64/lower_inst.rs     |  66 +----
 cranelift/codegen/src/machinst/isle.rs        |  80 ++++++
 cranelift/codegen/src/prelude.isle            |  19 ++
 .../filetests/isa/aarch64/compare_zero.clif   | 267 +++++++++++++++++
 .../isa/aarch64/dynamic-simd-narrow.clif      | 204 ++++++++++---
 .../filetests/isa/aarch64/simd-narrow.clif    | 268 ++++++++++++++++++
 .../runtests/dynamic-simd-narrow-widen.clif   | 230 +++++++++++++++
 15 files changed, 1340 insertions(+), 337 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/aarch64/simd-narrow.clif
 create mode 100644 cranelift/filetests/filetests/runtests/dynamic-simd-narrow-widen.clif

diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index dc5a16fd4a..95d6c5ae44 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -531,7 +531,8 @@
     (op VecRRNarrowOp)
     (rd WritableReg)
     (rn Reg)
-    (high_half bool))
+    (high_half bool)
+    (lane_size ScalarSize))

   ;; 1-operand vector instruction that operates on a pair of elements.
   (VecRRPair
@@ -905,6 +906,17 @@
 (rule (scalar_size $F32) (ScalarSize.Size32))
 (rule (scalar_size $F64) (ScalarSize.Size64))

+;; Helper for calculating the `ScalarSize` lane type from vector type
+(decl lane_size (Type) ScalarSize)
+(rule (lane_size (multi_lane 8 _)) (ScalarSize.Size8))
+(rule (lane_size (multi_lane 16 _)) (ScalarSize.Size16))
+(rule (lane_size (multi_lane 32 _)) (ScalarSize.Size32))
+(rule (lane_size (multi_lane 64 _)) (ScalarSize.Size64))
+(rule (lane_size (dynamic_lane 8 _)) (ScalarSize.Size8))
+(rule (lane_size (dynamic_lane 16 _)) (ScalarSize.Size16))
+(rule (lane_size (dynamic_lane 32 _)) (ScalarSize.Size32))
+(rule (lane_size (dynamic_lane 64 _)) (ScalarSize.Size64))
+
 (type Cond extern
   (enum
     (Eq)
@@ -936,17 +948,6 @@
     (Size64x2)
 ))

-(type DynamicVectorSize extern
-  (enum
-    (Size8x8xN)
-    (Size8x16xN)
-    (Size16x4xN)
-    (Size16x8xN)
-    (Size32x2xN)
-    (Size32x4xN)
-    (Size64x2xN)
-))
-
 ;; Helper for calculating the `VectorSize` corresponding to a type
 (decl vector_size (Type) VectorSize)
 (rule (vector_size (multi_lane 8 8)) (VectorSize.Size8x8))
@@ -1203,34 +1204,16 @@
 ;; A vector narrowing operation with one argument.
(type VecRRNarrowOp (enum - ;; Extract narrow, 16-bit elements - (Xtn16) - ;; Extract narrow, 32-bit elements - (Xtn32) - ;; Extract narrow, 64-bit elements - (Xtn64) - ;; Signed saturating extract narrow, 16-bit elements - (Sqxtn16) - ;; Signed saturating extract narrow, 32-bit elements - (Sqxtn32) - ;; Signed saturating extract narrow, 64-bit elements - (Sqxtn64) - ;; Signed saturating extract unsigned narrow, 16-bit elements - (Sqxtun16) - ;; Signed saturating extract unsigned narrow, 32-bit elements - (Sqxtun32) - ;; Signed saturating extract unsigned narrow, 64-bit elements - (Sqxtun64) - ;; Unsigned saturating extract narrow, 16-bit elements - (Uqxtn16) - ;; Unsigned saturating extract narrow, 32-bit elements - (Uqxtn32) - ;; Unsigned saturating extract narrow, 64-bit elements - (Uqxtn64) - ;; Floating-point convert to lower precision narrow, 32-bit elements - (Fcvtn32) - ;; Floating-point convert to lower precision narrow, 64-bit elements - (Fcvtn64) + ;; Extract narrow. + (Xtn) + ;; Signed saturating extract narrow. + (Sqxtn) + ;; Signed saturating extract unsigned narrow. + (Sqxtun) + ;; Unsigned saturating extract narrow. + (Uqxtn) + ;; Floating-point convert to lower precision narrow. + (Fcvtn) )) (type VecRRRLongOp @@ -1623,10 +1606,19 @@ dst)) ;; Helper for emitting `MInst.VecRRNarrow` instructions. -(decl vec_rr_narrow (VecRRNarrowOp Reg bool) Reg) -(rule (vec_rr_narrow op src high_half) +(decl vec_rr_narrow (VecRRNarrowOp Reg ScalarSize) Reg) +(rule (vec_rr_narrow op src size) + (let ((dst WritableReg (temp_writable_reg $I8X16)) + (_ Unit (emit (MInst.VecRRNarrow op dst src $false size)))) + dst)) + +;; Helper for emitting `MInst.VecRRNarrow` instructions which update the +;; high half of the destination register. +(decl vec_rr_narrow_high (VecRRNarrowOp Reg Reg ScalarSize) Reg) +(rule (vec_rr_narrow_high op mod src size) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecRRNarrow op dst src high_half)))) + (_1 Unit (emit (MInst.FpuMove128 dst mod))) + (_2 Unit (emit (MInst.VecRRNarrow op dst src $true size)))) dst)) ;; Helper for emitting `MInst.VecRRLong` instructions. @@ -1673,6 +1665,14 @@ (_2 Unit (emit (MInst.MovToVec dst src2 lane size)))) dst)) +;; Helper for emitting `MInst.VecMovElement` instructions. +(decl mov_vec_elem (Reg Reg u8 u8 VectorSize) Reg) +(rule (mov_vec_elem src1 src2 dst_idx src_idx size) + (let ((dst WritableReg (temp_writable_reg $I8X16)) + (_1 Unit (emit (MInst.FpuMove128 dst src1))) + (_2 Unit (emit (MInst.VecMovElement dst src2 dst_idx src_idx size)))) + dst)) + ;; Helper for emitting `MInst.MovFromVec` instructions. (decl mov_from_vec (Reg u8 VectorSize) Reg) (rule (mov_from_vec rn idx size) @@ -1830,9 +1830,37 @@ (decl rev64 (Reg VectorSize) Reg) (rule (rev64 x size) (vec_misc (VecMisc2.Rev64) x size)) -;; Helper for generating `xtn64` instructions. -(decl xtn64 (Reg bool) Reg) -(rule (xtn64 x high_half) (vec_rr_narrow (VecRRNarrowOp.Xtn64) x high_half)) +;; Helper for generating `xtn` instructions. +(decl xtn (Reg ScalarSize) Reg) +(rule (xtn x size) (vec_rr_narrow (VecRRNarrowOp.Xtn) x size)) + +;; Helper for generating `fcvtn` instructions. +(decl fcvtn (Reg ScalarSize) Reg) +(rule (fcvtn x size) (vec_rr_narrow (VecRRNarrowOp.Fcvtn) x size)) + +;; Helper for generating `sqxtn` instructions. +(decl sqxtn (Reg ScalarSize) Reg) +(rule (sqxtn x size) (vec_rr_narrow (VecRRNarrowOp.Sqxtn) x size)) + +;; Helper for generating `sqxtn2` instructions. 
+(decl sqxtn2 (Reg Reg ScalarSize) Reg) +(rule (sqxtn2 x y size) (vec_rr_narrow_high (VecRRNarrowOp.Sqxtn) x y size)) + +;; Helper for generating `sqxtun` instructions. +(decl sqxtun (Reg ScalarSize) Reg) +(rule (sqxtun x size) (vec_rr_narrow (VecRRNarrowOp.Sqxtun) x size)) + +;; Helper for generating `sqxtun2` instructions. +(decl sqxtun2 (Reg Reg ScalarSize) Reg) +(rule (sqxtun2 x y size) (vec_rr_narrow_high (VecRRNarrowOp.Sqxtun) x y size)) + +;; Helper for generating `uqxtn` instructions. +(decl uqxtn (Reg ScalarSize) Reg) +(rule (uqxtn x size) (vec_rr_narrow (VecRRNarrowOp.Uqxtn) x size)) + +;; Helper for generating `uqxtn2` instructions. +(decl uqxtn2 (Reg Reg ScalarSize) Reg) +(rule (uqxtn2 x y size) (vec_rr_narrow_high (VecRRNarrowOp.Uqxtn) x y size)) ;; Helper for generating `addp` instructions. (decl addp (Reg Reg VectorSize) Reg) @@ -2202,16 +2230,6 @@ (alu_rrr op ty x_lo y_lo) (alu_rrr op ty x_hi y_hi)))) -;; Float vector compare helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Match 32 bit float 0 value -(decl zero_value_f32 (Ieee32) Ieee32) -(extern extractor zero_value_f32 zero_value_f32) - -;; Match 64 bit float 0 value -(decl zero_value_f64 (Ieee64) Ieee64) -(extern extractor zero_value_f64 zero_value_f64) - ;; Generate comparison to zero operator from input condition code (decl float_cc_cmp_zero_to_vec_misc_op (FloatCC) VecMisc2) (extern constructor float_cc_cmp_zero_to_vec_misc_op float_cc_cmp_zero_to_vec_misc_op) @@ -2242,12 +2260,6 @@ (rule (fcmeq0 rn size) (vec_misc (VecMisc2.Fcmeq0) rn size)) -;; Int vector compare helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Match integer 0 value -(decl zero_value (Imm64) Imm64) -(extern extractor zero_value zero_value) - ;; Generate comparison to zero operator from input condition code (decl int_cc_cmp_zero_to_vec_misc_op (IntCC) VecMisc2) (extern constructor int_cc_cmp_zero_to_vec_misc_op int_cc_cmp_zero_to_vec_misc_op) diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index ef9abd42ec..ee5e3774ae 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -643,6 +643,16 @@ impl ScalarSize { _ => panic!("Unexpected scalar FP operand size: {:?}", self), } } + + pub fn widen(&self) -> ScalarSize { + match self { + ScalarSize::Size8 => ScalarSize::Size16, + ScalarSize::Size16 => ScalarSize::Size32, + ScalarSize::Size32 => ScalarSize::Size64, + ScalarSize::Size64 => ScalarSize::Size128, + ScalarSize::Size128 => panic!("can't widen 128-bits"), + } + } } /// Type used to communicate the size of a vector operand. 
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index ee63225e09..9fbbee1849 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -2252,15 +2252,17 @@ impl MachInstEmit for Inst { &Inst::VecDup { rd, rn, size } => { let rd = allocs.next_writable(rd); let rn = allocs.next(rn); - let imm5 = match size { - VectorSize::Size8x16 => 0b00001, - VectorSize::Size16x8 => 0b00010, - VectorSize::Size32x4 => 0b00100, - VectorSize::Size64x2 => 0b01000, + let q = size.is_128bits() as u32; + let imm5 = match size.lane_size() { + ScalarSize::Size8 => 0b00001, + ScalarSize::Size16 => 0b00010, + ScalarSize::Size32 => 0b00100, + ScalarSize::Size64 => 0b01000, _ => unimplemented!("Unexpected VectorSize: {:?}", size), }; sink.put4( - 0b010_01110000_00000_000011_00000_00000 + 0b000_01110000_00000_000011_00000_00000 + | (q << 30) | (imm5 << 16) | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()), @@ -2395,24 +2397,30 @@ impl MachInstEmit for Inst { rd, rn, high_half, + lane_size, } => { let rn = allocs.next(rn); let rd = allocs.next_writable(rd); - let (u, size, bits_12_16) = match op { - VecRRNarrowOp::Xtn16 => (0b0, 0b00, 0b10010), - VecRRNarrowOp::Xtn32 => (0b0, 0b01, 0b10010), - VecRRNarrowOp::Xtn64 => (0b0, 0b10, 0b10010), - VecRRNarrowOp::Sqxtn16 => (0b0, 0b00, 0b10100), - VecRRNarrowOp::Sqxtn32 => (0b0, 0b01, 0b10100), - VecRRNarrowOp::Sqxtn64 => (0b0, 0b10, 0b10100), - VecRRNarrowOp::Sqxtun16 => (0b1, 0b00, 0b10010), - VecRRNarrowOp::Sqxtun32 => (0b1, 0b01, 0b10010), - VecRRNarrowOp::Sqxtun64 => (0b1, 0b10, 0b10010), - VecRRNarrowOp::Uqxtn16 => (0b1, 0b00, 0b10100), - VecRRNarrowOp::Uqxtn32 => (0b1, 0b01, 0b10100), - VecRRNarrowOp::Uqxtn64 => (0b1, 0b10, 0b10100), - VecRRNarrowOp::Fcvtn32 => (0b0, 0b00, 0b10110), - VecRRNarrowOp::Fcvtn64 => (0b0, 0b01, 0b10110), + + let size = match lane_size { + ScalarSize::Size8 => 0b00, + ScalarSize::Size16 => 0b01, + ScalarSize::Size32 => 0b10, + _ => panic!("unsupported size: {:?}", lane_size), + }; + + // Floats use a single bit, to encode either half or single. 
+ let size = match op { + VecRRNarrowOp::Fcvtn => size >> 1, + _ => size, + }; + + let (u, bits_12_16) = match op { + VecRRNarrowOp::Xtn => (0b0, 0b10010), + VecRRNarrowOp::Sqxtn => (0b0, 0b10100), + VecRRNarrowOp::Sqxtun => (0b1, 0b10010), + VecRRNarrowOp::Uqxtn => (0b1, 0b10100), + VecRRNarrowOp::Fcvtn => (0b0, 0b10110), }; sink.put4(enc_vec_rr_misc( diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index f9e25b47bd..4a722d470e 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -2338,6 +2338,15 @@ fn test_aarch64_binemit() { "1B423BD5", "mrs x27, nzcv", )); + insns.push(( + Inst::VecDup { + rd: writable_vreg(24), + rn: xreg(8), + size: VectorSize::Size8x8, + }, + "180D010E", + "dup v24.8b, w8", + )); insns.push(( Inst::VecDup { rd: writable_vreg(25), @@ -2347,6 +2356,15 @@ fn test_aarch64_binemit() { "F90C014E", "dup v25.16b, w7", )); + insns.push(( + Inst::VecDup { + rd: writable_vreg(1), + rn: xreg(22), + size: VectorSize::Size16x4, + }, + "C10E020E", + "dup v1.4h, w22", + )); insns.push(( Inst::VecDup { rd: writable_vreg(2), @@ -2356,6 +2374,15 @@ fn test_aarch64_binemit() { "E20E024E", "dup v2.8h, w23", )); + insns.push(( + Inst::VecDup { + rd: writable_vreg(30), + rn: xreg(28), + size: VectorSize::Size32x2, + }, + "9E0F040E", + "dup v30.2s, w28", + )); insns.push(( Inst::VecDup { rd: writable_vreg(0), @@ -2652,10 +2679,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Xtn16, + op: VecRRNarrowOp::Xtn, rd: writable_vreg(25), rn: vreg(17), high_half: false, + lane_size: ScalarSize::Size8, }, "392A210E", "xtn v25.8b, v17.8h", @@ -2663,10 +2691,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Xtn32, + op: VecRRNarrowOp::Xtn, rd: writable_vreg(3), rn: vreg(10), high_half: true, + lane_size: ScalarSize::Size16, }, "4329614E", "xtn2 v3.8h, v10.4s", @@ -2674,10 +2703,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Xtn64, + op: VecRRNarrowOp::Xtn, rd: writable_vreg(22), rn: vreg(8), high_half: false, + lane_size: ScalarSize::Size32, }, "1629A10E", "xtn v22.2s, v8.2d", @@ -2685,10 +2715,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Sqxtn16, + op: VecRRNarrowOp::Sqxtn, rd: writable_vreg(7), rn: vreg(22), high_half: true, + lane_size: ScalarSize::Size8, }, "C74A214E", "sqxtn2 v7.16b, v22.8h", @@ -2696,10 +2727,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Sqxtn32, + op: VecRRNarrowOp::Sqxtn, rd: writable_vreg(31), rn: vreg(0), high_half: true, + lane_size: ScalarSize::Size16, }, "1F48614E", "sqxtn2 v31.8h, v0.4s", @@ -2707,10 +2739,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Sqxtn64, + op: VecRRNarrowOp::Sqxtn, rd: writable_vreg(14), rn: vreg(20), high_half: false, + lane_size: ScalarSize::Size32, }, "8E4AA10E", "sqxtn v14.2s, v20.2d", @@ -2718,10 +2751,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Sqxtun16, + op: VecRRNarrowOp::Sqxtun, rd: writable_vreg(16), rn: vreg(23), high_half: false, + lane_size: ScalarSize::Size8, }, "F02A212E", "sqxtun v16.8b, v23.8h", @@ -2729,10 +2763,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Sqxtun32, + op: VecRRNarrowOp::Sqxtun, rd: writable_vreg(28), rn: vreg(9), high_half: true, + lane_size: 
ScalarSize::Size16, }, "3C29616E", "sqxtun2 v28.8h, v9.4s", @@ -2740,10 +2775,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Sqxtun64, + op: VecRRNarrowOp::Sqxtun, rd: writable_vreg(15), rn: vreg(15), high_half: false, + lane_size: ScalarSize::Size32, }, "EF29A12E", "sqxtun v15.2s, v15.2d", @@ -2751,10 +2787,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Uqxtn16, + op: VecRRNarrowOp::Uqxtn, rd: writable_vreg(21), rn: vreg(4), high_half: true, + lane_size: ScalarSize::Size8, }, "9548216E", "uqxtn2 v21.16b, v4.8h", @@ -2762,10 +2799,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Uqxtn32, + op: VecRRNarrowOp::Uqxtn, rd: writable_vreg(31), rn: vreg(31), high_half: false, + lane_size: ScalarSize::Size16, }, "FF4B612E", "uqxtn v31.4h, v31.4s", @@ -2773,10 +2811,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Uqxtn64, + op: VecRRNarrowOp::Uqxtn, rd: writable_vreg(11), rn: vreg(12), high_half: true, + lane_size: ScalarSize::Size32, }, "8B49A16E", "uqxtn2 v11.4s, v12.2d", @@ -2784,10 +2823,11 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Fcvtn32, + op: VecRRNarrowOp::Fcvtn, rd: writable_vreg(0), rn: vreg(0), high_half: false, + lane_size: ScalarSize::Size16, }, "0068210E", "fcvtn v0.4h, v0.4s", @@ -2795,10 +2835,23 @@ fn test_aarch64_binemit() { insns.push(( Inst::VecRRNarrow { - op: VecRRNarrowOp::Fcvtn64, + op: VecRRNarrowOp::Fcvtn, + rd: writable_vreg(2), + rn: vreg(7), + high_half: false, + lane_size: ScalarSize::Size32, + }, + "E268610E", + "fcvtn v2.2s, v7.2d", + )); + + insns.push(( + Inst::VecRRNarrow { + op: VecRRNarrowOp::Fcvtn, rd: writable_vreg(31), rn: vreg(30), high_half: true, + lane_size: ScalarSize::Size32, }, "DF6B614E", "fcvtn2 v31.4s, v30.2d", diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 42b2959b9b..b708d6df05 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -2124,94 +2124,24 @@ impl Inst { rd, rn, high_half, + lane_size, } => { - let (op, rd_size, size) = match (op, high_half) { - (VecRRNarrowOp::Xtn16, false) => { - ("xtn", VectorSize::Size8x8, VectorSize::Size16x8) - } - (VecRRNarrowOp::Xtn16, true) => { - ("xtn2", VectorSize::Size8x16, VectorSize::Size16x8) - } - (VecRRNarrowOp::Xtn32, false) => { - ("xtn", VectorSize::Size16x4, VectorSize::Size32x4) - } - (VecRRNarrowOp::Xtn32, true) => { - ("xtn2", VectorSize::Size16x8, VectorSize::Size32x4) - } - (VecRRNarrowOp::Xtn64, false) => { - ("xtn", VectorSize::Size32x2, VectorSize::Size64x2) - } - (VecRRNarrowOp::Xtn64, true) => { - ("xtn2", VectorSize::Size32x4, VectorSize::Size64x2) - } - (VecRRNarrowOp::Sqxtn16, false) => { - ("sqxtn", VectorSize::Size8x8, VectorSize::Size16x8) - } - (VecRRNarrowOp::Sqxtn16, true) => { - ("sqxtn2", VectorSize::Size8x16, VectorSize::Size16x8) - } - (VecRRNarrowOp::Sqxtn32, false) => { - ("sqxtn", VectorSize::Size16x4, VectorSize::Size32x4) - } - (VecRRNarrowOp::Sqxtn32, true) => { - ("sqxtn2", VectorSize::Size16x8, VectorSize::Size32x4) - } - (VecRRNarrowOp::Sqxtn64, false) => { - ("sqxtn", VectorSize::Size32x2, VectorSize::Size64x2) - } - (VecRRNarrowOp::Sqxtn64, true) => { - ("sqxtn2", VectorSize::Size32x4, VectorSize::Size64x2) - } - (VecRRNarrowOp::Sqxtun16, false) => { - ("sqxtun", VectorSize::Size8x8, VectorSize::Size16x8) - } - (VecRRNarrowOp::Sqxtun16, true) 
=> { - ("sqxtun2", VectorSize::Size8x16, VectorSize::Size16x8) - } - (VecRRNarrowOp::Sqxtun32, false) => { - ("sqxtun", VectorSize::Size16x4, VectorSize::Size32x4) - } - (VecRRNarrowOp::Sqxtun32, true) => { - ("sqxtun2", VectorSize::Size16x8, VectorSize::Size32x4) - } - (VecRRNarrowOp::Sqxtun64, false) => { - ("sqxtun", VectorSize::Size32x2, VectorSize::Size64x2) - } - (VecRRNarrowOp::Sqxtun64, true) => { - ("sqxtun2", VectorSize::Size32x4, VectorSize::Size64x2) - } - (VecRRNarrowOp::Uqxtn16, false) => { - ("uqxtn", VectorSize::Size8x8, VectorSize::Size16x8) - } - (VecRRNarrowOp::Uqxtn16, true) => { - ("uqxtn2", VectorSize::Size8x16, VectorSize::Size16x8) - } - (VecRRNarrowOp::Uqxtn32, false) => { - ("uqxtn", VectorSize::Size16x4, VectorSize::Size32x4) - } - (VecRRNarrowOp::Uqxtn32, true) => { - ("uqxtn2", VectorSize::Size16x8, VectorSize::Size32x4) - } - (VecRRNarrowOp::Uqxtn64, false) => { - ("uqxtn", VectorSize::Size32x2, VectorSize::Size64x2) - } - (VecRRNarrowOp::Uqxtn64, true) => { - ("uqxtn2", VectorSize::Size32x4, VectorSize::Size64x2) - } - (VecRRNarrowOp::Fcvtn32, false) => { - ("fcvtn", VectorSize::Size16x4, VectorSize::Size32x4) - } - (VecRRNarrowOp::Fcvtn32, true) => { - ("fcvtn2", VectorSize::Size16x8, VectorSize::Size32x4) - } - (VecRRNarrowOp::Fcvtn64, false) => { - ("fcvtn", VectorSize::Size32x2, VectorSize::Size64x2) - } - (VecRRNarrowOp::Fcvtn64, true) => { - ("fcvtn2", VectorSize::Size32x4, VectorSize::Size64x2) - } + let vec64 = VectorSize::from_lane_size(lane_size, false); + let vec128 = VectorSize::from_lane_size(lane_size, true); + let rn_size = VectorSize::from_lane_size(lane_size.widen(), true); + let (op, rd_size) = match (op, high_half) { + (VecRRNarrowOp::Xtn, false) => ("xtn", vec64), + (VecRRNarrowOp::Xtn, true) => ("xtn2", vec128), + (VecRRNarrowOp::Sqxtn, false) => ("sqxtn", vec64), + (VecRRNarrowOp::Sqxtn, true) => ("sqxtn2", vec128), + (VecRRNarrowOp::Sqxtun, false) => ("sqxtun", vec64), + (VecRRNarrowOp::Sqxtun, true) => ("sqxtun2", vec128), + (VecRRNarrowOp::Uqxtn, false) => ("uqxtn", vec64), + (VecRRNarrowOp::Uqxtn, true) => ("uqxtn2", vec128), + (VecRRNarrowOp::Fcvtn, false) => ("fcvtn", vec64), + (VecRRNarrowOp::Fcvtn, true) => ("fcvtn2", vec128), }; - let rn = pretty_print_vreg_vector(rn, size, allocs); + let rn = pretty_print_vreg_vector(rn, rn_size, allocs); let rd = pretty_print_vreg_vector(rd.to_reg(), rd_size, allocs); format!("{} {}, {}", op, rd, rn) diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index ab8fc79052..51034bd3f9 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -475,7 +475,7 @@ ;; Extract the low half components of rn. ;; tmp1 = |c|a| - (tmp1 Reg (xtn64 rn $false)) + (tmp1 Reg (xtn rn (ScalarSize.Size32))) ;; Sum the respective high half components. ;; rd = |dg+ch|be+af||dg+ch|be+af| @@ -483,7 +483,7 @@ ;; Extract the low half components of rm. ;; tmp2 = |g|e| - (tmp2 Reg (xtn64 rm $false)) + (tmp2 Reg (xtn rm (ScalarSize.Size32))) ;; Shift the high half components, into the high half. 
;; rd = |dg+ch << 32|be+af << 32| @@ -1450,68 +1450,55 @@ (value_regs_get src 0)) -;;;; Rules for `fcmp` 32 bit ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) x (splat (f32const (zero_value_f32 y)))))) +(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) x y))) + (if (zero_value y)) (let ((rn Reg x) (vec_size VectorSize (vector_size ty))) (value_reg (not (fcmeq0 rn vec_size) vec_size)))) -(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) x (splat (f32const (zero_value_f32 y)))))) +(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) x y))) + (if (zero_value y)) (let ((rn Reg x) (vec_size VectorSize (vector_size ty))) (value_reg (float_cmp_zero cond rn vec_size)))) -(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) (splat (f32const (zero_value_f32 x))) y))) +(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) x y))) + (if (zero_value x)) (let ((rn Reg y) (vec_size VectorSize (vector_size ty))) (value_reg (not (fcmeq0 rn vec_size) vec_size)))) -(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) (splat (f32const (zero_value_f32 x))) y))) +(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) x y))) + (if (zero_value x)) (let ((rn Reg y) (vec_size VectorSize (vector_size ty))) (value_reg (float_cmp_zero_swap cond rn vec_size)))) -;;;; Rules for `fcmp` 64 bit ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) x (splat (f64const (zero_value_f64 y)))))) - (let ((rn Reg x) - (vec_size VectorSize (vector_size ty))) - (value_reg (not (fcmeq0 rn vec_size) vec_size)))) - -(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) x (splat (f64const (zero_value_f64 y)))))) - (let ((rn Reg x) - (vec_size VectorSize (vector_size ty))) - (value_reg (float_cmp_zero cond rn vec_size)))) - -(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) (splat (f64const (zero_value_f64 x))) y))) - (let ((rn Reg y) - (vec_size VectorSize (vector_size ty))) - (value_reg (not (fcmeq0 rn vec_size) vec_size)))) - -(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) (splat (f64const (zero_value_f64 x))) y))) - (let ((rn Reg y) - (vec_size VectorSize (vector_size ty))) - (value_reg (float_cmp_zero_swap cond rn vec_size)))) ;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond_not_eq cond) x (splat (iconst (zero_value y)))))) +(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond_not_eq cond) x y))) + (if (zero_value y)) (let ((rn Reg x) (vec_size VectorSize (vector_size ty))) (value_reg (not (cmeq0 rn vec_size) vec_size)))) -(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond cond) x (splat (iconst (zero_value y)))))) +(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond cond) x y))) + (if (zero_value y)) (let ((rn Reg x) (vec_size VectorSize (vector_size ty))) (value_reg (int_cmp_zero cond rn vec_size)))) -(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond_not_eq cond) (splat (iconst (zero_value x))) y))) +(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond_not_eq cond) x y))) + 
(if (zero_value x)) (let ((rn Reg y) (vec_size VectorSize (vector_size ty))) (value_reg (not (cmeq0 rn vec_size) vec_size)))) -(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond cond) (splat (iconst (zero_value x))) y))) +(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond cond) x y))) + (if (zero_value x)) (let ((rn Reg y) (vec_size VectorSize (vector_size ty))) (value_reg (int_cmp_zero_swap cond rn vec_size)))) @@ -1624,3 +1611,53 @@ (rule (lower (and (has_type (valid_atomic_transaction ty) (atomic_cas flags addr src1 src2)))) (atomic_cas_loop addr src1 src2 ty)) + + +;;;; Rules for 'fvdemote' ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (fvdemote x)) + (fcvtn x (ScalarSize.Size32))) + + +;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type (ty_vec128_int ty) (snarrow x y))) + (if (zero_value y)) + (sqxtn x (lane_size ty))) + +(rule (lower (has_type (ty_vec64_int ty) (snarrow x y))) + (let ((dst Reg (mov_vec_elem x y 1 0 (VectorSize.Size64x2)))) + (sqxtn dst (lane_size ty)))) + +(rule (lower (has_type (ty_vec128_int ty) (snarrow x y))) + (let ((low_half Reg (sqxtn x (lane_size ty))) + (result Reg (sqxtn2 low_half y (lane_size ty)))) + result)) + + +;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type (ty_vec128_int ty) (unarrow x y))) + (if (zero_value y)) + (sqxtun x (lane_size ty))) + +(rule (lower (has_type (ty_vec64_int ty) (unarrow x y))) + (let ((dst Reg (mov_vec_elem x y 1 0 (VectorSize.Size64x2)))) + (sqxtun dst (lane_size ty)))) + +(rule (lower (has_type (ty_vec128_int ty) (unarrow x y))) + (let ((low_half Reg (sqxtun x (lane_size ty))) + (result Reg (sqxtun2 low_half y (lane_size ty)))) + result)) + + +;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type (ty_vec128_int ty) (uunarrow x y))) + (if (zero_value y)) + (uqxtn x (lane_size ty))) + +(rule (lower (has_type (ty_vec64_int ty) (uunarrow x y))) + (let ((dst Reg (mov_vec_elem x y 1 0 (VectorSize.Size64x2)))) + (uqxtn dst (lane_size ty)))) + +(rule (lower (has_type (ty_vec128_int ty) (uunarrow x y))) + (let ((low_half Reg (uqxtn x (lane_size ty))) + (result Reg (uqxtn2 low_half y (lane_size ty)))) + result)) diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs index fa4556f03d..8ae644955d 100644 --- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs +++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs @@ -424,25 +424,4 @@ where _ => panic!(), } } - - fn zero_value(&mut self, value: Imm64) -> Option { - if value.bits() == 0 { - return Some(value); - } - None - } - - fn zero_value_f32(&mut self, value: Ieee32) -> Option { - if value.bits() == 0 { - return Some(value); - } - None - } - - fn zero_value_f64(&mut self, value: Ieee64) -> Option { - if value.bits() == 0 { - return Some(value); - } - None - } } diff --git a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle index 12d20b3e3d..a58f6f28a0 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle +++ b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle @@ -47,6 +47,48 @@ (vec_rrr (VecALUOp.Fcmgt) (put_in_reg y) (put_in_reg x) (vector_size ty)) (put_in_reg y) (put_in_reg x)))) +;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type (ty_dyn128_int ty) (snarrow x y))) + 
(if-let _ (zero_value y)) + (sqxtn x (lane_size ty))) + +(rule (lower (has_type (ty_dyn64_int ty) (snarrow x y))) + (let ((dst Reg (mov_vec_elem x y 1 0 (VectorSize.Size64x2)))) + (sqxtn dst (lane_size ty)))) + +(rule (lower (has_type (ty_dyn128_int ty) (snarrow x y))) + (let ((low_half Reg (sqxtn x (lane_size ty))) + (result Reg (sqxtn2 low_half y (lane_size ty)))) + result)) + +;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type (ty_dyn128_int ty) (unarrow x y))) + (if-let _ (zero_value y)) + (sqxtun x (lane_size ty))) + +(rule (lower (has_type (ty_dyn64_int ty) (unarrow x y))) + (let ((dst Reg (mov_vec_elem x y 1 0 (VectorSize.Size64x2)))) + (sqxtun dst (lane_size ty)))) + +(rule (lower (has_type (ty_dyn128_int ty) (unarrow x y))) + (let ((low_half Reg (sqxtun x (lane_size ty))) + (result Reg (sqxtun2 low_half y (lane_size ty)))) + result)) + +;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type (ty_dyn128_int ty) (uunarrow x y))) + (if-let _ (zero_value y)) + (uqxtn x (lane_size ty))) + +(rule (lower (has_type (ty_dyn64_int ty) (uunarrow x y))) + (let ((dst Reg (mov_vec_elem x y 1 0 (VectorSize.Size64x2)))) + (uqxtn dst (lane_size ty)))) + +(rule (lower (has_type (ty_dyn128_int ty) (uunarrow x y))) + (let ((low_half Reg (uqxtn x (lane_size ty))) + (result Reg (uqxtn2 low_half y (lane_size ty)))) + result)) + ;;; Rules for `dynamic_stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (dynamic_stack_addr stack_slot)) (let ((dst WritableReg (temp_writable_reg $I64)) diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 9c166e1c53..18fa8d24ef 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1767,57 +1767,7 @@ pub(crate) fn lower_insn_to_regs>( }); } - Opcode::Snarrow | Opcode::Unarrow | Opcode::Uunarrow => { - let nonzero_high_half = maybe_input_insn(ctx, inputs[1], Opcode::Vconst) - .map_or(true, |insn| { - const_param_to_u128(ctx, insn).expect("Invalid immediate bytes") != 0 - }); - let ty = ty.unwrap(); - let ty = if ty.is_dynamic_vector() { - ty.dynamic_to_vector() - .unwrap_or_else(|| panic!("Unsupported dynamic type: {}?", ty)) - } else { - ty - }; - - let op = match (op, ty) { - (Opcode::Snarrow, I8X16) => VecRRNarrowOp::Sqxtn16, - (Opcode::Snarrow, I16X8) => VecRRNarrowOp::Sqxtn32, - (Opcode::Snarrow, I32X4) => VecRRNarrowOp::Sqxtn64, - (Opcode::Unarrow, I8X16) => VecRRNarrowOp::Sqxtun16, - (Opcode::Unarrow, I16X8) => VecRRNarrowOp::Sqxtun32, - (Opcode::Unarrow, I32X4) => VecRRNarrowOp::Sqxtun64, - (Opcode::Uunarrow, I8X16) => VecRRNarrowOp::Uqxtn16, - (Opcode::Uunarrow, I16X8) => VecRRNarrowOp::Uqxtn32, - (Opcode::Uunarrow, I32X4) => VecRRNarrowOp::Uqxtn64, - (_, ty) => { - return Err(CodegenError::Unsupported(format!( - "{}: Unsupported type: {:?}", - op, ty - ))) - } - }; - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - - ctx.emit(Inst::VecRRNarrow { - op, - rd, - rn, - high_half: false, - }); - - if nonzero_high_half { - let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - - ctx.emit(Inst::VecRRNarrow { - op, - rd, - rn, - high_half: true, - }); - } - } + Opcode::Snarrow | Opcode::Unarrow | Opcode::Uunarrow => implemented_in_isle(ctx), Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => { let rd = 
get_output_reg(ctx, outputs[0]).only_reg().unwrap();
@@ -1940,19 +1890,7 @@ pub(crate) fn lower_insn_to_regs>(
             });
         }

-        Opcode::Fvdemote => {
-            debug_assert_eq!(ty.unwrap(), F32X4);
-
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-
-            ctx.emit(Inst::VecRRNarrow {
-                op: VecRRNarrowOp::Fcvtn64,
-                rd,
-                rn,
-                high_half: false,
-            });
-        }
+        Opcode::Fvdemote => implemented_in_isle(ctx),

         Opcode::ExtractVector => implemented_in_isle(ctx),

diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs
index 74d0f7976c..1ca00533c6 100644
--- a/cranelift/codegen/src/machinst/isle.rs
+++ b/cranelift/codegen/src/machinst/isle.rs
@@ -354,6 +354,15 @@ macro_rules! isle_prelude_methods {
             }
         }

+        #[inline]
+        fn ty_vec64_int(&mut self, ty: Type) -> Option<Type> {
+            if ty.is_vector() && ty.bits() == 64 && ty.lane_type().is_int() {
+                Some(ty)
+            } else {
+                None
+            }
+        }
+
         #[inline]
         fn ty_vec128_int(&mut self, ty: Type) -> Option<Type> {
             if ty.is_vector() && ty.bits() == 128 && ty.lane_type().is_int() {
                 Some(ty)
             } else {
                 None
             }
         }

@@ -470,6 +479,24 @@ macro_rules! isle_prelude_methods {
             }
         }

+        #[inline]
+        fn ty_dyn64_int(&mut self, ty: Type) -> Option<Type> {
+            if ty.is_dynamic_vector() && ty.min_bits() == 64 && ty.lane_type().is_int() {
+                Some(ty)
+            } else {
+                None
+            }
+        }
+
+        #[inline]
+        fn ty_dyn128_int(&mut self, ty: Type) -> Option<Type> {
+            if ty.is_dynamic_vector() && ty.min_bits() == 128 && ty.lane_type().is_int() {
+                Some(ty)
+            } else {
+                None
+            }
+        }
+
         #[inline]
         fn def_inst(&mut self, val: Value) -> Option<Inst> {
             self.lower_ctx.dfg().value_def(val).inst()
@@ -487,6 +514,59 @@ macro_rules! isle_prelude_methods {
             val
         }

+        fn zero_value(&mut self, value: Value) -> Option<Value> {
+            let insn = self.def_inst(value);
+            if insn.is_some() {
+                let insn = insn.unwrap();
+                let inst_data = self.lower_ctx.data(insn);
+                match inst_data {
+                    InstructionData::Unary {
+                        opcode: Opcode::Splat,
+                        arg,
+                    } => {
+                        let arg = arg.clone();
+                        return self.zero_value(arg);
+                    }
+                    InstructionData::UnaryConst {
+                        opcode: Opcode::Vconst,
+                        constant_handle,
+                    } => {
+                        let constant_data =
+                            self.lower_ctx.get_constant_data(*constant_handle).clone();
+                        if constant_data.into_vec().iter().any(|&x| x != 0) {
+                            return None;
+                        } else {
+                            return Some(value);
+                        }
+                    }
+                    InstructionData::UnaryImm { imm, .. } => {
+                        if imm.bits() == 0 {
+                            return Some(value);
+                        } else {
+                            return None;
+                        }
+                    }
+                    InstructionData::UnaryIeee32 { imm, .. } => {
+                        if imm.bits() == 0 {
+                            return Some(value);
+                        } else {
+                            return None;
+                        }
+                    }
+                    InstructionData::UnaryIeee64 { imm, .. } => {
+                        if imm.bits() == 0 {
+                            return Some(value);
+                        } else {
+                            return None;
+                        }
+                    }
+                    _ => None,
+                }
+            } else {
+                None
+            }
+        }
+
         fn not_i64x2(&mut self, ty: Type) -> Option<()> {
             if ty == I64X2 {
                 None
diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle
index a47fcd63b8..a18828ca38 100644
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -341,6 +341,11 @@
 (decl ty_vec128 (Type) Type)
 (extern extractor ty_vec128 ty_vec128)

+;; An extractor that only matches 64-bit vector types with integer
+;; lanes (I8X8, I16X4, I32X2)
+(decl ty_vec64_int (Type) Type)
+(extern extractor ty_vec64_int ty_vec64_int)
+
 ;; An extractor that only matches 128-bit vector types with integer
 ;; lanes (I8X16, I16X8, I32X4, I64X2).
(decl ty_vec128_int (Type) Type) @@ -458,6 +463,16 @@ (decl dynamic_fp_lane (u32) Type) (extern extractor dynamic_fp_lane dynamic_fp_lane) +;; An extractor that only matches 64-bit dynamic vector types with integer +;; lanes (I8X8XN, I16X4XN, I32X2XN) +(decl ty_dyn64_int (Type) Type) +(extern extractor ty_dyn64_int ty_dyn64_int) + +;; An extractor that only matches 128-bit dynamic vector types with integer +;; lanes (I8X16XN, I16X8XN, I32X4XN, I64X2XN). +(decl ty_dyn128_int (Type) Type) +(extern extractor ty_dyn128_int ty_dyn128_int) + ;; Match the instruction that defines the given value, if any. (decl def_inst (Inst) Value) (extern extractor def_inst def_inst) @@ -471,6 +486,10 @@ (decl offset32_to_u32 (Offset32) u32) (extern constructor offset32_to_u32 offset32_to_u32) +;; Match any zero value for iconst, fconst32, fconst64, vconst and splat. +(decl pure zero_value (Value) Value) +(extern constructor zero_value zero_value) + ;; Instruction creation helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Emit an instruction. diff --git a/cranelift/filetests/filetests/isa/aarch64/compare_zero.clif b/cranelift/filetests/filetests/isa/aarch64/compare_zero.clif index 122ea536a4..6827b774ca 100644 --- a/cranelift/filetests/filetests/isa/aarch64/compare_zero.clif +++ b/cranelift/filetests/filetests/isa/aarch64/compare_zero.clif @@ -14,6 +14,17 @@ block0(v0: i8x16): ; cmeq v0.16b, v0.16b, #0 ; ret +function %f0_vconst(i8x16) -> b8x16 { +block0(v0: i8x16): + v1 = vconst.i8x16 0x00 + v2 = icmp eq v0, v1 + return v2 +} + +; block0: +; cmeq v0.16b, v0.16b, #0 +; ret + function %f1(i16x8) -> b16x8 { block0(v0: i16x8): v1 = iconst.i16 0 @@ -26,6 +37,17 @@ block0(v0: i16x8): ; cmeq v0.8h, v0.8h, #0 ; ret +function %f1_vconst(i16x8) -> b16x8 { +block0(v0: i16x8): + v1 = vconst.i16x8 0x00 + v2 = icmp eq v1, v0 + return v2 +} + +; block0: +; cmeq v0.8h, v0.8h, #0 +; ret + function %f2(i32x4) -> b32x4 { block0(v0: i32x4): v1 = iconst.i32 0 @@ -39,6 +61,18 @@ block0(v0: i32x4): ; mvn v0.16b, v3.16b ; ret +function %f2_vconst(i32x4) -> b32x4 { +block0(v0: i32x4): + v1 = vconst.i32x4 0x00 + v2 = icmp ne v0, v1 + return v2 +} + +; block0: +; cmeq v3.4s, v0.4s, #0 +; mvn v0.16b, v3.16b +; ret + function %f3(i64x2) -> b64x2 { block0(v0: i64x2): v1 = iconst.i64 0 @@ -52,6 +86,18 @@ block0(v0: i64x2): ; mvn v0.16b, v3.16b ; ret +function %f3_vconst(i64x2) -> b64x2 { +block0(v0: i64x2): + v1 = vconst.i64x2 0x00 + v2 = icmp ne v1, v0 + return v2 +} + +; block0: +; cmeq v3.2d, v0.2d, #0 +; mvn v0.16b, v3.16b +; ret + function %f4(i8x16) -> b8x16 { block0(v0: i8x16): v1 = iconst.i8 0 @@ -64,6 +110,17 @@ block0(v0: i8x16): ; cmle v0.16b, v0.16b, #0 ; ret +function %f4_vconst(i8x16) -> b8x16 { +block0(v0: i8x16): + v1 = vconst.i8x16 0x00 + v2 = icmp sle v0, v1 + return v2 +} + +; block0: +; cmle v0.16b, v0.16b, #0 +; ret + function %f5(i16x8) -> b16x8 { block0(v0: i16x8): v1 = iconst.i16 0 @@ -76,6 +133,17 @@ block0(v0: i16x8): ; cmge v0.8h, v0.8h, #0 ; ret +function %f5_vconst(i16x8) -> b16x8 { +block0(v0: i16x8): + v1 = vconst.i16x8 0x00 + v2 = icmp sle v1, v0 + return v2 +} + +; block0: +; cmge v0.8h, v0.8h, #0 +; ret + function %f6(i32x4) -> b32x4 { block0(v0: i32x4): v1 = iconst.i32 0 @@ -88,6 +156,17 @@ block0(v0: i32x4): ; cmge v0.4s, v0.4s, #0 ; ret +function %f6_vconst(i32x4) -> b32x4 { +block0(v0: i32x4): + v1 = vconst.i32x4 0x00 + v2 = icmp sge v0, v1 + return v2 +} + +; block0: +; cmge v0.4s, v0.4s, #0 +; ret + function %f7(i64x2) -> b64x2 { block0(v0: i64x2): v1 = iconst.i64 0 @@ -100,6 +179,17 @@ 
block0(v0: i64x2): ; cmle v0.2d, v0.2d, #0 ; ret +function %f7_vconst(i64x2) -> b64x2 { +block0(v0: i64x2): + v1 = vconst.i64x2 0x00 + v2 = icmp sge v1, v0 + return v2 +} + +; block0: +; cmle v0.2d, v0.2d, #0 +; ret + function %f8(i8x16) -> b8x16 { block0(v0: i8x16): v1 = iconst.i8 0 @@ -112,6 +202,17 @@ block0(v0: i8x16): ; cmlt v0.16b, v0.16b, #0 ; ret +function %f8_vconst(i8x16) -> b8x16 { +block0(v0: i8x16): + v1 = vconst.i8x16 0x00 + v2 = icmp slt v0, v1 + return v2 +} + +; block0: +; cmlt v0.16b, v0.16b, #0 +; ret + function %f9(i16x8) -> b16x8 { block0(v0: i16x8): v1 = iconst.i16 0 @@ -124,6 +225,17 @@ block0(v0: i16x8): ; cmgt v0.8h, v0.8h, #0 ; ret +function %f9_vconst(i16x8) -> b16x8 { +block0(v0: i16x8): + v1 = vconst.i16x8 0x00 + v2 = icmp slt v1, v0 + return v2 +} + +; block0: +; cmgt v0.8h, v0.8h, #0 +; ret + function %f10(i32x4) -> b32x4 { block0(v0: i32x4): v1 = iconst.i32 0 @@ -136,6 +248,17 @@ block0(v0: i32x4): ; cmgt v0.4s, v0.4s, #0 ; ret +function %f10_vconst(i32x4) -> b32x4 { +block0(v0: i32x4): + v1 = vconst.i32x4 0x00 + v2 = icmp sgt v0, v1 + return v2 +} + +; block0: +; cmgt v0.4s, v0.4s, #0 +; ret + function %f11(i64x2) -> b64x2 { block0(v0: i64x2): v1 = iconst.i64 0 @@ -148,6 +271,17 @@ block0(v0: i64x2): ; cmlt v0.2d, v0.2d, #0 ; ret +function %f11_vconst(i64x2) -> b64x2 { +block0(v0: i64x2): + v1 = vconst.i64x2 0x00 + v2 = icmp sgt v1, v0 + return v2 +} + +; block0: +; cmlt v0.2d, v0.2d, #0 +; ret + function %f12(f32x4) -> b32x4 { block0(v0: f32x4): v1 = f32const 0.0 @@ -160,6 +294,17 @@ block0(v0: f32x4): ; fcmeq v0.4s, v0.4s, #0.0 ; ret +function %f12_vconst(f32x4) -> b32x4 { +block0(v0: f32x4): + v1 = vconst.f32x4 [0.0 0.0 0.0 0.0] + v2 = fcmp eq v0, v1 + return v2 +} + +; block0: +; fcmeq v0.4s, v0.4s, #0.0 +; ret + function %f13(f64x2) -> b64x2 { block0(v0: f64x2): v1 = f64const 0.0 @@ -172,6 +317,17 @@ block0(v0: f64x2): ; fcmeq v0.2d, v0.2d, #0.0 ; ret +function %f13_vconst(f64x2) -> b64x2 { +block0(v0: f64x2): + v1 = vconst.f64x2 [0.0 0.0] + v2 = fcmp eq v1, v0 + return v2 +} + +; block0: +; fcmeq v0.2d, v0.2d, #0.0 +; ret + function %f14(f64x2) -> b64x2 { block0(v0: f64x2): v1 = f64const 0.0 @@ -185,6 +341,18 @@ block0(v0: f64x2): ; mvn v0.16b, v3.16b ; ret +function %f14_vconst(f64x2) -> b64x2 { +block0(v0: f64x2): + v1 = vconst.f64x2 [0.0 0.0] + v2 = fcmp ne v0, v1 + return v2 +} + +; block0: +; fcmeq v3.2d, v0.2d, #0.0 +; mvn v0.16b, v3.16b +; ret + function %f15(f32x4) -> b32x4 { block0(v0: f32x4): v1 = f32const 0.0 @@ -198,6 +366,18 @@ block0(v0: f32x4): ; mvn v0.16b, v3.16b ; ret +function %f15_vconst(f32x4) -> b32x4 { +block0(v0: f32x4): + v1 = vconst.f32x4 [0.0 0.0 0.0 0.0] + v2 = fcmp ne v1, v0 + return v2 +} + +; block0: +; fcmeq v3.4s, v0.4s, #0.0 +; mvn v0.16b, v3.16b +; ret + function %f16(f32x4) -> b32x4 { block0(v0: f32x4): v1 = f32const 0.0 @@ -210,6 +390,17 @@ block0(v0: f32x4): ; fcmle v0.4s, v0.4s, #0.0 ; ret +function %f16_vconst(f32x4) -> b32x4 { +block0(v0: f32x4): + v1 = vconst.f32x4 [0.0 0.0 0.0 0.0] + v2 = fcmp le v0, v1 + return v2 +} + +; block0: +; fcmle v0.4s, v0.4s, #0.0 +; ret + function %f17(f64x2) -> b64x2 { block0(v0: f64x2): v1 = f64const 0.0 @@ -222,6 +413,17 @@ block0(v0: f64x2): ; fcmge v0.2d, v0.2d, #0.0 ; ret +function %f17_vconst(f64x2) -> b64x2 { +block0(v0: f64x2): + v1 = vconst.f64x2 [0.0 0.0] + v2 = fcmp le v1, v0 + return v2 +} + +; block0: +; fcmge v0.2d, v0.2d, #0.0 +; ret + function %f18(f64x2) -> b64x2 { block0(v0: f64x2): v1 = f64const 0.0 @@ -234,6 +436,17 @@ block0(v0: f64x2): ; fcmge v0.2d, v0.2d, 
#0.0 ; ret +function %f18_vconst(f64x2) -> b64x2 { +block0(v0: f64x2): + v1 = vconst.f64x2 [0.0 0.0] + v2 = fcmp ge v0, v1 + return v2 +} + +; block0: +; fcmge v0.2d, v0.2d, #0.0 +; ret + function %f19(f32x4) -> b32x4 { block0(v0: f32x4): v1 = f32const 0.0 @@ -246,6 +459,17 @@ block0(v0: f32x4): ; fcmle v0.4s, v0.4s, #0.0 ; ret +function %f19_vconst(f32x4) -> b32x4 { +block0(v0: f32x4): + v1 = vconst.f32x4 [0.0 0.0 0.0 0.0] + v2 = fcmp ge v1, v0 + return v2 +} + +; block0: +; fcmle v0.4s, v0.4s, #0.0 +; ret + function %f20(f32x4) -> b32x4 { block0(v0: f32x4): v1 = f32const 0.0 @@ -258,6 +482,17 @@ block0(v0: f32x4): ; fcmlt v0.4s, v0.4s, #0.0 ; ret +function %f20_vconst(f32x4) -> b32x4 { +block0(v0: f32x4): + v1 = vconst.f32x4 [0.0 0.0 0.0 0.0] + v2 = fcmp lt v0, v1 + return v2 +} + +; block0: +; fcmlt v0.4s, v0.4s, #0.0 +; ret + function %f21(f64x2) -> b64x2 { block0(v0: f64x2): v1 = f64const 0.0 @@ -270,6 +505,17 @@ block0(v0: f64x2): ; fcmgt v0.2d, v0.2d, #0.0 ; ret +function %f21_vconst(f64x2) -> b64x2 { +block0(v0: f64x2): + v1 = vconst.f64x2 [0.0 0.0] + v2 = fcmp lt v1, v0 + return v2 +} + +; block0: +; fcmgt v0.2d, v0.2d, #0.0 +; ret + function %f22(f64x2) -> b64x2 { block0(v0: f64x2): v1 = f64const 0.0 @@ -282,6 +528,17 @@ block0(v0: f64x2): ; fcmgt v0.2d, v0.2d, #0.0 ; ret +function %f22_vconst(f64x2) -> b64x2 { +block0(v0: f64x2): + v1 = vconst.f64x2 [0.0 0.0] + v2 = fcmp gt v0, v1 + return v2 +} + +; block0: +; fcmgt v0.2d, v0.2d, #0.0 +; ret + function %f23(f32x4) -> b32x4 { block0(v0: f32x4): v1 = f32const 0.0 @@ -294,3 +551,13 @@ block0(v0: f32x4): ; fcmlt v0.4s, v0.4s, #0.0 ; ret +function %f23_vconst(f32x4) -> b32x4 { +block0(v0: f32x4): + v1 = vconst.f32x4 [0.0 0.0 0.0 0.0] + v2 = fcmp gt v1, v0 + return v2 +} + +; block0: +; fcmlt v0.4s, v0.4s, #0.0 +; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif index 7f39747abc..f9e7b32448 100644 --- a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif +++ b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-narrow.clif @@ -1,6 +1,26 @@ -test compile +test compile precise-output target aarch64 +function %snarrow_i16x4(i16) -> i8x8 { + gv0 = dyn_scale_target_const.i16x4 + gv1 = dyn_scale_target_const.i8x8 + dt0 = i16x4*gv0 + dt1 = i8x8*gv0 + +block0(v0: i16): + v1 = splat.dt0 v0 + v2 = snarrow.dt0 v1, v1 + v3 = extract_vector v2, 0 + return v3 +} + +; block0: +; dup v2.4h, w0 +; mov v7.16b, v2.16b +; mov v7.d[1], v2.d[0] +; sqxtn v0.8b, v7.8h +; ret + function %snarrow_i16x8(i16) -> i8x16 { gv0 = dyn_scale_target_const.i16x8 gv1 = dyn_scale_target_const.i8x16 @@ -14,10 +34,31 @@ block0(v0: i16): return v3 } -; check: dup v2.8h, w0 -; nextln: sqxtn v0.8b, v2.8h -; nextln: sqxtn2 v0.16b, v2.8h -; nextln: ret +; block0: +; dup v2.8h, w0 +; sqxtn v0.8b, v2.8h +; sqxtn2 v0.16b, v2.8h +; ret + +function %snarrow_i32x2(i32) -> i16x4 { + gv0 = dyn_scale_target_const.i32x2 + gv1 = dyn_scale_target_const.i16x4 + dt0 = i32x2*gv0 + dt1 = i16x4*gv0 + +block0(v0: i32): + v1 = splat.dt0 v0 + v2 = snarrow.dt0 v1, v1 + v3 = extract_vector v2, 0 + return v3 +} + +; block0: +; dup v2.2s, w0 +; mov v7.16b, v2.16b +; mov v7.d[1], v2.d[0] +; sqxtn v0.4h, v7.4s +; ret function %snarrow_i32x4(i32) -> i16x8 { gv0 = dyn_scale_target_const.i32x4 @@ -32,10 +73,11 @@ block0(v0: i32): return v3 } -; check: dup v2.4s, w0 -; nextln: sqxtn v0.4h, v2.4s -; nextln: sqxtn2 v0.8h, v2.4s -; nextln: ret +; block0: +; dup v2.4s, w0 +; sqxtn v0.4h, 
v2.4s +; sqxtn2 v0.8h, v2.4s +; ret function %snarrow_i64x2(i64) -> i32x4 { gv0 = dyn_scale_target_const.i64x2 @@ -50,10 +92,31 @@ block0(v0: i64): return v3 } -; check: dup v2.2d, x0 -; nextln: sqxtn v0.2s, v2.2d -; nextln: sqxtn2 v0.4s, v2.2d -; nextln: ret +; block0: +; dup v2.2d, x0 +; sqxtn v0.2s, v2.2d +; sqxtn2 v0.4s, v2.2d +; ret + +function %unarrow_i16x4(i16) -> i8x8 { + gv0 = dyn_scale_target_const.i16x4 + gv1 = dyn_scale_target_const.i8x8 + dt0 = i16x4*gv0 + dt1 = i8x8*gv0 + +block0(v0: i16): + v1 = splat.dt0 v0 + v2 = unarrow.dt0 v1, v1 + v3 = extract_vector v2, 0 + return v3 +} + +; block0: +; dup v2.4h, w0 +; mov v7.16b, v2.16b +; mov v7.d[1], v2.d[0] +; sqxtun v0.8b, v7.8h +; ret function %unarrow_i16x8(i16) -> i8x16 { gv0 = dyn_scale_target_const.i16x8 @@ -68,10 +131,31 @@ block0(v0: i16): return v3 } -; check: dup v2.8h, w0 -; nextln: sqxtun v0.8b, v2.8h -; nextln: sqxtun2 v0.16b, v2.8h -; nextln: ret +; block0: +; dup v2.8h, w0 +; sqxtun v0.8b, v2.8h +; sqxtun2 v0.16b, v2.8h +; ret + +function %unarrow_i32x2(i32) -> i16x4 { + gv0 = dyn_scale_target_const.i32x2 + gv1 = dyn_scale_target_const.i16x4 + dt0 = i32x2*gv0 + dt1 = i16x4*gv0 + +block0(v0: i32): + v1 = splat.dt0 v0 + v2 = unarrow.dt0 v1, v1 + v3 = extract_vector v2, 0 + return v3 +} + +; block0: +; dup v2.2s, w0 +; mov v7.16b, v2.16b +; mov v7.d[1], v2.d[0] +; sqxtun v0.4h, v7.4s +; ret function %unarrow_i32x4(i32) -> i16x8 { gv0 = dyn_scale_target_const.i32x4 @@ -86,10 +170,11 @@ block0(v0: i32): return v3 } -; check: dup v2.4s, w0 -; nextln: sqxtun v0.4h, v2.4s -; nextln: sqxtun2 v0.8h, v2.4s -; nextln: ret +; block0: +; dup v2.4s, w0 +; sqxtun v0.4h, v2.4s +; sqxtun2 v0.8h, v2.4s +; ret function %unarrow_i64x2(i64) -> i32x4 { gv0 = dyn_scale_target_const.i64x2 @@ -104,10 +189,31 @@ block0(v0: i64): return v3 } -; check: dup v2.2d, x0 -; nextln: sqxtun v0.2s, v2.2d -; nextln: sqxtun2 v0.4s, v2.2d -; nextln: ret +; block0: +; dup v2.2d, x0 +; sqxtun v0.2s, v2.2d +; sqxtun2 v0.4s, v2.2d +; ret + +function %uunarrow_i16x4(i16) -> i8x8 { + gv0 = dyn_scale_target_const.i16x4 + gv1 = dyn_scale_target_const.i8x8 + dt0 = i16x4*gv0 + dt1 = i8x8*gv0 + +block0(v0: i16): + v1 = splat.dt0 v0 + v2 = uunarrow.dt0 v1, v1 + v3 = extract_vector v2, 0 + return v3 +} + +; block0: +; dup v2.4h, w0 +; mov v7.16b, v2.16b +; mov v7.d[1], v2.d[0] +; uqxtn v0.8b, v7.8h +; ret function %uunarrow_i16x8(i16) -> i8x16 { gv0 = dyn_scale_target_const.i16x8 @@ -122,10 +228,31 @@ block0(v0: i16): return v3 } -; check: dup v2.8h, w0 -; nextln: uqxtn v0.8b, v2.8h -; nextln: uqxtn2 v0.16b, v2.8h -; nextln: ret +; block0: +; dup v2.8h, w0 +; uqxtn v0.8b, v2.8h +; uqxtn2 v0.16b, v2.8h +; ret + +function %uunarrow_i32x2(i32) -> i16x4 { + gv0 = dyn_scale_target_const.i32x2 + gv1 = dyn_scale_target_const.i16x4 + dt0 = i32x2*gv0 + dt1 = i16x4*gv0 + +block0(v0: i32): + v1 = splat.dt0 v0 + v2 = uunarrow.dt0 v1, v1 + v3 = extract_vector v2, 0 + return v3 +} + +; block0: +; dup v2.2s, w0 +; mov v7.16b, v2.16b +; mov v7.d[1], v2.d[0] +; uqxtn v0.4h, v7.4s +; ret function %uunarrow_i32x4(i32) -> i16x8 { gv0 = dyn_scale_target_const.i32x4 @@ -140,10 +267,11 @@ block0(v0: i32): return v3 } -; check: dup v2.4s, w0 -; nextln: uqxtn v0.4h, v2.4s -; nextln: uqxtn2 v0.8h, v2.4s -; nextln: ret +; block0: +; dup v2.4s, w0 +; uqxtn v0.4h, v2.4s +; uqxtn2 v0.8h, v2.4s +; ret function %uunarrow_i64x2(i64) -> i32x4 { gv0 = dyn_scale_target_const.i64x2 @@ -158,7 +286,9 @@ block0(v0: i64): return v3 } -; check: dup v2.2d, x0 -; nextln: uqxtn v0.2s, v2.2d -; nextln: uqxtn2 
v0.4s, v2.2d -; nextln: ret +; block0: +; dup v2.2d, x0 +; uqxtn v0.2s, v2.2d +; uqxtn2 v0.4s, v2.2d +; ret + diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-narrow.clif b/cranelift/filetests/filetests/isa/aarch64/simd-narrow.clif new file mode 100644 index 0000000000..dcf23e1cfe --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/simd-narrow.clif @@ -0,0 +1,268 @@ +test compile precise-output +set unwind_info=false +target aarch64 + +function %snarrow_i16x4(i16x4, i16x4) -> i8x8 { +block0(v0: i16x4, v1: i16x4): + v2 = snarrow v0, v1 + return v2 +} + +; block0: +; mov v0.d[1], v1.d[0] +; sqxtn v0.8b, v0.8h +; ret + +function %snarrow_i16x8(i16x8, i16x8) -> i8x16 { +block0(v0: i16x8, v1: i16x8): + v2 = snarrow v0, v1 + return v2 +} + +; block0: +; sqxtn v0.8b, v0.8h +; sqxtn2 v0.16b, v1.8h +; ret + +function %snarrow_i32x2(i32x2, i32x2) -> i16x4 { +block0(v0: i32x2, v1: i32x2): + v2 = snarrow v0, v1 + return v2 +} + +; block0: +; mov v0.d[1], v1.d[0] +; sqxtn v0.4h, v0.4s +; ret + +function %snarrow_i32x4(i32x4, i32x4) -> i16x8 { +block0(v0: i32x4, v1: i32x4): + v2 = snarrow v0, v1 + return v2 +} + +; block0: +; sqxtn v0.4h, v0.4s +; sqxtn2 v0.8h, v1.4s +; ret + +function %snarrow_i64x2(i64x2, i64x2) -> i32x4 { +block0(v0: i64x2, v1: i64x2): + v2 = snarrow v0, v1 + return v2 +} + +; block0: +; sqxtn v0.2s, v0.2d +; sqxtn2 v0.4s, v1.2d +; ret + +function %unarrow_i16x4(i16x4, i16x4) -> i8x8 { +block0(v0: i16x4, v1: i16x4): + v2 = unarrow v0, v1 + return v2 +} + +; block0: +; mov v0.d[1], v1.d[0] +; sqxtun v0.8b, v0.8h +; ret + +function %unarrow_i16x8(i16x8, i16x8) -> i8x16 { +block0(v0: i16x8, v1: i16x8): + v2 = unarrow v0, v1 + return v2 +} + +; block0: +; sqxtun v0.8b, v0.8h +; sqxtun2 v0.16b, v1.8h +; ret + +function %unarrow_i32x2(i32x2, i32x2) -> i16x4 { +block0(v0: i32x2, v1: i32x2): + v2 = unarrow v0, v1 + return v2 +} + +; block0: +; mov v0.d[1], v1.d[0] +; sqxtun v0.4h, v0.4s +; ret + +function %unarrow_i32x4(i32x4, i32x4) -> i16x8 { +block0(v0: i32x4, v1: i32x4): + v2 = unarrow v0, v1 + return v2 +} + +; block0: +; sqxtun v0.4h, v0.4s +; sqxtun2 v0.8h, v1.4s +; ret + +function %unarrow_i64x2(i64x2, i64x2) -> i32x4 { +block0(v0: i64x2, v1: i64x2): + v2 = unarrow v0, v1 + return v2 +} + +; block0: +; sqxtun v0.2s, v0.2d +; sqxtun2 v0.4s, v1.2d +; ret + +function %uunarrow_i16x4(i16x4, i16x4) -> i8x8 { +block0(v0: i16x4, v1: i16x4): + v2 = uunarrow v0, v1 + return v2 +} + +; block0: +; mov v0.d[1], v1.d[0] +; uqxtn v0.8b, v0.8h +; ret + +function %uunarrow_i16x8(i16x8, i16x8) -> i8x16 { +block0(v0: i16x8, v1: i16x8): + v2 = uunarrow v0, v1 + return v2 +} + +; block0: +; uqxtn v0.8b, v0.8h +; uqxtn2 v0.16b, v1.8h +; ret + +function %uunarrow_i32x2(i32x2, i32x2) -> i16x4 { +block0(v0: i32x2, v1: i32x2): + v2 = uunarrow v0, v1 + return v2 +} + +; block0: +; mov v0.d[1], v1.d[0] +; uqxtn v0.4h, v0.4s +; ret + +function %uunarrow_i32x4(i32x4, i32x4) -> i16x8 { +block0(v0: i32x4, v1: i32x4): + v2 = uunarrow v0, v1 + return v2 +} + +; block0: +; uqxtn v0.4h, v0.4s +; uqxtn2 v0.8h, v1.4s +; ret + +function %uunarrow_i64x2(i64x2, i64x2) -> i32x4 { +block0(v0: i64x2, v1: i64x2): + v2 = uunarrow v0, v1 + return v2 +} + +; block0: +; uqxtn v0.2s, v0.2d +; uqxtn2 v0.4s, v1.2d +; ret + +function %snarrow_i16x8_zero(i16x8) -> i8x16 { +block0(v0: i16x8): + v1 = vconst.i16x8 0x00 + v2 = snarrow v0, v1 + return v2 +} + +; block0: +; sqxtn v0.8b, v0.8h +; ret + +function %snarrow_i32x4_zero(i32x4) -> i16x8 { +block0(v0: i32x4): + v1 = vconst.i32x4 0x00 + v2 = snarrow v0, v1 + 
return v2 +} + +; block0: +; sqxtn v0.4h, v0.4s +; ret + +function %snarrow_i64x2_zero(i64x2) -> i32x4 { +block0(v0: i64x2): + v1 = vconst.i64x2 0x00 + v2 = snarrow v0, v1 + return v2 +} + +; block0: +; sqxtn v0.2s, v0.2d +; ret + +function %unarrow_i16x8_zero(i16x8) -> i8x16 { +block0(v0: i16x8): + v1 = vconst.i16x8 0x00 + v2 = unarrow v0, v1 + return v2 +} + +; block0: +; sqxtun v0.8b, v0.8h +; ret + +function %unarrow_i32x4_zero(i32x4) -> i16x8 { +block0(v0: i32x4): + v1 = vconst.i32x4 0x00 + v2 = unarrow v0, v1 + return v2 +} + +; block0: +; sqxtun v0.4h, v0.4s +; ret + +function %unarrow_i64x2_zero(i64x2) -> i32x4 { +block0(v0: i64x2): + v1 = vconst.i64x2 0x00 + v2 = unarrow v0, v1 + return v2 +} + +; block0: +; sqxtun v0.2s, v0.2d +; ret + +function %uunarrow_i16x8_zero(i16x8) -> i8x16 { +block0(v0: i16x8): + v1 = vconst.i16x8 0x00 + v2 = uunarrow v0, v1 + return v2 +} + +; block0: +; uqxtn v0.8b, v0.8h +; ret + +function %uunarrow_i32x4_zero(i32x4) -> i16x8 { +block0(v0: i32x4): + v1 = vconst.i32x4 0x00 + v2 = uunarrow v0, v1 + return v2 +} + +; block0: +; uqxtn v0.4h, v0.4s +; ret + +function %uunarrow_i64x2_zero(i64x2) -> i32x4 { +block0(v0: i64x2): + v1 = vconst.i64x2 0x00 + v2 = uunarrow v0, v1 + return v2 +} + +; block0: +; uqxtn v0.2s, v0.2d +; ret + diff --git a/cranelift/filetests/filetests/runtests/dynamic-simd-narrow-widen.clif b/cranelift/filetests/filetests/runtests/dynamic-simd-narrow-widen.clif new file mode 100644 index 0000000000..e3ff5416c9 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/dynamic-simd-narrow-widen.clif @@ -0,0 +1,230 @@ +test run +target aarch64 + +function %snarrow_i16x8(i16, i16) -> i8x16 { + gv0 = dyn_scale_target_const.i16x8 + gv1 = dyn_scale_target_const.i8x16 + dt0 = i16x8*gv0 + dt1 = i8x16*gv0 + +block0(v0: i16, v1: i16): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = snarrow.dt0 v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %snarrow_i16x8(1, -1) == [1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1 -1 -1] +; run: %snarrow_i16x8(32767, -32768) == [127 127 127 127 127 127 127 127 -128 -128 -128 -128 -128 -128 -128 -128] + +function %snarrow_i32x4(i32, i32) -> i16x8 { + gv0 = dyn_scale_target_const.i32x4 + gv1 = dyn_scale_target_const.i16x8 + dt0 = i32x4*gv0 + dt1 = i16x8*gv0 + +block0(v0: i32, v1: i32): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = snarrow.dt0 v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %snarrow_i32x4(1, -1) == [1 1 1 1 -1 -1 -1 -1] +; run: %snarrow_i32x4(-65536, 65535) == [-32768 -32768 -32768 -32768 32767 32767 32767 32767] + +function %snarrow_i64x2(i64, i64) -> i32x4 { + gv0 = dyn_scale_target_const.i64x2 + gv1 = dyn_scale_target_const.i32x4 + dt0 = i64x2*gv0 + dt1 = i32x4*gv0 + +block0(v0: i64, v1: i64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = snarrow.dt0 v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %snarrow_i64x2(-65536, -5000000000) == [-65536 -65536 -2147483648 -2147483648] +; run: %snarrow_i64x2(65535, 5000000000) == [65535 65535 2147483647 2147483647] + +function %unarrow_i16x8(i16, i16) -> i8x16 { + gv0 = dyn_scale_target_const.i16x8 + gv1 = dyn_scale_target_const.i8x16 + dt0 = i16x8*gv0 + dt1 = i8x16*gv0 + +block0(v0: i16, v1:i16): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = unarrow.dt0 v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %unarrow_i16x8(1, -1) == [1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0] +; run: %unarrow_i16x8(32767, -32768) == [255 255 255 255 255 255 255 255 0 0 0 0 0 0 0 0] + +function %unarrow_i32x4(i32, i32) -> i16x8 { + gv0 = 
dyn_scale_target_const.i32x4 + gv1 = dyn_scale_target_const.i16x8 + dt0 = i32x4*gv0 + dt1 = i16x8*gv0 + +block0(v0: i32, v1: i32): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = unarrow.dt0 v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %unarrow_i32x4(1, -1) == [1 1 1 1 0 0 0 0] +; run: %unarrow_i32x4(65536, -65536) == [65535 65535 65535 65535 0 0 0 0] + +function %unarrow_i64x2(i64, i64) -> i32x4 { + gv0 = dyn_scale_target_const.i64x2 + gv1 = dyn_scale_target_const.i32x4 + dt0 = i64x2*gv0 + dt1 = i32x4*gv0 + +block0(v0: i64, v1: i64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = unarrow.dt0 v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %unarrow_i64x2(1, -1) == [1 1 0 0] +; run: %unarrow_i64x2(4294967296, 1) == [4294967295 4294967295 1 1] + +function %uunarrow_i16x8(i16, i16) -> i8x16 { + gv0 = dyn_scale_target_const.i16x8 + gv1 = dyn_scale_target_const.i8x16 + dt0 = i16x8*gv0 + dt1 = i8x16*gv0 + +block0(v0: i16, v1:i16): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = uunarrow.dt0 v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %uunarrow_i16x8(1, -1) == [1 1 1 1 1 1 1 1 255 255 255 255 255 255 255 255] +; run: %uunarrow_i16x8(32767, -32768) == [255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255] + +function %uunarrow_i32x4(i32, i32) -> i16x8 { + gv0 = dyn_scale_target_const.i32x4 + gv1 = dyn_scale_target_const.i16x8 + dt0 = i32x4*gv0 + dt1 = i16x8*gv0 + +block0(v0: i32, v1: i32): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = uunarrow.dt0 v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %uunarrow_i32x4(1, -1) == [1 1 1 1 65535 65535 65535 65535] +; run: %uunarrow_i32x4(65536, -65536) == [65535 65535 65535 65535 65535 65535 65535 65535] + +function %uunarrow_i64x2(i64, i64) -> i32x4 { + gv0 = dyn_scale_target_const.i64x2 + gv1 = dyn_scale_target_const.i32x4 + dt0 = i64x2*gv0 + dt1 = i32x4*gv0 + +block0(v0: i64, v1: i64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = uunarrow.dt0 v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %uunarrow_i64x2(1, -1) == [1 1 4294967295 4294967295] +; run: %uunarrow_i64x2(4294967296, 1) == [4294967295 4294967295 1 1] + +function %swidenhigh_i8x16(i8) -> i16x8 { + gv0 = dyn_scale_target_const.i16x8 + gv1 = dyn_scale_target_const.i8x16 + dt0 = i8x16*gv1 + dt1 = i16x8*gv0 + +block0(v0: i8): + v1 = splat.dt0 v0 + v2 = swiden_high v1 + v3 = extract_vector v2, 0 + return v3 +} +; run: %swidenhigh_i8x16(9) == [9 9 9 9 9 9 9 9] + +function %swidenhigh_i16x8(i16) -> i32x4 { + gv0 = dyn_scale_target_const.i32x4 + gv1 = dyn_scale_target_const.i16x8 + dt0 = i16x8*gv1 + dt1 = i32x4*gv0 + +block0(v0: i16): + v1 = splat.dt0 v0 + v2 = swiden_high v1 + v3 = extract_vector v2, 0 + return v3 +} +; run: %swidenhigh_i16x8(-8) == [-8 -8 -8 -8] + +function %swidenhigh_i32x4(i32) -> i64x2 { + gv0 = dyn_scale_target_const.i32x4 + gv1 = dyn_scale_target_const.i64x2 + dt0 = i64x2*gv1 + dt1 = i32x4*gv0 + +block0(v0: i32): + v1 = splat.dt1 v0 + v2 = swiden_high v1 + v3 = extract_vector v2, 0 + return v3 +} +; run: %swidenhigh_i32x4(-4) == [-4 -4] + +function %swidenlow_i8x16(i8) -> i16x8 { + gv0 = dyn_scale_target_const.i16x8 + gv1 = dyn_scale_target_const.i8x16 + dt0 = i8x16*gv1 + dt1 = i16x8*gv0 + +block0(v0: i8): + v1 = splat.dt0 v0 + v2 = swiden_low v1 + v3 = extract_vector v2, 0 + return v3 +} +; run: %swidenhigh_i8x16(9) == [9 9 9 9 9 9 9 9] + +function %swidenlow_i16x8(i16) -> i32x4 { + gv0 = dyn_scale_target_const.i32x4 + gv1 = dyn_scale_target_const.i16x8 + dt0 = i16x8*gv1 + dt1 = i32x4*gv0 + +block0(v0: i16): + 
v1 = splat.dt0 v0
+    v2 = swiden_low v1
+    v3 = extract_vector v2, 0
+    return v3
+}
+; run: %swidenlow_i16x8(-8) == [-8 -8 -8 -8]
+
+function %swidenlow_i32x4(i32) -> i64x2 {
+    gv0 = dyn_scale_target_const.i32x4
+    gv1 = dyn_scale_target_const.i64x2
+    dt0 = i64x2*gv1
+    dt1 = i32x4*gv0
+
+block0(v0: i32):
+    v1 = splat.dt1 v0
+    v2 = swiden_low v1
+    v3 = extract_vector v2, 0
+    return v3
+}
+; run: %swidenlow_i32x4(-4) == [-4 -4]