
AArch64: Implement SIMD conversions

Copyright (c) 2020, Arm Limited.
pull/2155/head
Anton Kirilov, 4 years ago
commit b895ac0e40
  1. build.rs (33 lines changed)
  2. cranelift/codegen/src/isa/aarch64/inst/args.rs (9 lines changed)
  3. cranelift/codegen/src/isa/aarch64/inst/emit.rs (51 lines changed)
  4. cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs (87 lines changed)
  5. cranelift/codegen/src/isa/aarch64/inst/mod.rs (78 lines changed)
  6. cranelift/codegen/src/isa/aarch64/lower_inst.rs (368 lines changed)

build.rs (33 lines changed)

@@ -196,7 +196,6 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str
 /// Ignore tests that aren't supported yet.
 fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
-    let target = env::var("TARGET").unwrap();
     match strategy {
         #[cfg(feature = "lightbeam")]
         "Lightbeam" => match (testsuite, testname) {
@@ -207,38 +206,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             _ => (),
         },
         "Cranelift" => match (testsuite, testname) {
-            ("simd", "simd_address") => return false,
-            ("simd", "simd_align") => return false,
-            ("simd", "simd_bitwise") => return false,
-            ("simd", "simd_bit_shift") => return false,
-            ("simd", "simd_boolean") => return false,
-            ("simd", "simd_const") => return false,
-            ("simd", "simd_f32x4") => return false,
-            ("simd", "simd_f32x4_arith") => return false,
-            ("simd", "simd_f32x4_cmp") => return false,
-            ("simd", "simd_f64x2") => return false,
-            ("simd", "simd_f64x2_arith") => return false,
-            ("simd", "simd_f64x2_cmp") => return false,
-            ("simd", "simd_i8x16_arith") => return false,
-            ("simd", "simd_i8x16_arith2") => return false,
-            ("simd", "simd_i8x16_cmp") => return false,
-            ("simd", "simd_i8x16_sat_arith") => return false,
-            ("simd", "simd_i16x8_arith") => return false,
-            ("simd", "simd_i16x8_arith2") => return false,
-            ("simd", "simd_i16x8_cmp") => return false,
-            ("simd", "simd_i16x8_sat_arith") => return false,
-            ("simd", "simd_i32x4_arith") => return false,
-            ("simd", "simd_i32x4_arith2") => return false,
-            ("simd", "simd_i32x4_cmp") => return false,
-            ("simd", "simd_i64x2_arith") => return false,
-            ("simd", "simd_lane") => return false,
-            ("simd", "simd_load_extend") => return false,
-            ("simd", "simd_load_splat") => return false,
-            ("simd", "simd_store") => return false,
-            // Most simd tests are known to fail on aarch64 for now, it's going
-            // to be a big chunk of work to implement them all there!
-            ("simd", _) if target.contains("aarch64") => return true,
             // TODO(#1886): Ignore reference types tests if this isn't x64,
             // because Cranelift only supports reference types on x64.
             ("reference_types", _) => {

cranelift/codegen/src/isa/aarch64/inst/args.rs (9 lines changed)

@@ -671,6 +671,15 @@ impl VectorSize {
             VectorSize::Size64x2 => unreachable!(),
         }
     }
+
+    pub fn halve(&self) -> VectorSize {
+        match self {
+            VectorSize::Size8x16 => VectorSize::Size8x8,
+            VectorSize::Size16x8 => VectorSize::Size16x4,
+            VectorSize::Size32x4 => VectorSize::Size32x2,
+            _ => *self,
+        }
+    }
 }

 //=============================================================================

cranelift/codegen/src/isa/aarch64/inst/emit.rs (51 lines changed)

@@ -1400,6 +1400,22 @@ impl MachInstEmit for Inst {
                         debug_assert!(!size.is_128bits());
                         (0b1, 0b10011, enc_size)
                     }
+                    VecMisc2::Fcvtzs => {
+                        debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+                        (0b0, 0b11011, enc_size)
+                    }
+                    VecMisc2::Fcvtzu => {
+                        debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+                        (0b1, 0b11011, enc_size)
+                    }
+                    VecMisc2::Scvtf => {
+                        debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+                        (0b0, 0b11101, enc_size & 0b1)
+                    }
+                    VecMisc2::Ucvtf => {
+                        debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+                        (0b1, 0b11101, enc_size & 0b1)
+                    }
                 };
                 sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
             }
@@ -1644,7 +1660,12 @@ impl MachInstEmit for Inst {
                         | machreg_to_vec(rd.to_reg()),
                 );
             }
-            &Inst::VecExtend { t, rd, rn } => {
+            &Inst::VecExtend {
+                t,
+                rd,
+                rn,
+                high_half,
+            } => {
                 let (u, immh) = match t {
                     VecExtendOp::Sxtl8 => (0b0, 0b001),
                     VecExtendOp::Sxtl16 => (0b0, 0b010),
@@ -1655,22 +1676,38 @@ impl MachInstEmit for Inst {
                 };
                 sink.put4(
                     0b000_011110_0000_000_101001_00000_00000
+                        | ((high_half as u32) << 30)
                         | (u << 29)
                         | (immh << 19)
                         | (machreg_to_vec(rn) << 5)
                         | machreg_to_vec(rd.to_reg()),
                 );
             }
-            &Inst::VecMiscNarrow { op, rd, rn, size } => {
-                debug_assert!(!size.is_128bits());
-                let size = match size.widen() {
-                    VectorSize::Size64x2 => 0b10,
-                    _ => unimplemented!(),
+            &Inst::VecMiscNarrow {
+                op,
+                rd,
+                rn,
+                size,
+                high_half,
+            } => {
+                let size = match size.lane_size() {
+                    ScalarSize::Size8 => 0b00,
+                    ScalarSize::Size16 => 0b01,
+                    ScalarSize::Size32 => 0b10,
+                    _ => panic!("Unexpected vector operand lane size!"),
                 };
                 let (u, bits_12_16) = match op {
                     VecMiscNarrowOp::Xtn => (0b0, 0b10010),
+                    VecMiscNarrowOp::Sqxtn => (0b0, 0b10100),
+                    VecMiscNarrowOp::Sqxtun => (0b1, 0b10010),
                 };
-                sink.put4(enc_vec_rr_misc(u, size, bits_12_16, rd, rn));
+                sink.put4(enc_vec_rr_misc(
+                    ((high_half as u32) << 1) | u,
+                    size,
+                    bits_12_16,
+                    rd,
+                    rn,
+                ));
            }
            &Inst::VecMovElement {
                rd,
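Note: for readers without the NEON reference handy, the per-lane behaviour of the four conversions whose encodings are added above can be modelled in scalar Rust. This is purely illustrative and not part of the patch; the helper names are invented. Rust's float-to-integer `as` casts happen to have the same semantics as fcvtzs/fcvtzu: round toward zero, saturate on overflow, and map NaN to zero.

    // Scalar model of one lane of each new conversion (illustrative only).
    fn fcvtzs_lane(x: f32) -> i32 {
        x as i32 // round toward zero, saturating, NaN -> 0
    }

    fn fcvtzu_lane(x: f32) -> u32 {
        x as u32 // negative inputs and NaN -> 0, overflow -> u32::MAX
    }

    fn scvtf_lane(x: i32) -> f32 {
        x as f32 // signed integer to the nearest representable float
    }

    fn ucvtf_lane(x: u32) -> f32 {
        x as f32 // unsigned integer to the nearest representable float
    }

    fn main() {
        assert_eq!(fcvtzs_lane(f32::NAN), 0);
        assert_eq!(fcvtzs_lane(3.9), 3);
        assert_eq!(fcvtzs_lane(f32::MAX), i32::MAX);
        assert_eq!(fcvtzu_lane(-1.0), 0);
    }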

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs (87 lines changed)

@@ -2008,6 +2008,7 @@ fn test_aarch64_binemit() {
             t: VecExtendOp::Sxtl8,
             rd: writable_vreg(4),
             rn: vreg(27),
+            high_half: false,
         },
         "64A7080F",
         "sxtl v4.8h, v27.8b",
@@ -2017,15 +2018,17 @@ fn test_aarch64_binemit() {
             t: VecExtendOp::Sxtl16,
             rd: writable_vreg(17),
             rn: vreg(19),
+            high_half: true,
         },
-        "71A6100F",
-        "sxtl v17.4s, v19.4h",
+        "71A6104F",
+        "sxtl2 v17.4s, v19.8h",
     ));
     insns.push((
         Inst::VecExtend {
             t: VecExtendOp::Sxtl32,
             rd: writable_vreg(30),
             rn: vreg(6),
+            high_half: false,
         },
         "DEA4200F",
         "sxtl v30.2d, v6.2s",
@@ -2035,15 +2038,17 @@ fn test_aarch64_binemit() {
             t: VecExtendOp::Uxtl8,
             rd: writable_vreg(3),
             rn: vreg(29),
+            high_half: true,
         },
-        "A3A7082F",
-        "uxtl v3.8h, v29.8b",
+        "A3A7086F",
+        "uxtl2 v3.8h, v29.16b",
     ));
     insns.push((
         Inst::VecExtend {
             t: VecExtendOp::Uxtl16,
             rd: writable_vreg(15),
             rn: vreg(12),
+            high_half: false,
         },
         "8FA5102F",
         "uxtl v15.4s, v12.4h",
@@ -2053,9 +2058,10 @@ fn test_aarch64_binemit() {
             t: VecExtendOp::Uxtl32,
             rd: writable_vreg(28),
             rn: vreg(2),
+            high_half: true,
         },
-        "5CA4202F",
-        "uxtl v28.2d, v2.2s",
+        "5CA4206F",
+        "uxtl2 v28.2d, v2.4s",
     ));
     insns.push((
@@ -2088,11 +2094,36 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(22),
             rn: vreg(8),
             size: VectorSize::Size32x2,
+            high_half: false,
         },
         "1629A10E",
         "xtn v22.2s, v8.2d",
     ));
+    insns.push((
+        Inst::VecMiscNarrow {
+            op: VecMiscNarrowOp::Sqxtn,
+            rd: writable_vreg(31),
+            rn: vreg(0),
+            size: VectorSize::Size16x8,
+            high_half: true,
+        },
+        "1F48614E",
+        "sqxtn2 v31.8h, v0.4s",
+    ));
+    insns.push((
+        Inst::VecMiscNarrow {
+            op: VecMiscNarrowOp::Sqxtun,
+            rd: writable_vreg(16),
+            rn: vreg(23),
+            size: VectorSize::Size8x16,
+            high_half: false,
+        },
+        "F02A212E",
+        "sqxtun v16.8b, v23.8h",
+    ));
     insns.push((
         Inst::VecRRR {
             alu_op: VecALUOp::Sqadd,
@@ -3322,6 +3353,50 @@ fn test_aarch64_binemit() {
         "shll v1.2d, v10.2s, #32",
     ));
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Fcvtzs,
+            rd: writable_vreg(4),
+            rn: vreg(22),
+            size: VectorSize::Size32x4,
+        },
+        "C4BAA14E",
+        "fcvtzs v4.4s, v22.4s",
+    ));
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Fcvtzu,
+            rd: writable_vreg(29),
+            rn: vreg(15),
+            size: VectorSize::Size64x2,
+        },
+        "FDB9E16E",
+        "fcvtzu v29.2d, v15.2d",
+    ));
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Scvtf,
+            rd: writable_vreg(20),
+            rn: vreg(8),
+            size: VectorSize::Size32x4,
+        },
+        "14D9214E",
+        "scvtf v20.4s, v8.4s",
+    ));
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Ucvtf,
+            rd: writable_vreg(10),
+            rn: vreg(19),
+            size: VectorSize::Size64x2,
+        },
+        "6ADA616E",
+        "ucvtf v10.2d, v19.2d",
+    ));
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Uminv,
78
cranelift/codegen/src/isa/aarch64/inst/mod.rs

@ -308,6 +308,14 @@ pub enum VecMisc2 {
Rev64, Rev64,
/// Shift left long (by element size) /// Shift left long (by element size)
Shll, Shll,
/// Floating-point convert to signed integer, rounding toward zero
Fcvtzs,
/// Floating-point convert to unsigned integer, rounding toward zero
Fcvtzu,
/// Signed integer convert to floating-point
Scvtf,
/// Unsigned integer convert to floating-point
Ucvtf,
} }
/// A Vector narrowing operation with two registers. /// A Vector narrowing operation with two registers.
@ -315,6 +323,10 @@ pub enum VecMisc2 {
pub enum VecMiscNarrowOp { pub enum VecMiscNarrowOp {
/// Extract Narrow /// Extract Narrow
Xtn, Xtn,
/// Signed saturating extract narrow
Sqxtn,
/// Signed saturating extract unsigned narrow
Sqxtun,
} }
/// An operation across the lanes of vectors. /// An operation across the lanes of vectors.
@ -884,6 +896,7 @@ pub enum Inst {
t: VecExtendOp, t: VecExtendOp,
rd: Writable<Reg>, rd: Writable<Reg>,
rn: Reg, rn: Reg,
high_half: bool,
}, },
/// Move vector element to another vector element. /// Move vector element to another vector element.
@ -901,6 +914,7 @@ pub enum Inst {
rd: Writable<Reg>, rd: Writable<Reg>,
rn: Reg, rn: Reg,
size: VectorSize, size: VectorSize,
high_half: bool,
}, },
/// A vector ALU op. /// A vector ALU op.
@ -1628,9 +1642,16 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_mod(rd); collector.add_mod(rd);
collector.add_use(rn); collector.add_use(rn);
} }
&Inst::VecMiscNarrow { rd, rn, .. } => { &Inst::VecMiscNarrow {
collector.add_def(rd); rd, rn, high_half, ..
} => {
collector.add_use(rn); collector.add_use(rn);
if high_half {
collector.add_mod(rd);
} else {
collector.add_def(rd);
}
} }
&Inst::VecRRR { &Inst::VecRRR {
alu_op, rd, rn, rm, .. alu_op, rd, rn, rm, ..
@ -2300,10 +2321,16 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
&mut Inst::VecMiscNarrow { &mut Inst::VecMiscNarrow {
ref mut rd, ref mut rd,
ref mut rn, ref mut rn,
high_half,
.. ..
} => { } => {
map_def(mapper, rd);
map_use(mapper, rn); map_use(mapper, rn);
if high_half {
map_mod(mapper, rd);
} else {
map_def(mapper, rd);
}
} }
&mut Inst::VecRRR { &mut Inst::VecRRR {
alu_op, alu_op,
@ -3155,14 +3182,20 @@ impl Inst {
let rn = show_vreg_element(rn, mb_rru, 0, size); let rn = show_vreg_element(rn, mb_rru, 0, size);
format!("dup {}, {}", rd, rn) format!("dup {}, {}", rd, rn)
} }
&Inst::VecExtend { t, rd, rn } => { &Inst::VecExtend { t, rd, rn, high_half } => {
let (op, dest, src) = match t { let (op, dest, src) = match (t, high_half) {
VecExtendOp::Sxtl8 => ("sxtl", VectorSize::Size16x8, VectorSize::Size8x8), (VecExtendOp::Sxtl8, false) => ("sxtl", VectorSize::Size16x8, VectorSize::Size8x8),
VecExtendOp::Sxtl16 => ("sxtl", VectorSize::Size32x4, VectorSize::Size16x4), (VecExtendOp::Sxtl8, true) => ("sxtl2", VectorSize::Size16x8, VectorSize::Size8x16),
VecExtendOp::Sxtl32 => ("sxtl", VectorSize::Size64x2, VectorSize::Size32x2), (VecExtendOp::Sxtl16, false) => ("sxtl", VectorSize::Size32x4, VectorSize::Size16x4),
VecExtendOp::Uxtl8 => ("uxtl", VectorSize::Size16x8, VectorSize::Size8x8), (VecExtendOp::Sxtl16, true) => ("sxtl2", VectorSize::Size32x4, VectorSize::Size16x8),
VecExtendOp::Uxtl16 => ("uxtl", VectorSize::Size32x4, VectorSize::Size16x4), (VecExtendOp::Sxtl32, false) => ("sxtl", VectorSize::Size64x2, VectorSize::Size32x2),
VecExtendOp::Uxtl32 => ("uxtl", VectorSize::Size64x2, VectorSize::Size32x2), (VecExtendOp::Sxtl32, true) => ("sxtl2", VectorSize::Size64x2, VectorSize::Size32x4),
(VecExtendOp::Uxtl8, false) => ("uxtl", VectorSize::Size16x8, VectorSize::Size8x8),
(VecExtendOp::Uxtl8, true) => ("uxtl2", VectorSize::Size16x8, VectorSize::Size8x16),
(VecExtendOp::Uxtl16, false) => ("uxtl", VectorSize::Size32x4, VectorSize::Size16x4),
(VecExtendOp::Uxtl16, true) => ("uxtl2", VectorSize::Size32x4, VectorSize::Size16x8),
(VecExtendOp::Uxtl32, false) => ("uxtl", VectorSize::Size64x2, VectorSize::Size32x2),
(VecExtendOp::Uxtl32, true) => ("uxtl2", VectorSize::Size64x2, VectorSize::Size32x4),
}; };
let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest); let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest);
let rn = show_vreg_vector(rn, mb_rru, src); let rn = show_vreg_vector(rn, mb_rru, src);
@ -3179,11 +3212,22 @@ impl Inst {
let rn = show_vreg_element(rn, mb_rru, idx2, size); let rn = show_vreg_element(rn, mb_rru, idx2, size);
format!("mov {}, {}", rd, rn) format!("mov {}, {}", rd, rn)
} }
&Inst::VecMiscNarrow { op, rd, rn, size } => { &Inst::VecMiscNarrow { op, rd, rn, size, high_half } => {
let rd = show_vreg_vector(rd.to_reg(), mb_rru, size); let dest_size = if high_half {
assert!(size.is_128bits());
size
} else {
size.halve()
};
let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest_size);
let rn = show_vreg_vector(rn, mb_rru, size.widen()); let rn = show_vreg_vector(rn, mb_rru, size.widen());
let op = match op { let op = match (op, high_half) {
VecMiscNarrowOp::Xtn => "xtn", (VecMiscNarrowOp::Xtn, false) => "xtn",
(VecMiscNarrowOp::Xtn, true) => "xtn2",
(VecMiscNarrowOp::Sqxtn, false) => "sqxtn",
(VecMiscNarrowOp::Sqxtn, true) => "sqxtn2",
(VecMiscNarrowOp::Sqxtun, false) => "sqxtun",
(VecMiscNarrowOp::Sqxtun, true) => "sqxtun2",
}; };
format!("{} {}, {}", op, rd, rn) format!("{} {}, {}", op, rd, rn)
} }
@ -3267,6 +3311,10 @@ impl Inst {
VecMisc2::Fsqrt => ("fsqrt", size), VecMisc2::Fsqrt => ("fsqrt", size),
VecMisc2::Rev64 => ("rev64", size), VecMisc2::Rev64 => ("rev64", size),
VecMisc2::Shll => ("shll", size), VecMisc2::Shll => ("shll", size),
VecMisc2::Fcvtzs => ("fcvtzs", size),
VecMisc2::Fcvtzu => ("fcvtzu", size),
VecMisc2::Scvtf => ("scvtf", size),
VecMisc2::Ucvtf => ("ucvtf", size),
}; };
let rd_size = if is_shll { size.widen() } else { size }; let rd_size = if is_shll { size.widen() } else { size };
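Note: the doc comments on the new narrowing ops are terse, so here is a scalar model of what they do per lane, using i32 to 16-bit lanes as the example. It is illustrative only; the function names are invented for this sketch.

    // sqxtn: signed source, saturate to the signed range of the half-width lane.
    fn sqxtn_lane(x: i32) -> i16 {
        x.clamp(i16::MIN as i32, i16::MAX as i32) as i16
    }

    // sqxtun: signed source, saturate to the *unsigned* range of the half-width lane.
    fn sqxtun_lane(x: i32) -> u16 {
        x.clamp(0, u16::MAX as i32) as u16
    }

    fn main() {
        assert_eq!(sqxtn_lane(40_000), i16::MAX);
        assert_eq!(sqxtn_lane(-40_000), i16::MIN);
        assert_eq!(sqxtun_lane(-5), 0);
        assert_eq!(sqxtun_lane(70_000), u16::MAX);
    }

The "2" forms of these instructions write only the upper 64 bits of the destination and leave the lower half intact, which is why the register-usage and register-mapping code above treats rd as a modified register rather than a pure definition when high_half is set.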

cranelift/codegen/src/isa/aarch64/lower_inst.rs (368 lines changed)

@@ -7,7 +7,7 @@ use crate::ir::Inst as IRInst;
 use crate::ir::{InstructionData, Opcode, TrapCode};
 use crate::machinst::lower::*;
 use crate::machinst::*;
-use crate::CodegenResult;
+use crate::{CodegenError, CodegenResult};

 use crate::isa::aarch64::abi::*;
 use crate::isa::aarch64::inst::*;
@@ -66,7 +66,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rd = get_output_reg(ctx, outputs[0]);
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
-            if ty_bits(ty) < 128 {
+            if !ty.is_vector() {
                 let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                     ctx,
                     inputs[1],
@@ -94,7 +94,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rd = get_output_reg(ctx, outputs[0]);
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
-            if ty_bits(ty) < 128 {
+            if !ty.is_vector() {
                 let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                     ctx,
                     inputs[1],
@@ -124,7 +124,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let is_signed = op == Opcode::SaddSat || op == Opcode::SsubSat;
             let ty = ty.unwrap();
             let rd = get_output_reg(ctx, outputs[0]);
-            if ty_bits(ty) < 128 {
+            if !ty.is_vector() {
                 let narrow_mode = if is_signed {
                     NarrowValueMode::SignExtend64
                 } else {
@@ -180,7 +180,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Ineg => {
             let rd = get_output_reg(ctx, outputs[0]);
             let ty = ty.unwrap();
-            if ty_bits(ty) < 128 {
+            if !ty.is_vector() {
                 let rn = zero_reg();
                 let rm = put_input_in_rse_imm12(ctx, inputs[0], NarrowValueMode::None);
                 let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
@@ -201,7 +201,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
-            if ty_bits(ty) < 128 {
+            if !ty.is_vector() {
                 let alu_op = choose_32_64(ty, ALUOp::MAdd32, ALUOp::MAdd64);
                 ctx.emit(Inst::AluRRRR {
                     alu_op,
@@ -274,6 +274,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     rd: tmp1,
                     rn,
                     size: VectorSize::Size32x2,
+                    high_half: false,
                 });

                 // Sum the respective high half components.
@@ -293,6 +294,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     rd: tmp2,
                     rn: rm,
                     size: VectorSize::Size32x2,
+                    high_half: false,
                 });

                 // Shift the high half components, into the high half.
@@ -570,7 +572,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Bnot => {
             let rd = get_output_reg(ctx, outputs[0]);
             let ty = ty.unwrap();
-            if ty_bits(ty) < 128 {
+            if !ty.is_vector() {
                 let rm = put_input_in_rs_immlogic(ctx, inputs[0], NarrowValueMode::None);
                 let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64);
                 // NOT rd, rm ==> ORR_NOT rd, zero, rm
@@ -594,7 +596,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::BxorNot => {
             let rd = get_output_reg(ctx, outputs[0]);
             let ty = ty.unwrap();
-            if ty_bits(ty) < 128 {
+            if !ty.is_vector() {
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let rm = put_input_in_rs_immlogic(ctx, inputs[1], NarrowValueMode::None);
                 let alu_op = match op {
@@ -633,7 +635,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
             let ty = ty.unwrap();
             let rd = get_output_reg(ctx, outputs[0]);
-            if ty_bits(ty) < 128 {
+            if !ty.is_vector() {
                 let size = OperandSize::from_bits(ty_bits(ty));
                 let narrow_mode = match (op, size) {
                     (Opcode::Ishl, _) => NarrowValueMode::None,
@@ -1159,6 +1161,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     t,
                     rd,
                     rn: rd.to_reg(),
+                    high_half: false,
                 });
             }
         }
@@ -1433,7 +1436,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Bitselect | Opcode::Vselect => {
             let ty = ty.unwrap();
-            if ty_bits(ty) < 128 {
+            if !ty.is_vector() {
                 debug_assert_ne!(Opcode::Vselect, op);
                 let tmp = ctx.alloc_tmp(RegClass::I64, I64);
                 let rd = get_output_reg(ctx, outputs[0]);
@@ -1696,7 +1699,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             };
             let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);

-            if ty_bits(ty) < 128 {
+            if !ty.is_vector() {
                 let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
                 let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
                 ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
@@ -1716,7 +1719,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]);

-            if ty_bits(ty) < 128 {
+            if !ty.is_vector() {
                 match ty_bits(ty) {
                     32 => {
                         ctx.emit(Inst::FpuCmp32 { rn, rm });
@@ -2106,7 +2109,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]);
-            if bits < 128 {
+            if !ty.is_vector() {
                 let fpu_op = match (op, bits) {
                     (Opcode::Fadd, 32) => FPUOp2::Add32,
                     (Opcode::Fadd, 64) => FPUOp2::Add64,
@@ -2149,7 +2152,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let bits = ty_bits(ty);
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]);
-            if bits < 128 {
+            if !ty.is_vector() {
                 let fpu_op = match (op, bits) {
                     (Opcode::Sqrt, 32) => FPUOp1::Sqrt32,
                     (Opcode::Sqrt, 64) => FPUOp1::Sqrt64,
@@ -2414,153 +2417,186 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }

         Opcode::FcvtFromUint | Opcode::FcvtFromSint => {
-            let in_bits = ty_bits(ctx.input_ty(insn, 0));
-            let out_bits = ty_bits(ctx.output_ty(insn, 0));
+            let ty = ty.unwrap();
             let signed = op == Opcode::FcvtFromSint;
-            let op = match (signed, in_bits, out_bits) {
-                (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32,
-                (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32,
-                (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64,
-                (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64,
-                (false, 64, 32) => IntToFpuOp::U64ToF32,
-                (true, 64, 32) => IntToFpuOp::I64ToF32,
-                (false, 64, 64) => IntToFpuOp::U64ToF64,
-                (true, 64, 64) => IntToFpuOp::I64ToF64,
-                _ => panic!("Unknown input/output-bits combination"),
-            };
-            let narrow_mode = match (signed, in_bits) {
-                (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32,
-                (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32,
-                (false, 64) => NarrowValueMode::ZeroExtend64,
-                (true, 64) => NarrowValueMode::SignExtend64,
-                _ => panic!("Unknown input size"),
-            };
-            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
             let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::IntToFpu { op, rd, rn });
+
+            if ty.is_vector() {
+                let op = if signed {
+                    VecMisc2::Scvtf
+                } else {
+                    VecMisc2::Ucvtf
+                };
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+
+                ctx.emit(Inst::VecMisc {
+                    op,
+                    rd,
+                    rn,
+                    size: VectorSize::from_ty(ty),
+                });
+            } else {
+                let in_bits = ty_bits(ctx.input_ty(insn, 0));
+                let out_bits = ty_bits(ty);
+                let op = match (signed, in_bits, out_bits) {
+                    (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32,
+                    (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32,
+                    (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64,
+                    (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64,
+                    (false, 64, 32) => IntToFpuOp::U64ToF32,
+                    (true, 64, 32) => IntToFpuOp::I64ToF32,
+                    (false, 64, 64) => IntToFpuOp::U64ToF64,
+                    (true, 64, 64) => IntToFpuOp::I64ToF64,
+                    _ => panic!("Unknown input/output-bits combination"),
+                };
+                let narrow_mode = match (signed, in_bits) {
+                    (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32,
+                    (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32,
+                    (false, 64) => NarrowValueMode::ZeroExtend64,
+                    (true, 64) => NarrowValueMode::SignExtend64,
+                    _ => panic!("Unknown input size"),
+                };
+                let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+                ctx.emit(Inst::IntToFpu { op, rd, rn });
+            }
         }

         Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => {
-            let in_ty = ctx.input_ty(insn, 0);
-            let in_bits = ty_bits(in_ty);
-            let out_ty = ctx.output_ty(insn, 0);
-            let out_bits = ty_bits(out_ty);
+            let ty = ty.unwrap();
             let out_signed = op == Opcode::FcvtToSintSat;
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]);

+            if ty.is_vector() {
+                let op = if out_signed {
+                    VecMisc2::Fcvtzs
+                } else {
+                    VecMisc2::Fcvtzu
+                };
+
+                ctx.emit(Inst::VecMisc {
+                    op,
+                    rd,
+                    rn,
+                    size: VectorSize::from_ty(ty),
+                });
+            } else {
+                let in_ty = ctx.input_ty(insn, 0);
+                let in_bits = ty_bits(in_ty);
+                let out_bits = ty_bits(ty);
+
                 // FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX
                 // FMIN Vtmp2, Vin, Vtmp1
                 // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN
                 // FMAX Vtmp2, Vtmp2, Vtmp1
                 // (if signed) FIMM Vtmp1, 0
                 // FCMP Vin, Vin
                 // FCSEL Vtmp2, Vtmp1, Vtmp2, NE // on NaN, select 0
                 // convert Rout, Vtmp2

                 assert!(in_bits == 32 || in_bits == 64);
                 assert!(out_bits == 32 || out_bits == 64);

                 let min: f64 = match (out_bits, out_signed) {
                     (32, true) => std::i32::MIN as f64,
                     (32, false) => 0.0,
                     (64, true) => std::i64::MIN as f64,
                     (64, false) => 0.0,
                     _ => unreachable!(),
                 };

                 let max = match (out_bits, out_signed) {
                     (32, true) => std::i32::MAX as f64,
                     (32, false) => std::u32::MAX as f64,
                     (64, true) => std::i64::MAX as f64,
                     (64, false) => std::u64::MAX as f64,
                     _ => unreachable!(),
                 };

                 let rtmp1 = ctx.alloc_tmp(RegClass::V128, in_ty);
                 let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty);

                 if in_bits == 32 {
                     ctx.emit(Inst::LoadFpuConst32 {
                         rd: rtmp1,
                         const_data: max as f32,
                     });
                 } else {
                     ctx.emit(Inst::LoadFpuConst64 {
                         rd: rtmp1,
                         const_data: max,
                     });
                 }
                 ctx.emit(Inst::FpuRRR {
                     fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64),
                     rd: rtmp2,
                     rn: rn,
                     rm: rtmp1.to_reg(),
                 });
                 if in_bits == 32 {
                     ctx.emit(Inst::LoadFpuConst32 {
                         rd: rtmp1,
                         const_data: min as f32,
                     });
                 } else {
                     ctx.emit(Inst::LoadFpuConst64 {
                         rd: rtmp1,
                         const_data: min,
                     });
                 }
                 ctx.emit(Inst::FpuRRR {
                     fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64),
                     rd: rtmp2,
                     rn: rtmp2.to_reg(),
                     rm: rtmp1.to_reg(),
                 });
                 if out_signed {
                     if in_bits == 32 {
                         ctx.emit(Inst::LoadFpuConst32 {
                             rd: rtmp1,
                             const_data: 0.0,
                         });
                     } else {
                         ctx.emit(Inst::LoadFpuConst64 {
                             rd: rtmp1,
                             const_data: 0.0,
                         });
                     }
                 }
                 if in_bits == 32 {
                     ctx.emit(Inst::FpuCmp32 { rn: rn, rm: rn });
                     ctx.emit(Inst::FpuCSel32 {
                         rd: rtmp2,
                         rn: rtmp1.to_reg(),
                         rm: rtmp2.to_reg(),
                         cond: Cond::Ne,
                     });
                 } else {
                     ctx.emit(Inst::FpuCmp64 { rn: rn, rm: rn });
                     ctx.emit(Inst::FpuCSel64 {
                         rd: rtmp2,
                         rn: rtmp1.to_reg(),
                         rm: rtmp2.to_reg(),
                         cond: Cond::Ne,
                     });
                 }

                 let cvt = match (in_bits, out_bits, out_signed) {
                     (32, 32, false) => FpuToIntOp::F32ToU32,
                     (32, 32, true) => FpuToIntOp::F32ToI32,
                     (32, 64, false) => FpuToIntOp::F32ToU64,
                     (32, 64, true) => FpuToIntOp::F32ToI64,
                     (64, 32, false) => FpuToIntOp::F64ToU32,
                     (64, 32, true) => FpuToIntOp::F64ToI32,
                     (64, 64, false) => FpuToIntOp::F64ToU64,
                     (64, 64, true) => FpuToIntOp::F64ToI64,
                     _ => unreachable!(),
                 };
                 ctx.emit(Inst::FpuToInt {
                     op: cvt,
                     rd,
                     rn: rtmp2.to_reg(),
                 });
+            }
         }

         Opcode::IaddIfcout => {
@@ -2689,12 +2725,62 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             });
         }

-        Opcode::Snarrow
-        | Opcode::Unarrow
-        | Opcode::SwidenLow
-        | Opcode::SwidenHigh
-        | Opcode::UwidenLow
-        | Opcode::UwidenHigh => unimplemented!(),
+        Opcode::Snarrow | Opcode::Unarrow => {
+            let op = if op == Opcode::Snarrow {
+                VecMiscNarrowOp::Sqxtn
+            } else {
+                VecMiscNarrowOp::Sqxtun
+            };
+            let rd = get_output_reg(ctx, outputs[0]);
+            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+            let ty = ty.unwrap();
+
+            ctx.emit(Inst::VecMiscNarrow {
+                op,
+                rd,
+                rn,
+                size: VectorSize::from_ty(ty),
+                high_half: false,
+            });
+            ctx.emit(Inst::VecMiscNarrow {
+                op,
+                rd,
+                rn: rn2,
+                size: VectorSize::from_ty(ty),
+                high_half: true,
+            });
+        }
+
+        Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => {
+            let lane_type = ty.unwrap().lane_type();
+            let rd = get_output_reg(ctx, outputs[0]);
+            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let (t, high_half) = match (lane_type, op) {
+                (I16, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false),
+                (I16, Opcode::SwidenHigh) => (VecExtendOp::Sxtl8, true),
+                (I16, Opcode::UwidenLow) => (VecExtendOp::Uxtl8, false),
+                (I16, Opcode::UwidenHigh) => (VecExtendOp::Uxtl8, true),
+                (I32, Opcode::SwidenLow) => (VecExtendOp::Sxtl16, false),
+                (I32, Opcode::SwidenHigh) => (VecExtendOp::Sxtl16, true),
+                (I32, Opcode::UwidenLow) => (VecExtendOp::Uxtl16, false),
+                (I32, Opcode::UwidenHigh) => (VecExtendOp::Uxtl16, true),
+                _ => {
+                    return Err(CodegenError::Unsupported(format!(
+                        "Unsupported SIMD vector lane type: {:?}",
+                        lane_type
+                    )));
+                }
+            };
+
+            ctx.emit(Inst::VecExtend {
+                t,
+                rd,
+                rn,
+                high_half,
+            });
+        }

         Opcode::TlsValue => unimplemented!(),
     }
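Note: to make the new lowerings above concrete, snarrow/unarrow combine two 128-bit inputs into one result by writing the destination's low half with the plain narrowing instruction and its high half with the "2" form, and the widening opcodes select sxtl/uxtl for the low half of the input lanes or sxtl2/uxtl2 for the high half. The sketch below is a scalar model of the widening behaviour only; the helper names and fixed i8x16-to-i16x8 shapes are invented for illustration.

    // swiden_low: sign-extend the low eight lanes (what sxtl produces).
    fn swiden_low_i8x16(v: [i8; 16]) -> [i16; 8] {
        let mut out = [0i16; 8];
        for i in 0..8 {
            out[i] = v[i] as i16;
        }
        out
    }

    // uwiden_high: zero-extend the high eight lanes (what uxtl2 produces).
    fn uwiden_high_i8x16(v: [u8; 16]) -> [u16; 8] {
        let mut out = [0u16; 8];
        for i in 0..8 {
            out[i] = v[i + 8] as u16;
        }
        out
    }

    fn main() {
        assert_eq!(swiden_low_i8x16([-1; 16]), [-1i16; 8]);
        assert_eq!(uwiden_high_i8x16([200; 16]), [200u16; 8]);
    }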
