
Add initial f16 and f128 support to the aarch64 backend (#9076)

beetrees committed 3 months ago (committed by GitHub)
commit 3f5c21bff4 · pull/9138/head
31 changed files:

cranelift/codegen/meta/src/isa/arm64.rs | 6
cranelift/codegen/src/isa/aarch64/abi.rs | 9
cranelift/codegen/src/isa/aarch64/inst.isle | 80
cranelift/codegen/src/isa/aarch64/inst/emit.rs | 15
cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs | 37
cranelift/codegen/src/isa/aarch64/inst/imms.rs | 57
cranelift/codegen/src/isa/aarch64/inst/mod.rs | 55
cranelift/codegen/src/isa/aarch64/lower.isle | 34
cranelift/codegen/src/isa/aarch64/lower/isle.rs | 4
cranelift/codegen/src/isa/aarch64/pcc.rs | 6
cranelift/codegen/src/isa/aarch64/settings.rs | 4
cranelift/codegen/src/isle_prelude.rs | 9
cranelift/codegen/src/prelude.isle | 7
cranelift/filetests/filetests/isa/aarch64/bitcast-fp16.clif | 35
cranelift/filetests/filetests/isa/aarch64/bitcast.clif | 112
cranelift/filetests/filetests/isa/aarch64/call.clif | 61
cranelift/filetests/filetests/isa/aarch64/constants-fp16.clif | 70
cranelift/filetests/filetests/isa/aarch64/constants.clif | 152
cranelift/filetests/filetests/isa/aarch64/load-f16-f128.clif | 36
cranelift/filetests/filetests/isa/aarch64/select-fp16.clif | 21
cranelift/filetests/filetests/isa/aarch64/select.clif | 75
cranelift/filetests/filetests/isa/aarch64/store-f16-f128.clif | 36
cranelift/filetests/filetests/runtests/bitcast-f16-f128.clif | 2
cranelift/filetests/filetests/runtests/f128const.clif | 1
cranelift/filetests/filetests/runtests/f16const.clif | 2
cranelift/filetests/filetests/runtests/select-f16-f128.clif | 2
cranelift/native/src/lib.rs | 4
crates/fuzzing/src/generators/codegen_settings.rs | 1
crates/wasmtime/src/config.rs | 1
crates/wasmtime/src/engine.rs | 1
src/commands/compile.rs | 1

cranelift/codegen/meta/src/isa/arm64.rs

@@ -18,6 +18,12 @@ pub(crate) fn define() -> TargetIsa {
"",
false,
);
settings.add_bool(
"has_fp16",
"Use half-precision floating point (FEAT_FP16) instructions.",
"",
false,
);
settings.add_bool(
"sign_return_address_all",
"If function return address signing is enabled, then apply it to all \

cranelift/codegen/src/isa/aarch64/abi.rs

@@ -102,7 +102,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
fn compute_arg_locs(
call_conv: isa::CallConv,
_flags: &settings::Flags,
flags: &settings::Flags,
params: &[ir::AbiParam],
args_or_rets: ArgsOrRets,
add_ret_area_ptr: bool,
@@ -161,6 +161,13 @@ impl ABIMachineSpec for AArch64MachineDeps {
param.value_type
);
if is_apple_cc && param.value_type == types::F128 && !flags.enable_llvm_abi_extensions()
{
panic!(
"f128 args/return values not supported for apple_aarch64 unless LLVM ABI extensions are enabled"
);
}
let (rcs, reg_types) = Inst::rc_for_type(param.value_type)?;
if matches!(

cranelift/codegen/src/isa/aarch64/inst.isle

@@ -407,6 +407,18 @@
(rn Reg)
(rm Reg))
;; Floating-point load, half-precision (16 bit).
(FpuLoad16
(rd WritableReg)
(mem AMode)
(flags MemFlags))
;; Floating-point store, half-precision (16 bit).
(FpuStore16
(rd Reg)
(mem AMode)
(flags MemFlags))
;; Floating-point load, single-precision (32 bit).
(FpuLoad32
(rd WritableReg)
@@ -483,6 +495,14 @@
(rd WritableReg)
(rn Reg))
;; FP conditional select, 16 bit.
;; Requires FEAT_FP16.
(FpuCSel16
(rd WritableReg)
(rn Reg)
(rm Reg)
(cond Cond))
;; FP conditional select, 32 bit.
(FpuCSel32
(rd WritableReg)
@@ -504,8 +524,8 @@
(rn Reg))
;; Move from a GPR to a vector register. The scalar value is parked in the lowest lane
;; of the destination, and all other lanes are zeroed out. Currently only 32- and 64-bit
;; transactions are supported.
;; of the destination, and all other lanes are zeroed out. Currently 16-, 32- and 64-bit
;; transactions are supported. 16-bit moves require FEAT_FP16.
(MovToFpu
(rd WritableReg)
(rn Reg)
@@ -1701,6 +1721,9 @@
(decl use_lse () Inst)
(extern extractor use_lse use_lse)
(decl pure use_fp16 () bool)
(extern constructor use_fp16 use_fp16)
;; Extractor helpers for various immediate constants ;;;;;;;;;;;;;;;;;;;;;;;;;;
(decl pure partial move_wide_const_from_u64 (Type u64) MoveWideConst)
@@ -2221,9 +2244,19 @@
(_ Unit (emit (MInst.VecRRLong op dst src high_half))))
dst))
;; Helper for emitting `MInst.FpuCSel32` / `MInst.FpuCSel64`
;; Helper for emitting `MInst.FpuCSel16` / `MInst.FpuCSel32` / `MInst.FpuCSel64`
;; instructions.
(decl fpu_csel (Type Cond Reg Reg) ConsumesFlags)
(rule (fpu_csel $F16 cond if_true if_false)
(fpu_csel $F32 cond if_true if_false))
(rule 1 (fpu_csel $F16 cond if_true if_false)
(if-let $true (use_fp16))
(let ((dst WritableReg (temp_writable_reg $F16)))
(ConsumesFlags.ConsumesFlagsReturnsReg
(MInst.FpuCSel16 dst if_true if_false cond)
dst)))
(rule (fpu_csel $F32 cond if_true if_false)
(let ((dst WritableReg (temp_writable_reg $F32)))
(ConsumesFlags.ConsumesFlagsReturnsReg
@@ -2268,6 +2301,9 @@
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.MovToFpu dst x size))))
dst))
(rule 1 (mov_to_fpu x (ScalarSize.Size16))
(if-let $false (use_fp16))
(mov_to_fpu x (ScalarSize.Size32)))
;; Helper for emitting `MInst.FpuMoveFPImm` instructions.
(decl fpu_move_fp_imm (ASIMDFPModImm ScalarSize) Reg)
@@ -2849,6 +2885,11 @@
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.ULoad64 dst amode flags))))
dst))
(decl aarch64_fpuload16 (AMode MemFlags) Reg)
(rule (aarch64_fpuload16 amode flags)
(let ((dst WritableReg (temp_writable_reg $F64))
(_ Unit (emit (MInst.FpuLoad16 dst amode flags))))
dst))
(decl aarch64_fpuload32 (AMode MemFlags) Reg)
(rule (aarch64_fpuload32 amode flags)
(let ((dst WritableReg (temp_writable_reg $F64))
@@ -2885,6 +2926,9 @@
(decl aarch64_store64 (AMode MemFlags Reg) SideEffectNoResult)
(rule (aarch64_store64 amode flags val)
(SideEffectNoResult.Inst (MInst.Store64 val amode flags)))
(decl aarch64_fpustore16 (AMode MemFlags Reg) SideEffectNoResult)
(rule (aarch64_fpustore16 amode flags val)
(SideEffectNoResult.Inst (MInst.FpuStore16 val amode flags)))
(decl aarch64_fpustore32 (AMode MemFlags Reg) SideEffectNoResult)
(rule (aarch64_fpustore32 amode flags val)
(SideEffectNoResult.Inst (MInst.FpuStore32 val amode flags)))
@@ -3229,19 +3273,41 @@
(rule 1 (add_imm_to_addr val (imm12_from_u64 imm)) (add_imm $I64 val imm))
(rule 0 (add_imm_to_addr val offset) (add $I64 val (imm $I64 (ImmExtend.Zero) offset)))
;; Lower a constant f16.
;;
;; Note that we must make sure that all bits outside the lowest 16 are set to 0
;; because this function is also used to load wider constants (that have zeros
;; in their most significant bits).
(decl constant_f16 (u16) Reg)
(rule 3 (constant_f16 n)
(if-let $false (use_fp16))
(constant_f32 n))
(rule 2 (constant_f16 0)
(vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size32))
$false
(VectorSize.Size32x2)))
(rule 1 (constant_f16 n)
(if-let imm (asimd_fp_mod_imm_from_u64 n (ScalarSize.Size16)))
(fpu_move_fp_imm imm (ScalarSize.Size16)))
(rule (constant_f16 n)
(mov_to_fpu (imm $I16 (ImmExtend.Zero) n) (ScalarSize.Size16)))
;; Lower a constant f32.
;;
;; Note that we must make sure that all bits outside the lowest 32 are set to 0
;; because this function is also used to load wider constants (that have zeros
;; in their most significant bits).
(decl constant_f32 (u32) Reg)
(rule 2 (constant_f32 0)
(rule 3 (constant_f32 0)
(vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size32))
$false
(VectorSize.Size32x2)))
(rule 1 (constant_f32 n)
(rule 2 (constant_f32 n)
(if-let imm (asimd_fp_mod_imm_from_u64 n (ScalarSize.Size32)))
(fpu_move_fp_imm imm (ScalarSize.Size32)))
(rule 1 (constant_f32 (u32_as_u16 n))
(if-let $true (use_fp16))
(constant_f16 n))
(rule (constant_f32 n)
(mov_to_fpu (imm $I32 (ImmExtend.Zero) n) (ScalarSize.Size32)))
@@ -4063,8 +4129,10 @@
;; Helpers for generating select instruction sequences.
(decl lower_select (ProducesFlags Cond Type Value Value) ValueRegs)
(rule 2 (lower_select flags cond (ty_scalar_float ty) rn rm)
(rule 2 (lower_select flags cond (ty_scalar_float (fits_in_64 ty)) rn rm)
(with_flags flags (fpu_csel ty cond rn rm)))
(rule 4 (lower_select flags cond $F128 rn rm)
(with_flags flags (vec_csel cond rn rm)))
(rule 3 (lower_select flags cond (ty_vec128 ty) rn rm)
(with_flags flags (vec_csel cond rn rm)))
(rule (lower_select flags cond ty rn rm)

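Review note on the `constant_f16` rules above: the priorities fall back from the FEAT_FP16-specific encodings to a plain GPR-to-FPU move. Below is a self-contained Rust sketch of that decision order (illustrative only, not the generated ISLE code; `use_fp16` and `encodable_as_fp8` stand in for the ISLE `use_fp16` flag and a successful `asimd_fp_mod_imm_from_u64` match):

/// Sketch of the `constant_f16` rule order; returns the instruction shape
/// each rule would pick for the raw f16 bit pattern `bits`.
fn lower_f16_const(bits: u16, use_fp16: bool, encodable_as_fp8: bool) -> &'static str {
    if !use_fp16 {
        // Rule 3: no 16-bit `fmov` without FEAT_FP16, so reuse the f32 path;
        // the upper bits stay zero, as the zeroing note above requires.
        "constant_f32(bits as u32)"
    } else if bits == 0 {
        // Rule 2: +0.0 uses the all-zero vector move.
        "movi v<d>.2s, #0"
    } else if encodable_as_fp8 {
        // Rule 1: 8-bit FP modified immediate, e.g. 1.0 or -16.0.
        "fmov h<d>, #imm8"
    } else {
        // Base rule: materialize the bits in a GPR, then move to the FPU.
        "movz w<t>, #bits; fmov h<d>, w<t>"
    }
}

fn main() {
    // 0x5240 is 50.0 in f16 and is not FP8-encodable (cf. constants-fp16.clif).
    assert_eq!(lower_f16_const(0x5240, true, false), "movz w<t>, #bits; fmov h<d>, w<t>");
    // 0x3c00 is 1.0, which is FP8-encodable: `fmov h0, #1` in the tests below.
    assert_eq!(lower_f16_const(0x3c00, true, true), "fmov h<d>, #imm8");
}
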
cranelift/codegen/src/isa/aarch64/inst/emit.rs

@@ -958,6 +958,7 @@ impl MachInstEmit for Inst {
| &Inst::ULoad64 {
rd, ref mem, flags, ..
}
| &Inst::FpuLoad16 { rd, ref mem, flags }
| &Inst::FpuLoad32 { rd, ref mem, flags }
| &Inst::FpuLoad64 { rd, ref mem, flags }
| &Inst::FpuLoad128 { rd, ref mem, flags } => {
@@ -983,6 +984,7 @@
Inst::ULoad32 { .. } => 0b1011100001,
Inst::SLoad32 { .. } => 0b1011100010,
Inst::ULoad64 { .. } => 0b1111100001,
Inst::FpuLoad16 { .. } => 0b0111110001,
Inst::FpuLoad32 { .. } => 0b1011110001,
Inst::FpuLoad64 { .. } => 0b1111110001,
Inst::FpuLoad128 { .. } => 0b0011110011,
@@ -1098,6 +1100,7 @@ impl MachInstEmit for Inst {
| &Inst::Store16 { rd, ref mem, flags }
| &Inst::Store32 { rd, ref mem, flags }
| &Inst::Store64 { rd, ref mem, flags }
| &Inst::FpuStore16 { rd, ref mem, flags }
| &Inst::FpuStore32 { rd, ref mem, flags }
| &Inst::FpuStore64 { rd, ref mem, flags }
| &Inst::FpuStore128 { rd, ref mem, flags } => {
@@ -1114,6 +1117,7 @@
Inst::Store16 { .. } => 0b0111100000,
Inst::Store32 { .. } => 0b1011100000,
Inst::Store64 { .. } => 0b1111100000,
Inst::FpuStore16 { .. } => 0b0111110000,
Inst::FpuStore32 { .. } => 0b1011110000,
Inst::FpuStore64 { .. } => 0b1111110000,
Inst::FpuStore128 { .. } => 0b0011110010,
@@ -2213,6 +2217,9 @@ impl MachInstEmit for Inst {
};
sink.put4(enc_inttofpu(top16, rd, rn));
}
&Inst::FpuCSel16 { rd, rn, rm, cond } => {
sink.put4(enc_fcsel(rd, rn, rm, cond, ScalarSize::Size16));
}
&Inst::FpuCSel32 { rd, rn, rm, cond } => {
sink.put4(enc_fcsel(rd, rn, rm, cond, ScalarSize::Size32));
}
@@ -2234,6 +2241,7 @@
}
&Inst::MovToFpu { rd, rn, size } => {
let template = match size {
ScalarSize::Size16 => 0b000_11110_11_1_00_111_000000_00000_00000,
ScalarSize::Size32 => 0b000_11110_00_1_00_111_000000_00000_00000,
ScalarSize::Size64 => 0b100_11110_01_1_00_111_000000_00000_00000,
_ => unreachable!(),
@@ -2241,14 +2249,9 @@
sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
}
&Inst::FpuMoveFPImm { rd, imm, size } => {
let size_code = match size {
ScalarSize::Size32 => 0b00,
ScalarSize::Size64 => 0b01,
_ => unimplemented!(),
};
sink.put4(
0b000_11110_00_1_00_000_000100_00000_00000
| size_code << 22
| size.ftype() << 22
| ((imm.enc_bits() as u32) << 13)
| machreg_to_vec(rd.to_reg()),
);

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs

@@ -6699,6 +6699,19 @@ fn test_aarch64_binemit() {
"fcmp d23, d24",
));
insns.push((
Inst::FpuLoad16 {
rd: writable_vreg(16),
mem: AMode::RegScaled {
rn: xreg(8),
rm: xreg(9),
},
flags: MemFlags::trusted(),
},
"1079697C",
"ldr h16, [x8, x9, LSL #1]",
));
insns.push((
Inst::FpuLoad32 {
rd: writable_vreg(16),
@@ -6774,6 +6787,19 @@
"ldr q16, pc+8",
));
insns.push((
Inst::FpuStore16 {
rd: vreg(16),
mem: AMode::RegScaled {
rn: xreg(8),
rm: xreg(9),
},
flags: MemFlags::trusted(),
},
"1079297C",
"str h16, [x8, x9, LSL #1]",
));
insns.push((
Inst::FpuStore32 {
rd: vreg(16),
@@ -6973,6 +6999,17 @@
"stp q18, q22, [sp], #304",
));
insns.push((
Inst::FpuCSel16 {
rd: writable_vreg(1),
rn: vreg(2),
rm: vreg(3),
cond: Cond::Hi,
},
"418CE31E",
"fcsel h1, h2, h3, hi",
));
insns.push((
Inst::FpuCSel32 {
rd: writable_vreg(1),

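Review note: the hex strings in these tests are the encoded instruction bytes in little-endian order. A self-contained sketch (illustrative, not part of the test suite) connecting the two new f16 vectors above to the FpuLoad16/FpuStore16 opcode values added in emit.rs; for this register-offset form, bits 31..22 of the word carry that 10-bit opcode:

// Decode an emit-test hex string into its 32-bit instruction word;
// the bytes are listed in little-endian order.
fn word_from_test_hex(hex: &str) -> u32 {
    let bytes: Vec<u8> = (0..hex.len())
        .step_by(2)
        .map(|i| u8::from_str_radix(&hex[i..i + 2], 16).unwrap())
        .collect();
    u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])
}

fn main() {
    // "1079697C" is `ldr h16, [x8, x9, LSL #1]`.
    assert_eq!(word_from_test_hex("1079697C") >> 22, 0b0111110001);
    // "1079297C" is `str h16, [x8, x9, LSL #1]`; only the lowest opcode bit differs.
    assert_eq!(word_from_test_hex("1079297C") >> 22, 0b0111110000);
    // The Size16 `MovToFpu` template from emit.rs is the base word of
    // `fmov h0, w0` (FMOV general, half-precision), with rn = w0 and rd = h0.
    assert_eq!(0b000_11110_11_1_00_111_000000_00000_00000u32, 0x1EE7_0000);
}
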
cranelift/codegen/src/isa/aarch64/inst/imms.rs

@@ -750,7 +750,7 @@ impl ASIMDMovModImm {
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct ASIMDFPModImm {
imm: u8,
is_64bit: bool,
size: ScalarSize,
}
impl ASIMDFPModImm {
@@ -759,6 +759,21 @@ impl ASIMDFPModImm {
// In all cases immediates are encoded as an 8-bit number 0b_abcdefgh;
// let `D` be the inverse of the digit `d`.
match size {
ScalarSize::Size16 => {
// In this case the representable immediates are 16-bit numbers of the form
// 0b_aBbb_cdef_gh00_0000.
let value = value as u16;
let b0_5 = (value >> 6) & 0b111111;
let b6 = (value >> 6) & (1 << 6);
let b7 = (value >> 8) & (1 << 7);
let imm = (b0_5 | b6 | b7) as u8;
if value == Self::value16(imm) {
Some(ASIMDFPModImm { imm, size })
} else {
None
}
}
ScalarSize::Size32 => {
// In this case the representable immediates are 32-bit numbers of the form
// 0b_aBbb_bbbc_defg_h000 shifted to the left by 16.
@@ -769,10 +784,7 @@
let imm = (b0_5 | b6 | b7) as u8;
if value == Self::value32(imm) {
Some(ASIMDFPModImm {
imm,
is_64bit: false,
})
Some(ASIMDFPModImm { imm, size })
} else {
None
}
@@ -786,10 +798,7 @@
let imm = (b0_5 | b6 | b7) as u8;
if value == Self::value64(imm) {
Some(ASIMDFPModImm {
imm,
is_64bit: true,
})
Some(ASIMDFPModImm { imm, size })
} else {
None
}
@@ -803,6 +812,17 @@
self.imm
}
/// Returns the 16-bit value that corresponds to an 8-bit encoding.
fn value16(imm: u8) -> u16 {
let imm = imm as u16;
let b0_5 = imm & 0b111111;
let b6 = (imm >> 6) & 1;
let b6_inv = b6 ^ 1;
let b7 = (imm >> 7) & 1;
b0_5 << 6 | (b6 * 0b11) << 12 | b6_inv << 14 | b7 << 15
}
/// Returns the 32-bit value that corresponds to an 8-bit encoding.
fn value32(imm: u8) -> u32 {
let imm = imm as u32;
@@ -931,10 +951,21 @@ impl PrettyPrint for ASIMDMovModImm {
impl PrettyPrint for ASIMDFPModImm {
fn pretty_print(&self, _: u8) -> String {
if self.is_64bit {
format!("#{}", f64::from_bits(Self::value64(self.imm)))
} else {
format!("#{}", f32::from_bits(Self::value32(self.imm)))
match self.size {
ScalarSize::Size16 => {
// FIXME(#8312): Use `f16` once it is stable.
// `value` will always be a normal number. Convert it to a `f32`.
let value: u32 = Self::value16(self.imm).into();
let sign = (value & 0x8000) << 16;
// Adjust the exponent for the difference between the `f16` exponent bias and the
// `f32` exponent bias.
let exponent = ((value & 0x7c00) + ((127 - 15) << 10)) << 13;
let significand = (value & 0x3ff) << 13;
format!("#{}", f32::from_bits(sign | exponent | significand))
}
ScalarSize::Size32 => format!("#{}", f32::from_bits(Self::value32(self.imm))),
ScalarSize::Size64 => format!("#{}", f64::from_bits(Self::value64(self.imm))),
_ => unreachable!(),
}
}
}

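Review note: both new f16 helpers in this file are easy to check in isolation. A self-contained sketch mirroring the code above (valid for normal half-precision values only, per the comment in `pretty_print`): `value16` expands the 8-bit FP modified immediate abcdefgh into the half-precision pattern aBbbcdefgh000000 (B = NOT b), and `f16_bits_to_f32` is the bit-level widening used for printing. The asserts use the immediates that appear in the constants filetests below.

fn value16(imm: u8) -> u16 {
    let imm = imm as u16;
    let b0_5 = imm & 0b111111; // cdefgh
    let b6 = (imm >> 6) & 1; // b
    let b6_inv = b6 ^ 1; // B = NOT b
    let b7 = (imm >> 7) & 1; // a (sign)
    b0_5 << 6 | (b6 * 0b11) << 12 | b6_inv << 14 | b7 << 15
}

fn f16_bits_to_f32(value: u16) -> f32 {
    let value: u32 = value.into();
    let sign = (value & 0x8000) << 16;
    // Re-bias the 5-bit exponent (bias 15) to the 8-bit f32 exponent (bias 127).
    let exponent = ((value & 0x7c00) + ((127 - 15) << 10)) << 13;
    let significand = (value & 0x3ff) << 13;
    f32::from_bits(sign | exponent | significand)
}

fn main() {
    // imm8 0b0111_0000 expands to 0x3c00, which is 1.0 in half precision;
    // 0b1011_0000 expands to 0xcc00, i.e. -16.0 (cf. constants-fp16.clif).
    assert_eq!(value16(0b0111_0000), 0x3c00);
    assert_eq!(value16(0b1011_0000), 0xcc00);
    assert_eq!(f16_bits_to_f32(0x3c00), 1.0);
    assert_eq!(f16_bits_to_f32(0xcc00), -16.0);
}
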
cranelift/codegen/src/isa/aarch64/inst/mod.rs

@@ -1,7 +1,7 @@
//! This module defines aarch64-specific machine instruction types.
use crate::binemit::{Addend, CodeOffset, Reloc};
use crate::ir::types::{F32, F64, I128, I16, I32, I64, I8, I8X16, R32, R64};
use crate::ir::types::{F128, F16, F32, F64, I128, I16, I32, I64, I8, I8X16, R32, R64};
use crate::ir::{types, ExternalName, MemFlags, Type};
use crate::isa::{CallConv, FunctionAlignment};
use crate::machinst::*;
@@ -281,6 +281,11 @@ impl Inst {
mem,
flags,
},
F16 => Inst::FpuLoad16 {
rd: into_reg,
mem,
flags,
},
F32 => Inst::FpuLoad32 {
rd: into_reg,
mem,
@@ -292,7 +297,7 @@
flags,
},
_ => {
if ty.is_vector() {
if ty.is_vector() || ty.is_float() {
let bits = ty_bits(ty);
let rd = into_reg;
@@ -332,6 +337,11 @@
mem,
flags,
},
F16 => Inst::FpuStore16 {
rd: from_reg,
mem,
flags,
},
F32 => Inst::FpuStore32 {
rd: from_reg,
mem,
@@ -343,7 +353,7 @@
flags,
},
_ => {
if ty.is_vector() {
if ty.is_vector() || ty.is_float() {
let bits = ty_bits(ty);
let rd = from_reg;
@@ -372,6 +382,7 @@ impl Inst {
Inst::ULoad32 { .. } => Some(I32),
Inst::SLoad32 { .. } => Some(I32),
Inst::ULoad64 { .. } => Some(I64),
Inst::FpuLoad16 { .. } => Some(F16),
Inst::FpuLoad32 { .. } => Some(F32),
Inst::FpuLoad64 { .. } => Some(F64),
Inst::FpuLoad128 { .. } => Some(I8X16),
@@ -379,6 +390,7 @@
Inst::Store16 { .. } => Some(I16),
Inst::Store32 { .. } => Some(I32),
Inst::Store64 { .. } => Some(I64),
Inst::FpuStore16 { .. } => Some(F16),
Inst::FpuStore32 { .. } => Some(F32),
Inst::FpuStore64 { .. } => Some(F64),
Inst::FpuStore128 { .. } => Some(I8X16),
@@ -697,6 +709,10 @@ fn aarch64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
collector.reg_use(rn);
collector.reg_use(rm);
}
Inst::FpuLoad16 { rd, mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
Inst::FpuLoad32 { rd, mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
@@ -709,6 +725,10 @@ fn aarch64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
Inst::FpuStore16 { rd, mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
Inst::FpuStore32 { rd, mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
@@ -749,7 +769,9 @@ fn aarch64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
collector.reg_def(rd);
collector.reg_use(rn);
}
Inst::FpuCSel32 { rd, rn, rm, .. } | Inst::FpuCSel64 { rd, rn, rm, .. } => {
Inst::FpuCSel16 { rd, rn, rm, .. }
| Inst::FpuCSel32 { rd, rn, rm, .. }
| Inst::FpuCSel64 { rd, rn, rm, .. } => {
collector.reg_def(rd);
collector.reg_use(rn);
collector.reg_use(rm);
@@ -1060,6 +1082,7 @@ impl MachInst for Inst {
| &Inst::SLoad32 { .. }
| &Inst::ULoad64 { .. }
| &Inst::LoadP64 { .. }
| &Inst::FpuLoad16 { .. }
| &Inst::FpuLoad32 { .. }
| &Inst::FpuLoad64 { .. }
| &Inst::FpuLoad128 { .. }
@@ -1070,6 +1093,7 @@
| &Inst::Store32 { .. }
| &Inst::Store64 { .. }
| &Inst::StoreP64 { .. }
| &Inst::FpuStore16 { .. }
| &Inst::FpuStore32 { .. }
| &Inst::FpuStore64 { .. }
| &Inst::FpuStore128 { .. } => true,
@@ -1134,8 +1158,10 @@ impl MachInst for Inst {
I64 => Ok((&[RegClass::Int], &[I64])),
R32 => panic!("32-bit reftype pointer should never be seen on AArch64"),
R64 => Ok((&[RegClass::Int], &[R64])),
F16 => Ok((&[RegClass::Float], &[F16])),
F32 => Ok((&[RegClass::Float], &[F32])),
F64 => Ok((&[RegClass::Float], &[F64])),
F128 => Ok((&[RegClass::Float], &[F128])),
I128 => Ok((&[RegClass::Int, RegClass::Int], &[I64, I64])),
_ if ty.is_vector() => {
assert!(ty.bits() <= 128);
@@ -1809,6 +1835,13 @@ impl Inst {
let rm = pretty_print_vreg_scalar(rm, size);
format!("fcmp {rn}, {rm}")
}
&Inst::FpuLoad16 { rd, ref mem, .. } => {
let rd = pretty_print_vreg_scalar(rd.to_reg(), ScalarSize::Size16);
let mem = mem.clone();
let access_ty = self.mem_type().unwrap();
let (mem_str, mem) = mem_finalize_for_show(&mem, access_ty, state);
format!("{mem_str}ldr {rd}, {mem}")
}
&Inst::FpuLoad32 { rd, ref mem, .. } => {
let rd = pretty_print_vreg_scalar(rd.to_reg(), ScalarSize::Size32);
let mem = mem.clone();
@@ -1831,6 +1864,13 @@
let (mem_str, mem) = mem_finalize_for_show(&mem, access_ty, state);
format!("{mem_str}ldr {rd}, {mem}")
}
&Inst::FpuStore16 { rd, ref mem, .. } => {
let rd = pretty_print_vreg_scalar(rd, ScalarSize::Size16);
let mem = mem.clone();
let access_ty = self.mem_type().unwrap();
let (mem_str, mem) = mem_finalize_for_show(&mem, access_ty, state);
format!("{mem_str}str {rd}, {mem}")
}
&Inst::FpuStore32 { rd, ref mem, .. } => {
let rd = pretty_print_vreg_scalar(rd, ScalarSize::Size32);
let mem = mem.clone();
@@ -1923,6 +1963,13 @@ impl Inst {
let rn = pretty_print_ireg(rn, sizesrc);
format!("{op} {rd}, {rn}")
}
&Inst::FpuCSel16 { rd, rn, rm, cond } => {
let rd = pretty_print_vreg_scalar(rd.to_reg(), ScalarSize::Size16);
let rn = pretty_print_vreg_scalar(rn, ScalarSize::Size16);
let rm = pretty_print_vreg_scalar(rm, ScalarSize::Size16);
let cond = cond.pretty_print(0);
format!("fcsel {rd}, {rn}, {rm}, {cond}")
}
&Inst::FpuCSel32 { rd, rn, rm, cond } => {
let rd = pretty_print_vreg_scalar(rd.to_reg(), ScalarSize::Size32);
let rn = pretty_print_vreg_scalar(rn, ScalarSize::Size32);

cranelift/codegen/src/isa/aarch64/lower.isle

@@ -24,6 +24,11 @@
(rule (lower (has_type ty (null)))
(imm ty (ImmExtend.Zero) 0))
;;;; Rules for `f16const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (f16const (u16_from_ieee16 n)))
(constant_f16 n))
;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (f32const (u32_from_ieee32 n)))
@@ -34,6 +39,11 @@
(rule (lower (f64const (u64_from_ieee64 n)))
(constant_f64 n))
;;;; Rules for `f128const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F128 (f128const (u128_from_constant n))))
(constant_f128 n))
;;;; Rules for `nop` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (nop))
@@ -2329,12 +2339,18 @@
(rule (lower
(has_type $R64 (load flags address offset)))
(aarch64_uload64 (amode $I64 address offset) flags))
(rule (lower
(has_type $F16 (load flags address offset)))
(aarch64_fpuload16 (amode $F16 address offset) flags))
(rule (lower
(has_type $F32 (load flags address offset)))
(aarch64_fpuload32 (amode $F32 address offset) flags))
(rule (lower
(has_type $F64 (load flags address offset)))
(aarch64_fpuload64 (amode $F64 address offset) flags))
(rule (lower
(has_type $F128 (load flags address offset)))
(aarch64_fpuload128 (amode $F128 address offset) flags))
(rule (lower
(has_type $I128 (load flags address offset)))
(aarch64_loadp64 (pair_amode address offset) flags))
@@ -2447,6 +2463,10 @@
(side_effect
(aarch64_store32 (amode $I32 address offset) flags value)))
(rule (lower
(store flags value @ (value_type $F16) address offset))
(side_effect
(aarch64_fpustore16 (amode $F16 address offset) flags value)))
(rule (lower
(store flags value @ (value_type $F32) address offset))
(side_effect
@@ -2455,6 +2475,10 @@
(store flags value @ (value_type $F64) address offset))
(side_effect
(aarch64_fpustore64 (amode $F64 address offset) flags value)))
(rule (lower
(store flags value @ (value_type $F128) address offset))
(side_effect
(aarch64_fpustore128 (amode $F128 address offset) flags value)))
(rule (lower
(store flags value @ (value_type $I128) address offset))
@@ -2491,9 +2515,17 @@
;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; SIMD&FP <=> SIMD&FP
(rule 5 (lower (has_type (ty_float_or_vec _) (bitcast _ x @ (value_type (ty_float_or_vec _)))))
(rule 7 (lower (has_type (ty_float_or_vec _) (bitcast _ x @ (value_type (ty_float_or_vec _)))))
x)
; I128 => SIMD&FP
(rule 6 (lower (has_type (ty_float_or_vec _) (bitcast _ x @ (value_type $I128))))
(mov_to_vec (mov_to_fpu (value_regs_get x 0) (ScalarSize.Size64)) (value_regs_get x 1) 1 (VectorSize.Size64x2)))
; SIMD&FP => I128
(rule 5 (lower (has_type $I128 (bitcast _ x @ (value_type (ty_float_or_vec _)))))
(value_regs (mov_from_vec x 0 (ScalarSize.Size64)) (mov_from_vec x 1 (ScalarSize.Size64))))
; GPR => SIMD&FP
(rule 4 (lower (has_type (ty_float_or_vec _) (bitcast _ x @ (value_type in_ty))))
(if (ty_int_ref_scalar_64 in_ty))

cranelift/codegen/src/isa/aarch64/lower/isle.rs

@@ -149,6 +149,10 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
}
}
fn use_fp16(&mut self) -> bool {
self.backend.isa_flags.has_fp16()
}
fn move_wide_const_from_u64(&mut self, ty: Type, n: u64) -> Option<MoveWideConst> {
let bits = ty.bits();
let n = if bits < 64 {

cranelift/codegen/src/isa/aarch64/pcc.rs

@@ -65,7 +65,8 @@ pub(crate) fn check(
let access_ty = inst.mem_type().unwrap();
check_load(ctx, Some(rd.to_reg()), flags, mem, vcode, access_ty)
}
Inst::FpuLoad32 { ref mem, flags, .. }
Inst::FpuLoad16 { ref mem, flags, .. }
| Inst::FpuLoad32 { ref mem, flags, .. }
| Inst::FpuLoad64 { ref mem, flags, .. }
| Inst::FpuLoad128 { ref mem, flags, .. } => {
let access_ty = inst.mem_type().unwrap();
@@ -91,7 +92,8 @@ pub(crate) fn check(
let access_ty = inst.mem_type().unwrap();
check_store(ctx, Some(rd), flags, mem, vcode, access_ty)
}
Inst::FpuStore32 { ref mem, flags, .. }
Inst::FpuStore16 { ref mem, flags, .. }
| Inst::FpuStore32 { ref mem, flags, .. }
| Inst::FpuStore64 { ref mem, flags, .. }
| Inst::FpuStore128 { ref mem, flags, .. } => {
let access_ty = inst.mem_type().unwrap();

cranelift/codegen/src/isa/aarch64/settings.rs

@@ -3,7 +3,7 @@
use crate::settings::{self, detail, Builder, Value};
use core::fmt;
// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs:`. This file contains a
// Include code generated by `cranelift/codegen/meta/src/gen_settings.rs:`. This file contains a
// public `Flags` struct with an impl for all of the settings defined in
// `cranelift-codegen/meta/src/isa/arm64/settings.rs`.
// `cranelift/codegen/meta/src/isa/arm64.rs`.
include!(concat!(env!("OUT_DIR"), "/settings-arm64.rs"));

cranelift/codegen/src/isle_prelude.rs

@@ -28,6 +28,11 @@ macro_rules! isle_common_prelude_methods {
x as i16
}
#[inline]
fn u16_as_u32(&mut self, x: u16) -> u32 {
x.into()
}
#[inline]
fn u16_as_u64(&mut self, x: u16) -> u64 {
x.into()
@@ -910,6 +915,10 @@ macro_rules! isle_common_prelude_methods {
u32::try_from(val).ok()
}
fn u32_as_u16(&mut self, val: u32) -> Option<u16> {
val.try_into().ok()
}
fn u8_as_i8(&mut self, val: u8) -> i8 {
val as i8
}

cranelift/codegen/src/prelude.isle

@@ -83,6 +83,10 @@
(decl pure u16_as_i16 (u16) i16)
(extern constructor u16_as_i16 u16_as_i16)
(decl pure u16_as_u32 (u16) u32)
(extern constructor u16_as_u32 u16_as_u32)
(convert u16 u32 u16_as_u32)
(decl pure u16_as_u64 (u16) u64)
(extern constructor u16_as_u64 u16_as_u64)
(convert u16 u64 u16_as_u64)
@@ -134,6 +138,9 @@
(decl u64_as_u32 (u32) u64)
(extern extractor u64_as_u32 u64_as_u32)
(decl u32_as_u16 (u16) u32)
(extern extractor u32_as_u16 u32_as_u16)
(decl pure u64_as_i32 (u64) i32)
(extern constructor u64_as_i32 u64_as_i32)

cranelift/filetests/filetests/isa/aarch64/bitcast-fp16.clif

@@ -0,0 +1,35 @@
test compile precise-output
target aarch64 has_fp16
function %bitcast_f16_to_i16(f16) -> i16 {
block0(v0: f16):
v1 = bitcast.i16 v0
return v1
}
; VCode:
; block0:
; umov w0, v0.h[0]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; umov w0, v0.h[0]
; ret
function %bitcast_i16_to_f16(i16) -> f16 {
block0(v0: i16):
v1 = bitcast.f16 v0
return v1
}
; VCode:
; block0:
; fmov h0, w0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fmov h0, w0
; ret

cranelift/filetests/filetests/isa/aarch64/bitcast.clif

@@ -1,7 +1,39 @@
test compile precise-output
target aarch64
function %f1(f32) -> i32 {
function %bitcast_f16_to_i16(f16) -> i16 {
block0(v0: f16):
v1 = bitcast.i16 v0
return v1
}
; VCode:
; block0:
; umov w0, v0.h[0]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; umov w0, v0.h[0]
; ret
function %bitcast_i16_to_f16(i16) -> f16 {
block0(v0: i16):
v1 = bitcast.f16 v0
return v1
}
; VCode:
; block0:
; fmov s0, w0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fmov s0, w0
; ret
function %bitcast_f32_to_i32(f32) -> i32 {
block0(v0: f32):
v1 = bitcast.i32 v0
return v1
@@ -17,7 +49,7 @@ block0(v0: f32):
; mov w0, v0.s[0]
; ret
function %f2(i32) -> f32 {
function %bitcast_i32_to_f32(i32) -> f32 {
block0(v0: i32):
v1 = bitcast.f32 v0
return v1
@@ -33,7 +65,7 @@ block0(v0: i32):
; fmov s0, w0
; ret
function %f3(f64) -> i64 {
function %bitcast_f64_to_i64(f64) -> i64 {
block0(v0: f64):
v1 = bitcast.i64 v0
return v1
@@ -49,7 +81,7 @@ block0(v0: f64):
; mov x0, v0.d[0]
; ret
function %f4(i64) -> f64 {
function %bitcast_i64_to_f64(i64) -> f64 {
block0(v0: i64):
v1 = bitcast.f64 v0
return v1
@@ -65,3 +97,75 @@ block0(v0: i64):
; fmov d0, x0
; ret
function %bitcast_f128_to_i128(f128) -> i128 {
block0(v0: f128):
v1 = bitcast.i128 v0
return v1
}
; VCode:
; block0:
; mov x0, v0.d[0]
; mov x1, v0.d[1]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov x0, v0.d[0]
; mov x1, v0.d[1]
; ret
function %bitcast_i128_to_f128(i128) -> f128 {
block0(v0: i128):
v1 = bitcast.f128 v0
return v1
}
; VCode:
; block0:
; fmov d0, x0
; mov v0.d[1], v0.d[1], x1
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fmov d0, x0
; mov v0.d[1], x1
; ret
function %bitcast_i64x2_to_i128(i64x2) -> i128 {
block0(v0: i64x2):
v1 = bitcast.i128 little v0
return v1
}
; VCode:
; block0:
; mov x0, v0.d[0]
; mov x1, v0.d[1]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov x0, v0.d[0]
; mov x1, v0.d[1]
; ret
function %bitcast_i128_to_i64x2(i128) -> i64x2 {
block0(v0: i128):
v1 = bitcast.i64x2 little v0
return v1
}
; VCode:
; block0:
; fmov d0, x0
; mov v0.d[1], v0.d[1], x1
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fmov d0, x0
; mov v0.d[1], x1
; ret

cranelift/filetests/filetests/isa/aarch64/call.clif

@@ -1,6 +1,7 @@
test compile precise-output
set unwind_info=false
set enable_probestack=false
set enable_llvm_abi_extensions
target aarch64
function %f1(i64) -> i64 {
@@ -933,3 +934,63 @@ block0:
; ldp x29, x30, [sp], #0x10
; ret
function %second_f16(f16, f16) -> f16 system_v {
block0(v0: f16, v1: f16):
return v1
}
; VCode:
; block0:
; mov v0.16b, v1.16b
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v0.16b, v1.16b
; ret
function %second_f128(f128, f128) -> f128 system_v {
block0(v0: f128, v1: f128):
return v1
}
; VCode:
; block0:
; mov v0.16b, v1.16b
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v0.16b, v1.16b
; ret
function %second_f16_apple(f16, f16) -> f16 apple_aarch64 {
block0(v0: f16, v1: f16):
return v1
}
; VCode:
; block0:
; mov v0.16b, v1.16b
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v0.16b, v1.16b
; ret
function %second_f128_apple(f128, f128) -> f128 apple_aarch64 {
block0(v0: f128, v1: f128):
return v1
}
; VCode:
; block0:
; mov v0.16b, v1.16b
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v0.16b, v1.16b
; ret

cranelift/filetests/filetests/isa/aarch64/constants-fp16.clif

@@ -0,0 +1,70 @@
test compile precise-output
set unwind_info=false
target aarch64 has_fp16
function %f() -> f16 {
block0:
v0 = f16const 0x1.0
return v0
}
; VCode:
; block0:
; fmov h0, #1
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fmov h0, #1.00000000
; ret
function %f() -> f16 {
block0:
v0 = f16const 0x32.0
return v0
}
; VCode:
; block0:
; movz w0, #21056
; fmov h0, w0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov w0, #0x5240
; fmov h0, w0
; ret
function %f() -> f16 {
block0:
v0 = f16const 0x0.0
return v0
}
; VCode:
; block0:
; movi v0.2s, #0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; movi v0.2s, #0
; ret
function %f() -> f16 {
block0:
v0 = f16const -0x10.0
return v0
}
; VCode:
; block0:
; fmov h0, #-16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fmov h0, #-16.00000000
; ret

cranelift/filetests/filetests/isa/aarch64/constants.clif

@@ -316,6 +316,28 @@ block0:
; mov x0, #-9
; ret
function %f() -> f128 {
block0:
v0 = f128const 0x1.0
return v0
}
; VCode:
; block0:
; ldr q0, [const(0)]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ldr q0, #0x10
; ret
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0xff, 0x3f
function %f() -> f64 {
block0:
v0 = f64const 0x1.0
@@ -348,6 +370,46 @@ block0:
; fmov s0, #5.00000000
; ret
function %f() -> f16 {
block0:
v0 = f16const 0x1.0
return v0
}
; VCode:
; block0:
; movz w0, #15360
; fmov s0, w0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov w0, #0x3c00
; fmov s0, w0
; ret
function %f() -> f128 {
block0:
v0 = f128const 0x32.0
return v0
}
; VCode:
; block0:
; ldr q0, [const(0)]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ldr q0, #0x10
; ret
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x90, 0x04, 0x40
function %f() -> f64 {
block0:
v0 = f64const 0x32.0
@@ -384,6 +446,40 @@ block0:
; fmov s0, w0
; ret
function %f() -> f16 {
block0:
v0 = f16const 0x32.0
return v0
}
; VCode:
; block0:
; movz w0, #21056
; fmov s0, w0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov w0, #0x5240
; fmov s0, w0
; ret
function %f() -> f128 {
block0:
v0 = f128const 0x0.0
return v0
}
; VCode:
; block0:
; movi v0.16b, #0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; movi v0.16b, #0
; ret
function %f() -> f64 {
block0:
v0 = f64const 0x0.0
@@ -416,6 +512,44 @@ block0:
; movi v0.2s, #0
; ret
function %f() -> f16 {
block0:
v0 = f16const 0x0.0
return v0
}
; VCode:
; block0:
; movi v0.2s, #0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; movi v0.2s, #0
; ret
function %f() -> f128 {
block0:
v0 = f128const -0x10.0
return v0
}
; VCode:
; block0:
; ldr q0, [const(0)]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ldr q0, #0x10
; ret
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x03, 0xc0
function %f() -> f64 {
block0:
v0 = f64const -0x10.0
@@ -448,3 +582,21 @@ block0:
; fmov s0, #-16.00000000
; ret
function %f() -> f16 {
block0:
v0 = f16const -0x10.0
return v0
}
; VCode:
; block0:
; movz w0, #52224
; fmov s0, w0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov w0, #0xcc00
; fmov s0, w0
; ret

cranelift/filetests/filetests/isa/aarch64/load-f16-f128.clif

@@ -0,0 +1,36 @@
test compile precise-output
set unwind_info=false
target aarch64
function %load_f16(i64) -> f16 {
block0(v0: i64):
v1 = load.f16 v0
return v1
}
; VCode:
; block0:
; ldr h0, [x0]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ldr h0, [x0] ; trap: heap_oob
; ret
function %load_f128(i64) -> f128 {
block0(v0: i64):
v1 = load.f128 v0
return v1
}
; VCode:
; block0:
; ldr q0, [x0]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ldr q0, [x0] ; trap: heap_oob
; ret

cranelift/filetests/filetests/isa/aarch64/select-fp16.clif

@@ -0,0 +1,21 @@
test compile precise-output
target aarch64 has_fp16
function %select_f16(i8, f16, f16) -> f16 {
block0(v0: i8, v1: f16, v2: f16):
v3 = select.f16 v0, v1, v2
return v3
}
; VCode:
; block0:
; ands wzr, w0, #255
; fcsel h0, h0, h1, ne
; ret
;
; Disassembled:
; block0: ; offset 0x0
; tst w0, #0xff
; fcsel h0, h0, h1, ne
; ret

cranelift/filetests/filetests/isa/aarch64/select.clif

@@ -41,3 +41,78 @@ block0(v0: f32, v1: f32, v2: i64, v3: i64):
; csel x0, x0, x1, eq
; ret
function %select_f16(i8, f16, f16) -> f16 {
block0(v0: i8, v1: f16, v2: f16):
v3 = select.f16 v0, v1, v2
return v3
}
; VCode:
; block0:
; ands wzr, w0, #255
; fcsel s0, s0, s1, ne
; ret
;
; Disassembled:
; block0: ; offset 0x0
; tst w0, #0xff
; fcsel s0, s0, s1, ne
; ret
function %select_f32(i8, f32, f32) -> f32 {
block0(v0: i8, v1: f32, v2: f32):
v3 = select.f32 v0, v1, v2
return v3
}
; VCode:
; block0:
; ands wzr, w0, #255
; fcsel s0, s0, s1, ne
; ret
;
; Disassembled:
; block0: ; offset 0x0
; tst w0, #0xff
; fcsel s0, s0, s1, ne
; ret
function %select_f64(i8, f64, f64) -> f64 {
block0(v0: i8, v1: f64, v2: f64):
v3 = select.f64 v0, v1, v2
return v3
}
; VCode:
; block0:
; ands wzr, w0, #255
; fcsel d0, d0, d1, ne
; ret
;
; Disassembled:
; block0: ; offset 0x0
; tst w0, #0xff
; fcsel d0, d0, d1, ne
; ret
function %select_f128(i8, f128, f128) -> f128 {
block0(v0: i8, v1: f128, v2: f128):
v3 = select.f128 v0, v1, v2
return v3
}
; VCode:
; block0:
; ands wzr, w0, #255
; vcsel v0.16b, v0.16b, v1.16b, ne (if-then-else diamond)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; tst w0, #0xff
; b.ne #0x10
; mov v0.16b, v1.16b
; b #0x14
; mov v0.16b, v0.16b
; ret

cranelift/filetests/filetests/isa/aarch64/store-f16-f128.clif

@@ -0,0 +1,36 @@
test compile precise-output
set unwind_info=false
target aarch64
function %store_f16(f16, i64) {
block0(v0: f16, v1: i64):
store.f16 v0, v1
return
}
; VCode:
; block0:
; str h0, [x0]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; str h0, [x0] ; trap: heap_oob
; ret
function %store_f128(f128, i64) {
block0(v0: f128, v1: i64):
store.f128 v0, v1
return
}
; VCode:
; block0:
; str q0, [x0]
; ret
;
; Disassembled:
; block0: ; offset 0x0
; str q0, [x0] ; trap: heap_oob
; ret

cranelift/filetests/filetests/runtests/bitcast-f16-f128.clif

@@ -2,6 +2,8 @@ test interpret
test run
set enable_llvm_abi_extensions
target x86_64
target aarch64
target aarch64 has_fp16
function %bitcast_i16_f16(i16) -> f16 fast {
block0(v0: i16):

cranelift/filetests/filetests/runtests/f128const.clif

@@ -2,6 +2,7 @@ test interpret
test run
set enable_llvm_abi_extensions
target x86_64
target aarch64
;; These values are special for RISC-V since it has a dedicated

cranelift/filetests/filetests/runtests/f16const.clif

@@ -2,6 +2,8 @@ test interpret
test run
set enable_llvm_abi_extensions
target x86_64
target aarch64
target aarch64 has_fp16
;; These values are special for RISC-V since it has a dedicated

cranelift/filetests/filetests/runtests/select-f16-f128.clif

@@ -2,6 +2,8 @@ test interpret
test run
set enable_llvm_abi_extensions
target x86_64
target aarch64
target aarch64 has_fp16
function %select_icmp_i8_f16(i8, f16, f16) -> f16 {
block0(v0: i8, v1: f16, v2: f16):

cranelift/native/src/lib.rs

@@ -109,6 +109,10 @@ pub fn infer_native_flags(isa_builder: &mut dyn Configurable) -> Result<(), &'st
isa_builder.enable("has_pauth").unwrap();
}
if std::arch::is_aarch64_feature_detected!("fp16") {
isa_builder.enable("has_fp16").unwrap();
}
if cfg!(target_os = "macos") {
// Pointer authentication is always available on Apple Silicon.
isa_builder.enable("sign_return_address").unwrap();

crates/fuzzing/src/generators/codegen_settings.rs

@@ -123,6 +123,7 @@ impl<'a> Arbitrary<'a> for CodegenSettings {
std: "bti" => clif: "use_bti",
std: "lse" => clif: "has_lse",
std: "fp16" => clif: "has_fp16",
// even though the natural correspondence seems to be
// between "paca" and "has_pauth", the latter has no effect
// in isolation, so we actually use the setting that affects

crates/wasmtime/src/config.rs

@@ -2812,6 +2812,7 @@ fn detect_host_feature(feature: &str) -> Option<bool> {
return match feature {
"lse" => Some(std::arch::is_aarch64_feature_detected!("lse")),
"paca" => Some(std::arch::is_aarch64_feature_detected!("paca")),
"fp16" => Some(std::arch::is_aarch64_feature_detected!("fp16")),
_ => None,
};

crates/wasmtime/src/engine.rs

@@ -391,6 +391,7 @@ impl Engine {
// aarch64 features to detect
"has_lse" => "lse",
"has_pauth" => "paca",
"has_fp16" => "fp16",
// aarch64 features which don't need detection
// No effect on its own.

src/commands/compile.rs

@@ -202,6 +202,7 @@ mod test {
"-Dlogging=n",
"-Ccranelift-has-lse",
"-Ccranelift-has-pauth",
"-Ccranelift-has-fp16",
"-Ccranelift-sign-return-address",
"-Ccranelift-sign-return-address-all",
"-Ccranelift-sign-return-address-with-bkey",

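Review note: with the last change above, the new setting is also exercised on the command line like the existing Cranelift feature flags. An invocation along these lines (illustrative, with a made-up module name) would request FEAT_FP16 code generation when compiling for a non-native target:

wasmtime compile --target aarch64-unknown-linux-gnu -Ccranelift-has-fp16 module.wasm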