Cranelift AArch64: Implement scalar FmaxPseudo and FminPseudo

3 years ago · 930b1f17f0
2 changed files with 44 additions and 24 deletions
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@ -2926,42 +2926,62 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        }

        Opcode::FminPseudo | Opcode::FmaxPseudo => {
-            let ty = ctx.input_ty(insn, 0);
-            if ty == F32X4 || ty == F64X2 {
+            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+            let (ra, rb) = if op == Opcode::FminPseudo {
+                (rm, rn)
+            } else {
+                (rn, rm)
+            };
+            let ty = ty.unwrap();
+            let lane_type = ty.lane_type();
+
+            debug_assert!(lane_type == F32 || lane_type == F64);
+
+            if ty.is_vector() {
+                let size = VectorSize::from_ty(ty);
+
                // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
                // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
-                let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-                let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-                // Since we're going to write the output register `r_dst` anyway, we might as
-                // well first use it to hold the comparison result.  This has the slightly unusual
+                // Since we're going to write the output register `rd` anyway, we might as well
+                // first use it to hold the comparison result.  This has the slightly unusual
                // effect that we modify the output register in the first instruction (`fcmgt`)
                // but read both the inputs again in the second instruction (`bsl`), which means
                // that the output register can't be either of the input registers.  Regalloc
                // should handle this correctly, nevertheless.
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Fcmgt,
-                    rd: r_dst,
-                    rn: if op == Opcode::FminPseudo { r_a } else { r_b },
-                    rm: if op == Opcode::FminPseudo { r_b } else { r_a },
-                    size: if ty == F32X4 {
-                        VectorSize::Size32x4
-                    } else {
-                        VectorSize::Size64x2
-                    },
+                    rd,
+                    rn: ra,
+                    rm: rb,
+                    size,
                });
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Bsl,
-                    rd: r_dst,
-                    rn: r_b,
-                    rm: r_a,
-                    size: VectorSize::Size8x16,
+                    rd,
+                    rn,
+                    rm,
+                    size,
                });
            } else {
-                return Err(CodegenError::Unsupported(format!(
-                    "{}: Unsupported type: {:?}",
-                    op, ty
-                )));
+                if lane_type == F32 {
+                    ctx.emit(Inst::FpuCmp32 { rn: ra, rm: rb });
+                    ctx.emit(Inst::FpuCSel32 {
+                        rd,
+                        rn,
+                        rm,
+                        cond: Cond::Gt,
+                    });
+                } else {
+                    ctx.emit(Inst::FpuCmp64 { rn: ra, rm: rb });
+                    ctx.emit(Inst::FpuCSel64 {
+                        rd,
+                        rn,
+                        rm,
+                        cond: Cond::Gt,
+                    });
+                }
            }
        }

--- a/cranelift/filetests/filetests/runtests/fmin-max-pseudo.clif
+++ b/cranelift/filetests/filetests/runtests/fmin-max-pseudo.clif
@ -1,6 +1,6 @@
 test run
 ; target s390x TODO: Not yet implemented on s390x
-; target aarch64 TODO: Not yet implemented on aarch64
+target aarch64
 set enable_simd
 target x86_64 machinst skylake