diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index c4eebe15b0..655ea8458c 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -2926,42 +2926,62 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::FminPseudo | Opcode::FmaxPseudo => { - let ty = ctx.input_ty(insn, 0); - if ty == F32X4 || ty == F64X2 { + let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let (ra, rb) = if op == Opcode::FminPseudo { + (rm, rn) + } else { + (rn, rm) + }; + let ty = ty.unwrap(); + let lane_type = ty.lane_type(); + + debug_assert!(lane_type == F32 || lane_type == F64); + + if ty.is_vector() { + let size = VectorSize::from_ty(ty); + // pmin(a,b) => bitsel(b, a, cmpgt(a, b)) // pmax(a,b) => bitsel(b, a, cmpgt(b, a)) - let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - // Since we're going to write the output register `r_dst` anyway, we might as - // well first use it to hold the comparison result. This has the slightly unusual + // Since we're going to write the output register `rd` anyway, we might as well + // first use it to hold the comparison result. This has the slightly unusual // effect that we modify the output register in the first instruction (`fcmgt`) // but read both the inputs again in the second instruction (`bsl`), which means // that the output register can't be either of the input registers. Regalloc // should handle this correctly, nevertheless. ctx.emit(Inst::VecRRR { alu_op: VecALUOp::Fcmgt, - rd: r_dst, - rn: if op == Opcode::FminPseudo { r_a } else { r_b }, - rm: if op == Opcode::FminPseudo { r_b } else { r_a }, - size: if ty == F32X4 { - VectorSize::Size32x4 - } else { - VectorSize::Size64x2 - }, + rd, + rn: ra, + rm: rb, + size, }); ctx.emit(Inst::VecRRR { alu_op: VecALUOp::Bsl, - rd: r_dst, - rn: r_b, - rm: r_a, - size: VectorSize::Size8x16, + rd, + rn, + rm, + size, }); } else { - return Err(CodegenError::Unsupported(format!( - "{}: Unsupported type: {:?}", - op, ty - ))); + if lane_type == F32 { + ctx.emit(Inst::FpuCmp32 { rn: ra, rm: rb }); + ctx.emit(Inst::FpuCSel32 { + rd, + rn, + rm, + cond: Cond::Gt, + }); + } else { + ctx.emit(Inst::FpuCmp64 { rn: ra, rm: rb }); + ctx.emit(Inst::FpuCSel64 { + rd, + rn, + rm, + cond: Cond::Gt, + }); + } } } diff --git a/cranelift/filetests/filetests/runtests/fmin-max-pseudo.clif b/cranelift/filetests/filetests/runtests/fmin-max-pseudo.clif index a1273f9063..7fd70504f1 100644 --- a/cranelift/filetests/filetests/runtests/fmin-max-pseudo.clif +++ b/cranelift/filetests/filetests/runtests/fmin-max-pseudo.clif @@ -1,6 +1,6 @@ test run ; target s390x TODO: Not yet implemented on s390x -; target aarch64 TODO: Not yet implemented on aarch64 +target aarch64 set enable_simd target x86_64 machinst skylake