
x64: Refactor float comparisons and their representations (#8362)

* x64: Refactor float comparisons and their representations

Currently the `XmmCmpRmR` instruction variant has a `dst` field and a
`src` field. The instruction doesn't actually write to `dst`, however,
and the `xmm_cmp_rm_r` constructor takes the `src` first followed by the
`dst`. This is inconsistent with most other xmm-related instructions,
where "src1" comes first and "src2", the operand which may be a memory
location, comes second. This memory-operand-second pattern also matches
the Intel manuals more closely.
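
As a concrete illustration of the new convention, here is a small
standalone sketch. The `XmmCmp`/`RegMem` types below are invented for
this example and are not Cranelift's real definitions; they only model
the operand roles: `src1` is the register operand, `src2` the reg-or-mem
operand, and the AT&T-style rendering prints `src2` first (the
instruction itself writes only flags).

    // Toy model of the operand convention (not Cranelift's API).
    enum RegMem {
        Reg(u8),      // xmm register number
        Mem(String),  // symbolic address standing in for a real amode
    }

    struct XmmCmp {
        src1: u8,     // register operand; printed last in AT&T syntax
        src2: RegMem, // reg-or-mem operand; printed first in AT&T syntax
    }

    impl XmmCmp {
        /// `src1` first, `src2` second, mirroring the Intel-manual form
        /// `ucomiss xmm1, xmm2/m32` where the first operand is a register.
        fn new(src1: u8, src2: RegMem) -> Self {
            XmmCmp { src1, src2 }
        }

        /// AT&T-style rendering: the reg-or-mem operand comes first.
        fn print(&self) -> String {
            let src2 = match &self.src2 {
                RegMem::Reg(r) => format!("%xmm{r}"),
                RegMem::Mem(m) => m.clone(),
            };
            format!("ucomiss {src2}, %xmm{}", self.src1)
        }
    }

    fn main() {
        // Testing `a < b`: `a` goes in `src1`, `b` in `src2`.
        let cmp = XmmCmp::new(2, RegMem::Reg(1));
        assert_eq!(cmp.print(), "ucomiss %xmm1, %xmm2");

        // A memory `src2` renders the same way.
        let mem = XmmCmp::new(4, RegMem::Mem("16(%rsp)".to_string()));
        assert_eq!(mem.print(), "ucomiss 16(%rsp), %xmm4");
    }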

This commit refactors the `XmmCmpRmR` instruction variant with the
following changes:

* `dst` is renamed to `src1`.
* `src` is renamed to `src2`.
* The `xmm_cmp_rm_r` helpers, and their callers, swapped their arguments
  to take `Xmm` first and `XmmMem` second.
* The `x64_ptest` helper followed suit, as it was modelled after the
  same helper.
* Callers of `x64_ucomis` swapped their arguments to preserve the
  operand order.
* The `Inst::xmm_cmp_rm_r` helper swapped its operand order, and its
  callers were updated accordingly.
* The VCode rendering of `XmmCmpRmR` swapped the order of its operands,
  which explains the changes in the rendering of filetests (the machine
  code emitted here does not change).

The changes were then propagated to Winch as well. Winch inherited the
`src`/`dst` naming, so those fields were renamed to `src1` and `src2`,
which swapped operands as well. Winch additionally had an accident in
`float_cmp_op` where values were popped in reverse order. These two
swaps cancelled each other out before this change, but to get all the
names to align correctly I've made the pop order consistent too. Sorry
there are a lot of swaps-of-swaps here; the basic idea is that the
low-level instruction constructor swapped its arguments, so to preserve
the same (correct) output today something else needed to be swapped. In
Winch's case it wasn't the immediate caller of the instruction
constructor, since that method already looked correct, but a
higher-level `float_cmp_op`, which called a helper, which called the
low-level constructor with swapped operands.
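
To make the pop-order point concrete, here is a tiny standalone sketch
(the function and the plain `Vec` stack are invented for illustration
and are not Winch's actual code): for a binary comparison the right-hand
operand sits on top of the value stack, so it must be popped first as
`src2` and the left-hand operand second as `src1`; popping in the other
order silently swaps the comparison unless something downstream swaps it
back.

    // Toy value stack for a binary float comparison `a < b`.
    fn pop_float_cmp_operands(stack: &mut Vec<f64>) -> (f64, f64) {
        let src2 = stack.pop().expect("rhs is on top of the stack");
        let src1 = stack.pop().expect("lhs sits below the rhs");
        (src1, src2)
    }

    fn main() {
        // `a` is pushed first, then `b`, so the stack is [a, b].
        let mut stack = vec![1.0, 2.0];
        let (src1, src2) = pop_float_cmp_operands(&mut stack);
        assert!(src1 < src2); // src1 == a == 1.0, src2 == b == 2.0
    }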

* Review comments
Branch: pull/8410/head
Author: Alex Crichton (committed by GitHub)
Commit: e170037750
7 changed files:
  1. cranelift/codegen/src/isa/x64/inst.isle (41 changed lines)
  2. cranelift/codegen/src/isa/x64/inst/emit.rs (24 changed lines)
  3. cranelift/codegen/src/isa/x64/inst/emit_tests.rs (8 changed lines)
  4. cranelift/codegen/src/isa/x64/inst/mod.rs (26 changed lines)
  5. cranelift/codegen/src/isa/x64/pcc.rs (16 changed lines)
  6. winch/codegen/src/codegen/context.rs (2 changed lines)
  7. winch/codegen/src/isa/x64/asm.rs (8 changed lines)

--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -502,10 +502,13 @@
     (rhs Xmm)
     (dst WritableXmm))
-    ;; Float comparisons/tests: cmp (b w l q) (reg addr imm) reg.
+    ;; Float comparisons/tests
+    ;;
+    ;; Compares `src1` against `src2` with `op`. Note that if you're testing
+    ;; `a < b` then `src1 == a` and `src2 == b`.
     (XmmCmpRmR (op SseOpcode)
-               (src XmmMemAligned)
-               (dst Xmm))
+               (src1 Xmm)
+               (src2 XmmMemAligned))
     ;; A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm
     ;; (reg addr) reg
@@ -2809,7 +2812,7 @@
   (cmp_rmi_r size (CmpOpcode.Cmp) (RegMemImm.Imm src1) src2))
 ;; Helper for creating `MInst.XmmCmpRmR` instructions.
-(decl xmm_cmp_rm_r (SseOpcode XmmMemAligned Xmm) ProducesFlags)
+(decl xmm_cmp_rm_r (SseOpcode Xmm XmmMem) ProducesFlags)
 (rule (xmm_cmp_rm_r opcode src1 src2)
   (ProducesFlags.ProducesFlagsSideEffect
     (MInst.XmmCmpRmR opcode src1 src2)))
@@ -2829,7 +2832,7 @@
   (cmp_rmi_r size (CmpOpcode.Test) src1 src2))
 ;; Helper for creating `ptest` instructions.
-(decl x64_ptest (XmmMem Xmm) ProducesFlags)
+(decl x64_ptest (Xmm XmmMem) ProducesFlags)
 (rule (x64_ptest src1 src2)
   (xmm_cmp_rm_r (SseOpcode.Ptest) src1 src2))
@@ -4879,45 +4882,45 @@
 (decl emit_fcmp (FloatCC Value Value) FcmpCondResult)
 (rule (emit_fcmp (FloatCC.Equal) a @ (value_type (ty_scalar_float _)) b)
-  (FcmpCondResult.AndCondition (x64_ucomis b a) (CC.NP) (CC.Z)))
+  (FcmpCondResult.AndCondition (x64_ucomis a b) (CC.NP) (CC.Z)))
 (rule (emit_fcmp (FloatCC.NotEqual) a @ (value_type (ty_scalar_float _)) b)
-  (FcmpCondResult.OrCondition (x64_ucomis b a) (CC.P) (CC.NZ)))
+  (FcmpCondResult.OrCondition (x64_ucomis a b) (CC.P) (CC.NZ)))
 ;; Some scalar lowerings correspond to one condition code.
 (rule (emit_fcmp (FloatCC.Ordered) a @ (value_type (ty_scalar_float ty)) b)
-  (FcmpCondResult.Condition (x64_ucomis b a) (CC.NP)))
+  (FcmpCondResult.Condition (x64_ucomis a b) (CC.NP)))
 (rule (emit_fcmp (FloatCC.Unordered) a @ (value_type (ty_scalar_float ty)) b)
-  (FcmpCondResult.Condition (x64_ucomis b a) (CC.P)))
+  (FcmpCondResult.Condition (x64_ucomis a b) (CC.P)))
 (rule (emit_fcmp (FloatCC.OrderedNotEqual) a @ (value_type (ty_scalar_float ty)) b)
-  (FcmpCondResult.Condition (x64_ucomis b a) (CC.NZ)))
+  (FcmpCondResult.Condition (x64_ucomis a b) (CC.NZ)))
 (rule (emit_fcmp (FloatCC.UnorderedOrEqual) a @ (value_type (ty_scalar_float ty)) b)
-  (FcmpCondResult.Condition (x64_ucomis b a) (CC.Z)))
+  (FcmpCondResult.Condition (x64_ucomis a b) (CC.Z)))
 (rule (emit_fcmp (FloatCC.GreaterThan) a @ (value_type (ty_scalar_float ty)) b)
-  (FcmpCondResult.Condition (x64_ucomis b a) (CC.NBE)))
+  (FcmpCondResult.Condition (x64_ucomis a b) (CC.NBE)))
 (rule (emit_fcmp (FloatCC.GreaterThanOrEqual) a @ (value_type (ty_scalar_float ty)) b)
-  (FcmpCondResult.Condition (x64_ucomis b a) (CC.NB)))
+  (FcmpCondResult.Condition (x64_ucomis a b) (CC.NB)))
 (rule (emit_fcmp (FloatCC.UnorderedOrLessThan) a @ (value_type (ty_scalar_float ty)) b)
-  (FcmpCondResult.Condition (x64_ucomis b a) (CC.B)))
+  (FcmpCondResult.Condition (x64_ucomis a b) (CC.B)))
 (rule (emit_fcmp (FloatCC.UnorderedOrLessThanOrEqual) a @ (value_type (ty_scalar_float ty)) b)
-  (FcmpCondResult.Condition (x64_ucomis b a) (CC.BE)))
+  (FcmpCondResult.Condition (x64_ucomis a b) (CC.BE)))
 ;; Other scalar lowerings are made possible by flipping the operands and
 ;; reversing the condition code.
 (rule (emit_fcmp (FloatCC.LessThan) a @ (value_type (ty_scalar_float ty)) b)
   ;; Same flags as `GreaterThan`.
-  (FcmpCondResult.Condition (x64_ucomis a b) (CC.NBE)))
+  (FcmpCondResult.Condition (x64_ucomis b a) (CC.NBE)))
 (rule (emit_fcmp (FloatCC.LessThanOrEqual) a @ (value_type (ty_scalar_float ty)) b)
   ;; Same flags as `GreaterThanOrEqual`.
-  (FcmpCondResult.Condition (x64_ucomis a b) (CC.NB)))
+  (FcmpCondResult.Condition (x64_ucomis b a) (CC.NB)))
 (rule (emit_fcmp (FloatCC.UnorderedOrGreaterThan) a @ (value_type (ty_scalar_float ty)) b)
   ;; Same flags as `UnorderedOrLessThan`.
-  (FcmpCondResult.Condition (x64_ucomis a b) (CC.B)))
+  (FcmpCondResult.Condition (x64_ucomis b a) (CC.B)))
 (rule (emit_fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a @ (value_type (ty_scalar_float ty)) b)
   ;; Same flags as `UnorderedOrLessThanOrEqual`.
-  (FcmpCondResult.Condition (x64_ucomis a b) (CC.BE)))
+  (FcmpCondResult.Condition (x64_ucomis b a) (CC.BE)))
 ;;;; Type Guards ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
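
A note on the flipped rules above: the "below"-style conditions after
`ucomis` are also true for unordered (NaN) inputs, so `FloatCC.LessThan`
cannot simply test `CC.B` on `x64_ucomis a b`; instead the operands are
flipped and the ordered "strictly above" condition (`CC.NBE`) is used,
while the plain `CC.B` form is reserved for `UnorderedOrLessThan`. A
small standalone model of the flag behaviour (invented for this sketch,
not Cranelift code) shows the difference:

    // Model of UCOMIS flag results as (zf, pf, cf): unordered sets all
    // three, greater clears all three, less sets only CF, equal only ZF.
    fn ucomis(src1: f64, src2: f64) -> (bool, bool, bool) {
        if src1.is_nan() || src2.is_nan() {
            (true, true, true)
        } else if src1 > src2 {
            (false, false, false)
        } else if src1 < src2 {
            (false, false, true)
        } else {
            (true, false, false)
        }
    }

    fn main() {
        let (a, b) = (f64::NAN, 1.0);

        // `FloatCC.LessThan` must be false for unordered inputs; the
        // flipped lowering `ucomis b, a` + NBE (CF=0 && ZF=0) is correct.
        let (zf, _pf, cf) = ucomis(b, a);
        assert!(!(!cf && !zf));

        // The unflipped `ucomis a, b` + B (CF=1) reports true here, which
        // is exactly the `UnorderedOrLessThan` semantics.
        let (_zf2, _pf2, cf2) = ucomis(a, b);
        assert!(cf2);
    }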

--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2964,7 +2964,7 @@ pub(crate) fn emit(
                 _ => unreachable!(),
             };
-            let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(lhs), dst);
+            let inst = Inst::xmm_cmp_rm_r(cmp_op, dst, RegMem::reg(lhs));
             inst.emit(&[], sink, info, state);
             one_way_jmp(sink, CC::NZ, do_min_max);
@@ -3171,9 +3171,9 @@
             }
         }
-        Inst::XmmCmpRmR { op, src, dst } => {
-            let dst = allocs.next(dst.to_reg());
-            let src = src.clone().to_reg_mem().with_allocs(allocs);
+        Inst::XmmCmpRmR { op, src1, src2 } => {
+            let src1 = allocs.next(src1.to_reg());
+            let src2 = src2.clone().to_reg_mem().with_allocs(allocs);
             let rex = RexFlags::clear_w();
             let (prefix, opcode, len) = match op {
@@ -3183,13 +3183,13 @@
                 _ => unimplemented!("Emit xmm cmp rm r"),
             };
-            match src {
+            match src2 {
                 RegMem::Reg { reg } => {
-                    emit_std_reg_reg(sink, prefix, opcode, len, dst, reg, rex);
+                    emit_std_reg_reg(sink, prefix, opcode, len, src1, reg, rex);
                 }
                 RegMem::Mem { addr } => {
                     let addr = &addr.finalize(state, sink);
-                    emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 0);
+                    emit_std_reg_mem(sink, prefix, opcode, len, src1, addr, rex, 0);
                 }
             }
         }
@@ -3461,7 +3461,7 @@
             // Check for NaN.
-            let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), src);
+            let inst = Inst::xmm_cmp_rm_r(cmp_op, src, RegMem::reg(src));
             inst.emit(&[], sink, info, state);
             if *is_saturating {
@@ -3492,7 +3492,7 @@
             );
             inst.emit(&[], sink, info, state);
-            let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm);
+            let inst = Inst::xmm_cmp_rm_r(cmp_op, tmp_xmm, RegMem::reg(src));
             inst.emit(&[], sink, info, state);
             // Jump if >= to done.
@@ -3553,7 +3553,7 @@
             );
             inst.emit(&[], sink, info, state);
-            let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm), src);
+            let inst = Inst::xmm_cmp_rm_r(cmp_op, src, RegMem::reg(tmp_xmm));
             inst.emit(&[], sink, info, state);
             // no trap if src >= or > threshold
@@ -3570,7 +3570,7 @@
             );
             inst.emit(&[], sink, info, state);
-            let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm);
+            let inst = Inst::xmm_cmp_rm_r(cmp_op, tmp_xmm, RegMem::reg(src));
             inst.emit(&[], sink, info, state);
             // no trap if 0 >= src
@@ -3668,7 +3668,7 @@
             );
             inst.emit(&[], sink, info, state);
-            let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm), src);
+            let inst = Inst::xmm_cmp_rm_r(cmp_op, src, RegMem::reg(tmp_xmm));
             inst.emit(&[], sink, info, state);
             let handle_large = sink.get_label();
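
The emitter changes above only rename which binding is passed as the
ModR/M "reg" operand, so the encodings in the tests in the next file are
unchanged. As a minimal standalone sketch of that encoding detail (not
the real emitter): for `ucomiss` (opcode `0F 2E /r`), `src1` lands in
the ModR/M `reg` field and a register `src2` in the `r/m` field.

    // Build a ModR/M byte for two register operands (mod = 0b11).
    fn modrm(reg: u8, rm: u8) -> u8 {
        0b11 << 6 | (reg & 7) << 3 | (rm & 7)
    }

    fn main() {
        // ucomiss %xmm1, %xmm2: src1 = xmm2 (reg field), src2 = xmm1 (r/m).
        let bytes = [0x0F, 0x2E, modrm(2, 1)];
        assert_eq!(bytes, [0x0F, 0x2E, 0xD1]); // matches "0F2ED1" below
    }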

--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3896,25 +3896,25 @@ fn test_x64_emit() {
     // XMM_CMP_RM_R
     insns.push((
-        Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, RegMem::reg(xmm1), xmm2),
+        Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, xmm2, RegMem::reg(xmm1)),
         "0F2ED1",
         "ucomiss %xmm1, %xmm2",
     ));
     insns.push((
-        Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, RegMem::reg(xmm0), xmm9),
+        Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, xmm9, RegMem::reg(xmm0)),
         "440F2EC8",
         "ucomiss %xmm0, %xmm9",
     ));
     insns.push((
-        Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, RegMem::reg(xmm13), xmm4),
+        Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, xmm4, RegMem::reg(xmm13)),
         "66410F2EE5",
         "ucomisd %xmm13, %xmm4",
     ));
     insns.push((
-        Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, RegMem::reg(xmm11), xmm12),
+        Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, xmm12, RegMem::reg(xmm11)),
         "66450F2EE3",
         "ucomisd %xmm11, %xmm12",
     ));

--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -396,12 +396,12 @@ impl Inst {
         }
     }
-    pub(crate) fn xmm_cmp_rm_r(op: SseOpcode, src: RegMem, dst: Reg) -> Inst {
-        src.assert_regclass_is(RegClass::Float);
-        debug_assert!(dst.class() == RegClass::Float);
-        let src = XmmMemAligned::new(src).unwrap();
-        let dst = Xmm::new(dst).unwrap();
-        Inst::XmmCmpRmR { op, src, dst }
+    pub(crate) fn xmm_cmp_rm_r(op: SseOpcode, src1: Reg, src2: RegMem) -> Inst {
+        src2.assert_regclass_is(RegClass::Float);
+        debug_assert!(src1.class() == RegClass::Float);
+        let src2 = XmmMemAligned::new(src2).unwrap();
+        let src1 = Xmm::new(src1).unwrap();
+        Inst::XmmCmpRmR { op, src1, src2 }
     }
     #[allow(dead_code)]
@@ -1316,11 +1316,11 @@ impl PrettyPrint for Inst {
                 format!("{op} {src}, {dst}")
             }
-            Inst::XmmCmpRmR { op, src, dst } => {
-                let dst = pretty_print_reg(dst.to_reg(), 8, allocs);
-                let src = src.pretty_print(8, allocs);
+            Inst::XmmCmpRmR { op, src1, src2 } => {
+                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
+                let src2 = src2.pretty_print(8, allocs);
                 let op = ljustify(op.to_string());
-                format!("{op} {src}, {dst}")
+                format!("{op} {src2}, {src1}")
             }
             Inst::CvtIntToFloat {
@@ -2155,9 +2155,9 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             collector.reg_use(src.to_reg());
             dst.get_operands(collector);
         }
-        Inst::XmmCmpRmR { src, dst, .. } => {
-            collector.reg_use(dst.to_reg());
-            src.get_operands(collector);
+        Inst::XmmCmpRmR { src1, src2, .. } => {
+            collector.reg_use(src1.to_reg());
+            src2.get_operands(collector);
         }
         Inst::Imm { dst, .. } => {
             collector.reg_def(dst.to_writable_reg());

--- a/cranelift/codegen/src/isa/x64/pcc.rs
+++ b/cranelift/codegen/src/isa/x64/pcc.rs
@@ -768,13 +768,17 @@ pub(crate) fn check(
         Inst::XmmMinMaxSeq { dst, .. } => ensure_no_fact(vcode, dst.to_writable_reg().to_reg()),
-        Inst::XmmCmpRmR { ref src, .. } => match <&RegMem>::from(src) {
-            RegMem::Mem { ref addr } => {
-                check_load(ctx, None, addr, vcode, I8X16, 128)?;
-                Ok(())
-            }
-            RegMem::Reg { .. } => Ok(()),
-        },
+        Inst::XmmCmpRmR {
+            ref src1, ref src2, ..
+        } => {
+            match <&RegMem>::from(src2) {
+                RegMem::Mem { ref addr } => {
+                    check_load(ctx, None, addr, vcode, I8X16, 128)?;
+                }
+                RegMem::Reg { .. } => {}
+            }
+            ensure_no_fact(vcode, src1.to_reg())
+        }
         Inst::XmmRmRImm {
             dst,

--- a/winch/codegen/src/codegen/context.rs
+++ b/winch/codegen/src/codegen/context.rs
@@ -253,8 +253,8 @@ impl<'a> CodeGenContext<'a> {
         F: FnMut(&mut M, Reg, Reg, Reg, OperandSize),
         M: MacroAssembler,
     {
-        let src1 = self.pop_to_reg(masm, None);
         let src2 = self.pop_to_reg(masm, None);
+        let src1 = self.pop_to_reg(masm, None);
         let dst = self.any_gpr(masm);
         emit(masm, dst, src1.reg, src2.reg, size);
         self.free_reg(src1);

--- a/winch/codegen/src/isa/x64/asm.rs
+++ b/winch/codegen/src/isa/x64/asm.rs
@@ -995,9 +995,9 @@ impl Assembler {
         });
     }
-    /// Compares values in src and dst and sets ZF, PF, and CF flags in EFLAGS
+    /// Compares values in src1 and src2 and sets ZF, PF, and CF flags in EFLAGS
     /// register.
-    pub fn ucomis(&mut self, src: Reg, dst: Reg, size: OperandSize) {
+    pub fn ucomis(&mut self, src1: Reg, src2: Reg, size: OperandSize) {
         let op = match size {
             OperandSize::S32 => SseOpcode::Ucomiss,
             OperandSize::S64 => SseOpcode::Ucomisd,
@@ -1006,8 +1006,8 @@
         self.emit(Inst::XmmCmpRmR {
             op,
-            src: Xmm::from(src).into(),
-            dst: dst.into(),
+            src1: src1.into(),
+            src2: Xmm::from(src2).into(),
         });
     }
