|
|
@@ -1763,12 +1763,204 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|
|
    let is_min = op == Opcode::Fmin;
    let output_ty = ty.unwrap();
    ctx.emit(Inst::gen_move(dst, rhs, output_ty));
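    // dst starts out as a copy of rhs; both the scalar and the vector
    // sequences below then combine it with lhs in place.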
|
|
|
|
|
|
    if !output_ty.is_vector() {
|
|
|
        let op_size = match output_ty {
            types::F32 => OperandSize::Size32,
            types::F64 => OperandSize::Size64,
            _ => panic!("unexpected type {:?} for fmin/fmax", output_ty),
        };
        ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst));
    } else {
|
|
|
        // X64's implementation of floating-point min and max does not
        // propagate NaNs and +0's in a way that is friendly to the SIMD
        // spec. In the scalar approach we use jumps to handle the cases
        // where NaN and +0 propagation differs from what is needed, but
        // for packed floating-point min and max we take a different
        // approach to avoid the sequence of jumps that would otherwise be
        // required on a per-lane basis. Because we do not need to lower
        // labels and jumps, but do need ctx for creating temporaries, we
        // implement this lowering here in lower.rs instead of emit.rs as
        // is done for the scalar case.
        //
        // The outline of the approach is as follows:
        //
        // First we perform the min/max in both directions. This is
        // because when an operand's lane contains a NaN, or when the
        // lanes of the two operands contain 0s with mismatched signs,
        // x64 returns the second operand regardless of its contents. So
        // to make sure we capture NaNs and normalize both NaNs and 0
        // values, we run the operation in both directions and merge the
        // results. We then normalize the result: a compare produces a
        // mask of the lanes containing NaNs, and we use that mask to
        // convert NaNs to quiet NaNs and to normalize the 0s.
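        //
        // As a concrete illustration of the two-direction trick
        // (illustrative lane values, not emitted code): minps(-0.0, +0.0)
        // returns the second operand, +0.0, while minps(+0.0, -0.0)
        // returns -0.0; ORing the two results keeps the sign bit and
        // yields -0.0, the correct minimum of the pair.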
|
|
|
|
        //
        // The following sequence is generated for min:
        //
        // movap{s,d} %lhs, %tmp
        // minp{s,d}  %dst, %tmp
        // minp{s,d}  %lhs, %dst
        // orp{s,d}   %dst, %tmp
        // cmpp{s,d}  %tmp, %dst, $3
        // orp{s,d}   %dst, %tmp
        // psrl{d,q}  {$10, $13}, %dst
        // andnp{s,d} %tmp, %dst
        //
        // and for max the sequence is:
        //
        // movap{s,d} %lhs, %tmp
        // maxp{s,d}  %dst, %tmp
        // maxp{s,d}  %lhs, %dst
        // xorp{s,d}  %tmp, %dst
        // orp{s,d}   %dst, %tmp
        // subp{s,d}  %dst, %tmp
        // cmpp{s,d}  %tmp, %dst, $3
        // psrl{d,q}  {$10, $13}, %dst
        // andnp{s,d} %tmp, %dst
|
|
|
|
|
|
|
|
        if is_min {
            let (mov_op, min_op, or_op, cmp_op, shift_op, shift_by, andn_op) =
                match output_ty {
                    types::F32X4 => (
                        SseOpcode::Movaps,
                        SseOpcode::Minps,
                        SseOpcode::Orps,
                        SseOpcode::Cmpps,
                        SseOpcode::Psrld,
                        10,
                        SseOpcode::Andnps,
                    ),
                    types::F64X2 => (
                        SseOpcode::Movapd,
                        SseOpcode::Minpd,
                        SseOpcode::Orpd,
                        SseOpcode::Cmppd,
                        SseOpcode::Psrlq,
                        13,
                        SseOpcode::Andnpd,
                    ),
                    _ => unimplemented!("unsupported op type {:?}", output_ty),
                };
|
|
|
|
|
|
|
            // Copy lhs into tmp.
            let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, output_ty);
            ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));

            // Perform min in the reverse direction.
            ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));

            // Perform min in the original direction.
            ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));

            // X64 handles propagation of -0's and NaNs differently between the
            // left and right operands. After doing the min in both directions,
            // this OR will guarantee capture of -0's and NaNs in our tmp
            // register.
            ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));

            // Compare unordered to create a mask for the lanes containing
            // NaNs, and then use that mask to saturate the NaN-containing
            // lanes in the tmp register with 1s.
            // TODO: In the common case, would a check for NaN and then a jump
            // be better than continuing on to normalize NaNs that might not
            // exist?
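            // For example (illustrative): a lane comparing (NaN, 1.0) is
            // unordered, so cmpp{s,d} with predicate $3 sets that lane of dst
            // to all ones, and the OR below then forces the corresponding lane
            // of tmp to all ones as well.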
|
|
|
|
            let cond = FcmpImm::from(FloatCC::Unordered);
            ctx.emit(Inst::xmm_rm_r_imm(
                cmp_op,
                RegMem::reg(tmp_xmm1.to_reg()),
                dst,
                cond.encode(),
                false,
            ));
            ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
|
|
|
|
|
|
|
            // The dst register now holds a mask of the lanes containing NaNs.
            // We take that mask and shift it in preparation for creating a
            // different mask to normalize the NaNs (i.e. to create quiet NaNs)
            // by zeroing out the appropriate number of least significant bits.
            // We shift right each lane by 10 bits (1 sign + 8 exp. + 1 MSB
            // sig.) for F32X4 and by 13 bits (1 sign + 11 exp. + 1 MSB sig.)
            // for F64X2.
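            //
            // Concretely, for F32X4 (illustrative bit patterns): a NaN lane of
            // the mask is 0xFFFF_FFFF, which the shift turns into 0x003F_FFFF.
            // The ANDN below computes !dst & tmp, so a NaN lane becomes
            // 0xFFC0_0000, a quiet NaN, while non-NaN lanes (mask 0) pass the
            // merged result in tmp through unchanged.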
|
|
|
|
            ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));

            // Finally we do an ANDN (and-not) with the tmp register to produce
            // the final result in dst.
            ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
|
|
|
        } else {
            let (
                mov_op,
                max_op,
                xor_op,
                or_op,
                sub_op,
                cmp_op,
                shift_op,
                shift_by,
                andn_op,
            ) = match output_ty {
                types::F32X4 => (
                    SseOpcode::Movaps,
                    SseOpcode::Maxps,
                    SseOpcode::Xorps,
                    SseOpcode::Orps,
                    SseOpcode::Subps,
                    SseOpcode::Cmpps,
                    SseOpcode::Psrld,
                    10,
                    SseOpcode::Andnps,
                ),
                types::F64X2 => (
                    SseOpcode::Movapd,
                    SseOpcode::Maxpd,
                    SseOpcode::Xorpd,
                    SseOpcode::Orpd,
                    SseOpcode::Subpd,
                    SseOpcode::Cmppd,
                    SseOpcode::Psrlq,
                    13,
                    SseOpcode::Andnpd,
                ),
                _ => unimplemented!("unsupported op type {:?}", output_ty),
            };
|
|
|
|
|
|
|
            // Copy lhs into tmp.
            let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, output_ty);
            ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));
|
|
|
|
|
|
|
            // Perform max in the reverse direction.
            ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));

            // Perform max in the original direction.
            ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));

            // XOR the two results, leaving their bitwise difference in dst.
            // Max uses a different approach than min to account for potential
            // discrepancies with plus/minus 0.
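            //
            // For example (illustrative lane values): with lhs = -0.0 and
            // rhs = +0.0, the two maxps results are +0.0 and -0.0, so the XOR
            // leaves 0x8000_0000 in dst; the OR below then forces tmp to -0.0,
            // and the SUB (-0.0 - -0.0) produces +0.0, the correct maximum.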
|
|
|
|
            ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
|
|
|
|
|
|
|
            // X64 handles propagation of -0's and NaNs differently between the
            // left and right operands. After doing the max in both directions,
            // this OR will guarantee capture of 0's and NaNs in our tmp
            // register.
            ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));

            // Capture NaNs and sign discrepancies.
            ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));

            // Compare unordered to create a mask for the lanes containing
            // NaNs, and then use that mask to saturate the NaN-containing
            // lanes in the tmp register with 1s.
            let cond = FcmpImm::from(FloatCC::Unordered);
            ctx.emit(Inst::xmm_rm_r_imm(
                cmp_op,
                RegMem::reg(tmp_xmm1.to_reg()),
                dst,
                cond.encode(),
                false,
            ));
|
|
|
|
|
|
|
            // The dst register now holds a mask of the lanes containing NaNs.
            // We take that mask and shift it in preparation for creating a
            // different mask to normalize the NaNs (i.e. to create quiet NaNs)
            // by zeroing out the appropriate number of least significant bits.
            // We shift right each lane by 10 bits (1 sign + 8 exp. + 1 MSB
            // sig.) for F32X4 and by 13 bits (1 sign + 11 exp. + 1 MSB sig.)
            // for F64X2.
            ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));

            // Finally we do an ANDN (and-not) with the tmp register to produce
            // the final result in dst.
            ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
        }
    }
}
|
|
|
|
|
|
|
Opcode::Sqrt => {
|
|
|