
x64: implement vselect with variable blend instructions

This change implements `vselect` using SSE4.1's `BLENDVPS`, `BLENDVPD`,
and `PBLENDVB`. `vselect` is a lane-selecting instruction that is used
by
[simple_preopt.rs](fa1faf5d22/cranelift/codegen/src/simple_preopt.rs (L947-L999))
to lower `bitselect` to a single x86 instruction when the condition mask
is known to be boolean (all 1s or 0s, e.g., from a conversion). This is
better than the general `bitselect` lowering, which takes 4-5
instructions. The old backend already had this `vselect` lowering; this
change simply introduces it to the new backend.
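
As a side note (a minimal plain-Rust sketch, not part of the diff below), this is the equivalence the rewrite relies on: when every lane of the condition is all 1s or all 0s, the generic bitwise `bitselect` computation (roughly the and/andnot/or sequence) and a lane-wise select — which is what the variable blend instructions implement — produce the same result, so the multi-instruction sequence can be replaced by a single blend. The lane width and values here are arbitrary illustration.

fn bitselect(mask: [u32; 4], if_true: [u32; 4], if_false: [u32; 4]) -> [u32; 4] {
    let mut out = [0u32; 4];
    for i in 0..4 {
        // The generic lowering: roughly pand + pandn + por per lane.
        out[i] = (if_true[i] & mask[i]) | (if_false[i] & !mask[i]);
    }
    out
}

fn lane_select(mask: [u32; 4], if_true: [u32; 4], if_false: [u32; 4]) -> [u32; 4] {
    let mut out = [0u32; 4];
    for i in 0..4 {
        // A boolean lane is all 1s or all 0s, so any single bit decides the lane.
        out[i] = if mask[i] == u32::MAX { if_true[i] } else { if_false[i] };
    }
    out
}

fn main() {
    let mask = [u32::MAX, u32::MAX, 0, 0]; // like vconst.b32x4 [true true false false]
    let t = [1, 2, 0xdead, 0xbeef];
    let f = [0xdead, 0xbeef, 3, 4];
    assert_eq!(bitselect(mask, t, f), lane_select(mask, t, f));
    assert_eq!(lane_select(mask, t, f), [1, 2, 3, 4]);
}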
pull/2911/head
Andrew Brown, 4 years ago
commit 7ef3ae2903
  1. cranelift/codegen/src/isa/x64/inst/args.rs (6)
  2. cranelift/codegen/src/isa/x64/inst/emit.rs (2)
  3. cranelift/codegen/src/isa/x64/inst/emit_tests.rs (12)
  4. cranelift/codegen/src/isa/x64/inst/mod.rs (9)
  5. cranelift/codegen/src/isa/x64/lower.rs (45)
  6. cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif (10)
  7. cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif (11)

cranelift/codegen/src/isa/x64/inst/args.rs (6)

@@ -478,6 +478,7 @@ pub enum SseOpcode {
     Andnps,
     Andnpd,
     Blendvpd,
+    Blendvps,
     Comiss,
     Comisd,
     Cmpps,
@@ -547,6 +548,7 @@ pub enum SseOpcode {
     Pandn,
     Pavgb,
     Pavgw,
+    Pblendvb,
     Pcmpeqb,
     Pcmpeqw,
     Pcmpeqd,
@@ -769,8 +771,10 @@ impl SseOpcode {
             | SseOpcode::Pshufb => SSSE3,
             SseOpcode::Blendvpd
+            | SseOpcode::Blendvps
             | SseOpcode::Insertps
             | SseOpcode::Packusdw
+            | SseOpcode::Pblendvb
             | SseOpcode::Pcmpeqq
             | SseOpcode::Pextrb
             | SseOpcode::Pextrd
@@ -828,6 +832,7 @@ impl fmt::Debug for SseOpcode {
             SseOpcode::Andnps => "andnps",
             SseOpcode::Andnpd => "andnpd",
             SseOpcode::Blendvpd => "blendvpd",
+            SseOpcode::Blendvps => "blendvps",
             SseOpcode::Cmpps => "cmpps",
             SseOpcode::Cmppd => "cmppd",
             SseOpcode::Cmpss => "cmpss",
@@ -897,6 +902,7 @@ impl fmt::Debug for SseOpcode {
             SseOpcode::Pandn => "pandn",
             SseOpcode::Pavgb => "pavgb",
             SseOpcode::Pavgw => "pavgw",
+            SseOpcode::Pblendvb => "pblendvb",
             SseOpcode::Pcmpeqb => "pcmpeqb",
             SseOpcode::Pcmpeqw => "pcmpeqw",
             SseOpcode::Pcmpeqd => "pcmpeqd",

cranelift/codegen/src/isa/x64/inst/emit.rs (2)

@@ -1441,6 +1441,7 @@ pub(crate) fn emit(
             SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2),
             SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2),
             SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2),
+            SseOpcode::Blendvps => (LegacyPrefixes::_66, 0x0F3814, 3),
             SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3),
             SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2),
             SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2),
@@ -1480,6 +1481,7 @@ pub(crate) fn emit(
             SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
             SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
             SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2),
+            SseOpcode::Pblendvb => (LegacyPrefixes::_66, 0x0F3810, 3),
             SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2),
             SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2),
             SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2),

cranelift/codegen/src/isa/x64/inst/emit_tests.rs (12)

@@ -3432,6 +3432,18 @@ fn test_x64_emit() {
         "blendvpd %xmm15, %xmm4",
     ));
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Blendvps, RegMem::reg(xmm2), w_xmm3),
+        "660F3814DA",
+        "blendvps %xmm2, %xmm3",
+    ));
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Pblendvb, RegMem::reg(xmm12), w_xmm13),
+        "66450F3810EC",
+        "pblendvb %xmm12, %xmm13",
+    ));
     // ========================================================
     // XMM_RM_R: Integer Packed
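
For reference, here is my own breakdown of the expected byte strings above (not part of the commit): `66` is the operand-size prefix, `0F 38 14` / `0F 38 10` are the three-byte opcodes added in emit.rs, and the final byte is a register-register ModRM whose reg field names the destination and r/m field the source; the second test also carries a REX byte (`45`) because xmm12/xmm13 need the extended register bits. A tiny stand-alone check:

fn modrm_rr(reg: u8, rm: u8) -> u8 {
    // mod = 0b11 (register-register); only the low three bits of each register
    // number fit here, the REX byte supplies the fourth bit when needed.
    0b1100_0000 | ((reg & 7) << 3) | (rm & 7)
}

fn main() {
    // blendvps %xmm2, %xmm3 -> 66 0F 38 14 DA
    assert_eq!(modrm_rr(3, 2), 0xDA);
    // pblendvb %xmm12, %xmm13 -> 66 45 0F 38 10 EC, where REX = 0x45 extends
    // both register fields (REX.R for xmm13, REX.B for xmm12).
    assert_eq!(modrm_rr(13, 12), 0xEC);
}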

cranelift/codegen/src/isa/x64/inst/mod.rs (9)

@@ -1927,13 +1927,20 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             src.get_regs_as_uses(collector);
             collector.add_def(*dst);
         }
-        Inst::XmmRmR { src, dst, .. } => {
+        Inst::XmmRmR { src, dst, op, .. } => {
             if inst.produces_const() {
                 // No need to account for src, since src == dst.
                 collector.add_def(*dst);
             } else {
                 src.get_regs_as_uses(collector);
                 collector.add_mod(*dst);
+                // Some instructions have an implicit use of XMM0.
+                if *op == SseOpcode::Blendvpd
+                    || *op == SseOpcode::Blendvps
+                    || *op == SseOpcode::Pblendvb
+                {
+                    collector.add_use(regs::xmm0());
+                }
             }
         }
         Inst::XmmRmREvex {

cranelift/codegen/src/isa/x64/lower.rs (45)

@@ -2029,7 +2029,50 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
                 ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
             } else {
-                unimplemented!("scalar bitselect")
+                unimplemented!("no lowering for scalar bitselect instruction")
+            }
+        }
+        Opcode::Vselect => {
+            let ty = ty.unwrap();
+            let condition = put_input_in_reg(ctx, inputs[0]);
+            let condition_ty = ctx.input_ty(insn, 0);
+            let if_true = input_to_reg_mem(ctx, inputs[1]);
+            let if_false = put_input_in_reg(ctx, inputs[2]);
+            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            if ty.is_vector() {
+                // `vselect` relies on the bit representation of the condition:
+                // vector boolean types are defined in Cranelift to be all 1s or
+                // all 0s. This lowering relies on that fact to use x86's
+                // variable blend instructions, which look at the _high_bit_ of
+                // the condition mask. All the bits of vector booleans will
+                // match (all 1s or all 0s), so we can just use the high bit.
+                assert!(condition_ty.lane_type().is_bool());
+
+                // Variable blend instructions expect the condition mask to be
+                // in XMM0.
+                let xmm0 = Writable::from_reg(regs::xmm0());
+                ctx.emit(Inst::gen_move(xmm0, condition, ty));
+
+                // Match up the source and destination registers for regalloc.
+                ctx.emit(Inst::gen_move(dst, if_false, ty));
+
+                // Technically PBLENDVB would work in all cases (since the bytes
+                // inside the mask will be all 1s or 0s we can blend
+                // byte-by-byte instead of word-by-word, e.g.) but
+                // type-specialized versions are included here for clarity when
+                // troubleshooting and due to slight improvements in
+                // latency/throughput on certain processor families.
+                let opcode = match condition_ty {
+                    types::B64X2 => SseOpcode::Blendvpd,
+                    types::B32X4 => SseOpcode::Blendvps,
+                    types::B16X8 | types::B8X16 => SseOpcode::Pblendvb,
+                    _ => unimplemented!("unable lower vselect for type: {}", condition_ty),
+                };
+                ctx.emit(Inst::xmm_rm_r(opcode, if_true, dst));
+            } else {
+                unimplemented!("no lowering for scalar vselect instruction")
             }
         }
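
To illustrate the comment above about PBLENDVB being sufficient for every boolean lane width, here is a small stand-alone Rust model (assumptions and values mine, not the actual emitter): the hardware consults only the top bit of each mask byte, and a boolean lane that is all 1s or all 0s repeats that top bit in every byte it covers, so a byte-granular blend reproduces a lane-granular select.

fn pblendvb(mask: [u8; 16], if_true: [u8; 16], if_false: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        // Only the most significant bit of each mask byte is consulted.
        out[i] = if mask[i] & 0x80 != 0 { if_true[i] } else { if_false[i] };
    }
    out
}

fn main() {
    // A b32x4-style mask, [true false true false], spelled out as 16 bytes.
    let mut mask = [0u8; 16];
    mask[0..4].copy_from_slice(&[0xFF; 4]);
    mask[8..12].copy_from_slice(&[0xFF; 4]);

    let result = pblendvb(mask, [0xAA; 16], [0x55; 16]);
    // Every 4-byte lane comes entirely from one operand, i.e. a lane-wise select.
    assert_eq!(&result[0..4], &[0xAA; 4][..]);
    assert_eq!(&result[4..8], &[0x55; 4][..]);
    assert_eq!(&result[8..12], &[0xAA; 4][..]);
    assert_eq!(&result[12..16], &[0x55; 4][..]);
}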

cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif (10)

@@ -15,6 +15,16 @@ block0:
 ; nextln: por %xmm1, %xmm0
 ; not: movdqa

+function %vselect_i16x8() -> i16x8 {
+block0:
+    v0 = vconst.b16x8 [false true false true false true false true]
+    v1 = vconst.i16x8 [0 0 0 0 0 0 0 0]
+    v2 = vconst.i16x8 [0 0 0 0 0 0 0 0]
+    v3 = vselect v0, v1, v2
+    return v3
+}
+; check: pblendvb %xmm1, %xmm2
+
 ; 8x16 shifts: these lower to complex sequences of instructions

cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif (11)

@@ -10,6 +10,17 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16):
 ; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
 ; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]

+function %vselect_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v1: i32x4, v2: i32x4):
+    ; `make_trampoline` still does not know how to convert boolean vector types
+    ; so we load the value directly here.
+    v0 = vconst.b32x4 [true true false false]
+    v3 = vselect v0, v1, v2
+    return v3
+}
+; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
+; run: %vselect_i8x16([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]
+
 ; shift left
