From 50b9399006ae5bf0f7ff76ede6dbbb0c83987ac3 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Tue, 29 Sep 2020 14:42:53 -0700 Subject: [PATCH] [machinst x64]: lower remaining lane operations--any_true, all_true, splat --- cranelift/codegen/src/isa/x64/inst/args.rs | 3 + cranelift/codegen/src/isa/x64/inst/emit.rs | 16 ++- cranelift/codegen/src/isa/x64/inst/mod.rs | 19 +++ cranelift/codegen/src/isa/x64/lower.rs | 132 +++++++++++++++++++++ 4 files changed, 165 insertions(+), 5 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index fc441c302b..29cf01c71a 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -459,6 +459,7 @@ pub enum SseOpcode { Psubd, Psubq, Psubw, + Ptest, Pxor, Rcpss, Roundss, @@ -606,6 +607,7 @@ impl SseOpcode { | SseOpcode::Pminuw | SseOpcode::Pminud | SseOpcode::Pmulld + | SseOpcode::Ptest | SseOpcode::Roundss | SseOpcode::Roundsd => SSE41, @@ -734,6 +736,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Psubd => "psubd", SseOpcode::Psubq => "psubq", SseOpcode::Psubw => "psubw", + SseOpcode::Ptest => "ptest", SseOpcode::Pxor => "pxor", SseOpcode::Rcpss => "rcpss", SseOpcode::Roundss => "roundss", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 71d81c2e4f..1a0568a9d6 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -2003,6 +2003,11 @@ pub(crate) fn emit( sink.bind_label(constant_end_label); } + Inst::XmmFakeDef { .. } => { + // This instruction format only exists to declare a register as a `def`; no code is + // emitted. 
+ } + Inst::Xmm_Mov_R_M { op, src, @@ -2087,19 +2092,20 @@ pub(crate) fn emit( Inst::XMM_Cmp_RM_R { op, src, dst } => { let rex = RexFlags::clear_w(); - let (prefix, opcode) = match op { - SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E), - SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E), + let (prefix, opcode, len) = match op { + SseOpcode::Ptest => (LegacyPrefixes::_66, 0x0F3817, 3), + SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E, 2), + SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E, 2), _ => unimplemented!("Emit xmm cmp rm r"), }; match src { RegMem::Reg { reg } => { - emit_std_reg_reg(sink, prefix, opcode, 2, *dst, *reg, rex); + emit_std_reg_reg(sink, prefix, opcode, len, *dst, *reg, rex); } RegMem::Mem { addr } => { let addr = &addr.finalize(state); - emit_std_reg_mem(sink, prefix, opcode, 2, *dst, addr, rex); + emit_std_reg_mem(sink, prefix, opcode, len, *dst, addr, rex); } } } diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 9026bddef7..c334956f8a 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -342,6 +342,10 @@ pub enum Inst { is64: bool, }, + /// Provides a way to tell the register allocator that the upcoming sequence of instructions + /// will overwrite `dst` so it should be considered as a `def`; use with care. + XmmFakeDef { dst: Writable }, + // ===================================== // Control flow instructions. /// Direct call: call simm32. 
@@ -640,6 +644,11 @@ impl Inst { Inst::XMM_RM_R { op, src, dst } } + pub(crate) fn xmm_fake_def(dst: Writable) -> Self { + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmFakeDef { dst } + } + pub(crate) fn xmm_mov_r_m( op: SseOpcode, src: Reg, @@ -1324,6 +1333,12 @@ impl ShowWithRRU for Inst { dst.show_rru(mb_rru), ), + Inst::XmmFakeDef { dst } => format!( + "{} {}", + ljustify("fake_def".into()), + dst.show_rru(mb_rru), + ), + Inst::XmmLoadConstSeq { val, dst, .. } => { format!("load_const ${:?}, {}", val, dst.show_rru(mb_rru),) } @@ -1754,6 +1769,7 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_mod(*dst); } } + Inst::XmmFakeDef { dst } => collector.add_def(*dst), Inst::XmmLoadConstSeq { dst, .. } => collector.add_def(*dst), Inst::XmmMinMaxSeq { lhs, rhs_dst, .. } => { collector.add_use(*lhs); @@ -2088,6 +2104,9 @@ fn x64_map_regs(inst: &mut Inst, mapper: &RUM) { src.map_uses(mapper); map_mod(mapper, dst); } + Inst::XmmFakeDef { ref mut dst, .. } => { + map_def(mapper, dst); + } Inst::XmmLoadConstSeq { ref mut dst, .. 
} => { map_def(mapper, dst); } diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 59b81327e5..249f803305 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -2945,6 +2945,138 @@ fn lower_insn_to_regs>( } } + Opcode::Splat => { + let ty = ty.unwrap(); + assert_eq!(ty.bits(), 128); + let src_ty = ctx.input_ty(insn, 0); + assert!(src_ty.bits() < 128); + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + fn emit_insert_lane>( + ctx: &mut C, + src: RegMem, + dst: Writable, + lane: u8, + ty: Type, + ) { + if !ty.is_float() { + let (sse_op, is64) = match ty.lane_bits() { + 8 => (SseOpcode::Pinsrb, false), + 16 => (SseOpcode::Pinsrw, false), + 32 => (SseOpcode::Pinsrd, false), + 64 => (SseOpcode::Pinsrd, true), + _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()), + }; + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64)); + } else if ty == types::F32 { + let sse_op = SseOpcode::Insertps; + // Insert 32-bits from replacement (at index 00, bits 7:6) to vector (lane + // shifted into bits 5:4). + let lane = 0b00_00_00_00 | lane << 4; + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false)); + } else if ty == types::F64 { + let sse_op = match lane { + // Move the lowest quadword in replacement to vector without changing + // the upper bits. + 0 => SseOpcode::Movsd, + // Move the low 64 bits of replacement vector to the high 64 bits of the + // vector. + 1 => SseOpcode::Movlhps, + _ => unreachable!(), + }; + // Here we use the `xmm_rm_r` encoding because it correctly tells the register + // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other + // encoding formats like `xmm_unary_rm_r` treat it as a `def`. + ctx.emit(Inst::xmm_rm_r(sse_op, src, dst)); + } + }; + + // We know that splat will overwrite all of the lanes of `dst` but it takes several + // instructions to do so. 
Because of the multiple instructions, there is no good way to + // declare `dst` a `def` except with the following pseudo-instruction. + ctx.emit(Inst::xmm_fake_def(dst)); + match ty.lane_bits() { + 8 => { + emit_insert_lane(ctx, src, dst, 0, ty.lane_type()); + // Initialize a register with all 0s. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + // Shuffle the lowest byte lane to all other lanes. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)) + } + 16 => { + emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type()); + emit_insert_lane(ctx, src, dst, 1, ty.lane_type()); + // Shuffle the lowest two lanes to all other lanes. + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Pshufd, + RegMem::from(dst), + dst, + 0, + false, + )) + } + 32 => { + emit_insert_lane(ctx, src, dst, 0, ty.lane_type()); + // Shuffle the lowest lane to all other lanes. + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Pshufd, + RegMem::from(dst), + dst, + 0, + false, + )) + } + 64 => { + emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type()); + emit_insert_lane(ctx, src, dst, 1, ty.lane_type()); + } + _ => panic!("Invalid type to splat: {}", ty), + } + } + + Opcode::VanyTrue => { + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = put_input_in_reg(ctx, inputs[0]); + // Set the ZF if the result is all zeroes. + ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src)); + // If the ZF is not set, place a 1 in `dst`. 
+ ctx.emit(Inst::setcc(CC::NZ, dst)); + } + + Opcode::VallTrue => { + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = input_to_reg_mem(ctx, inputs[0]); + + let eq = |ty: Type| match ty.lane_bits() { + 8 => SseOpcode::Pcmpeqb, + 16 => SseOpcode::Pcmpeqw, + 32 => SseOpcode::Pcmpeqd, + 64 => SseOpcode::Pcmpeqq, + _ => panic!("Unable to find an instruction for {} for type: {}", op, ty), + }; + + // Initialize a register with all 0s. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + // Compare each lane of `src` against zero; lanes that are zero become all 1s in `tmp`. + ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp)); + // Set the ZF if the result is all zeroes. + ctx.emit(Inst::xmm_cmp_rm_r( + SseOpcode::Ptest, + RegMem::from(tmp), + tmp.to_reg(), + )); + // If the ZF is set (no lane of `src` was zero), place a 1 in `dst`. + ctx.emit(Inst::setcc(CC::Z, dst)); + } + Opcode::IaddImm | Opcode::ImulImm | Opcode::UdivImm