Browse Source

x64: add support for packed promote and demote (#2783)

* Add support for x64 packed promote low

* Add support for x64 packed floating point demote

* Update vector promote low and demote by adding constraints

Also does some renaming and minor refactoring
pull/2971/head
Johnnie Birch 3 years ago
committed by GitHub
parent
commit
1770880e19
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 63
      cranelift/codegen/meta/src/shared/instructions.rs
  2. 2
      cranelift/codegen/src/isa/aarch64/lower_inst.rs
  3. 4
      cranelift/codegen/src/isa/s390x/lower.rs
  4. 6
      cranelift/codegen/src/isa/x64/inst/args.rs
  5. 2
      cranelift/codegen/src/isa/x64/inst/emit.rs
  6. 12
      cranelift/codegen/src/isa/x64/inst/emit_tests.rs
  7. 20
      cranelift/codegen/src/isa/x64/lower.rs
  8. 2
      cranelift/interpreter/src/step.rs
  9. 10
      cranelift/wasm/src/code_translator.rs

63
cranelift/codegen/meta/src/shared/instructions.rs

@ -4223,6 +4223,69 @@ pub(crate) fn define(
.constraints(vec![WiderOrEq(Float.clone(), FloatTo.clone())]),
);
let F64x2 = &TypeVar::new(
"F64x2",
"A SIMD vector type consisting of 2 lanes of 64-bit floats",
TypeSetBuilder::new()
.floats(64..64)
.simd_lanes(2..2)
.includes_scalars(false)
.build(),
);
let F32x4 = &TypeVar::new(
"F32x4",
"A SIMD vector type consisting of 4 lanes of 32-bit floats",
TypeSetBuilder::new()
.floats(32..32)
.simd_lanes(4..4)
.includes_scalars(false)
.build(),
);
let x = &Operand::new("x", F64x2);
let a = &Operand::new("a", F32x4);
ig.push(
Inst::new(
"fvdemote",
r#"
Convert `x` to a smaller floating point format.
Each lane in `x` is converted to the destination floating point format
by rounding to nearest, ties to even.
Cranelift currently only supports two floating point formats
- `f32` and `f64`. This may change in the future.
Fvdemote differs from fdemote in that it operates on vectors rather than scalars.
Fvdemote is constrained to an input type of F64x2 and a result type of F32x4.
The two low result lanes hold the converted values; the two high result lanes,
which have no corresponding input lane, are set to zero.
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"fvpromote_low",
r#"
Converts packed single precision floating point to packed double precision floating point.
Considering only the lower half of the register, the low lanes in `x` are interpreted as
single precision floats that are then converted to double precision floats.
The result type has half as many vector lanes as the input. Fvpromote_low is
constrained to input F32x4 with a result type of F64x2.
"#,
&formats.unary,
)
.operands_in(vec![a])
.operands_out(vec![x]),
);
let x = &Operand::new("x", Float);
let a = &Operand::new("a", IntTo);

2
cranelift/codegen/src/isa/aarch64/lower_inst.rs

@ -3193,6 +3193,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::TlsValue => unimplemented!("tls_value"),
Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
Opcode::FvpromoteLow => unimplemented!("FvpromoteLow"),
Opcode::Fvdemote => unimplemented!("Fvdemote"),
}
Ok(())

4
cranelift/codegen/src/isa/s390x/lower.rs

@ -2548,7 +2548,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::SwidenHigh
| Opcode::UwidenLow
| Opcode::UwidenHigh
| Opcode::WideningPairwiseDotProductS => {
| Opcode::WideningPairwiseDotProductS
| Opcode::FvpromoteLow
| Opcode::Fvdemote => {
// TODO
panic!("Vector ops not implemented.");
}

6
cranelift/codegen/src/isa/x64/inst/args.rs

@ -489,6 +489,8 @@ pub enum SseOpcode {
Cmpsd,
Cvtdq2ps,
Cvtdq2pd,
Cvtpd2ps,
Cvtps2pd,
Cvtsd2ss,
Cvtsd2si,
Cvtsi2ss,
@ -684,6 +686,8 @@ impl SseOpcode {
| SseOpcode::Comisd
| SseOpcode::Cvtdq2ps
| SseOpcode::Cvtdq2pd
| SseOpcode::Cvtpd2ps
| SseOpcode::Cvtps2pd
| SseOpcode::Cvtsd2ss
| SseOpcode::Cvtsd2si
| SseOpcode::Cvtsi2sd
@ -843,6 +847,8 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Comisd => "comisd",
SseOpcode::Cvtdq2ps => "cvtdq2ps",
SseOpcode::Cvtdq2pd => "cvtdq2pd",
SseOpcode::Cvtpd2ps => "cvtpd2ps",
SseOpcode::Cvtps2pd => "cvtps2pd",
SseOpcode::Cvtsd2ss => "cvtsd2ss",
SseOpcode::Cvtsd2si => "cvtsd2si",
SseOpcode::Cvtsi2ss => "cvtsi2ss",

2
cranelift/codegen/src/isa/x64/inst/emit.rs

@ -1348,6 +1348,8 @@ pub(crate) fn emit(
let (prefix, opcode, num_opcodes) = match op {
SseOpcode::Cvtdq2pd => (LegacyPrefixes::_F3, 0x0FE6, 2),
SseOpcode::Cvtpd2ps => (LegacyPrefixes::_66, 0x0F5A, 2),
SseOpcode::Cvtps2pd => (LegacyPrefixes::None, 0x0F5A, 2),
SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A, 2),
SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A, 2),
SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28, 2),

12
cranelift/codegen/src/isa/x64/inst/emit_tests.rs

@ -3913,6 +3913,18 @@ fn test_x64_emit() {
"vpopcntb %xmm2, %xmm8",
));
insns.push((
Inst::xmm_unary_rm_r(SseOpcode::Cvtpd2ps, RegMem::reg(xmm7), w_xmm7),
"660F5AFF",
"cvtpd2ps %xmm7, %xmm7",
));
insns.push((
Inst::xmm_unary_rm_r(SseOpcode::Cvtps2pd, RegMem::reg(xmm11), w_xmm9),
"450F5ACB",
"cvtps2pd %xmm11, %xmm9",
));
// Xmm to int conversions, and conversely.
insns.push((

20
cranelift/codegen/src/isa/x64/lower.rs

@ -4057,6 +4057,16 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst));
}
Opcode::FvpromoteLow => {
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(
SseOpcode::Cvtps2pd,
RegMem::from(src),
dst,
));
}
Opcode::Fdemote => {
// We can't guarantee the RHS (if a load) is 128-bit aligned, so we
// must avoid merging a load here.
@ -4065,6 +4075,16 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst));
}
Opcode::Fvdemote => {
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(
SseOpcode::Cvtpd2ps,
RegMem::from(src),
dst,
));
}
Opcode::FcvtFromSint => {
let output_ty = ty.unwrap();
if !output_ty.is_vector() {

2
cranelift/interpreter/src/step.rs

@ -564,6 +564,8 @@ where
Opcode::FcvtFromUint => unimplemented!("FcvtFromUint"),
Opcode::FcvtFromSint => unimplemented!("FcvtFromSint"),
Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
Opcode::FvpromoteLow => unimplemented!("FvpromoteLow"),
Opcode::Fvdemote => unimplemented!("Fvdemote"),
Opcode::Isplit => unimplemented!("Isplit"),
Opcode::Iconcat => unimplemented!("Iconcat"),
Opcode::AtomicRmw => unimplemented!("AtomicRmw"),

10
cranelift/wasm/src/code_translator.rs

@ -1779,6 +1779,14 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let a = pop1_with_bitcast(state, I32X4, builder);
state.push1(builder.ins().fcvt_low_from_sint(F64X2, a));
}
Operator::F64x2PromoteLowF32x4 => {
let a = pop1_with_bitcast(state, F32X4, builder);
state.push1(builder.ins().fvpromote_low(a));
}
Operator::F32x4DemoteF64x2Zero => {
let a = pop1_with_bitcast(state, F64X2, builder);
state.push1(builder.ins().fvdemote(a));
}
Operator::I32x4TruncSatF32x4S => {
let a = pop1_with_bitcast(state, F32X4, builder);
state.push1(builder.ins().fcvt_to_sint_sat(I32X4, a))
@ -1884,8 +1892,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
| Operator::I16x8ExtAddPairwiseI8x16U
| Operator::I32x4ExtAddPairwiseI16x8S
| Operator::I32x4ExtAddPairwiseI16x8U
| Operator::F32x4DemoteF64x2Zero
| Operator::F64x2PromoteLowF32x4
| Operator::F64x2ConvertLowI32x4U
| Operator::I32x4TruncSatF64x2SZero
| Operator::I32x4TruncSatF64x2UZero => {

Loading…
Cancel
Save