From c8cce5d2d77ed3003ea27e8e5a02225cb2bc3650 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Wed, 7 Oct 2020 16:15:09 -0700 Subject: [PATCH] [machinst x64]: enable packed saturated arithmetic --- build.rs | 2 ++ cranelift/codegen/src/isa/x64/inst/args.rs | 12 +++++++ cranelift/codegen/src/isa/x64/inst/emit.rs | 4 +++ .../codegen/src/isa/x64/inst/emit_tests.rs | 24 +++++++++++++ cranelift/codegen/src/isa/x64/lower.rs | 12 +++++++ .../isa/x64/simd-arithmetic-run.clif | 36 +++++++++---------- 6 files changed, 72 insertions(+), 18 deletions(-) diff --git a/build.rs b/build.rs index 32367c9c95..c7d6c3ff80 100644 --- a/build.rs +++ b/build.rs @@ -184,9 +184,11 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str ("simd", "simd_i8x16_arith") => return false, ("simd", "simd_i8x16_arith2") => return false, ("simd", "simd_i8x16_cmp") => return false, + ("simd", "simd_i8x16_sat_arith") => return false, ("simd", "simd_i16x8_arith") => return false, ("simd", "simd_i16x8_arith2") => return false, ("simd", "simd_i16x8_cmp") => return false, + ("simd", "simd_i16x8_sat_arith") => return false, ("simd", "simd_i32x4_arith") => return false, ("simd", "simd_i32x4_arith2") => return false, ("simd", "simd_i32x4_cmp") => return false, diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 63d8064b0f..2d47d71a5d 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -459,6 +459,10 @@ pub enum SseOpcode { Psubd, Psubq, Psubw, + Psubsb, + Psubsw, + Psubusb, + Psubusw, Ptest, Pxor, Rcpss, @@ -582,6 +586,10 @@ impl SseOpcode { | SseOpcode::Psubd | SseOpcode::Psubq | SseOpcode::Psubw + | SseOpcode::Psubsb + | SseOpcode::Psubsw + | SseOpcode::Psubusb + | SseOpcode::Psubusw | SseOpcode::Pxor | SseOpcode::Sqrtpd | SseOpcode::Sqrtsd @@ -736,6 +744,10 @@ impl fmt::Debug for SseOpcode { SseOpcode::Psubd => "psubd", SseOpcode::Psubq => "psubq", SseOpcode::Psubw => "psubw", + SseOpcode::Psubsb => "psubsb", + SseOpcode::Psubsw => "psubsw", + SseOpcode::Psubusb => "psubusb", + SseOpcode::Psubusw => "psubusw", SseOpcode::Ptest => "ptest", SseOpcode::Pxor => "pxor", SseOpcode::Rcpss => "rcpss", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index bd7b9d4426..63d6884b28 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1798,6 +1798,10 @@ pub(crate) fn emit( SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2), SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2), SseOpcode::Psubw => (LegacyPrefixes::_66, 0x0FF9, 2), + SseOpcode::Psubsb => (LegacyPrefixes::_66, 0x0FE8, 2), + SseOpcode::Psubsw => (LegacyPrefixes::_66, 0x0FE9, 2), + SseOpcode::Psubusb => (LegacyPrefixes::_66, 0x0FD8, 2), + SseOpcode::Psubusw => (LegacyPrefixes::_66, 0x0FD9, 2), SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2), SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2), SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2), diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index ce144f6263..62992be2bd 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3128,6 +3128,30 @@ fn test_x64_emit() { "paddusw %xmm1, %xmm8", )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5), + "66410FE8E9", + "psubsb %xmm9, %xmm5", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6), + "660FE9F7", + "psubsw %xmm7, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13), + "66450FD8EC", + "psubusb %xmm12, %xmm13", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8), + "66440FD9C1", + "psubusw %xmm1, %xmm8", + )); + insns.push(( Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13), "66450FE0EC", diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index f399e022dc..a9a05177fb 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -546,6 +546,8 @@ fn lower_insn_to_regs>( | Opcode::SaddSat | Opcode::UaddSat | Opcode::Isub + | Opcode::SsubSat + | Opcode::UsubSat | Opcode::Imul | Opcode::AvgRound | Opcode::Band @@ -578,6 +580,16 @@ fn lower_insn_to_regs>( types::I64X2 => SseOpcode::Psubq, _ => panic!("Unsupported type for packed isub instruction: {}", ty), }, + Opcode::SsubSat => match ty { + types::I8X16 => SseOpcode::Psubsb, + types::I16X8 => SseOpcode::Psubsw, + _ => panic!("Unsupported type for packed ssub_sat instruction: {}", ty), + }, + Opcode::UsubSat => match ty { + types::I8X16 => SseOpcode::Psubusb, + types::I16X8 => SseOpcode::Psubusw, + _ => panic!("Unsupported type for packed usub_sat instruction: {}", ty), + }, Opcode::Imul => match ty { types::I16X8 => SseOpcode::Pmullw, types::I32X4 => SseOpcode::Pmulld, diff --git a/cranelift/filetests/filetests/isa/x64/simd-arithmetic-run.clif b/cranelift/filetests/filetests/isa/x64/simd-arithmetic-run.clif index c59c58188a..1ea7d3f945 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-arithmetic-run.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-arithmetic-run.clif @@ -127,24 +127,24 @@ block0: } ; run -;function %sub_sat_i8x16() -> b1 { -;block0: -; v0 = vconst.i8x16 [128 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] ; 128 == 0x80 == -128 -; v1 = vconst.i8x16 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] -; -; v2 = ssub_sat v0, v1 -; v3 = extractlane v2, 0 -; v4 = icmp_imm eq v3, 0x80 ; 0x80 == -128 -; -; ; now re-use 0x80 as an unsigned 128 -; v5 = usub_sat v0, v2 -; v6 = extractlane v5, 0 -; v7 = icmp_imm eq v6, 0 -; -; v8 = band v4, v7 -; return v8 -;} -; _run +function %sub_sat_i8x16() -> b1 { +block0: + v0 = vconst.i8x16 [128 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] ; 128 == 0x80 == -128 + v1 = vconst.i8x16 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] + + v2 = ssub_sat v0, v1 + v3 = extractlane v2, 0 + v4 = icmp_imm eq v3, 0x80 ; 0x80 == -128 + + ; now re-use 0x80 as an unsigned 128 + v5 = usub_sat v0, v2 + v6 = extractlane v5, 0 + v7 = icmp_imm eq v6, 0 + + v8 = band v4, v7 + return v8 +} +; run ;function %add_sub_f32x4() -> b1 { ;block0: