diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index 8f4a77d814..a58348d49b 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1697,6 +1697,7 @@ fn define_simd(
     let x86_pminu = x86.by_name("x86_pminu");
     let x86_pmullq = x86.by_name("x86_pmullq");
     let x86_pmuludq = x86.by_name("x86_pmuludq");
+    let x86_palignr = x86.by_name("x86_palignr");
     let x86_pshufb = x86.by_name("x86_pshufb");
     let x86_pshufd = x86.by_name("x86_pshufd");
     let x86_psll = x86.by_name("x86_psll");
@@ -1901,6 +1902,8 @@ fn define_simd(
             rec_fa.opcodes(low),
         );
     }
+
+    // SIMD narrow/widen
     for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] {
         let snarrow = snarrow.bind(vector(*ty, sse_vector_size));
         e.enc_both_inferred(snarrow, rec_fa.opcodes(*opcodes));
@@ -1912,6 +1915,13 @@ fn define_simd(
         let unarrow = unarrow.bind(vector(*ty, sse_vector_size));
         e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap);
     }
+    for ty in &[I8, I16, I32, I64] {
+        e.enc_both_inferred_maybe_isap(
+            x86_palignr.bind(vector(*ty, sse_vector_size)),
+            rec_fa_ib.opcodes(&PALIGNR[..]),
+            Some(use_ssse3_simd),
+        );
+    }

     // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
     for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs
index 0e48784f23..7acd2e2c50 100644
--- a/cranelift/codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs
@@ -664,6 +664,21 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );

+    let c = &Operand::new("c", uimm8)
+        .with_doc("The number of bytes to shift right; see PALIGNR in Intel manual for details");
+    ig.push(
+        Inst::new(
+            "x86_palignr",
+            r#"
+        Concatenate destination and source operands, extracting a byte-aligned result shifted to
+        the right by `c`.
+        "#,
+            &formats.ternary_imm8,
+        )
+        .operands_in(vec![x, y, c])
+        .operands_out(vec![a]),
+    );
+
     let i64_t = &TypeVar::new(
         "i64_t",
         "A scalar 64bit integer",
diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
index c357488ddd..25685593a6 100644
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -354,6 +354,10 @@ pub static PADDUSB: [u8; 3] = [0x66, 0x0f, 0xdc];

 /// Add packed unsigned word integers from xmm2/m128 and xmm1 saturate the results (SSE).
 pub static PADDUSW: [u8; 3] = [0x66, 0x0f, 0xdd];

+/// Concatenate destination and source operands, extract a byte-aligned result into xmm1 that is
+/// shifted to the right by the constant number of bytes in imm8 (SSSE3).
+pub static PALIGNR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0f];
+
 /// Bitwise AND of xmm2/m128 and xmm1 (SSE2).
 pub static PAND: [u8; 3] = [0x66, 0x0f, 0xdb];
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 80b4518f9f..7fb878c87a 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2133,6 +2133,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::X86Insertps
         | Opcode::X86Movsd
         | Opcode::X86Movlhps
+        | Opcode::X86Palignr
         | Opcode::X86Psll
         | Opcode::X86Psrl
         | Opcode::X86Psra
diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
index ae1cdda753..b1a95c52d7 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
@@ -1,5 +1,6 @@
 test binemit
-target x86_64
+set enable_simd
+target x86_64 has_ssse3=true

 ; Ensure raw_bitcast emits no instructions.
 function %raw_bitcast_i16x8_to_b32x4() {
@@ -10,8 +11,9 @@ block0:
     return
 }

-function %fcvt_32(i32x4) {
-block0(v0: i32x4 [%xmm6]):
-[-, %xmm2] v1 = fcvt_from_sint.f32x4 v0 ; bin: 40 0f 5b d6
+function %conversions_i32x4(i32x4, i32x4) {
+block0(v0: i32x4 [%xmm6], v1: i32x4 [%xmm4]):
+[-, %xmm2] v2 = fcvt_from_sint.f32x4 v0 ; bin: 40 0f 5b d6
+[-, %xmm6] v3 = x86_palignr v0, v1, 3 ; bin: 66 0f 3a 0f f4 03
     return
 }
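
For reference, the PALIGNR semantics that the new `x86_palignr` instruction exposes can be modeled in a few lines of Rust. This is an illustrative sketch of the operation described in the doc strings above, not code from this patch: it assumes the 128-bit operands are represented as `[u8; 16]`, concatenates the destination (high half) and source (low half) into a 32-byte intermediate, shifts right by `imm8` bytes, and keeps the low 16 bytes.

/// Illustrative model of PALIGNR xmm1, xmm2/m128, imm8 (not Cranelift code).
fn palignr(dst: [u8; 16], src: [u8; 16], imm8: usize) -> [u8; 16] {
    // Concatenate: the source supplies bytes 0..16, the destination bytes 16..32.
    let mut concat = [0u8; 32];
    concat[..16].copy_from_slice(&src);
    concat[16..].copy_from_slice(&dst);

    // Shift right by `imm8` bytes and keep the low 16 bytes; shifts of 32
    // or more zero the result entirely.
    let mut out = [0u8; 16];
    for i in 0..16 {
        if imm8 + i < 32 {
            out[i] = concat[imm8 + i];
        }
    }
    out
}

The binemit expectation in the test above decodes accordingly: `66 0f 3a 0f` are the four bytes of the `PALIGNR` constant (mandatory prefix plus opcode), `f4` is the ModR/M byte selecting %xmm6 (the destination, which must match the first operand's register since PALIGNR is read-modify-write) and %xmm4 (the source), and `03` is the immediate byte-shift count.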