diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 454e768c09..c55540ef49 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -61,6 +61,7 @@ impl Inst {
     }
 
     fn xmm_rm_r_evex(op: Avx512Opcode, src1: Reg, src2: RegMem, dst: Writable<Reg>) -> Self {
+        debug_assert_ne!(op, Avx512Opcode::Vpermi2b);
         src2.assert_regclass_is(RegClass::Float);
         debug_assert!(src1.class() == RegClass::Float);
         debug_assert!(dst.to_reg().class() == RegClass::Float);
@@ -72,6 +73,27 @@ impl Inst {
         }
     }
 
+    fn xmm_rm_r_evex3(
+        op: Avx512Opcode,
+        src1: Reg,
+        src2: Reg,
+        src3: RegMem,
+        dst: Writable<Reg>,
+    ) -> Self {
+        debug_assert_eq!(op, Avx512Opcode::Vpermi2b);
+        src3.assert_regclass_is(RegClass::Float);
+        debug_assert!(src1.class() == RegClass::Float);
+        debug_assert!(src2.class() == RegClass::Float);
+        debug_assert!(dst.to_reg().class() == RegClass::Float);
+        Inst::XmmRmREvex3 {
+            op,
+            src1: Xmm::new(src1).unwrap(),
+            src2: Xmm::new(src2).unwrap(),
+            src3: XmmMem::new(src3).unwrap(),
+            dst: WritableXmm::from_writable_reg(dst).unwrap(),
+        }
+    }
+
     // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
     fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
         src.assert_regclass_is(RegClass::Float);
@@ -4189,19 +4211,37 @@ fn test_x64_emit() {
     insns.push((
         Inst::xmm_rm_r_evex(Avx512Opcode::Vpmullq, xmm10, RegMem::reg(xmm14), w_xmm1),
         "62D2AD0840CE",
-        "vpmullq %xmm10, %xmm14, %xmm1",
+        "vpmullq %xmm14, %xmm10, %xmm1",
     ));
 
     insns.push((
-        Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, xmm10, RegMem::reg(xmm14), w_xmm1),
+        Inst::xmm_rm_r_evex(Avx512Opcode::Vpsraq, xmm10, RegMem::reg(xmm14), w_xmm1),
+        "62D1AD08E2CE",
+        "vpsraq %xmm14, %xmm10, %xmm1",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r_evex3(
+            Avx512Opcode::Vpermi2b,
+            xmm1,
+            xmm10,
+            RegMem::reg(xmm14),
+            w_xmm1,
+        ),
         "62D22D0875CE",
-        "vpermi2b %xmm10, %xmm14, %xmm1",
+        "vpermi2b %xmm14, %xmm10, %xmm1, %xmm1",
     ));
 
     insns.push((
-        Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, xmm0, RegMem::reg(xmm1), w_xmm2),
+        Inst::xmm_rm_r_evex3(
+            Avx512Opcode::Vpermi2b,
+            xmm2,
+            xmm0,
+            RegMem::reg(xmm1),
+            w_xmm2,
+        ),
         "62F27D0875D1",
-        "vpermi2b %xmm0, %xmm1, %xmm2",
+        "vpermi2b %xmm1, %xmm0, %xmm2, %xmm2",
     ));
 
     insns.push((
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index ceb0d2962b..5c1e84a091 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -1179,11 +1179,11 @@ impl PrettyPrint for Inst {
                 dst,
                 ..
             } => {
-                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
                 let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
                 let src2 = src2.pretty_print(8, allocs);
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
                 let op = ljustify(op.to_string());
-                format!("{op} {src1}, {src2}, {dst}")
+                format!("{op} {src2}, {src1}, {dst}")
             }
 
             Inst::XmmRmREvex3 {
@@ -1199,7 +1199,7 @@ impl PrettyPrint for Inst {
                 let src3 = src3.pretty_print(8, allocs);
                 let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
                 let op = ljustify(op.to_string());
-                format!("{op} {src1}, {src2}, {src3}, {dst}")
+                format!("{op} {src3}, {src2}, {src1}, {dst}")
             }
 
             Inst::XmmMinMaxSeq {
diff --git a/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
index 2ccae20022..e0dad66c7b 100644
--- a/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
+++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
@@ -15,7 +15,7 @@ block0(v0: i8x16, v1: i8x16):
 ; movdqa %xmm0, %xmm5
 ; movdqu const(0), %xmm0
 ; movdqa %xmm5, %xmm6
-; vpermi2b %xmm0, %xmm6, %xmm1, %xmm0
+; vpermi2b %xmm1, %xmm6, %xmm0, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -54,7 +54,7 @@ block0(v0: i8x16, v1: i8x16):
 ; movdqa %xmm0, %xmm5
 ; movdqu const(0), %xmm0
 ; movdqa %xmm5, %xmm6
-; vpermi2b %xmm0, %xmm6, %xmm1, %xmm0
+; vpermi2b %xmm1, %xmm6, %xmm0, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
diff --git a/cranelift/filetests/filetests/isa/x64/simd-i64x2-mul-avx512.clif b/cranelift/filetests/filetests/isa/x64/simd-i64x2-mul-avx512.clif
new file mode 100644
index 0000000000..6f0e6dbc85
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/simd-i64x2-mul-avx512.clif
@@ -0,0 +1,33 @@
+test compile precise-output
+target x86_64 sse42 has_avx has_avx2 has_avx512dq has_avx512vl
+
+function %imul(i64x2, i64x2) -> i64x2, i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    ;; Force register allocation to pick a different destination than
+    ;; source for at least one of these instructions.
+    v2 = imul v0, v1
+    v3 = imul v2, v1
+    return v2, v3
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; vpmullq %xmm1, %xmm0, %xmm0
+; vpmullq %xmm1, %xmm0, %xmm1
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; vpmullq %xmm1, %xmm0, %xmm0
+; vpmullq %xmm1, %xmm0, %xmm1
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
diff --git a/cranelift/filetests/filetests/isa/x64/simd-i64x2-shift-avx512.clif b/cranelift/filetests/filetests/isa/x64/simd-i64x2-shift-avx512.clif
index 22210f4796..d7a6796b91 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-i64x2-shift-avx512.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-i64x2-shift-avx512.clif
@@ -1,19 +1,26 @@
 test compile precise-output
 target x86_64 sse42 has_avx has_avx2 has_avx512f has_avx512vl
 
-function %sshr(i64x2, i64) -> i64x2 {
+function %sshr(i64x2, i64) -> i64x2, i64x2 {
 block0(v0: i64x2, v1: i64):
+    ;; Force register allocation to pick a different destination than
+    ;; source for at least one of these instructions.
     v2 = sshr v0, v1
-    return v2
+    v3 = sshr v2, v1
+    return v2, v3
 }
 
 ; VCode:
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
+; movq %rdi, %r9
+; andq %r9, $63, %r9
+; vmovd %r9d, %xmm1
+; vpsraq %xmm1, %xmm0, %xmm0
 ; andq %rdi, $63, %rdi
-; vmovd %edi, %xmm5
-; vpsraq %xmm5, %xmm0, %xmm0
+; vmovd %edi, %xmm1
+; vpsraq %xmm1, %xmm0, %xmm1
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -23,9 +30,13 @@ block0(v0: i64x2, v1: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
+; movq %rdi, %r9
+; andq $0x3f, %r9
+; vmovd %r9d, %xmm1
+; vpsraq %xmm1, %xmm0, %xmm0
 ; andq $0x3f, %rdi
-; vmovd %edi, %xmm5
-; vpsraq %xmm5, %xmm0, %xmm0
+; vmovd %edi, %xmm1
+; vpsraq %xmm1, %xmm0, %xmm1
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq