diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 454e768c09..c55540ef49 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -61,6 +61,7 @@ impl Inst {
     }
 
     fn xmm_rm_r_evex(op: Avx512Opcode, src1: Reg, src2: RegMem, dst: Writable<Reg>) -> Self {
+        debug_assert_ne!(op, Avx512Opcode::Vpermi2b);
         src2.assert_regclass_is(RegClass::Float);
         debug_assert!(src1.class() == RegClass::Float);
         debug_assert!(dst.to_reg().class() == RegClass::Float);
@@ -72,6 +73,27 @@ impl Inst {
         }
     }
 
+    fn xmm_rm_r_evex3(
+        op: Avx512Opcode,
+        src1: Reg,
+        src2: Reg,
+        src3: RegMem,
+        dst: Writable<Reg>,
+    ) -> Self {
+        debug_assert_eq!(op, Avx512Opcode::Vpermi2b);
+        src3.assert_regclass_is(RegClass::Float);
+        debug_assert!(src1.class() == RegClass::Float);
+        debug_assert!(src2.class() == RegClass::Float);
+        debug_assert!(dst.to_reg().class() == RegClass::Float);
+        Inst::XmmRmREvex3 {
+            op,
+            src1: Xmm::new(src1).unwrap(),
+            src2: Xmm::new(src2).unwrap(),
+            src3: XmmMem::new(src3).unwrap(),
+            dst: WritableXmm::from_writable_reg(dst).unwrap(),
+        }
+    }
+
     // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
     fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
         src.assert_regclass_is(RegClass::Float);
@@ -4189,19 +4211,37 @@ fn test_x64_emit() {
     insns.push((
         Inst::xmm_rm_r_evex(Avx512Opcode::Vpmullq, xmm10, RegMem::reg(xmm14), w_xmm1),
         "62D2AD0840CE",
-        "vpmullq %xmm10, %xmm14, %xmm1",
+        "vpmullq %xmm14, %xmm10, %xmm1",
     ));
 
     insns.push((
-        Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, xmm10, RegMem::reg(xmm14), w_xmm1),
+        Inst::xmm_rm_r_evex(Avx512Opcode::Vpsraq, xmm10, RegMem::reg(xmm14), w_xmm1),
+        "62D1AD08E2CE",
+        "vpsraq %xmm14, %xmm10, %xmm1",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r_evex3(
+            Avx512Opcode::Vpermi2b,
+            xmm1,
+            xmm10,
+            RegMem::reg(xmm14),
+            w_xmm1,
+        ),
         "62D22D0875CE",
-        "vpermi2b %xmm10, %xmm14, %xmm1",
+        "vpermi2b %xmm14, %xmm10, %xmm1, %xmm1",
     ));
 
     insns.push((
-        Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, xmm0, RegMem::reg(xmm1), w_xmm2),
+        Inst::xmm_rm_r_evex3(
+            Avx512Opcode::Vpermi2b,
+            xmm2,
+            xmm0,
+            RegMem::reg(xmm1),
+            w_xmm2,
+        ),
         "62F27D0875D1",
-        "vpermi2b %xmm0, %xmm1, %xmm2",
+        "vpermi2b %xmm1, %xmm0, %xmm2, %xmm2",
     ));
 
     insns.push((
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index ceb0d2962b..5c1e84a091 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -1179,11 +1179,11 @@ impl PrettyPrint for Inst {
                 dst,
                 ..
             } => {
-                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
                 let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
                 let src2 = src2.pretty_print(8, allocs);
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
                 let op = ljustify(op.to_string());
-                format!("{op} {src1}, {src2}, {dst}")
+                format!("{op} {src2}, {src1}, {dst}")
             }
 
             Inst::XmmRmREvex3 {
@@ -1199,7 +1199,7 @@ impl PrettyPrint for Inst {
                 let src3 = src3.pretty_print(8, allocs);
                 let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
                 let op = ljustify(op.to_string());
-                format!("{op} {src1}, {src2}, {src3}, {dst}")
+                format!("{op} {src3}, {src2}, {src1}, {dst}")
             }
 
             Inst::XmmMinMaxSeq {
diff --git a/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
index 2ccae20022..e0dad66c7b 100644
--- a/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
+++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
@@ -15,7 +15,7 @@ block0(v0: i8x16, v1: i8x16):
 ; movdqa %xmm0, %xmm5
 ; movdqu const(0), %xmm0
 ; movdqa %xmm5, %xmm6
-; vpermi2b %xmm0, %xmm6, %xmm1, %xmm0
+; vpermi2b %xmm1, %xmm6, %xmm0, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -54,7 +54,7 @@ block0(v0: i8x16, v1: i8x16):
 ; movdqa %xmm0, %xmm5
 ; movdqu const(0), %xmm0
 ; movdqa %xmm5, %xmm6
-; vpermi2b %xmm0, %xmm6, %xmm1, %xmm0
+; vpermi2b %xmm1, %xmm6, %xmm0, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
diff --git a/cranelift/filetests/filetests/isa/x64/simd-i64x2-mul-avx512.clif b/cranelift/filetests/filetests/isa/x64/simd-i64x2-mul-avx512.clif
new file mode 100644
index 0000000000..6f0e6dbc85
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/simd-i64x2-mul-avx512.clif
@@ -0,0 +1,33 @@
+test compile precise-output
+target x86_64 sse42 has_avx has_avx2 has_avx512dq has_avx512vl
+
+function %imul(i64x2, i64x2) -> i64x2, i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    ;; Force register allocation to pick a different destination than
+    ;; source for at least one of these instructions.
+    v2 = imul v0, v1
+    v3 = imul v2, v1
+    return v2, v3
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; vpmullq %xmm1, %xmm0, %xmm0
+; vpmullq %xmm1, %xmm0, %xmm1
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; vpmullq %xmm1, %xmm0, %xmm0
+; vpmullq %xmm1, %xmm0, %xmm1
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
diff --git a/cranelift/filetests/filetests/isa/x64/simd-i64x2-shift-avx512.clif b/cranelift/filetests/filetests/isa/x64/simd-i64x2-shift-avx512.clif
index 22210f4796..d7a6796b91 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-i64x2-shift-avx512.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-i64x2-shift-avx512.clif
@@ -1,19 +1,26 @@
 test compile precise-output
 target x86_64 sse42 has_avx has_avx2 has_avx512f has_avx512vl
 
-function %sshr(i64x2, i64) -> i64x2 {
+function %sshr(i64x2, i64) -> i64x2, i64x2 {
 block0(v0: i64x2, v1: i64):
+    ;; Force register allocation to pick a different destination than
+    ;; source for at least one of these instructions.
     v2 = sshr v0, v1
-    return v2
+    v3 = sshr v2, v1
+    return v2, v3
 }
 
 ; VCode:
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
+; movq %rdi, %r9
+; andq %r9, $63, %r9
+; vmovd %r9d, %xmm1
+; vpsraq %xmm1, %xmm0, %xmm0
 ; andq %rdi, $63, %rdi
-; vmovd %edi, %xmm5
-; vpsraq %xmm5, %xmm0, %xmm0
+; vmovd %edi, %xmm1
+; vpsraq %xmm1, %xmm0, %xmm1
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -23,9 +30,13 @@ block0(v0: i64x2, v1: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
+; movq %rdi, %r9
+; andq $0x3f, %r9
+; vmovd %r9d, %xmm1
+; vpsraq %xmm1, %xmm0, %xmm0
 ; andq $0x3f, %rdi
-; vmovd %edi, %xmm5
-; vpsraq %xmm5, %xmm0, %xmm0
+; vmovd %edi, %xmm1
+; vpsraq %xmm1, %xmm0, %xmm1
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq