From 84ac3feef8b5ed5e53203a8ccb24cf28a363dd07 Mon Sep 17 00:00:00 2001
From: Benjamin Bouvier <public@benj.me>
Date: Tue, 6 Oct 2020 18:30:14 +0200
Subject: [PATCH] machinst x64: use zero-latency move instructions for f32/f64

As found by @julian-seward1, movss/movsd aren't included in the
zero-latency move instructions section of the Intel optimization manual.
Use MOVAPS instead for those moves.
---
 cranelift/codegen/src/isa/x64/inst/mod.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index b370c97f44..e2f8a6c611 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -2401,10 +2401,12 @@ impl MachInst for Inst {
         match rc_dst {
             RegClass::I64 => Inst::mov_r_r(true, src_reg, dst_reg),
             RegClass::V128 => {
+                // The Intel optimization manual, in "3.5.1.13 Zero-Latency MOV Instructions",
+                // doesn't list MOVSS/MOVSD among the zero-latency instructions. Use MOVAPS for
+                // those instead: it may write more lanes than we need, but it is specified to
+                // have zero latency.
                 let opcode = match ty {
-                    types::F32 => SseOpcode::Movss,
-                    types::F64 => SseOpcode::Movsd,
-                    types::F32X4 => SseOpcode::Movaps,
+                    types::F32 | types::F64 | types::F32X4 => SseOpcode::Movaps,
                     types::F64X2 => SseOpcode::Movapd,
                     _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqa,
                     _ => unimplemented!("unable to move type: {}", ty),