Browse Source

x64: clean up regalloc-related semantics on several instructions. (#4811)

* x64: clean up regalloc-related semantics on several instructions.

This PR removes all uses of "modify" operands on instructions in the x64
backend, and also removes all uses of "pinned vregs", or vregs that are
explicitly tied to particular physical registers. In place of both of
these mechanisms, which are legacies of the old regalloc design and
supported via compatibility code, the backend now uses operand
constraints. This is more flexible as it allows the regalloc to see the
liveranges and constraints without "reverse-engineering" move instructions.

Eventually, after removing all such uses (including in other backends
and by the ABI code), we can remove the compatibility code in regalloc2,
significantly simplifying its liverange-construction frontend and
thus allowing for higher confidence in correctness as well as possibly a
bit more compilation speed.

Curiously, there are a few extra move instructions now; they are likely
poor splitting decisions and I can try to chase these down later.

* Fix cranelift-codegen tests.

* Review feedback.
pull/4833/head
Chris Fallin 2 years ago
committed by GitHub
parent
commit
186c7c3b89
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
  1. 76
      cranelift/codegen/src/isa/x64/inst.isle
  2. 77
      cranelift/codegen/src/isa/x64/inst/emit.rs
  3. 216
      cranelift/codegen/src/isa/x64/inst/emit_tests.rs
  4. 149
      cranelift/codegen/src/isa/x64/inst/mod.rs
  5. 112
      cranelift/codegen/src/isa/x64/lower/isle.rs
  6. 55
      cranelift/codegen/src/isa/x64/mod.rs
  7. 3
      cranelift/filetests/filetests/isa/x64/branches.clif
  8. 20
      cranelift/filetests/filetests/isa/x64/div-checks.clif
  9. 50
      cranelift/filetests/filetests/isa/x64/fcvt.clif
  10. 8
      cranelift/filetests/filetests/isa/x64/sdiv.clif
  11. 19
      cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
  12. 12
      cranelift/filetests/filetests/isa/x64/srem.clif
  13. 15
      cranelift/filetests/filetests/isa/x64/udiv.clif
  14. 15
      cranelift/filetests/filetests/isa/x64/urem.clif

76
cranelift/codegen/src/isa/x64/inst.isle

@ -64,24 +64,13 @@
;; A synthetic sequence to implement the right inline checks for
;; remainder and division, assuming the dividend is in %rax.
;;
;; Puts the result back into %rax if is_div, %rdx if !is_div, to mimic
;; what the div instruction does.
;;
;; The generated code sequence is described in the emit's function match
;; arm for this instruction.
;;
;; Note: %rdx is marked as modified by this instruction, to avoid an
;; early clobber problem with the temporary and divisor registers. Make
;; sure to zero %rdx right before this instruction, or you might run into
;; regalloc failures where %rdx is live before its first def!
(CheckedDivOrRemSeq (kind DivOrRemKind)
(size OperandSize)
(dividend_lo Gpr)
(dividend_hi Gpr)
;; The divisor operand. Note it's marked as modified
;; so that it gets assigned a register different from
;; the temporary.
(divisor WritableGpr)
(divisor Gpr)
(dst_quotient WritableGpr)
(dst_remainder WritableGpr)
(tmp OptionWritableGpr))
@ -205,12 +194,21 @@
(src3 XmmMem)
(dst WritableXmm))
;; XMM (scalar or vector) binary op that relies on the EVEX prefix.
;; XMM (scalar or vector) binary op that relies on the EVEX
;; prefix. Takes two inputs.
(XmmRmREvex (op Avx512Opcode)
(src1 XmmMem)
(src2 Xmm)
(dst WritableXmm))
;; XMM (scalar or vector) binary op that relies on the EVEX
;; prefix. Takes three inputs.
(XmmRmREvex3 (op Avx512Opcode)
(src1 XmmMem)
(src2 Xmm)
(src3 Xmm)
(dst WritableXmm))
;; XMM (scalar or vector) unary op: mov between XMM registers (32|64)
;; (reg|addr) reg, sqrt, etc.
;;
@ -255,13 +253,7 @@
;; Converts an unsigned int64 to a float32/float64.
(CvtUint64ToFloatSeq (dst_size OperandSize) ;; 4 or 8
;; A copy of the source register, fed by
;; lowering. It is marked as modified during
;; register allocation to make sure that the
;; temporary registers differ from the src register,
;; since both registers are live at the same time in
;; the generated code sequence.
(src WritableGpr)
(src Gpr)
(dst WritableXmm)
(tmp_gpr1 WritableGpr)
(tmp_gpr2 WritableGpr))
@ -270,13 +262,7 @@
(CvtFloatToSintSeq (dst_size OperandSize)
(src_size OperandSize)
(is_saturating bool)
;; A copy of the source register, fed by
;; lowering. It is marked as modified during
;; register allocation to make sure that the
;; temporary registers differ from the src register,
;; since both registers are live at the same time in
;; the generated code sequence.
(src WritableXmm)
(src Xmm)
(dst WritableGpr)
(tmp_gpr WritableGpr)
(tmp_xmm WritableXmm))
@ -285,13 +271,7 @@
(CvtFloatToUintSeq (dst_size OperandSize)
(src_size OperandSize)
(is_saturating bool)
;; A copy of the source register, fed by
;; lowering. It is marked as modified during
;; register allocation to make sure that the
;; temporary registers differ from the src register,
;; since both registers are live at the same time in
;; the generated code sequence.
(src WritableXmm)
(src Xmm)
(dst WritableGpr)
(tmp_gpr WritableGpr)
(tmp_xmm WritableXmm))
@ -2769,11 +2749,11 @@
(decl x64_vpermi2b (Xmm Xmm Xmm) Xmm)
(rule (x64_vpermi2b src1 src2 src3)
(let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (gen_move $I8X16 dst src3)))
(_ Unit (emit (MInst.XmmRmREvex (Avx512Opcode.Vpermi2b)
src1
src2
dst))))
(_ Unit (emit (MInst.XmmRmREvex3 (Avx512Opcode.Vpermi2b)
src1
src2
src3
dst))))
dst))
;; Helper for creating `MInst.MulHi` instructions.
@ -3214,12 +3194,10 @@
(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
(rule (cvt_u64_to_float_seq ty src)
(let ((size OperandSize (raw_operand_size_of_type ty))
(src_copy WritableGpr (temp_writable_gpr))
(dst WritableXmm (temp_writable_xmm))
(tmp_gpr1 WritableGpr (temp_writable_gpr))
(tmp_gpr2 WritableGpr (temp_writable_gpr))
(_ Unit (emit (gen_move $I64 src_copy src)))
(_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2))))
(_ Unit (emit (MInst.CvtUint64ToFloatSeq size src dst tmp_gpr1 tmp_gpr2))))
dst))
(decl cvt_float_to_uint_seq (Type Value bool) Gpr)
@ -3227,13 +3205,10 @@
(let ((out_size OperandSize (raw_operand_size_of_type out_ty))
(src_size OperandSize (raw_operand_size_of_type src_ty))
(tmp WritableXmm (temp_writable_xmm))
(_ Unit (emit (gen_move src_ty tmp src)))
(dst WritableGpr (temp_writable_gpr))
(tmp_xmm WritableXmm (temp_writable_xmm))
(tmp_gpr WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.CvtFloatToUintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm))))
(_ Unit (emit (MInst.CvtFloatToUintSeq out_size src_size is_saturating src dst tmp_gpr tmp_xmm))))
dst))
(decl cvt_float_to_sint_seq (Type Value bool) Gpr)
@ -3241,13 +3216,10 @@
(let ((out_size OperandSize (raw_operand_size_of_type out_ty))
(src_size OperandSize (raw_operand_size_of_type src_ty))
(tmp WritableXmm (temp_writable_xmm))
(_ Unit (emit (gen_move src_ty tmp src)))
(dst WritableGpr (temp_writable_gpr))
(tmp_xmm WritableXmm (temp_writable_xmm))
(tmp_gpr WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.CvtFloatToSintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm))))
(_ Unit (emit (MInst.CvtFloatToSintSeq out_size src_size is_saturating src dst tmp_gpr tmp_xmm))))
dst))
(decl fcvt_uint_mask_const () VCodeConstant)
@ -3396,10 +3368,6 @@
;; addresses).
(tmp1 WritableGpr (temp_writable_gpr))
;; Put a zero in tmp1. This is needed for Spectre mitigations (a
;; CMOV that zeroes the index on misspeculation).
(_ Unit (emit (MInst.Imm (OperandSize.Size32) 0 tmp1)))
;; This temporary is used as a signed integer of 32-bits (for the
;; wasm-table index) and then 64-bits (address addend). The small
;; lie about the I64 type is benign, since the temporary is dead

77
cranelift/codegen/src/isa/x64/inst/emit.rs

@ -377,11 +377,11 @@ pub(crate) fn emit(
} => {
let dividend_lo = allocs.next(dividend_lo.to_reg());
let dst_quotient = allocs.next(dst_quotient.to_reg().to_reg());
let dst_remainder = allocs.next(dst_remainder.to_reg().to_reg());
debug_assert_eq!(dividend_lo, regs::rax());
debug_assert_eq!(dst_quotient, regs::rax());
debug_assert_eq!(dst_remainder, regs::rdx());
if size.to_bits() > 8 {
let dst_remainder = allocs.next(dst_remainder.to_reg().to_reg());
debug_assert_eq!(dst_remainder, regs::rdx());
let dividend_hi = allocs.next(dividend_hi.to_reg());
debug_assert_eq!(dividend_hi, regs::rdx());
}
@ -468,7 +468,11 @@ pub(crate) fn emit(
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(src, regs::rax());
debug_assert_eq!(dst, regs::rdx());
if *size == OperandSize::Size8 {
debug_assert_eq!(dst, regs::rax());
} else {
debug_assert_eq!(dst, regs::rdx());
}
match size {
OperandSize::Size8 => {
sink.put1(0x66);
@ -498,7 +502,7 @@ pub(crate) fn emit(
} => {
let dividend_lo = allocs.next(dividend_lo.to_reg());
let dividend_hi = allocs.next(dividend_hi.to_reg());
let divisor = allocs.next(divisor.to_reg().to_reg());
let divisor = allocs.next(divisor.to_reg());
let dst_quotient = allocs.next(dst_quotient.to_reg().to_reg());
let dst_remainder = allocs.next(dst_remainder.to_reg().to_reg());
let tmp = tmp.map(|tmp| allocs.next(tmp.to_reg().to_reg()));
@ -597,18 +601,45 @@ pub(crate) fn emit(
sink.bind_label(do_op);
}
let dividend_lo = Gpr::new(regs::rax()).unwrap();
let dst_quotient = WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap());
let (dividend_hi, dst_remainder) = if *size == OperandSize::Size8 {
(
Gpr::new(regs::rax()).unwrap(),
Writable::from_reg(Gpr::new(regs::rax()).unwrap()),
)
} else {
(
Gpr::new(regs::rdx()).unwrap(),
Writable::from_reg(Gpr::new(regs::rdx()).unwrap()),
)
};
// Fill in the high parts:
if kind.is_signed() {
// sign-extend the sign-bit of rax into rdx, for signed opcodes.
let inst = Inst::sign_extend_data(*size);
let inst =
Inst::sign_extend_data(*size, dividend_lo, WritableGpr::from_reg(dividend_hi));
inst.emit(&[], sink, info, state);
} else {
} else if *size != OperandSize::Size8 {
// zero for unsigned opcodes.
let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(regs::rdx()));
let inst = Inst::imm(
OperandSize::Size64,
0,
Writable::from_reg(dividend_hi.to_reg()),
);
inst.emit(&[], sink, info, state);
}
let inst = Inst::div(*size, kind.is_signed(), RegMem::reg(divisor));
let inst = Inst::div(
*size,
kind.is_signed(),
RegMem::reg(divisor),
dividend_lo,
dividend_hi,
dst_quotient,
dst_remainder,
);
inst.emit(&[], sink, info, state);
// Lowering takes care of moving the result back into the right register, see comment
@ -1393,7 +1424,8 @@ pub(crate) fn emit(
// ;; generated by lowering: cmp #jmp_table_size, %idx
// jnb $default_target
// movl %idx, %tmp2
// cmovnb %tmp1, %tmp2 ;; Spectre mitigation; we require tmp1 to be zero on entry.
// mov $0, %tmp1
// cmovnb %tmp1, %tmp2 ;; Spectre mitigation.
// lea start_of_jump_table_offset(%rip), %tmp1
// movslq [%tmp1, %tmp2, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
// addq %tmp2, %tmp1
@ -1406,6 +1438,13 @@ pub(crate) fn emit(
let inst = Inst::movzx_rm_r(ExtMode::LQ, RegMem::reg(idx), tmp2);
inst.emit(&[], sink, info, state);
// Zero `tmp1` to overwrite `tmp2` with zeroes on the
// out-of-bounds case (Spectre mitigation using CMOV).
// Note that we need to do this with a move-immediate
// form, because we cannot clobber the flags.
let inst = Inst::imm(OperandSize::Size32, 0, tmp1);
inst.emit(&[], sink, info, state);
// Spectre mitigation: CMOV to zero the index if the out-of-bounds branch above misspeculated.
let inst = Inst::cmove(
OperandSize::Size64,
@ -1768,9 +1807,21 @@ pub(crate) fn emit(
src1,
src2,
dst,
}
| Inst::XmmRmREvex3 {
op,
src1,
src2,
dst,
// `dst` reuses `src3`.
..
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src2 = allocs.next(src2.to_reg());
if let Inst::XmmRmREvex3 { src3, .. } = inst {
let src3 = allocs.next(src3.to_reg());
debug_assert_eq!(src3, dst);
}
let src1 = src1.clone().to_reg_mem().with_allocs(allocs);
let (w, opcode) = match op {
@ -2086,7 +2137,7 @@ pub(crate) fn emit(
tmp_gpr1,
tmp_gpr2,
} => {
let src = allocs.next(src.to_reg().to_reg());
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
let tmp_gpr1 = allocs.next(tmp_gpr1.to_reg().to_reg());
let tmp_gpr2 = allocs.next(tmp_gpr2.to_reg().to_reg());
@ -2155,7 +2206,7 @@ pub(crate) fn emit(
let inst = Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(1),
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 1 }).unwrap(),
Writable::from_reg(tmp_gpr1),
);
inst.emit(&[], sink, info, state);
@ -2208,7 +2259,7 @@ pub(crate) fn emit(
tmp_gpr,
tmp_xmm,
} => {
let src = allocs.next(src.to_reg().to_reg());
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
let tmp_gpr = allocs.next(tmp_gpr.to_reg().to_reg());
let tmp_xmm = allocs.next(tmp_xmm.to_reg().to_reg());
@ -2417,7 +2468,7 @@ pub(crate) fn emit(
tmp_gpr,
tmp_xmm,
} => {
let src = allocs.next(src.to_reg().to_reg());
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
let tmp_gpr = allocs.next(tmp_gpr.to_reg().to_reg());
let tmp_xmm = allocs.next(tmp_xmm.to_reg().to_reg());

216
cranelift/codegen/src/isa/x64/inst/emit_tests.rs

@ -1723,6 +1723,10 @@ fn test_x64_emit() {
OperandSize::Size32,
true, /*signed*/
RegMem::reg(regs::rsi()),
Gpr::new(regs::rax()).unwrap(),
Gpr::new(regs::rdx()).unwrap(),
WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
),
"F7FE",
"idiv %eax, %edx, %esi, %eax, %edx",
@ -1732,6 +1736,10 @@ fn test_x64_emit() {
OperandSize::Size64,
true, /*signed*/
RegMem::reg(regs::r15()),
Gpr::new(regs::rax()).unwrap(),
Gpr::new(regs::rdx()).unwrap(),
WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
),
"49F7FF",
"idiv %rax, %rdx, %r15, %rax, %rdx",
@ -1741,6 +1749,10 @@ fn test_x64_emit() {
OperandSize::Size32,
false, /*signed*/
RegMem::reg(regs::r14()),
Gpr::new(regs::rax()).unwrap(),
Gpr::new(regs::rdx()).unwrap(),
WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
),
"41F7F6",
"div %eax, %edx, %r14d, %eax, %edx",
@ -1750,19 +1762,39 @@ fn test_x64_emit() {
OperandSize::Size64,
false, /*signed*/
RegMem::reg(regs::rdi()),
Gpr::new(regs::rax()).unwrap(),
Gpr::new(regs::rdx()).unwrap(),
WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
),
"48F7F7",
"div %rax, %rdx, %rdi, %rax, %rdx",
));
insns.push((
Inst::div(OperandSize::Size8, false, RegMem::reg(regs::rax())),
Inst::div(
OperandSize::Size8,
false,
RegMem::reg(regs::rax()),
Gpr::new(regs::rax()).unwrap(),
Gpr::new(regs::rdx()).unwrap(),
WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
),
"F6F0",
"div %al, (none), %al, %al, %dl",
"div %al, (none), %al, %al, (none)",
));
insns.push((
Inst::div(OperandSize::Size8, false, RegMem::reg(regs::rsi())),
Inst::div(
OperandSize::Size8,
false,
RegMem::reg(regs::rsi()),
Gpr::new(regs::rax()).unwrap(),
Gpr::new(regs::rdx()).unwrap(),
WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
),
"40F6F6",
"div %al, (none), %sil, %al, %dl",
"div %al, (none), %sil, %al, (none)",
));
// ========================================================
@ -1807,25 +1839,41 @@ fn test_x64_emit() {
// ========================================================
// cbw
insns.push((
Inst::sign_extend_data(OperandSize::Size8),
Inst::sign_extend_data(
OperandSize::Size8,
Gpr::new(regs::rax()).unwrap(),
WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
),
"6698",
"cbw %al, %dl",
"cbw %al, %al",
));
// ========================================================
// cdq family: SignExtendRaxRdx
insns.push((
Inst::sign_extend_data(OperandSize::Size16),
Inst::sign_extend_data(
OperandSize::Size16,
Gpr::new(regs::rax()).unwrap(),
WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
),
"6699",
"cwd %ax, %dx",
));
insns.push((
Inst::sign_extend_data(OperandSize::Size32),
Inst::sign_extend_data(
OperandSize::Size32,
Gpr::new(regs::rax()).unwrap(),
WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
),
"99",
"cdq %eax, %edx",
));
insns.push((
Inst::sign_extend_data(OperandSize::Size64),
Inst::sign_extend_data(
OperandSize::Size64,
Gpr::new(regs::rax()).unwrap(),
WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
),
"4899",
"cqo %rax, %rdx",
));
@ -2813,47 +2861,92 @@ fn test_x64_emit() {
// ========================================================
// Shift_R
insns.push((
Inst::shift_r(OperandSize::Size32, ShiftKind::ShiftLeft, None, w_rdi),
Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftLeft,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_rdi,
),
"D3E7",
"shll %cl, %edi, %edi",
));
insns.push((
Inst::shift_r(OperandSize::Size32, ShiftKind::ShiftLeft, None, w_r12),
Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftLeft,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_r12,
),
"41D3E4",
"shll %cl, %r12d, %r12d",
));
insns.push((
Inst::shift_r(OperandSize::Size32, ShiftKind::ShiftLeft, Some(2), w_r8),
Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftLeft,
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
w_r8,
),
"41C1E002",
"shll $2, %r8d, %r8d",
));
insns.push((
Inst::shift_r(OperandSize::Size32, ShiftKind::ShiftLeft, Some(31), w_r13),
Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftLeft,
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 31 }).unwrap(),
w_r13,
),
"41C1E51F",
"shll $31, %r13d, %r13d",
));
insns.push((
Inst::shift_r(OperandSize::Size64, ShiftKind::ShiftLeft, None, w_r13),
Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_r13,
),
"49D3E5",
"shlq %cl, %r13, %r13",
));
insns.push((
Inst::shift_r(OperandSize::Size64, ShiftKind::ShiftLeft, None, w_rdi),
Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_rdi,
),
"48D3E7",
"shlq %cl, %rdi, %rdi",
));
insns.push((
Inst::shift_r(OperandSize::Size64, ShiftKind::ShiftLeft, Some(2), w_r8),
Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
w_r8,
),
"49C1E002",
"shlq $2, %r8, %r8",
));
insns.push((
Inst::shift_r(OperandSize::Size64, ShiftKind::ShiftLeft, Some(3), w_rbx),
Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 3 }).unwrap(),
w_rbx,
),
"48C1E303",
"shlq $3, %rbx, %rbx",
));
insns.push((
Inst::shift_r(OperandSize::Size64, ShiftKind::ShiftLeft, Some(63), w_r13),
Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 63 }).unwrap(),
w_r13,
),
"49C1E53F",
"shlq $63, %r13, %r13",
));
@ -2861,7 +2954,7 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftRightLogical,
None,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_rdi,
),
"D3EF",
@ -2871,7 +2964,7 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftRightLogical,
Some(2),
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
w_r8,
),
"41C1E802",
@ -2881,7 +2974,7 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftRightLogical,
Some(31),
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 31 }).unwrap(),
w_r13,
),
"41C1ED1F",
@ -2891,7 +2984,7 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
None,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_rdi,
),
"48D3EF",
@ -2901,7 +2994,7 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(2),
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
w_r8,
),
"49C1E802",
@ -2911,7 +3004,7 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(63),
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 63 }).unwrap(),
w_r13,
),
"49C1ED3F",
@ -2921,7 +3014,7 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftRightArithmetic,
None,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_rdi,
),
"D3FF",
@ -2931,7 +3024,7 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftRightArithmetic,
Some(2),
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
w_r8,
),
"41C1F802",
@ -2941,7 +3034,7 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftRightArithmetic,
Some(31),
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 31 }).unwrap(),
w_r13,
),
"41C1FD1F",
@ -2951,7 +3044,7 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightArithmetic,
None,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_rdi,
),
"48D3FF",
@ -2961,7 +3054,7 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightArithmetic,
Some(2),
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 2 }).unwrap(),
w_r8,
),
"49C1F802",
@ -2971,54 +3064,99 @@ fn test_x64_emit() {
Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightArithmetic,
Some(63),
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 63 }).unwrap(),
w_r13,
),
"49C1FD3F",
"sarq $63, %r13, %r13",
));
insns.push((
Inst::shift_r(OperandSize::Size64, ShiftKind::RotateLeft, None, w_r8),
Inst::shift_r(
OperandSize::Size64,
ShiftKind::RotateLeft,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_r8,
),
"49D3C0",
"rolq %cl, %r8, %r8",
));
insns.push((
Inst::shift_r(OperandSize::Size32, ShiftKind::RotateLeft, Some(3), w_r9),
Inst::shift_r(
OperandSize::Size32,
ShiftKind::RotateLeft,
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 3 }).unwrap(),
w_r9,
),
"41C1C103",
"roll $3, %r9d, %r9d",
));
insns.push((
Inst::shift_r(OperandSize::Size32, ShiftKind::RotateRight, None, w_rsi),
Inst::shift_r(
OperandSize::Size32,
ShiftKind::RotateRight,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_rsi,
),
"D3CE",
"rorl %cl, %esi, %esi",
));
insns.push((
Inst::shift_r(OperandSize::Size64, ShiftKind::RotateRight, Some(5), w_r15),
Inst::shift_r(
OperandSize::Size64,
ShiftKind::RotateRight,
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 5 }).unwrap(),
w_r15,
),
"49C1CF05",
"rorq $5, %r15, %r15",
));
insns.push((
Inst::shift_r(OperandSize::Size8, ShiftKind::RotateRight, None, w_rsi),
Inst::shift_r(
OperandSize::Size8,
ShiftKind::RotateRight,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_rsi,
),
"40D2CE",
"rorb %cl, %sil, %sil",
));
insns.push((
Inst::shift_r(OperandSize::Size8, ShiftKind::RotateRight, None, w_rax),
Inst::shift_r(
OperandSize::Size8,
ShiftKind::RotateRight,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_rax,
),
"D2C8",
"rorb %cl, %al, %al",
));
insns.push((
Inst::shift_r(OperandSize::Size8, ShiftKind::RotateRight, Some(5), w_r15),
Inst::shift_r(
OperandSize::Size8,
ShiftKind::RotateRight,
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 5 }).unwrap(),
w_r15,
),
"41C0CF05",
"rorb $5, %r15b, %r15b",
));
insns.push((
Inst::shift_r(OperandSize::Size16, ShiftKind::RotateRight, None, w_rsi),
Inst::shift_r(
OperandSize::Size16,
ShiftKind::RotateRight,
Imm8Gpr::new(Imm8Reg::Reg { reg: regs::rcx() }).unwrap(),
w_rsi,
),
"66D3CE",
"rorw %cl, %si, %si",
));
insns.push((
Inst::shift_r(OperandSize::Size16, ShiftKind::RotateRight, Some(5), w_r15),
Inst::shift_r(
OperandSize::Size16,
ShiftKind::RotateRight,
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 5 }).unwrap(),
w_r15,
),
"6641C1CF05",
"rorw $5, %r15w, %r15w",
));

149
cranelift/codegen/src/isa/x64/inst/mod.rs

@ -131,7 +131,9 @@ impl Inst {
| Inst::XmmToGpr { op, .. }
| Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
Inst::XmmUnaryRmREvex { op, .. } | Inst::XmmRmREvex { op, .. } => op.available_from(),
Inst::XmmUnaryRmREvex { op, .. }
| Inst::XmmRmREvex { op, .. }
| Inst::XmmRmREvex3 { op, .. } => op.available_from(),
Inst::XmmRmRVex { op, .. } => op.available_from(),
}
@ -195,47 +197,55 @@ impl Inst {
}
}
pub(crate) fn div(size: OperandSize, signed: bool, divisor: RegMem) -> Inst {
pub(crate) fn div(
size: OperandSize,
signed: bool,
divisor: RegMem,
dividend_lo: Gpr,
dividend_hi: Gpr,
dst_quotient: WritableGpr,
dst_remainder: WritableGpr,
) -> Inst {
divisor.assert_regclass_is(RegClass::Int);
Inst::Div {
size,
signed,
divisor: GprMem::new(divisor).unwrap(),
dividend_lo: Gpr::new(regs::rax()).unwrap(),
dividend_hi: Gpr::new(regs::rdx()).unwrap(),
dst_quotient: WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
dst_remainder: Writable::from_reg(Gpr::new(regs::rdx()).unwrap()),
dividend_lo,
dividend_hi,
dst_quotient,
dst_remainder,
}
}
pub(crate) fn checked_div_or_rem_seq(
kind: DivOrRemKind,
size: OperandSize,
divisor: Writable<Reg>,
divisor: Reg,
dividend_lo: Gpr,
dividend_hi: Gpr,
dst_quotient: WritableGpr,
dst_remainder: WritableGpr,
tmp: Option<Writable<Reg>>,
) -> Inst {
debug_assert!(divisor.to_reg().class() == RegClass::Int);
debug_assert!(divisor.class() == RegClass::Int);
debug_assert!(tmp
.map(|tmp| tmp.to_reg().class() == RegClass::Int)
.unwrap_or(true));
Inst::CheckedDivOrRemSeq {
kind,
size,
divisor: WritableGpr::from_writable_reg(divisor).unwrap(),
dividend_lo: Gpr::new(regs::rax()).unwrap(),
dividend_hi: Gpr::new(regs::rdx()).unwrap(),
dst_quotient: Writable::from_reg(Gpr::new(regs::rax()).unwrap()),
dst_remainder: Writable::from_reg(Gpr::new(regs::rdx()).unwrap()),
divisor: Gpr::new(divisor).unwrap(),
dividend_lo,
dividend_hi,
dst_quotient,
dst_remainder,
tmp: tmp.map(|tmp| WritableGpr::from_writable_reg(tmp).unwrap()),
}
}
pub(crate) fn sign_extend_data(size: OperandSize) -> Inst {
Inst::SignExtendData {
size,
src: Gpr::new(regs::rax()).unwrap(),
dst: Writable::from_reg(Gpr::new(regs::rdx()).unwrap()),
}
pub(crate) fn sign_extend_data(size: OperandSize, src: Gpr, dst: WritableGpr) -> Inst {
Inst::SignExtendData { size, src, dst }
}
pub(crate) fn imm(dst_size: OperandSize, simm64: u64, dst: Writable<Reg>) -> Inst {
@ -415,24 +425,18 @@ impl Inst {
pub(crate) fn shift_r(
size: OperandSize,
kind: ShiftKind,
num_bits: Option<u8>,
num_bits: Imm8Gpr,
dst: Writable<Reg>,
) -> Inst {
debug_assert!(if let Some(num_bits) = num_bits {
num_bits < size.to_bits()
} else {
true
});
if let Imm8Reg::Imm8 { imm: num_bits } = num_bits.clone().to_imm8_reg() {
debug_assert!(num_bits < size.to_bits());
}
debug_assert!(dst.to_reg().class() == RegClass::Int);
Inst::ShiftR {
size,
kind,
src: Gpr::new(dst.to_reg()).unwrap(),
num_bits: Imm8Gpr::new(match num_bits {
Some(imm) => Imm8Reg::Imm8 { imm },
None => Imm8Reg::Reg { reg: regs::rcx() },
})
.unwrap(),
num_bits,
dst: WritableGpr::from_writable_reg(dst).unwrap(),
}
}
@ -781,8 +785,11 @@ impl PrettyPrint for Inst {
let dividend_lo = pretty_print_reg(dividend_lo.to_reg(), size.to_bytes(), allocs);
let dst_quotient =
pretty_print_reg(dst_quotient.to_reg().to_reg(), size.to_bytes(), allocs);
let dst_remainder =
pretty_print_reg(dst_remainder.to_reg().to_reg(), size.to_bytes(), allocs);
let dst_remainder = if size.to_bits() > 8 {
pretty_print_reg(dst_remainder.to_reg().to_reg(), size.to_bytes(), allocs)
} else {
"(none)".to_string()
};
let dividend_hi = if size.to_bits() > 8 {
pretty_print_reg(dividend_hi.to_reg(), size.to_bytes(), allocs)
} else {
@ -842,7 +849,7 @@ impl PrettyPrint for Inst {
} => {
let dividend_lo = pretty_print_reg(dividend_lo.to_reg(), size.to_bytes(), allocs);
let dividend_hi = pretty_print_reg(dividend_hi.to_reg(), size.to_bytes(), allocs);
let divisor = pretty_print_reg(divisor.to_reg().to_reg(), size.to_bytes(), allocs);
let divisor = pretty_print_reg(divisor.to_reg(), size.to_bytes(), allocs);
let dst_quotient =
pretty_print_reg(dst_quotient.to_reg().to_reg(), size.to_bytes(), allocs);
let dst_remainder =
@ -949,12 +956,34 @@ impl PrettyPrint for Inst {
dst,
..
} => {
let src2 = pretty_print_reg(src2.to_reg(), 8, allocs);
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
let src2 = pretty_print_reg(src2.to_reg(), 8, allocs);
let src1 = src1.pretty_print(8, allocs);
format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst)
}
Inst::XmmRmREvex3 {
op,
src1,
src2,
src3,
dst,
..
} => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
let src2 = pretty_print_reg(src2.to_reg(), 8, allocs);
let src3 = pretty_print_reg(src3.to_reg(), 8, allocs);
let src1 = src1.pretty_print(8, allocs);
format!(
"{} {}, {}, {}, {}",
ljustify(op.to_string()),
src1,
src2,
src3,
dst
)
}
Inst::XmmMinMaxSeq {
lhs,
rhs,
@ -1084,7 +1113,7 @@ impl PrettyPrint for Inst {
tmp_gpr2,
..
} => {
let src = pretty_print_reg(src.to_reg().to_reg(), 8, allocs);
let src = pretty_print_reg(src.to_reg(), 8, allocs);
let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
let tmp_gpr1 = pretty_print_reg(tmp_gpr1.to_reg().to_reg(), 8, allocs);
let tmp_gpr2 = pretty_print_reg(tmp_gpr2.to_reg().to_reg(), 8, allocs);
@ -1114,7 +1143,7 @@ impl PrettyPrint for Inst {
tmp_gpr,
is_saturating,
} => {
let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
let src = pretty_print_reg(src.to_reg(), src_size.to_bytes(), allocs);
let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
let tmp_gpr = pretty_print_reg(tmp_gpr.to_reg().to_reg(), 8, allocs);
let tmp_xmm = pretty_print_reg(tmp_xmm.to_reg().to_reg(), 8, allocs);
@ -1142,7 +1171,7 @@ impl PrettyPrint for Inst {
tmp_xmm,
is_saturating,
} => {
let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
let src = pretty_print_reg(src.to_reg(), src_size.to_bytes(), allocs);
let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
let tmp_gpr = pretty_print_reg(tmp_gpr.to_reg().to_reg(), 8, allocs);
let tmp_xmm = pretty_print_reg(tmp_xmm.to_reg().to_reg(), 8, allocs);
@ -1424,9 +1453,19 @@ impl PrettyPrint for Inst {
not_taken.to_string()
),
Inst::JmpTableSeq { idx, .. } => {
Inst::JmpTableSeq {
idx, tmp1, tmp2, ..
} => {
let idx = pretty_print_reg(*idx, 8, allocs);
format!("{} {}", ljustify("br_table".into()), idx)
let tmp1 = pretty_print_reg(tmp1.to_reg(), 8, allocs);
let tmp2 = pretty_print_reg(tmp2.to_reg(), 8, allocs);
format!(
"{} {}, {}, {}",
ljustify("br_table".into()),
idx,
tmp1,
tmp2
)
}
Inst::JmpUnknown { target } => {
@ -1605,8 +1644,8 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
} => {
collector.reg_fixed_use(dividend_lo.to_reg(), regs::rax());
collector.reg_fixed_def(dst_quotient.to_writable_reg(), regs::rax());
collector.reg_fixed_def(dst_remainder.to_writable_reg(), regs::rdx());
if size.to_bits() > 8 {
collector.reg_fixed_def(dst_remainder.to_writable_reg(), regs::rdx());
collector.reg_fixed_use(dividend_hi.to_reg(), regs::rdx());
}
divisor.get_operands(collector);
@ -1634,10 +1673,12 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
} => {
collector.reg_fixed_use(dividend_lo.to_reg(), regs::rax());
collector.reg_fixed_use(dividend_hi.to_reg(), regs::rdx());
collector.reg_mod(divisor.to_writable_reg());
collector.reg_use(divisor.to_reg());
collector.reg_fixed_def(dst_quotient.to_writable_reg(), regs::rax());
collector.reg_fixed_def(dst_remainder.to_writable_reg(), regs::rdx());
if let Some(tmp) = tmp {
// Early def so that the temporary register does not
// conflict with inputs or outputs.
collector.reg_early_def(tmp.to_writable_reg());
}
}
@ -1718,13 +1759,25 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
dst,
..
} => {
match *op {
Avx512Opcode::Vpermi2b => collector.reg_mod(dst.to_writable_reg()),
_ => collector.reg_def(dst.to_writable_reg()),
}
assert_ne!(*op, Avx512Opcode::Vpermi2b);
collector.reg_def(dst.to_writable_reg());
collector.reg_use(src2.to_reg());
src1.get_operands(collector);
}
Inst::XmmRmREvex3 {
op,
src1,
src2,
src3,
dst,
..
} => {
assert_eq!(*op, Avx512Opcode::Vpermi2b);
collector.reg_reuse_def(dst.to_writable_reg(), 2); // Reuse `src3`.
collector.reg_use(src2.to_reg());
collector.reg_use(src3.to_reg());
src1.get_operands(collector);
}
Inst::XmmRmRImm {
op,
src1,
@ -1795,7 +1848,7 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
tmp_gpr2,
..
} => {
collector.reg_mod(src.to_writable_reg());
collector.reg_use(src.to_reg());
collector.reg_def(dst.to_writable_reg());
collector.reg_early_def(tmp_gpr1.to_writable_reg());
collector.reg_early_def(tmp_gpr2.to_writable_reg());
@ -1814,7 +1867,7 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
tmp_xmm,
..
} => {
collector.reg_mod(src.to_writable_reg());
collector.reg_use(src.to_reg());
collector.reg_def(dst.to_writable_reg());
collector.reg_early_def(tmp_gpr.to_writable_reg());
collector.reg_early_def(tmp_xmm.to_writable_reg());
@ -1911,7 +1964,7 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
..
} => {
collector.reg_use(*idx);
collector.reg_mod(*tmp1);
collector.reg_early_def(*tmp1);
collector.reg_early_def(*tmp2);
}

112
cranelift/codegen/src/isa/x64/lower/isle.rs

@ -955,40 +955,34 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
let is_div = kind.is_div();
let size = OperandSize::from_ty(ty);
self.lower_ctx.emit(MInst::gen_move(
Writable::from_reg(regs::rax()),
dividend.to_reg(),
ty,
));
let dst_quotient = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
let dst_remainder = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
// Always do explicit checks for `srem`: otherwise, INT_MIN % -1 is not handled properly.
if self.flags.avoid_div_traps() || *kind == DivOrRemKind::SignedRem {
// A vcode meta-instruction is used to lower the inline checks, since they embed
// pc-relative offsets that must not change, thus requiring regalloc to not
// interfere by introducing spills and reloads.
//
// Note it keeps the result in $rax (for divide) or $rdx (for rem), so that
// regalloc is aware of the coalescing opportunity between rax/rdx and the
// destination register.
let divisor_copy = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
self.lower_ctx
.emit(MInst::gen_move(divisor_copy, divisor.to_reg(), types::I64));
let tmp = if *kind == DivOrRemKind::SignedDiv && size == OperandSize::Size64 {
Some(self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap())
} else {
None
};
// TODO use xor
self.lower_ctx.emit(MInst::imm(
let dividend_hi = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
self.lower_ctx.emit(MInst::alu_rmi_r(
OperandSize::Size32,
0,
Writable::from_reg(regs::rdx()),
AluRmiROpcode::Xor,
RegMemImm::reg(dividend_hi.to_reg()),
dividend_hi,
));
self.lower_ctx.emit(MInst::checked_div_or_rem_seq(
kind.clone(),
size,
divisor_copy,
divisor.to_reg(),
Gpr::new(dividend.to_reg()).unwrap(),
Gpr::new(dividend_hi.to_reg()).unwrap(),
WritableGpr::from_reg(Gpr::new(dst_quotient.to_reg()).unwrap()),
WritableGpr::from_reg(Gpr::new(dst_remainder.to_reg()).unwrap()),
tmp,
));
} else {
@ -997,51 +991,89 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
// divisor into a register instead.
let divisor = RegMem::reg(divisor.to_reg());
let dividend_hi = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
// Fill in the high parts:
if kind.is_signed() {
// sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for
// signed opcodes.
self.lower_ctx.emit(MInst::sign_extend_data(size));
let dividend_lo = if kind.is_signed() && ty == types::I8 {
let dividend_lo = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
// 8-bit div takes its dividend in only the `lo` reg.
self.lower_ctx.emit(MInst::sign_extend_data(
size,
Gpr::new(dividend.to_reg()).unwrap(),
WritableGpr::from_reg(Gpr::new(dividend_lo.to_reg()).unwrap()),
));
// `dividend_hi` is not used by the Div below, so we
// don't def it here.
dividend_lo.to_reg()
} else if kind.is_signed() {
// 16-bit and higher div takes its operand in hi:lo
// with half in each (64:64, 32:32 or 16:16).
self.lower_ctx.emit(MInst::sign_extend_data(
size,
Gpr::new(dividend.to_reg()).unwrap(),
WritableGpr::from_reg(Gpr::new(dividend_hi.to_reg()).unwrap()),
));
dividend.to_reg()
} else if ty == types::I8 {
let dividend_lo = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
self.lower_ctx.emit(MInst::movzx_rm_r(
ExtMode::BL,
RegMem::reg(regs::rax()),
Writable::from_reg(regs::rax()),
RegMem::reg(dividend.to_reg()),
dividend_lo,
));
dividend_lo.to_reg()
} else {
// zero for unsigned opcodes.
self.lower_ctx.emit(MInst::imm(
OperandSize::Size64,
0,
Writable::from_reg(regs::rdx()),
));
}
self.lower_ctx
.emit(MInst::imm(OperandSize::Size64, 0, dividend_hi));
dividend.to_reg()
};
// Emit the actual idiv.
self.lower_ctx
.emit(MInst::div(size, kind.is_signed(), divisor));
self.lower_ctx.emit(MInst::div(
size,
kind.is_signed(),
divisor,
Gpr::new(dividend_lo).unwrap(),
Gpr::new(dividend_hi.to_reg()).unwrap(),
WritableGpr::from_reg(Gpr::new(dst_quotient.to_reg()).unwrap()),
WritableGpr::from_reg(Gpr::new(dst_remainder.to_reg()).unwrap()),
));
}
// Move the result back into the destination reg.
if is_div {
// The quotient is in rax.
self.lower_ctx
.emit(MInst::gen_move(dst.to_writable_reg(), regs::rax(), ty));
self.lower_ctx.emit(MInst::gen_move(
dst.to_writable_reg(),
dst_quotient.to_reg(),
ty,
));
} else {
if size == OperandSize::Size8 {
// The remainder is in AH. Right-shift by 8 bits then move from rax.
self.lower_ctx.emit(MInst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(8),
Writable::from_reg(regs::rax()),
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 8 }).unwrap(),
dst_quotient,
));
self.lower_ctx.emit(MInst::gen_move(
dst.to_writable_reg(),
dst_quotient.to_reg(),
ty,
));
self.lower_ctx
.emit(MInst::gen_move(dst.to_writable_reg(), regs::rax(), ty));
} else {
// The remainder is in rdx.
self.lower_ctx
.emit(MInst::gen_move(dst.to_writable_reg(), regs::rdx(), ty));
self.lower_ctx.emit(MInst::gen_move(
dst.to_writable_reg(),
dst_remainder.to_reg(),
ty,
));
}
}
}

55
cranelift/codegen/src/isa/x64/mod.rs

@ -427,37 +427,34 @@ mod test {
// 00000000 55 push rbp
// 00000001 4889E5 mov rbp,rsp
// 00000004 41B900000000 mov r9d,0x0
// 0000000A 83FF02 cmp edi,byte +0x2
// 0000000D 0F8320000000 jnc near 0x33
// 00000013 8BF7 mov esi,edi
// 00000015 490F43F1 cmovnc rsi,r9
// 00000019 4C8D0D0B000000 lea r9,[rel 0x2b]
// 00000020 496374B100 movsxd rsi,dword [r9+rsi*4+0x0]
// 00000025 4901F1 add r9,rsi
// 00000028 41FFE1 jmp r9
// 0000002B 1200 adc al,[rax]
// 0000002D 0000 add [rax],al
// 0000002F 1C00 sbb al,0x0
// 00000031 0000 add [rax],al
// 00000033 B803000000 mov eax,0x3
// 00000038 4889EC mov rsp,rbp
// 0000003B 5D pop rbp
// 0000003C C3 ret
// 0000003D B801000000 mov eax,0x1
// 00000042 4889EC mov rsp,rbp
// 00000045 5D pop rbp
// 00000046 C3 ret
// 00000047 B802000000 mov eax,0x2
// 0000004C 4889EC mov rsp,rbp
// 0000004F 5D pop rbp
// 00000050 C3 ret
// 00000004 83FF02 cmp edi,byte +0x2
// 00000007 0F8327000000 jnc near 0x34
// 0000000D 448BDF mov r11d,edi
// 00000010 41BA00000000 mov r10d,0x0
// 00000016 4D0F43DA cmovnc r11,r10
// 0000001A 4C8D150B000000 lea r10,[rel 0x2c]
// 00000021 4F635C9A00 movsxd r11,dword [r10+r11*4+0x0]
// 00000026 4D01DA add r10,r11
// 00000029 41FFE2 jmp r10
// 0000002C 120000001C000000 (jumptable data)
// 00000034 B803000000 mov eax,0x3
// 00000039 4889EC mov rsp,rbp
// 0000003C 5D pop rbp
// 0000003D C3 ret
// 0000003E B801000000 mov eax,0x1
// 00000043 4889EC mov rsp,rbp
// 00000046 5D pop rbp
// 00000047 C3 ret
// 00000048 B802000000 mov eax,0x2
// 0000004D 4889EC mov rsp,rbp
// 00000050 5D pop rbp
// 00000051 C3 ret
let golden = vec![
85, 72, 137, 229, 65, 185, 0, 0, 0, 0, 131, 255, 2, 15, 131, 32, 0, 0, 0, 139, 247, 73,
15, 67, 241, 76, 141, 13, 11, 0, 0, 0, 73, 99, 116, 177, 0, 73, 1, 241, 65, 255, 225,
18, 0, 0, 0, 28, 0, 0, 0, 184, 3, 0, 0, 0, 72, 137, 236, 93, 195, 184, 1, 0, 0, 0, 72,
137, 236, 93, 195, 184, 2, 0, 0, 0, 72, 137, 236, 93, 195,
85, 72, 137, 229, 131, 255, 2, 15, 131, 39, 0, 0, 0, 68, 139, 223, 65, 186, 0, 0, 0, 0,
77, 15, 67, 218, 76, 141, 21, 11, 0, 0, 0, 79, 99, 92, 154, 0, 77, 1, 218, 65, 255,
226, 18, 0, 0, 0, 28, 0, 0, 0, 184, 3, 0, 0, 0, 72, 137, 236, 93, 195, 184, 1, 0, 0, 0,
72, 137, 236, 93, 195, 184, 2, 0, 0, 0, 72, 137, 236, 93, 195,
];
assert_eq!(code, &golden[..]);

3
cranelift/filetests/filetests/isa/x64/branches.clif

@ -205,9 +205,8 @@ block2:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $0, %r8d
; cmpl $2, %edi
; br_table %rdi
; br_table %rdi, %r9, %r10
; block1:
; jmp label3
; block2:

20
cranelift/filetests/filetests/isa/x64/div-checks.clif

@ -10,8 +10,9 @@ target x86_64
function %i8(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = srem.i8 v0, v1
; check: movq %rdi, %rax
; nextln: movl $$0, %edx
; check: xorl %r11d, %r11d, %r11d
; nextln: movq %rdi, %rax
; nextln: movq %r11, %rdx
; nextln: srem_seq %al, %dl, %sil, %al, %dl, tmp=(none)
; nextln: shrq $$8, %rax, %rax
@ -21,8 +22,9 @@ block0(v0: i8, v1: i8):
function %i16(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = srem.i16 v0, v1
; check: movq %rdi, %rax
; nextln: movl $$0, %edx
; check: xorl %r11d, %r11d, %r11d
; nextln: movq %rdi, %rax
; nextln: movq %r11, %rdx
; nextln: srem_seq %ax, %dx, %si, %ax, %dx, tmp=(none)
; nextln: movq %rdx, %rax
@ -32,8 +34,9 @@ block0(v0: i16, v1: i16):
function %i32(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = srem.i32 v0, v1
; check: movq %rdi, %rax
; nextln: movl $$0, %edx
; check: xorl %r11d, %r11d, %r11d
; nextln: movq %rdi, %rax
; nextln: movq %r11, %rdx
; nextln: srem_seq %eax, %edx, %esi, %eax, %edx, tmp=(none)
; nextln: movq %rdx, %rax
@ -43,8 +46,9 @@ block0(v0: i32, v1: i32):
function %i64(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = srem.i64 v0, v1
; check: movq %rdi, %rax
; nextln: movl $$0, %edx
; check: xorl %r11d, %r11d, %r11d
; nextln: movq %rdi, %rax
; nextln: movq %r11, %rdx
; nextln: srem_seq %rax, %rdx, %rsi, %rax, %rdx, tmp=(none)
; nextln: movq %rdx, %rax

50
cranelift/filetests/filetests/isa/x64/fcvt.clif

@ -146,16 +146,16 @@ block0(v0: i8, v1: i16, v2: i32, v3: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzbq %dil, %rax
; cvtsi2ss %rax, %xmm0
; movzwq %si, %rax
; cvtsi2ss %rax, %xmm6
; movl %edx, %eax
; cvtsi2ss %rax, %xmm7
; u64_to_f32_seq %rcx, %xmm4, %r8, %rdx
; movzbq %dil, %rdi
; cvtsi2ss %rdi, %xmm0
; movzwq %si, %rdi
; cvtsi2ss %rdi, %xmm5
; movl %edx, %edi
; cvtsi2ss %rdi, %xmm6
; u64_to_f32_seq %rcx, %xmm2, %rdi, %rax
; addss %xmm0, %xmm5, %xmm0
; addss %xmm0, %xmm6, %xmm0
; addss %xmm0, %xmm7, %xmm0
; addss %xmm0, %xmm4, %xmm0
; addss %xmm0, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@ -209,7 +209,7 @@ block0(v0: f32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float32_to_uint32_seq %xmm0, %eax, %r10, %xmm6
; cvt_float32_to_uint32_seq %xmm0, %eax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -223,7 +223,7 @@ block0(v0: f32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float32_to_uint64_seq %xmm0, %rax, %r10, %xmm6
; cvt_float32_to_uint64_seq %xmm0, %rax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -237,7 +237,7 @@ block0(v0: f64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float64_to_uint32_seq %xmm0, %eax, %r10, %xmm6
; cvt_float64_to_uint32_seq %xmm0, %eax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -251,7 +251,7 @@ block0(v0: f64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float64_to_uint64_seq %xmm0, %rax, %r10, %xmm6
; cvt_float64_to_uint64_seq %xmm0, %rax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -265,7 +265,7 @@ block0(v0: f32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float32_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6
; cvt_float32_to_uint32_sat_seq %xmm0, %eax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -279,7 +279,7 @@ block0(v0: f32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float32_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6
; cvt_float32_to_uint64_sat_seq %xmm0, %rax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -293,7 +293,7 @@ block0(v0: f64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float64_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6
; cvt_float64_to_uint32_sat_seq %xmm0, %eax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -307,7 +307,7 @@ block0(v0: f64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float64_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6
; cvt_float64_to_uint64_sat_seq %xmm0, %rax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -321,7 +321,7 @@ block0(v0: f32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float32_to_sint32_seq %xmm0, %eax, %r10, %xmm6
; cvt_float32_to_sint32_seq %xmm0, %eax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -335,7 +335,7 @@ block0(v0: f32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float32_to_sint64_seq %xmm0, %rax, %r10, %xmm6
; cvt_float32_to_sint64_seq %xmm0, %rax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -349,7 +349,7 @@ block0(v0: f64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float64_to_sint32_seq %xmm0, %eax, %r10, %xmm6
; cvt_float64_to_sint32_seq %xmm0, %eax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -363,7 +363,7 @@ block0(v0: f64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float64_to_sint64_seq %xmm0, %rax, %r10, %xmm6
; cvt_float64_to_sint64_seq %xmm0, %rax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -377,7 +377,7 @@ block0(v0: f32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float32_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6
; cvt_float32_to_sint32_sat_seq %xmm0, %eax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -391,7 +391,7 @@ block0(v0: f32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float32_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6
; cvt_float32_to_sint64_sat_seq %xmm0, %rax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -405,7 +405,7 @@ block0(v0: f64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float64_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6
; cvt_float64_to_sint32_sat_seq %xmm0, %eax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret
@ -419,7 +419,7 @@ block0(v0: f64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvt_float64_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6
; cvt_float64_to_sint64_sat_seq %xmm0, %rax, %r8, %xmm4
; movq %rbp, %rsp
; popq %rbp
; ret

8
cranelift/filetests/filetests/isa/x64/sdiv.clif

@ -11,8 +11,9 @@ block0(v0: i8, v1: i8):
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; cbw %al, %dl
; idiv %al, (none), %sil, %al, %dl
; cbw %al, %al
; movq %rax, %rdi
; idiv %al, (none), %sil, %al, (none)
; movq %rbp, %rsp
; popq %rbp
; ret
@ -28,6 +29,7 @@ block0(v0: i16, v1: i16):
; block0:
; movq %rdi, %rax
; cwd %ax, %dx
; movq %rdx, %r8
; idiv %ax, %dx, %si, %ax, %dx
; movq %rbp, %rsp
; popq %rbp
@ -44,6 +46,7 @@ block0(v0: i32, v1: i32):
; block0:
; movq %rdi, %rax
; cdq %eax, %edx
; movq %rdx, %r8
; idiv %eax, %edx, %esi, %eax, %edx
; movq %rbp, %rsp
; popq %rbp
@ -60,6 +63,7 @@ block0(v0: i64, v1: i64):
; block0:
; movq %rdi, %rax
; cqo %rax, %rdx
; movq %rdx, %r8
; idiv %rax, %rdx, %rsi, %rax, %rdx
; movq %rbp, %rsp
; popq %rbp

19
cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif

@ -12,9 +12,10 @@ block0(v0: i8x16, v1: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm9
; movdqa %xmm0, %xmm6
; load_const VCodeConstant(0), %xmm0
; vpermi2b %xmm1, %xmm0, %xmm9
; movdqa %xmm6, %xmm8
; vpermi2b %xmm1, %xmm8, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@ -31,11 +32,12 @@ block0(v0: i8x16, v1: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm12
; movdqa %xmm0, %xmm9
; load_const VCodeConstant(1), %xmm0
; load_const VCodeConstant(0), %xmm7
; vpermi2b %xmm1, %xmm7, %xmm12
; andps %xmm0, %xmm7, %xmm0
; load_const VCodeConstant(0), %xmm8
; movdqa %xmm9, %xmm11
; vpermi2b %xmm1, %xmm11, %xmm8, %xmm8
; andps %xmm0, %xmm8, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@ -49,9 +51,10 @@ block0(v0: i8x16, v1: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm9
; movdqa %xmm0, %xmm6
; load_const VCodeConstant(0), %xmm0
; vpermi2b %xmm1, %xmm0, %xmm9
; movdqa %xmm6, %xmm8
; vpermi2b %xmm1, %xmm8, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

12
cranelift/filetests/filetests/isa/x64/srem.clif

@ -10,8 +10,9 @@ block0(v0: i8, v1: i8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; xorl %r11d, %r11d, %r11d
; movq %rdi, %rax
; movl $0, %edx
; movq %r11, %rdx
; srem_seq %al, %dl, %sil, %al, %dl, tmp=(none)
; shrq $8, %rax, %rax
; movq %rbp, %rsp
@ -27,8 +28,9 @@ block0(v0: i16, v1: i16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; xorl %r11d, %r11d, %r11d
; movq %rdi, %rax
; movl $0, %edx
; movq %r11, %rdx
; srem_seq %ax, %dx, %si, %ax, %dx, tmp=(none)
; movq %rdx, %rax
; movq %rbp, %rsp
@ -44,8 +46,9 @@ block0(v0: i32, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; xorl %r11d, %r11d, %r11d
; movq %rdi, %rax
; movl $0, %edx
; movq %r11, %rdx
; srem_seq %eax, %edx, %esi, %eax, %edx, tmp=(none)
; movq %rdx, %rax
; movq %rbp, %rsp
@ -61,8 +64,9 @@ block0(v0: i64, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; xorl %r11d, %r11d, %r11d
; movq %rdi, %rax
; movl $0, %edx
; movq %r11, %rdx
; srem_seq %rax, %rdx, %rsi, %rax, %rdx, tmp=(none)
; movq %rdx, %rax
; movq %rbp, %rsp

15
cranelift/filetests/filetests/isa/x64/udiv.clif

@ -10,9 +10,9 @@ block0(v0: i8, v1: i8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movzbl %al, %eax
; div %al, (none), %sil, %al, %dl
; movzbl %dil, %r10d
; movq %r10, %rax
; div %al, (none), %sil, %al, (none)
; movq %rbp, %rsp
; popq %rbp
; ret
@ -26,8 +26,9 @@ block0(v0: i16, v1: i16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $0, %r11d
; movq %rdi, %rax
; movl $0, %edx
; movq %r11, %rdx
; div %ax, %dx, %si, %ax, %dx
; movq %rbp, %rsp
; popq %rbp
@ -42,8 +43,9 @@ block0(v0: i32, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $0, %r11d
; movq %rdi, %rax
; movl $0, %edx
; movq %r11, %rdx
; div %eax, %edx, %esi, %eax, %edx
; movq %rbp, %rsp
; popq %rbp
@ -58,8 +60,9 @@ block0(v0: i64, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $0, %r11d
; movq %rdi, %rax
; movl $0, %edx
; movq %r11, %rdx
; div %rax, %rdx, %rsi, %rax, %rdx
; movq %rbp, %rsp
; popq %rbp

15
cranelift/filetests/filetests/isa/x64/urem.clif

@ -10,9 +10,9 @@ block0(v0: i8, v1: i8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movzbl %al, %eax
; div %al, (none), %sil, %al, %dl
; movzbl %dil, %r10d
; movq %r10, %rax
; div %al, (none), %sil, %al, (none)
; shrq $8, %rax, %rax
; movq %rbp, %rsp
; popq %rbp
@ -27,8 +27,9 @@ block0(v0: i16, v1: i16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $0, %r11d
; movq %rdi, %rax
; movl $0, %edx
; movq %r11, %rdx
; div %ax, %dx, %si, %ax, %dx
; movq %rdx, %rax
; movq %rbp, %rsp
@ -44,8 +45,9 @@ block0(v0: i32, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $0, %r11d
; movq %rdi, %rax
; movl $0, %edx
; movq %r11, %rdx
; div %eax, %edx, %esi, %eax, %edx
; movq %rdx, %rax
; movq %rbp, %rsp
@ -61,8 +63,9 @@ block0(v0: i64, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $0, %r11d
; movq %rdi, %rax
; movl $0, %edx
; movq %r11, %rdx
; div %rax, %rdx, %rsi, %rax, %rdx
; movq %rdx, %rax
; movq %rbp, %rsp

Loading…
Cancel
Save