
cranelift: Support callee-saved registers with tail calls on x64 (#8246)

* Add GrowFrame and ShrinkFrame instructions for moving the frame

Co-authored-by: Jamey Sharp <jsharp@fastly.com>

* Experimentally emit grow/shrink frame instructions for x64 tail calls

Co-authored-by: Jamey Sharp <jsharp@fastly.com>

* Reuse the epilogue generation functions for tail call emission

Instead of building and copying the new frame over the old one, make use
of the frame shrink/grow pseudo-instructions to move the frame, and then
reuse the existing epilogue generation functions to set up the tail call
(a sketch of this strategy follows the commit trailers below).

Co-authored-by: Jamey Sharp <jsharp@fastly.com>

* Enable callee saves with the tail calling convention on x64

Co-authored-by: Jamey Sharp <jsharp@fastly.com>

* Remove the requirement that indirect calls go through r15 with the tail cc

* Stop using r14 for a temporary during the stack check with the tail cc

* Apply suggestions from code review

Co-authored-by: Jamey Sharp <jamey@minilop.net>

* Remove constants in favor of reusing values computed for FrameLayout

Co-authored-by: Jamey Sharp <jsharp@fastly.com>

* Suggestions from review

* Rename the grow/shrink frame instructions, and adjust their comments

* Comments on ArgLoc

* Add more tests for return_call, and fix grow/shrink arg area printing

---------

Co-authored-by: Jamey Sharp <jsharp@fastly.com>
Co-authored-by: Jamey Sharp <jamey@minilop.net>
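
The heart of the new strategy, as a minimal sketch in plain Rust (the helper names here are stand-ins, not real functions; the actual code is `X64CallSite::emit_return_call` in cranelift/codegen/src/isa/x64/abi.rs, shown in the diff below): compare the callee's and caller's stack-argument sizes, slide the frame with a single grow/shrink pseudo-instruction only if they differ, then fall into the ordinary epilogue path and jump.

use std::cmp::Ordering;

// Stand-in for emitting Inst::ShrinkArgumentArea { amount, .. }.
fn emit_shrink_argument_area(amount: u32) {
    println!("shrink_argument_area {amount}");
}

// Stand-in for emitting Inst::GrowArgumentArea { amount, .. }.
fn emit_grow_argument_area(amount: u32) {
    println!("grow_argument_area {amount}");
}

fn emit_return_call(new_stack_arg_size: u32, old_stack_arg_size: u32) {
    // Slide the frame only when the callee's stack-argument area differs
    // in size from the caller's; otherwise the frame is reused in place.
    match new_stack_arg_size.cmp(&old_stack_arg_size) {
        Ordering::Equal => {}
        Ordering::Less => emit_shrink_argument_area(old_stack_arg_size - new_stack_arg_size),
        Ordering::Greater => emit_grow_argument_area(new_stack_arg_size - old_stack_arg_size),
    }
    // ...then place the arguments, run the normal epilogue (restoring
    // callee-saved registers and popping the frame), and jump to the callee.
}

fn main() {
    // Caller has 32 bytes of stack arguments; the callee needs only 16.
    emit_return_call(16, 32);
}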
pull/8251/head
Trevor Elliott authored 7 months ago, committed by GitHub
commit a4613829ae
16 changed files:

  1. cranelift/codegen/src/isa/aarch64/lower/isle.rs (3 lines changed)
  2. cranelift/codegen/src/isa/riscv64/lower/isle.rs (5 lines changed)
  3. cranelift/codegen/src/isa/x64/abi.rs (127 lines changed)
  4. cranelift/codegen/src/isa/x64/inst.isle (17 lines changed)
  5. cranelift/codegen/src/isa/x64/inst/emit.rs (285 lines changed)
  6. cranelift/codegen/src/isa/x64/inst/emit_state.rs (9 lines changed)
  7. cranelift/codegen/src/isa/x64/inst/mod.rs (105 lines changed)
  8. cranelift/codegen/src/isa/x64/lower/isle.rs (1 line changed)
  9. cranelift/codegen/src/isa/x64/pcc.rs (2 lines changed)
  10. cranelift/codegen/src/machinst/abi.rs (69 lines changed)
  11. cranelift/codegen/src/machinst/isle.rs (1 line changed)
  12. cranelift/filetests/filetests/isa/x64/fuzzbug-60035.clif (52 lines changed)
  13. cranelift/filetests/filetests/isa/x64/return-call-indirect.clif (64 lines changed)
  14. cranelift/filetests/filetests/isa/x64/return-call.clif (680 lines changed)
  15. cranelift/filetests/filetests/isa/x64/tail-call-conv.clif (1048 lines changed)
  16. cranelift/filetests/filetests/isa/x64/tail-stack-limit.clif (31 lines changed)

cranelift/codegen/src/isa/aarch64/lower/isle.rs (3 lines changed)

@ -100,6 +100,9 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
self.lower_ctx.sigs(),
callee_sig,
&callee,
// TODO: this should be Opcode::ReturnCall, once aarch64 has been ported to the new
// tail call strategy.
Opcode::Call,
distance,
caller_conv,
self.backend.flags().clone(),

cranelift/codegen/src/isa/riscv64/lower/isle.rs (5 lines changed)

@ -18,7 +18,7 @@ use crate::machinst::{VCodeConstant, VCodeConstantData};
use crate::{
ir::{
immediates::*, types::*, AtomicRmwOp, BlockCall, ExternalName, Inst, InstructionData,
MemFlags, StackSlot, TrapCode, Value, ValueList,
MemFlags, Opcode, StackSlot, TrapCode, Value, ValueList,
},
isa::riscv64::inst::*,
machinst::{ArgPair, InstOutput},
@ -82,6 +82,9 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend>
self.lower_ctx.sigs(),
callee_sig,
&callee,
// TODO: this should be Opcode::ReturnCall, once riscv64 has been ported to the new
// tail call strategy.
Opcode::Call,
distance,
caller_conv,
self.backend.flags().clone(),

cranelift/codegen/src/isa/x64/abi.rs (127 lines changed)

@ -53,29 +53,20 @@ impl X64ABIMachineSpec {
fn gen_probestack_loop(
insts: &mut SmallInstVec<Inst>,
call_conv: isa::CallConv,
_call_conv: isa::CallConv,
frame_size: u32,
guard_size: u32,
) {
// We have to use a caller-saved register since clobbering only
// happens after stack probing.
let tmp = match call_conv {
// All registers are caller-saved on the `tail` calling convention,
// and `r15` is not used to pass arguments.
isa::CallConv::Tail => regs::r15(),
// `r11` is caller saved on both Fastcall and SystemV, and not used
// for argument passing, so it's pretty much free. It is also not
// used by the stacklimit mechanism.
_ => {
let tmp = regs::r11();
debug_assert!({
let real_reg = tmp.to_real_reg().unwrap();
!is_callee_save_systemv(real_reg, false)
&& !is_callee_save_fastcall(real_reg, false)
!is_callee_save_systemv(real_reg, false) && !is_callee_save_fastcall(real_reg, false)
});
tmp
}
};
insts.push(Inst::StackProbeLoop {
tmp: Writable::from_reg(tmp),
@ -439,20 +430,15 @@ impl ABIMachineSpec for X64ABIMachineSpec {
Inst::lea(mem, into_reg)
}
fn get_stacklimit_reg(call_conv: isa::CallConv) -> Reg {
fn get_stacklimit_reg(_call_conv: isa::CallConv) -> Reg {
// As per comment on trait definition, we must return a caller-save
// register that is not used as an argument here.
match call_conv {
isa::CallConv::Tail => regs::r14(),
_ => {
debug_assert!(!is_callee_save_systemv(
regs::r10().to_real_reg().unwrap(),
false
));
regs::r10()
}
}
}
fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Self::I {
// Only ever used for I64s and vectors; if that changes, see if the
@ -595,14 +581,10 @@ impl ABIMachineSpec for X64ABIMachineSpec {
}
fn gen_clobber_save(
call_conv: isa::CallConv,
_call_conv: isa::CallConv,
flags: &settings::Flags,
frame_layout: &FrameLayout,
) -> SmallVec<[Self::I; 16]> {
if call_conv == isa::CallConv::Tail {
assert!(frame_layout.clobbered_callee_saves.is_empty());
}
let mut insts = SmallVec::new();
if flags.unwind_info() && frame_layout.setup_area_size > 0 {
@ -857,7 +839,6 @@ impl ABIMachineSpec for X64ABIMachineSpec {
fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> PRegSet {
match call_conv_of_callee {
isa::CallConv::Tail => ALL_CLOBBERS,
isa::CallConv::Winch => ALL_CLOBBERS,
_ if call_conv_of_callee.extends_windows_fastcall() => WINDOWS_CLOBBERS,
_ => SYSV_CLOBBERS,
@ -882,13 +863,10 @@ impl ABIMachineSpec for X64ABIMachineSpec {
outgoing_args_size: u32,
) -> FrameLayout {
let mut regs: Vec<Writable<RealReg>> = match call_conv {
// The `tail` calling convention doesn't have any callee-save
// registers.
CallConv::Tail => vec![],
// The `winch` calling convention doesn't have any callee-save
// registers.
CallConv::Winch => vec![],
CallConv::Fast | CallConv::Cold | CallConv::SystemV => regs
CallConv::Fast | CallConv::Cold | CallConv::SystemV | CallConv::Tail => regs
.iter()
.cloned()
.filter(|r| is_callee_save_systemv(r.to_reg(), flags.enable_pinned_reg()))
@ -926,46 +904,36 @@ impl ABIMachineSpec for X64ABIMachineSpec {
impl X64CallSite {
pub fn emit_return_call(mut self, ctx: &mut Lower<Inst>, args: isle::ValueSlice) {
let (new_stack_arg_size, old_stack_arg_size) =
self.emit_temporary_tail_call_frame(ctx, args);
// Make a copy of the frame pointer, since we use it when copying down
// the new stack frame.
let fp = ctx.temp_writable_gpr();
let rbp = PReg::from(regs::rbp().to_real_reg().unwrap());
ctx.emit(Inst::MovFromPReg { src: rbp, dst: fp });
// Load the return address, because copying our new stack frame
// over our current stack frame might overwrite it, and we'll need to
// place it in the correct location after we do that copy.
//
// But we only need to actually move the return address if the size of
// stack arguments changes.
let ret_addr = if new_stack_arg_size != old_stack_arg_size {
let ret_addr = ctx.temp_writable_gpr();
ctx.emit(Inst::Mov64MR {
src: SyntheticAmode::Real(Amode::ImmReg {
simm32: 8,
base: *fp.to_reg(),
flags: MemFlags::trusted(),
}),
dst: ret_addr,
let new_stack_arg_size =
u32::try_from(self.sig(ctx.sigs()).sized_stack_arg_space()).unwrap();
let old_stack_arg_size = ctx.abi().stack_args_size(ctx.sigs());
match new_stack_arg_size.cmp(&old_stack_arg_size) {
core::cmp::Ordering::Equal => {}
core::cmp::Ordering::Less => {
let tmp = ctx.temp_writable_gpr();
ctx.emit(Inst::ShrinkArgumentArea {
amount: old_stack_arg_size - new_stack_arg_size,
tmp,
});
Some(ret_addr.to_reg())
} else {
None
};
}
core::cmp::Ordering::Greater => {
let tmp = ctx.temp_writable_gpr();
ctx.emit(Inst::GrowArgumentArea {
amount: new_stack_arg_size - old_stack_arg_size,
tmp,
});
}
}
// Finally, emit the macro instruction to copy the new stack frame over
// our current one and do the actual tail call!
// Put all arguments in registers and stack slots (within that newly
// allocated stack space).
self.emit_args(ctx, args);
self.emit_stack_ret_arg_for_tail_call(ctx);
// Finally, do the actual tail call!
let dest = self.dest().clone();
let info = Box::new(ReturnCallInfo {
new_stack_arg_size,
old_stack_arg_size,
ret_addr,
fp: fp.to_reg(),
tmp: ctx.temp_writable_gpr(),
uses: self.take_uses(),
});
match dest {
@ -1029,25 +997,6 @@ impl From<StackAMode> for SyntheticAmode {
fn get_intreg_for_arg(call_conv: &CallConv, idx: usize, arg_idx: usize) -> Option<Reg> {
let is_fastcall = call_conv.extends_windows_fastcall();
if *call_conv == isa::CallConv::Tail {
return match idx {
0 => Some(regs::rax()),
1 => Some(regs::rcx()),
2 => Some(regs::rdx()),
3 => Some(regs::rbx()),
4 => Some(regs::rsi()),
5 => Some(regs::rdi()),
6 => Some(regs::r8()),
7 => Some(regs::r9()),
8 => Some(regs::r10()),
9 => Some(regs::r11()),
// NB: `r12`, `r13`, `r14` and `r15` are reserved for indirect
// callee addresses and temporaries required for our tail call
// sequence (fp, ret_addr, tmp).
_ => None,
};
}
// Fastcall counts by absolute argument number; SysV counts by argument of
// this (integer) class.
let i = if is_fastcall { arg_idx } else { idx };
@ -1100,16 +1049,12 @@ fn get_intreg_for_retval(
0 => Some(regs::rax()),
1 => Some(regs::rcx()),
2 => Some(regs::rdx()),
3 => Some(regs::rbx()),
4 => Some(regs::rsi()),
5 => Some(regs::rdi()),
6 => Some(regs::r8()),
7 => Some(regs::r9()),
8 => Some(regs::r10()),
9 => Some(regs::r11()),
10 => Some(regs::r12()),
11 => Some(regs::r13()),
12 => Some(regs::r14()),
3 => Some(regs::rsi()),
4 => Some(regs::rdi()),
5 => Some(regs::r8()),
6 => Some(regs::r9()),
7 => Some(regs::r10()),
8 => Some(regs::r11()),
// NB: `r15` is reserved as a scratch register.
_ => None,
},

cranelift/codegen/src/isa/x64/inst.isle (17 lines changed)

@ -541,6 +541,23 @@
(ReturnCallUnknown (callee RegMem)
(info BoxReturnCallInfo))
;; GrowArgumentArea does a memmove of everything in the frame except for
;; the argument area, to make room for more arguments. That includes all
;; the stack slots, the callee-saved registers, and the saved FP and
;; return address. To keep the stack pointers in sync with that change,
;; it also subtracts the given amount from both the FP and SP registers.
(GrowArgumentArea (amount u32)
(tmp WritableGpr))
;; ShrinkArgumentArea does a memmove of everything in the frame except
;; for the argument area, to trim space for fewer arguments. That
;; includes all the stack slots, the callee-saved registers, and the
;; saved FP and return address. To keep the stack pointers in sync with
;; that change, it also adds the given amount to both the FP and SP
;; registers.
(ShrinkArgumentArea (amount u32)
(tmp WritableGpr))
;; A pseudo-instruction that captures register arguments in vregs.
(Args
(args VecArgPair))
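
As a reading aid for the two instruction comments above, here is an illustrative before/after layout for GrowArgumentArea (higher addresses at the top; this diagram is ours, not from the commit):

// Before GrowArgumentArea { amount }     After
//
// | incoming stack args |                | stack args, now     |
// | return address      |                |   `amount` larger   |
// | saved FP            | <- FP          | return address      |
// | slots + clobbers    | <- SP          | saved FP            | <- FP - amount
//                                        | slots + clobbers    | <- SP - amount
//
// Everything below the argument area is memmoved down by `amount`;
// ShrinkArgumentArea is the exact mirror image, moving it back up.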

cranelift/codegen/src/isa/x64/inst/emit.rs (285 lines changed)

@ -1,6 +1,6 @@
use crate::ir;
use crate::ir::immediates::{Ieee32, Ieee64};
use crate::ir::{KnownSymbol, MemFlags};
use crate::ir::KnownSymbol;
use crate::isa::x64::encoding::evex::{EvexInstruction, EvexVectorLength, RegisterOrAmode};
use crate::isa::x64::encoding::rex::{
emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc,
@ -1628,18 +1628,7 @@ pub(crate) fn emit(
callee,
info: call_info,
} => {
emit_return_call_common_sequence(
allocs,
sink,
info,
state,
call_info.new_stack_arg_size,
call_info.old_stack_arg_size,
call_info.ret_addr,
call_info.fp,
call_info.tmp,
&call_info.uses,
);
emit_return_call_common_sequence(allocs, sink, info, state, &call_info.uses);
// Finally, jump to the callee!
//
@ -1660,18 +1649,7 @@ pub(crate) fn emit(
} => {
let callee = callee.with_allocs(allocs);
emit_return_call_common_sequence(
allocs,
sink,
info,
state,
call_info.new_stack_arg_size,
call_info.old_stack_arg_size,
call_info.ret_addr,
call_info.fp,
call_info.tmp,
&call_info.uses,
);
emit_return_call_common_sequence(allocs, sink, info, state, &call_info.uses);
Inst::JmpUnknown { target: callee }.emit(&[], sink, info, state);
sink.add_call_site(ir::Opcode::ReturnCallIndirect);
@ -1722,6 +1700,130 @@ pub(crate) fn emit(
}
}
Inst::GrowArgumentArea { amount, tmp } => {
debug_assert!(*amount > 0);
debug_assert_eq!(*amount % 8, 0);
assert!(
info.flags.preserve_frame_pointers(),
"frame pointers must be enabled for GrowArgumentArea"
);
let tmp = allocs.next(tmp.to_reg().to_reg());
let tmp = Gpr::new(tmp).unwrap();
let tmp_w = WritableGpr::from_reg(tmp);
// As we're increasing the number of stack arguments, we need to move the frame down in
// memory, by decrementing SP by `amount` and looping from lower addresses to higher
// ones, copying down.
// Decrement SP and FP by `amount`
Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::imm(*amount),
Writable::from_reg(regs::rsp()),
)
.emit(&[], sink, info, state);
Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::imm(*amount),
Writable::from_reg(regs::rbp()),
)
.emit(&[], sink, info, state);
// The total size that we're going to copy, including the return address and frame
// pointer that are pushed on the stack already.
let size = i32::try_from(state.nominal_sp_to_fp()).unwrap()
+ i32::try_from(state.frame_layout().setup_area_size).unwrap();
debug_assert_eq!(size % 8, 0);
// Copy the `i`th word in the stack from `SP + amount + i * 8` to `SP + i * 8`. Do this
// from lower to higher addresses to avoid clobbering words we haven't copied yet.
for sp_word_offset in 0..(size / 8) {
let sp_byte_offset = sp_word_offset * 8;
Inst::Mov64MR {
src: SyntheticAmode::nominal_sp_offset(
sp_byte_offset + i32::try_from(*amount).unwrap(),
),
dst: tmp_w,
}
.emit(&[], sink, info, state);
Inst::MovRM {
size: OperandSize::Size64,
src: tmp,
dst: SyntheticAmode::nominal_sp_offset(sp_byte_offset),
}
.emit(&[], sink, info, state);
}
}
Inst::ShrinkArgumentArea { amount, tmp } => {
debug_assert!(*amount > 0);
debug_assert_eq!(*amount % 8, 0);
assert!(
info.flags.preserve_frame_pointers(),
"frame pointers must be enabled for ShrinkArgumentArea"
);
let tmp = allocs.next(tmp.to_reg().to_reg());
let tmp = Gpr::new(tmp).unwrap();
let tmp_w = WritableGpr::from_reg(tmp);
// As we're decreasing the number of stack arguments, we need to move the frame up in
// memory, looping from higher addresses to lower ones copying up, and finally
// incrementing `SP` by `amount`.
// The total size that we're going to copy, including the return address and frame
// pointer that are pushed on the stack already.
let size = i32::try_from(state.nominal_sp_to_fp()).unwrap()
+ i32::try_from(state.frame_layout().setup_area_size).unwrap();
debug_assert_eq!(size % 8, 0);
// Copy the `i`th word in the stack from `SP + i * 8` to `SP + amount + i * 8`. Do this
// from higher to lower addresses to avoid clobbering words we haven't copied yet.
for sp_word_offset in (0..(size / 8)).rev() {
let sp_byte_offset = sp_word_offset * 8;
Inst::Mov64MR {
src: SyntheticAmode::nominal_sp_offset(sp_byte_offset),
dst: tmp_w,
}
.emit(&[], sink, info, state);
Inst::MovRM {
size: OperandSize::Size64,
src: tmp,
dst: SyntheticAmode::nominal_sp_offset(
sp_byte_offset + i32::try_from(*amount).unwrap(),
),
}
.emit(&[], sink, info, state);
}
// Increment SP by `amount`
Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::imm(*amount),
Writable::from_reg(regs::rsp()),
)
.emit(&[], sink, info, state);
// Increment FP by `amount`
Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::imm(*amount),
Writable::from_reg(regs::rbp()),
)
.emit(&[], sink, info, state);
}
Inst::Args { .. } => {}
Inst::Rets { .. } => {}
@ -4252,11 +4354,6 @@ fn emit_return_call_common_sequence(
sink: &mut MachBuffer<Inst>,
info: &EmitInfo,
state: &mut EmitState,
new_stack_arg_size: u32,
old_stack_arg_size: u32,
ret_addr: Option<Gpr>,
fp: Gpr,
tmp: WritableGpr,
uses: &CallArgList,
) {
assert!(
@ -4269,124 +4366,18 @@ fn emit_return_call_common_sequence(
let _ = allocs.next(u.vreg);
}
let ret_addr = ret_addr.map(|r| Gpr::new(allocs.next(*r)).unwrap());
let fp = allocs.next(*fp);
let tmp = allocs.next(tmp.to_reg().to_reg());
let tmp = Gpr::new(tmp).unwrap();
let tmp_w = WritableGpr::from_reg(tmp);
// Copy the new frame (which is `frame_size` bytes above the SP)
// onto our current frame, using only volatile, non-argument
// registers.
//
// The current stack layout is the following:
//
// | ... |
// +---------------------+
// | ... |
// | stack arguments |
// | ... |
// current | return address |
// frame | old FP | <-- FP
// | ... |
// | old stack slots |
// | ... |
// +---------------------+
// | ... |
// new | new stack arguments |
// frame | ... | <-- SP
// +---------------------+
//
// We need to restore the old FP, copy the new stack arguments over the old
// stack arguments, write the return address into the correct slot just
// after the new stack arguments, adjust SP to point to the new return
// address, and then jump to the callee (which will push the old FP again).
// Restore the old FP into `rbp`.
Inst::Mov64MR {
src: SyntheticAmode::Real(Amode::ImmReg {
simm32: 0,
base: fp,
flags: MemFlags::trusted(),
}),
dst: Writable::from_reg(Gpr::new(regs::rbp()).unwrap()),
}
.emit(&[], sink, info, state);
// The new lowest address (top of stack) -- relative to FP -- for
// our tail callee. We compute this now so that we can move our
// stack arguments into place.
let callee_sp_relative_to_fp = i64::from(old_stack_arg_size) - i64::from(new_stack_arg_size);
// Copy over each word, using `tmp` as a temporary register.
//
// Note that we have to do this from stack slots with the highest
// address to lowest address because in the case of when the tail
// callee has more stack arguments than we do, we might otherwise
// overwrite some of our stack arguments before they've been copied
// into place.
assert_eq!(
new_stack_arg_size % 8,
0,
"stack argument space sizes should always be 8-byte aligned"
);
for i in (0..new_stack_arg_size / 8).rev() {
Inst::Mov64MR {
src: SyntheticAmode::Real(Amode::ImmReg {
simm32: (i * 8).try_into().unwrap(),
base: regs::rsp(),
flags: MemFlags::trusted(),
}),
dst: tmp_w,
}
.emit(&[], sink, info, state);
Inst::MovRM {
size: OperandSize::Size64,
src: tmp,
dst: SyntheticAmode::Real(Amode::ImmReg {
// Add 2 because we need to skip over the old FP and the
// return address.
simm32: (callee_sp_relative_to_fp + i64::from((i + 2) * 8))
.try_into()
.unwrap(),
base: fp,
flags: MemFlags::trusted(),
}),
}
.emit(&[], sink, info, state);
}
// Initialize SP for the tail callee, deallocating the temporary
// stack arguments space at the same time.
Inst::LoadEffectiveAddress {
size: OperandSize::Size64,
addr: SyntheticAmode::Real(Amode::ImmReg {
// NB: We add a word to `callee_sp_relative_to_fp` here because the
// callee will push FP, not us.
simm32: callee_sp_relative_to_fp.wrapping_add(8).try_into().unwrap(),
base: fp,
flags: MemFlags::trusted(),
}),
dst: Writable::from_reg(Gpr::new(regs::rsp()).unwrap()),
for inst in
X64ABIMachineSpec::gen_clobber_restore(CallConv::Tail, &info.flags, state.frame_layout())
{
inst.emit(&[], sink, info, state);
}
.emit(&[], sink, info, state);
state.adjust_virtual_sp_offset(-i64::from(new_stack_arg_size));
// Write the return address into the correct stack slot.
if let Some(ret_addr) = ret_addr {
Inst::MovRM {
size: OperandSize::Size64,
src: ret_addr,
dst: SyntheticAmode::Real(Amode::ImmReg {
simm32: 0,
base: regs::rsp(),
flags: MemFlags::trusted(),
}),
}
.emit(&[], sink, info, state);
for inst in X64ABIMachineSpec::gen_epilogue_frame_restore(
CallConv::Tail,
&info.flags,
&info.isa_flags,
state.frame_layout(),
) {
inst.emit(&[], sink, info, state);
}
}
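
The copy-direction comments in the two emit arms above are the crux: source and destination ranges overlap, so the word-by-word copy must run away from the overlap. A standalone model in plain Rust (indices stand in for stack words, index 0 being the lowest address; hypothetical, purely to illustrate the invariant):

// GrowArgumentArea direction: the frame slides toward lower addresses,
// so copy ascending; each write only clobbers an already-copied word.
fn grow_copy(stack: &mut [u64], frame_words: usize, amount_words: usize) {
    for i in 0..frame_words {
        stack[i] = stack[i + amount_words];
    }
}

// ShrinkArgumentArea direction: the frame slides toward higher addresses,
// so copy descending for the same reason.
fn shrink_copy(stack: &mut [u64], frame_words: usize, amount_words: usize) {
    for i in (0..frame_words).rev() {
        stack[i + amount_words] = stack[i];
    }
}

fn main() {
    let mut stack = vec![0, 10, 20, 30, 40]; // frame is [10, 20, 30, 40]
    grow_copy(&mut stack, 4, 1);
    assert_eq!(&stack[0..4], [10, 20, 30, 40]);

    let mut stack = vec![10, 20, 30, 40, 0];
    shrink_copy(&mut stack, 4, 1);
    assert_eq!(&stack[1..5], [10, 20, 30, 40]);
}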

cranelift/codegen/src/isa/x64/inst/emit_state.rs (9 lines changed)

@ -14,6 +14,10 @@ pub struct EmitState {
/// Only used during fuzz-testing. Otherwise, it is a zero-sized struct and
/// optimized away at compiletime. See [cranelift_control].
ctrl_plane: ControlPlane,
/// A copy of the frame layout, used during the emission of `Inst::ReturnCallKnown` and
/// `Inst::ReturnCallUnknown` instructions.
frame_layout: FrameLayout,
}
impl MachInstEmitState<Inst> for EmitState {
@ -23,6 +27,7 @@ impl MachInstEmitState<Inst> for EmitState {
nominal_sp_to_fp: abi.frame_size() as i64,
stack_map: None,
ctrl_plane,
frame_layout: abi.frame_layout().clone(),
}
}
@ -62,4 +67,8 @@ impl EmitState {
pub(crate) fn nominal_sp_to_fp(&self) -> i64 {
self.nominal_sp_to_fp
}
pub(crate) fn frame_layout(&self) -> &FrameLayout {
&self.frame_layout
}
}

cranelift/codegen/src/isa/x64/inst/mod.rs (105 lines changed)

@ -52,20 +52,6 @@ pub struct CallInfo {
/// Out-of-line data for return-calls, to keep the size of `Inst` down.
#[derive(Clone, Debug)]
pub struct ReturnCallInfo {
/// The size of the new stack frame's stack arguments. This is necessary
/// for copying the frame over our current frame. It must already be
/// allocated on the stack.
pub new_stack_arg_size: u32,
/// The size of the current/old stack frame's stack arguments.
pub old_stack_arg_size: u32,
/// The return address. Needs to be written into the correct stack slot
/// after the new stack frame is copied into place.
pub ret_addr: Option<Gpr>,
/// A copy of the frame pointer, because we will overwrite the current
/// `rbp`.
pub fp: Gpr,
/// A temporary register.
pub tmp: WritableGpr,
/// The in-register arguments and their constraints.
pub uses: CallArgList,
}
@ -138,6 +124,8 @@ impl Inst {
| Inst::Pop64 { .. }
| Inst::Push64 { .. }
| Inst::StackProbeLoop { .. }
| Inst::GrowArgumentArea { .. }
| Inst::ShrinkArgumentArea { .. }
| Inst::Args { .. }
| Inst::Rets { .. }
| Inst::Ret { .. }
@ -1675,26 +1663,8 @@ impl PrettyPrint for Inst {
}
Inst::ReturnCallKnown { callee, info } => {
let ReturnCallInfo {
new_stack_arg_size,
old_stack_arg_size,
ret_addr,
fp,
tmp,
uses,
} = &**info;
let ret_addr = ret_addr.map(|r| regs::show_reg(*r));
let fp = regs::show_reg(fp.to_reg());
let tmp = regs::show_reg(tmp.to_reg().to_reg());
let mut s = format!(
"return_call_known \
{callee:?} \
new_stack_arg_size:{new_stack_arg_size} \
old_stack_arg_size:{old_stack_arg_size} \
ret_addr:{ret_addr:?} \
fp:{fp} \
tmp:{tmp}"
);
let ReturnCallInfo { uses } = &**info;
let mut s = format!("return_call_known {callee:?}");
for ret in uses {
let preg = regs::show_reg(ret.preg);
let vreg = pretty_print_reg(ret.vreg, 8, allocs);
@ -1704,27 +1674,9 @@ impl PrettyPrint for Inst {
}
Inst::ReturnCallUnknown { callee, info } => {
let ReturnCallInfo {
new_stack_arg_size,
old_stack_arg_size,
ret_addr,
fp,
tmp,
uses,
} = &**info;
let ReturnCallInfo { uses } = &**info;
let callee = callee.pretty_print(8, allocs);
let ret_addr = ret_addr.map(|r| regs::show_reg(*r));
let fp = regs::show_reg(fp.to_reg());
let tmp = regs::show_reg(tmp.to_reg().to_reg());
let mut s = format!(
"return_call_unknown \
{callee} \
new_stack_arg_size:{new_stack_arg_size} \
old_stack_arg_size:{old_stack_arg_size} \
ret_addr:{ret_addr:?} \
fp:{fp} \
tmp:{tmp}"
);
let mut s = format!("return_call_unknown {callee}");
for ret in uses {
let preg = regs::show_reg(ret.preg);
let vreg = pretty_print_reg(ret.vreg, 8, allocs);
@ -1733,6 +1685,18 @@ impl PrettyPrint for Inst {
s
}
Inst::GrowArgumentArea { amount, tmp } => {
let amount = *amount;
let tmp = pretty_print_reg(tmp.to_reg().to_reg(), 8, allocs);
format!("grow_argument_area {amount} {tmp}")
}
Inst::ShrinkArgumentArea { amount, tmp } => {
let amount = *amount;
let tmp = pretty_print_reg(tmp.to_reg().to_reg(), 8, allocs);
format!("shrink_argument_area {amount} {tmp}")
}
Inst::Args { args } => {
let mut s = "args".to_string();
for arg in args {
@ -2365,11 +2329,6 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
Inst::CallUnknown { info, dest, .. } => {
let info = info.as_ref().expect("CallInfo is expected in this path");
match dest {
RegMem::Reg { reg } if info.callee_conv == CallConv::Tail => {
// TODO(https://github.com/bytecodealliance/regalloc2/issues/145):
// This shouldn't be a fixed register constraint.
collector.reg_fixed_use(*reg, regs::r15())
}
RegMem::Reg { reg } if info.callee_conv == CallConv::Winch => {
// TODO(https://github.com/bytecodealliance/regalloc2/issues/145):
// This shouldn't be a fixed register constraint.
@ -2387,42 +2346,24 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
}
Inst::ReturnCallKnown { callee, info } => {
let ReturnCallInfo {
ret_addr,
fp,
tmp,
uses,
..
} = &**info;
let ReturnCallInfo { uses } = &**info;
// Same as in the `Inst::CallKnown` branch.
debug_assert_ne!(*callee, ExternalName::LibCall(LibCall::Probestack));
for u in uses {
collector.reg_fixed_use(u.vreg, u.preg);
}
if let Some(ret_addr) = ret_addr {
collector.reg_use(**ret_addr);
}
collector.reg_use(**fp);
collector.reg_early_def(tmp.to_writable_reg());
}
Inst::ReturnCallUnknown { callee, info } => {
let ReturnCallInfo {
ret_addr,
fp,
tmp,
uses,
..
} = &**info;
let ReturnCallInfo { uses } = &**info;
callee.get_operands(collector);
for u in uses {
collector.reg_fixed_use(u.vreg, u.preg);
}
if let Some(ret_addr) = ret_addr {
collector.reg_use(**ret_addr);
}
collector.reg_use(**fp);
collector.reg_early_def(tmp.to_writable_reg());
Inst::GrowArgumentArea { tmp, .. } | Inst::ShrinkArgumentArea { tmp, .. } => {
collector.reg_def(tmp.to_writable_reg());
}
Inst::JmpTableSeq {

cranelift/codegen/src/isa/x64/lower/isle.rs (1 line changed)

@ -118,6 +118,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
self.lower_ctx.sigs(),
callee_sig,
&callee,
Opcode::ReturnCall,
distance,
caller_conv,
self.backend.flags().clone(),

cranelift/codegen/src/isa/x64/pcc.rs (2 lines changed)

@ -808,6 +808,8 @@ pub(crate) fn check(
| Inst::ReturnCallKnown { .. }
| Inst::JmpKnown { .. }
| Inst::Ret { .. }
| Inst::GrowArgumentArea { .. }
| Inst::ShrinkArgumentArea { .. }
| Inst::JmpIf { .. }
| Inst::JmpCond { .. }
| Inst::TrapIf { .. }

cranelift/codegen/src/machinst/abi.rs (69 lines changed)

@ -987,6 +987,7 @@ impl std::ops::Index<Sig> for SigSet {
}
/// Structure describing the layout of a function's stack frame.
#[derive(Clone, Debug, Default)]
pub struct FrameLayout {
/// N.B. The areas whose sizes are given in this structure fully
/// cover the current function's stack frame, from high to low
@ -1886,7 +1887,7 @@ impl<M: ABIMachineSpec> Callee<M> {
/// This should include any stack frame or other setup necessary to use the
/// other methods (`load_arg`, `store_retval`, and spillslot accesses.)
pub fn gen_prologue(&self) -> SmallInstVec<M::I> {
let frame_layout = self.frame_layout.as_ref().unwrap();
let frame_layout = self.frame_layout();
let mut insts = smallvec![];
// Set up frame.
@ -1953,7 +1954,7 @@ impl<M: ABIMachineSpec> Callee<M> {
/// emitting this in the lowering logic), because the epilogue code comes
/// before the return and the two are likely closely related.
pub fn gen_epilogue(&self) -> SmallInstVec<M::I> {
let frame_layout = self.frame_layout.as_ref().unwrap();
let frame_layout = self.frame_layout();
let mut insts = smallvec![];
// Restore clobbered registers.
@ -1988,25 +1989,27 @@ impl<M: ABIMachineSpec> Callee<M> {
insts
}
/// Return a reference to the computed frame layout information. This
/// function will panic if it's called before [`Self::compute_frame_layout`].
pub fn frame_layout(&self) -> &FrameLayout {
self.frame_layout
.as_ref()
.expect("frame layout not computed before prologue generation")
}
/// Returns the full frame size for the given function, after prologue
/// emission has run. This comprises the spill slots and stack-storage
/// slots as well as storage for clobbered callee-save registers, but
/// not arguments pushed at callsites within this function,
/// or other ephemeral pushes.
pub fn frame_size(&self) -> u32 {
let frame_layout = self
.frame_layout
.as_ref()
.expect("frame size not computed before prologue generation");
let frame_layout = self.frame_layout();
frame_layout.clobber_size + frame_layout.fixed_frame_storage_size
}
/// Returns offset from the nominal SP to caller's SP.
pub fn nominal_sp_to_caller_sp_offset(&self) -> u32 {
let frame_layout = self
.frame_layout
.as_ref()
.expect("frame size not computed before prologue generation");
let frame_layout = self.frame_layout();
frame_layout.clobber_size
+ frame_layout.fixed_frame_storage_size
+ frame_layout.setup_area_size
@ -2068,8 +2071,14 @@ impl<M: ABIMachineSpec> Callee<M> {
/// The register or stack slot location of an argument.
#[derive(Clone, Debug)]
pub enum ArgLoc {
/// The physical register that the value will be passed through.
Reg(PReg),
Stack(StackAMode),
/// The offset into the argument area where this value will be passed. It's up to the consumer
/// of the `ArgLoc::Stack` variant to decide how to find the argument area that the `offset`
/// value is relative to. Depending on the ABI, this may end up being relative to SP or FP, for
/// example with a tail call where the frame is reused.
Stack { offset: i64, ty: ir::Type },
}
/// An input argument to a call instruction: the vreg that is used,
@ -2133,6 +2142,7 @@ impl<M: ABIMachineSpec> CallSite<M> {
sigs: &SigSet,
sig_ref: ir::SigRef,
extname: &ir::ExternalName,
opcode: ir::Opcode,
dist: RelocDistance,
caller_conv: isa::CallConv,
flags: settings::Flags,
@ -2145,7 +2155,7 @@ impl<M: ABIMachineSpec> CallSite<M> {
defs: smallvec![],
clobbers,
dest: CallDest::ExtName(extname.clone(), dist),
opcode: ir::Opcode::Call,
opcode,
caller_conv,
flags,
_mach: PhantomData,
@ -2213,6 +2223,17 @@ impl<M: ABIMachineSpec> CallSite<M> {
pub(crate) fn take_uses(self) -> CallArgList {
self.uses
}
pub(crate) fn sig<'a>(&self, sigs: &'a SigSet) -> &'a SigData {
&sigs[self.sig]
}
pub(crate) fn is_tail_call(&self) -> bool {
matches!(
self.opcode,
ir::Opcode::ReturnCall | ir::Opcode::ReturnCallIndirect
)
}
}
fn adjust_stack_and_nominal_sp<M: ABIMachineSpec>(ctx: &mut Lower<M::I>, amount: i32) {
@ -2329,7 +2350,22 @@ impl<M: ABIMachineSpec> CallSite<M> {
vreg,
preg: preg.into(),
}),
ArgLoc::Stack(amode) => ctx.emit(M::gen_store_stack(amode, vreg, amode.get_type())),
ArgLoc::Stack { offset, ty } => {
let amode = if self.is_tail_call() {
assert!(
self.flags.preserve_frame_pointers(),
"tail calls require frame pointers to be enabled"
);
StackAMode::FPOffset(
offset + M::fp_to_arg_offset(self.caller_conv, &self.flags),
ty,
)
} else {
StackAMode::SPOffset(offset, ty)
};
ctx.emit(M::gen_store_stack(amode, vreg, ty))
}
}
}
}
@ -2416,10 +2452,7 @@ impl<M: ABIMachineSpec> CallSite<M> {
} else {
(*from_reg, ty)
};
locs.push((
data.into(),
ArgLoc::Stack(StackAMode::SPOffset(offset, ty)),
));
locs.push((data.into(), ArgLoc::Stack { offset, ty }));
}
}
}
@ -2444,7 +2477,7 @@ impl<M: ABIMachineSpec> CallSite<M> {
ABIArgSlot::Reg { reg, .. } => ArgLoc::Reg(reg.into()),
ABIArgSlot::Stack { offset, .. } => {
let ty = M::word_type();
ArgLoc::Stack(StackAMode::SPOffset(offset, ty))
ArgLoc::Stack { offset, ty }
}
};
locs.push((tmp.into(), loc));
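
A condensed restatement of the `ArgLoc::Stack` dispatch above, in plain Rust (names invented for illustration; only `fp_to_arg_offset` corresponds to a real trait hook): for a tail call the offset resolves against FP, because the arguments overlay the caller's own incoming argument area just above the saved FP and return address, while an ordinary call writes into freshly allocated outgoing space above SP.

#[derive(Debug)]
enum ResolvedAmode {
    FpOffset(i64), // tail call: the caller's incoming argument area
    SpOffset(i64), // ordinary call: outgoing argument space
}

fn resolve_stack_arg(offset: i64, is_tail_call: bool, fp_to_arg_offset: i64) -> ResolvedAmode {
    if is_tail_call {
        // fp_to_arg_offset skips over the saved FP and return address.
        ResolvedAmode::FpOffset(offset + fp_to_arg_offset)
    } else {
        ResolvedAmode::SpOffset(offset)
    }
}

fn main() {
    // With a 16-byte setup area (saved FP + return address), the first
    // stack argument of a tail call lands at FP + 16.
    println!("{:?}", resolve_stack_arg(0, true, 16));
}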

cranelift/codegen/src/machinst/isle.rs (1 line changed)

@ -758,6 +758,7 @@ macro_rules! isle_prelude_caller_methods {
self.lower_ctx.sigs(),
sig_ref,
&extname,
Opcode::Call,
dist,
caller_conv,
self.backend.flags().clone(),

cranelift/filetests/filetests/isa/x64/fuzzbug-60035.clif (52 lines changed)

@ -15,25 +15,14 @@ block0:
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $64, %rsp
; movq %rbx, 16(%rsp)
; movq %r12, 24(%rsp)
; movq %r13, 32(%rsp)
; movq %r14, 40(%rsp)
; movq %r15, 48(%rsp)
; subq %rsp, $16, %rsp
; movq %rbx, 0(%rsp)
; block0:
; load_ext_name userextname0+0, %r15
; movq %r15, rsp(0 + virtual offset)
; movq rsp(0 + virtual offset), %r15
; call *%r15
; movq rsp(0 + virtual offset), %r15
; call *%r15
; movq 16(%rsp), %rbx
; movq 24(%rsp), %r12
; movq 32(%rsp), %r13
; movq 40(%rsp), %r14
; movq 48(%rsp), %r15
; addq %rsp, $64, %rsp
; load_ext_name userextname0+0, %rbx
; call *%rbx
; call *%rbx
; movq 0(%rsp), %rbx
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
@ -42,25 +31,14 @@ block0:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x40, %rsp
; movq %rbx, 0x10(%rsp)
; movq %r12, 0x18(%rsp)
; movq %r13, 0x20(%rsp)
; movq %r14, 0x28(%rsp)
; movq %r15, 0x30(%rsp)
; block1: ; offset 0x21
; movabsq $0, %r15 ; reloc_external Abs8 u1:7 0
; movq %r15, (%rsp)
; movq (%rsp), %r15
; callq *%r15
; movq (%rsp), %r15
; callq *%r15
; movq 0x10(%rsp), %rbx
; movq 0x18(%rsp), %r12
; movq 0x20(%rsp), %r13
; movq 0x28(%rsp), %r14
; movq 0x30(%rsp), %r15
; addq $0x40, %rsp
; subq $0x10, %rsp
; movq %rbx, (%rsp)
; block1: ; offset 0xc
; movabsq $0, %rbx ; reloc_external Abs8 u1:7 0
; callq *%rbx
; callq *%rbx
; movq (%rsp), %rbx
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq

cranelift/filetests/filetests/isa/x64/return-call-indirect.clif (64 lines changed)

@ -14,7 +14,7 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; lea 10(%rax), %rax
; lea 10(%rdi), %rax
; movq %rbp, %rsp
; popq %rbp
; ret
@ -24,7 +24,7 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addq $0xa, %rax
; leaq 0xa(%rdi), %rax
; movq %rbp, %rsp
; popq %rbp
; retq
@ -42,20 +42,18 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; load_ext_name %callee_i64+0, %rdx
; movq %rbp, %rcx
; return_call_unknown %rdx new_stack_arg_size:0 old_stack_arg_size:0 ret_addr:None fp:%v194 tmp:%v195 %rax=%rax
; load_ext_name %callee_i64+0, %rax
; return_call_unknown %rax %rdi=%rdi
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movabsq $0, %rdx ; reloc_external Abs8 %callee_i64 0
; movq %rbp, %rcx
; movq (%rcx), %rbp
; leaq 8(%rcx), %rsp
; jmpq *%rdx
; movabsq $0, %rax ; reloc_external Abs8 %callee_i64 0
; movq %rbp, %rsp
; popq %rbp
; jmpq *%rax
;;;; Test colocated tail calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -72,20 +70,18 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; load_ext_name %callee_i64+0, %rdx
; movq %rbp, %rcx
; return_call_unknown %rdx new_stack_arg_size:0 old_stack_arg_size:0 ret_addr:None fp:%v194 tmp:%v195 %rax=%rax
; load_ext_name %callee_i64+0, %rax
; return_call_unknown %rax %rdi=%rdi
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; leaq (%rip), %rdx ; reloc_external CallPCRel4 %callee_i64 -4
; movq %rbp, %rcx
; movq (%rcx), %rbp
; leaq 8(%rcx), %rsp
; jmpq *%rdx
; leaq (%rip), %rax ; reloc_external CallPCRel4 %callee_i64 -4
; movq %rbp, %rsp
; popq %rbp
; jmpq *%rax
;;;; Test passing `f64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -143,20 +139,18 @@ block0(v0: f64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; load_ext_name %callee_f64+0, %rdx
; movq %rbp, %rcx
; return_call_unknown %rdx new_stack_arg_size:0 old_stack_arg_size:0 ret_addr:None fp:%v194 tmp:%v195 %xmm0=%xmm0
; load_ext_name %callee_f64+0, %rax
; return_call_unknown %rax %xmm0=%xmm0
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movabsq $0, %rdx ; reloc_external Abs8 %callee_f64 0
; movq %rbp, %rcx
; movq (%rcx), %rbp
; leaq 8(%rcx), %rsp
; jmpq *%rdx
; movabsq $0, %rax ; reloc_external Abs8 %callee_f64 0
; movq %rbp, %rsp
; popq %rbp
; jmpq *%rax
;;;; Test passing `i8`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -171,7 +165,7 @@ block0(v0: i8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; testb %al, %al
; testb %dil, %dil
; setz %al
; movq %rbp, %rsp
; popq %rbp
@ -182,7 +176,7 @@ block0(v0: i8):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; testb %al, %al
; testb %dil, %dil
; sete %al
; movq %rbp, %rsp
; popq %rbp
@ -201,18 +195,16 @@ block0(v0: i8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; load_ext_name %callee_i8+0, %rdx
; movq %rbp, %rcx
; return_call_unknown %rdx new_stack_arg_size:0 old_stack_arg_size:0 ret_addr:None fp:%v194 tmp:%v195 %rax=%rax
; load_ext_name %callee_i8+0, %rax
; return_call_unknown %rax %rdi=%rdi
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movabsq $0, %rdx ; reloc_external Abs8 %callee_i8 0
; movq %rbp, %rcx
; movq (%rcx), %rbp
; leaq 8(%rcx), %rsp
; jmpq *%rdx
; movabsq $0, %rax ; reloc_external Abs8 %callee_i8 0
; movq %rbp, %rsp
; popq %rbp
; jmpq *%rax

cranelift/filetests/filetests/isa/x64/return-call.clif (680 lines changed)

@ -14,7 +14,7 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; lea 10(%rax), %rax
; lea 10(%rdi), %rax
; movq %rbp, %rsp
; popq %rbp
; ret
@ -24,7 +24,7 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addq $0xa, %rax
; leaq 0xa(%rdi), %rax
; movq %rbp, %rsp
; popq %rbp
; retq
@ -40,20 +40,18 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rbp, %rcx
; load_ext_name %callee_i64+0, %r8
; return_call_unknown %r8 new_stack_arg_size:0 old_stack_arg_size:0 ret_addr:None fp:%v193 tmp:%v194 %rax=%rax
; load_ext_name %callee_i64+0, %rax
; return_call_unknown %rax %rdi=%rdi
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rbp, %rcx
; movabsq $0, %r8 ; reloc_external Abs8 %callee_i64 0
; movq (%rcx), %rbp
; leaq 8(%rcx), %rsp
; jmpq *%r8
; movabsq $0, %rax ; reloc_external Abs8 %callee_i64 0
; movq %rbp, %rsp
; popq %rbp
; jmpq *%rax
;;;; Test colocated tail calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -68,18 +66,16 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rbp, %rcx
; return_call_known TestCase(%callee_i64) new_stack_arg_size:0 old_stack_arg_size:0 ret_addr:None fp:%v193 tmp:%v194 %rax=%rax
; return_call_known TestCase(%callee_i64) %rdi=%rdi
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rbp, %rcx
; movq (%rcx), %rbp
; leaq 8(%rcx), %rsp
; jmp 0x13 ; reloc_external CallPCRel4 %callee_i64 -4
; movq %rbp, %rsp
; popq %rbp
; jmp 0xd ; reloc_external CallPCRel4 %callee_i64 -4
;;;; Test passing `f64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -135,20 +131,18 @@ block0(v0: f64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rbp, %rax
; load_ext_name %callee_f64+0, %r8
; return_call_unknown %r8 new_stack_arg_size:0 old_stack_arg_size:0 ret_addr:None fp:%v193 tmp:%v194 %xmm0=%xmm0
; load_ext_name %callee_f64+0, %rax
; return_call_unknown %rax %xmm0=%xmm0
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rbp, %rax
; movabsq $0, %r8 ; reloc_external Abs8 %callee_f64 0
; movq (%rax), %rbp
; leaq 8(%rax), %rsp
; jmpq *%r8
; movabsq $0, %rax ; reloc_external Abs8 %callee_f64 0
; movq %rbp, %rsp
; popq %rbp
; jmpq *%rax
;;;; Test passing `i8`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -163,7 +157,7 @@ block0(v0: i8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; testb %al, %al
; testb %dil, %dil
; setz %al
; movq %rbp, %rsp
; popq %rbp
@ -174,7 +168,7 @@ block0(v0: i8):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; testb %al, %al
; testb %dil, %dil
; sete %al
; movq %rbp, %rsp
; popq %rbp
@ -191,20 +185,177 @@ block0(v0: i8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rbp, %rcx
; load_ext_name %callee_i8+0, %r8
; return_call_unknown %r8 new_stack_arg_size:0 old_stack_arg_size:0 ret_addr:None fp:%v193 tmp:%v194 %rax=%rax
; load_ext_name %callee_i8+0, %rax
; return_call_unknown %rax %rdi=%rdi
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movabsq $0, %rax ; reloc_external Abs8 %callee_i8 0
; movq %rbp, %rsp
; popq %rbp
; jmpq *%rax
;;;; Test passing fewer arguments on the stack ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
function %one_stack_arg(i32, i32, i32, i32, i32, i32, i32) tail {
block0(v0: i32, v1: i32, v2: i32, v3: i32, v4: i32, v5: i32, v6: i32):
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq 16(%rbp), %r10
; movq %rbp, %rsp
; popq %rbp
; ret 16
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq 0x10(%rbp), %r10
; movq %rbp, %rsp
; popq %rbp
; retq $0x10
function %call_one_stack_arg(i32, i32, i32, i32, i32, i32, i32, i32, i32) tail {
fn0 = colocated %one_stack_arg(i32, i32, i32, i32, i32, i32, i32) tail
block0(v0: i32, v1: i32, v2: i32, v3: i32, v4: i32, v5: i32, v6: i32, v7: i32, v8: i32):
return_call fn0(v2, v3, v4, v5, v6, v7, v8)
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %r8, %r10
; movq %rdx, %rdi
; movq %rcx, %rsi
; movq %r9, %rcx
; movq 16(%rbp), %r8
; movq 24(%rbp), %r9
; movq 32(%rbp), %rax
; shrink_argument_area 16 %rdx
; movl %eax, 16(%rbp)
; movq %r10, %rdx
; return_call_known TestCase(%one_stack_arg) %rdi=%rdi %rsi=%rsi %rdx=%rdx %rcx=%rcx %r8=%r8 %r9=%r9
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %r8, %r10
; movq %rdx, %rdi
; movq %rcx, %rsi
; movq %r9, %rcx
; movq 0x10(%rbp), %r8
; movq 0x18(%rbp), %r9
; movq 0x20(%rbp), %rax
; movq 8(%rsp), %rdx
; movq %rdx, 0x18(%rsp)
; movq (%rsp), %rdx
; movq %rdx, 0x10(%rsp)
; addq $0x10, %rsp
; addq $0x10, %rbp
; movl %eax, 0x10(%rbp)
; movq %r10, %rdx
; movq %rbp, %rsp
; popq %rbp
; jmp 0x46 ; reloc_external CallPCRel4 %one_stack_arg -4
function %call_zero_stack_args(i32, i32, i32, i32, i32, i32, i32, i32, i8) -> i8 tail {
fn0 = colocated %callee_i8(i8) -> i8 tail
block0(v0: i32, v1: i32, v2: i32, v3: i32, v4: i32, v5: i32, v6: i32, v7: i32, v8: i8):
return_call fn0(v8)
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq 16(%rbp), %r10
; movq 24(%rbp), %rsi
; movq 32(%rbp), %rdi
; shrink_argument_area 32 %rdx
; return_call_known TestCase(%callee_i8) %rdi=%rdi
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq 0x10(%rbp), %r10
; movq 0x18(%rbp), %rsi
; movq 0x20(%rbp), %rdi
; movq 8(%rsp), %rdx
; movq %rdx, 0x28(%rsp)
; movq (%rsp), %rdx
; movq %rdx, 0x20(%rsp)
; addq $0x20, %rsp
; addq $0x20, %rbp
; movq %rbp, %rsp
; popq %rbp
; jmp 0x34 ; reloc_external CallPCRel4 %callee_i8 -4
;;;; Test growing the argument area when it's non-empty ;;;;;;;;;;;;;;;;;;;;;;;;
function %call_from_one_stack_arg(i32, i32, i32, i32, i32, i32, i32) tail {
fn0 = colocated %call_one_stack_arg(i32, i32, i32, i32, i32, i32, i32, i32, i32) tail
block0(v0: i32, v1: i32, v2: i32, v3: i32, v4: i32, v5: i32, v6: i32):
return_call fn0(v1, v2, v3, v4, v5, v6, v0, v0, v1)
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdx, %r10
; movq %rcx, %rdx
; movq %r8, %rcx
; movq %r9, %r8
; movq 16(%rbp), %r9
; grow_argument_area 16 %rax
; movl %edi, 16(%rbp)
; movl %edi, 24(%rbp)
; movl %esi, 32(%rbp)
; movq %rsi, %rdi
; movq %r10, %rsi
; return_call_known TestCase(%call_one_stack_arg) %rdi=%rdi %rsi=%rsi %rdx=%rdx %rcx=%rcx %r8=%r8 %r9=%r9
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rbp, %rcx
; movabsq $0, %r8 ; reloc_external Abs8 %callee_i8 0
; movq (%rcx), %rbp
; leaq 8(%rcx), %rsp
; jmpq *%r8
; movq %rdx, %r10
; movq %rcx, %rdx
; movq %r8, %rcx
; movq %r9, %r8
; movq 0x10(%rbp), %r9
; subq $0x10, %rsp
; subq $0x10, %rbp
; movq 0x10(%rsp), %rax
; movq %rax, (%rsp)
; movq 0x18(%rsp), %rax
; movq %rax, 8(%rsp)
; movl %edi, 0x10(%rbp)
; movl %edi, 0x18(%rbp)
; movl %esi, 0x20(%rbp)
; movq %rsi, %rdi
; movq %r10, %rsi
; movq %rbp, %rsp
; popq %rbp
; jmp 0x47 ; reloc_external CallPCRel4 %call_one_stack_arg -4
;;;; Test passing many arguments on stack ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -217,50 +368,58 @@ block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq 16(%rbp), %rax
; movq 24(%rbp), %rdx
; movq 32(%rbp), %r9
; movq 40(%rbp), %r11
; movq 48(%rbp), %rdi
; movq 56(%rbp), %rcx
; movq 64(%rbp), %r8
; movq 72(%rbp), %r10
; movq 80(%rbp), %rsi
; movq 88(%rbp), %rax
; movq 96(%rbp), %rdx
; movq 104(%rbp), %r9
; movq 112(%rbp), %r11
; movq 120(%rbp), %rdi
; movq 128(%rbp), %rcx
; movq 136(%rbp), %rax
; movq 16(%rbp), %r10
; movq 24(%rbp), %rsi
; movq 32(%rbp), %rax
; movq 40(%rbp), %rdx
; movq 48(%rbp), %r9
; movq 56(%rbp), %r11
; movq 64(%rbp), %rdi
; movq 72(%rbp), %rcx
; movq 80(%rbp), %r8
; movq 88(%rbp), %r10
; movq 96(%rbp), %rsi
; movq 104(%rbp), %rax
; movq 112(%rbp), %rdx
; movq 120(%rbp), %r9
; movq 128(%rbp), %r11
; movq 136(%rbp), %rdi
; movq 144(%rbp), %rcx
; movq 152(%rbp), %r8
; movq 160(%rbp), %r10
; movq 168(%rbp), %rax
; movq %rbp, %rsp
; popq %rbp
; ret 128
; ret 160
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq 0x10(%rbp), %rax
; movq 0x18(%rbp), %rdx
; movq 0x20(%rbp), %r9
; movq 0x28(%rbp), %r11
; movq 0x30(%rbp), %rdi
; movq 0x38(%rbp), %rcx
; movq 0x40(%rbp), %r8
; movq 0x48(%rbp), %r10
; movq 0x50(%rbp), %rsi
; movq 0x58(%rbp), %rax
; movq 0x60(%rbp), %rdx
; movq 0x68(%rbp), %r9
; movq 0x70(%rbp), %r11
; movq 0x78(%rbp), %rdi
; movq 0x80(%rbp), %rcx
; movq 0x88(%rbp), %rax
; movq 0x10(%rbp), %r10
; movq 0x18(%rbp), %rsi
; movq 0x20(%rbp), %rax
; movq 0x28(%rbp), %rdx
; movq 0x30(%rbp), %r9
; movq 0x38(%rbp), %r11
; movq 0x40(%rbp), %rdi
; movq 0x48(%rbp), %rcx
; movq 0x50(%rbp), %r8
; movq 0x58(%rbp), %r10
; movq 0x60(%rbp), %rsi
; movq 0x68(%rbp), %rax
; movq 0x70(%rbp), %rdx
; movq 0x78(%rbp), %r9
; movq 0x80(%rbp), %r11
; movq 0x88(%rbp), %rdi
; movq 0x90(%rbp), %rcx
; movq 0x98(%rbp), %r8
; movq 0xa0(%rbp), %r10
; movq 0xa8(%rbp), %rax
; movq %rbp, %rsp
; popq %rbp
; retq $0x80
; retq $0xa0
function %tail_caller_stack_args() -> i64 tail {
fn0 = %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail
@ -298,195 +457,226 @@ block0:
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $112, %rsp
; subq %rsp, $160, %rsp
; movq %rbx, 112(%rsp)
; movq %r12, 120(%rsp)
; movq %r13, 128(%rsp)
; movq %r14, 136(%rsp)
; movq %r15, 144(%rsp)
; block0:
; movl $10, %eax
; movq %rax, rsp(96 + virtual offset)
; movl $15, %ecx
; movq %rcx, rsp(88 + virtual offset)
; movl $10, %edi
; movq %rdi, rsp(96 + virtual offset)
; movl $15, %esi
; movq %rsi, rsp(88 + virtual offset)
; movl $20, %edx
; movq %rdx, rsp(80 + virtual offset)
; movl $25, %ebx
; movq %rbx, rsp(72 + virtual offset)
; movl $30, %esi
; movq %rsi, rsp(64 + virtual offset)
; movl $35, %edi
; movq %rdi, rsp(56 + virtual offset)
; movl $40, %r8d
; movq %r8, rsp(48 + virtual offset)
; movl $45, %r9d
; movq %r9, rsp(40 + virtual offset)
; movl $50, %r10d
; movq %r10, rsp(32 + virtual offset)
; movl $55, %r11d
; movq %r11, rsp(24 + virtual offset)
; movl $60, %r15d
; movl $65, %r12d
; movl $70, %r13d
; movl $75, %r14d
; movl $80, %ecx
; movq %rcx, rsp(16 + virtual offset)
; movl $85, %ecx
; movl $25, %ecx
; movq %rcx, rsp(72 + virtual offset)
; movl $30, %r8d
; movq %r8, rsp(64 + virtual offset)
; movl $35, %r9d
; movq %r9, rsp(56 + virtual offset)
; movl $40, %eax
; movl $45, %r10d
; movl $50, %r11d
; movl $55, %r13d
; movl $60, %r14d
; movl $65, %r15d
; movl $70, %ebx
; movl $75, %r12d
; movl $80, %edi
; movl $85, %esi
; movq %rsi, rsp(48 + virtual offset)
; movl $90, %edx
; movl $95, %ebx
; movl $100, %esi
; movl $105, %edi
; movl $110, %r8d
; movl $115, %r9d
; movl $120, %r10d
; movl $125, %r11d
; movl $130, %eax
; movq %rax, rsp(8 + virtual offset)
; movl $135, %eax
; movq %rax, rsp(0 + virtual offset)
; subq %rsp, $128, %rsp
; virtual_sp_offset_adjust 128
; movq %r15, 0(%rsp)
; movq %r12, 8(%rsp)
; movq %r13, 16(%rsp)
; movq %r14, 24(%rsp)
; movq rsp(16 + virtual offset), %rax
; movq %rax, 32(%rsp)
; movq %rcx, 40(%rsp)
; movq %rdx, 48(%rsp)
; movq %rbx, 56(%rsp)
; movq %rsi, 64(%rsp)
; movq %rdi, 72(%rsp)
; movq %r8, 80(%rsp)
; movq %r9, 88(%rsp)
; movq %r10, 96(%rsp)
; movq %r11, 104(%rsp)
; movq rsp(8 + virtual offset), %rax
; movq %rax, 112(%rsp)
; movq rsp(0 + virtual offset), %rax
; movq %rax, 120(%rsp)
; movq %rbp, %r15
; movq 8(%r15), %r13
; load_ext_name %tail_callee_stack_args+0, %r12
; movq rsp(96 + virtual offset), %rax
; movq rsp(88 + virtual offset), %rcx
; movl $95, %ecx
; movl $100, %r8d
; movl $105, %r9d
; movl $110, %esi
; movq %rsi, rsp(40 + virtual offset)
; movl $115, %esi
; movq %rsi, rsp(32 + virtual offset)
; movl $120, %esi
; movq %rsi, rsp(24 + virtual offset)
; movl $125, %esi
; movq %rsi, rsp(16 + virtual offset)
; movl $130, %esi
; movq %rsi, rsp(8 + virtual offset)
; movl $135, %esi
; movq %rsi, rsp(0 + virtual offset)
; grow_argument_area 160 %rsi
; movq %rax, 16(%rbp)
; movq %r10, 24(%rbp)
; movq %r11, 32(%rbp)
; movq %r13, 40(%rbp)
; movq %r14, 48(%rbp)
; movq %r15, 56(%rbp)
; movq %rbx, 64(%rbp)
; movq %r12, 72(%rbp)
; movq %rdi, 80(%rbp)
; movq rsp(48 + virtual offset), %rdi
; movq %rdi, 88(%rbp)
; movq %rdx, 96(%rbp)
; movq %rcx, 104(%rbp)
; movq %r8, 112(%rbp)
; movq %r9, 120(%rbp)
; movq rsp(40 + virtual offset), %rsi
; movq %rsi, 128(%rbp)
; movq rsp(32 + virtual offset), %rsi
; movq %rsi, 136(%rbp)
; movq rsp(24 + virtual offset), %rsi
; movq %rsi, 144(%rbp)
; movq rsp(16 + virtual offset), %rsi
; movq %rsi, 152(%rbp)
; movq rsp(8 + virtual offset), %rsi
; movq %rsi, 160(%rbp)
; movq rsp(0 + virtual offset), %rsi
; movq %rsi, 168(%rbp)
; load_ext_name %tail_callee_stack_args+0, %r10
; movq rsp(72 + virtual offset), %rcx
; movq rsp(80 + virtual offset), %rdx
; movq rsp(72 + virtual offset), %rbx
; movq rsp(64 + virtual offset), %rsi
; movq rsp(56 + virtual offset), %rdi
; movq rsp(48 + virtual offset), %r8
; movq rsp(40 + virtual offset), %r9
; movq rsp(32 + virtual offset), %r10
; movq rsp(24 + virtual offset), %r11
; return_call_unknown %r12 new_stack_arg_size:128 old_stack_arg_size:0 ret_addr:Some("%v219") fp:%v218 tmp:%v220 %rax=%rax %rcx=%rcx %rdx=%rdx %rbx=%rbx %rsi=%rsi %rdi=%rdi %r8=%r8 %r9=%r9 %r10=%r10 %r11=%r11
; movq rsp(88 + virtual offset), %rsi
; movq rsp(96 + virtual offset), %rdi
; movq rsp(64 + virtual offset), %r8
; movq rsp(56 + virtual offset), %r9
; return_call_unknown %r10 %rdi=%rdi %rsi=%rsi %rdx=%rdx %rcx=%rcx %r8=%r8 %r9=%r9
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x70, %rsp
; block1: ; offset 0x8
; movl $0xa, %eax
; movq %rax, 0x60(%rsp)
; movl $0xf, %ecx
; movq %rcx, 0x58(%rsp)
; subq $0xa0, %rsp
; movq %rbx, 0x70(%rsp)
; movq %r12, 0x78(%rsp)
; movq %r13, 0x80(%rsp)
; movq %r14, 0x88(%rsp)
; movq %r15, 0x90(%rsp)
; block1: ; offset 0x2d
; movl $0xa, %edi
; movq %rdi, 0x60(%rsp)
; movl $0xf, %esi
; movq %rsi, 0x58(%rsp)
; movl $0x14, %edx
; movq %rdx, 0x50(%rsp)
; movl $0x19, %ebx
; movq %rbx, 0x48(%rsp)
; movl $0x1e, %esi
; movq %rsi, 0x40(%rsp)
; movl $0x23, %edi
; movq %rdi, 0x38(%rsp)
; movl $0x28, %r8d
; movq %r8, 0x30(%rsp)
; movl $0x2d, %r9d
; movq %r9, 0x28(%rsp)
; movl $0x32, %r10d
; movq %r10, 0x20(%rsp)
; movl $0x37, %r11d
; movq %r11, 0x18(%rsp)
; movl $0x3c, %r15d
; movl $0x41, %r12d
; movl $0x46, %r13d
; movl $0x4b, %r14d
; movl $0x50, %ecx
; movq %rcx, 0x10(%rsp)
; movl $0x55, %ecx
; movl $0x19, %ecx
; movq %rcx, 0x48(%rsp)
; movl $0x1e, %r8d
; movq %r8, 0x40(%rsp)
; movl $0x23, %r9d
; movq %r9, 0x38(%rsp)
; movl $0x28, %eax
; movl $0x2d, %r10d
; movl $0x32, %r11d
; movl $0x37, %r13d
; movl $0x3c, %r14d
; movl $0x41, %r15d
; movl $0x46, %ebx
; movl $0x4b, %r12d
; movl $0x50, %edi
; movl $0x55, %esi
; movq %rsi, 0x30(%rsp)
; movl $0x5a, %edx
; movl $0x5f, %ebx
; movl $0x64, %esi
; movl $0x69, %edi
; movl $0x6e, %r8d
; movl $0x73, %r9d
; movl $0x78, %r10d
; movl $0x7d, %r11d
; movl $0x82, %eax
; movq %rax, 8(%rsp)
; movl $0x87, %eax
; movq %rax, (%rsp)
; subq $0x80, %rsp
; movq %r15, (%rsp)
; movq %r12, 8(%rsp)
; movq %r13, 0x10(%rsp)
; movq %r14, 0x18(%rsp)
; movq 0x90(%rsp), %rax
; movq %rax, 0x20(%rsp)
; movq %rcx, 0x28(%rsp)
; movq %rdx, 0x30(%rsp)
; movq %rbx, 0x38(%rsp)
; movq %rsi, 0x40(%rsp)
; movq %rdi, 0x48(%rsp)
; movq %r8, 0x50(%rsp)
; movq %r9, 0x58(%rsp)
; movq %r10, 0x60(%rsp)
; movq %r11, 0x68(%rsp)
; movq 0x88(%rsp), %rax
; movq %rax, 0x70(%rsp)
; movq 0x80(%rsp), %rax
; movq %rax, 0x78(%rsp)
; movq %rbp, %r15
; movq 8(%r15), %r13
; movabsq $0, %r12 ; reloc_external Abs8 %tail_callee_stack_args 0
; movq 0xe0(%rsp), %rax
; movq 0xd8(%rsp), %rcx
; movq 0xd0(%rsp), %rdx
; movq 0xc8(%rsp), %rbx
; movl $0x5f, %ecx
; movl $0x64, %r8d
; movl $0x69, %r9d
; movl $0x6e, %esi
; movq %rsi, 0x28(%rsp)
; movl $0x73, %esi
; movq %rsi, 0x20(%rsp)
; movl $0x78, %esi
; movq %rsi, 0x18(%rsp)
; movl $0x7d, %esi
; movq %rsi, 0x10(%rsp)
; movl $0x82, %esi
; movq %rsi, 8(%rsp)
; movl $0x87, %esi
; movq %rsi, (%rsp)
; subq $0xa0, %rsp
; subq $0xa0, %rbp
; movq 0xa0(%rsp), %rsi
; movq %rsi, (%rsp)
; movq 0xa8(%rsp), %rsi
; movq %rsi, 8(%rsp)
; movq 0xb0(%rsp), %rsi
; movq %rsi, 0x10(%rsp)
; movq 0xb8(%rsp), %rsi
; movq %rsi, 0x18(%rsp)
; movq 0xc0(%rsp), %rsi
; movq 0xb8(%rsp), %rdi
; movq 0xb0(%rsp), %r8
; movq 0xa8(%rsp), %r9
; movq 0xa0(%rsp), %r10
; movq 0x98(%rsp), %r11
; movq (%r15), %rbp
; movq 0x78(%rsp), %r14
; movq %r14, 8(%r15)
; movq 0x70(%rsp), %r14
; movq %r14, (%r15)
; movq 0x68(%rsp), %r14
; movq %r14, -8(%r15)
; movq 0x60(%rsp), %r14
; movq %r14, -0x10(%r15)
; movq 0x58(%rsp), %r14
; movq %r14, -0x18(%r15)
; movq 0x50(%rsp), %r14
; movq %r14, -0x20(%r15)
; movq 0x48(%rsp), %r14
; movq %r14, -0x28(%r15)
; movq 0x40(%rsp), %r14
; movq %r14, -0x30(%r15)
; movq 0x38(%rsp), %r14
; movq %r14, -0x38(%r15)
; movq 0x30(%rsp), %r14
; movq %r14, -0x40(%r15)
; movq 0x28(%rsp), %r14
; movq %r14, -0x48(%r15)
; movq 0x20(%rsp), %r14
; movq %r14, -0x50(%r15)
; movq 0x18(%rsp), %r14
; movq %r14, -0x58(%r15)
; movq 0x10(%rsp), %r14
; movq %r14, -0x60(%r15)
; movq 8(%rsp), %r14
; movq %r14, -0x68(%r15)
; movq (%rsp), %r14
; movq %r14, -0x70(%r15)
; leaq -0x78(%r15), %rsp
; movq %r13, (%rsp)
; jmpq *%r12
; movq %rsi, 0x20(%rsp)
; movq 0xc8(%rsp), %rsi
; movq %rsi, 0x28(%rsp)
; movq 0xd0(%rsp), %rsi
; movq %rsi, 0x30(%rsp)
; movq 0xd8(%rsp), %rsi
; movq %rsi, 0x38(%rsp)
; movq 0xe0(%rsp), %rsi
; movq %rsi, 0x40(%rsp)
; movq 0xe8(%rsp), %rsi
; movq %rsi, 0x48(%rsp)
; movq 0xf0(%rsp), %rsi
; movq %rsi, 0x50(%rsp)
; movq 0xf8(%rsp), %rsi
; movq %rsi, 0x58(%rsp)
; movq 0x100(%rsp), %rsi
; movq %rsi, 0x60(%rsp)
; movq 0x108(%rsp), %rsi
; movq %rsi, 0x68(%rsp)
; movq 0x110(%rsp), %rsi
; movq %rsi, 0x70(%rsp)
; movq 0x118(%rsp), %rsi
; movq %rsi, 0x78(%rsp)
; movq 0x120(%rsp), %rsi
; movq %rsi, 0x80(%rsp)
; movq 0x128(%rsp), %rsi
; movq %rsi, 0x88(%rsp)
; movq 0x130(%rsp), %rsi
; movq %rsi, 0x90(%rsp)
; movq 0x138(%rsp), %rsi
; movq %rsi, 0x98(%rsp)
; movq 0x140(%rsp), %rsi
; movq %rsi, 0xa0(%rsp)
; movq 0x148(%rsp), %rsi
; movq %rsi, 0xa8(%rsp)
; movq %rax, 0x10(%rbp)
; movq %r10, 0x18(%rbp)
; movq %r11, 0x20(%rbp)
; movq %r13, 0x28(%rbp)
; movq %r14, 0x30(%rbp)
; movq %r15, 0x38(%rbp)
; movq %rbx, 0x40(%rbp)
; movq %r12, 0x48(%rbp)
; movq %rdi, 0x50(%rbp)
; movq 0x30(%rsp), %rdi
; movq %rdi, 0x58(%rbp)
; movq %rdx, 0x60(%rbp)
; movq %rcx, 0x68(%rbp)
; movq %r8, 0x70(%rbp)
; movq %r9, 0x78(%rbp)
; movq 0x28(%rsp), %rsi
; movq %rsi, 0x80(%rbp)
; movq 0x20(%rsp), %rsi
; movq %rsi, 0x88(%rbp)
; movq 0x18(%rsp), %rsi
; movq %rsi, 0x90(%rbp)
; movq 0x10(%rsp), %rsi
; movq %rsi, 0x98(%rbp)
; movq 8(%rsp), %rsi
; movq %rsi, 0xa0(%rbp)
; movq (%rsp), %rsi
; movq %rsi, 0xa8(%rbp)
; movabsq $0, %r10 ; reloc_external Abs8 %tail_callee_stack_args 0
; movq 0x48(%rsp), %rcx
; movq 0x50(%rsp), %rdx
; movq 0x58(%rsp), %rsi
; movq 0x60(%rsp), %rdi
; movq 0x40(%rsp), %r8
; movq 0x38(%rsp), %r9
; movq 0x70(%rsp), %rbx
; movq 0x78(%rsp), %r12
; movq 0x80(%rsp), %r13
; movq 0x88(%rsp), %r14
; movq 0x90(%rsp), %r15
; addq $0xa0, %rsp
; movq %rbp, %rsp
; popq %rbp
; jmpq *%r10

cranelift/filetests/filetests/isa/x64/tail-call-conv.clif (1048 lines changed)

File diff suppressed because it is too large

cranelift/filetests/filetests/isa/x64/tail-stack-limit.clif (31 lines changed)

@ -19,33 +19,38 @@ block0(v0: i64, v1: i8, v2: i8, v3: i8, v4: i8, v5: i8, v6: i8, v7: i8, v8: i128
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; movq %rax, %r14
; addq %r14, $16, %r14
; cmpq %rsp, %r14
; movq %rdi, %r10
; addq %r10, $16, %r10
; cmpq %rsp, %r10
; jnbe #trap=stk_ovf
; subq %rsp, $16, %rsp
; block0:
; movq %r10, %rax
; movq %r11, %rcx
; movq 16(%rbp), %r10
; movq 24(%rbp), %rsi
; movq 32(%rbp), %rax
; movq 40(%rbp), %rcx
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
; ret 32
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; movq %rax, %r14
; addq $0x10, %r14
; cmpq %rsp, %r14
; ja 0x27
; movq %rdi, %r10
; addq $0x10, %r10
; cmpq %rsp, %r10
; ja 0x33
; subq $0x10, %rsp
; block1: ; offset 0x18
; movq %r10, %rax
; movq %r11, %rcx
; movq 0x10(%rbp), %r10
; movq 0x18(%rbp), %rsi
; movq 0x20(%rbp), %rax
; movq 0x28(%rbp), %rcx
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq
; retq $0x20
; ud2 ; trap: stk_ovf
