
x64: Lower extractlane, scalar_to_vector, and splat in ISLE (#4780)

Lower extractlane, scalar_to_vector and splat in ISLE.

This PR also makes some changes to the SinkableLoad API:
* change the return type of sink_load to RegMem, as there are more functions available for dealing with RegMem
* add reg_mem_to_reg_mem_imm and register it as an automatic conversion from RegMem to RegMemImm
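The sketch below shows how these two changes fit together; it is assembled from the hunks further down rather than being a verbatim excerpt. With sink_load now returning a RegMem, the registered conversion lets existing helpers that expect a RegMemImm (such as sink_load_to_gpr_mem_imm) keep working without an explicit wrapping step:

;; RegMem -> RegMemImm, registered as an automatic ISLE conversion.
(decl reg_mem_to_reg_mem_imm (RegMem) RegMemImm)
(convert RegMem RegMemImm reg_mem_to_reg_mem_imm)

;; `gpr_mem_imm_new` expects a RegMemImm, so the RegMem produced by
;; `sink_load` is converted implicitly via the conversion above.
(decl sink_load_to_gpr_mem_imm (SinkableLoad) GprMemImm)
(rule (sink_load_to_gpr_mem_imm load)
      (gpr_mem_imm_new (sink_load load)))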
Trevor Elliott, 2 years ago, committed by GitHub
parent commit 9386409607
Changed files:
  1. cranelift/codegen/src/isa/x64/inst.isle (45 changed lines)
  2. cranelift/codegen/src/isa/x64/inst/emit_tests.rs (11 changed lines)
  3. cranelift/codegen/src/isa/x64/inst/mod.rs (18 changed lines)
  4. cranelift/codegen/src/isa/x64/lower.isle (96 changed lines)
  5. cranelift/codegen/src/isa/x64/lower.rs (221 changed lines)
  6. cranelift/codegen/src/isa/x64/lower/isle.rs (4 changed lines)
  7. cranelift/codegen/src/machinst/isle.rs (18 changed lines)
  8. cranelift/codegen/src/prelude.isle (8 changed lines)
  9. cranelift/filetests/filetests/isa/x64/extractlane.clif (87 changed lines)
  10. cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif (28 changed lines)

cranelift/codegen/src/isa/x64/inst.isle (45 changed lines)

@@ -777,6 +777,13 @@
(Reg (reg Reg))
(Mem (addr SyntheticAmode))))
;; Convert a RegMem to a RegMemImm.
(decl reg_mem_to_reg_mem_imm (RegMem) RegMemImm)
(rule (reg_mem_to_reg_mem_imm (RegMem.Reg reg))
(RegMemImm.Reg reg))
(rule (reg_mem_to_reg_mem_imm (RegMem.Mem addr))
(RegMemImm.Mem addr))
;; Put the given clif value into a `RegMem` operand.
;;
;; Asserts that the value fits into a single register, and doesn't require
@@ -1456,13 +1463,17 @@
;; This is a side-effectful operation that notifies the context that the
;; instruction that produced the `SinkableImm` has been sunk into another
;; instruction, and no longer needs to be lowered.
- (decl sink_load (SinkableLoad) RegMemImm)
+ (decl sink_load (SinkableLoad) RegMem)
(extern constructor sink_load sink_load)
(decl sink_load_to_gpr_mem_imm (SinkableLoad) GprMemImm)
(rule (sink_load_to_gpr_mem_imm load)
(gpr_mem_imm_new (sink_load load)))
(decl sink_load_to_xmm_mem (SinkableLoad) XmmMem)
(rule (sink_load_to_xmm_mem load)
(reg_mem_to_xmm_mem (sink_load load)))
;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(type ExtKind extern
@@ -1534,6 +1545,13 @@
(let ((r WritableXmm (temp_writable_xmm)))
(x64_pcmpeqd r r)))
;; Helper for creating XmmUninitializedValue instructions.
(decl xmm_uninit_value () Xmm)
(rule (xmm_uninit_value)
(let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (MInst.XmmUninitializedValue dst))))
dst))
;; Helper for creating an SSE register holding an `i64x2` from two `i64` values.
(decl make_i64x2_from_lanes (GprMem GprMem) Xmm)
(rule (make_i64x2_from_lanes lo hi)
@@ -2828,6 +2846,30 @@
(rule (x64_psrad src1 src2)
(xmm_rmi_xmm (SseOpcode.Psrad) src1 src2))
;; Helper for creating `pextrb` instructions.
(decl x64_pextrb (Type Xmm u8) Gpr)
(rule (x64_pextrb ty src lane)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrb)
dst
src
dst
lane
(operand_size_of_type_32_64 (lane_type ty))))))
dst))
;; Helper for creating `pextrw` instructions.
(decl x64_pextrw (Type Xmm u8) Gpr)
(rule (x64_pextrw ty src lane)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrw)
dst
src
dst
lane
(operand_size_of_type_32_64 (lane_type ty))))))
dst))
;; Helper for creating `pextrd` instructions.
(decl x64_pextrd (Type Xmm u8) Gpr)
(rule (x64_pextrd ty src lane)
@@ -3707,6 +3749,7 @@
(convert WritableGpr Gpr writable_gpr_to_gpr)
(convert RegMemImm GprMemImm gpr_mem_imm_new)
(convert RegMem GprMem reg_mem_to_gpr_mem)
(convert RegMem RegMemImm reg_mem_to_reg_mem_imm)
(convert Reg GprMem reg_to_gpr_mem)
(convert Reg GprMemImm reg_to_gpr_mem_imm)
(convert WritableGpr WritableReg writable_gpr_to_reg)

cranelift/codegen/src/isa/x64/inst/emit_tests.rs (11 changed lines)

@@ -78,6 +78,17 @@ impl Inst {
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}
// TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
src.assert_regclass_is(RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUnaryRmR {
op,
src: XmmMem::new(src).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}
}
#[test]

cranelift/codegen/src/isa/x64/inst/mod.rs (18 changed lines)

@@ -263,17 +263,6 @@ impl Inst {
Inst::MovRR { size, src, dst }
}
// TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
pub(crate) fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
src.assert_regclass_is(RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUnaryRmR {
op,
src: XmmMem::new(src).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}
pub(crate) fn xmm_load_const(src: VCodeConstant, dst: Writable<Reg>, ty: Type) -> Inst {
debug_assert!(dst.to_reg().class() == RegClass::Float);
debug_assert!(ty.is_vector() && ty.bits() == 128);
@@ -316,13 +305,6 @@ impl Inst {
}
}
pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUninitializedValue {
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}
pub(crate) fn xmm_mov_r_m(op: SseOpcode, src: Reg, dst: impl Into<SyntheticAmode>) -> Inst {
debug_assert!(src.class() == RegClass::Float);
Inst::XmmMovRM {

cranelift/codegen/src/isa/x64/lower.isle (96 changed lines)

@@ -3547,3 +3547,99 @@
mask
(x64_xmm_load_const $I8X16 (swizzle_zero_mask)))))
(x64_pshufb src mask)))
;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Remove the extractlane instruction, leaving the float where it is. The upper
;; bits will remain unchanged; for correctness, this relies on Cranelift type
;; checking to avoid using those bits.
(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0))))
val)
;; Cases 2-4 for an F32X4
(rule (lower (has_type $F32 (extractlane val @ (value_type (ty_vec128 ty))
(u8_from_uimm8 lane))))
(x64_pshufd val lane (OperandSize.Size32)))
;; This is the only remaining case for F64X2
(rule (lower (has_type $F64 (extractlane val @ (value_type (ty_vec128 ty))
(u8_from_uimm8 1))))
;; 0xee == 0b11_10_11_10
(x64_pshufd val 0xee (OperandSize.Size32)))
(rule (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane)))
(x64_pextrb ty val lane))
(rule (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
(x64_pextrw ty val lane))
(rule (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane)))
(x64_pextrd ty val lane))
(rule (lower (extractlane val @ (value_type ty @ (multi_lane 64 2)) (u8_from_uimm8 lane)))
(x64_pextrd ty val lane))
;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Case 1: when moving a scalar float, we simply move from one XMM register
;; to another, expecting the register allocator to elide this. Here we
;; assume that the upper bits of a scalar float have not been munged with
;; (the same assumption the old backend makes).
(rule (lower (scalar_to_vector src @ (value_type (ty_scalar_float _))))
src)
;; Case 2: when moving a scalar value of any other type, use MOVD to zero
;; the upper lanes.
(rule (lower (scalar_to_vector src @ (value_type ty)))
(bitcast_gpr_to_xmm ty src))
;; Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
;; MOVSS/MOVSD instruction.
(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_32 _)))))
(x64_movss_load (sink_load_to_xmm_mem src)))
(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_64 _)))))
(x64_movsd_load (sink_load_to_xmm_mem src)))
;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (multi_lane 8 16) (splat src)))
(let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0))
(zeros Xmm (x64_pxor vec vec)))
;; Shuffle the lowest byte lane to all other lanes.
(x64_pshufb vec zeros)))
(rule (lower (has_type (multi_lane 16 8) (splat src)))
(let (;; Force the input into a register so that we don't create a
;; VCodeConstant.
(src RegMem (RegMem.Reg src))
(vec Xmm (vec_insert_lane $I16X8 (xmm_uninit_value) src 0))
(vec Xmm (vec_insert_lane $I16X8 vec src 1)))
;; Shuffle the lowest two lanes to all other lanes.
(x64_pshufd vec 0 (OperandSize.Size32))))
(rule (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _)))))
(lower_splat_32x4 $F32X4 src))
(rule (lower (has_type (multi_lane 32 4) (splat src)))
(lower_splat_32x4 $I32X4 src))
(decl lower_splat_32x4 (Type Value) Xmm)
(rule (lower_splat_32x4 ty src)
(let ((src RegMem src)
(vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
;; Shuffle the lowest lane to all other lanes.
(x64_pshufd vec 0 (OperandSize.Size32))))
(rule (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _)))))
(lower_splat_64x2 $F64X2 src))
(rule (lower (has_type (multi_lane 64 2) (splat src)))
(lower_splat_64x2 $I64X2 src))
(decl lower_splat_64x2 (Type Value) Xmm)
(rule (lower_splat_64x2 ty src)
(let (;; Force the input into a register so that we don't create a
;; VCodeConstant.
(src RegMem (RegMem.Reg src))
(vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
(vec_insert_lane ty vec src 1)))

cranelift/codegen/src/isa/x64/lower.rs (221 changed lines)

@@ -3,7 +3,7 @@
// ISLE integration glue.
pub(super) mod isle;
- use crate::ir::{types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type};
+ use crate::ir::{types, ExternalName, Inst as IRInst, LibCall, Opcode, Type};
use crate::isa::x64::abi::*;
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
@@ -160,100 +160,6 @@ fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
.constant
}
/// Emit an instruction to insert a value `src` into a lane of `dst`.
fn emit_insert_lane(ctx: &mut Lower<Inst>, src: RegMem, dst: Writable<Reg>, lane: u8, ty: Type) {
if !ty.is_float() {
let (sse_op, size) = match ty.lane_bits() {
8 => (SseOpcode::Pinsrb, OperandSize::Size32),
16 => (SseOpcode::Pinsrw, OperandSize::Size32),
32 => (SseOpcode::Pinsrd, OperandSize::Size32),
64 => (SseOpcode::Pinsrd, OperandSize::Size64),
_ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
};
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
} else if ty == types::F32 {
let sse_op = SseOpcode::Insertps;
// Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane
// shifted into bits 5:6).
let lane = 0b00_00_00_00 | lane << 4;
ctx.emit(Inst::xmm_rm_r_imm(
sse_op,
src,
dst,
lane,
OperandSize::Size32,
));
} else if ty == types::F64 {
let sse_op = match lane {
// Move the lowest quadword in replacement to vector without changing
// the upper bits.
0 => SseOpcode::Movsd,
// Move the low 64 bits of replacement vector to the high 64 bits of the
// vector.
1 => SseOpcode::Movlhps,
_ => unreachable!(),
};
// Here we use the `xmm_rm_r` encoding because it correctly tells the register
// allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
// encoding formats like `xmm_unary_rm_r` treat it as a `def`.
ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
} else {
panic!("unable to emit insertlane for type: {}", ty)
}
}
/// Emit an instruction to extract a lane of `src` into `dst`.
fn emit_extract_lane(ctx: &mut Lower<Inst>, src: Reg, dst: Writable<Reg>, lane: u8, ty: Type) {
if !ty.is_float() {
let (sse_op, size) = match ty.lane_bits() {
8 => (SseOpcode::Pextrb, OperandSize::Size32),
16 => (SseOpcode::Pextrw, OperandSize::Size32),
32 => (SseOpcode::Pextrd, OperandSize::Size32),
64 => (SseOpcode::Pextrd, OperandSize::Size64),
_ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
};
let src = RegMem::reg(src);
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
} else if ty == types::F32 || ty == types::F64 {
if lane == 0 {
// Remove the extractlane instruction, leaving the float where it is. The upper
// bits will remain unchanged; for correctness, this relies on Cranelift type
// checking to avoid using those bits.
ctx.emit(Inst::gen_move(dst, src, ty));
} else {
// Otherwise, shuffle the bits in `lane` to the lowest lane.
let sse_op = SseOpcode::Pshufd;
let mask = match ty {
// Move the value at `lane` to lane 0, copying existing value at lane 0 to
// other lanes. Again, this relies on Cranelift type checking to avoid
// using those bits.
types::F32 => {
assert!(lane > 0 && lane < 4);
0b00_00_00_00 | lane
}
// Move the value at `lane` 1 (we know it must be 1 because of the `if`
// statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
// checking assumption also applies here.
types::F64 => {
assert!(lane == 1);
0b11_10_11_10
}
_ => unreachable!(),
};
let src = RegMem::reg(src);
ctx.emit(Inst::xmm_rm_r_imm(
sse_op,
src,
dst,
mask,
OperandSize::Size32,
));
}
} else {
panic!("unable to emit extractlane for type: {}", ty)
}
}
fn emit_vm_call(
ctx: &mut Lower<Inst>,
flags: &Flags,
@@ -586,132 +492,15 @@ fn lower_insn_to_regs(
| Opcode::RawBitcast
| Opcode::Insertlane
| Opcode::Shuffle
- | Opcode::Swizzle => {
+ | Opcode::Swizzle
| Opcode::Extractlane
| Opcode::ScalarToVector
| Opcode::Splat => {
implemented_in_isle(ctx);
}
Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
Opcode::Extractlane => {
// The instruction format maps to variables like: %dst = extractlane %src, %lane
let ty = ty.unwrap();
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let src_ty = ctx.input_ty(insn, 0);
assert_eq!(src_ty.bits(), 128);
let src = put_input_in_reg(ctx, inputs[0]);
let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
*imm
} else {
unreachable!();
};
debug_assert!(lane < src_ty.lane_count() as u8);
emit_extract_lane(ctx, src, dst, lane, ty);
}
Opcode::ScalarToVector => {
// When moving a scalar value to a vector register, we must be handle several
// situations:
// 1. a scalar float is already in an XMM register, so we simply move it
// 2. a scalar of any other type resides in a GPR register: MOVD moves the bits to an
// XMM register and zeroes the upper bits
// 3. a scalar (float or otherwise) that has previously been loaded from memory (e.g.
// the default lowering of Wasm's `load[32|64]_zero`) can be lowered to a single
// MOVSS/MOVSD instruction; to do this, we rely on `input_to_reg_mem` to sink the
// unused load.
let src = input_to_reg_mem(ctx, inputs[0]);
let src_ty = ctx.input_ty(insn, 0);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let dst_ty = ty.unwrap();
assert!(src_ty == dst_ty.lane_type() && dst_ty.bits() == 128);
match src {
RegMem::Reg { reg } => {
if src_ty.is_float() {
// Case 1: when moving a scalar float, we simply move from one XMM register
// to another, expecting the register allocator to elide this. Here we
// assume that the upper bits of a scalar float have not been munged with
// (the same assumption the old backend makes).
ctx.emit(Inst::gen_move(dst, reg, dst_ty));
} else {
// Case 2: when moving a scalar value of any other type, use MOVD to zero
// the upper lanes.
let src_size = match src_ty.bits() {
32 => OperandSize::Size32,
64 => OperandSize::Size64,
_ => unimplemented!("invalid source size for type: {}", src_ty),
};
ctx.emit(Inst::gpr_to_xmm(SseOpcode::Movd, src, src_size, dst));
}
}
RegMem::Mem { .. } => {
// Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
// MOVSS/MOVSD instruction.
let opcode = match src_ty.bits() {
32 => SseOpcode::Movss,
64 => SseOpcode::Movsd,
_ => unimplemented!("unable to move scalar to vector for type: {}", src_ty),
};
ctx.emit(Inst::xmm_mov(opcode, src, dst));
}
}
}
Opcode::Splat => {
let ty = ty.unwrap();
assert_eq!(ty.bits(), 128);
let src_ty = ctx.input_ty(insn, 0);
assert!(src_ty.bits() < 128);
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// We know that splat will overwrite all of the lanes of `dst` but it takes several
// instructions to do so. Because of the multiple instructions, there is no good way to
// declare `dst` a `def` except with the following pseudo-instruction.
ctx.emit(Inst::xmm_uninit_value(dst));
// TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST*
// and VPBROADCAST*.
match ty.lane_bits() {
8 => {
emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
// Initialize a register with all 0s.
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
// Shuffle the lowest byte lane to all other lanes.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
}
16 => {
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
// Shuffle the lowest two lanes to all other lanes.
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::from(dst),
dst,
0,
OperandSize::Size32,
))
}
32 => {
emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
// Shuffle the lowest lane to all other lanes.
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::from(dst),
dst,
0,
OperandSize::Size32,
))
}
64 => {
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
}
_ => panic!("Invalid type to splat: {}", ty),
}
}
Opcode::VanyTrue => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let src_ty = ctx.input_ty(insn, 0);

cranelift/codegen/src/isa/x64/lower/isle.rs (4 changed lines)

@@ -306,10 +306,10 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
None
}
- fn sink_load(&mut self, load: &SinkableLoad) -> RegMemImm {
+ fn sink_load(&mut self, load: &SinkableLoad) -> RegMem {
self.lower_ctx.sink_inst(load.inst);
let addr = lower_to_amode(self.lower_ctx, load.addr_input, load.offset);
- RegMemImm::Mem {
+ RegMem::Mem {
addr: SyntheticAmode::Real(addr),
}
}

cranelift/codegen/src/machinst/isle.rs (18 changed lines)

@@ -298,6 +298,24 @@ macro_rules! isle_prelude_methods {
}
}
#[inline]
fn ty_32(&mut self, ty: Type) -> Option<Type> {
if ty.bits() == 32 {
Some(ty)
} else {
None
}
}
#[inline]
fn ty_64(&mut self, ty: Type) -> Option<Type> {
if ty.bits() == 64 {
Some(ty)
} else {
None
}
}
#[inline]
fn ty_32_or_64(&mut self, ty: Type) -> Option<Type> {
if ty.bits() == 32 || ty.bits() == 64 {

cranelift/codegen/src/prelude.isle (8 changed lines)

@@ -328,6 +328,14 @@
(decl fits_in_64 (Type) Type)
(extern extractor fits_in_64 fits_in_64)
;; An extractor that only matches types that fit in exactly 32 bits.
(decl ty_32 (Type) Type)
(extern extractor ty_32 ty_32)
;; An extractor that only matches types that fit in exactly 64 bits.
(decl ty_64 (Type) Type)
(extern extractor ty_64 ty_64)
;; A pure constructor that only matches scalar booleans, integers, and
;; references that can fit in 64 bits.
(decl pure ty_int_bool_ref_scalar_64 (Type) Type)

cranelift/filetests/filetests/isa/x64/extractlane.clif (87 changed lines)

@@ -0,0 +1,87 @@
test compile precise-output
target x86_64
function %f1(i8x16) -> i8 {
block0(v0: i8x16):
v1 = extractlane v0, 1
return v1
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pextrb $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f2(i16x8) -> i16 {
block0(v0: i16x8):
v1 = extractlane v0, 1
return v1
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pextrw $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f3(i32x4) -> i32 {
block0(v0: i32x4):
v1 = extractlane v0, 1
return v1
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pextrd $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f4(i64x2) -> i64 {
block0(v0: i64x2):
v1 = extractlane v0, 1
return v1
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pextrd.w $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f5(f32x4) -> f32 {
block0(v0: f32x4):
v1 = extractlane v0, 1
return v1
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
function %f6(f64x2) -> f64 {
block0(v0: f64x2):
v1 = extractlane v0, 1
return v1
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $238, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif (28 changed lines)

@@ -74,8 +74,8 @@ block0(v0: i8):
; block0:
; uninit %xmm0
; pinsrb $0, %xmm0, %rdi, %xmm0
- ; pxor %xmm6, %xmm6, %xmm6
+ ; pxor %xmm7, %xmm7, %xmm7
- ; pshufb %xmm0, %xmm6, %xmm0
+ ; pshufb %xmm0, %xmm7, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -90,11 +90,11 @@ block0:
; pushq %rbp
; movq %rsp, %rbp
; block0:
- ; movl $65535, %eax
+ ; movl $65535, %edi
- ; uninit %xmm0
+ ; uninit %xmm5
- ; pinsrw $0, %xmm0, %rax, %xmm0
+ ; pinsrw $0, %xmm5, %rdi, %xmm5
- ; pinsrw $1, %xmm0, %rax, %xmm0
+ ; pinsrw $1, %xmm5, %rdi, %xmm5
- ; pshufd $0, %xmm0, %xmm0
+ ; pshufd $0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -108,9 +108,9 @@ block0(v0: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
- ; uninit %xmm0
+ ; uninit %xmm4
- ; pinsrd $0, %xmm0, %rdi, %xmm0
+ ; pinsrd $0, %xmm4, %rdi, %xmm4
- ; pshufd $0, %xmm0, %xmm0
+ ; pshufd $0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -124,11 +124,11 @@ block0(v0: f64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
- ; movdqa %xmm0, %xmm4
+ ; movdqa %xmm0, %xmm6
; uninit %xmm0
- ; movdqa %xmm4, %xmm5
+ ; movdqa %xmm6, %xmm7
- ; movsd %xmm0, %xmm5, %xmm0
+ ; movsd %xmm0, %xmm7, %xmm0
- ; movlhps %xmm0, %xmm5, %xmm0
+ ; movlhps %xmm0, %xmm7, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
