x64: Add AVX support for some more float-related instructions (#6092)

* x64: Add AVX encodings of `vcvt{ss2sd,sd2ss}`

  Additionally update the instruction helpers to take an `XmmMem` argument to allow load sinking into the instruction.

* x64: Add AVX encoding of `sqrts{s,d}`

* x64: Add AVX support for `rounds{s,d}`
2 years ago · 0b0ac3ff73
8 changed files with 543 additions and 11 deletions
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@ -1299,6 +1299,12 @@
            Vpmovmskb
            Vcvtsi2ss
            Vcvtsi2sd
            Vcvtss2sd
            Vcvtsd2ss
            Vsqrtss
            Vsqrtsd
            Vroundss
            Vroundsd
          ))
 (type Avx512Opcode extern
@ -3348,11 +3354,17 @@
 (decl x64_roundss (XmmMem RoundImm) Xmm)
 (rule (x64_roundss src1 round)
      (xmm_unary_rm_r_imm (SseOpcode.Roundss) src1 (encode_round_imm round)))
 (rule 1 (x64_roundss src1 round)
        (if-let $true (use_avx_simd))
        (xmm_unary_rm_r_imm_vex (AvxOpcode.Vroundss) src1 (encode_round_imm round)))
 ;; Helper for creating `roundsd` instructions.
 (decl x64_roundsd (XmmMem RoundImm) Xmm)
 (rule (x64_roundsd src1 round)
      (xmm_unary_rm_r_imm (SseOpcode.Roundsd) src1 (encode_round_imm round)))
 (rule 1 (x64_roundsd src1 round)
        (if-let $true (use_avx_simd))
        (xmm_unary_rm_r_imm_vex (AvxOpcode.Vroundsd) src1 (encode_round_imm round)))
 ;; Helper for creating `roundps` instructions.
 (decl x64_roundps (XmmMem RoundImm) Xmm)
@ -3985,10 +3997,16 @@
 ;; Helper for creating `sqrtss` instructions.
 (decl x64_sqrtss (XmmMem) Xmm)
 (rule (x64_sqrtss x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtss) x))
 (rule 1 (x64_sqrtss x)
        (if-let $true (use_avx_simd))
        (xmm_unary_rm_r_vex (AvxOpcode.Vsqrtss) x))
 ;; Helper for creating `sqrtsd` instructions.
 (decl x64_sqrtsd (XmmMem) Xmm)
 (rule (x64_sqrtsd x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtsd) x))
 (rule 1 (x64_sqrtsd x)
        (if-let $true (use_avx_simd))
        (xmm_unary_rm_r_vex (AvxOpcode.Vsqrtsd) x))
 ;; Helper for creating `sqrtps` instructions.
 (decl x64_sqrtps (XmmMem) Xmm)
@ -4005,12 +4023,18 @@
        (xmm_unary_rm_r_vex (AvxOpcode.Vsqrtpd) x))
 ;; Helper for creating `cvtss2sd` instructions.
-(decl x64_cvtss2sd (Xmm) Xmm)
+(decl x64_cvtss2sd (XmmMem) Xmm)
-(rule (x64_cvtss2sd x) (xmm_unary_rm_r (SseOpcode.Cvtss2sd) x))
+(rule (x64_cvtss2sd x) (xmm_unary_rm_r_unaligned (SseOpcode.Cvtss2sd) x))
 (rule 1 (x64_cvtss2sd x)
        (if-let $true (use_avx_simd))
        (xmm_unary_rm_r_vex (AvxOpcode.Vcvtss2sd) x))
 ;; Helper for creating `cvtsd2ss` instructions.
-(decl x64_cvtsd2ss (Xmm) Xmm)
+(decl x64_cvtsd2ss (XmmMem) Xmm)
-(rule (x64_cvtsd2ss x) (xmm_unary_rm_r (SseOpcode.Cvtsd2ss) x))
+(rule (x64_cvtsd2ss x) (xmm_unary_rm_r_unaligned (SseOpcode.Cvtsd2ss) x))
 (rule 1 (x64_cvtsd2ss x)
        (if-let $true (use_avx_simd))
        (xmm_unary_rm_r_vex (AvxOpcode.Vcvtsd2ss) x))
 ;; Helper for creating `cvtdq2ps` instructions.
 (decl x64_cvtdq2ps (XmmMem) Xmm)
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@ -1722,7 +1722,13 @@ impl AvxOpcode {
            | AvxOpcode::Vmovmskpd
            | AvxOpcode::Vpmovmskb
            | AvxOpcode::Vcvtsi2ss
-            | AvxOpcode::Vcvtsi2sd => {
+            | AvxOpcode::Vcvtsi2sd
            | AvxOpcode::Vcvtss2sd
            | AvxOpcode::Vcvtsd2ss
            | AvxOpcode::Vsqrtss
            | AvxOpcode::Vsqrtsd
            | AvxOpcode::Vroundss
            | AvxOpcode::Vroundsd => {
                smallvec![InstructionSet::AVX]
            }
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@ -2405,17 +2405,36 @@ pub(crate) fn emit(
                AvxOpcode::Vbroadcastss => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x18),
                AvxOpcode::Vmovddup => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x12),
                AvxOpcode::Vcvtss2sd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5A),
                AvxOpcode::Vcvtsd2ss => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x5A),
                AvxOpcode::Vsqrtss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x51),
                AvxOpcode::Vsqrtsd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x51),
                _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
            };
-            VexInstruction::new()
+            let vex = VexInstruction::new()
                .length(VexVectorLength::V128)
                .prefix(prefix)
                .map(map)
                .opcode(opcode)
                .reg(dst.to_real_reg().unwrap().hw_enc())
-                .rm(src)
+                .rm(src);
-                .encode(sink);
+
            // These opcodes take a second operand through `vvvv` which copies
            // the upper bits into the destination register. That's not
            // reflected in the CLIF instruction, however, since the SSE version
            // doesn't have this functionality. Instead just copy whatever
            // happens to already be in the destination, which at least is what
            // LLVM seems to do.
            let vex = match op {
                AvxOpcode::Vcvtss2sd
                | AvxOpcode::Vcvtsd2ss
                | AvxOpcode::Vsqrtss
                | AvxOpcode::Vsqrtsd => vex.vvvv(dst.to_real_reg().unwrap().hw_enc()),
                _ => vex,
            };
            vex.encode(sink);
        }
        Inst::XmmUnaryRmRImmVex { op, src, dst, imm } => {
@ -2433,18 +2452,29 @@ pub(crate) fn emit(
                AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70),
                AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70),
                AvxOpcode::Vpshufd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x70),
                AvxOpcode::Vroundss => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0A),
                AvxOpcode::Vroundsd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0B),
                _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
            };
-            VexInstruction::new()
+            let vex = VexInstruction::new()
                .length(VexVectorLength::V128)
                .prefix(prefix)
                .map(map)
                .opcode(opcode)
                .reg(dst.to_real_reg().unwrap().hw_enc())
                .rm(src)
-                .imm(*imm)
+                .imm(*imm);
-                .encode(sink);
+
            // See comments in similar block above in `XmmUnaryRmRVex` for what
            // this is doing.
            let vex = match op {
                AvxOpcode::Vroundss | AvxOpcode::Vroundsd => {
                    vex.vvvv(dst.to_real_reg().unwrap().hw_enc())
                }
                _ => vex,
            };
            vex.encode(sink);
        }
        Inst::XmmMovRMVex { op, src, dst } => {
--- a/cranelift/filetests/filetests/isa/x64/ceil-avx.clif
+++ b/cranelift/filetests/filetests/isa/x64/ceil-avx.clif
@ -0,0 +1,104 @@
 test compile precise-output
 set enable_simd
 target x86_64 has_avx
 function %f1(f32) -> f32 {
 block0(v0: f32):
  v1 = ceil v0
  return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   vroundss $2, %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   vroundss $2, %xmm0, %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 function %f2(f64) -> f64 {
 block0(v0: f64):
  v1 = ceil v0
  return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   vroundsd $2, %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   vroundsd $2, %xmm0, %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 function %f4(f32x4) -> f32x4 {
 block0(v0: f32x4):
  v1 = ceil v0
  return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   vroundps $2, %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   vroundps $2, %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 function %f4(f64x2) -> f64x2 {
 block0(v0: f64x2):
  v1 = ceil v0
  return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   vroundpd $2, %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   vroundpd $2, %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
--- a/cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif
+++ b/cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif
@ -0,0 +1,130 @@
 test compile precise-output
 set enable_simd
 target x86_64 has_avx
 function %fpromote(f32) -> f64 {
 block0(v0: f32):
    v1 = fpromote.f64 v0
    return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   vcvtss2sd %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   vcvtss2sd %xmm0, %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 function %fpromote_load(i64, f32) -> f64 {
    ss0 = explicit_slot 16
 block0(v1: i64, v2: f32):
    v3 = stack_addr.i64 ss0
    store.f32 v2, v3
    v4 = load.f32 v3
    v5 = fpromote.f64 v4
    return v5
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ;   subq    %rsp, $16, %rsp
 ; block0:
 ;   lea     rsp(0 + virtual offset), %rdx
 ;   vmovss  %xmm0, 0(%rdx)
 ;   vcvtss2sd 0(%rdx), %xmm0
 ;   addq    %rsp, $16, %rsp
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ;   subq $0x10, %rsp
 ; block1: ; offset 0x8
 ;   leaq (%rsp), %rdx
 ;   vmovss %xmm0, (%rdx) ; trap: heap_oob
 ;   vcvtss2sd (%rdx), %xmm0, %xmm0 ; trap: heap_oob
 ;   addq $0x10, %rsp
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 function %fdemote(f64) -> f32 {
 block0(v0: f64):
    v1 = fdemote.f32 v0
    return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   vcvtsd2ss %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   vcvtsd2ss %xmm0, %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 function %fdemote_load(i64, f64) -> f32 {
    ss0 = explicit_slot 16
 block0(v1: i64, v2: f64):
    v3 = stack_addr.i64 ss0
    store.f64 v2, v3
    v4 = load.f64 v3
    v5 = fdemote.f32 v4
    return v5
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ;   subq    %rsp, $16, %rsp
 ; block0:
 ;   lea     rsp(0 + virtual offset), %rdx
 ;   vmovsd  %xmm0, 0(%rdx)
 ;   vcvtsd2ss 0(%rdx), %xmm0
 ;   addq    %rsp, $16, %rsp
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ;   subq $0x10, %rsp
 ; block1: ; offset 0x8
 ;   leaq (%rsp), %rdx
 ;   vmovsd %xmm0, (%rdx) ; trap: heap_oob
 ;   vcvtsd2ss (%rdx), %xmm0, %xmm0 ; trap: heap_oob
 ;   addq $0x10, %rsp
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
--- a/cranelift/filetests/filetests/isa/x64/fpromote-demote.clif
+++ b/cranelift/filetests/filetests/isa/x64/fpromote-demote.clif
@ -0,0 +1,130 @@
 test compile precise-output
 set enable_simd
 target x86_64
 function %fpromote(f32) -> f64 {
 block0(v0: f32):
    v1 = fpromote.f64 v0
    return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   cvtss2sd %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   cvtss2sd %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 function %fpromote_load(i64, f32) -> f64 {
    ss0 = explicit_slot 16
 block0(v1: i64, v2: f32):
    v3 = stack_addr.i64 ss0
    store.f32 v2, v3
    v4 = load.f32 v3
    v5 = fpromote.f64 v4
    return v5
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ;   subq    %rsp, $16, %rsp
 ; block0:
 ;   lea     rsp(0 + virtual offset), %rdx
 ;   movss   %xmm0, 0(%rdx)
 ;   cvtss2sd 0(%rdx), %xmm0
 ;   addq    %rsp, $16, %rsp
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ;   subq $0x10, %rsp
 ; block1: ; offset 0x8
 ;   leaq (%rsp), %rdx
 ;   movss %xmm0, (%rdx) ; trap: heap_oob
 ;   cvtss2sd (%rdx), %xmm0 ; trap: heap_oob
 ;   addq $0x10, %rsp
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 function %fdemote(f64) -> f32 {
 block0(v0: f64):
    v1 = fdemote.f32 v0
    return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   cvtsd2ss %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   cvtsd2ss %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 function %fdemote_load(i64, f64) -> f32 {
    ss0 = explicit_slot 16
 block0(v1: i64, v2: f64):
    v3 = stack_addr.i64 ss0
    store.f64 v2, v3
    v4 = load.f64 v3
    v5 = fdemote.f32 v4
    return v5
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ;   subq    %rsp, $16, %rsp
 ; block0:
 ;   lea     rsp(0 + virtual offset), %rdx
 ;   movsd   %xmm0, 0(%rdx)
 ;   cvtsd2ss 0(%rdx), %xmm0
 ;   addq    %rsp, $16, %rsp
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ;   subq $0x10, %rsp
 ; block1: ; offset 0x8
 ;   leaq (%rsp), %rdx
 ;   movsd %xmm0, (%rdx) ; trap: heap_oob
 ;   cvtsd2ss (%rdx), %xmm0 ; trap: heap_oob
 ;   addq $0x10, %rsp
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
--- a/cranelift/filetests/filetests/isa/x64/fsqrt-avx.clif
+++ b/cranelift/filetests/filetests/isa/x64/fsqrt-avx.clif
@ -0,0 +1,54 @@
 test compile precise-output
 set enable_simd
 target x86_64 has_avx
 function %sqrt_f32(f32) -> f32 {
 block0(v0: f32):
    v1 = sqrt v0
    return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   vsqrtss %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   vsqrtss %xmm0, %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 function %sqrt_f64(f64) -> f64 {
 block0(v0: f64):
    v1 = sqrt v0
    return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   vsqrtsd %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   vsqrtsd %xmm0, %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
--- a/cranelift/filetests/filetests/isa/x64/fsqrt.clif
+++ b/cranelift/filetests/filetests/isa/x64/fsqrt.clif
@ -0,0 +1,54 @@
 test compile precise-output
 set enable_simd
 target x86_64
 function %sqrt_f32(f32) -> f32 {
 block0(v0: f32):
    v1 = sqrt v0
    return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   sqrtss  %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   sqrtss %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 function %sqrt_f64(f64) -> f64 {
 block0(v0: f64):
    v1 = sqrt v0
    return v1
 }
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
 ;   sqrtsd  %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
 ; 
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   sqrtsd %xmm0, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq