Browse Source

NaN-canonicalization without branching on x64 (#8313)

* NaN-canonicalization without branching on x64

Modify the cranelift pass that performs NaN-canonicalization to avoid
branches on x64. The current implementation uses two branches.

* remove old fcmp case

* Revert "remove old fcmp case"

This reverts commit 48c3712b7e.

* add filetests

* use old version for riscv
pull/8318/head
Adam Bratschi-Kaye 7 months ago
committed by GitHub
parent
commit
72a3b8b99d
No known key found for this signature in database. GPG Key ID: B5690EEEBB952194
  1. 11
      cranelift/codegen/src/context.rs
  2. 1
      cranelift/codegen/src/isa/x64/lower.isle
  3. 38
      cranelift/codegen/src/nan_canonicalization.rs
  4. 140
      cranelift/filetests/filetests/isa/aarch64/nan-canonicalization.clif
  5. 160
      cranelift/filetests/filetests/isa/riscv64/nan-canonicalization-has_v.clif
  6. 66
      cranelift/filetests/filetests/isa/riscv64/nan-canonicalization.clif
  7. 112
      cranelift/filetests/filetests/isa/s390x/nan-canonicalization.clif
  8. 140
      cranelift/filetests/filetests/isa/x64/nan-canonicalization-sse41.clif
  9. 131
      cranelift/filetests/filetests/isa/x64/nan-canonicalization.clif

11
cranelift/codegen/src/context.rs

@ -31,6 +31,7 @@ use crate::{timing, CompileError};
use alloc::string::String;
use alloc::vec::Vec;
use cranelift_control::ControlPlane;
use target_lexicon::Architecture;
#[cfg(feature = "souper-harvest")]
use crate::souper_harvest::do_souper_harvest;
@ -282,7 +283,15 @@ impl Context {
/// Perform NaN canonicalizing rewrites on the function.
pub fn canonicalize_nans(&mut self, isa: &dyn TargetIsa) -> CodegenResult<()> {
do_nan_canonicalization(&mut self.func);
// Currently, RiscV64 is the only arch that may not have vector support.
let has_vector_support = match isa.triple().architecture {
Architecture::Riscv64(_) => match isa.isa_flags().iter().find(|f| f.name == "has_v") {
Some(value) => value.as_bool().unwrap_or(false),
None => false,
},
_ => true,
};
do_nan_canonicalization(&mut self.func, has_vector_support);
self.verify_if(isa)
}

1
cranelift/codegen/src/isa/x64/lower.isle

@ -1428,6 +1428,7 @@
(decl pure partial all_ones_or_all_zeros (Value) bool)
(rule (all_ones_or_all_zeros (and (icmp _ _ _) (value_type (multi_lane _ _)))) $true)
(rule (all_ones_or_all_zeros (and (fcmp _ _ _) (value_type (multi_lane _ _)))) $true)
(rule (all_ones_or_all_zeros (and (bitcast _ (fcmp _ _ _)) (value_type (multi_lane _ _)))) $true)
(rule (all_ones_or_all_zeros (vconst (vconst_all_ones_or_all_zeros))) $true)
(decl pure vconst_all_ones_or_all_zeros () Constant)

38
cranelift/codegen/src/nan_canonicalization.rs

@ -5,7 +5,7 @@
use crate::cursor::{Cursor, FuncCursor};
use crate::ir::condcodes::FloatCC;
use crate::ir::immediates::{Ieee32, Ieee64};
use crate::ir::types;
use crate::ir::types::{self};
use crate::ir::{Function, Inst, InstBuilder, InstructionData, Opcode, Value};
use crate::opts::MemFlags;
use crate::timing;
@ -15,13 +15,13 @@ static CANON_32BIT_NAN: u32 = 0b01111111110000000000000000000000;
static CANON_64BIT_NAN: u64 = 0b0111111111111000000000000000000000000000000000000000000000000000;
/// Perform the NaN canonicalization pass.
pub fn do_nan_canonicalization(func: &mut Function) {
pub fn do_nan_canonicalization(func: &mut Function, has_vector_support: bool) {
let _tt = timing::canonicalize_nans();
let mut pos = FuncCursor::new(func);
while let Some(_block) = pos.next_block() {
while let Some(inst) = pos.next_inst() {
if is_fp_arith(&mut pos, inst) {
add_nan_canon_seq(&mut pos, inst);
add_nan_canon_seq(&mut pos, inst, has_vector_support);
}
}
}
@ -57,7 +57,7 @@ fn is_fp_arith(pos: &mut FuncCursor, inst: Inst) -> bool {
}
/// Append a sequence of canonicalizing instructions after the given instruction.
fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) {
fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst, has_vector_support: bool) {
// Select the instruction result, result type. Replace the instruction
// result and step forward before inserting the canonicalization sequence.
let val = pos.func.dfg.first_result(inst);
@ -65,16 +65,28 @@ fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) {
let new_res = pos.func.dfg.replace_result(val, val_type);
let _next_inst = pos.next_inst().expect("block missing terminator!");
// Insert a comparison instruction, to check if `inst_res` is NaN. Select
// the canonical NaN value if `val` is NaN, assign the result to `inst`.
let is_nan = pos.ins().fcmp(FloatCC::NotEqual, new_res, new_res);
// Insert a comparison instruction, to check if `inst_res` is NaN (comparing
// against NaN is always unordered). Select the canonical NaN value if `val`
// is NaN, assign the result to `inst`.
let comparison = FloatCC::Unordered;
let vectorized_scalar_select = |pos: &mut FuncCursor, canon_nan: Value, ty: types::Type| {
let canon_nan = pos.ins().scalar_to_vector(ty, canon_nan);
let new_res = pos.ins().scalar_to_vector(ty, new_res);
let is_nan = pos.ins().fcmp(comparison, new_res, new_res);
let is_nan = pos.ins().bitcast(ty, MemFlags::new(), is_nan);
let simd_result = pos.ins().bitselect(is_nan, canon_nan, new_res);
pos.ins().with_result(val).extractlane(simd_result, 0);
};
let scalar_select = |pos: &mut FuncCursor, canon_nan: Value| {
let is_nan = pos.ins().fcmp(comparison, new_res, new_res);
pos.ins()
.with_result(val)
.select(is_nan, canon_nan, new_res);
};
let vector_select = |pos: &mut FuncCursor, canon_nan: Value| {
let is_nan = pos.ins().fcmp(comparison, new_res, new_res);
let is_nan = pos.ins().bitcast(val_type, MemFlags::new(), is_nan);
pos.ins()
.with_result(val)
@ -84,11 +96,19 @@ fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) {
match val_type {
types::F32 => {
let canon_nan = pos.ins().f32const(Ieee32::with_bits(CANON_32BIT_NAN));
scalar_select(pos, canon_nan);
if has_vector_support {
vectorized_scalar_select(pos, canon_nan, types::F32X4);
} else {
scalar_select(pos, canon_nan);
}
}
types::F64 => {
let canon_nan = pos.ins().f64const(Ieee64::with_bits(CANON_64BIT_NAN));
scalar_select(pos, canon_nan);
if has_vector_support {
vectorized_scalar_select(pos, canon_nan, types::F64X2);
} else {
scalar_select(pos, canon_nan);
}
}
types::F32X4 => {
let canon_nan = pos.ins().f32const(Ieee32::with_bits(CANON_32BIT_NAN));

140
cranelift/filetests/filetests/isa/aarch64/nan-canonicalization.clif

@ -0,0 +1,140 @@
test compile precise-output
set enable_nan_canonicalization=true
target x86_64 sse41
function %f0(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fadd v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addps %xmm0, %xmm1, %xmm0
; movl $2143289344, %r10d
; movd %r10d, %xmm7
; shufps $0, %xmm7, const(0), %xmm7
; movdqa %xmm0, %xmm1
; cmpps $3, %xmm1, %xmm0, %xmm1
; movdqa %xmm0, %xmm2
; movdqa %xmm1, %xmm0
; movdqa %xmm2, %xmm1
; pblendvb %xmm1, %xmm7, %xmm1
; movdqa %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addps %xmm1, %xmm0
; movl $0x7fc00000, %r10d
; movd %r10d, %xmm7
; shufps $0, 0x26(%rip), %xmm7
; movdqa %xmm0, %xmm1
; cmpunordps %xmm0, %xmm1
; movdqa %xmm0, %xmm2
; movdqa %xmm1, %xmm0
; movdqa %xmm2, %xmm1
; pblendvb %xmm0, %xmm7, %xmm1
; movdqa %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; sarb $0, (%rdi)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
function %f1(f64, f64) -> f64 {
block0(v0: f64, v1: f64):
v2 = fadd v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addsd %xmm0, %xmm1, %xmm0
; movabsq $9221120237041090560, %r9
; movq %r9, %xmm1
; movdqa %xmm0, %xmm7
; cmppd $3, %xmm7, %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addsd %xmm1, %xmm0
; movabsq $0x7ff8000000000000, %r9
; movq %r9, %xmm1
; movdqa %xmm0, %xmm7
; cmpunordpd %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm0, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f1(f32, f32) -> f32 {
block0(v0: f32, v1: f32):
v2 = fadd v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addss %xmm0, %xmm1, %xmm0
; movl $2143289344, %r9d
; movd %r9d, %xmm1
; movdqa %xmm0, %xmm7
; cmpps $3, %xmm7, %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addss %xmm1, %xmm0
; movl $0x7fc00000, %r9d
; movd %r9d, %xmm1
; movdqa %xmm0, %xmm7
; cmpunordps %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm0, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

160
cranelift/filetests/filetests/isa/riscv64/nan-canonicalization-has_v.clif

@ -0,0 +1,160 @@
test compile precise-output
set enable_nan_canonicalization=true
target riscv64 has_v
function %f0(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fadd v0, v1
return v2
}
; VCode:
; addi sp,sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v9,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v11,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vfadd.vv v13,v9,v11 #avl=4, #vtype=(e32, m1, ta, ma)
; lui a1,523264
; fmv.w.x fa2,a1
; vfmv.v.f v14,fa2 #avl=4, #vtype=(e32, m1, ta, ma)
; vmfne.vv v10,v13,v13 #avl=4, #vtype=(e32, m1, ta, ma)
; vmfne.vv v12,v13,v13 #avl=4, #vtype=(e32, m1, ta, ma)
; vmor.mm v0,v10,v12 #avl=4, #vtype=(e32, m1, ta, ma)
; vmerge.vvm v8,v13,v14,v0.t #avl=4, #vtype=(e32, m1, ta, ma)
; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; addi sp,sp,16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; mv s0, sp
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x84, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x85, 0x0f, 0x02
; .byte 0x57, 0x70, 0x02, 0xcd
; .byte 0xd7, 0x96, 0x95, 0x02
; lui a1, 0x7fc00
; fmv.w.x fa2, a1
; .byte 0x57, 0x57, 0x06, 0x5e
; .byte 0x57, 0x95, 0xd6, 0x72
; .byte 0x57, 0x96, 0xd6, 0x72
; .byte 0x57, 0x20, 0xa6, 0x6a
; .byte 0x57, 0x04, 0xd7, 0x5c
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x04, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret
function %f1(f64, f64) -> f64 {
block0(v0: f64, v1: f64):
v2 = fadd v0, v1
return v2
}
; VCode:
; block0:
; fadd.d fa1,fa0,fa1,rne
; lui a5,4095
; slli a1,a5,39
; fmv.d.x fa3,a1
; vmv.v.x v8,zero #avl=2, #vtype=(e64, m1, ta, ma)
; vfmv.s.f v10,fa3 #avl=2, #vtype=(e64, m1, ta, ma)
; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma)
; vmerge.vvm v14,v8,v10,v0.t #avl=2, #vtype=(e64, m1, ta, ma)
; vmv.v.x v8,zero #avl=2, #vtype=(e64, m1, ta, ma)
; vfmv.s.f v10,fa1 #avl=2, #vtype=(e64, m1, ta, ma)
; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma)
; vmerge.vvm v15,v8,v10,v0.t #avl=2, #vtype=(e64, m1, ta, ma)
; vmfne.vv v8,v15,v15 #avl=2, #vtype=(e64, m1, ta, ma)
; vmfne.vv v10,v15,v15 #avl=2, #vtype=(e64, m1, ta, ma)
; vmor.mm v0,v8,v10 #avl=2, #vtype=(e64, m1, ta, ma)
; vmerge.vvm v8,v15,v14,v0.t #avl=2, #vtype=(e64, m1, ta, ma)
; vfmv.f.s fa0,v8 #avl=2, #vtype=(e64, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fadd.d fa1, fa0, fa1, rne
; lui a5, 0xfff
; slli a1, a5, 0x27
; fmv.d.x fa3, a1
; .byte 0x57, 0x70, 0x81, 0xcd
; .byte 0x57, 0x44, 0x00, 0x5e
; .byte 0x57, 0xd5, 0x06, 0x42
; .byte 0x57, 0xb0, 0x00, 0x5e
; .byte 0x57, 0x07, 0x85, 0x5c
; .byte 0x57, 0x44, 0x00, 0x5e
; .byte 0x57, 0xd5, 0x05, 0x42
; .byte 0x57, 0xb0, 0x00, 0x5e
; .byte 0xd7, 0x07, 0x85, 0x5c
; .byte 0x57, 0x94, 0xf7, 0x72
; .byte 0x57, 0x95, 0xf7, 0x72
; .byte 0x57, 0x20, 0x85, 0x6a
; .byte 0x57, 0x04, 0xf7, 0x5c
; .byte 0x57, 0x15, 0x80, 0x42
; ret
function %f1(f32, f32) -> f32 {
block0(v0: f32, v1: f32):
v2 = fadd v0, v1
return v2
}
; VCode:
; block0:
; fadd.s fa0,fa0,fa1,rne
; lui a5,523264
; fmv.w.x fa1,a5
; vmv.v.x v15,zero #avl=4, #vtype=(e32, m1, ta, ma)
; vfmv.s.f v9,fa1 #avl=4, #vtype=(e32, m1, ta, ma)
; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma)
; vmerge.vvm v13,v15,v9,v0.t #avl=4, #vtype=(e32, m1, ta, ma)
; vmv.v.x v15,zero #avl=4, #vtype=(e32, m1, ta, ma)
; vfmv.s.f v9,fa0 #avl=4, #vtype=(e32, m1, ta, ma)
; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma)
; vmerge.vvm v14,v15,v9,v0.t #avl=4, #vtype=(e32, m1, ta, ma)
; vmfne.vv v15,v14,v14 #avl=4, #vtype=(e32, m1, ta, ma)
; vmfne.vv v9,v14,v14 #avl=4, #vtype=(e32, m1, ta, ma)
; vmor.mm v0,v15,v9 #avl=4, #vtype=(e32, m1, ta, ma)
; vmerge.vvm v15,v14,v13,v0.t #avl=4, #vtype=(e32, m1, ta, ma)
; vfmv.f.s fa0,v15 #avl=4, #vtype=(e32, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fadd.s fa0, fa0, fa1, rne
; lui a5, 0x7fc00
; fmv.w.x fa1, a5
; .byte 0x57, 0x70, 0x02, 0xcd
; .byte 0xd7, 0x47, 0x00, 0x5e
; .byte 0xd7, 0xd4, 0x05, 0x42
; .byte 0x57, 0x70, 0x81, 0xcd
; .byte 0x57, 0xb0, 0x00, 0x5e
; .byte 0x57, 0x70, 0x02, 0xcd
; .byte 0xd7, 0x86, 0xf4, 0x5c
; .byte 0xd7, 0x47, 0x00, 0x5e
; .byte 0xd7, 0x54, 0x05, 0x42
; .byte 0x57, 0x70, 0x81, 0xcd
; .byte 0x57, 0xb0, 0x00, 0x5e
; .byte 0x57, 0x70, 0x02, 0xcd
; .byte 0x57, 0x87, 0xf4, 0x5c
; .byte 0xd7, 0x17, 0xe7, 0x72
; .byte 0xd7, 0x14, 0xe7, 0x72
; .byte 0x57, 0xa0, 0xf4, 0x6a
; .byte 0xd7, 0x87, 0xe6, 0x5c
; .byte 0x57, 0x15, 0xf0, 0x42
; ret

66
cranelift/filetests/filetests/isa/riscv64/nan-canonicalization.clif

@ -0,0 +1,66 @@
test compile precise-output
set enable_nan_canonicalization=true
target riscv64
function %f1(f64, f64) -> f64 {
block0(v0: f64, v1: f64):
v2 = fadd v0, v1
return v2
}
; VCode:
; block0:
; fadd.d fa4,fa0,fa1,rne
; lui a2,4095
; slli a4,a2,39
; fmv.d.x fa0,a4
; feq.d a1,fa4,fa4
; feq.d a3,fa4,fa4
; and a5,a1,a3
; select fa0,fa0,fa4##condition=(a5 eq zero)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fadd.d fa4, fa0, fa1, rne
; lui a2, 0xfff
; slli a4, a2, 0x27
; fmv.d.x fa0, a4
; feq.d a1, fa4, fa4
; feq.d a3, fa4, fa4
; and a5, a1, a3
; beqz a5, 8
; fmv.d fa0, fa4
; ret
function %f1(f32, f32) -> f32 {
block0(v0: f32, v1: f32):
v2 = fadd v0, v1
return v2
}
; VCode:
; block0:
; fadd.s fa3,fa0,fa1,rne
; lui a2,523264
; fmv.w.x fa4,a2
; feq.s a0,fa3,fa3
; feq.s a2,fa3,fa3
; and a4,a0,a2
; select fa0,fa4,fa3##condition=(a4 eq zero)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fadd.s fa3, fa0, fa1, rne
; lui a2, 0x7fc00
; fmv.w.x fa4, a2
; feq.s a0, fa3, fa3
; feq.s a2, fa3, fa3
; and a4, a0, a2
; bnez a4, 0xc
; fmv.d fa0, fa4
; j 8
; fmv.d fa0, fa3
; ret

112
cranelift/filetests/filetests/isa/s390x/nan-canonicalization.clif

@ -0,0 +1,112 @@
test compile precise-output
set enable_nan_canonicalization=true
target s390x
function %f0(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fadd v0, v1
return v2
}
; VCode:
; block0:
; vfasb %v17, %v24, %v25
; bras %r1, 8 ; data.f32 NaN ; vlef %v18, 0(%r1), 0
; vrepf %v18, %v18, 0
; vfchesb %v7, %v17, %v17
; vfchesb %v19, %v17, %v17
; vno %v19, %v7, %v19
; vsel %v24, %v18, %v17, %v19
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; vfasb %v17, %v24, %v25
; bras %r1, 0xe
; su %f12, 0
; vlef %v18, 0(%r1), 0
; vrepf %v18, %v18, 0
; vfchesb %v7, %v17, %v17
; vfchesb %v19, %v17, %v17
; vno %v19, %v7, %v19
; vsel %v24, %v18, %v17, %v19
; br %r14
function %f1(f64, f64) -> f64 {
block0(v0: f64, v1: f64):
v2 = fadd v0, v1
return v2
}
; VCode:
; block0:
; wfadb %v21, %f0, %f2
; bras %r1, 12 ; data.f64 NaN ; vleg %v22, 0(%r1), 0
; vgbm %v20, 0
; vpdi %v22, %v22, %v20, 0
; vgbm %v20, 0
; vpdi %v23, %v21, %v20, 0
; vfchedb %v19, %v23, %v23
; vfchedb %v21, %v23, %v23
; vno %v24, %v19, %v21
; vsel %v21, %v22, %v23, %v24
; vrepg %v0, %v21, 0
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; wfadb %v21, %f0, %f2
; bras %r1, 0x12
; su %f15, 0(%r8)
; .byte 0x00, 0x00
; .byte 0x00, 0x00
; vleg %v22, 0(%r1), 0
; vzero %v20
; vpdi %v22, %v22, %v20, 0
; vzero %v20
; vpdi %v23, %v21, %v20, 0
; vfchedb %v19, %v23, %v23
; vfchedb %v21, %v23, %v23
; vno %v24, %v19, %v21
; vsel %v21, %v22, %v23, %v24
; vrepg %v0, %v21, 0
; br %r14
function %f1(f32, f32) -> f32 {
block0(v0: f32, v1: f32):
v2 = fadd v0, v1
return v2
}
; VCode:
; block0:
; wfasb %v21, %f0, %f2
; bras %r1, 8 ; data.f32 NaN ; vlef %v22, 0(%r1), 0
; vgbm %v20, 61440
; vn %v22, %v22, %v20
; vgbm %v20, 61440
; vn %v23, %v21, %v20
; vfchesb %v19, %v23, %v23
; vfchesb %v21, %v23, %v23
; vno %v24, %v19, %v21
; vsel %v21, %v22, %v23, %v24
; vrepf %v0, %v21, 0
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; wfasb %v21, %f0, %f2
; bras %r1, 0xe
; su %f12, 0
; vlef %v22, 0(%r1), 0
; vgbm %v20, 0xf000
; vn %v22, %v22, %v20
; vgbm %v20, 0xf000
; vn %v23, %v21, %v20
; vfchesb %v19, %v23, %v23
; vfchesb %v21, %v23, %v23
; vno %v24, %v19, %v21
; vsel %v21, %v22, %v23, %v24
; vrepf %v0, %v21, 0
; br %r14

140
cranelift/filetests/filetests/isa/x64/nan-canonicalization-sse41.clif

@ -0,0 +1,140 @@
test compile precise-output
set enable_nan_canonicalization=true
target x86_64 sse41
function %f0(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fadd v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addps %xmm0, %xmm1, %xmm0
; movl $2143289344, %r10d
; movd %r10d, %xmm7
; shufps $0, %xmm7, const(0), %xmm7
; movdqa %xmm0, %xmm1
; cmpps $3, %xmm1, %xmm0, %xmm1
; movdqa %xmm0, %xmm2
; movdqa %xmm1, %xmm0
; movdqa %xmm2, %xmm1
; pblendvb %xmm1, %xmm7, %xmm1
; movdqa %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addps %xmm1, %xmm0
; movl $0x7fc00000, %r10d
; movd %r10d, %xmm7
; shufps $0, 0x26(%rip), %xmm7
; movdqa %xmm0, %xmm1
; cmpunordps %xmm0, %xmm1
; movdqa %xmm0, %xmm2
; movdqa %xmm1, %xmm0
; movdqa %xmm2, %xmm1
; pblendvb %xmm0, %xmm7, %xmm1
; movdqa %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; sarb $0, (%rdi)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
function %f1(f64, f64) -> f64 {
block0(v0: f64, v1: f64):
v2 = fadd v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addsd %xmm0, %xmm1, %xmm0
; movabsq $9221120237041090560, %r9
; movq %r9, %xmm1
; movdqa %xmm0, %xmm7
; cmppd $3, %xmm7, %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addsd %xmm1, %xmm0
; movabsq $0x7ff8000000000000, %r9
; movq %r9, %xmm1
; movdqa %xmm0, %xmm7
; cmpunordpd %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm0, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f1(f32, f32) -> f32 {
block0(v0: f32, v1: f32):
v2 = fadd v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addss %xmm0, %xmm1, %xmm0
; movl $2143289344, %r9d
; movd %r9d, %xmm1
; movdqa %xmm0, %xmm7
; cmpps $3, %xmm7, %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addss %xmm1, %xmm0
; movl $0x7fc00000, %r9d
; movd %r9d, %xmm1
; movdqa %xmm0, %xmm7
; cmpunordps %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm0, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

131
cranelift/filetests/filetests/isa/x64/nan-canonicalization.clif

@ -0,0 +1,131 @@
test compile precise-output
set enable_nan_canonicalization=true
target x86_64
function %f0(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fadd v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addps %xmm0, %xmm1, %xmm0
; movdqa %xmm0, %xmm1
; movl $2143289344, %esi
; movd %esi, %xmm5
; shufps $0, %xmm5, const(0), %xmm5
; cmpps $3, %xmm0, %xmm1, %xmm0
; andps %xmm5, %xmm0, %xmm5
; andnps %xmm0, %xmm1, %xmm0
; orps %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addps %xmm1, %xmm0
; movdqa %xmm0, %xmm1
; movl $0x7fc00000, %esi
; movd %esi, %xmm5
; shufps $0, 0x14(%rip), %xmm5
; cmpunordps %xmm1, %xmm0
; andps %xmm0, %xmm5
; andnps %xmm1, %xmm0
; orps %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
; addb %al, (%rax)
; addb %al, (%rax)
; sarb $0, (%rdi)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
function %f1(f64, f64) -> f64 {
block0(v0: f64, v1: f64):
v2 = fadd v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addsd %xmm0, %xmm1, %xmm0
; movdqa %xmm0, %xmm7
; movabsq $9221120237041090560, %r11
; movq %r11, %xmm5
; cmppd $3, %xmm0, %xmm7, %xmm0
; andpd %xmm5, %xmm0, %xmm5
; andnpd %xmm0, %xmm7, %xmm0
; orpd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addsd %xmm1, %xmm0
; movdqa %xmm0, %xmm7
; movabsq $0x7ff8000000000000, %r11
; movq %r11, %xmm5
; cmpunordpd %xmm7, %xmm0
; andpd %xmm0, %xmm5
; andnpd %xmm7, %xmm0
; orpd %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f1(f32, f32) -> f32 {
block0(v0: f32, v1: f32):
v2 = fadd v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addss %xmm0, %xmm1, %xmm0
; movdqa %xmm0, %xmm7
; movl $2143289344, %r11d
; movd %r11d, %xmm5
; cmpps $3, %xmm0, %xmm7, %xmm0
; andps %xmm5, %xmm0, %xmm5
; andnps %xmm0, %xmm7, %xmm0
; orps %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addss %xmm1, %xmm0
; movdqa %xmm0, %xmm7
; movl $0x7fc00000, %r11d
; movd %r11d, %xmm5
; cmpunordps %xmm7, %xmm0
; andps %xmm0, %xmm5
; andnps %xmm7, %xmm0
; orps %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
Loading…
Cancel
Save