diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index df9ebf6948..5891cbbe09 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -1838,21 +1838,29 @@
 ;;
 ;; This allows us to skip the mask expansion step and use the more efficient
 ;; vmerge.vvm instruction.
-(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (bitselect (icmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b) x y)))
+;;
+;; We must be careful to ensure that the mask and the vmerge use the same
+;; type. Otherwise we might generate a mask with 16 lanes (e.g. for i8x16)
+;; and then copy only the first few lanes of the result to the destination
+;; register, because the bitselect has a different lane count (e.g. i64x2).
+;;
+;; See: https://github.com/bytecodealliance/wasmtime/issues/8131
+
+(rule 2 (lower (has_type (ty_vec_fits_in_register _ty) (bitselect (icmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b) x y)))
   (let ((mask VReg (gen_icmp_mask cmp_ty cc a b)))
-    (rv_vmerge_vvm y x mask ty)))
+    (rv_vmerge_vvm y x mask cmp_ty)))
 
-(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (bitselect (fcmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b) x y)))
+(rule 2 (lower (has_type (ty_vec_fits_in_register _ty) (bitselect (fcmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b) x y)))
   (let ((mask VReg (gen_fcmp_mask cmp_ty cc a b)))
-    (rv_vmerge_vvm y x mask ty)))
+    (rv_vmerge_vvm y x mask cmp_ty)))
 
-(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (bitselect (bitcast _ (fcmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b)) x y)))
+(rule 2 (lower (has_type (ty_vec_fits_in_register _ty) (bitselect (bitcast _ (fcmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b)) x y)))
   (let ((mask VReg (gen_fcmp_mask cmp_ty cc a b)))
-    (rv_vmerge_vvm y x mask ty)))
+    (rv_vmerge_vvm y x mask cmp_ty)))
 
-(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (bitselect (bitcast _ (icmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b)) x y)))
+(rule 2 (lower (has_type (ty_vec_fits_in_register _ty) (bitselect (bitcast _ (icmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b)) x y)))
   (let ((mask VReg (gen_icmp_mask cmp_ty cc a b)))
-    (rv_vmerge_vvm y x mask ty)))
+    (rv_vmerge_vvm y x mask cmp_ty)))
 
 ;;;;; Rules for `isplit`;;;;;;;;;
 
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif
index 28c0092ac8..eb91868954 100644
--- a/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif
@@ -410,3 +410,115 @@ block0(v0: i64x2, v1: i64x2, v2: f64x2, v3: f64x2):
 ;   addi sp, sp, 0x10
 ;   ret
 
+function %bitselect_i8x16_fcmp_f64x2(i8x16) -> i8x16 fast {
+    const0 = 0x00000000000000000000000000000000
+
+block0(v0: i8x16):
+    v1 = bitcast.f64x2 little v0
+    v2 = fcmp eq v1, v1
+    v3 = bitcast.i8x16 little v2
+    v4 = vconst.i8x16 const0
+    v5 = bitselect.i8x16 v3, v0, v4
+    return v5
+}
+
+; VCode:
+;   addi sp,sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v9,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vle8.v v14,[const(0)] #avl=16, #vtype=(e8, m1, ta, ma)
+;   vmfeq.vv v0,v9,v9 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vmerge.vvm v15,v14,v9,v0.t #avl=2, #vtype=(e64, m1, ta, ma)
+;   vse8.v v15,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   addi sp,sp,16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   mv s0, sp
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x84, 0x0f, 0x02
+;   auipc t6, 0
+;   addi t6, t6, 0x34
+;   .byte 0x07, 0x87, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0x57, 0x90, 0x94, 0x62
+;   .byte 0xd7, 0x87, 0xe4, 0x5c
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0xa7, 0x07, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+
+function %bitselect_i8x16_icmp_i64x2(i8x16) -> i8x16 fast {
+    const0 = 0x00000000000000000000000000000000
+
+block0(v0: i8x16):
+    v1 = bitcast.i64x2 little v0
+    v2 = icmp eq v1, v1
+    v3 = bitcast.i8x16 little v2
+    v4 = vconst.i8x16 const0
+    v5 = bitselect.i8x16 v3, v0, v4
+    return v5
+}
+
+; VCode:
+;   addi sp,sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v9,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vle8.v v14,[const(0)] #avl=16, #vtype=(e8, m1, ta, ma)
+;   vmseq.vv v0,v9,v9 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vmerge.vvm v15,v14,v9,v0.t #avl=2, #vtype=(e64, m1, ta, ma)
+;   vse8.v v15,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   addi sp,sp,16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   mv s0, sp
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x84, 0x0f, 0x02
+;   auipc t6, 0
+;   addi t6, t6, 0x34
+;   .byte 0x07, 0x87, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0x57, 0x80, 0x94, 0x62
+;   .byte 0xd7, 0x87, 0xe4, 0x5c
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0xa7, 0x07, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+
diff --git a/cranelift/filetests/filetests/runtests/simd-bitselect.clif b/cranelift/filetests/filetests/runtests/simd-bitselect.clif
index 7d250c779c..6c399ee034 100644
--- a/cranelift/filetests/filetests/runtests/simd-bitselect.clif
+++ b/cranelift/filetests/filetests/runtests/simd-bitselect.clif
@@ -94,3 +94,37 @@ block0(v0: i64x2, v1: i64x2, v2: i64x2):
 ; run: %bitwise_bitselect_i64x2(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111
 ; run: %bitwise_bitselect_i64x2(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
 ; run: %bitwise_bitselect_i64x2(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000
+
+
+;; See issue #8131
+;;
+;; These tests exercise the fusion of `bitselect+bitcast+{f,i}cmp` that
+;; some backends perform. Importantly, the `fcmp`/`icmp` type and the
+;; `bitselect` type differ in both their lane size and their number
+;; of lanes.
+
+function %bitselect_i8x16_fcmp_f64x2(i8x16) -> i8x16 fast {
+    const0 = 0x00000000000000000000000000000000
+
+block0(v0: i8x16):
+    v1 = bitcast.f64x2 little v0
+    v2 = fcmp eq v1, v1
+    v3 = bitcast.i8x16 little v2
+    v4 = vconst.i8x16 const0
+    v5 = bitselect.i8x16 v3, v0, v4  ; v4 = const0
+    return v5
+}
+; run: %bitselect_i8x16_fcmp_f64x2(0x80808080808080808080808080808080) == 0x80808080808080808080808080808080
+
+function %bitselect_i8x16_icmp_i64x2(i8x16) -> i8x16 fast {
+    const0 = 0x00000000000000000000000000000000
+
+block0(v0: i8x16):
+    v1 = bitcast.i64x2 little v0
+    v2 = icmp eq v1, v1
+    v3 = bitcast.i8x16 little v2
+    v4 = vconst.i8x16 const0
+    v5 = bitselect.i8x16 v3, v0, v4  ; v4 = const0
+    return v5
+}
+; run: %bitselect_i8x16_icmp_i64x2(0x80808080808080808080808080808080) == 0x80808080808080808080808080808080
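
A sketch of the mis-lowering that the comment added to lower.isle describes, for the i8x16 bitselect of an i64x2 icmp exercised by the tests above. This is reconstructed from the lowering rules, not captured compiler output; the register numbers mirror the VCode expectations, and the explicit vsetivli lines stand in for the #avl/#vtype annotations printed by the filetest.

    # Before this patch: the mask was built with the comparison type
    # (i64x2, avl=2) but the merge was emitted with the bitselect type
    # (i8x16, avl=16).
    vsetivli zero, 2, e64, m1, ta, ma
    vmseq.vv   v0, v9, v9        # writes mask bits 0 and 1 only
    vsetivli zero, 16, e8, m1, ta, ma
    vmerge.vvm v15, v14, v9, v0  # consumes mask bits 0..15, but bits 2..15
                                 # were never written by the compare

    # After this patch: the merge reuses the comparison type, so every
    # mask bit it consumes was written by the compare.
    vsetivli zero, 2, e64, m1, ta, ma
    vmseq.vv   v0, v9, v9
    vmerge.vvm v15, v14, v9, v0

Merging at e64 rather than e8 still produces the bytes the bitselect asks for: the compare yields all-ones or all-zeros per 64-bit lane, so selecting whole e64 lanes from the same two source registers is bitwise identical to an e8 merge under the fully expanded mask.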