riscv64: Ensure that we use the same vector length when lowering `bitselect+bitcast+{i,f}cmp` (#8133)

We have a special lowering that allows us to fuse a `bitselect` with a comparison instruction. This saves us a few instructions due to the mismatch that exists between native RISC-V masks and WASM masks. Native RISC-V masks have a single bit per lane, whereas WASM masks have all bits in a lane set to 1. The lowering for `bitselect+bitcast+{i,f}cmp` avoids the need to generate the WASM mask, by directly using the comparison mask with `vmerge`. The bug that this fixes was that when we introduce a `bitcast` in the middle, the comparison and the merge may have different types with different lanes. And if that happens the `vmerge` will only look at the first n bits of the mask. n being the number of lanes currently configured. This commit ensures that they are always equal by using the same type for both vmerge and the comparison instruction. I also manually checked all other uses of `gen_{f,i}cmp_mask` and they are all using the same type in the subsequent instructions. With this fix we no longer really care about the type of the `bitselect` as long as it has the same bitlength as the type of `{i,f}cmp`, which I think is enforced by the verifier. (i.e. We would have the same bug if `bitselect.i8x16+icmp.i8xi8` was legal.)
8 months ago · 34f504cd98
3 changed files with 162 additions and 8 deletions
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@ -1838,21 +1838,29 @@
 ;;
 ;; This allows us to skip the mask expansion step and use the more efficient
 ;; vmerge.vvm instruction.
-(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (bitselect (icmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b) x y)))
+;;
+;; We should be careful to ensure that the mask and the vmerge have the
+;; same type. So that we don't generate a mask with length 16 (i.e. for i8x16), and then
+;; only copy the first few lanes of the result to the destination register because
+;; the bitselect has a different length (i.e. i64x2).
+;;
+;; See: https://github.com/bytecodealliance/wasmtime/issues/8131
+
+(rule 2 (lower (has_type (ty_vec_fits_in_register _ty) (bitselect (icmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b) x y)))
  (let ((mask VReg (gen_icmp_mask cmp_ty cc a b)))
-    (rv_vmerge_vvm y x mask ty)))
+    (rv_vmerge_vvm y x mask cmp_ty)))

-(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (bitselect (fcmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b) x y)))
+(rule 2 (lower (has_type (ty_vec_fits_in_register _ty) (bitselect (fcmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b) x y)))
  (let ((mask VReg (gen_fcmp_mask cmp_ty cc a b)))
-    (rv_vmerge_vvm y x mask ty)))
+    (rv_vmerge_vvm y x mask cmp_ty)))

-(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (bitselect (bitcast _ (fcmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b)) x y)))
+(rule 2 (lower (has_type (ty_vec_fits_in_register _ty) (bitselect (bitcast _ (fcmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b)) x y)))
  (let ((mask VReg (gen_fcmp_mask cmp_ty cc a b)))
-    (rv_vmerge_vvm y x mask ty)))
+    (rv_vmerge_vvm y x mask cmp_ty)))

-(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (bitselect (bitcast _ (icmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b)) x y)))
+(rule 2 (lower (has_type (ty_vec_fits_in_register _ty) (bitselect (bitcast _ (icmp cc a @ (value_type (ty_vec_fits_in_register cmp_ty)) b)) x y)))
  (let ((mask VReg (gen_icmp_mask cmp_ty cc a b)))
-    (rv_vmerge_vvm y x mask ty)))
+    (rv_vmerge_vvm y x mask cmp_ty)))


 ;;;;;  Rules for `isplit`;;;;;;;;;
--- a/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif
@ -410,3 +410,115 @@ block0(v0: i64x2, v1: i64x2, v2: f64x2, v3: f64x2):
 ;   addi sp, sp, 0x10
 ;   ret

+function %bitselect_i8x16_fcmp_f64x2(i8x16) -> i8x16 fast {
+    const0 = 0x00000000000000000000000000000000
+
+block0(v0: i8x16):
+    v1 = bitcast.f64x2 little v0
+    v2 = fcmp eq v1, v1
+    v3 = bitcast.i8x16 little v2
+    v4 = vconst.i8x16 const0
+    v5 = bitselect.i8x16 v3, v0, v4
+    return v5
+}
+
+; VCode:
+;   addi sp,sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v9,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vle8.v v14,[const(0)] #avl=16, #vtype=(e8, m1, ta, ma)
+;   vmfeq.vv v0,v9,v9 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vmerge.vvm v15,v14,v9,v0.t #avl=2, #vtype=(e64, m1, ta, ma)
+;   vse8.v v15,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   addi sp,sp,16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   mv s0, sp
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x84, 0x0f, 0x02
+;   auipc t6, 0
+;   addi t6, t6, 0x34
+;   .byte 0x07, 0x87, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0x57, 0x90, 0x94, 0x62
+;   .byte 0xd7, 0x87, 0xe4, 0x5c
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0xa7, 0x07, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+
+function %bitselect_i8x16_icmp_i64x2(i8x16) -> i8x16 fast {
+    const0 = 0x00000000000000000000000000000000
+
+block0(v0: i8x16):
+    v1 = bitcast.i64x2 little v0
+    v2 = icmp eq v1, v1
+    v3 = bitcast.i8x16 little v2
+    v4 = vconst.i8x16 const0
+    v5 = bitselect.i8x16 v3, v0, v4
+    return v5
+}
+
+; VCode:
+;   addi sp,sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v9,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vle8.v v14,[const(0)] #avl=16, #vtype=(e8, m1, ta, ma)
+;   vmseq.vv v0,v9,v9 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vmerge.vvm v15,v14,v9,v0.t #avl=2, #vtype=(e64, m1, ta, ma)
+;   vse8.v v15,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   addi sp,sp,16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   mv s0, sp
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x84, 0x0f, 0x02
+;   auipc t6, 0
+;   addi t6, t6, 0x34
+;   .byte 0x07, 0x87, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0x57, 0x80, 0x94, 0x62
+;   .byte 0xd7, 0x87, 0xe4, 0x5c
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0xa7, 0x07, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+
--- a/cranelift/filetests/filetests/runtests/simd-bitselect.clif
+++ b/cranelift/filetests/filetests/runtests/simd-bitselect.clif
@ -94,3 +94,37 @@ block0(v0: i64x2, v1: i64x2, v2: i64x2):
 ; run: %bitwise_bitselect_i64x2(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111
 ; run: %bitwise_bitselect_i64x2(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
 ; run: %bitwise_bitselect_i64x2(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000
+
+
+;; See issue #8131
+;;
+;; These tests test the fusion of `bitselect+bitcast+{f,i}cmp` that
+;; some backends perform. Importantly the `fcmp` and the `bitselect`
+;; have both different type sizes as well as a different number of
+;; lanes.
+
+function %bitselect_i8x16_fcmp_f64x2(i8x16) -> i8x16 fast {
+    const0 = 0x00000000000000000000000000000000
+
+block0(v0: i8x16):
+    v1 = bitcast.f64x2 little v0
+    v2 = fcmp eq v1, v1
+    v3 = bitcast.i8x16 little v2
+    v4 = vconst.i8x16 const0
+    v5 = bitselect.i8x16 v3, v0, v4  ; v3 = const0
+    return v5
+}
+; run: %bitselect_i8x16_fcmp_f64x2(0x80808080808080808080808080808080) == 0x80808080808080808080808080808080
+
+function %bitselect_i8x16_icmp_i64x2(i8x16) -> i8x16 fast {
+    const0 = 0x00000000000000000000000000000000
+
+block0(v0: i8x16):
+    v1 = bitcast.i64x2 little v0
+    v2 = icmp eq v1, v1
+    v3 = bitcast.i8x16 little v2
+    v4 = vconst.i8x16 const0
+    v5 = bitselect.i8x16 v3, v0, v4  ; v3 = const0
+    return v5
+}
+; run: %bitselect_i8x16_icmp_i64x2(0x80808080808080808080808080808080) == 0x80808080808080808080808080808080