From 728d0f5db313084f3a807b2eeb3aaf890b81f127 Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Sat, 17 Jun 2023 19:28:36 +0100
Subject: [PATCH] riscv64: Add SIMD `avg_round` (#6599)

---
 .../codegen/src/isa/riscv64/inst/vector.rs    |   3 +
 .../codegen/src/isa/riscv64/inst_vector.isle  |  10 +
 cranelift/codegen/src/isa/riscv64/lower.isle  |  25 +++
 .../filetests/isa/riscv64/simd-avg_round.clif | 194 ++++++++++++++++++
 .../runtests/simd-avg-round-small.clif        |   1 +
 .../filetests/runtests/simd-avg-round.clif    |   1 +
 6 files changed, 234 insertions(+)
 create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-avg_round.clif

diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index fe670d59e7..a2d1cf5a00 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -508,6 +508,7 @@ impl VecAluOpRRImm5 {
             VecAluOpRRImm5::VorVI => 0b001010,
             VecAluOpRRImm5::VxorVI => 0b001011,
             VecAluOpRRImm5::VslidedownVI => 0b001111,
+            VecAluOpRRImm5::VssrlVI => 0b101010,
             VecAluOpRRImm5::VmergeVIM => 0b010111,
             VecAluOpRRImm5::VsadduVI => 0b100000,
             VecAluOpRRImm5::VsaddVI => 0b100001,
@@ -526,6 +527,7 @@ impl VecAluOpRRImm5 {
             | VecAluOpRRImm5::VandVI
             | VecAluOpRRImm5::VorVI
             | VecAluOpRRImm5::VxorVI
+            | VecAluOpRRImm5::VssrlVI
             | VecAluOpRRImm5::VslidedownVI
             | VecAluOpRRImm5::VmergeVIM
             | VecAluOpRRImm5::VsadduVI
@@ -539,6 +541,7 @@ impl VecAluOpRRImm5 {
         match self {
             VecAluOpRRImm5::VsllVI
             | VecAluOpRRImm5::VsrlVI
+            | VecAluOpRRImm5::VssrlVI
             | VecAluOpRRImm5::VsraVI
             | VecAluOpRRImm5::VslidedownVI
             | VecAluOpRRImm5::VrgatherVI
diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
index 78f91e700a..baef0578a1 100644
--- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle
+++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -188,6 +188,7 @@
     (VandVI)
     (VorVI)
     (VxorVI)
+    (VssrlVI)
     (VslidedownVI)
     (VmergeVIM)
     (VrgatherVI)
@@ -663,6 +664,15 @@
 (rule (rv_vxor_vi vs2 imm mask vstate)
   (vec_alu_rr_imm5 (VecAluOpRRImm5.VxorVI) vs2 imm mask vstate))
 
+;; Helper for emitting the `vssrl.vi` instruction.
+;;
+;; vd[i] = (unsigned(vs2[i]) >> imm) + r
+;;
+;; `r` is the rounding increment chosen by the currently selected `vxrm` rounding mode.
+(decl rv_vssrl_vi (VReg UImm5 VecOpMasking VState) VReg)
+(rule (rv_vssrl_vi vs2 imm mask vstate)
+  (vec_alu_rr_uimm5 (VecAluOpRRImm5.VssrlVI) vs2 imm mask vstate))
+
 ;; Helper for emitting the `vnot.v` instruction.
 ;; This is just a mnemonic for `vxor.vi vd, vs, -1`
 (decl rv_vnot_v (VReg VecOpMasking VState) VReg)
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index 459ebeee42..93e0bda7fd 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -1817,3 +1817,28 @@
        (rhs_hi VReg (rv_vcompress_vm y even_mask ty))
        (rhs VReg (rv_vslideup_vvi rhs_lo rhs_hi half_size (unmasked) ty)))
    (rv_vadd_vv lhs rhs (unmasked) ty)))
+
+;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; `avg_round` computes the unsigned average with rounding: a := (x + y + 1) // 2
+;;
+;; See Section 2-5, "Average of Two Integers", in Hacker's Delight.
+;;
+;; The floor average of two integers can be computed without overflow as:
+;;   t = (x & y) + ((x ^ y) >> 1)
+;;
+;; The right shift must be a logical shift, since the integers are unsigned.
+;;
+;; We are, however, interested in the ceiling average: (x + y + 1) // 2. For that
+;; we use a special rounding mode in the right shift instruction.
+;;
+;; The right shift is performed with `vssrl`, a Scaling Shift Right Logical
+;; instruction that applies the `vxrm` fixed-point rounding mode. The default
+;; rounding mode is `rnu` (round-to-nearest-up, i.e. add +0.5 LSB), which is
+;; exactly the rounding mode we need for `avg_round`.
+(rule (lower (has_type (ty_vec_fits_in_register ty) (avg_round x y)))
+  (if-let one (u64_to_uimm5 1))
+  (let ((lhs VReg (rv_vand_vv x y (unmasked) ty))
+        (xor VReg (rv_vxor_vv x y (unmasked) ty))
+        (rhs VReg (rv_vssrl_vi xor one (unmasked) ty)))
+    (rv_vadd_vv lhs rhs (unmasked) ty)))
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-avg_round.clif b/cranelift/filetests/filetests/isa/riscv64/simd-avg_round.clif
new file mode 100644
index 0000000000..88bbd5c8ae
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-avg_round.clif
@@ -0,0 +1,194 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+function %avg_round_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = avg_round v0, v1
+    return v2
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vand.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma)
+; vxor.vv v8,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma)
+; vssrl.vi v10,v8,1 #avl=16, #vtype=(e8, m1, ta, ma)
+; vadd.vv v12,v6,v10 #avl=16, #vtype=(e8, m1, ta, ma)
+; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; addi t6, s0, 0x20
+; .byte 0x87, 0x81, 0x0f, 0x02
+; .byte 0x57, 0x83, 0x11, 0x26
+; .byte 0x57, 0x84, 0x11, 0x2e
+; .byte 0x57, 0xb5, 0x80, 0xaa
+; .byte 0x57, 0x06, 0x65, 0x02
+; .byte 0x27, 0x06, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %avg_round_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = avg_round v0, v1
+    return v2
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vand.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma)
+; vxor.vv v8,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma)
+; vssrl.vi v10,v8,1 #avl=8, #vtype=(e16, m1, ta, ma)
+; vadd.vv v12,v6,v10 #avl=8, #vtype=(e16, m1, ta, ma)
+; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; addi t6, s0, 0x20
+; .byte 0x87, 0x81, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x84, 0xcc
+; .byte 0x57, 0x83, 0x11, 0x26
+; .byte 0x57, 0x84, 0x11, 0x2e
+; .byte 0x57, 0xb5, 0x80, 0xaa
+; .byte 0x57, 0x06, 0x65, 0x02
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0x27, 0x06, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %avg_round_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = avg_round v0, v1
+    return v2
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vand.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
+; vxor.vv v8,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
+; vssrl.vi v10,v8,1 #avl=4, #vtype=(e32, m1, ta, ma)
+; vadd.vv v12,v6,v10 #avl=4, #vtype=(e32, m1, ta, ma)
+; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; addi t6, s0, 0x20
+; .byte 0x87, 0x81, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x02, 0xcd
+; .byte 0x57, 0x83, 0x11, 0x26
+; .byte 0x57, 0x84, 0x11, 0x2e
+; .byte 0x57, 0xb5, 0x80, 0xaa
+; .byte 0x57, 0x06, 0x65, 0x02
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0x27, 0x06, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %avg_round_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = avg_round v0, v1
+    return v2
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vand.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
+; vxor.vv v8,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
+; vssrl.vi v10,v8,1 #avl=2, #vtype=(e64, m1, ta, ma)
+; vadd.vv v12,v6,v10 #avl=2, #vtype=(e64, m1, ta, ma)
+; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; addi t6, s0, 0x20
+; .byte 0x87, 0x81, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x81, 0xcd
+; .byte 0x57, 0x83, 0x11, 0x26
+; .byte 0x57, 0x84, 0x11, 0x2e
+; .byte 0x57, 0xb5, 0x80, 0xaa
+; .byte 0x57, 0x06, 0x65, 0x02
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0x27, 0x06, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
diff --git a/cranelift/filetests/filetests/runtests/simd-avg-round-small.clif b/cranelift/filetests/filetests/runtests/simd-avg-round-small.clif
index d3823bd56e..7ec46a1fbb 100644
--- a/cranelift/filetests/filetests/runtests/simd-avg-round-small.clif
+++ b/cranelift/filetests/filetests/runtests/simd-avg-round-small.clif
@@ -1,6 +1,7 @@
 ; the interpreter does not currently support SIMD `avg_round`.
 test run
 target aarch64
+target riscv64 has_v
 ; x86_64 and s390x do not currently support 64-bit vectors, or
 ; `avg_round` on `i64x2` values.
 ; x86_64 also does not currently support `avg_round.i32x4`.
diff --git a/cranelift/filetests/filetests/runtests/simd-avg-round.clif b/cranelift/filetests/filetests/runtests/simd-avg-round.clif
index 484eec66c7..de0a13e504 100644
--- a/cranelift/filetests/filetests/runtests/simd-avg-round.clif
+++ b/cranelift/filetests/filetests/runtests/simd-avg-round.clif
@@ -4,6 +4,7 @@ target s390x
 set enable_simd
 target x86_64
 target x86_64 skylake
+target riscv64 has_v
 
 function %average_rounding_i8x16(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
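
Note: a quick way to convince yourself that the lowering above is correct is to model it in scalar code. The sketch below is illustrative only and not part of the patch (the helper names are made up); it mimics a single `u8` lane of `vssrl` with `vxrm` set to the default `rnu` mode, where the rounding increment is the last bit shifted out, and exhaustively checks the lowering's formula against the reference semantics `(x + y + 1) // 2`:

    /// Scalar model of `vssrl` on one `u8` lane with `vxrm = rnu`:
    /// result = (v >> d) + r, where the increment `r` is bit `d - 1` of `v`
    /// (the last bit shifted out), i.e. round-to-nearest-up.
    fn ssrl_rnu(v: u8, d: u32) -> u8 {
        let r = if d == 0 { 0 } else { (v >> (d - 1)) & 1 };
        (v >> d) + r
    }

    /// The lowering's formula: vadd(vand(x, y), vssrl(vxor(x, y), 1)).
    fn avg_round_model(x: u8, y: u8) -> u8 {
        (x & y) + ssrl_rnu(x ^ y, 1)
    }

    fn main() {
        // Compare against the reference semantics computed in a wider type.
        for x in 0..=u8::MAX {
            for y in 0..=u8::MAX {
                let expect = ((u16::from(x) + u16::from(y) + 1) / 2) as u8;
                assert_eq!(avg_round_model(x, y), expect);
            }
        }
    }

Since `x & y` keeps the bits where both inputs are set and the rounded `(x ^ y) >> 1` halves the bits where they differ, the sum never exceeds `max(x, y)`, which is why no widening is needed in the vector code.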