
riscv64: Add SIMD `avg_round` (#6599)

Branch: pull/6602/head
Author: Afonso Bordado, committed by GitHub
Commit: 728d0f5db3
Changed files:
  1. cranelift/codegen/src/isa/riscv64/inst/vector.rs (+3)
  2. cranelift/codegen/src/isa/riscv64/inst_vector.isle (+10)
  3. cranelift/codegen/src/isa/riscv64/lower.isle (+25)
  4. cranelift/filetests/filetests/isa/riscv64/simd-avg_round.clif (+194)
  5. cranelift/filetests/filetests/runtests/simd-avg-round-small.clif (+1)
  6. cranelift/filetests/filetests/runtests/simd-avg-round.clif (+1)

cranelift/codegen/src/isa/riscv64/inst/vector.rs (+3)

@@ -508,6 +508,7 @@ impl VecAluOpRRImm5 {
VecAluOpRRImm5::VorVI => 0b001010,
VecAluOpRRImm5::VxorVI => 0b001011,
VecAluOpRRImm5::VslidedownVI => 0b001111,
VecAluOpRRImm5::VssrlVI => 0b101010,
VecAluOpRRImm5::VmergeVIM => 0b010111,
VecAluOpRRImm5::VsadduVI => 0b100000,
VecAluOpRRImm5::VsaddVI => 0b100001,
@@ -526,6 +527,7 @@ impl VecAluOpRRImm5 {
| VecAluOpRRImm5::VandVI
| VecAluOpRRImm5::VorVI
| VecAluOpRRImm5::VxorVI
| VecAluOpRRImm5::VssrlVI
| VecAluOpRRImm5::VslidedownVI
| VecAluOpRRImm5::VmergeVIM
| VecAluOpRRImm5::VsadduVI
@@ -539,6 +541,7 @@ impl VecAluOpRRImm5 {
match self {
VecAluOpRRImm5::VsllVI
| VecAluOpRRImm5::VsrlVI
| VecAluOpRRImm5::VssrlVI
| VecAluOpRRImm5::VsraVI
| VecAluOpRRImm5::VslidedownVI
| VecAluOpRRImm5::VrgatherVI

cranelift/codegen/src/isa/riscv64/inst_vector.isle (+10)

@@ -188,6 +188,7 @@
(VandVI)
(VorVI)
(VxorVI)
(VssrlVI)
(VslidedownVI)
(VmergeVIM)
(VrgatherVI)
@@ -663,6 +664,15 @@
(rule (rv_vxor_vi vs2 imm mask vstate)
(vec_alu_rr_imm5 (VecAluOpRRImm5.VxorVI) vs2 imm mask vstate))
;; Helper for emitting the `vssrl.vi` instruction.
;;
;; vd[i] = (unsigned(vs2[i]) >> imm) + r
;;
;; `r` here is the rounding increment determined by the currently selected
;; `vxrm` rounding mode.
(decl rv_vssrl_vi (VReg UImm5 VecOpMasking VState) VReg)
(rule (rv_vssrl_vi vs2 imm mask vstate)
(vec_alu_rr_uimm5 (VecAluOpRRImm5.VssrlVI) vs2 imm mask vstate))
;; Helper for emitting the `vnot.v` instruction.
;; This is just a mnemonic for `vxor.vi vd, vs, -1`
(decl rv_vnot_v (VReg VecOpMasking VState) VReg)
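A minimal scalar sketch of the new helper's semantics (plain Rust, not Cranelift or ISLE code; the function name is illustrative): under the default `rnu` rounding mode, `vssrl.vi` performs a logical right shift and adds back the most significant bit that was shifted out.

```rust
// Scalar model of one `vssrl.vi` lane under the default `rnu` (vxrm = 0) mode.
// Assumes `imm < 64`, as guaranteed by the 5-bit immediate.
fn vssrl_rnu(v: u64, imm: u32) -> u64 {
    // Rounding increment `r`: the highest bit discarded by the shift
    // (zero when nothing is shifted out).
    let r = if imm == 0 { 0 } else { (v >> (imm - 1)) & 1 };
    (v >> imm) + r
}

fn main() {
    // e.g. 5 >> 1 with round-to-nearest-up gives 3, not 2.
    assert_eq!(vssrl_rnu(5, 1), 3);
}
```

With `imm = 1` this reduces to `(v >> 1) + (v & 1)`, a halving that rounds up, which is exactly what the `avg_round` lowering below relies on.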

cranelift/codegen/src/isa/riscv64/lower.isle (+25)

@@ -1817,3 +1817,28 @@
(rhs_hi VReg (rv_vcompress_vm y even_mask ty))
(rhs VReg (rv_vslideup_vvi rhs_lo rhs_hi half_size (unmasked) ty)))
(rv_vadd_vv lhs rhs (unmasked) ty)))
;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; `avg_round` computes the unsigned average with rounding: a := (x + y + 1) // 2
;;
;; See Section "2–5 Average of Two Integers" of the Hacker's Delight book
;;
;; The floor average of two integers without overflow can be computed as:
;; t = (x & y) + ((x ^ y) >> 1)
;;
;; The right shift should be a logical shift if the integers are unsigned.
;;
;; We are however interested in the rounding (ceiling) average
;; ((x + y + 1) >> 1). For that we use a special rounding mode in the
;; right shift instruction.
;;
;; For the right shift we use `vssrl`, the Scaling Shift Right Logical
;; instruction, which honours the `vxrm` fixed-point rounding mode. The
;; default rounding mode is `rnu` (round-to-nearest-up, i.e. add +0.5 LSB),
;; which is coincidentally the rounding mode we want for `avg_round`.
(rule (lower (has_type (ty_vec_fits_in_register ty) (avg_round x y)))
(if-let one (u64_to_uimm5 1))
(let ((lhs VReg (rv_vand_vv x y (unmasked) ty))
(xor VReg (rv_vxor_vv x y (unmasked) ty))
(rhs VReg (rv_vssrl_vi xor one (unmasked) ty)))
(rv_vadd_vv lhs rhs (unmasked) ty)))
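To make the identity in the comment concrete, here is a minimal scalar sketch in plain Rust of the per-lane arithmetic the rule emits (the function name and the exhaustive check are illustrative, not part of the PR):

```rust
/// Rounding average (x + y + 1) / 2 of two u8 values, computed without the
/// intermediate sum ever overflowing, mirroring the lowering above.
fn avg_round_u8(x: u8, y: u8) -> u8 {
    let and = x & y; // vand.vv
    let xor = x ^ y; // vxor.vv
    // `vssrl.vi xor, 1` under `rnu` computes `(xor >> 1) + (xor & 1)`:
    // the shifted-out low bit is added back as the rounding term.
    let shifted = (xor >> 1) + (xor & 1);
    and + shifted // vadd.vv
}

fn main() {
    // Exhaustive check against the widened definition (x + y + 1) / 2.
    for x in 0..=u8::MAX {
        for y in 0..=u8::MAX {
            let wide = (x as u16 + y as u16 + 1) / 2;
            assert_eq!(avg_round_u8(x, y) as u16, wide);
        }
    }
    println!("all 65536 cases match");
}
```

The `vand`/`vxor` split is what keeps the sum in range: `x + y = 2*(x & y) + (x ^ y)`, so halving the xor term first never needs a wider element type.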

cranelift/filetests/filetests/isa/riscv64/simd-avg_round.clif (+194)

@@ -0,0 +1,194 @@
test compile precise-output
set unwind_info=false
target riscv64 has_v
function %avg_round_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = avg_round v0, v1
return v2
}
; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=16, #vtype=(e8, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret
function %avg_round_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = avg_round v0, v1
return v2
}
; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=8, #vtype=(e16, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=8, #vtype=(e16, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x84, 0xcc
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret
function %avg_round_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = avg_round v0, v1
return v2
}
; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=4, #vtype=(e32, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=4, #vtype=(e32, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x02, 0xcd
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret
function %avg_round_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = avg_round v0, v1
return v2
}
; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=2, #vtype=(e64, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=2, #vtype=(e64, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x81, 0xcd
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

cranelift/filetests/filetests/runtests/simd-avg-round-small.clif (+1)

@@ -1,6 +1,7 @@
; the interpreter does not currently support SIMD `avg_round`.
test run
target aarch64
target riscv64 has_v
; x86_64 and s390x do not currently support 64-bit vectors, or
; `avg_round` on `i64x2` values.
; x86_64 also does not currently support `avg_round.i32x4`.

cranelift/filetests/filetests/runtests/simd-avg-round.clif (+1)

@@ -4,6 +4,7 @@ target s390x
set enable_simd
target x86_64
target x86_64 skylake
target riscv64 has_v
function %average_rounding_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
