Browse Source

x64: Improve lowerings for `ctz`/`clz` instructions (#8673)

* x64: Add some more tests for `ctz`/`clz`

* x64: Improve lowerings for i8/i16/i128 `ctz` and `clz` instructions
pull/8677/head
Afonso Bordado 6 months ago
committed by GitHub
parent
commit
3eae74d16b
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 52
      cranelift/codegen/src/isa/x64/lower.isle
  2. 5
      cranelift/codegen/src/isle_prelude.rs
  3. 3
      cranelift/codegen/src/prelude.isle
  4. 95
      cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif
  5. 198
      cranelift/filetests/filetests/isa/x64/clz.clif
  6. 95
      cranelift/filetests/filetests/isa/x64/ctz-bmi1.clif
  7. 172
      cranelift/filetests/filetests/isa/x64/ctz.clif
  8. 3
      cranelift/filetests/filetests/runtests/i128-bitops-count.clif
  9. 2
      cranelift/filetests/filetests/runtests/popcnt.clif

52
cranelift/codegen/src/isa/x64/lower.isle

@ -2129,21 +2129,14 @@
;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; If available, we can use a plain lzcnt instruction here. Note no
;; special handling is required for zero inputs, because the machine
;; instruction does what the CLIF expects for zero, i.e. it returns
;; zero.
(rule 3 (lower (has_type (ty_32_or_64 ty) (clz src)))
(if-let $true (use_lzcnt))
(x64_lzcnt ty src))
(rule 2 (lower (has_type (ty_32_or_64 ty) (clz src))) (rule 2 (lower (has_type (ty_32_or_64 ty) (clz src)))
(do_clz ty ty src)) (do_clz ty ty src))
(rule 1 (lower (rule 1 (lower (has_type (ty_8_or_16 ty) (clz src)))
(has_type (ty_8_or_16 ty) (let ((extended Gpr (extend_to_gpr src $I64 (ExtendKind.Zero)))
(clz src))) (clz Gpr (do_clz $I64 $I64 extended)))
(do_clz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero)))) (x64_sub $I64 clz (RegMemImm.Imm (u32_sub 64 (ty_bits ty))))))
(rule 0 (lower (rule 0 (lower
(has_type $I128 (has_type $I128
@ -2160,27 +2153,29 @@
;; Implementation helper for clz; operates on 32 or 64-bit units. ;; Implementation helper for clz; operates on 32 or 64-bit units.
(decl do_clz (Type Type Gpr) Gpr) (decl do_clz (Type Type Gpr) Gpr)
(rule (do_clz ty orig_ty src)
;; If available, we can use a plain lzcnt instruction here. Note no
;; special handling is required for zero inputs, because the machine
;; instruction does what the CLIF expects for zero, i.e. it returns
;; the operand width in bits.
(rule 1 (do_clz ty orig_ty src)
(if-let $true (use_lzcnt))
(x64_lzcnt ty src))
(rule 0 (do_clz ty orig_ty src)
(let ((highest_bit_index Reg (bsr_or_else ty src (imm_i64 $I64 -1))) (let ((highest_bit_index Reg (bsr_or_else ty src (imm_i64 $I64 -1)))
(bits_minus_1 Reg (imm ty (u64_sub (ty_bits_u64 orig_ty) 1)))) (bits_minus_1 Reg (imm ty (u64_sub (ty_bits_u64 orig_ty) 1))))
(x64_sub ty bits_minus_1 highest_bit_index))) (x64_sub ty bits_minus_1 highest_bit_index)))
;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Analogous to `clz` cases above, but using mirror instructions
;; (tzcnt vs lzcnt, bsf vs bsr).
(rule 3 (lower (has_type (ty_32_or_64 ty) (ctz src)))
(if-let $true (use_bmi1))
(x64_tzcnt ty src))
(rule 2 (lower (has_type (ty_32_or_64 ty) (ctz src))) (rule 2 (lower (has_type (ty_32_or_64 ty) (ctz src)))
(do_ctz ty ty src)) (do_ctz ty ty src))
(rule 1 (lower (rule 1 (lower (has_type (ty_8_or_16 ty) (ctz src)))
(has_type (ty_8_or_16 ty) (let ((extended Gpr (extend_to_gpr src $I32 (ExtendKind.Zero)))
(ctz src))) (stopbit Gpr (x64_or $I32 extended (RegMemImm.Imm (u32_shl 1 (ty_bits ty))))))
(do_ctz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero)))) (do_ctz $I32 ty stopbit)))
(rule 0 (lower (rule 0 (lower
(has_type $I128 (has_type $I128
@ -2196,7 +2191,14 @@
(value_regs result_lo (imm $I64 0)))) (value_regs result_lo (imm $I64 0))))
(decl do_ctz (Type Type Gpr) Gpr) (decl do_ctz (Type Type Gpr) Gpr)
(rule (do_ctz ty orig_ty src)
;; Analogous to `clz` cases above, but using mirror instructions
;; (tzcnt vs lzcnt, bsf vs bsr).
(rule 1 (do_ctz ty orig_ty src)
(if-let $true (use_bmi1))
(x64_tzcnt ty src))
(rule 0 (do_ctz ty orig_ty src)
(bsf_or_else ty src (imm $I64 (ty_bits_u64 orig_ty)))) (bsf_or_else ty src (imm $I64 (ty_bits_u64 orig_ty))))
;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

5
cranelift/codegen/src/isle_prelude.rs

@ -683,6 +683,11 @@ macro_rules! isle_common_prelude_methods {
a & b a & b
} }
/// Shifts `x` left by `y` bits and returns the result.
///
/// NOTE(review): a plain `<<` is used, so `y >= 32` overflows the shift;
/// Rust panics on that when overflow checks are enabled. The ISLE caller
/// visible in this change passes `ty_bits` of an 8- or 16-bit type —
/// confirm any other callers keep `y` below 32.
#[inline]
fn u32_shl(&mut self, x: u32, y: u32) -> u32 {
x << y
}
#[inline] #[inline]
fn s32_add_fallible(&mut self, a: i32, b: i32) -> Option<i32> { fn s32_add_fallible(&mut self, a: i32, b: i32) -> Option<i32> {
a.checked_add(b) a.checked_add(b)

3
cranelift/codegen/src/prelude.isle

@ -133,6 +133,9 @@
(decl pure u32_and (u32 u32) u32) (decl pure u32_and (u32 u32) u32)
(extern constructor u32_and u32_and) (extern constructor u32_and u32_and)
;; Pure constructor that shifts the first `u32` left by the number of bits
;; given in the second `u32`.
(decl pure u32_shl (u32 u32) u32)
(extern constructor u32_shl u32_shl)
;; Pure/fallible constructor that tries to add two `u32`s, interpreted ;; Pure/fallible constructor that tries to add two `u32`s, interpreted
;; as signed values, and fails to match on overflow. ;; as signed values, and fails to match on overflow.
(decl pure partial s32_add_fallible (i32 i32) i32) (decl pure partial s32_add_fallible (i32 i32) i32)

95
cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif

@ -1,6 +1,43 @@
test compile precise-output test compile precise-output
set enable_llvm_abi_extensions=true
target x86_64 has_lzcnt target x86_64 has_lzcnt
function %clz(i128) -> i128 {
block0(v0: i128):
v1 = clz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; lzcntq %rsi, %rcx
; lzcntq %rdi, %rax
; addq %rax, $64, %rax
; cmpq $64, %rcx
; cmovnzq %rcx, %rax, %rax
; xorq %rdx, %rdx, %rdx
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; lzcntq %rsi, %rcx
; lzcntq %rdi, %rax
; addq $0x40, %rax
; cmpq $0x40, %rcx
; cmovneq %rcx, %rax
; xorq %rdx, %rdx
; movq %rbp, %rsp
; popq %rbp
; retq
function %clz(i64) -> i64 { function %clz(i64) -> i64 {
block0(v0: i64): block0(v0: i64):
v1 = clz v0 v1 = clz v0
@ -51,3 +88,61 @@ block0(v0: i32):
; popq %rbp ; popq %rbp
; retq ; retq
function %clz(i16) -> i16 {
block0(v0: i16):
v1 = clz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzwq %di, %rax
; lzcntq %rax, %rax
; subq %rax, $48, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzwq %di, %rax
; lzcntq %rax, %rax
; subq $0x30, %rax
; movq %rbp, %rsp
; popq %rbp
; retq
function %clz(i8) -> i8 {
block0(v0: i8):
v1 = clz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzbq %dil, %rax
; lzcntq %rax, %rax
; subq %rax, $56, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzbq %dil, %rax
; lzcntq %rax, %rax
; subq $0x38, %rax
; movq %rbp, %rsp
; popq %rbp
; retq

198
cranelift/filetests/filetests/isa/x64/clz.clif

@ -0,0 +1,198 @@
test compile precise-output
set enable_llvm_abi_extensions=true
target x86_64
function %clz(i128) -> i128 {
block0(v0: i128):
v1 = clz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %r8
; movabsq $-1, %rcx
; bsrq %rsi, %r9
; cmovzq %rcx, %r9, %r9
; movl $63, %edi
; subq %rdi, %r9, %rdi
; movabsq $-1, %rdx
; bsrq %r8, %r10
; cmovzq %rdx, %r10, %r10
; movl $63, %eax
; subq %rax, %r10, %rax
; addq %rax, $64, %rax
; cmpq $64, %rdi
; cmovnzq %rdi, %rax, %rax
; xorq %rdx, %rdx, %rdx
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %r8
; movq $18446744073709551615, %rcx
; bsrq %rsi, %r9
; cmoveq %rcx, %r9
; movl $0x3f, %edi
; subq %r9, %rdi
; movq $18446744073709551615, %rdx
; bsrq %r8, %r10
; cmoveq %rdx, %r10
; movl $0x3f, %eax
; subq %r10, %rax
; addq $0x40, %rax
; cmpq $0x40, %rdi
; cmovneq %rdi, %rax
; xorq %rdx, %rdx
; movq %rbp, %rsp
; popq %rbp
; retq
function %clz(i64) -> i64 {
block0(v0: i64):
v1 = clz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movabsq $-1, %rax
; bsrq %rdi, %r8
; cmovzq %rax, %r8, %r8
; movl $63, %eax
; subq %rax, %r8, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq $18446744073709551615, %rax
; bsrq %rdi, %r8
; cmoveq %rax, %r8
; movl $0x3f, %eax
; subq %r8, %rax
; movq %rbp, %rsp
; popq %rbp
; retq
function %clz(i32) -> i32 {
block0(v0: i32):
v1 = clz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movabsq $-1, %rax
; bsrl %edi, %r8d
; cmovzl %eax, %r8d, %r8d
; movl $31, %eax
; subl %eax, %r8d, %eax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq $18446744073709551615, %rax
; bsrl %edi, %r8d
; cmovel %eax, %r8d
; movl $0x1f, %eax
; subl %r8d, %eax
; movq %rbp, %rsp
; popq %rbp
; retq
function %clz(i16) -> i16 {
block0(v0: i16):
v1 = clz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzwq %di, %rax
; movabsq $-1, %rdx
; bsrq %rax, %r10
; cmovzq %rdx, %r10, %r10
; movl $63, %eax
; subq %rax, %r10, %rax
; subq %rax, $48, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzwq %di, %rax
; movq $18446744073709551615, %rdx
; bsrq %rax, %r10
; cmoveq %rdx, %r10
; movl $0x3f, %eax
; subq %r10, %rax
; subq $0x30, %rax
; movq %rbp, %rsp
; popq %rbp
; retq
function %clz(i8) -> i8 {
block0(v0: i8):
v1 = clz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzbq %dil, %rax
; movabsq $-1, %rdx
; bsrq %rax, %r10
; cmovzq %rdx, %r10, %r10
; movl $63, %eax
; subq %rax, %r10, %rax
; subq %rax, $56, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzbq %dil, %rax
; movq $18446744073709551615, %rdx
; bsrq %rax, %r10
; cmoveq %rdx, %r10
; movl $0x3f, %eax
; subq %r10, %rax
; subq $0x38, %rax
; movq %rbp, %rsp
; popq %rbp
; retq

95
cranelift/filetests/filetests/isa/x64/ctz-bmi1.clif

@ -1,6 +1,43 @@
test compile precise-output test compile precise-output
set enable_llvm_abi_extensions=true
target x86_64 has_bmi1 target x86_64 has_bmi1
function %ctz(i128) -> i128 {
block0(v0: i128):
v1 = ctz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; tzcntq %rdi, %rax
; tzcntq %rsi, %r9
; addq %r9, $64, %r9
; cmpq $64, %rax
; cmovzq %r9, %rax, %rax
; xorq %rdx, %rdx, %rdx
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; tzcntq %rdi, %rax
; tzcntq %rsi, %r9
; addq $0x40, %r9
; cmpq $0x40, %rax
; cmoveq %r9, %rax
; xorq %rdx, %rdx
; movq %rbp, %rsp
; popq %rbp
; retq
function %ctz(i64) -> i64 { function %ctz(i64) -> i64 {
block0(v0: i64): block0(v0: i64):
v1 = ctz v0 v1 = ctz v0
@ -51,3 +88,61 @@ block0(v0: i32):
; popq %rbp ; popq %rbp
; retq ; retq
function %ctz(i16) -> i16 {
block0(v0: i16):
v1 = ctz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzwl %di, %ecx
; orl %ecx, $65536, %ecx
; tzcntl %ecx, %eax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzwl %di, %ecx
; orl $0x10000, %ecx
; tzcntl %ecx, %eax
; movq %rbp, %rsp
; popq %rbp
; retq
function %ctz(i8) -> i8 {
block0(v0: i8):
v1 = ctz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzbl %dil, %ecx
; orl %ecx, $256, %ecx
; tzcntl %ecx, %eax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzbl %dil, %ecx
; orl $0x100, %ecx
; tzcntl %ecx, %eax
; movq %rbp, %rsp
; popq %rbp
; retq

172
cranelift/filetests/filetests/isa/x64/ctz.clif

@ -0,0 +1,172 @@
test compile precise-output
set enable_llvm_abi_extensions=true
target x86_64
function %ctz(i128) -> i128 {
block0(v0: i128):
v1 = ctz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $64, %ecx
; bsfq %rdi, %rax
; cmovzq %rcx, %rax, %rax
; movl $64, %edi
; bsfq %rsi, %rdx
; cmovzq %rdi, %rdx, %rdx
; addq %rdx, $64, %rdx
; cmpq $64, %rax
; cmovzq %rdx, %rax, %rax
; xorq %rdx, %rdx, %rdx
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movl $0x40, %ecx
; bsfq %rdi, %rax
; cmoveq %rcx, %rax
; movl $0x40, %edi
; bsfq %rsi, %rdx
; cmoveq %rdi, %rdx
; addq $0x40, %rdx
; cmpq $0x40, %rax
; cmoveq %rdx, %rax
; xorq %rdx, %rdx
; movq %rbp, %rsp
; popq %rbp
; retq
function %ctz(i64) -> i64 {
block0(v0: i64):
v1 = ctz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $64, %ecx
; bsfq %rdi, %rax
; cmovzq %rcx, %rax, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movl $0x40, %ecx
; bsfq %rdi, %rax
; cmoveq %rcx, %rax
; movq %rbp, %rsp
; popq %rbp
; retq
function %ctz(i32) -> i32 {
block0(v0: i32):
v1 = ctz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $32, %ecx
; bsfl %edi, %eax
; cmovzl %ecx, %eax, %eax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movl $0x20, %ecx
; bsfl %edi, %eax
; cmovel %ecx, %eax
; movq %rbp, %rsp
; popq %rbp
; retq
function %ctz(i16) -> i16 {
block0(v0: i16):
v1 = ctz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzwl %di, %ecx
; orl %ecx, $65536, %ecx
; movl $16, %r9d
; bsfl %ecx, %eax
; cmovzl %r9d, %eax, %eax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzwl %di, %ecx
; orl $0x10000, %ecx
; movl $0x10, %r9d
; bsfl %ecx, %eax
; cmovel %r9d, %eax
; movq %rbp, %rsp
; popq %rbp
; retq
function %ctz(i8) -> i8 {
block0(v0: i8):
v1 = ctz v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzbl %dil, %ecx
; orl %ecx, $256, %ecx
; movl $8, %r9d
; bsfl %ecx, %eax
; cmovzl %r9d, %eax, %eax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzbl %dil, %ecx
; orl $0x100, %ecx
; movl $8, %r9d
; bsfl %ecx, %eax
; cmovel %r9d, %eax
; movq %rbp, %rsp
; popq %rbp
; retq

3
cranelift/filetests/filetests/runtests/i128-bitops-count.clif

@ -3,6 +3,9 @@ set enable_llvm_abi_extensions=true
target aarch64 target aarch64
target s390x target s390x
target x86_64 target x86_64
target x86_64 has_lzcnt
target x86_64 has_bmi1
target x86_64 has_popcnt has_sse42
target riscv64 target riscv64
target riscv64 has_zbb target riscv64 has_zbb
target riscv64 has_zbb has_zbs target riscv64 has_zbb has_zbs

2
cranelift/filetests/filetests/runtests/popcnt.clif

@ -3,7 +3,7 @@ test run
target aarch64 target aarch64
target s390x target s390x
target x86_64 target x86_64
target x86_64 has_popcnt target x86_64 has_popcnt has_sse42
target riscv64 target riscv64
target riscv64 has_c has_zcb target riscv64 has_c has_zcb

Loading…
Cancel
Save