From 3eae74d16bdc5ab9e86dd501b3feb2b0981d9785 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Tue, 21 May 2024 22:24:32 +0100 Subject: [PATCH] x64: Improve lowerings for `ctz`/`clz` instructions (#8673) * x64: Add some more tests for `ctz`/`clz` * x64: Improve lowerings for i8/i16/i128 `ctz` and `clz` instructions --- cranelift/codegen/src/isa/x64/lower.isle | 52 ++--- cranelift/codegen/src/isle_prelude.rs | 5 + cranelift/codegen/src/prelude.isle | 3 + .../filetests/isa/x64/clz-lzcnt.clif | 95 +++++++++ .../filetests/filetests/isa/x64/clz.clif | 198 ++++++++++++++++++ .../filetests/filetests/isa/x64/ctz-bmi1.clif | 95 +++++++++ .../filetests/filetests/isa/x64/ctz.clif | 172 +++++++++++++++ .../filetests/runtests/i128-bitops-count.clif | 3 + .../filetests/filetests/runtests/popcnt.clif | 2 +- 9 files changed, 599 insertions(+), 26 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/clz.clif create mode 100644 cranelift/filetests/filetests/isa/x64/ctz.clif diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index cd26074d64..9857f8dd21 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -2129,21 +2129,14 @@ ;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; If available, we can use a plain lzcnt instruction here. Note no -;; special handling is required for zero inputs, because the machine -;; instruction does what the CLIF expects for zero, i.e. it returns -;; zero. 
-(rule 3 (lower (has_type (ty_32_or_64 ty) (clz src))) - (if-let $true (use_lzcnt)) - (x64_lzcnt ty src)) - (rule 2 (lower (has_type (ty_32_or_64 ty) (clz src))) (do_clz ty ty src)) -(rule 1 (lower - (has_type (ty_8_or_16 ty) - (clz src))) - (do_clz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero)))) +(rule 1 (lower (has_type (ty_8_or_16 ty) (clz src))) + (let ((extended Gpr (extend_to_gpr src $I64 (ExtendKind.Zero))) + (clz Gpr (do_clz $I64 $I64 extended))) + (x64_sub $I64 clz (RegMemImm.Imm (u32_sub 64 (ty_bits ty)))))) + (rule 0 (lower (has_type $I128 @@ -2160,27 +2153,29 @@ ;; Implementation helper for clz; operates on 32 or 64-bit units. (decl do_clz (Type Type Gpr) Gpr) -(rule (do_clz ty orig_ty src) + +;; If available, we can use a plain lzcnt instruction here. Note no +;; special handling is required for zero inputs, because the machine +;; instruction does what the CLIF expects for zero, i.e. it returns +;; the operand width in bits. +(rule 1 (do_clz ty orig_ty src) + (if-let $true (use_lzcnt)) + (x64_lzcnt ty src)) + +(rule 0 (do_clz ty orig_ty src) (let ((highest_bit_index Reg (bsr_or_else ty src (imm_i64 $I64 -1))) (bits_minus_1 Reg (imm ty (u64_sub (ty_bits_u64 orig_ty) 1)))) (x64_sub ty bits_minus_1 highest_bit_index))) ;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Analogous to `clz` cases above, but using mirror instructions -;; (tzcnt vs lzcnt, bsf vs bsr). 
- -(rule 3 (lower (has_type (ty_32_or_64 ty) (ctz src))) - (if-let $true (use_bmi1)) - (x64_tzcnt ty src)) - (rule 2 (lower (has_type (ty_32_or_64 ty) (ctz src))) (do_ctz ty ty src)) -(rule 1 (lower - (has_type (ty_8_or_16 ty) - (ctz src))) - (do_ctz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero)))) +(rule 1 (lower (has_type (ty_8_or_16 ty) (ctz src))) + (let ((extended Gpr (extend_to_gpr src $I32 (ExtendKind.Zero))) + (stopbit Gpr (x64_or $I32 extended (RegMemImm.Imm (u32_shl 1 (ty_bits ty)))))) + (do_ctz $I32 ty stopbit))) (rule 0 (lower (has_type $I128 @@ -2196,7 +2191,14 @@ (value_regs result_lo (imm $I64 0)))) (decl do_ctz (Type Type Gpr) Gpr) -(rule (do_ctz ty orig_ty src) + +;; Analogous to `clz` cases above, but using mirror instructions +;; (tzcnt vs lzcnt, bsf vs bsr). +(rule 1 (do_ctz ty orig_ty src) + (if-let $true (use_bmi1)) + (x64_tzcnt ty src)) + +(rule 0 (do_ctz ty orig_ty src) (bsf_or_else ty src (imm $I64 (ty_bits_u64 orig_ty)))) ;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isle_prelude.rs b/cranelift/codegen/src/isle_prelude.rs index 424531d645..b20b9ac2d5 100644 --- a/cranelift/codegen/src/isle_prelude.rs +++ b/cranelift/codegen/src/isle_prelude.rs @@ -683,6 +683,11 @@ macro_rules! isle_common_prelude_methods { a & b } + #[inline] + fn u32_shl(&mut self, x: u32, y: u32) -> u32 { + x << y + } + #[inline] fn s32_add_fallible(&mut self, a: i32, b: i32) -> Option<i32> { a.checked_add(b) diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index 1df50f950d..3322c90cae 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -133,6 +133,9 @@ (decl pure u32_and (u32 u32) u32) (extern constructor u32_and u32_and) +(decl pure u32_shl (u32 u32) u32) +(extern constructor u32_shl u32_shl) + ;; Pure/fallible constructor that tries to add two `u32`s, interpreted ;; as signed values, and fails to match on overflow. 
(decl pure partial s32_add_fallible (i32 i32) i32) diff --git a/cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif b/cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif index 24c8e3bcaa..af8ca47f5a 100644 --- a/cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif +++ b/cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif @@ -1,6 +1,43 @@ test compile precise-output +set enable_llvm_abi_extensions=true target x86_64 has_lzcnt + +function %clz(i128) -> i128 { +block0(v0: i128): + v1 = clz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; lzcntq %rsi, %rcx +; lzcntq %rdi, %rax +; addq %rax, $64, %rax +; cmpq $64, %rcx +; cmovnzq %rcx, %rax, %rax +; xorq %rdx, %rdx, %rdx +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; lzcntq %rsi, %rcx +; lzcntq %rdi, %rax +; addq $0x40, %rax +; cmpq $0x40, %rcx +; cmovneq %rcx, %rax +; xorq %rdx, %rdx +; movq %rbp, %rsp +; popq %rbp +; retq + function %clz(i64) -> i64 { block0(v0: i64): v1 = clz v0 @@ -51,3 +88,61 @@ block0(v0: i32): ; popq %rbp ; retq +function %clz(i16) -> i16 { +block0(v0: i16): + v1 = clz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzwq %di, %rax +; lzcntq %rax, %rax +; subq %rax, $48, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzwq %di, %rax +; lzcntq %rax, %rax +; subq $0x30, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %clz(i8) -> i8 { +block0(v0: i8): + v1 = clz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzbq %dil, %rax +; lzcntq %rax, %rax +; subq %rax, $56, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzbq %dil, %rax +; lzcntq %rax, %rax +; subq $0x38, %rax +; movq %rbp, %rsp +; popq %rbp +; retq 
+ diff --git a/cranelift/filetests/filetests/isa/x64/clz.clif b/cranelift/filetests/filetests/isa/x64/clz.clif new file mode 100644 index 0000000000..542024eb24 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/clz.clif @@ -0,0 +1,198 @@ +test compile precise-output +set enable_llvm_abi_extensions=true +target x86_64 + + +function %clz(i128) -> i128 { +block0(v0: i128): + v1 = clz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %r8 +; movabsq $-1, %rcx +; bsrq %rsi, %r9 +; cmovzq %rcx, %r9, %r9 +; movl $63, %edi +; subq %rdi, %r9, %rdi +; movabsq $-1, %rdx +; bsrq %r8, %r10 +; cmovzq %rdx, %r10, %r10 +; movl $63, %eax +; subq %rax, %r10, %rax +; addq %rax, $64, %rax +; cmpq $64, %rdi +; cmovnzq %rdi, %rax, %rax +; xorq %rdx, %rdx, %rdx +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %r8 +; movq $18446744073709551615, %rcx +; bsrq %rsi, %r9 +; cmoveq %rcx, %r9 +; movl $0x3f, %edi +; subq %r9, %rdi +; movq $18446744073709551615, %rdx +; bsrq %r8, %r10 +; cmoveq %rdx, %r10 +; movl $0x3f, %eax +; subq %r10, %rax +; addq $0x40, %rax +; cmpq $0x40, %rdi +; cmovneq %rdi, %rax +; xorq %rdx, %rdx +; movq %rbp, %rsp +; popq %rbp +; retq + +function %clz(i64) -> i64 { +block0(v0: i64): + v1 = clz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movabsq $-1, %rax +; bsrq %rdi, %r8 +; cmovzq %rax, %r8, %r8 +; movl $63, %eax +; subq %rax, %r8, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq $18446744073709551615, %rax +; bsrq %rdi, %r8 +; cmoveq %rax, %r8 +; movl $0x3f, %eax +; subq %r8, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %clz(i32) -> i32 { +block0(v0: i32): + v1 = clz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movabsq $-1, %rax +; bsrl %edi, 
%r8d +; cmovzl %eax, %r8d, %r8d +; movl $31, %eax +; subl %eax, %r8d, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq $18446744073709551615, %rax +; bsrl %edi, %r8d +; cmovel %eax, %r8d +; movl $0x1f, %eax +; subl %r8d, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %clz(i16) -> i16 { +block0(v0: i16): + v1 = clz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzwq %di, %rax +; movabsq $-1, %rdx +; bsrq %rax, %r10 +; cmovzq %rdx, %r10, %r10 +; movl $63, %eax +; subq %rax, %r10, %rax +; subq %rax, $48, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzwq %di, %rax +; movq $18446744073709551615, %rdx +; bsrq %rax, %r10 +; cmoveq %rdx, %r10 +; movl $0x3f, %eax +; subq %r10, %rax +; subq $0x30, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %clz(i8) -> i8 { +block0(v0: i8): + v1 = clz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzbq %dil, %rax +; movabsq $-1, %rdx +; bsrq %rax, %r10 +; cmovzq %rdx, %r10, %r10 +; movl $63, %eax +; subq %rax, %r10, %rax +; subq %rax, $56, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzbq %dil, %rax +; movq $18446744073709551615, %rdx +; bsrq %rax, %r10 +; cmoveq %rdx, %r10 +; movl $0x3f, %eax +; subq %r10, %rax +; subq $0x38, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/ctz-bmi1.clif b/cranelift/filetests/filetests/isa/x64/ctz-bmi1.clif index 473efb005f..a2634db915 100644 --- a/cranelift/filetests/filetests/isa/x64/ctz-bmi1.clif +++ b/cranelift/filetests/filetests/isa/x64/ctz-bmi1.clif @@ -1,6 +1,43 @@ test compile precise-output +set enable_llvm_abi_extensions=true target x86_64 has_bmi1 + 
+function %ctz(i128) -> i128 { +block0(v0: i128): + v1 = ctz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; tzcntq %rdi, %rax +; tzcntq %rsi, %r9 +; addq %r9, $64, %r9 +; cmpq $64, %rax +; cmovzq %r9, %rax, %rax +; xorq %rdx, %rdx, %rdx +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; tzcntq %rdi, %rax +; tzcntq %rsi, %r9 +; addq $0x40, %r9 +; cmpq $0x40, %rax +; cmoveq %r9, %rax +; xorq %rdx, %rdx +; movq %rbp, %rsp +; popq %rbp +; retq + function %ctz(i64) -> i64 { block0(v0: i64): v1 = ctz v0 @@ -51,3 +88,61 @@ block0(v0: i32): ; popq %rbp ; retq +function %ctz(i16) -> i16 { +block0(v0: i16): + v1 = ctz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzwl %di, %ecx +; orl %ecx, $65536, %ecx +; tzcntl %ecx, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzwl %di, %ecx +; orl $0x10000, %ecx +; tzcntl %ecx, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ctz(i8) -> i8 { +block0(v0: i8): + v1 = ctz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzbl %dil, %ecx +; orl %ecx, $256, %ecx +; tzcntl %ecx, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzbl %dil, %ecx +; orl $0x100, %ecx +; tzcntl %ecx, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/ctz.clif b/cranelift/filetests/filetests/isa/x64/ctz.clif new file mode 100644 index 0000000000..9c9ef5c053 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/ctz.clif @@ -0,0 +1,172 @@ +test compile precise-output +set enable_llvm_abi_extensions=true +target x86_64 + + +function %ctz(i128) -> i128 { +block0(v0: i128): + v1 = ctz v0 + return v1 +} + +; VCode: +; pushq %rbp +; 
movq %rsp, %rbp +; block0: +; movl $64, %ecx +; bsfq %rdi, %rax +; cmovzq %rcx, %rax, %rax +; movl $64, %edi +; bsfq %rsi, %rdx +; cmovzq %rdi, %rdx, %rdx +; addq %rdx, $64, %rdx +; cmpq $64, %rax +; cmovzq %rdx, %rax, %rax +; xorq %rdx, %rdx, %rdx +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movl $0x40, %ecx +; bsfq %rdi, %rax +; cmoveq %rcx, %rax +; movl $0x40, %edi +; bsfq %rsi, %rdx +; cmoveq %rdi, %rdx +; addq $0x40, %rdx +; cmpq $0x40, %rax +; cmoveq %rdx, %rax +; xorq %rdx, %rdx +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ctz(i64) -> i64 { +block0(v0: i64): + v1 = ctz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movl $64, %ecx +; bsfq %rdi, %rax +; cmovzq %rcx, %rax, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movl $0x40, %ecx +; bsfq %rdi, %rax +; cmoveq %rcx, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ctz(i32) -> i32 { +block0(v0: i32): + v1 = ctz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movl $32, %ecx +; bsfl %edi, %eax +; cmovzl %ecx, %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movl $0x20, %ecx +; bsfl %edi, %eax +; cmovel %ecx, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ctz(i16) -> i16 { +block0(v0: i16): + v1 = ctz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzwl %di, %ecx +; orl %ecx, $65536, %ecx +; movl $16, %r9d +; bsfl %ecx, %eax +; cmovzl %r9d, %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzwl %di, %ecx +; orl $0x10000, %ecx +; movl $0x10, %r9d +; bsfl %ecx, %eax +; cmovel %r9d, %eax +; movq 
%rbp, %rsp +; popq %rbp +; retq + +function %ctz(i8) -> i8 { +block0(v0: i8): + v1 = ctz v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzbl %dil, %ecx +; orl %ecx, $256, %ecx +; movl $8, %r9d +; bsfl %ecx, %eax +; cmovzl %r9d, %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzbl %dil, %ecx +; orl $0x100, %ecx +; movl $8, %r9d +; bsfl %ecx, %eax +; cmovel %r9d, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/i128-bitops-count.clif b/cranelift/filetests/filetests/runtests/i128-bitops-count.clif index d97b5511ce..0fe625b12c 100644 --- a/cranelift/filetests/filetests/runtests/i128-bitops-count.clif +++ b/cranelift/filetests/filetests/runtests/i128-bitops-count.clif @@ -3,6 +3,9 @@ set enable_llvm_abi_extensions=true target aarch64 target s390x target x86_64 +target x86_64 has_lzcnt +target x86_64 has_bmi1 +target x86_64 has_popcnt has_sse42 target riscv64 target riscv64 has_zbb target riscv64 has_zbb has_zbs diff --git a/cranelift/filetests/filetests/runtests/popcnt.clif b/cranelift/filetests/filetests/runtests/popcnt.clif index 0a93cee98b..f128e89d10 100644 --- a/cranelift/filetests/filetests/runtests/popcnt.clif +++ b/cranelift/filetests/filetests/runtests/popcnt.clif @@ -3,7 +3,7 @@ test run target aarch64 target s390x target x86_64 -target x86_64 has_popcnt +target x86_64 has_popcnt has_sse42 target riscv64 target riscv64 has_c has_zcb