From 010e028d671c436940beaec1fbb7c5c9aec0c99d Mon Sep 17 00:00:00 2001
From: Sam Parker
Date: Wed, 25 May 2022 09:19:24 +0100
Subject: [PATCH] [AArch64] Port AtomicCAS to isle (#4140)

Copyright (c) 2022, Arm Limited.
---
 cranelift/codegen/src/isa/aarch64/inst.isle   | 38 +++++++++++++++-
 cranelift/codegen/src/isa/aarch64/lower.isle  | 28 ++++++++----
 .../codegen/src/isa/aarch64/lower_inst.rs     | 45 +------------------
 3 files changed, 57 insertions(+), 54 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index 1999a21fd9..12324ab7db 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -244,7 +244,9 @@
       (rn Reg)
       (ty Type))
 
-    ;; An atomic compare-and-swap operation. This instruction is sequentially consistent.
+    ;; An atomic compare-and-swap operation. These instructions require
+    ;; Large System Extension (LSE) support (FEAT_LSE) and have
+    ;; acquire-release semantics.
     (AtomicCAS
       (rs WritableReg)
       (rt Reg)
@@ -2129,6 +2131,16 @@
     )
     dst))
 
+;; Helper for emitting `MInst.AtomicCAS` instructions.
+(decl lse_atomic_cas (Reg Reg Reg Type) Reg)
+(rule (lse_atomic_cas addr expect replace ty)
+  (let (
+      (dst WritableReg (temp_writable_reg ty))
+      (_1 Unit (emit (MInst.Mov (operand_size ty) dst expect)))
+      (_2 Unit (emit (MInst.AtomicCAS dst replace addr ty)))
+    )
+    dst))
+
 ;; Helper for emitting `MInst.AtomicRMWLoop` instructions.
 ;; - Make sure that both args are in virtual regs, since in effect
 ;;   we have to do a parallel copy to get them safely to the AtomicRMW input
@@ -2145,3 +2157,27 @@
     (_ Unit (emit (MInst.AtomicRMWLoop ty op)))
   )
   (mov64_from_real 27)))
+
+;; Helper for emitting `MInst.AtomicCASLoop` instructions.
+;; This is very similar, but not identical, to the AtomicRmw case. Note
+;; that the AtomicCASLoop sequence does its own masking, so we don't need
+;; to worry about zero-extending narrow (I8/I16/I32) values here.
+;; Make sure that all three args are in virtual regs. See the corresponding
+;; comment for `atomic_rmw_loop` above.
+(decl atomic_cas_loop (Reg Reg Reg Type) Reg)
+(rule (atomic_cas_loop addr expect replace ty)
+  (let (
+      (v_addr Reg (ensure_in_vreg addr $I64))
+      (v_exp Reg (ensure_in_vreg expect $I64))
+      (v_rep Reg (ensure_in_vreg replace $I64))
+      ;; Move the args to the preordained AtomicCASLoop input regs.
+      (r_addr Reg (mov64_to_real 25 v_addr))
+      (r_exp Reg (mov64_to_real 26 v_exp))
+      (r_rep Reg (mov64_to_real 28 v_rep))
+      ;; Now the AtomicCASLoop itself, implemented in the normal way with a
+      ;; load-exclusive/store-exclusive loop.
+      (_ Unit (emit (MInst.AtomicCASLoop ty)))
+    )
+    ;; Finally, copy the preordained AtomicCASLoop output reg (x27) to its
+    ;; destination. Note that x24 and x28 are trashed by the sequence.
+    (mov64_from_real 27)))
diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle
index fed903523a..b298a30509 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -1225,39 +1225,39 @@
 
 ;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule (lower (and (use_lse)
+(rule 1 (lower (and (use_lse)
                   (has_type (valid_atomic_transaction ty)
                             (atomic_rmw flags (AtomicRmwOp.Add) addr src))))
       (lse_atomic_rmw (AtomicRMWOp.Add) addr src ty))
-(rule (lower (and (use_lse)
+(rule 1 (lower (and (use_lse)
                   (has_type (valid_atomic_transaction ty)
                             (atomic_rmw flags (AtomicRmwOp.Xor) addr src))))
       (lse_atomic_rmw (AtomicRMWOp.Eor) addr src ty))
-(rule (lower (and (use_lse)
+(rule 1 (lower (and (use_lse)
                   (has_type (valid_atomic_transaction ty)
                             (atomic_rmw flags (AtomicRmwOp.Or) addr src))))
       (lse_atomic_rmw (AtomicRMWOp.Set) addr src ty))
-(rule (lower (and (use_lse)
+(rule 1 (lower (and (use_lse)
                   (has_type (valid_atomic_transaction ty)
                             (atomic_rmw flags (AtomicRmwOp.Smax) addr src))))
       (lse_atomic_rmw (AtomicRMWOp.Smax) addr src ty))
-(rule (lower (and (use_lse)
+(rule 1 (lower (and (use_lse)
                   (has_type (valid_atomic_transaction ty)
                             (atomic_rmw flags (AtomicRmwOp.Smin) addr src))))
       (lse_atomic_rmw (AtomicRMWOp.Smin) addr src ty))
-(rule (lower (and (use_lse)
+(rule 1 (lower (and (use_lse)
                   (has_type (valid_atomic_transaction ty)
                             (atomic_rmw flags (AtomicRmwOp.Umax) addr src))))
       (lse_atomic_rmw (AtomicRMWOp.Umax) addr src ty))
-(rule (lower (and (use_lse)
+(rule 1 (lower (and (use_lse)
                   (has_type (valid_atomic_transaction ty)
                             (atomic_rmw flags (AtomicRmwOp.Umin) addr src))))
       (lse_atomic_rmw (AtomicRMWOp.Umin) addr src ty))
-(rule (lower (and (use_lse)
+(rule 1 (lower (and (use_lse)
                   (has_type (valid_atomic_transaction ty)
                             (atomic_rmw flags (AtomicRmwOp.Sub) addr src))))
       (lse_atomic_rmw (AtomicRMWOp.Add) addr (sub ty (zero_reg) src) ty))
-(rule (lower (and (use_lse)
+(rule 1 (lower (and (use_lse)
                   (has_type (valid_atomic_transaction ty)
                             (atomic_rmw flags (AtomicRmwOp.And) addr src))))
       (lse_atomic_rmw (AtomicRMWOp.Clr) addr (eon ty src (zero_reg)) ty))
@@ -1296,3 +1296,13 @@
 (rule (lower (has_type (valid_atomic_transaction ty)
                        (atomic_rmw flags (AtomicRmwOp.Xchg) addr src)))
       (atomic_rmw_loop (AtomicRMWLoopOp.Xchg) addr src ty))
+
+;;;; Rules for `AtomicCAS` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+(rule 1 (lower (and (use_lse)
+                  (has_type (valid_atomic_transaction ty)
+                            (atomic_cas flags addr src1 src2))))
+      (lse_atomic_cas addr src1 src2 ty))
+
+(rule (lower (and (has_type (valid_atomic_transaction ty)
+                            (atomic_cas flags addr src1 src2))))
+      (atomic_cas_loop addr src1 src2 ty))
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index f952785728..7618ed1b30 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -239,50 +239,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::AtomicRmw => implemented_in_isle(ctx),
 
-        Opcode::AtomicCas => {
-            let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-            let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
-            let ty_access = ty.unwrap();
-            assert!(is_valid_atomic_transaction_ty(ty_access));
-
-            if isa_flags.use_lse() {
-                ctx.emit(Inst::gen_move(r_dst, r_expected, ty_access));
-                ctx.emit(Inst::AtomicCAS {
-                    rs: r_dst,
-                    rt: r_replacement,
-                    rn: r_addr,
-                    ty: ty_access,
-                });
-            } else {
-                // This is very similar to, but not identical to, the AtomicRmw case. Note
-                // that the AtomicCASLoop sequence does its own masking, so we don't need to worry
-                // about zero-extending narrow (I8/I16/I32) values here.
-                // Make sure that all three args are in virtual regs. See corresponding comment
-                // for `Opcode::AtomicRmw` above.
-                r_addr = ctx.ensure_in_vreg(r_addr, I64);
-                r_expected = ctx.ensure_in_vreg(r_expected, I64);
-                r_replacement = ctx.ensure_in_vreg(r_replacement, I64);
-                // Move the args to the preordained AtomicCASLoop input regs
-                ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
-                ctx.emit(Inst::gen_move(
-                    Writable::from_reg(xreg(26)),
-                    r_expected,
-                    I64,
-                ));
-                ctx.emit(Inst::gen_move(
-                    Writable::from_reg(xreg(28)),
-                    r_replacement,
-                    I64,
-                ));
-                // Now the AtomicCASLoop itself, implemented in the normal way, with an LL-SC loop
-                ctx.emit(Inst::AtomicCASLoop { ty: ty_access });
-                // And finally, copy the preordained AtomicCASLoop output reg to its destination.
-                ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
-                // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
-            }
-        }
+        Opcode::AtomicCas => implemented_in_isle(ctx),
 
         Opcode::AtomicLoad => {
             let rt = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
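
For context on the operation being lowered: Cranelift's `atomic_cas` compares the
value at `addr` against `expected` and, on a match, stores `replacement`; in both
cases it returns the value it observed, and the LSE `AtomicCAS` form carries the
acquire-release ordering noted in inst.isle above. Below is a minimal host-level
sketch of those semantics in plain Rust, not Cranelift code; the helper name
`atomic_cas_u32` is hypothetical, and nothing beyond `std::sync::atomic` is assumed.

    use std::sync::atomic::{AtomicU32, Ordering};

    // Compare-and-swap that always returns the value observed at the
    // location, mirroring the result of the lowered `atomic_cas`.
    fn atomic_cas_u32(cell: &AtomicU32, expected: u32, replacement: u32) -> u32 {
        // AcqRel on success matches the acquire-release semantics noted in
        // the `MInst.AtomicCAS` comment; failure needs only the acquire half.
        match cell.compare_exchange(expected, replacement, Ordering::AcqRel, Ordering::Acquire) {
            Ok(old) | Err(old) => old,
        }
    }

    fn main() {
        let cell = AtomicU32::new(5);
        assert_eq!(atomic_cas_u32(&cell, 5, 9), 5); // matched: swapped in 9
        assert_eq!(atomic_cas_u32(&cell, 5, 7), 9); // mismatch: cell untouched
        assert_eq!(cell.load(Ordering::Relaxed), 9);
    }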
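
lower.isle above keeps two lowerings for `atomic_cas`: the priority-1 rule selects
the single-instruction LSE form whenever `use_lse` matches, and the default
priority-0 rule falls back to `atomic_cas_loop`, whose `MInst.AtomicCASLoop`
expands to a load-exclusive/store-exclusive loop at emission time. The loop exists
because an exclusive pair can fail spuriously (for example, when the reservation
is lost), so the sequence must retry. A rough Rust analogy of that retry shape,
again illustrative rather than Cranelift's code (`cas_via_llsc_loop` is a made-up
name): `compare_exchange_weak` is allowed the same spurious failure as a single
exclusive pair, which makes the loop explicit.

    use std::sync::atomic::{AtomicU32, Ordering};

    // Retry shape behind the non-LSE fallback: retry only while the failure
    // was spurious; stop once the store succeeds or the observed value
    // genuinely differs from `expected`.
    fn cas_via_llsc_loop(cell: &AtomicU32, expected: u32, replacement: u32) -> u32 {
        loop {
            match cell.compare_exchange_weak(expected, replacement, Ordering::AcqRel, Ordering::Acquire) {
                Ok(old) => return old,                     // store-exclusive succeeded
                Err(old) if old != expected => return old, // real mismatch: give up
                Err(_) => continue,                        // spurious failure: retry
            }
        }
    }

    fn main() {
        let cell = AtomicU32::new(1);
        assert_eq!(cas_via_llsc_loop(&cell, 1, 2), 1);
        assert_eq!(cell.load(Ordering::Relaxed), 2);
    }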