diff --git a/cranelift/codegen/meta/src/shared/formats.rs b/cranelift/codegen/meta/src/shared/formats.rs index 204f3fccb1..3d081951a5 100644 --- a/cranelift/codegen/meta/src/shared/formats.rs +++ b/cranelift/codegen/meta/src/shared/formats.rs @@ -3,7 +3,10 @@ use crate::shared::{entities::EntityRefs, immediates::Immediates}; use std::rc::Rc; pub(crate) struct Formats { + pub(crate) atomic_cas: Rc, + pub(crate) atomic_rmw: Rc, pub(crate) binary: Rc, + pub(crate) binary_imm8: Rc, pub(crate) binary_imm64: Rc, pub(crate) branch: Rc, pub(crate) branch_float: Rc, @@ -17,7 +20,6 @@ pub(crate) struct Formats { pub(crate) cond_trap: Rc, pub(crate) copy_special: Rc, pub(crate) copy_to_ssa: Rc, - pub(crate) binary_imm8: Rc, pub(crate) float_compare: Rc, pub(crate) float_cond: Rc, pub(crate) float_cond_trap: Rc, @@ -32,6 +34,7 @@ pub(crate) struct Formats { pub(crate) jump: Rc, pub(crate) load: Rc, pub(crate) load_complex: Rc, + pub(crate) load_no_offset: Rc, pub(crate) multiary: Rc, pub(crate) nullary: Rc, pub(crate) reg_fill: Rc, @@ -42,6 +45,7 @@ pub(crate) struct Formats { pub(crate) stack_store: Rc, pub(crate) store: Rc, pub(crate) store_complex: Rc, + pub(crate) store_no_offset: Rc, pub(crate) table_addr: Rc, pub(crate) ternary: Rc, pub(crate) ternary_imm8: Rc, @@ -202,6 +206,21 @@ impl Formats { func_addr: Builder::new("FuncAddr").imm(&entities.func_ref).build(), + atomic_rmw: Builder::new("AtomicRmw") + .imm(&imm.memflags) + .imm(&imm.atomic_rmw_op) + .value() + .value() + .build(), + + atomic_cas: Builder::new("AtomicCas") + .imm(&imm.memflags) + .value() + .value() + .value() + .typevar_operand(2) + .build(), + load: Builder::new("Load") .imm(&imm.memflags) .value() @@ -214,6 +233,11 @@ impl Formats { .imm(&imm.offset32) .build(), + load_no_offset: Builder::new("LoadNoOffset") + .imm(&imm.memflags) + .value() + .build(), + store: Builder::new("Store") .imm(&imm.memflags) .value() @@ -228,6 +252,12 @@ impl Formats { .imm(&imm.offset32) .build(), + store_no_offset: Builder::new("StoreNoOffset") + .imm(&imm.memflags) + .value() + .value() + .build(), + stack_load: Builder::new("StackLoad") .imm(&entities.stack_slot) .imm(&imm.offset32) diff --git a/cranelift/codegen/meta/src/shared/immediates.rs b/cranelift/codegen/meta/src/shared/immediates.rs index d8382e4067..0aa4129daf 100644 --- a/cranelift/codegen/meta/src/shared/immediates.rs +++ b/cranelift/codegen/meta/src/shared/immediates.rs @@ -71,6 +71,9 @@ pub(crate) struct Immediates { /// /// The Rust enum type also has a `User(u16)` variant for user-provided trap codes. pub trapcode: OperandKind, + + /// A code indicating the arithmetic operation to perform in an atomic_rmw memory access. 
+ pub atomic_rmw_op: OperandKind, } fn new_imm(format_field_name: &'static str, rust_type: &'static str) -> OperandKind { @@ -156,6 +159,17 @@ impl Immediates { trapcode_values.insert("int_divz", "IntegerDivisionByZero"); new_enum("code", "ir::TrapCode", trapcode_values).with_doc("A trap reason code.") }, + atomic_rmw_op: { + let mut atomic_rmw_op_values = HashMap::new(); + atomic_rmw_op_values.insert("add", "Add"); + atomic_rmw_op_values.insert("sub", "Sub"); + atomic_rmw_op_values.insert("and", "And"); + atomic_rmw_op_values.insert("or", "Or"); + atomic_rmw_op_values.insert("xor", "Xor"); + atomic_rmw_op_values.insert("xchg", "Xchg"); + new_enum("op", "ir::AtomicRmwOp", atomic_rmw_op_values) + .with_doc("Atomic Read-Modify-Write Ops") + }, } } } diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 057ae7a0b2..93f80d498e 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -4305,5 +4305,109 @@ pub(crate) fn define( .is_ghost(true), ); + // Instructions relating to atomic memory accesses and fences + let AtomicMem = &TypeVar::new( + "AtomicMem", + "Any type that can be stored in memory, which can be used in an atomic operation", + TypeSetBuilder::new().ints(8..64).build(), + ); + let x = &Operand::new("x", AtomicMem).with_doc("Value to be atomically stored"); + let a = &Operand::new("a", AtomicMem).with_doc("Value atomically loaded"); + let e = &Operand::new("e", AtomicMem).with_doc("Expected value in CAS"); + let p = &Operand::new("p", iAddr); + let MemFlags = &Operand::new("MemFlags", &imm.memflags); + let AtomicRmwOp = &Operand::new("AtomicRmwOp", &imm.atomic_rmw_op); + + ig.push( + Inst::new( + "atomic_rmw", + r#" + Atomically read-modify-write memory at `p`, with second operand `x`. The old value is + returned. `p` has the type of the target word size, and `x` may be an integer type of + 8, 16, 32 or 64 bits, even on a 32-bit target. The type of the returned value is the + same as the type of `x`. This operation is sequentially consistent and creates + happens-before edges that order normal (non-atomic) loads and stores. + "#, + &formats.atomic_rmw, + ) + .operands_in(vec![MemFlags, AtomicRmwOp, p, x]) + .operands_out(vec![a]) + .can_load(true) + .can_store(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_cas", + r#" + Perform an atomic compare-and-swap operation on memory at `p`, with expected value `e`, + storing `x` if the value at `p` equals `e`. The old value at `p` is returned, + regardless of whether the operation succeeds or fails. `p` has the type of the target + word size, and `x` and `e` must have the same type and the same size, which may be an + integer type of 8, 16, 32 or 64 bits, even on a 32-bit target. The type of the returned + value is the same as the type of `x` and `e`. This operation is sequentially + consistent and creates happens-before edges that order normal (non-atomic) loads and + stores. + "#, + &formats.atomic_cas, + ) + .operands_in(vec![MemFlags, p, e, x]) + .operands_out(vec![a]) + .can_load(true) + .can_store(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_load", + r#" + Atomically load from memory at `p`. + + This is a polymorphic instruction that can load any value type which has a memory + representation. It should only be used for integer types with 8, 16, 32 or 64 bits. 
+ This operation is sequentially consistent and creates happens-before edges that order + normal (non-atomic) loads and stores. + "#, + &formats.load_no_offset, + ) + .operands_in(vec![MemFlags, p]) + .operands_out(vec![a]) + .can_load(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_store", + r#" + Atomically store `x` to memory at `p`. + + This is a polymorphic instruction that can store any value type with a memory + representation. It should only be used for integer types with 8, 16, 32 or 64 bits. + This operation is sequentially consistent and creates happens-before edges that order + normal (non-atomic) loads and stores. + "#, + &formats.store_no_offset, + ) + .operands_in(vec![MemFlags, x, p]) + .can_store(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "fence", + r#" + A memory fence. This must provide ordering to ensure that, at a minimum, neither loads + nor stores of any kind may move forwards or backwards across the fence. This operation + is sequentially consistent. + "#, + &formats.nullary, + ) + .other_side_effects(true), + ); + ig.build() } diff --git a/cranelift/codegen/src/ir/atomic_rmw_op.rs b/cranelift/codegen/src/ir/atomic_rmw_op.rs new file mode 100644 index 0000000000..c93756147a --- /dev/null +++ b/cranelift/codegen/src/ir/atomic_rmw_op.rs @@ -0,0 +1,52 @@ +/// Describes the arithmetic operation in an atomic memory read-modify-write operation. +use core::fmt::{self, Display, Formatter}; +use core::str::FromStr; +#[cfg(feature = "enable-serde")] +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +/// Describes the arithmetic operation in an atomic memory read-modify-write operation. +pub enum AtomicRmwOp { + /// Add + Add, + /// Sub + Sub, + /// And + And, + /// Or + Or, + /// Xor + Xor, + /// Exchange + Xchg, +} + +impl Display for AtomicRmwOp { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + let s = match self { + AtomicRmwOp::Add => "add", + AtomicRmwOp::Sub => "sub", + AtomicRmwOp::And => "and", + AtomicRmwOp::Or => "or", + AtomicRmwOp::Xor => "xor", + AtomicRmwOp::Xchg => "xchg", + }; + f.write_str(s) + } +} + +impl FromStr for AtomicRmwOp { + type Err = (); + fn from_str(s: &str) -> Result { + match s { + "add" => Ok(AtomicRmwOp::Add), + "sub" => Ok(AtomicRmwOp::Sub), + "and" => Ok(AtomicRmwOp::And), + "or" => Ok(AtomicRmwOp::Or), + "xor" => Ok(AtomicRmwOp::Xor), + "xchg" => Ok(AtomicRmwOp::Xchg), + _ => Err(()), + } + } +} diff --git a/cranelift/codegen/src/ir/mod.rs b/cranelift/codegen/src/ir/mod.rs index 7f3c36b7be..4dbe90df34 100644 --- a/cranelift/codegen/src/ir/mod.rs +++ b/cranelift/codegen/src/ir/mod.rs @@ -1,5 +1,6 @@ //! Representation of Cranelift IR functions. +mod atomic_rmw_op; mod builder; pub mod constant; pub mod dfg; @@ -26,6 +27,7 @@ mod valueloc; #[cfg(feature = "enable-serde")] use serde::{Deserialize, Serialize}; +pub use crate::ir::atomic_rmw_op::AtomicRmwOp; pub use crate::ir::builder::{ InsertBuilder, InstBuilder, InstBuilderBase, InstInserterBase, ReplaceBuilder, }; diff --git a/cranelift/codegen/src/ir/trapcode.rs b/cranelift/codegen/src/ir/trapcode.rs index 0f1f62e3b6..612c979a0a 100644 --- a/cranelift/codegen/src/ir/trapcode.rs +++ b/cranelift/codegen/src/ir/trapcode.rs @@ -24,6 +24,9 @@ pub enum TrapCode { /// offset-guard pages. HeapOutOfBounds, + /// A wasm atomic operation was presented with a not-naturally-aligned linear-memory address. 
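(Illustrative aside, not part of the patch: the `Display`/`FromStr` pair on `AtomicRmwOp` defined above is meant to round-trip through the lower-case mnemonics used in the textual IR. A minimal sketch, assuming a test with `cranelift_codegen` in scope:)

    #[test]
    fn atomic_rmw_op_mnemonics_roundtrip() {
        use core::str::FromStr;
        use cranelift_codegen::ir::AtomicRmwOp;
        // Display and FromStr agree on the mnemonic for every variant.
        assert_eq!(AtomicRmwOp::Xchg.to_string(), "xchg");
        assert_eq!(AtomicRmwOp::from_str("xchg"), Ok(AtomicRmwOp::Xchg));
        // Unknown mnemonics are rejected with `Err(())`.
        assert!(AtomicRmwOp::from_str("nand").is_err());
    }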
+ HeapMisaligned, + /// A `table_addr` instruction detected an out-of-bounds error. TableOutOfBounds, @@ -59,6 +62,7 @@ impl Display for TrapCode { let identifier = match *self { StackOverflow => "stk_ovf", HeapOutOfBounds => "heap_oob", + HeapMisaligned => "heap_misaligned", TableOutOfBounds => "table_oob", IndirectCallToNull => "icall_null", BadSignature => "bad_sig", @@ -81,6 +85,7 @@ impl FromStr for TrapCode { match s { "stk_ovf" => Ok(StackOverflow), "heap_oob" => Ok(HeapOutOfBounds), + "heap_misaligned" => Ok(HeapMisaligned), "table_oob" => Ok(TableOutOfBounds), "icall_null" => Ok(IndirectCallToNull), "bad_sig" => Ok(BadSignature), @@ -101,9 +106,10 @@ mod tests { use alloc::string::ToString; // Everything but user-defined codes. - const CODES: [TrapCode; 10] = [ + const CODES: [TrapCode; 11] = [ TrapCode::StackOverflow, TrapCode::HeapOutOfBounds, + TrapCode::HeapMisaligned, TrapCode::TableOutOfBounds, TrapCode::IndirectCallToNull, TrapCode::BadSignature, diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index 43dcc816e5..28fad2763d 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -3,6 +3,7 @@ // Some variants are never constructed, but we still want them as options in the future. #![allow(dead_code)] +use crate::ir; use crate::ir::types::{F32X2, F32X4, F64X2, I16X4, I16X8, I32X2, I32X4, I64X2, I8X16, I8X8}; use crate::ir::Type; use crate::isa::aarch64::inst::*; @@ -14,6 +15,9 @@ use regalloc::{RealRegUniverse, Reg, Writable}; use core::convert::Into; use std::string::String; +//============================================================================= +// Instruction sub-components: shift and extend descriptors + /// A shift operator for a register or immediate. 
#[derive(Clone, Copy, Debug)] #[repr(u8)] @@ -645,3 +649,30 @@ impl VectorSize { } } } + +//============================================================================= +// Instruction sub-components: atomic memory update operations + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(u8)] +pub enum AtomicRMWOp { + Add, + Sub, + And, + Or, + Xor, + Xchg, +} + +impl AtomicRMWOp { + pub fn from(ir_op: ir::AtomicRmwOp) -> Self { + match ir_op { + ir::AtomicRmwOp::Add => AtomicRMWOp::Add, + ir::AtomicRmwOp::Sub => AtomicRMWOp::Sub, + ir::AtomicRmwOp::And => AtomicRMWOp::And, + ir::AtomicRmwOp::Or => AtomicRMWOp::Or, + ir::AtomicRmwOp::Xor => AtomicRMWOp::Xor, + ir::AtomicRmwOp::Xchg => AtomicRMWOp::Xchg, + } + } +} diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 9a280e0d01..8b063d36c1 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -378,6 +378,39 @@ fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable, rn: | machreg_to_vec(rd.to_reg()) } +fn enc_dmb_ish() -> u32 { + 0xD5033BBF +} + +fn enc_ldxr(ty: Type, rt: Writable, rn: Reg) -> u32 { + let sz = match ty { + I64 => 0b11, + I32 => 0b10, + I16 => 0b01, + I8 => 0b00, + _ => unreachable!(), + }; + 0b00001000_01011111_01111100_00000000 + | (sz << 30) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rt.to_reg()) +} + +fn enc_stxr(ty: Type, rs: Writable, rt: Reg, rn: Reg) -> u32 { + let sz = match ty { + I64 => 0b11, + I32 => 0b10, + I16 => 0b01, + I8 => 0b00, + _ => unreachable!(), + }; + 0b00001000_00000000_01111100_00000000 + | (sz << 30) + | (machreg_to_gpr(rs.to_reg()) << 16) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rt) +} + /// State carried between emissions of a sequence of instructions. #[derive(Default, Clone, Debug)] pub struct EmitState { @@ -1005,6 +1038,219 @@ impl MachInstEmit for Inst { } => { sink.put4(enc_ccmp_imm(size, rn, imm, nzcv, cond)); } + &Inst::AtomicRMW { ty, op, srcloc } => { + /* Emit this: + dmb ish + again: + ldxr{,b,h} x/w27, [x25] + op x28, x27, x26 // op is add,sub,and,orr,eor + stxr{,b,h} w24, x/w28, [x25] + cbnz x24, again + dmb ish + + Operand conventions: + IN: x25 (addr), x26 (2nd arg for op) + OUT: x27 (old value), x24 (trashed), x28 (trashed) + + It is unfortunate that, per the ARM documentation, x28 cannot be used for + both the store-data and success-flag operands of stxr. This causes the + instruction's behaviour to be "CONSTRAINED UNPREDICTABLE", so we use x24 + instead for the success-flag. + + In the case where the operation is 'xchg', the second insn is instead + mov x28, x26 + so that we simply write in the destination, the "2nd arg for op". 
+ */ + let xzr = zero_reg(); + let x24 = xreg(24); + let x25 = xreg(25); + let x26 = xreg(26); + let x27 = xreg(27); + let x28 = xreg(28); + let x24wr = writable_xreg(24); + let x27wr = writable_xreg(27); + let x28wr = writable_xreg(28); + let again_label = sink.get_label(); + + sink.put4(enc_dmb_ish()); // dmb ish + + // again: + sink.bind_label(again_label); + if let Some(srcloc) = srcloc { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25] + + if op == AtomicRMWOp::Xchg { + // mov x28, x26 + sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x28wr, xzr, x26)) + } else { + // add/sub/and/orr/eor x28, x27, x26 + let bits_31_21 = match op { + AtomicRMWOp::Add => 0b100_01011_00_0, + AtomicRMWOp::Sub => 0b110_01011_00_0, + AtomicRMWOp::And => 0b100_01010_00_0, + AtomicRMWOp::Or => 0b101_01010_00_0, + AtomicRMWOp::Xor => 0b110_01010_00_0, + AtomicRMWOp::Xchg => unreachable!(), + }; + sink.put4(enc_arith_rrr(bits_31_21, 0b000000, x28wr, x27, x26)); + } + + if let Some(srcloc) = srcloc { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25] + + // cbnz w24, again + // Note, we're actually testing x24, and relying on the default zero-high-half + // rule in the assignment that `stxr` does. + let br_offset = sink.cur_offset(); + sink.put4(enc_conditional_br( + BranchTarget::Label(again_label), + CondBrKind::NotZero(x24), + )); + sink.use_label_at_offset(br_offset, again_label, LabelUse::Branch19); + + sink.put4(enc_dmb_ish()); // dmb ish + } + &Inst::AtomicCAS { ty, srcloc } => { + /* Emit this: + dmb ish + again: + ldxr{,b,h} x/w27, [x25] + and x24, x26, MASK (= 2^size_bits - 1) + cmp x27, x24 + b.ne out + stxr{,b,h} w24, x/w28, [x25] + cbnz x24, again + out: + dmb ish + + Operand conventions: + IN: x25 (addr), x26 (expected value), x28 (replacement value) + OUT: x27 (old value), x24 (trashed) + */ + let xzr = zero_reg(); + let x24 = xreg(24); + let x25 = xreg(25); + let x26 = xreg(26); + let x27 = xreg(27); + let x28 = xreg(28); + let xzrwr = writable_zero_reg(); + let x24wr = writable_xreg(24); + let x27wr = writable_xreg(27); + let again_label = sink.get_label(); + let out_label = sink.get_label(); + + sink.put4(enc_dmb_ish()); // dmb ish + + // again: + sink.bind_label(again_label); + if let Some(srcloc) = srcloc { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25] + + if ty == I64 { + // mov x24, x26 + sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x24wr, xzr, x26)) + } else { + // and x24, x26, 0xFF/0xFFFF/0xFFFFFFFF + let (mask, s) = match ty { + I8 => (0xFF, 7), + I16 => (0xFFFF, 15), + I32 => (0xFFFFFFFF, 31), + _ => unreachable!(), + }; + sink.put4(enc_arith_rr_imml( + 0b100_100100, + ImmLogic::from_n_r_s(mask, true, 0, s, OperandSize::Size64).enc_bits(), + x26, + x24wr, + )) + } + + // cmp x27, x24 (== subs xzr, x27, x24) + sink.put4(enc_arith_rrr(0b111_01011_00_0, 0b000000, xzrwr, x27, x24)); + + // b.ne out + let br_out_offset = sink.cur_offset(); + sink.put4(enc_conditional_br( + BranchTarget::Label(out_label), + CondBrKind::Cond(Cond::Ne), + )); + sink.use_label_at_offset(br_out_offset, out_label, LabelUse::Branch19); + + if let Some(srcloc) = srcloc { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25] + + // cbnz w24, again. 
+ // Note, we're actually testing x24, and relying on the default zero-high-half + // rule in the assignment that `stxr` does. + let br_again_offset = sink.cur_offset(); + sink.put4(enc_conditional_br( + BranchTarget::Label(again_label), + CondBrKind::NotZero(x24), + )); + sink.use_label_at_offset(br_again_offset, again_label, LabelUse::Branch19); + + // out: + sink.bind_label(out_label); + sink.put4(enc_dmb_ish()); // dmb ish + } + &Inst::AtomicLoad { + ty, + r_data, + r_addr, + srcloc, + } => { + let op = match ty { + I8 => 0b0011100001, + I16 => 0b0111100001, + I32 => 0b1011100001, + I64 => 0b1111100001, + _ => unreachable!(), + }; + sink.put4(enc_dmb_ish()); // dmb ish + + if let Some(srcloc) = srcloc { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/); + sink.put4(enc_ldst_uimm12( + op, + uimm12scaled_zero, + r_addr, + r_data.to_reg(), + )); + } + &Inst::AtomicStore { + ty, + r_data, + r_addr, + srcloc, + } => { + let op = match ty { + I8 => 0b0011100000, + I16 => 0b0111100000, + I32 => 0b1011100000, + I64 => 0b1111100000, + _ => unreachable!(), + }; + + if let Some(srcloc) = srcloc { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/); + sink.put4(enc_ldst_uimm12(op, uimm12scaled_zero, r_addr, r_data)); + sink.put4(enc_dmb_ish()); // dmb ish + } + &Inst::Fence {} => { + sink.put4(enc_dmb_ish()); // dmb ish + } &Inst::FpuMove64 { rd, rn } => { sink.put4(enc_vecmov(/* 16b = */ false, rd, rn)); } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index c7d01d679e..5f00e3c7fd 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -4262,6 +4262,90 @@ fn test_aarch64_binemit() { "frintn d23, d24", )); + insns.push(( + Inst::AtomicRMW { + ty: I16, + op: AtomicRMWOp::Xor, + srcloc: None, + }, + "BF3B03D53B7F5F487C031ACA3C7F1848B8FFFFB5BF3B03D5", + "atomically { 16_bits_at_[x25]) Xor= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }", + )); + + insns.push(( + Inst::AtomicRMW { + ty: I32, + op: AtomicRMWOp::Xchg, + srcloc: None, + }, + "BF3B03D53B7F5F88FC031AAA3C7F1888B8FFFFB5BF3B03D5", + "atomically { 32_bits_at_[x25]) Xchg= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }", + )); + + insns.push(( + Inst::AtomicCAS { + ty: I8, + srcloc: None, + }, + "BF3B03D53B7F5F08581F40927F0318EB610000543C7F180878FFFFB5BF3B03D5", + "atomically { compare-and-swap(8_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + )); + + insns.push(( + Inst::AtomicCAS { + ty: I64, + srcloc: None, + }, + "BF3B03D53B7F5FC8F8031AAA7F0318EB610000543C7F18C878FFFFB5BF3B03D5", + "atomically { compare-and-swap(64_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + )); + + insns.push(( + Inst::AtomicLoad { + ty: I8, + r_data: writable_xreg(7), + r_addr: xreg(28), + srcloc: None, + }, + "BF3B03D587034039", + "atomically { x7 = zero_extend_8_bits_at[x28] }", + )); + + insns.push(( + Inst::AtomicLoad { + ty: I64, + r_data: writable_xreg(28), + r_addr: xreg(7), + srcloc: None, + }, + "BF3B03D5FC0040F9", + "atomically { x28 = zero_extend_64_bits_at[x7] }", + )); + + insns.push(( + Inst::AtomicStore { + ty: I16, + r_data: xreg(17), + r_addr: xreg(8), + srcloc: None, + }, + "11010079BF3B03D5", + "atomically { 16_bits_at[x8] = x17 }", + )); + + insns.push(( + Inst::AtomicStore { + ty: I32, + r_data: xreg(18), + r_addr: 
xreg(7), + srcloc: None, + }, + "F20000B9BF3B03D5", + "atomically { 32_bits_at[x7] = x18 }", + )); + + insns.push((Inst::Fence {}, "BF3B03D5", "dmb ish")); + let rru = create_reg_universe(&settings::Flags::new(settings::builder())); for (insn, expected_encoding, expected_printing) in insns { println!( diff --git a/cranelift/codegen/src/isa/aarch64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs index 7561d5ff46..f1a98ab66c 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs @@ -328,8 +328,7 @@ impl Imm12 { } /// An immediate for logical instructions. -#[derive(Clone, Debug)] -#[cfg_attr(test, derive(PartialEq))] +#[derive(Clone, Debug, PartialEq)] pub struct ImmLogic { /// The actual value. value: u64, @@ -551,6 +550,37 @@ impl ImmLogic { // For every ImmLogical immediate, the inverse can also be encoded. Self::maybe_from_u64(!self.value, self.size.to_ty()).unwrap() } + + /// This provides a safe(ish) way to avoid the costs of `maybe_from_u64` when we want to + /// encode a constant that we know at compiler-build time. It constructs an `ImmLogic` from + /// the fields `n`, `r`, `s` and `size`, but in a debug build, checks that `value_to_check` + /// corresponds to those four fields. The intention is that, in a non-debug build, this + /// reduces to something small enough that it will be a candidate for inlining. + pub fn from_n_r_s(value_to_check: u64, n: bool, r: u8, s: u8, size: OperandSize) -> Self { + // Construct it from the components we got given. + let imml = Self { + value: value_to_check, + n, + r, + s, + size, + }; + + // In debug mode, check that `n`/`r`/`s` are correct, given `value` and `size`. + debug_assert!(match ImmLogic::maybe_from_u64( + value_to_check, + if size == OperandSize::Size64 { + I64 + } else { + I32 + } + ) { + None => false, // fail: `value` is unrepresentable + Some(imml_check) => imml_check == imml, + }); + + imml + } } /// An immediate for shift instructions. diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index bfa296dba3..489e20576e 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -606,6 +606,68 @@ pub enum Inst { cond: Cond, }, + /// A synthetic insn, which is a load-linked store-conditional loop, that has the overall + /// effect of atomically modifying a memory location in a particular way. Because we have + /// no way to explain to the regalloc about earlyclobber registers, this instruction has + /// completely fixed operand registers, and we rely on the RA's coalescing to remove copies + /// in the surrounding code to the extent it can. The sequence is both preceded and + /// followed by a fence which is at least as comprehensive as that of the `Fence` + /// instruction below. This instruction is sequentially consistent. The operand + /// conventions are: + /// + /// x25 (rd) address + /// x26 (rd) second operand for `op` + /// x27 (wr) old value + /// x24 (wr) scratch reg; value afterwards has no meaning + /// x28 (wr) scratch reg; value afterwards has no meaning + AtomicRMW { + ty: Type, // I8, I16, I32 or I64 + op: AtomicRMWOp, + srcloc: Option, + }, + + /// Similar to AtomicRMW, a compare-and-swap operation implemented using a load-linked + /// store-conditional loop. (Although we could possibly implement it more directly using + /// CAS insns that are available in some revisions of AArch64 above 8.0). 
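(Illustrative aside, not part of the patch: the contract of `ImmLogic::from_n_r_s` introduced above is that the hand-supplied `n`/`r`/`s`/`size` fields describe exactly the encoding that `maybe_from_u64` would compute for `value_to_check`; the debug assertion enforces this. A sketch of that equivalence for the 0xFFFF mask used by the AtomicCAS expansion, assuming it lives in the AArch64 backend's own test module so that `ImmLogic`, `OperandSize` and `I64` are in scope:)

    #[test]
    fn imm_logic_from_n_r_s_matches_slow_path() {
        // Both construction paths must yield the same logical-immediate encoding.
        let slow = ImmLogic::maybe_from_u64(0xFFFF, I64).unwrap();
        let fast = ImmLogic::from_n_r_s(0xFFFF, true, 0, 15, OperandSize::Size64);
        assert_eq!(slow, fast); // relies on the `PartialEq` derive added above
    }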
The sequence is + /// both preceded and followed by a fence which is at least as comprehensive as that of the + /// `Fence` instruction below. This instruction is sequentially consistent. Note that the + /// operand conventions, although very similar to AtomicRMW, are different: + /// + /// x25 (rd) address + /// x26 (rd) expected value + /// x28 (rd) replacement value + /// x27 (wr) old value + /// x24 (wr) scratch reg; value afterwards has no meaning + AtomicCAS { + ty: Type, // I8, I16, I32 or I64 + srcloc: Option, + }, + + /// Read `ty` bits from address `r_addr`, zero extend the loaded value to 64 bits and put it + /// in `r_data`. The load instruction is preceded by a fence at least as comprehensive as + /// that of the `Fence` instruction below. This instruction is sequentially consistent. + AtomicLoad { + ty: Type, // I8, I16, I32 or I64 + r_data: Writable, + r_addr: Reg, + srcloc: Option, + }, + + /// Write the lowest `ty` bits of `r_data` to address `r_addr`, with a memory fence + /// instruction following the store. The fence is at least as comprehensive as that of the + /// `Fence` instruction below. This instruction is sequentially consistent. + AtomicStore { + ty: Type, // I8, I16, I32 or I64 + r_data: Reg, + r_addr: Reg, + srcloc: Option, + }, + + /// A memory fence. This must provide ordering to ensure that, at a minimum, neither loads + /// nor stores may move forwards or backwards across the fence. Currently emitted as "dmb + /// ish". This instruction is sequentially consistent. + Fence, + /// FPU move. Note that this is distinct from a vector-register /// move; moving just 64 bits seems to be significantly faster. FpuMove64 { @@ -1249,6 +1311,29 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { &Inst::CCmpImm { rn, .. } => { collector.add_use(rn); } + &Inst::AtomicRMW { .. } => { + collector.add_use(xreg(25)); + collector.add_use(xreg(26)); + collector.add_def(writable_xreg(24)); + collector.add_def(writable_xreg(27)); + collector.add_def(writable_xreg(28)); + } + &Inst::AtomicCAS { .. } => { + collector.add_use(xreg(25)); + collector.add_use(xreg(26)); + collector.add_use(xreg(28)); + collector.add_def(writable_xreg(24)); + collector.add_def(writable_xreg(27)); + } + &Inst::AtomicLoad { r_data, r_addr, .. } => { + collector.add_use(r_addr); + collector.add_def(r_data); + } + &Inst::AtomicStore { r_data, r_addr, .. } => { + collector.add_use(r_addr); + collector.add_use(r_data); + } + &Inst::Fence {} => {} &Inst::FpuMove64 { rd, rn } => { collector.add_def(rd); collector.add_use(rn); @@ -1721,6 +1806,29 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { &mut Inst::CCmpImm { ref mut rn, .. } => { map_use(mapper, rn); } + &mut Inst::AtomicRMW { .. } => { + // There are no vregs to map in this insn. + } + &mut Inst::AtomicCAS { .. } => { + // There are no vregs to map in this insn. + } + &mut Inst::AtomicLoad { + ref mut r_data, + ref mut r_addr, + .. + } => { + map_def(mapper, r_data); + map_use(mapper, r_addr); + } + &mut Inst::AtomicStore { + ref mut r_data, + ref mut r_addr, + .. + } => { + map_use(mapper, r_data); + map_use(mapper, r_addr); + } + &mut Inst::Fence {} => {} &mut Inst::FpuMove64 { ref mut rd, ref mut rn, @@ -2534,6 +2642,28 @@ impl Inst { let cond = cond.show_rru(mb_rru); format!("ccmp {}, {}, {}, {}", rn, imm, nzcv, cond) } + &Inst::AtomicRMW { ty, op, .. } => { + format!( + "atomically {{ {}_bits_at_[x25]) {:?}= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }}", + ty.bits(), op) + } + &Inst::AtomicCAS { ty, .. 
} => { + format!( + "atomically {{ compare-and-swap({}_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }}", + ty.bits()) + } + &Inst::AtomicLoad { ty, r_data, r_addr, .. } => { + format!( + "atomically {{ {} = zero_extend_{}_bits_at[{}] }}", + r_data.show_rru(mb_rru), ty.bits(), r_addr.show_rru(mb_rru)) + } + &Inst::AtomicStore { ty, r_data, r_addr, .. } => { + format!( + "atomically {{ {}_bits_at[{}] = {} }}", ty.bits(), r_addr.show_rru(mb_rru), r_data.show_rru(mb_rru)) + } + &Inst::Fence {} => { + format!("dmb ish") + } &Inst::FpuMove64 { rd, rn } => { let rd = rd.to_reg().show_rru(mb_rru); let rn = rn.show_rru(mb_rru); diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 321ee77c66..076145d6d6 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -10,7 +10,7 @@ use crate::ir::condcodes::{FloatCC, IntCC}; use crate::ir::types::*; use crate::ir::Inst as IRInst; -use crate::ir::{InstructionData, Opcode, TrapCode, Type}; +use crate::ir::{AtomicRmwOp, InstructionData, Opcode, TrapCode, Type}; use crate::machinst::lower::*; use crate::machinst::*; use crate::CodegenResult; @@ -1082,6 +1082,13 @@ pub(crate) fn inst_trapcode(data: &InstructionData) -> Option { } } +pub(crate) fn inst_atomic_rmw_op(data: &InstructionData) -> Option { + match data { + &InstructionData::AtomicRmw { op, .. } => Some(op), + _ => None, + } +} + /// Checks for an instance of `op` feeding the given input. pub(crate) fn maybe_input_insn>( c: &mut C, diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index c90530c21f..aae8b2e607 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -12,7 +12,7 @@ use crate::CodegenResult; use crate::isa::aarch64::abi::*; use crate::isa::aarch64::inst::*; -use regalloc::RegClass; +use regalloc::{RegClass, Writable}; use alloc::boxed::Box; use alloc::vec::Vec; @@ -21,6 +21,13 @@ use smallvec::SmallVec; use super::lower::*; +fn is_single_word_int_ty(ty: Type) -> bool { + match ty { + I8 | I16 | I32 | I64 => true, + _ => false, + } +} + /// Actually codegen an instruction's results into registers. pub(crate) fn lower_insn_to_regs>( ctx: &mut C, @@ -1108,6 +1115,123 @@ pub(crate) fn lower_insn_to_regs>( ctx.emit(inst); } + Opcode::AtomicRmw => { + let r_dst = get_output_reg(ctx, outputs[0]); + let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty_access = ty.unwrap(); + assert!(is_single_word_int_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + // Make sure that both args are in virtual regs, since in effect + // we have to do a parallel copy to get them safely to the AtomicRMW input + // regs, and that's not guaranteed safe if either is in a real reg. 
+ r_addr = ctx.ensure_in_vreg(r_addr, I64); + r_arg2 = ctx.ensure_in_vreg(r_arg2, I64); + // Move the args to the preordained AtomicRMW input regs + ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64)); + ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64)); + // Now the AtomicRMW insn itself + let op = AtomicRMWOp::from(inst_atomic_rmw_op(ctx.data(insn)).unwrap()); + ctx.emit(Inst::AtomicRMW { + ty: ty_access, + op, + srcloc, + }); + // And finally, copy the preordained AtomicRMW output reg to its destination. + ctx.emit(Inst::gen_move(r_dst, xreg(27), I64)); + // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that. + } + + Opcode::AtomicCas => { + // This is very similar to, but not identical to, the AtomicRmw case. Note + // that the AtomicCAS sequence does its own masking, so we don't need to worry + // about zero-extending narrow (I8/I16/I32) values here. + let r_dst = get_output_reg(ctx, outputs[0]); + let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); + let ty_access = ty.unwrap(); + assert!(is_single_word_int_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + // Make sure that all three args are in virtual regs. See corresponding comment + // for `Opcode::AtomicRmw` above. + r_addr = ctx.ensure_in_vreg(r_addr, I64); + r_expected = ctx.ensure_in_vreg(r_expected, I64); + r_replacement = ctx.ensure_in_vreg(r_replacement, I64); + // Move the args to the preordained AtomicCAS input regs + ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(xreg(26)), + r_expected, + I64, + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(xreg(28)), + r_replacement, + I64, + )); + // Now the AtomicCAS itself, implemented in the normal way, with an LL-SC loop + ctx.emit(Inst::AtomicCAS { + ty: ty_access, + srcloc, + }); + // And finally, copy the preordained AtomicCAS output reg to its destination. + ctx.emit(Inst::gen_move(r_dst, xreg(27), I64)); + // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that. 
+ } + + Opcode::AtomicLoad => { + let r_data = get_output_reg(ctx, outputs[0]); + let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let ty_access = ty.unwrap(); + assert!(is_single_word_int_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + ctx.emit(Inst::AtomicLoad { + ty: ty_access, + r_data, + r_addr, + srcloc, + }); + } + + Opcode::AtomicStore => { + let r_data = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty_access = ctx.input_ty(insn, 0); + assert!(is_single_word_int_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + ctx.emit(Inst::AtomicStore { + ty: ty_access, + r_data, + r_addr, + srcloc, + }); + } + + Opcode::Fence => { + ctx.emit(Inst::Fence {}); + } + Opcode::StackLoad | Opcode::StackStore => { panic!("Direct stack memory access not supported; should not be used by Wasm"); } @@ -1544,11 +1668,10 @@ pub(crate) fn lower_insn_to_regs>( cond }; - ctx.emit(Inst::TrapIf { + ctx.emit_safepoint(Inst::TrapIf { trap_info, kind: CondBrKind::Cond(cond), }); - ctx.emit_safepoint(Inst::Udf { trap_info }) } Opcode::Safepoint => { diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index c9cb27ba35..d1b08e4d27 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -160,6 +160,9 @@ pub trait LowerCtx { fn is_reg_needed(&self, ir_inst: Inst, reg: Reg) -> bool; /// Retrieve constant data given a handle. fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData; + /// Cause the value in `reg` to be in a virtual reg, by copying it into a new virtual reg + /// if `reg` is a real reg. `ty` describes the type of the value in `reg`. + fn ensure_in_vreg(&mut self, reg: Reg, ty: Type) -> Reg; } /// A representation of all of the ways in which an instruction input is @@ -904,10 +907,14 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { fn memflags(&self, ir_inst: Inst) -> Option { match &self.f.dfg[ir_inst] { + &InstructionData::AtomicCas { flags, .. } => Some(flags), + &InstructionData::AtomicRmw { flags, .. } => Some(flags), &InstructionData::Load { flags, .. } | &InstructionData::LoadComplex { flags, .. } + | &InstructionData::LoadNoOffset { flags, .. } | &InstructionData::Store { flags, .. } | &InstructionData::StoreComplex { flags, .. } => Some(flags), + &InstructionData::StoreNoOffset { flags, .. } => Some(flags), _ => None, } } @@ -989,6 +996,17 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData { self.f.dfg.constants.get(constant_handle) } + + fn ensure_in_vreg(&mut self, reg: Reg, ty: Type) -> Reg { + if reg.is_virtual() { + reg + } else { + let rc = reg.get_class(); + let new_reg = self.alloc_tmp(rc, ty); + self.emit(I::gen_move(new_reg, reg, ty)); + new_reg.to_reg() + } + } } /// Visit all successors of a block with a given visitor closure. diff --git a/cranelift/codegen/src/verifier/mod.rs b/cranelift/codegen/src/verifier/mod.rs index aff4bcae26..dae9ff983d 100644 --- a/cranelift/codegen/src/verifier/mod.rs +++ b/cranelift/codegen/src/verifier/mod.rs @@ -749,7 +749,11 @@ impl<'a> Verifier<'a> { } // Exhaustive list so we can't forget to add new formats - Unary { .. 
} + AtomicCas { .. } + | AtomicRmw { .. } + | LoadNoOffset { .. } + | StoreNoOffset { .. } + | Unary { .. } | UnaryConst { .. } | UnaryImm { .. } | UnaryIeee32 { .. } diff --git a/cranelift/codegen/src/write.rs b/cranelift/codegen/src/write.rs index ba4543d39f..8d73e2d1e4 100644 --- a/cranelift/codegen/src/write.rs +++ b/cranelift/codegen/src/write.rs @@ -498,6 +498,10 @@ pub fn write_operands( let pool = &dfg.value_lists; use crate::ir::instructions::InstructionData::*; match dfg[inst] { + AtomicRmw { op, args, .. } => write!(w, " {}, {}, {}", op, args[0], args[1]), + AtomicCas { args, .. } => write!(w, " {}, {}, {}", args[0], args[1], args[2]), + LoadNoOffset { flags, arg, .. } => write!(w, "{} {}", flags, arg), + StoreNoOffset { flags, args, .. } => write!(w, "{} {}, {}", flags, args[0], args[1]), Unary { arg, .. } => write!(w, " {}", arg), UnaryImm { imm, .. } => write!(w, " {}", imm), UnaryIeee32 { imm, .. } => write!(w, " {}", imm), diff --git a/cranelift/reader/src/parser.rs b/cranelift/reader/src/parser.rs index 4d483847fe..44a2ea30e3 100644 --- a/cranelift/reader/src/parser.rs +++ b/cranelift/reader/src/parser.rs @@ -3202,6 +3202,52 @@ impl<'a> Parser<'a> { code, } } + InstructionFormat::AtomicCas => { + let flags = self.optional_memflags(); + let addr = self.match_value("expected SSA value address")?; + self.match_token(Token::Comma, "expected ',' between operands")?; + let expected = self.match_value("expected SSA value address")?; + self.match_token(Token::Comma, "expected ',' between operands")?; + let replacement = self.match_value("expected SSA value address")?; + InstructionData::AtomicCas { + opcode, + flags, + args: [addr, expected, replacement], + } + } + InstructionFormat::AtomicRmw => { + let flags = self.optional_memflags(); + let op = self.match_enum("expected AtomicRmwOp")?; + let addr = self.match_value("expected SSA value address")?; + self.match_token(Token::Comma, "expected ',' between operands")?; + let arg2 = self.match_value("expected SSA value address")?; + InstructionData::AtomicRmw { + opcode, + flags, + op, + args: [addr, arg2], + } + } + InstructionFormat::LoadNoOffset => { + let flags = self.optional_memflags(); + let addr = self.match_value("expected SSA value address")?; + InstructionData::LoadNoOffset { + opcode, + flags, + arg: addr, + } + } + InstructionFormat::StoreNoOffset => { + let flags = self.optional_memflags(); + let arg = self.match_value("expected SSA value operand")?; + self.match_token(Token::Comma, "expected ',' between operands")?; + let addr = self.match_value("expected SSA value address")?; + InstructionData::StoreNoOffset { + opcode, + flags, + args: [arg, addr], + } + } }; Ok(idata) } diff --git a/cranelift/serde/src/serde_clif_json.rs b/cranelift/serde/src/serde_clif_json.rs index 80ee84633a..3ec9917553 100644 --- a/cranelift/serde/src/serde_clif_json.rs +++ b/cranelift/serde/src/serde_clif_json.rs @@ -252,6 +252,27 @@ pub enum SerInstData { cond: String, code: String, }, + AtomicCas { + opcode: String, + args: [String; 3], + flags: String, + }, + AtomicRmw { + opcode: String, + args: [String; 2], + flags: String, + op: String, + }, + LoadNoOffset { + opcode: String, + arg: String, + flags: String, + }, + StoreNoOffset { + opcode: String, + args: [String; 2], + flags: String, + }, } /// Convert Cranelift IR instructions to JSON format. 
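(Illustrative aside, not part of the patch: the four new instruction formats also generate `InstBuilder` methods, which is how frontends are expected to create these instructions; the wasm translator changes further down use exactly these calls. A rough sketch, assuming a builder `b` and already-defined values `p` (address), `x` and `e`:)

    let flags = MemFlags::new();
    // atomic_rmw: controlling type, flags, op, address, second operand.
    let old = b.ins().atomic_rmw(I64, flags, AtomicRmwOp::Add, p, x);
    // atomic_cas: flags, address, expected value, replacement value.
    let prev = b.ins().atomic_cas(flags, p, e, x);
    // atomic_load / atomic_store use the LoadNoOffset / StoreNoOffset formats.
    let loaded = b.ins().atomic_load(I32, flags, p);
    b.ins().atomic_store(flags, x, p); // data first, then address, per StoreNoOffset
    b.ins().fence();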
@@ -739,6 +760,53 @@ pub fn get_inst_data(inst_index: Inst, func: &Function) -> SerInstData { cond: cond.to_string(), code: code.to_string(), }, + InstructionData::AtomicCas { + opcode, + args, + flags, + } => { + let hold_args = [ + args[0].to_string(), + args[1].to_string(), + args[2].to_string(), + ]; + SerInstData::AtomicCas { + opcode: opcode.to_string(), + args: hold_args, + flags: flags.to_string(), + } + } + InstructionData::AtomicRmw { + opcode, + args, + flags, + op, + } => { + let hold_args = [args[0].to_string(), args[1].to_string()]; + SerInstData::AtomicRmw { + opcode: opcode.to_string(), + args: hold_args, + flags: flags.to_string(), + op: op.to_string(), + } + } + InstructionData::LoadNoOffset { opcode, arg, flags } => SerInstData::LoadNoOffset { + opcode: opcode.to_string(), + arg: arg.to_string(), + flags: flags.to_string(), + }, + InstructionData::StoreNoOffset { + opcode, + args, + flags, + } => { + let hold_args = [args[0].to_string(), args[1].to_string()]; + SerInstData::StoreNoOffset { + opcode: opcode.to_string(), + args: hold_args, + flags: flags.to_string(), + } + } } } diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 79eae5c2a6..c7a95ccabc 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -36,7 +36,7 @@ use cranelift_codegen::ir::condcodes::{FloatCC, IntCC}; use cranelift_codegen::ir::immediates::Offset32; use cranelift_codegen::ir::types::*; use cranelift_codegen::ir::{ - self, ConstantData, InstBuilder, JumpTableData, MemFlags, Value, ValueLabel, + self, AtomicRmwOp, ConstantData, InstBuilder, JumpTableData, MemFlags, Value, ValueLabel, }; use cranelift_codegen::packed_option::ReservedValue; use cranelift_frontend::{FunctionBuilder, Variable}; @@ -1051,74 +1051,285 @@ pub fn translate_operator( let index = FuncIndex::from_u32(*function_index); state.push1(environ.translate_ref_func(builder.cursor(), index)?); } - Operator::AtomicNotify { .. } - | Operator::I32AtomicWait { .. } - | Operator::I64AtomicWait { .. } - | Operator::I32AtomicLoad { .. } - | Operator::I64AtomicLoad { .. } - | Operator::I32AtomicLoad8U { .. } - | Operator::I32AtomicLoad16U { .. } - | Operator::I64AtomicLoad8U { .. } - | Operator::I64AtomicLoad16U { .. } - | Operator::I64AtomicLoad32U { .. } - | Operator::I32AtomicStore { .. } - | Operator::I64AtomicStore { .. } - | Operator::I32AtomicStore8 { .. } - | Operator::I32AtomicStore16 { .. } - | Operator::I64AtomicStore8 { .. } - | Operator::I64AtomicStore16 { .. } - | Operator::I64AtomicStore32 { .. } - | Operator::I32AtomicRmwAdd { .. } - | Operator::I64AtomicRmwAdd { .. } - | Operator::I32AtomicRmw8AddU { .. } - | Operator::I32AtomicRmw16AddU { .. } - | Operator::I64AtomicRmw8AddU { .. } - | Operator::I64AtomicRmw16AddU { .. } - | Operator::I64AtomicRmw32AddU { .. } - | Operator::I32AtomicRmwSub { .. } - | Operator::I64AtomicRmwSub { .. } - | Operator::I32AtomicRmw8SubU { .. } - | Operator::I32AtomicRmw16SubU { .. } - | Operator::I64AtomicRmw8SubU { .. } - | Operator::I64AtomicRmw16SubU { .. } - | Operator::I64AtomicRmw32SubU { .. } - | Operator::I32AtomicRmwAnd { .. } - | Operator::I64AtomicRmwAnd { .. } - | Operator::I32AtomicRmw8AndU { .. } - | Operator::I32AtomicRmw16AndU { .. } - | Operator::I64AtomicRmw8AndU { .. } - | Operator::I64AtomicRmw16AndU { .. } - | Operator::I64AtomicRmw32AndU { .. } - | Operator::I32AtomicRmwOr { .. } - | Operator::I64AtomicRmwOr { .. } - | Operator::I32AtomicRmw8OrU { .. } - | Operator::I32AtomicRmw16OrU { .. 
} - | Operator::I64AtomicRmw8OrU { .. } - | Operator::I64AtomicRmw16OrU { .. } - | Operator::I64AtomicRmw32OrU { .. } - | Operator::I32AtomicRmwXor { .. } - | Operator::I64AtomicRmwXor { .. } - | Operator::I32AtomicRmw8XorU { .. } - | Operator::I32AtomicRmw16XorU { .. } - | Operator::I64AtomicRmw8XorU { .. } - | Operator::I64AtomicRmw16XorU { .. } - | Operator::I64AtomicRmw32XorU { .. } - | Operator::I32AtomicRmwXchg { .. } - | Operator::I64AtomicRmwXchg { .. } - | Operator::I32AtomicRmw8XchgU { .. } - | Operator::I32AtomicRmw16XchgU { .. } - | Operator::I64AtomicRmw8XchgU { .. } - | Operator::I64AtomicRmw16XchgU { .. } - | Operator::I64AtomicRmw32XchgU { .. } - | Operator::I32AtomicRmwCmpxchg { .. } - | Operator::I64AtomicRmwCmpxchg { .. } - | Operator::I32AtomicRmw8CmpxchgU { .. } - | Operator::I32AtomicRmw16CmpxchgU { .. } - | Operator::I64AtomicRmw8CmpxchgU { .. } - | Operator::I64AtomicRmw16CmpxchgU { .. } - | Operator::I64AtomicRmw32CmpxchgU { .. } - | Operator::AtomicFence { .. } => { - return Err(wasm_unsupported!("proposed thread operator {:?}", op)); + Operator::I32AtomicWait { .. } | Operator::I64AtomicWait { .. } => { + // The WebAssembly MVP only supports one linear memory and + // wasmparser will ensure that the memory indices specified are + // zero. + let implied_ty = match op { + Operator::I64AtomicWait { .. } => I64, + Operator::I32AtomicWait { .. } => I32, + _ => unreachable!(), + }; + let heap_index = MemoryIndex::from_u32(0); + let heap = state.get_heap(builder.func, 0, environ)?; + let timeout = state.pop1(); // 64 (fixed) + let expected = state.pop1(); // 32 or 64 (per the `Ixx` in `IxxAtomicWait`) + let addr = state.pop1(); // 32 (fixed) + assert!(builder.func.dfg.value_type(expected) == implied_ty); + // `fn translate_atomic_wait` can inspect the type of `expected` to figure out what + // code it needs to generate, if it wants. + let res = environ.translate_atomic_wait( + builder.cursor(), + heap_index, + heap, + addr, + expected, + timeout, + )?; + state.push1(res); + } + Operator::AtomicNotify { .. } => { + // The WebAssembly MVP only supports one linear memory and + // wasmparser will ensure that the memory indices specified are + // zero. 
+ let heap_index = MemoryIndex::from_u32(0); + let heap = state.get_heap(builder.func, 0, environ)?; + let count = state.pop1(); // 32 (fixed) + let addr = state.pop1(); // 32 (fixed) + let res = + environ.translate_atomic_notify(builder.cursor(), heap_index, heap, addr, count)?; + state.push1(res); + } + Operator::I32AtomicLoad { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_load(I32, I32, *offset, builder, state, environ)?, + Operator::I64AtomicLoad { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_load(I64, I64, *offset, builder, state, environ)?, + Operator::I32AtomicLoad8U { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_load(I32, I8, *offset, builder, state, environ)?, + Operator::I32AtomicLoad16U { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_load(I32, I16, *offset, builder, state, environ)?, + Operator::I64AtomicLoad8U { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_load(I64, I8, *offset, builder, state, environ)?, + Operator::I64AtomicLoad16U { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_load(I64, I16, *offset, builder, state, environ)?, + Operator::I64AtomicLoad32U { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_load(I64, I32, *offset, builder, state, environ)?, + + Operator::I32AtomicStore { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_store(I32, *offset, builder, state, environ)?, + Operator::I64AtomicStore { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_store(I64, *offset, builder, state, environ)?, + Operator::I32AtomicStore8 { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_store(I8, *offset, builder, state, environ)?, + Operator::I32AtomicStore16 { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_store(I16, *offset, builder, state, environ)?, + Operator::I64AtomicStore8 { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_store(I8, *offset, builder, state, environ)?, + Operator::I64AtomicStore16 { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_store(I16, *offset, builder, state, environ)?, + Operator::I64AtomicStore32 { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_store(I32, *offset, builder, state, environ)?, + + Operator::I32AtomicRmwAdd { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I32, AtomicRmwOp::Add, *offset, builder, state, environ)?, + Operator::I64AtomicRmwAdd { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I64, AtomicRmwOp::Add, *offset, builder, state, environ)?, + Operator::I32AtomicRmw8AddU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I8, AtomicRmwOp::Add, *offset, builder, state, environ)?, + Operator::I32AtomicRmw16AddU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I16, AtomicRmwOp::Add, *offset, builder, state, environ)?, + Operator::I64AtomicRmw8AddU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I8, AtomicRmwOp::Add, *offset, builder, state, environ)?, + Operator::I64AtomicRmw16AddU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I16, AtomicRmwOp::Add, *offset, builder, state, environ)?, + Operator::I64AtomicRmw32AddU { + memarg: MemoryImmediate { flags: _, offset }, + } => 
translate_atomic_rmw(I64, I32, AtomicRmwOp::Add, *offset, builder, state, environ)?, + + Operator::I32AtomicRmwSub { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I32, AtomicRmwOp::Sub, *offset, builder, state, environ)?, + Operator::I64AtomicRmwSub { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I64, AtomicRmwOp::Sub, *offset, builder, state, environ)?, + Operator::I32AtomicRmw8SubU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I8, AtomicRmwOp::Sub, *offset, builder, state, environ)?, + Operator::I32AtomicRmw16SubU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I16, AtomicRmwOp::Sub, *offset, builder, state, environ)?, + Operator::I64AtomicRmw8SubU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I8, AtomicRmwOp::Sub, *offset, builder, state, environ)?, + Operator::I64AtomicRmw16SubU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I16, AtomicRmwOp::Sub, *offset, builder, state, environ)?, + Operator::I64AtomicRmw32SubU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I32, AtomicRmwOp::Sub, *offset, builder, state, environ)?, + + Operator::I32AtomicRmwAnd { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I32, AtomicRmwOp::And, *offset, builder, state, environ)?, + Operator::I64AtomicRmwAnd { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I64, AtomicRmwOp::And, *offset, builder, state, environ)?, + Operator::I32AtomicRmw8AndU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I8, AtomicRmwOp::And, *offset, builder, state, environ)?, + Operator::I32AtomicRmw16AndU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I16, AtomicRmwOp::And, *offset, builder, state, environ)?, + Operator::I64AtomicRmw8AndU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I8, AtomicRmwOp::And, *offset, builder, state, environ)?, + Operator::I64AtomicRmw16AndU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I16, AtomicRmwOp::And, *offset, builder, state, environ)?, + Operator::I64AtomicRmw32AndU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I32, AtomicRmwOp::And, *offset, builder, state, environ)?, + + Operator::I32AtomicRmwOr { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I32, AtomicRmwOp::Or, *offset, builder, state, environ)?, + Operator::I64AtomicRmwOr { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I64, AtomicRmwOp::Or, *offset, builder, state, environ)?, + Operator::I32AtomicRmw8OrU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I8, AtomicRmwOp::Or, *offset, builder, state, environ)?, + Operator::I32AtomicRmw16OrU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I16, AtomicRmwOp::Or, *offset, builder, state, environ)?, + Operator::I64AtomicRmw8OrU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I8, AtomicRmwOp::Or, *offset, builder, state, environ)?, + Operator::I64AtomicRmw16OrU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I16, AtomicRmwOp::Or, *offset, builder, state, environ)?, + Operator::I64AtomicRmw32OrU { + memarg: 
MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I32, AtomicRmwOp::Or, *offset, builder, state, environ)?, + + Operator::I32AtomicRmwXor { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I32, AtomicRmwOp::Xor, *offset, builder, state, environ)?, + Operator::I64AtomicRmwXor { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I64, AtomicRmwOp::Xor, *offset, builder, state, environ)?, + Operator::I32AtomicRmw8XorU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I8, AtomicRmwOp::Xor, *offset, builder, state, environ)?, + Operator::I32AtomicRmw16XorU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I16, AtomicRmwOp::Xor, *offset, builder, state, environ)?, + Operator::I64AtomicRmw8XorU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I8, AtomicRmwOp::Xor, *offset, builder, state, environ)?, + Operator::I64AtomicRmw16XorU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I16, AtomicRmwOp::Xor, *offset, builder, state, environ)?, + Operator::I64AtomicRmw32XorU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I32, AtomicRmwOp::Xor, *offset, builder, state, environ)?, + + Operator::I32AtomicRmwXchg { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw( + I32, + I32, + AtomicRmwOp::Xchg, + *offset, + builder, + state, + environ, + )?, + Operator::I64AtomicRmwXchg { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw( + I64, + I64, + AtomicRmwOp::Xchg, + *offset, + builder, + state, + environ, + )?, + Operator::I32AtomicRmw8XchgU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I32, I8, AtomicRmwOp::Xchg, *offset, builder, state, environ)?, + Operator::I32AtomicRmw16XchgU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw( + I32, + I16, + AtomicRmwOp::Xchg, + *offset, + builder, + state, + environ, + )?, + Operator::I64AtomicRmw8XchgU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw(I64, I8, AtomicRmwOp::Xchg, *offset, builder, state, environ)?, + Operator::I64AtomicRmw16XchgU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw( + I64, + I16, + AtomicRmwOp::Xchg, + *offset, + builder, + state, + environ, + )?, + Operator::I64AtomicRmw32XchgU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_rmw( + I64, + I32, + AtomicRmwOp::Xchg, + *offset, + builder, + state, + environ, + )?, + + Operator::I32AtomicRmwCmpxchg { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_cas(I32, I32, *offset, builder, state, environ)?, + Operator::I64AtomicRmwCmpxchg { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_cas(I64, I64, *offset, builder, state, environ)?, + Operator::I32AtomicRmw8CmpxchgU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_cas(I32, I8, *offset, builder, state, environ)?, + Operator::I32AtomicRmw16CmpxchgU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_cas(I32, I16, *offset, builder, state, environ)?, + Operator::I64AtomicRmw8CmpxchgU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_cas(I64, I8, *offset, builder, state, environ)?, + Operator::I64AtomicRmw16CmpxchgU { + memarg: MemoryImmediate { flags: _, offset }, + } => translate_atomic_cas(I64, I16, 
+        Operator::I64AtomicRmw32CmpxchgU {
+            memarg: MemoryImmediate { flags: _, offset },
+        } => translate_atomic_cas(I64, I32, *offset, builder, state, environ)?,
+
+        Operator::AtomicFence { .. } => {
+            builder.ins().fence();
+        }
         Operator::MemoryCopy => {
             // The WebAssembly MVP only supports one linear memory and
@@ -1906,7 +2117,7 @@ fn translate_store(
         environ.pointer_type(),
         builder,
     );
-    // See the comments in `translate_load` about the flags.
+    // See the comments in `prepare_load` about the flags.
     let flags = MemFlags::new();
     builder
         .ins()
@@ -1930,6 +2141,233 @@ fn translate_icmp(cc: IntCC, builder: &mut FunctionBuilder, state: &mut FuncTran
     state.push1(builder.ins().bint(I32, val));
 }

+// For an atomic memory operation, emit an alignment check for the linear memory address,
+// and then compute the final effective address.
+fn finalise_atomic_mem_addr<FE: FuncEnvironment + ?Sized>(
+    linear_mem_addr: Value,
+    offset: u32,
+    access_ty: Type,
+    builder: &mut FunctionBuilder,
+    state: &mut FuncTranslationState,
+    environ: &mut FE,
+) -> WasmResult<Value> {
+    // Check the alignment of the final address, that is, of `linear_mem_addr + offset`.
+    let access_ty_bytes = access_ty.bytes();
+    let final_lma = builder.ins().iadd_imm(linear_mem_addr, i64::from(offset));
+    if access_ty_bytes != 1 {
+        assert!(access_ty_bytes == 2 || access_ty_bytes == 4 || access_ty_bytes == 8);
+        let final_lma_misalignment = builder
+            .ins()
+            .band_imm(final_lma, i64::from(access_ty_bytes - 1));
+        let f = builder
+            .ins()
+            .ifcmp_imm(final_lma_misalignment, i64::from(0));
+        builder
+            .ins()
+            .trapif(IntCC::NotEqual, f, ir::TrapCode::HeapMisaligned);
+    }
+
+    // Compute the final effective address. Note that we don't yet support multiple linear memories.
+    let heap = state.get_heap(builder.func, 0, environ)?;
+    let (base, offset) = get_heap_addr(
+        heap,
+        final_lma,
+        /*offset=*/ 0,
+        access_ty.bytes(),
+        environ.pointer_type(),
+        builder,
+    );
+
+    let final_effective_address = builder.ins().iadd_imm(base, i64::from(offset));
+    Ok(final_effective_address)
+}
+
+fn translate_atomic_rmw<FE: FuncEnvironment + ?Sized>(
+    widened_ty: Type,
+    access_ty: Type,
+    op: AtomicRmwOp,
+    offset: u32,
+    builder: &mut FunctionBuilder,
+    state: &mut FuncTranslationState,
+    environ: &mut FE,
+) -> WasmResult<()> {
+    let (linear_mem_addr, mut arg2) = state.pop2();
+    let arg2_ty = builder.func.dfg.value_type(arg2);
+
+    // The operation is performed at type `access_ty`, and the old value is zero-extended
+    // to type `widened_ty`.
+    match access_ty {
+        I8 | I16 | I32 | I64 => {}
+        _ => {
+            return Err(wasm_unsupported!(
+                "atomic_rmw: unsupported access type {:?}",
+                access_ty
+            ))
+        }
+    };
+    let w_ty_ok = match widened_ty {
+        I32 | I64 => true,
+        _ => false,
+    };
+    assert!(w_ty_ok && widened_ty.bytes() >= access_ty.bytes());
+
+    assert!(arg2_ty.bytes() >= access_ty.bytes());
+    if arg2_ty.bytes() > access_ty.bytes() {
+        arg2 = builder.ins().ireduce(access_ty, arg2);
+    }
+
+    let final_effective_address =
+        finalise_atomic_mem_addr(linear_mem_addr, offset, access_ty, builder, state, environ)?;
+
+    // See the comments in `prepare_load` about the flags.
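+    // (`MemFlags::new()` below is the all-default flag set: the access is deliberately not
+    // marked `aligned` or `notrap`, matching the ordinary wasm load/store translations;
+    // the required alignment has already been checked in `finalise_atomic_mem_addr`.)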
+    let flags = MemFlags::new();
+    let mut res = builder
+        .ins()
+        .atomic_rmw(access_ty, flags, op, final_effective_address, arg2);
+    if access_ty != widened_ty {
+        res = builder.ins().uextend(widened_ty, res);
+    }
+    state.push1(res);
+    Ok(())
+}
+
+fn translate_atomic_cas<FE: FuncEnvironment + ?Sized>(
+    widened_ty: Type,
+    access_ty: Type,
+    offset: u32,
+    builder: &mut FunctionBuilder,
+    state: &mut FuncTranslationState,
+    environ: &mut FE,
+) -> WasmResult<()> {
+    let (linear_mem_addr, mut expected, mut replacement) = state.pop3();
+    let expected_ty = builder.func.dfg.value_type(expected);
+    let replacement_ty = builder.func.dfg.value_type(replacement);
+
+    // The compare-and-swap is performed at type `access_ty`, and the old value is zero-extended
+    // to type `widened_ty`.
+    match access_ty {
+        I8 | I16 | I32 | I64 => {}
+        _ => {
+            return Err(wasm_unsupported!(
+                "atomic_cas: unsupported access type {:?}",
+                access_ty
+            ))
+        }
+    };
+    let w_ty_ok = match widened_ty {
+        I32 | I64 => true,
+        _ => false,
+    };
+    assert!(w_ty_ok && widened_ty.bytes() >= access_ty.bytes());
+
+    assert!(expected_ty.bytes() >= access_ty.bytes());
+    if expected_ty.bytes() > access_ty.bytes() {
+        expected = builder.ins().ireduce(access_ty, expected);
+    }
+    assert!(replacement_ty.bytes() >= access_ty.bytes());
+    if replacement_ty.bytes() > access_ty.bytes() {
+        replacement = builder.ins().ireduce(access_ty, replacement);
+    }
+
+    let final_effective_address =
+        finalise_atomic_mem_addr(linear_mem_addr, offset, access_ty, builder, state, environ)?;
+
+    // See the comments in `prepare_load` about the flags.
+    let flags = MemFlags::new();
+    let mut res = builder
+        .ins()
+        .atomic_cas(flags, final_effective_address, expected, replacement);
+    if access_ty != widened_ty {
+        res = builder.ins().uextend(widened_ty, res);
+    }
+    state.push1(res);
+    Ok(())
+}
+
+fn translate_atomic_load<FE: FuncEnvironment + ?Sized>(
+    widened_ty: Type,
+    access_ty: Type,
+    offset: u32,
+    builder: &mut FunctionBuilder,
+    state: &mut FuncTranslationState,
+    environ: &mut FE,
+) -> WasmResult<()> {
+    let linear_mem_addr = state.pop1();
+
+    // The load is performed at type `access_ty`, and the loaded value is zero-extended
+    // to `widened_ty`.
+    match access_ty {
+        I8 | I16 | I32 | I64 => {}
+        _ => {
+            return Err(wasm_unsupported!(
+                "atomic_load: unsupported access type {:?}",
+                access_ty
+            ))
+        }
+    };
+    let w_ty_ok = match widened_ty {
+        I32 | I64 => true,
+        _ => false,
+    };
+    assert!(w_ty_ok && widened_ty.bytes() >= access_ty.bytes());
+
+    let final_effective_address =
+        finalise_atomic_mem_addr(linear_mem_addr, offset, access_ty, builder, state, environ)?;
+
+    // See the comments in `prepare_load` about the flags.
+    let flags = MemFlags::new();
+    let mut res = builder
+        .ins()
+        .atomic_load(access_ty, flags, final_effective_address);
+    if access_ty != widened_ty {
+        res = builder.ins().uextend(widened_ty, res);
+    }
+    state.push1(res);
+    Ok(())
+}
+
+fn translate_atomic_store<FE: FuncEnvironment + ?Sized>(
+    access_ty: Type,
+    offset: u32,
+    builder: &mut FunctionBuilder,
+    state: &mut FuncTranslationState,
+    environ: &mut FE,
+) -> WasmResult<()> {
+    let (linear_mem_addr, mut data) = state.pop2();
+    let data_ty = builder.func.dfg.value_type(data);
+
+    // The operation is performed at type `access_ty`, and the data to be stored may first
+    // need to be narrowed accordingly.
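+    // (For example, an `i64.atomic.store8` reaches this point with `data_ty == I64` and
+    // `access_ty == I8`, so `data` is narrowed below with `ireduce` before being stored.)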
+    match access_ty {
+        I8 | I16 | I32 | I64 => {}
+        _ => {
+            return Err(wasm_unsupported!(
+                "atomic_store: unsupported access type {:?}",
+                access_ty
+            ))
+        }
+    };
+    let d_ty_ok = match data_ty {
+        I32 | I64 => true,
+        _ => false,
+    };
+    assert!(d_ty_ok && data_ty.bytes() >= access_ty.bytes());
+
+    if data_ty.bytes() > access_ty.bytes() {
+        data = builder.ins().ireduce(access_ty, data);
+    }
+
+    let final_effective_address =
+        finalise_atomic_mem_addr(linear_mem_addr, offset, access_ty, builder, state, environ)?;
+
+    // See the comments in `prepare_load` about the flags.
+    let flags = MemFlags::new();
+    builder
+        .ins()
+        .atomic_store(flags, data, final_effective_address);
+    Ok(())
+}
+
 fn translate_vector_icmp(
     cc: IntCC,
     needed_type: Type,
diff --git a/cranelift/wasm/src/environ/dummy.rs b/cranelift/wasm/src/environ/dummy.rs
index 5b64254fa3..8bf4e0b1f9 100644
--- a/cranelift/wasm/src/environ/dummy.rs
+++ b/cranelift/wasm/src/environ/dummy.rs
@@ -538,6 +538,29 @@ impl<'dummy_environment> FuncEnvironment for DummyFuncEnvironment<'dummy_environ
     ) -> WasmResult<()> {
         Ok(())
     }
+
+    fn translate_atomic_wait(
+        &mut self,
+        mut pos: FuncCursor,
+        _index: MemoryIndex,
+        _heap: ir::Heap,
+        _addr: ir::Value,
+        _expected: ir::Value,
+        _timeout: ir::Value,
+    ) -> WasmResult<ir::Value> {
+        Ok(pos.ins().iconst(I32, -1))
+    }
+
+    fn translate_atomic_notify(
+        &mut self,
+        mut pos: FuncCursor,
+        _index: MemoryIndex,
+        _heap: ir::Heap,
+        _addr: ir::Value,
+        _count: ir::Value,
+    ) -> WasmResult<ir::Value> {
+        Ok(pos.ins().iconst(I32, 0))
+    }
 }

 impl TargetEnvironment for DummyEnvironment {
diff --git a/cranelift/wasm/src/environ/spec.rs b/cranelift/wasm/src/environ/spec.rs
index 56d86522b1..045d6e2f29 100644
--- a/cranelift/wasm/src/environ/spec.rs
+++ b/cranelift/wasm/src/environ/spec.rs
@@ -560,6 +560,38 @@ pub trait FuncEnvironment: TargetEnvironment {
         val: ir::Value,
     ) -> WasmResult<()>;

+    /// Translate an `i32.atomic.wait` or `i64.atomic.wait` WebAssembly instruction.
+    /// The `index` provided identifies the linear memory containing the value
+    /// to wait on, and `heap` is the heap reference returned by `make_heap`
+    /// for the same index. Whether the waited-on value is 32- or 64-bit can be
+    /// determined by examining the type of `expected`, which must be either I32 or I64.
+    ///
+    /// Returns an i32, which is negative if the helper call failed.
+    fn translate_atomic_wait(
+        &mut self,
+        pos: FuncCursor,
+        index: MemoryIndex,
+        heap: ir::Heap,
+        addr: ir::Value,
+        expected: ir::Value,
+        timeout: ir::Value,
+    ) -> WasmResult<ir::Value>;
+
+    /// Translate an `atomic.notify` WebAssembly instruction.
+    /// The `index` provided identifies the linear memory containing the value
+    /// to wait on, and `heap` is the heap reference returned by `make_heap`
+    /// for the same index.
+    ///
+    /// Returns an i64, which is negative if the helper call failed.
+    fn translate_atomic_notify(
+        &mut self,
+        pos: FuncCursor,
+        index: MemoryIndex,
+        heap: ir::Heap,
+        addr: ir::Value,
+        count: ir::Value,
+    ) -> WasmResult<ir::Value>;
+
     /// Emit code at the beginning of every wasm loop.
     ///
     /// This can be used to insert explicit interrupt or safepoint checking at
diff --git a/crates/environ/src/func_environ.rs b/crates/environ/src/func_environ.rs
index 2def7447da..e2a4be7530 100644
--- a/crates/environ/src/func_environ.rs
+++ b/crates/environ/src/func_environ.rs
@@ -1612,6 +1612,33 @@ impl<'module_environment> cranelift_wasm::FuncEnvironment for FuncEnvironment<'m
         Ok(())
     }

+    fn translate_atomic_wait(
+        &mut self,
+        _pos: FuncCursor,
+        _index: MemoryIndex,
+        _heap: ir::Heap,
+        _addr: ir::Value,
+        _expected: ir::Value,
+        _timeout: ir::Value,
+    ) -> WasmResult<ir::Value> {
+        Err(WasmError::Unsupported(
+            "wasm atomics (fn translate_atomic_wait)".to_string(),
+        ))
+    }
+
+    fn translate_atomic_notify(
+        &mut self,
+        _pos: FuncCursor,
+        _index: MemoryIndex,
+        _heap: ir::Heap,
+        _addr: ir::Value,
+        _count: ir::Value,
+    ) -> WasmResult<ir::Value> {
+        Err(WasmError::Unsupported(
+            "wasm atomics (fn translate_atomic_notify)".to_string(),
+        ))
+    }
+
     fn translate_loop_header(&mut self, mut pos: FuncCursor) -> WasmResult<()> {
         if !self.tunables.interruptable {
             return Ok(());
diff --git a/crates/wasmtime/src/trap.rs b/crates/wasmtime/src/trap.rs
index f8ec2acfae..02f6577463 100644
--- a/crates/wasmtime/src/trap.rs
+++ b/crates/wasmtime/src/trap.rs
@@ -109,6 +109,7 @@ impl Trap {
         let desc = match code {
             StackOverflow => "call stack exhausted",
             HeapOutOfBounds => "out of bounds memory access",
+            HeapMisaligned => "misaligned memory access",
             TableOutOfBounds => "undefined element: out of bounds table access",
             IndirectCallToNull => "uninitialized element",
             BadSignature => "indirect call type mismatch",