From 409d824f460eed9cb4b24c3be159c43d09330d56 Mon Sep 17 00:00:00 2001
From: Jules Bertholet
Date: Mon, 10 Jun 2024 12:14:42 -0400
Subject: [PATCH 1/3] Add `CachePadded` variants with capped max alignment

These are useful for ensuring that values don't *cross* a cache line,
without restricting whether they can *share* one.
---
 crossbeam-utils/src/cache_padded.rs | 405 +++++++++++++++-------------
 crossbeam-utils/src/lib.rs          |   2 +-
 2 files changed, 221 insertions(+), 186 deletions(-)

diff --git a/crossbeam-utils/src/cache_padded.rs b/crossbeam-utils/src/cache_padded.rs
index aa5260136..06adcf225 100644
--- a/crossbeam-utils/src/cache_padded.rs
+++ b/crossbeam-utils/src/cache_padded.rs
@@ -1,215 +1,250 @@
 use core::fmt;
 use core::ops::{Deref, DerefMut};
 
-/// Pads and aligns a value to the length of a cache line.
-///
-/// In concurrent programming, sometimes it is desirable to make sure commonly accessed pieces of
-/// data are not placed into the same cache line. Updating an atomic value invalidates the whole
-/// cache line it belongs to, which makes the next access to the same cache line slower for other
-/// CPU cores. Use `CachePadded` to ensure updating one piece of data doesn't invalidate other
-/// cached data.
-///
-/// # Size and alignment
-///
-/// Cache lines are assumed to be N bytes long, depending on the architecture:
-///
-/// * On x86-64, aarch64, and powerpc64, N = 128.
-/// * On arm, mips, mips64, sparc, and hexagon, N = 32.
-/// * On m68k, N = 16.
-/// * On s390x, N = 256.
-/// * On all others, N = 64.
-///
-/// Note that N is just a reasonable guess and is not guaranteed to match the actual cache line
-/// length of the machine the program is running on. On modern Intel architectures, spatial
-/// prefetcher is pulling pairs of 64-byte cache lines at a time, so we pessimistically assume that
-/// cache lines are 128 bytes long.
-///
-/// The size of `CachePadded` is the smallest multiple of N bytes large enough to accommodate
-/// a value of type `T`.
-///
-/// The alignment of `CachePadded` is the maximum of N bytes and the alignment of `T`.
-///
-/// # Examples
-///
-/// Alignment and padding:
-///
-/// ```
-/// use crossbeam_utils::CachePadded;
-///
-/// let array = [CachePadded::new(1i8), CachePadded::new(2i8)];
-/// let addr1 = &*array[0] as *const i8 as usize;
-/// let addr2 = &*array[1] as *const i8 as usize;
-///
-/// assert!(addr2 - addr1 >= 32);
-/// assert_eq!(addr1 % 32, 0);
-/// assert_eq!(addr2 % 32, 0);
-/// ```
-///
-/// When building a concurrent queue with a head and a tail index, it is wise to place them in
-/// different cache lines so that concurrent threads pushing and popping elements don't invalidate
-/// each other's cache lines:
-///
-/// ```
-/// use crossbeam_utils::CachePadded;
-/// use std::sync::atomic::AtomicUsize;
-///
-/// struct Queue<T> {
-///     head: CachePadded<AtomicUsize>,
-///     tail: CachePadded<AtomicUsize>,
-///     buffer: *mut T,
-/// }
-/// ```
-#[derive(Clone, Copy, Default, Hash, PartialEq, Eq)]
-// Starting from Intel's Sandy Bridge, spatial prefetcher is now pulling pairs of 64-byte cache
-// lines at a time, so we have to align to 128 bytes rather than 64.
-//
-// Sources:
-// - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
-// - https://github.com/facebook/folly/blob/1b5288e6eea6df074758f877c849b6e73bbb9fbb/folly/lang/Align.h#L107
-//
-// ARM's big.LITTLE architecture has asymmetric cores and "big" cores have 128-byte cache line size.
-//
-// Sources:
-// - https://www.mono-project.com/news/2016/09/12/arm64-icache/
-//
-// powerpc64 has 128-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_ppc64x.go#L9
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/powerpc/include/asm/cache.h#L26
-#[cfg_attr(
-    any(
-        target_arch = "x86_64",
-        target_arch = "aarch64",
-        target_arch = "powerpc64",
-    ),
-    repr(align(128))
-)]
-// arm, mips, mips64, sparc, and hexagon have 32-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_arm.go#L7
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips.go#L7
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mipsle.go#L7
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips64x.go#L9
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L17
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/hexagon/include/asm/cache.h#L12
-#[cfg_attr(
-    any(
-        target_arch = "arm",
-        target_arch = "mips",
-        target_arch = "mips32r6",
-        target_arch = "mips64",
-        target_arch = "mips64r6",
-        target_arch = "sparc",
-        target_arch = "hexagon",
-    ),
-    repr(align(32))
-)]
-// m68k has 16-byte cache line size.
-//
-// Sources:
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/m68k/include/asm/cache.h#L9
-#[cfg_attr(target_arch = "m68k", repr(align(16)))]
-// s390x has 256-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_s390x.go#L7
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/s390/include/asm/cache.h#L13
-#[cfg_attr(target_arch = "s390x", repr(align(256)))]
-// x86, wasm, riscv, and sparc64 have 64-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/dda2991c2ea0c5914714469c4defc2562a907230/src/internal/cpu/cpu_x86.go#L9
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_wasm.go#L7
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/riscv/include/asm/cache.h#L10
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L19
-//
-// All others are assumed to have 64-byte cache line size.
-#[cfg_attr(
-    not(any(
-        target_arch = "x86_64",
-        target_arch = "aarch64",
-        target_arch = "powerpc64",
-        target_arch = "arm",
-        target_arch = "mips",
-        target_arch = "mips32r6",
-        target_arch = "mips64",
-        target_arch = "mips64r6",
-        target_arch = "sparc",
-        target_arch = "hexagon",
-        target_arch = "m68k",
-        target_arch = "s390x",
-    )),
-    repr(align(64))
-)]
-pub struct CachePadded<T> {
-    value: T,
-}
+macro_rules! cache_padded {
+    (
+        $(#[$attr:meta])*
+        $Name:ident {
+            $($(|)? $($target:literal)|* => $align:literal,)*
+            _ => $default_align:literal $(,)?
+
+        }
+    ) => {
+        $(#[$attr])*
+        #[repr(C)]
+        $(#[cfg_attr(
+            any(
+                $(target_arch = $target),*
+            ),
+            repr(align($align))
+        )])*
+        #[cfg_attr(
+            not(any(
+                $($(target_arch = $target),*),*
+            )),
+            repr(align($default_align))
+        )]
+        #[derive(Clone, Copy, Default, Hash, PartialEq, Eq)]
+        pub struct $Name<T> {
+            value: T,
+        }
+
+        impl<T> $Name<T> {
+            /// Pads and aligns a value to the length of a cache line.
+            ///
+            /// # Examples
+            ///
+            /// ```
+            #[doc = concat!("use crossbeam_utils::", stringify!($Name), ";")]
+            ///
+            #[doc = concat!("let padded_value = ", stringify!($Name), "::new(1);")]
+            /// ```
+            pub const fn new(t: T) -> Self {
+                Self { value: t }
+            }
+
+            /// Returns the inner value.
+            ///
+            /// # Examples
+            ///
+            /// ```
+            #[doc = concat!("use crossbeam_utils::", stringify!($Name), ";")]
+            ///
+            #[doc = concat!("let padded_value = ", stringify!($Name), "::new(7);")]
+            /// let value = padded_value.into_inner();
+            /// assert_eq!(value, 7);
+            /// ```
+            pub fn into_inner(self) -> T {
+                self.value
+            }
+        }
+
+        impl<T> Deref for $Name<T> {
+            type Target = T;
 
-unsafe impl<T: Send> Send for CachePadded<T> {}
-unsafe impl<T: Sync> Sync for CachePadded<T> {}
+            fn deref(&self) -> &T {
+                &self.value
+            }
+        }
 
-impl<T> CachePadded<T> {
+        impl<T> DerefMut for $Name<T> {
+            fn deref_mut(&mut self) -> &mut T {
+                &mut self.value
+            }
+        }
+
+        impl<T: fmt::Debug> fmt::Debug for $Name<T> {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.debug_struct(stringify!($Name))
+                    .field("value", &self.value)
+                    .finish()
+            }
+        }
+
+        impl<T> From<T> for $Name<T> {
+            fn from(t: T) -> Self {
+                Self::new(t)
+            }
+        }
+
+        impl<T: fmt::Display> fmt::Display for $Name<T> {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                fmt::Display::fmt(&self.value, f)
+            }
+        }
+    }
+}
+
+cache_padded! {
     /// Pads and aligns a value to the length of a cache line.
     ///
+    /// In concurrent programming, sometimes it is desirable to make sure commonly accessed pieces of
+    /// data are not placed into the same cache line. Updating an atomic value invalidates the whole
+    /// cache line it belongs to, which makes the next access to the same cache line slower for other
+    /// CPU cores. Use `CachePadded` to ensure updating one piece of data doesn't invalidate other
+    /// cached data.
+    ///
+    /// # Size and alignment
+    ///
+    /// Cache lines are assumed to be N bytes long, depending on the architecture:
+    ///
+    /// * On x86-64, aarch64, and powerpc64, N = 128.
+    /// * On arm, mips, mips64, sparc, and hexagon, N = 32.
+    /// * On m68k, N = 16.
+    /// * On s390x, N = 256.
+    /// * On all others, N = 64.
+    ///
+    /// Note that N is just a reasonable guess and is not guaranteed to match the actual cache line
+    /// length of the machine the program is running on. On modern Intel architectures, spatial
+    /// prefetcher is pulling pairs of 64-byte cache lines at a time, so we pessimistically assume that
+    /// cache lines are 128 bytes long.
+    ///
+    /// The size of `CachePadded` is the smallest multiple of N bytes large enough to accommodate
+    /// a value of type `T`.
+    ///
+    /// The alignment of `CachePadded` is the maximum of N bytes and the alignment of `T`.
+    ///
     /// # Examples
     ///
+    /// Alignment and padding:
+    ///
     /// ```
     /// use crossbeam_utils::CachePadded;
     ///
-    /// let padded_value = CachePadded::new(1);
+    /// let array = [CachePadded::new(1i8), CachePadded::new(2i8)];
+    /// let addr1 = &*array[0] as *const i8 as usize;
+    /// let addr2 = &*array[1] as *const i8 as usize;
+    ///
+    /// assert!(addr2 - addr1 >= 32);
+    /// assert_eq!(addr1 % 32, 0);
+    /// assert_eq!(addr2 % 32, 0);
     /// ```
-    pub const fn new(t: T) -> Self {
-        Self { value: t }
-    }
-
-    /// Returns the inner value.
     ///
-    /// # Examples
+    /// When building a concurrent queue with a head and a tail index, it is wise to place them in
+    /// different cache lines so that concurrent threads pushing and popping elements don't invalidate
+    /// each other's cache lines:
     ///
     /// ```
     /// use crossbeam_utils::CachePadded;
+    /// use std::sync::atomic::AtomicUsize;
     ///
-    /// let padded_value = CachePadded::new(7);
-    /// let value = padded_value.into_inner();
-    /// assert_eq!(value, 7);
+    /// struct Queue<T> {
+    ///     head: CachePadded<AtomicUsize>,
+    ///     tail: CachePadded<AtomicUsize>,
+    ///     buffer: *mut T,
+    /// }
     /// ```
-    pub fn into_inner(self) -> T {
-        self.value
-    }
-}
-
-impl<T> Deref for CachePadded<T> {
-    type Target = T;
-
-    fn deref(&self) -> &T {
-        &self.value
-    }
-}
-
-impl<T> DerefMut for CachePadded<T> {
-    fn deref_mut(&mut self) -> &mut T {
-        &mut self.value
+    CachePadded {
+        // Starting from Intel's Sandy Bridge, spatial prefetcher is now pulling pairs of 64-byte cache
+        // lines at a time, so we have to align to 128 bytes rather than 64.
+        //
+        // Sources:
+        // - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
+        // - https://github.com/facebook/folly/blob/1b5288e6eea6df074758f877c849b6e73bbb9fbb/folly/lang/Align.h#L107
+        //
+        // ARM's big.LITTLE architecture has asymmetric cores and "big" cores have 128-byte cache line size.
+        //
+        // Sources:
+        // - https://www.mono-project.com/news/2016/09/12/arm64-icache/
+        //
+        // powerpc64 has 128-byte cache line size.
+        //
+        // Sources:
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_ppc64x.go#L9
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/powerpc/include/asm/cache.h#L26
+        "x86_64" | "aarch64" | "powerpc64" => 128,
+        // arm, mips, mips64, sparc, and hexagon have 32-byte cache line size.
+        //
+        // Sources:
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_arm.go#L7
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips.go#L7
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mipsle.go#L7
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips64x.go#L9
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L17
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/hexagon/include/asm/cache.h#L12
+        "arm" | "mips" | "mips32r6" | "mips64" | "mips64r6" | "sparc" | "hexagon" => 32,
+        // m68k has 16-byte cache line size.
+        //
+        // Sources:
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/m68k/include/asm/cache.h#L9
+        "m68k" => 16,
+        // s390x has 256-byte cache line size.
+        //
+        // Sources:
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_s390x.go#L7
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/s390/include/asm/cache.h#L13
+        "s390x" => 256,
+        // x86, wasm, riscv, and sparc64 have 64-byte cache line size.
+        //
+        // Sources:
+        // - https://github.com/golang/go/blob/dda2991c2ea0c5914714469c4defc2562a907230/src/internal/cpu/cpu_x86.go#L9
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_wasm.go#L7
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/riscv/include/asm/cache.h#L10
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L19
+        //
+        // All others are assumed to have 64-byte cache line size.
+        _ => 64,
     }
 }
 
-impl<T: fmt::Debug> fmt::Debug for CachePadded<T> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("CachePadded")
-            .field("value", &self.value)
-            .finish()
+cache_padded! {
+    /// Like [`CachePadded`], but with its alignment modifier capped at 128.
+    /// This is useful if you have a value no larger than 128 bytes
+    /// and you don't want it to *cross* a cache line,
+    /// but you don't mind if it *shares* one.
+    ///
+    /// See the documentation of [`CachePadded`] for more.
+    CachePaddedMax128 {
+        "x86_64" | "aarch64" | "powerpc64" | "s390x" => 128,
+        "arm" | "mips" | "mips32r6" | "mips64" | "mips64r6" | "sparc" | "hexagon" => 32,
+        "m68k" => 16,
+        _ => 64,
     }
 }
 
-impl<T> From<T> for CachePadded<T> {
-    fn from(t: T) -> Self {
-        Self::new(t)
+cache_padded! {
+    /// Like [`CachePadded`], but with its alignment modifier capped at 64.
+    /// This is useful if you have a value no larger than 64 bytes
+    /// and you don't want it to *cross* a cache line,
+    /// but you don't mind if it *shares* one.
+    ///
+    /// See the documentation of [`CachePadded`] for more.
+    CachePaddedMax64 {
+        "arm" | "mips" | "mips32r6" | "mips64" | "mips64r6" | "sparc" | "hexagon" => 32,
+        "m68k" => 16,
+        _ => 64,
     }
 }
 
-impl<T: fmt::Display> fmt::Display for CachePadded<T> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        fmt::Display::fmt(&self.value, f)
+cache_padded! {
+    /// Like [`CachePadded`], but with its alignment modifier capped at 32.
+    /// This is useful if you have a value no larger than 32 bytes
+    /// and you don't want it to *cross* a cache line,
+    /// but you don't mind if it *shares* one.
+    ///
+    /// See the documentation of [`CachePadded`] for more.
+    CachePaddedMax32 {
+        "m68k" => 16,
+        _ => 32,
     }
 }
diff --git a/crossbeam-utils/src/lib.rs b/crossbeam-utils/src/lib.rs
index 9d954888c..439e7dc5c 100644
--- a/crossbeam-utils/src/lib.rs
+++ b/crossbeam-utils/src/lib.rs
@@ -92,7 +92,7 @@ mod primitive {
 pub mod atomic;
 
 mod cache_padded;
-pub use crate::cache_padded::CachePadded;
+pub use crate::cache_padded::{CachePadded, CachePaddedMax128, CachePaddedMax32, CachePaddedMax64};
 
 mod backoff;
 pub use crate::backoff::Backoff;

From 389ee6a53931a21fc73a227216c492569c44e968 Mon Sep 17 00:00:00 2001
From: Jules Bertholet
Date: Mon, 10 Jun 2024 12:25:10 -0400
Subject: [PATCH 2/3] `impl Ord for CachePadded`

---
 crossbeam-utils/src/cache_padded.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crossbeam-utils/src/cache_padded.rs b/crossbeam-utils/src/cache_padded.rs
index 06adcf225..48fbbc37d 100644
--- a/crossbeam-utils/src/cache_padded.rs
+++ b/crossbeam-utils/src/cache_padded.rs
@@ -24,7 +24,7 @@ macro_rules! cache_padded {
             )),
             repr(align($default_align))
         )]
-        #[derive(Clone, Copy, Default, Hash, PartialEq, Eq)]
+        #[derive(Clone, Copy, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
         pub struct $Name<T> {
             value: T,
         }

From 47fc70b957ef471f11a4473b3835c4a0ce6b4a99 Mon Sep 17 00:00:00 2001
From: Jules Bertholet
Date: Mon, 10 Jun 2024 12:31:44 -0400
Subject: [PATCH 3/3] Rephrase docs

---
 crossbeam-utils/src/cache_padded.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/crossbeam-utils/src/cache_padded.rs b/crossbeam-utils/src/cache_padded.rs
index 48fbbc37d..de2e2fe0c 100644
--- a/crossbeam-utils/src/cache_padded.rs
+++ b/crossbeam-utils/src/cache_padded.rs
@@ -210,8 +210,8 @@ cache_padded! {
 cache_padded! {
     /// Like [`CachePadded`], but with its alignment modifier capped at 128.
     /// This is useful if you have a value no larger than 128 bytes
-    /// and you don't want it to *cross* a cache line,
-    /// but you don't mind if it *shares* one.
+    /// and you want it to *cross* as few cache lines as possible,
+    /// but you don't mind if it *shares* a cache line.
     ///
     /// See the documentation of [`CachePadded`] for more.
     CachePaddedMax128 {
@@ -225,8 +225,8 @@ cache_padded! {
 cache_padded! {
     /// Like [`CachePadded`], but with its alignment modifier capped at 64.
     /// This is useful if you have a value no larger than 64 bytes
-    /// and you don't want it to *cross* a cache line,
-    /// but you don't mind if it *shares* one.
+    /// and you want it to *cross* as few cache lines as possible,
+    /// but you don't mind if it *shares* a cache line.
     ///
     /// See the documentation of [`CachePadded`] for more.
     CachePaddedMax64 {
@@ -239,8 +239,8 @@ cache_padded! {
 cache_padded! {
     /// Like [`CachePadded`], but with its alignment modifier capped at 32.
     /// This is useful if you have a value no larger than 32 bytes
-    /// and you don't want it to *cross* a cache line,
-    /// but you don't mind if it *shares* one.
+    /// and you want it to *cross* as few cache lines as possible,
+    /// but you don't mind if it *shares* a cache line.
     ///
     /// See the documentation of [`CachePadded`] for more.
     CachePaddedMax32 {
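
With the series applied, the capped variants are intended to be used roughly as below. This is a minimal sketch assuming the exports from the `lib.rs` hunk in patch 1; the counter value and the assertions are illustrative, not part of the patches:

```
use crossbeam_utils::{CachePadded, CachePaddedMax64};

// A small, hot value: `CachePaddedMax64` pads and aligns it so it crosses
// as few cache lines as possible, but, unlike `CachePadded`, it never
// requests more than 64-byte alignment, so neighbouring values may still
// share a line.
let counter = CachePaddedMax64::new(0u64);
assert_eq!(*counter, 0);

// The capped variant's alignment never exceeds 64 bytes...
assert!(core::mem::align_of::<CachePaddedMax64<u64>>() <= 64);
// ...while `CachePadded` may align to 128 bytes or more (e.g. on x86_64),
// so its alignment is always at least as large as the capped variant's.
assert!(
    core::mem::align_of::<CachePadded<u64>>()
        >= core::mem::align_of::<CachePaddedMax64<u64>>()
);
```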