From 409d824f460eed9cb4b24c3be159c43d09330d56 Mon Sep 17 00:00:00 2001
From: Jules Bertholet
Date: Mon, 10 Jun 2024 12:14:42 -0400
Subject: [PATCH 1/3] Add `CachePadded` variants with capped max alignment

These are useful for ensuring that values don't *cross* a cache line,
without restricting whether they can *share* one.
---
 crossbeam-utils/src/cache_padded.rs | 405 +++++++++++++++-------------
 crossbeam-utils/src/lib.rs          |   2 +-
 2 files changed, 221 insertions(+), 186 deletions(-)

diff --git a/crossbeam-utils/src/cache_padded.rs b/crossbeam-utils/src/cache_padded.rs
index aa5260136..06adcf225 100644
--- a/crossbeam-utils/src/cache_padded.rs
+++ b/crossbeam-utils/src/cache_padded.rs
@@ -1,215 +1,250 @@
 use core::fmt;
 use core::ops::{Deref, DerefMut};
 
-/// Pads and aligns a value to the length of a cache line.
-///
-/// In concurrent programming, sometimes it is desirable to make sure commonly accessed pieces of
-/// data are not placed into the same cache line. Updating an atomic value invalidates the whole
-/// cache line it belongs to, which makes the next access to the same cache line slower for other
-/// CPU cores. Use `CachePadded` to ensure updating one piece of data doesn't invalidate other
-/// cached data.
-///
-/// # Size and alignment
-///
-/// Cache lines are assumed to be N bytes long, depending on the architecture:
-///
-/// * On x86-64, aarch64, and powerpc64, N = 128.
-/// * On arm, mips, mips64, sparc, and hexagon, N = 32.
-/// * On m68k, N = 16.
-/// * On s390x, N = 256.
-/// * On all others, N = 64.
-///
-/// Note that N is just a reasonable guess and is not guaranteed to match the actual cache line
-/// length of the machine the program is running on. On modern Intel architectures, spatial
-/// prefetcher is pulling pairs of 64-byte cache lines at a time, so we pessimistically assume that
-/// cache lines are 128 bytes long.
-///
-/// The size of `CachePadded` is the smallest multiple of N bytes large enough to accommodate
-/// a value of type `T`.
-///
-/// The alignment of `CachePadded` is the maximum of N bytes and the alignment of `T`.
-///
-/// # Examples
-///
-/// Alignment and padding:
-///
-/// ```
-/// use crossbeam_utils::CachePadded;
-///
-/// let array = [CachePadded::new(1i8), CachePadded::new(2i8)];
-/// let addr1 = &*array[0] as *const i8 as usize;
-/// let addr2 = &*array[1] as *const i8 as usize;
-///
-/// assert!(addr2 - addr1 >= 32);
-/// assert_eq!(addr1 % 32, 0);
-/// assert_eq!(addr2 % 32, 0);
-/// ```
-///
-/// When building a concurrent queue with a head and a tail index, it is wise to place them in
-/// different cache lines so that concurrent threads pushing and popping elements don't invalidate
-/// each other's cache lines:
-///
-/// ```
-/// use crossbeam_utils::CachePadded;
-/// use std::sync::atomic::AtomicUsize;
-///
-/// struct Queue<T> {
-///     head: CachePadded<AtomicUsize>,
-///     tail: CachePadded<AtomicUsize>,
-///     buffer: *mut T,
-/// }
-/// ```
-#[derive(Clone, Copy, Default, Hash, PartialEq, Eq)]
-// Starting from Intel's Sandy Bridge, spatial prefetcher is now pulling pairs of 64-byte cache
-// lines at a time, so we have to align to 128 bytes rather than 64.
-//
-// Sources:
-// - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
-// - https://github.com/facebook/folly/blob/1b5288e6eea6df074758f877c849b6e73bbb9fbb/folly/lang/Align.h#L107
-//
-// ARM's big.LITTLE architecture has asymmetric cores and "big" cores have 128-byte cache line size.
-//
-// Sources:
-// - https://www.mono-project.com/news/2016/09/12/arm64-icache/
-//
-// powerpc64 has 128-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_ppc64x.go#L9
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/powerpc/include/asm/cache.h#L26
-#[cfg_attr(
-    any(
-        target_arch = "x86_64",
-        target_arch = "aarch64",
-        target_arch = "powerpc64",
-    ),
-    repr(align(128))
-)]
-// arm, mips, mips64, sparc, and hexagon have 32-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_arm.go#L7
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips.go#L7
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mipsle.go#L7
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips64x.go#L9
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L17
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/hexagon/include/asm/cache.h#L12
-#[cfg_attr(
-    any(
-        target_arch = "arm",
-        target_arch = "mips",
-        target_arch = "mips32r6",
-        target_arch = "mips64",
-        target_arch = "mips64r6",
-        target_arch = "sparc",
-        target_arch = "hexagon",
-    ),
-    repr(align(32))
-)]
-// m68k has 16-byte cache line size.
-//
-// Sources:
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/m68k/include/asm/cache.h#L9
-#[cfg_attr(target_arch = "m68k", repr(align(16)))]
-// s390x has 256-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_s390x.go#L7
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/s390/include/asm/cache.h#L13
-#[cfg_attr(target_arch = "s390x", repr(align(256)))]
-// x86, wasm, riscv, and sparc64 have 64-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/dda2991c2ea0c5914714469c4defc2562a907230/src/internal/cpu/cpu_x86.go#L9
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_wasm.go#L7
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/riscv/include/asm/cache.h#L10
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L19
-//
-// All others are assumed to have 64-byte cache line size.
-#[cfg_attr(
-    not(any(
-        target_arch = "x86_64",
-        target_arch = "aarch64",
-        target_arch = "powerpc64",
-        target_arch = "arm",
-        target_arch = "mips",
-        target_arch = "mips32r6",
-        target_arch = "mips64",
-        target_arch = "mips64r6",
-        target_arch = "sparc",
-        target_arch = "hexagon",
-        target_arch = "m68k",
-        target_arch = "s390x",
-    )),
-    repr(align(64))
-)]
-pub struct CachePadded<T> {
-    value: T,
-}
+macro_rules! cache_padded {
+    (
+        $(#[$attr:meta])*
+        $Name:ident {
+            $($(|)? $($target:literal)|* => $align:literal,)*
+            _ => $default_align:literal $(,)?
+
+        }
+    ) => {
+        $(#[$attr])*
+        #[repr(C)]
+        $(#[cfg_attr(
+            any(
+                $(target_arch = $target),*
+            ),
+            repr(align($align))
+        )])*
+        #[cfg_attr(
+            not(any(
+                $($(target_arch = $target),*),*
+            )),
+            repr(align($default_align))
+        )]
+        #[derive(Clone, Copy, Default, Hash, PartialEq, Eq)]
+        pub struct $Name<T> {
+            value: T,
+        }
+
+        impl<T> $Name<T> {
+            /// Pads and aligns a value to the length of a cache line.
+            ///
+            /// # Examples
+            ///
+            /// ```
+            #[doc = concat!("use crossbeam_utils::", stringify!($Name), ";")]
+            ///
+            #[doc = concat!("let padded_value = ", stringify!($Name), "::new(1);")]
+            /// ```
+            pub const fn new(t: T) -> Self {
+                Self { value: t }
+            }
+
+            /// Returns the inner value.
+            ///
+            /// # Examples
+            ///
+            /// ```
+            #[doc = concat!("use crossbeam_utils::", stringify!($Name), ";")]
+            ///
+            #[doc = concat!("let padded_value = ", stringify!($Name), "::new(7);")]
+            /// let value = padded_value.into_inner();
+            /// assert_eq!(value, 7);
+            /// ```
+            pub fn into_inner(self) -> T {
+                self.value
+            }
+        }
+
+        impl<T> Deref for $Name<T> {
+            type Target = T;
 
-unsafe impl<T: Send> Send for CachePadded<T> {}
-unsafe impl<T: Sync> Sync for CachePadded<T> {}
+            fn deref(&self) -> &T {
+                &self.value
+            }
+        }
 
-impl<T> CachePadded<T> {
+        impl<T> DerefMut for $Name<T> {
+            fn deref_mut(&mut self) -> &mut T {
+                &mut self.value
+            }
+        }
+
+        impl<T: fmt::Debug> fmt::Debug for $Name<T> {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.debug_struct(stringify!($Name))
+                    .field("value", &self.value)
+                    .finish()
+            }
+        }
+
+        impl<T> From<T> for $Name<T> {
+            fn from(t: T) -> Self {
+                Self::new(t)
+            }
+        }
+
+        impl<T: fmt::Display> fmt::Display for $Name<T> {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                fmt::Display::fmt(&self.value, f)
+            }
+        }
+    }
+}
+
+cache_padded! {
     /// Pads and aligns a value to the length of a cache line.
     ///
+    /// In concurrent programming, sometimes it is desirable to make sure commonly accessed pieces of
+    /// data are not placed into the same cache line. Updating an atomic value invalidates the whole
+    /// cache line it belongs to, which makes the next access to the same cache line slower for other
+    /// CPU cores. Use `CachePadded` to ensure updating one piece of data doesn't invalidate other
+    /// cached data.
+    ///
+    /// # Size and alignment
+    ///
+    /// Cache lines are assumed to be N bytes long, depending on the architecture:
+    ///
+    /// * On x86-64, aarch64, and powerpc64, N = 128.
+    /// * On arm, mips, mips64, sparc, and hexagon, N = 32.
+    /// * On m68k, N = 16.
+    /// * On s390x, N = 256.
+    /// * On all others, N = 64.
+    ///
+    /// Note that N is just a reasonable guess and is not guaranteed to match the actual cache line
+    /// length of the machine the program is running on. On modern Intel architectures, spatial
+    /// prefetcher is pulling pairs of 64-byte cache lines at a time, so we pessimistically assume that
+    /// cache lines are 128 bytes long.
+    ///
+    /// The size of `CachePadded` is the smallest multiple of N bytes large enough to accommodate
+    /// a value of type `T`.
+    ///
+    /// The alignment of `CachePadded` is the maximum of N bytes and the alignment of `T`.
+    ///
     /// # Examples
     ///
+    /// Alignment and padding:
+    ///
     /// ```
     /// use crossbeam_utils::CachePadded;
     ///
-    /// let padded_value = CachePadded::new(1);
+    /// let array = [CachePadded::new(1i8), CachePadded::new(2i8)];
+    /// let addr1 = &*array[0] as *const i8 as usize;
+    /// let addr2 = &*array[1] as *const i8 as usize;
+    ///
+    /// assert!(addr2 - addr1 >= 32);
+    /// assert_eq!(addr1 % 32, 0);
+    /// assert_eq!(addr2 % 32, 0);
     /// ```
-    pub const fn new(t: T) -> Self {
-        Self { value: t }
-    }
-
-    /// Returns the inner value.
     ///
-    /// # Examples
+    /// When building a concurrent queue with a head and a tail index, it is wise to place them in
+    /// different cache lines so that concurrent threads pushing and popping elements don't invalidate
+    /// each other's cache lines:
     ///
     /// ```
     /// use crossbeam_utils::CachePadded;
+    /// use std::sync::atomic::AtomicUsize;
     ///
-    /// let padded_value = CachePadded::new(7);
-    /// let value = padded_value.into_inner();
-    /// assert_eq!(value, 7);
+    /// struct Queue<T> {
+    ///     head: CachePadded<AtomicUsize>,
+    ///     tail: CachePadded<AtomicUsize>,
+    ///     buffer: *mut T,
+    /// }
     /// ```
-    pub fn into_inner(self) -> T {
-        self.value
-    }
-}
-
-impl<T> Deref for CachePadded<T> {
-    type Target = T;
-
-    fn deref(&self) -> &T {
-        &self.value
-    }
-}
-
-impl<T> DerefMut for CachePadded<T> {
-    fn deref_mut(&mut self) -> &mut T {
-        &mut self.value
+    CachePadded {
+        // Starting from Intel's Sandy Bridge, spatial prefetcher is now pulling pairs of 64-byte cache
+        // lines at a time, so we have to align to 128 bytes rather than 64.
+        //
+        // Sources:
+        // - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
+        // - https://github.com/facebook/folly/blob/1b5288e6eea6df074758f877c849b6e73bbb9fbb/folly/lang/Align.h#L107
+        //
+        // ARM's big.LITTLE architecture has asymmetric cores and "big" cores have 128-byte cache line size.
+        //
+        // Sources:
+        // - https://www.mono-project.com/news/2016/09/12/arm64-icache/
+        //
+        // powerpc64 has 128-byte cache line size.
+        //
+        // Sources:
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_ppc64x.go#L9
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/powerpc/include/asm/cache.h#L26
+        "x86_64" | "aarch64" | "powerpc64" => 128,
+        // arm, mips, mips64, sparc, and hexagon have 32-byte cache line size.
+        //
+        // Sources:
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_arm.go#L7
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips.go#L7
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mipsle.go#L7
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips64x.go#L9
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L17
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/hexagon/include/asm/cache.h#L12
+        "arm" | "mips" | "mips32r6" | "mips64" | "mips64r6" | "sparc" | "hexagon" => 32,
+        // m68k has 16-byte cache line size.
+        //
+        // Sources:
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/m68k/include/asm/cache.h#L9
+        "m68k" => 16,
+        // s390x has 256-byte cache line size.
+        //
+        // Sources:
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_s390x.go#L7
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/s390/include/asm/cache.h#L13
+        "s390x" => 256,
+        // x86, wasm, riscv, and sparc64 have 64-byte cache line size.
+        //
+        // Sources:
+        // - https://github.com/golang/go/blob/dda2991c2ea0c5914714469c4defc2562a907230/src/internal/cpu/cpu_x86.go#L9
+        // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_wasm.go#L7
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/riscv/include/asm/cache.h#L10
+        // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L19
+        //
+        // All others are assumed to have 64-byte cache line size.
+        _ => 64,
     }
 }
 
-impl<T: fmt::Debug> fmt::Debug for CachePadded<T> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("CachePadded")
-            .field("value", &self.value)
-            .finish()
+cache_padded! {
+    /// Like [`CachePadded`], but with its alignment modifier capped at 128.
+    /// This is useful if you have a value no larger than 128 bytes
+    /// and you don't want it to *cross* a cache line,
+    /// but you don't mind if it *shares* one.
+    ///
+    /// See the documentation of [`CachePadded`] for more.
+    CachePaddedMax128 {
+        "x86_64" | "aarch64" | "powerpc64" | "s390x" => 128,
+        "arm" | "mips" | "mips32r6" | "mips64" | "mips64r6" | "sparc" | "hexagon" => 32,
+        "m68k" => 16,
+        _ => 64,
     }
 }
 
-impl<T> From<T> for CachePadded<T> {
-    fn from(t: T) -> Self {
-        Self::new(t)
+cache_padded! {
+    /// Like [`CachePadded`], but with its alignment modifier capped at 64.
+    /// This is useful if you have a value no larger than 64 bytes
+    /// and you don't want it to *cross* a cache line,
+    /// but you don't mind if it *shares* one.
+    ///
+    /// See the documentation of [`CachePadded`] for more.
+    CachePaddedMax64 {
+        "arm" | "mips" | "mips32r6" | "mips64" | "mips64r6" | "sparc" | "hexagon" => 32,
+        "m68k" => 16,
+        _ => 64,
     }
 }
 
-impl<T: fmt::Display> fmt::Display for CachePadded<T> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        fmt::Display::fmt(&self.value, f)
+cache_padded! {
+    /// Like [`CachePadded`], but with its alignment modifier capped at 32.
+    /// This is useful if you have a value no larger than 32 bytes
+    /// and you don't want it to *cross* a cache line,
+    /// but you don't mind if it *shares* one.
+    ///
+    /// See the documentation of [`CachePadded`] for more.
+    CachePaddedMax32 {
+        "m68k" => 16,
+        _ => 32,
     }
 }
diff --git a/crossbeam-utils/src/lib.rs b/crossbeam-utils/src/lib.rs
index 9d954888c..439e7dc5c 100644
--- a/crossbeam-utils/src/lib.rs
+++ b/crossbeam-utils/src/lib.rs
@@ -92,7 +92,7 @@ mod primitive {
 pub mod atomic;
 
 mod cache_padded;
-pub use crate::cache_padded::CachePadded;
+pub use crate::cache_padded::{CachePadded, CachePaddedMax128, CachePaddedMax32, CachePaddedMax64};
 
 mod backoff;
 pub use crate::backoff::Backoff;

From 389ee6a53931a21fc73a227216c492569c44e968 Mon Sep 17 00:00:00 2001
From: Jules Bertholet
Date: Mon, 10 Jun 2024 12:25:10 -0400
Subject: [PATCH 2/3] `impl Ord for CachePadded`

---
 crossbeam-utils/src/cache_padded.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crossbeam-utils/src/cache_padded.rs b/crossbeam-utils/src/cache_padded.rs
index 06adcf225..48fbbc37d 100644
--- a/crossbeam-utils/src/cache_padded.rs
+++ b/crossbeam-utils/src/cache_padded.rs
@@ -24,7 +24,7 @@ macro_rules! cache_padded {
             )),
             repr(align($default_align))
         )]
-        #[derive(Clone, Copy, Default, Hash, PartialEq, Eq)]
+        #[derive(Clone, Copy, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
         pub struct $Name<T> {
             value: T,
         }

From 47fc70b957ef471f11a4473b3835c4a0ce6b4a99 Mon Sep 17 00:00:00 2001
From: Jules Bertholet
Date: Mon, 10 Jun 2024 12:31:44 -0400
Subject: [PATCH 3/3] Rephrase docs

---
 crossbeam-utils/src/cache_padded.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/crossbeam-utils/src/cache_padded.rs b/crossbeam-utils/src/cache_padded.rs
index 48fbbc37d..de2e2fe0c 100644
--- a/crossbeam-utils/src/cache_padded.rs
+++ b/crossbeam-utils/src/cache_padded.rs
@@ -210,8 +210,8 @@ cache_padded! {
 cache_padded! {
     /// Like [`CachePadded`], but with its alignment modifier capped at 128.
     /// This is useful if you have a value no larger than 128 bytes
-    /// and you don't want it to *cross* a cache line,
-    /// but you don't mind if it *shares* one.
+    /// and you want it to *cross* as few cache lines as possible,
+    /// but you don't mind if it *shares* a cache line.
     ///
     /// See the documentation of [`CachePadded`] for more.
     CachePaddedMax128 {
@@ -225,8 +225,8 @@ cache_padded! {
 cache_padded! {
     /// Like [`CachePadded`], but with its alignment modifier capped at 64.
     /// This is useful if you have a value no larger than 64 bytes
-    /// and you don't want it to *cross* a cache line,
-    /// but you don't mind if it *shares* one.
+    /// and you want it to *cross* as few cache lines as possible,
+    /// but you don't mind if it *shares* a cache line.
     ///
     /// See the documentation of [`CachePadded`] for more.
     CachePaddedMax64 {
@@ -239,8 +239,8 @@ cache_padded! {
 cache_padded! {
     /// Like [`CachePadded`], but with its alignment modifier capped at 32.
     /// This is useful if you have a value no larger than 32 bytes
-    /// and you don't want it to *cross* a cache line,
-    /// but you don't mind if it *shares* one.
+    /// and you want it to *cross* as few cache lines as possible,
+    /// but you don't mind if it *shares* a cache line.
     ///
     /// See the documentation of [`CachePadded`] for more.
     CachePaddedMax32 {
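
With the series applied, the capped variants are intended to be used roughly as below. This is a minimal sketch assuming the exports from the `lib.rs` hunk in patch 1; the counter value and the assertions are illustrative, not part of the patches:

```
use crossbeam_utils::{CachePadded, CachePaddedMax64};

// A small, hot value: `CachePaddedMax64` pads and aligns it so it crosses
// as few cache lines as possible, but, unlike `CachePadded`, it never
// requests more than 64-byte alignment, so neighbouring values may still
// share a line.
let counter = CachePaddedMax64::new(0u64);
assert_eq!(*counter, 0);

// The capped variant's alignment never exceeds 64 bytes...
assert!(core::mem::align_of::<CachePaddedMax64<u64>>() <= 64);
// ...while `CachePadded` may align to 128 bytes or more (e.g. on x86_64),
// so its alignment is always at least as large as the capped variant's.
assert!(
    core::mem::align_of::<CachePadded<u64>>()
        >= core::mem::align_of::<CachePaddedMax64<u64>>()
);
```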