From 9359fb515282982735541abbf10c75d07a8058cc Mon Sep 17 00:00:00 2001 From: Tip ten Brink <75669206+tiptenbrink@users.noreply.github.com> Date: Mon, 29 Jul 2024 02:10:15 +0200 Subject: [PATCH] compact set mostly done --- tiauth-core/Cargo.toml | 33 ++-- .../src/{mergerange.rs => compactset.rs} | 179 +++++++++++++++--- tiauth-core/src/lib.rs | 2 +- 3 files changed, 176 insertions(+), 38 deletions(-) rename tiauth-core/src/{mergerange.rs => compactset.rs} (66%) diff --git a/tiauth-core/Cargo.toml b/tiauth-core/Cargo.toml index bd1015b..6d0978b 100644 --- a/tiauth-core/Cargo.toml +++ b/tiauth-core/Cargo.toml @@ -49,22 +49,25 @@ tiauth-core = { path = ".", features = ["test", "action", "app"] } name = "deser" harness = false -[[bin]] -name = "experiment" -required-features = ["test"] +# [[bin]] +# name = "experiment" +# required-features = ["test"] -[[bin]] -name = "experiment_big" -required-features = ["test"] +# [[bin]] +# name = "experiment_big" +# required-features = ["test"] -[[bin]] -name = "experiment_ses" -required-features = ["test"] +# [[bin]] +# name = "experiment_ses" +# required-features = ["test"] -[[bin]] -name = "multi_db" -required-features = ["test"] +# [[bin]] +# name = "multi_db" +# required-features = ["test"] -[[bin]] -name = "shared_counter" -required-features = ["test"] \ No newline at end of file +# [[bin]] +# name = "shared_counter" +# required-features = ["test"] + +# [profile.release] +# debug = true \ No newline at end of file diff --git a/tiauth-core/src/mergerange.rs b/tiauth-core/src/compactset.rs similarity index 66% rename from tiauth-core/src/mergerange.rs rename to tiauth-core/src/compactset.rs index 00c132f..1978919 100644 --- a/tiauth-core/src/mergerange.rs +++ b/tiauth-core/src/compactset.rs @@ -1,4 +1,33 @@ -use std::{cmp::Ordering, collections::VecDeque}; +use std::{cmp::Ordering, collections::VecDeque, ops::DerefMut, path::Display, sync::{Arc, Mutex}, time::Instant}; + +#[derive(Clone)] +pub struct CompactSet { + ranges: Arc>>> +} + +impl CompactSet { + pub fn new() -> Self { + let mut ranges = VecDeque::new(); + ranges.push_back(Range::new(1)); + + Self { + ranges: Arc::new(Mutex::new(ranges)) + } + } + + pub fn num_exists(&self, num: u64, expires: u64, time: Option) -> bool { + let mut ranges = self.ranges.lock().unwrap(); + + let exists = add_num(ranges.deref_mut(), num, expires); + + // Some basic experimentation showed that by doing it every four provides the best space/time tradeoff + if time.is_some() && num % 4 == 0 { + check_expired(ranges.deref_mut(), time.unwrap()); + } + + exists + } +} #[derive(Debug, Clone, Copy, PartialEq)] enum Expiry { @@ -6,9 +35,9 @@ enum Expiry { Unknown } -#[derive(Debug, Clone)] +#[derive(Clone)] struct Range { - members: Option<[u64; N]>, + members: Option<[u8; N]>, min: u64, max: u64, expires: Expiry @@ -17,7 +46,7 @@ struct Range { impl Range { fn new(min: u64) -> Self { Self { - members: Some([0u64; N]), + members: Some([0u8; N]), min, max: min+(N as u64)-1, expires: Expiry::Unknown @@ -35,17 +64,21 @@ impl Range { } fn add_num(&self, num: u64) -> Option { + // To conserve space, we represent numbers using the min as offset, because the array won't have a size + // that can't be represented with a single byte + // 0 is a special value, it represents the "unfilled" value, therefore 0 can never be used + let num_small = (num - self.min + 1) as u8; self.members.and_then(|mut members| { for i in 0..N { let val = members[i]; - if val == num { + if val == num_small { return None } else if val == 0 { let members = if i == N - 1 { // In this case we are the last index, so it's full now None } else { - members[i] = num; + members[i] = num_small; Some(members) }; @@ -63,6 +96,14 @@ impl Range { } } +impl std::fmt::Debug for Range { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{{{}-{};{:?};exp={:?}}}", self.min, self.max, self.members.map(|a| a.map(|v| { + if v != 0 { (v as u64) + self.min - 1 } else { 0 } + })), self.expires) + } +} + // impl PartialOrd for Range { // fn partial_cmp(&self, other: &Self) -> Option { // match self.expires { @@ -88,11 +129,8 @@ fn add_num_ranges(ranges: &mut VecDeque>, num: u64) -> Err(u) => u }; - if n_i == 0 { - println!("{:?}", ranges); - } - let target_i = n_i-1; + let initial_max = { ranges[target_i].max }; @@ -151,7 +189,6 @@ fn add_num_ranges(ranges: &mut VecDeque>, num: u64) -> } } ranges.truncate(ranges.len()-2); - ranges.shrink_to_fit(); target_i-1 } else if let Some((prev_min, prev_expires)) = prev_min_exp { ranges[target_i-1] = Range::full(prev_min, target_range.max, prev_expires); @@ -162,7 +199,6 @@ fn add_num_ranges(ranges: &mut VecDeque>, num: u64) -> } } ranges.truncate(ranges.len()-1); - ranges.shrink_to_fit(); target_i-1 } else if let Some(next_max) = next_max { ranges[target_i] = Range::full(target_range.min, next_max, target_range.expires); @@ -173,7 +209,6 @@ fn add_num_ranges(ranges: &mut VecDeque>, num: u64) -> } } ranges.truncate(ranges.len()-1); - ranges.shrink_to_fit(); target_i } else { target_i @@ -182,6 +217,7 @@ fn add_num_ranges(ranges: &mut VecDeque>, num: u64) -> (target_i, false) } +// The majority of the time is spent in Range::add_num and binary_search (based on flamegraph) fn add_num(ranges: &mut VecDeque>, num: u64, expires: u64) -> bool { let (target_i, contains) = add_num_ranges(ranges, num); @@ -194,6 +230,23 @@ fn add_num(ranges: &mut VecDeque>, num: u64, expires: u contains } +// fn exp_lin_search(ranges: &VecDeque>, time: u64) -> usize { +// for i in 0..ranges.len() { +// let range = &ranges[i]; +// match range.expires { +// Expiry::At(at) => { +// if at > time { +// return i +// } +// }, +// Expiry::Unknown => return i, +// } +// } + +// ranges.len() +// } + +// Linear search is not faster, but doesn't seem much slower either, but for better worst-case we'll use binary search fn expired_search(ranges: &VecDeque>, time: u64) -> usize { match ranges.binary_search_by(|r| { match r.expires { @@ -209,7 +262,9 @@ fn expired_search(ranges: &VecDeque>, time: u64) -> usi } } +// Basically all time is spent in the binary search fn check_expired(ranges: &mut VecDeque>, time: u64) { + //let exp_i = exp_lin_search(ranges, time); let exp_i = expired_search(ranges, time); let first_min = ranges[0].min; @@ -223,6 +278,7 @@ fn check_expired(ranges: &mut VecDeque>, time: u64) { ranges.rotate_left(exp_i); ranges.truncate(ranges.len()-exp_i); ranges.push_front(new_range); + ranges.shrink_to_fit(); } // fn check_expired(ranges: VecDeque>, time: u64) -> VecDeque> { @@ -244,6 +300,31 @@ fn check_expired(ranges: &mut VecDeque>, time: u64) { // new_vec // } +// struct BitHole { +// arr: VecDeque, +// offset: usize, +// first_hole: usize +// } + +// const BIT_MASK: [u8; 8] = [128, 64, 32, 16, 8, 4, 2, 1]; + +// impl BitHole { +// fn add_num(&mut self, num: usize) -> bool { +// let b = (num-self.offset)/8; +// let b_i = num - b*8; +// while self.arr.len() < b { +// self.arr.push_back(0); +// } + +// let exists = self.arr[b] & BIT_MASK[b_i] != 0; +// if !exists { +// self.arr[b] += BIT_MASK[b_i]; +// } + +// exists +// } +// } + #[cfg(test)] mod test { use std::{collections::{BTreeMap, BTreeSet, HashMap, HashSet}, time::Instant}; @@ -261,18 +342,20 @@ mod test { for i in 1..10 { let mut range = Range::::new(i*(RANGE_SIZE as u64)); if i < 7 { - range.expires = Expiry::At(i*100); + range.expires = Expiry::At((i as u64)*100); } ranges.push_back(range) } + println!("{:?}", ranges); let exists = add_num(&mut ranges, 8, 150); + println!("{:?}", ranges); assert!(!exists); - assert_eq!(ranges[3].members, Some([8, 0])); + assert_eq!(ranges[3].members, Some([1, 0])); assert_eq!(ranges[2].expires, Expiry::At(150)); let exists = add_num(&mut ranges, 8, 150); assert!(exists); - assert_eq!(ranges[3].members, Some([8, 0])); + assert_eq!(ranges[3].members, Some([1, 0])); let exists = add_num(&mut ranges, 9, 125); assert!(!exists); @@ -343,12 +426,15 @@ mod test { // When max_distance is 50 (size 50k), it's must more efficient than a bit array // When max distance is 2.5k (size 50k), it's less efficient - // Same holds for much larger size + // For larger size it's much more efficient except for very significant shuffling #[test] fn routine() { let mut rng = thread_rng(); - let size = 500000; + // For better benchmark do 500k and 50 amount + // There it can reach 20M ops/secs + // `cargo test --package tiauth-core --lib --release --all-features -- compactset::test::routine --exact --show-output` + let size = 5000; let amnt = 10; let ops = amnt*size; let mut add_time = 0f64; @@ -363,7 +449,7 @@ mod test { values.push((i, expiry)); } - lightly_shuffle(&mut values, size/1000); + lightly_shuffle(&mut values, size/100); let mut ranges: VecDeque> = VecDeque::new(); ranges.push_back(Range::new(1)); @@ -373,14 +459,15 @@ mod test { let mut time = 0; for (i, expires) in values { - let around: i32 = rng.gen_range(-900..900); + //println!("sp: {}", ranges.len()); + let around: i32 = rng.gen_range(-900..100); time = time.max(0.max((expires as i32)+around) as u64); let now = Instant::now(); add_num(&mut ranges, i as u64, expires); add_time += now.elapsed().as_secs_f64(); max_space = max_space.max(ranges.len()); let now_again = Instant::now(); - if i % 5 == 0 { + if i % 4 == 0 { check_expired(&mut ranges, time); } check_time += now_again.elapsed().as_secs_f64(); @@ -393,6 +480,54 @@ mod test { let max_space: usize = maxes.into_iter().sum::()/amnt; println!("add: {} ops/s.\ncheck: {} ops/s.", (ops as f64)/add_time, (ops as f64)/check_time); - println!("avg max space: {}", (max_space)*(std::mem::size_of::>())); + let mem_size = std::mem::size_of::>(); + println!("avg max space: {}; mem_size: {}", (max_space)*mem_size, mem_size); + println!("total {} ops/s.", (ops as f64)/(add_time+check_time)); + } + + #[test] + fn routine_compactset() { + let mut rng = thread_rng(); + + // For better benchmark do 500k and 50 amount + // There it can reach 20M ops/secs + // `cargo test --package tiauth-core --lib --release --all-features -- compactset::test::routine --exact --show-output` + let size = 500000; + let amnt = 50; + let ops = amnt*size; + let mut add_time = 0f64; + let mut check_time = 0f64; + + for _ in 0..amnt { + let mut values = Vec::new(); + let mut expiry: u64 = 0; + for i in 1..size { + expiry += rng.gen_range(0..1000); + values.push((i, expiry)); + } + + lightly_shuffle(&mut values, size/100); + + let mut ranges: VecDeque> = VecDeque::new(); + ranges.push_back(Range::new(1)); + + let mut compact_set = CompactSet::new(); + + compact_set.ranges = Arc::new(Mutex::new(ranges)); + + let mut time = 0; + + for (i, expires) in values { + //println!("sp: {}", ranges.len()); + let around: i32 = rng.gen_range(-900..100); + time = time.max(0.max((expires as i32)+around) as u64); + let now = Instant::now(); + compact_set.num_exists((i as u64), expires, Some(time)); + add_time += now.elapsed().as_secs_f64(); + } + } + + + println!("total: {} ops/s.", (ops as f64)/add_time); } } \ No newline at end of file diff --git a/tiauth-core/src/lib.rs b/tiauth-core/src/lib.rs index 9c427bc..70f4300 100644 --- a/tiauth-core/src/lib.rs +++ b/tiauth-core/src/lib.rs @@ -6,7 +6,7 @@ pub mod encoded; pub mod error; mod proof; mod util; -mod mergerange; +mod compactset; #[cfg(feature = "action")] mod ops;