Skip to content

Commit

Permalink
minhash bottom-k
Browse files Browse the repository at this point in the history
  • Loading branch information
jianshu93 committed Jan 24, 2024
1 parent 2edbad8 commit 24f8bd6
Show file tree
Hide file tree
Showing 7 changed files with 472 additions and 60 deletions.
22 changes: 15 additions & 7 deletions src/densminhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ impl <F: Float + SampleUniform + std::fmt::Debug, D:Hash + Copy, H : Hasher+Def
/// Allocates an `OptDensMinHash` sketcher.
/// size is size of sketch. build_hasher is the build hasher for the type of Hasher we want.
pub fn new(size:usize, build_hasher: BuildHasherDefault<H>) -> OptDensMinHash<F, D, H> {
log::info!("\n allocating-sketcher \n");
let mut sketch_init = Vec::<F>::with_capacity(size);
let mut values = Vec::<u64>::with_capacity(size);
let mut init = Vec::<bool>::with_capacity(size);
Expand Down Expand Up @@ -132,6 +133,8 @@ impl <F: Float + SampleUniform + std::fmt::Debug, D:Hash + Copy, H : Hasher+Def
for d in to_sketch {
self.sketch(d);
}
log::info!("optdensminhash::sketch_slice sketch size : {:?}, nb empy slots : {:?}", m, self.nb_empty);

log::debug!("optdensminhash::sketch_slice sketch size : {:?}, nb empy slots : {:?}", m, self.nb_empty);
//
if self.nb_empty > 0 {
Expand Down Expand Up @@ -207,6 +210,7 @@ impl <F: Float + SampleUniform + std::fmt::Debug, D:Hash + Copy, H : Hasher+Def
}
}
//
log::info!("end of pass {}, nb empty : {}", nbpass, self.nb_empty);
log::debug!("end of pass {}, nb empty : {}", nbpass, self.nb_empty);
assert_eq!(self.nb_empty, 0);
//
Expand Down Expand Up @@ -248,6 +252,7 @@ impl <F: Float + SampleUniform + std::fmt::Debug, D:Hash + Copy, H : Hasher+Def
/// Allocates a `RevOptDensMinHash` sketcher.
/// size is size of sketch. build_hasher is the build hasher for the type of Hasher we want.
pub fn new(size:usize, build_hasher: BuildHasherDefault<H>) -> RevOptDensMinHash<F, D, H> {
log::info!("\n allocating-sketcher \n");
let mut sketch_init = Vec::<F>::with_capacity(size);
let mut values: Vec<u64> = Vec::<u64>::with_capacity(size);
let mut init: Vec<bool> = Vec::<bool>::with_capacity(size);
Expand Down Expand Up @@ -342,6 +347,8 @@ impl <F: Float + SampleUniform + std::fmt::Debug, D:Hash + Copy, H : Hasher+Def
let res = self.densify();
assert!(res.is_ok());
}
log::info!("fastdensminhash::sketch_slice sketch size : {:?}, nb empy slots : {:?}", m, self.nb_empty);

log::debug!("fastdensminhash::sketch_slice sketch size : {:?}, nb empy slots : {:?}", m, self.nb_empty);
//
return Ok(());
Expand Down Expand Up @@ -374,6 +381,7 @@ impl <F: Float + SampleUniform + std::fmt::Debug, D:Hash + Copy, H : Hasher+Def
log::debug!("end of pass {}, nb empty : {}", pass, self.nb_empty);
}
//
log::info!("end of pass {}, nb empty : {}", pass, self.nb_empty);
log::debug!("end of pass {}, nb empty : {}", pass, self.nb_empty);
assert_eq!(self.nb_empty, 0);
//
Expand Down Expand Up @@ -449,12 +457,12 @@ mod tests {
//
let vamax = 300000;
let va : Vec<usize> = (0..vamax).collect();
let vbmin = 50000;
let vbmax = 2 * vamax;
let vb : Vec<usize> = (vbmin..vbmax).collect();
let vbmin = 290000;
let vbmax = 2.0 * vamax as f64;
let vb : Vec<usize> = (vbmin..vbmax as usize).collect();
let inter = vamax - vbmin;
let jexact = inter as f64 / vbmax as f64;
let size = 50000;
let size = 90000;
//
let _res = test_optdens(&va, &vb, jexact, size);
} // end of test_optdens_fewbins_fnv_f64
Expand All @@ -466,14 +474,14 @@ mod tests {
// we construct 2 ranges [a..b] [c..d], with a<b, b < d, c<d sketch them and compute jaccard.
// we should get something like max(b,c) - min(b,c)/ (b-a+d-c)
//
let vamax = 300000;
let vamax = 3000000;
let va : Vec<usize> = (0..vamax).collect();
let vbmin = 50000;
let vbmin = 2900000;
let vbmax = 2 * vamax;
let vb : Vec<usize> = (vbmin..vbmax).collect();
let inter = vamax - vbmin;
let jexact = inter as f64 / vbmax as f64;
let size = 50000;
let size = 100000;
//
let res = test_revoptdens(&va, &vb, jexact, size).unwrap();
assert!( res.0 > 0. && (res.0 - jexact).abs() < 3. * res.1);
Expand Down
130 changes: 130 additions & 0 deletions src/hashed.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
//! Basic stuff about hashed items

use std::cmp::Ordering;
use std::marker::PhantomData;


/// Storage type for the hash value of an item.
///
/// We use the maximum size to store the hash value, but with the invertible 32-bit hash
/// the value stored is in fact a u32.
/// We would like to be generic over the item hash type, but `Hasher` has u64 as its output type.
pub type ItemHash = u64;



// If we use an invertible hash we do not need to keep the item (the kmer);
// for other hashes we need to copy and store the Kmer, whence the Option<T> field.

/// A `HashedItem` is a hash value and possibly the associated object (of type `T`), kept
/// when we want to track the objects contributing to a minhash signature.
///
/// Note that with an invertible hash, if object hashes are stored in a u32 or a u64
/// (as in some Kmer representations), the objects can be retrieved from the hashed value
/// (see module invhash).
///
/// Equality and ordering look at the `hash` field only; `item` never takes part in them.
#[derive(Debug, Clone, Copy)]
pub struct HashedItem<T: Clone + Copy> {
    /// Hash value of the item.
    pub(crate) hash: ItemHash,
    /// The hashed object itself, when it is kept alongside its hash.
    #[allow(unused)]
    pub(crate) item: Option<T>,
}

impl<T: Clone + Copy> PartialEq for HashedItem<T> {
    /// Two items are equal when their hash values are equal.
    fn eq(&self, other: &Self) -> bool {
        self.hash == other.hash
    }
}

impl<T: Clone + Copy> Eq for HashedItem<T> {}

impl<T: Clone + Copy> Ord for HashedItem<T> {
    /// Total order given by the hash values.
    fn cmp(&self, other: &Self) -> Ordering {
        self.hash.cmp(&other.hash)
    }
}

impl<T: Clone + Copy> PartialOrd for HashedItem<T> {
    /// Delegates to [`Ord::cmp`] so the partial and total orders cannot drift apart.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}



// NOTE(review): the original size note read "2*8+2 bytes"; the real size depends on `T`
// (through the `Option<T>` held by `HashedItem`) and on padding — TODO confirm with `size_of`.
/// Pairs a hashed item with the number of times it was counted.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct HashCount<T:Clone+Copy> {
/// The hashed item being counted.
pub hashed: HashedItem<T>,
/// Occurrence count of the item.
pub count: u16,
}


//=================== For items hashed by invertible hash


/// An item hashed with an invertible hash, so the original item does not need to be stored.
///
/// `T` only records the type of the hashed item; no value of type `T` is held.
/// Equality and ordering look at the `hash` field only.
#[derive(Debug, Clone)]
pub struct InvHashedItem<T: Clone + Copy> {
    /// Hash value of the item.
    pub(crate) hash: ItemHash,
    /// Marker tying the hash to the item type `T`.
    pub(crate) t_marker: PhantomData<T>,
}

impl<T: Clone + Copy> InvHashedItem<T> {
    /// Wraps an already computed hash value.
    pub fn new(hash: ItemHash) -> Self {
        InvHashedItem {
            hash,
            t_marker: PhantomData,
        }
    }

    /// Returns the stored hash value.
    pub fn get_hash(&self) -> ItemHash {
        self.hash
    }
} // end of impl InvHashedItem

impl<T: Copy + Clone> PartialEq for InvHashedItem<T> {
    /// Two items are equal when their hash values are equal.
    fn eq(&self, other: &Self) -> bool {
        self.hash == other.hash
    }
}

impl<T: Copy + Clone> Eq for InvHashedItem<T> {}

impl<T: Clone + Copy> Ord for InvHashedItem<T> {
    /// Total order given by the hash values.
    fn cmp(&self, other: &Self) -> Ordering {
        self.hash.cmp(&other.hash)
    }
}

impl<T: Clone + Copy> PartialOrd for InvHashedItem<T> {
    /// Delegates to [`Ord::cmp`] so the partial and total orders cannot drift apart.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}


//====================================================================================//


/// Counts the occurrences of an object hashed with an invertible hash.
// The stored data is a u64 hash plus a u8 count.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct InvHashCount<T: Clone + Copy> {
    /// The hashed item being counted.
    pub hashed: InvHashedItem<T>,
    /// Occurrence count of the item.
    pub(crate) count: u8,
}

impl<T: Clone + Copy> InvHashCount<T> {
    /// Builds a counter from a hashed item and its occurrence count.
    pub fn new(hashed: InvHashedItem<T>, count: u8) -> Self {
        Self { hashed, count }
    }

    /// Returns the occurrence count.
    pub fn get_count(&self) -> u8 {
        let Self { count, .. } = self;
        *count
    }
} // end of impl block for InvHashCount
3 changes: 2 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ pub mod probminhasher;
pub mod superminhasher;
pub mod setsketcher;
pub mod densminhash;

pub mod minhash;

pub mod exp01;
pub mod fyshuffle;
Expand All @@ -25,6 +25,7 @@ pub mod superminhasher2;

pub mod invhash;
pub mod nohasher;
pub mod hashed;

// hashing stuff

Expand Down
Loading

0 comments on commit 24f8bd6

Please sign in to comment.