Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

seek_exact + cost based intersection #2538

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ fnv = "1.0.7"
winapi = "0.3.9"

[dev-dependencies]
binggan = "0.14.0"
binggan = "0.14.2"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"
Expand Down Expand Up @@ -162,3 +162,8 @@ harness = false
[[bench]]
name = "agg_bench"
harness = false

[[bench]]
name = "range_query"
harness = false

260 changes: 260 additions & 0 deletions benches/range_query.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
use std::fmt::Display;
use std::net::Ipv6Addr;
use std::ops::RangeInclusive;

use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, BenchRunner, OutputValue, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use columnar::MonotonicallyMappableToU128;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index};

#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;

fn main() {
bench_range_query();
}

fn bench_range_query() {
let index = get_index_0_to_100();
let mut runner = BenchRunner::new();
runner.add_plugin(PeakMemAllocPlugin::new(GLOBAL));

runner.set_name("range_query on u64");
let field_name_and_descr: Vec<_> = vec![
("id", "Single Valued Range Field"),
("ids", "Multi Valued Range Field"),
];
let range_num_hits = vec![
("90_percent", get_90_percent()),
("10_percent", get_10_percent()),
("1_percent", get_1_percent()),
];

test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);

runner.set_name("range_query on ip");
let field_name_and_descr: Vec<_> = vec![
("ip", "Single Valued Range Field"),
("ips", "Multi Valued Range Field"),
];
let range_num_hits = vec![
("90_percent", get_90_percent_ip()),
("10_percent", get_10_percent_ip()),
("1_percent", get_1_percent_ip()),
];

test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
}

fn test_range<T: Display>(
runner: &mut BenchRunner,
index: &Index,
field_name_and_descr: &[(&str, &str)],
range_num_hits: Vec<(&str, RangeInclusive<T>)>,
) {
for (field, suffix) in field_name_and_descr {
let term_num_hits = vec![
("", ""),
("1_percent", "veryfew"),
("10_percent", "few"),
("90_percent", "most"),
];
let mut group = runner.new_group();
group.set_name(suffix);
// all intersect combinations
for (range_name, range) in &range_num_hits {
for (term_name, term) in &term_num_hits {
let index = &index;
let test_name = if term_name.is_empty() {
format!("id_range_hit_{}", range_name)
} else {
format!(
"id_range_hit_{}_intersect_with_term_{}",
range_name, term_name
)
};
group.register(test_name, move |_| {
let query = if term_name.is_empty() {
"".to_string()
} else {
format!("AND id_name:{}", term)
};
black_box(execute_query(field, range, &query, index));
});
}
}
group.run();
}
}

fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id_name = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"most".to_string() // 90%
};
Doc {
id_name,
id: rng.gen_range(0..100),
// Multiply by 1000, so that we create most buckets in the compact space
// The benches depend on this range to select n-percent of elements with the
// methods below.
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();

create_index_from_docs(&docs)
}

#[derive(Clone, Debug)]
pub struct Doc {
pub id_name: String,
pub id: u64,
pub ip: Ipv6Addr,
}

pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST);
let ids_u64_field =
schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed());

let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST);
let ids_f64_field = schema_builder.add_f64_field(
"ids_f64",
NumericOptions::default().set_fast().set_indexed(),
);

let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST);
let ids_i64_field = schema_builder.add_i64_field(
"ids_i64",
NumericOptions::default().set_fast().set_indexed(),
);

let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST);

let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
let ips_field = schema_builder.add_ip_addr_field("ips", FAST);

let schema = schema_builder.build();

let index = Index::create_in_ram(schema);

{
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ids_i64_field => doc.id as i64,
ids_i64_field => doc.id as i64,
ids_f64_field => doc.id as f64,
ids_f64_field => doc.id as f64,
ids_u64_field => doc.id,
ids_u64_field => doc.id,
id_u64_field => doc.id,
id_f64_field => doc.id as f64,
id_i64_field => doc.id as i64,
text_field => doc.id_name.to_string(),
text_field2 => doc.id_name.to_string(),
ips_field => doc.ip,
ips_field => doc.ip,
ip_field => doc.ip,
))
.unwrap();
}

index_writer.commit().unwrap();
}
index
}

fn get_90_percent() -> RangeInclusive<u64> {
0..=90
}

fn get_10_percent() -> RangeInclusive<u64> {
0..=10
}

fn get_1_percent() -> RangeInclusive<u64> {
10..=10
}

fn get_90_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
start..=end
}

fn get_10_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}

fn get_1_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}

struct NumHits {
count: usize,
}
impl OutputValue for NumHits {
fn column_title() -> &'static str {
"NumHits"
}
fn format(&self) -> Option<String> {
Some(self.count.to_string())
}
}

fn execute_query<T: Display>(
field: &str,
id_range: &RangeInclusive<T>,
suffix: &str,
index: &Index,
) -> NumHits {
let gen_query_inclusive = |from: &T, to: &T| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};

let query = gen_query_inclusive(id_range.start(), id_range.end());
execute_query_(&query, index)
}

fn execute_query_(query: &str, index: &Index) -> NumHits {
let query_from_text = |text: &str| {
QueryParser::for_index(index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let num_hits = searcher
.search(&query, &(TopDocs::with_limit(10), Count))
.unwrap()
.1;
NumHits { count: num_hits }
}
54 changes: 54 additions & 0 deletions src/docset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,25 @@ pub trait DocSet: Send {
doc
}

/// Seeks to the target if possible and returns true if the target is in the DocSet.
///
/// Implementations may choose to advance past the target if target does not exist.
///
/// DocSets that already have an efficient `seek` method don't need to implement `seek_exact`.
/// All wapper DocSets should forward `seek_exact` to the underlying DocSet.
PSeitz marked this conversation as resolved.
Show resolved Hide resolved
///
/// ## API Behaviour
/// If `seek_exact` is returning true, a call to `doc()` has to return target.
/// If `seek_exact` is returning false, a call to `doc()` may return the previous doc,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you clarify that the result should still be a document in the posting list or TERMINATED, and that it could be something before or after the target.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We also need to clarify what happens if the target is lower than the current doc.

Copy link
Collaborator

@fulmicoton fulmicoton Nov 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually it seem like your current implementation might return self.doc() that is not in the docset at all.

This is tricky, because right now we are saying that self.doc could return "anything".

This is not exactly true though.
We need to have an accurate definition of what can we done with docset after having called seek_exact once.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added some comments to clarify the contract

/// which may be lower than target.
fn seek_exact(&mut self, target: DocId) -> bool {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's find a better name

let current_doc = self.doc();
if current_doc < target {
self.seek(target);
}
self.doc() == target
PSeitz marked this conversation as resolved.
Show resolved Hide resolved
}

/// Fills a given mutable buffer with the next doc ids from the
/// `DocSet`
///
Expand Down Expand Up @@ -87,6 +106,23 @@ pub trait DocSet: Send {
/// length of the docset.
fn size_hint(&self) -> u32;

/// Returns a best-effort hint of the
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not accurate enough to be usable, and this is actually important here.

Is it the cost of calling advance? Is it the cost of consuming the entire docset? Or is it something else?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added come comments to clarify

/// cost to drive the docset.
///
/// By default this returns `size_hint()`.
///
/// DocSets may have vastly different cost depending on their type,
/// e.g. an intersection with 10 hits is much cheaper than
/// a phrase search with 10 hits, since it needs to load positions.
///
/// ### Future Work
/// We may want to differentiate `DocSet` costs more more granular, e.g.
/// creation_cost, advance_cost, seek_cost on to get a good estimation
/// what query types to choose.
fn cost(&self) -> u64 {
self.size_hint() as u64
}

/// Returns the number documents matching.
/// Calling this method consumes the `DocSet`.
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
Expand Down Expand Up @@ -126,6 +162,10 @@ impl DocSet for &mut dyn DocSet {
(**self).seek(target)
}

fn seek_exact(&mut self, target: DocId) -> bool {
(**self).seek_exact(target)
}

fn doc(&self) -> u32 {
(**self).doc()
}
Expand All @@ -134,6 +174,10 @@ impl DocSet for &mut dyn DocSet {
(**self).size_hint()
}

fn cost(&self) -> u64 {
(**self).cost()
}

fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
(**self).count(alive_bitset)
}
Expand All @@ -154,6 +198,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.seek(target)
}

fn seek_exact(&mut self, target: DocId) -> bool {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.seek_exact(target)
}

fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.fill_buffer(buffer)
Expand All @@ -169,6 +218,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.size_hint()
}

fn cost(&self) -> u64 {
let unboxed: &TDocSet = self.borrow();
unboxed.cost()
}

fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count(alive_bitset)
Expand Down
15 changes: 9 additions & 6 deletions src/postings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -667,12 +667,15 @@ mod bench {
.read_postings(&TERM_D, IndexRecordOption::Basic)
.unwrap()
.unwrap();
let mut intersection = Intersection::new(vec![
segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d,
]);
let mut intersection = Intersection::new(
vec![
segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d,
],
reader.searcher().num_docs() as u32,
);
while intersection.advance() != TERMINATED {}
});
}
Expand Down
Loading
Loading