Skip to content

Commit

Permalink
seek_exact + cost based intersection
Browse files Browse the repository at this point in the history
Adds `seek_exact` and `cost` to `DocSet` for a more efficient intersection.
Unlike `seek`, `seek_exact` does not require the DocSet to advance to the next hit, if the target does not exist.

`cost` allows to address the different DocSet types and their cost
model and is used to determine the DocSet that drives the intersection.
E.g. fast field range queries may do a full scan. Phrase queries load the positions to check if a we have a hit.
They both have a higher cost than their size_hint would suggest.

Improves `size_hint` estimation for intersection and union, by having a
estimation based on random distribution with a co-location factor.

Refactor range query benchmark.

Closes #2531

*Future Work*

Implement `seek_exact` for BufferedUnionScorer and RangeDocSet (fast field range queries)
Evaluate replacing `seek` with `seek_exact` to reduce code complexity
  • Loading branch information
PSeitz committed Nov 7, 2024
1 parent c35a782 commit 8d7bc08
Show file tree
Hide file tree
Showing 21 changed files with 749 additions and 522 deletions.
7 changes: 6 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ fnv = "1.0.7"
winapi = "0.3.9"

[dev-dependencies]
binggan = "0.14.0"
binggan = "0.14.2"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"
Expand Down Expand Up @@ -162,3 +162,8 @@ harness = false
[[bench]]
name = "agg_bench"
harness = false

[[bench]]
name = "range_query"
harness = false

260 changes: 260 additions & 0 deletions benches/range_query.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
use std::fmt::Display;
use std::net::Ipv6Addr;
use std::ops::RangeInclusive;

use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, BenchRunner, OutputValue, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use columnar::MonotonicallyMappableToU128;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index};

#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;

fn main() {
bench_range_query();
}

fn bench_range_query() {
let index = get_index_0_to_100();
let mut runner = BenchRunner::new();
runner.add_plugin(PeakMemAllocPlugin::new(GLOBAL));

runner.set_name("range_query on u64");
let field_name_and_descr: Vec<_> = vec![
("id", "Single Valued Range Field"),
("ids", "Multi Valued Range Field"),
];
let range_num_hits = vec![
("90_percent", get_90_percent()),
("10_percent", get_10_percent()),
("1_percent", get_1_percent()),
];

test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);

runner.set_name("range_query on ip");
let field_name_and_descr: Vec<_> = vec![
("ip", "Single Valued Range Field"),
("ips", "Multi Valued Range Field"),
];
let range_num_hits = vec![
("90_percent", get_90_percent_ip()),
("10_percent", get_10_percent_ip()),
("1_percent", get_1_percent_ip()),
];

test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
}

fn test_range<T: Display>(
runner: &mut BenchRunner,
index: &Index,
field_name_and_descr: &[(&str, &str)],
range_num_hits: Vec<(&str, RangeInclusive<T>)>,
) {
for (field, suffix) in field_name_and_descr {
let term_num_hits = vec![
("", ""),
("1_percent", "veryfew"),
("10_percent", "few"),
("90_percent", "most"),
];
let mut group = runner.new_group();
group.set_name(suffix);
// all intersect combinations
for (range_name, range) in &range_num_hits {
for (term_name, term) in &term_num_hits {
let index = &index;
let test_name = if term_name.is_empty() {
format!("id_range_hit_{}", range_name)
} else {
format!(
"id_range_hit_{}_intersect_with_term_{}",
range_name, term_name
)
};
group.register(test_name, move |_| {
let query = if term_name.is_empty() {
"".to_string()
} else {
format!("AND id_name:{}", term)
};
black_box(execute_query(field, range, &query, index));
});
}
}
group.run();
}
}

fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id_name = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"most".to_string() // 90%
};
Doc {
id_name,
id: rng.gen_range(0..100),
// Multiply by 1000, so that we create most buckets in the compact space
// The benches depend on this range to select n-percent of elements with the
// methods below.
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();

create_index_from_docs(&docs)
}

#[derive(Clone, Debug)]
pub struct Doc {
pub id_name: String,
pub id: u64,
pub ip: Ipv6Addr,
}

pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST);
let ids_u64_field =
schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed());

let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST);
let ids_f64_field = schema_builder.add_f64_field(
"ids_f64",
NumericOptions::default().set_fast().set_indexed(),
);

let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST);
let ids_i64_field = schema_builder.add_i64_field(
"ids_i64",
NumericOptions::default().set_fast().set_indexed(),
);

let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST);

let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
let ips_field = schema_builder.add_ip_addr_field("ips", FAST);

let schema = schema_builder.build();

let index = Index::create_in_ram(schema);

{
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ids_i64_field => doc.id as i64,
ids_i64_field => doc.id as i64,
ids_f64_field => doc.id as f64,
ids_f64_field => doc.id as f64,
ids_u64_field => doc.id,
ids_u64_field => doc.id,
id_u64_field => doc.id,
id_f64_field => doc.id as f64,
id_i64_field => doc.id as i64,
text_field => doc.id_name.to_string(),
text_field2 => doc.id_name.to_string(),
ips_field => doc.ip,
ips_field => doc.ip,
ip_field => doc.ip,
))
.unwrap();
}

index_writer.commit().unwrap();
}
index
}

fn get_90_percent() -> RangeInclusive<u64> {
0..=90
}

fn get_10_percent() -> RangeInclusive<u64> {
0..=10
}

fn get_1_percent() -> RangeInclusive<u64> {
10..=10
}

fn get_90_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
start..=end
}

fn get_10_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}

fn get_1_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}

struct NumHits {
count: usize,
}
impl OutputValue for NumHits {
fn column_title() -> &'static str {
"NumHits"
}
fn format(&self) -> Option<String> {
Some(self.count.to_string())
}
}

fn execute_query<T: Display>(
field: &str,
id_range: &RangeInclusive<T>,
suffix: &str,
index: &Index,
) -> NumHits {
let gen_query_inclusive = |from: &T, to: &T| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};

let query = gen_query_inclusive(id_range.start(), id_range.end());
execute_query_(&query, index)
}

fn execute_query_(query: &str, index: &Index) -> NumHits {
let query_from_text = |text: &str| {
QueryParser::for_index(index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let num_hits = searcher
.search(&query, &(TopDocs::with_limit(10), Count))
.unwrap()
.1;
NumHits { count: num_hits }
}
55 changes: 55 additions & 0 deletions src/docset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,26 @@ pub trait DocSet: Send {
doc
}

/// Seeks to the target if possible and checks if the target is an exact match.
///
/// Implementations may choose to advance past the target if target does not exist.
/// The return value is `true` if the docset contains target, and `false` otherwise.
///
/// DocSets that already have an efficient `seek` method don't need to implement `seek_exact`.
/// All wapper DocSets should forward `seek_exact` to the underlying DocSet.
///
/// ## API Behaviour
/// If `seek_exact` is returning true, a call to `doc()` has to return target.
/// If `seek_exact` is returning false, a call to `doc()` may return the previous doc,
/// which may be lower than target.
fn seek_exact(&mut self, target: DocId) -> bool {
let current_doc = self.doc();
if current_doc < target {
self.seek(target);
}
self.doc() == target
}

/// Fills a given mutable buffer with the next doc ids from the
/// `DocSet`
///
Expand Down Expand Up @@ -87,6 +107,23 @@ pub trait DocSet: Send {
/// length of the docset.
fn size_hint(&self) -> u32;

/// Returns a best-effort hint of the
/// cost to drive the docset.
///
/// By default this returns `size_hint()`.
///
/// DocSets may have vastly different cost depending on their type,
/// e.g. an intersection with 10 hits is much cheaper than
/// a phrase search with 10 hits, since it needs to load positions.
///
/// ### Future Work
/// We may want to differentiate `DocSet` costs more more granular, e.g.
/// creation_cost, advance_cost, seek_cost on to get a good estimation
/// what query types to choose.
fn cost(&self) -> u64 {
self.size_hint() as u64
}

/// Returns the number documents matching.
/// Calling this method consumes the `DocSet`.
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
Expand Down Expand Up @@ -126,6 +163,10 @@ impl DocSet for &mut dyn DocSet {
(**self).seek(target)
}

fn seek_exact(&mut self, target: DocId) -> bool {
(**self).seek_exact(target)
}

fn doc(&self) -> u32 {
(**self).doc()
}
Expand All @@ -134,6 +175,10 @@ impl DocSet for &mut dyn DocSet {
(**self).size_hint()
}

fn cost(&self) -> u64 {
(**self).cost()
}

fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
(**self).count(alive_bitset)
}
Expand All @@ -154,6 +199,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.seek(target)
}

fn seek_exact(&mut self, target: DocId) -> bool {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.seek_exact(target)
}

fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.fill_buffer(buffer)
Expand All @@ -169,6 +219,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.size_hint()
}

fn cost(&self) -> u64 {
let unboxed: &TDocSet = self.borrow();
unboxed.cost()
}

fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count(alive_bitset)
Expand Down
15 changes: 9 additions & 6 deletions src/postings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -667,12 +667,15 @@ mod bench {
.read_postings(&TERM_D, IndexRecordOption::Basic)
.unwrap()
.unwrap();
let mut intersection = Intersection::new(vec![
segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d,
]);
let mut intersection = Intersection::new(
vec![
segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d,
],
reader.searcher().num_docs() as u32,
);
while intersection.advance() != TERMINATED {}
});
}
Expand Down
Loading

0 comments on commit 8d7bc08

Please sign in to comment.