Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

seek_exact + cost based intersection #2538

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 9 additions & 11 deletions src/docset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,24 +53,22 @@ pub trait DocSet: Send {

/// Seeks to the target if possible and returns true if the target is in the DocSet.
///
/// Implementations may choose to advance past the target if target does not exist.
///
/// DocSets that already have an efficient `seek` method don't need to implement `seek_exact`.
/// All wrapper DocSets should forward `seek_exact` to the underlying DocSet.
///
/// ## API Behaviour
/// If `seek_exact` is returning true, a call to `doc()` has to return target.
/// If `seek_exact` is returning false, a call to `doc()` may return any doc and should not be
/// used until `seek_exact` returns true again. The DocSet is considered to be in an invalid
/// state until `seek_exact` returns true again.
/// If `seek_exact` is returning false, a call to `doc()` may return any doc between
/// the last doc that matched and target or a doc that is a valid next hit after target.
/// The DocSet is considered to be in an invalid state until `seek_exact` returns true again.
///
/// target needs to be equal or larger than `doc` when in a valid state.
/// `target` needs to be equal or larger than `doc` when in a valid state.
///
/// Consecutive calls are not allowed to have decreasing `target` values.
///
/// # Warning
/// This is an advanced API used by intersection. The API contract is tricky, avoid using it.
fn seek_exact(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
let current_doc = self.doc();
if current_doc < target {
self.seek(target);
Expand Down Expand Up @@ -175,8 +173,8 @@ impl DocSet for &mut dyn DocSet {
(**self).seek(target)
}

fn seek_exact(&mut self, target: DocId) -> bool {
(**self).seek_exact(target)
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
(**self).seek_into_the_danger_zone(target)
}

fn doc(&self) -> u32 {
Expand Down Expand Up @@ -211,9 +209,9 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.seek(target)
}

fn seek_exact(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.seek_exact(target)
unboxed.seek_into_the_danger_zone(target)
}

fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
Expand Down
1 change: 1 addition & 0 deletions src/postings/compression/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * u32::SIZE_IN_B
mod vint;

/// Returns the size in bytes of a compressed block, given `num_bits`.
#[inline]
pub fn compressed_block_size(num_bits: u8) -> usize {
(num_bits as usize) * COMPRESSION_BLOCK_SIZE / 8
}
Expand Down
4 changes: 2 additions & 2 deletions src/query/boost_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
fn seek(&mut self, target: DocId) -> DocId {
self.underlying.seek(target)
}
fn seek_exact(&mut self, target: DocId) -> bool {
self.underlying.seek_exact(target)
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
self.underlying.seek_into_the_danger_zone(target)
}

fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
Expand Down
4 changes: 2 additions & 2 deletions src/query/disjunction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ impl<T: Scorer> DocSet for ScorerWrapper<T> {
self.current_doc = doc_id;
doc_id
}
fn seek_exact(&mut self, target: DocId) -> bool {
let found = self.scorer.seek_exact(target);
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
let found = self.scorer.seek_into_the_danger_zone(target);
self.current_doc = self.scorer.doc();
found
}
Expand Down
23 changes: 14 additions & 9 deletions src/query/intersection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,17 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
// of the two rarest `DocSet` in the intersection.

loop {
if right.seek_exact(candidate) {
if right.seek_into_the_danger_zone(candidate) {
break;
}
// `left.advance().max(right.doc())` yielded a regression in the search game
// benchmark It may make sense in certain scenarios though.
candidate = left.advance();
let right_doc = right.doc();
// TODO: Think about which value would make sense here
// It depends on the DocSet implementation, when a seek would outweigh an advance.
if right_doc > candidate.wrapping_add(100) {
candidate = left.seek(right_doc);
} else {
candidate = left.advance();
}
if candidate == TERMINATED {
return TERMINATED;
}
Expand All @@ -131,7 +136,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
if self
.others
.iter_mut()
.all(|docset| docset.seek_exact(candidate))
.all(|docset| docset.seek_into_the_danger_zone(candidate))
{
debug_assert_eq!(candidate, self.left.doc());
debug_assert_eq!(candidate, self.right.doc());
Expand All @@ -158,13 +163,13 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
///
/// Some implementations may choose to advance past the target if beneficial for performance.
/// The return value is `true` if the target is in the docset, and `false` otherwise.
fn seek_exact(&mut self, target: DocId) -> bool {
self.left.seek_exact(target)
&& self.right.seek_exact(target)
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
self.left.seek_into_the_danger_zone(target)
&& self.right.seek_into_the_danger_zone(target)
&& self
.others
.iter_mut()
.all(|docset| docset.seek_exact(target))
.all(|docset| docset.seek_into_the_danger_zone(target))
}

fn doc(&self) -> DocId {
Expand Down
4 changes: 2 additions & 2 deletions src/query/phrase_prefix_query/phrase_prefix_scorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,8 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
self.advance()
}

fn seek_exact(&mut self, target: DocId) -> bool {
if self.phrase_scorer.seek_exact(target) {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
if self.phrase_scorer.seek_into_the_danger_zone(target) {
self.matches_prefix()
} else {
false
Expand Down
4 changes: 2 additions & 2 deletions src/query/phrase_query/phrase_scorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -530,9 +530,9 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
self.advance()
}

fn seek_exact(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
debug_assert!(target >= self.doc());
if self.intersection_docset.seek_exact(target) && self.phrase_match() {
if self.intersection_docset.seek_into_the_danger_zone(target) && self.phrase_match() {
return true;
}
false
Expand Down
4 changes: 2 additions & 2 deletions src/query/reqopt_scorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ where
self.req_scorer.seek(target)
}

fn seek_exact(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
self.score_cache = None;
self.req_scorer.seek_exact(target)
self.req_scorer.seek_into_the_danger_zone(target)
}

fn doc(&self) -> DocId {
Expand Down
3 changes: 3 additions & 0 deletions src/query/term_query/term_scorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,17 @@ impl TermScorer {
}

impl DocSet for TermScorer {
#[inline]
fn advance(&mut self) -> DocId {
self.postings.advance()
}

#[inline]
fn seek(&mut self, target: DocId) -> DocId {
self.postings.seek(target)
}

#[inline]
fn doc(&self) -> DocId {
self.postings.doc()
}
Expand Down
12 changes: 12 additions & 0 deletions src/query/union/buffered_union.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,18 @@ where
}
}

fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
let is_hit = self
.docsets
.iter_mut()
.all(|docset| docset.seek_into_the_danger_zone(target));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why all? isn't that a union?

also are we ok with the lazy evaluation here?

// The API requires the DocSet to be in a valid state when `seek_exact` returns true.
if is_hit {
self.seek(target);
}
is_hit
}

fn doc(&self) -> DocId {
self.doc
}
Expand Down
Loading