Expose phrase-prefix queries via the built-in query parser (quickwit-oss#2044)

* Expose phrase-prefix queries via the built-in query parser

This proposes the less-than-imaginative syntax `field:"phrase ter"*` to
perform a phrase-prefix query against `field`, using `phrase` and `ter` as the
terms. The aim is to make this type of query more discoverable and to simplify
manual testing.
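
As an illustration of the proposed syntax, here is a minimal sketch (not part of this commit; the schema and field name are made up for the example):

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{Index, Result};

fn main() -> Result<()> {
    let mut schema_builder = Schema::builder();
    let field = schema_builder.add_text_field("field", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let query_parser = QueryParser::for_index(&index, vec![field]);

    // A trailing `*` after the closing quote turns the phrase into a
    // phrase-prefix query: "ter" is matched as a prefix of the final term.
    let _phrase_prefix = query_parser.parse_query("field:\"phrase ter\"*")?;

    // For comparison, the existing slop syntax on a regular phrase query.
    let _phrase_with_slop = query_parser.parse_query("field:\"phrase term\"~2")?;

    Ok(())
}
```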

I did consider exposing the `max_expansions` parameter similar to how slop is
handled, but I think this is rather something that should be configured via
the query parser (similar to `set_field_boost` and `set_field_fuzzy`), as
choosing it requires rather intimate knowledge of the backing index.
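
For reference, the knob is already available when building the query programmatically; a minimal sketch, assuming the `PhrasePrefixQuery::new` constructor and its `set_max_expansions` setter (field name and value are illustrative):

```rust
use tantivy::query::PhrasePrefixQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn build_query() -> PhrasePrefixQuery {
    // A schema is built only to obtain a `Field` handle for the illustration.
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let _schema = schema_builder.build();

    // Programmatic equivalent of `body:"in the su"*`; the last term is the prefix.
    let mut query = PhrasePrefixQuery::new(vec![
        Term::from_field_text(body, "in"),
        Term::from_field_text(body, "the"),
        Term::from_field_text(body, "su"),
    ]);
    // The parameter discussed above: caps how many indexed terms the trailing
    // prefix is expanded to.
    query.set_max_expansions(100);
    query
}
```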

* Prevent construction of zero- or one-term phrase-prefix queries via the query parser.

* Add an example using phrase-prefix search via the surface API to improve feature discoverability.
adamreichold committed Jun 1, 2023
1 parent 7ee78bd commit b325d56
Showing 6 changed files with 232 additions and 46 deletions.
79 changes: 79 additions & 0 deletions examples/phrase_prefix_search.rs
@@ -0,0 +1,79 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy, Result};
use tempfile::TempDir;

fn main() -> Result<()> {
    let index_path = TempDir::new()?;

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let index = Index::create_in_dir(&index_path, schema)?;

    let mut index_writer = index.writer(50_000_000)?;

    index_writer.add_document(doc!(
        title => "The Old Man and the Sea",
        body => "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
                 eighty-four days now without taking a fish.",
    ))?;

    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    ))?;

    // Multivalued fields just need to be repeated.
    index_writer.add_document(doc!(
        title => "Frankenstein",
        title => "The Modern Prometheus",
        body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
                 enterprise which you have regarded with such evil forebodings. I arrived here \
                 yesterday, and my first task is to assure my dear sister of my welfare and \
                 increasing confidence in the success of my undertaking."
    ))?;

    index_writer.commit()?;

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    let searcher = reader.searcher();

    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    // This will match documents containing the phrase "in the"
    // followed by some word starting with "su",
    // i.e. it will match "in the sunlight" and "in the success",
    // but not "in the Gulf Stream".
    let query = query_parser.parse_query("\"in the su\"*")?;

    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    let mut titles = top_docs
        .into_iter()
        .map(|(_score, doc_address)| {
            let doc = searcher.doc(doc_address)?;
            let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
            Ok(title)
        })
        .collect::<Result<Vec<_>>>()?;
    titles.sort_unstable();
    assert_eq!(titles, ["Frankenstein", "Of Mice and Men"]);

    Ok(())
}
39 changes: 30 additions & 9 deletions query-grammar/src/query_grammar.rs
@@ -162,14 +162,22 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> {
}

fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
(field_name(), term_val(), slop_val()).map(|(field_name, (delimiter, phrase), slop)| {
UserInputLiteral {
(field_name(), term_val(), slop_or_prefix_val()).map(
|(field_name, (delimiter, phrase), (slop, prefix))| UserInputLiteral {
field_name: Some(field_name),
phrase,
delimiter,
slop,
}
})
prefix,
},
)
}

fn slop_or_prefix_val<'a>() -> impl Parser<&'a str, Output = (u32, bool)> {
let prefix_val = char('*').map(|_ast| (0, true));
let slop_val = slop_val().map(|slop| (slop, false));

prefix_val.or(slop_val)
}

fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
@@ -186,11 +194,14 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {

fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let term_default_field =
(term_val(), slop_val()).map(|((delimiter, phrase), slop)| UserInputLiteral {
field_name: None,
phrase,
delimiter,
slop,
(term_val(), slop_or_prefix_val()).map(|((delimiter, phrase), (slop, prefix))| {
UserInputLiteral {
field_name: None,
phrase,
delimiter,
slop,
prefix,
}
});

attempt(term_query())
@@ -872,6 +883,16 @@ mod test {
test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
}

#[test]
fn test_phrase_prefix() {
test_parse_query_to_ast_helper("\"a b\"*", "\"a b\"*");
test_parse_query_to_ast_helper("\"a\"*", "\"a\"*");
test_parse_query_to_ast_helper("\"\"*", "\"\"*");
test_parse_query_to_ast_helper("foo:\"a b\"*", "\"foo\":\"a b\"*");
test_parse_query_to_ast_helper("foo:\"a\"*", "\"foo\":\"a\"*");
test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*");
}

#[test]
fn test_not_queries_are_consistent() {
test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");
3 changes: 3 additions & 0 deletions query-grammar/src/user_input_ast.rs
@@ -66,6 +66,7 @@ pub struct UserInputLiteral {
pub phrase: String,
pub delimiter: Delimiter,
pub slop: u32,
pub prefix: bool,
}

impl fmt::Debug for UserInputLiteral {
@@ -86,6 +87,8 @@ impl fmt::Debug for UserInputLiteral {
}
if self.slop > 0 {
write!(formatter, "~{}", self.slop)?;
} else if self.prefix {
write!(formatter, "*")?;
}
Ok(())
}
3 changes: 0 additions & 3 deletions src/query/phrase_prefix_query/phrase_prefix_query.rs
@@ -88,9 +88,6 @@ impl PhrasePrefixQuery {
/// a specialized type [`PhraseQueryWeight`] instead of a Boxed trait.
If the query was only one term long, this returns `None`, whereas [`Query::weight`]
/// returns a boxed [`RangeWeight`]
///
/// Returns `None`, if phrase_terms is empty, which happens if the phrase prefix query was
/// built with a single term.
pub(crate) fn phrase_prefix_query_weight(
&self,
enable_scoring: EnableScoring<'_>,
14 changes: 12 additions & 2 deletions src/query/query_parser/logical_ast.rs
@@ -8,7 +8,11 @@ use crate::Score;
#[derive(Clone)]
pub enum LogicalLiteral {
Term(Term),
Phrase(Vec<(usize, Term)>, u32),
Phrase {
terms: Vec<(usize, Term)>,
slop: u32,
prefix: bool,
},
Range {
field: String,
value_type: Type,
@@ -79,10 +83,16 @@ impl fmt::Debug for LogicalLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
match *self {
LogicalLiteral::Term(ref term) => write!(formatter, "{term:?}"),
LogicalLiteral::Phrase(ref terms, slop) => {
LogicalLiteral::Phrase {
ref terms,
slop,
prefix,
} => {
write!(formatter, "\"{terms:?}\"")?;
if slop > 0 {
write!(formatter, "~{slop:?}")
} else if prefix {
write!(formatter, "*")
} else {
Ok(())
}