diff --git a/examples/phrase_prefix_search.rs b/examples/phrase_prefix_search.rs new file mode 100644 index 0000000000..4e7a853b0a --- /dev/null +++ b/examples/phrase_prefix_search.rs @@ -0,0 +1,79 @@ +use tantivy::collector::TopDocs; +use tantivy::query::QueryParser; +use tantivy::schema::*; +use tantivy::{doc, Index, ReloadPolicy, Result}; +use tempfile::TempDir; + +fn main() -> Result<()> { + let index_path = TempDir::new()?; + + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("title", TEXT | STORED); + schema_builder.add_text_field("body", TEXT); + let schema = schema_builder.build(); + + let title = schema.get_field("title").unwrap(); + let body = schema.get_field("body").unwrap(); + + let index = Index::create_in_dir(&index_path, schema)?; + + let mut index_writer = index.writer(50_000_000)?; + + index_writer.add_document(doc!( + title => "The Old Man and the Sea", + body => "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \ + eighty-four days now without taking a fish.", + ))?; + + index_writer.add_document(doc!( + title => "Of Mice and Men", + body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + ))?; + + // Multivalued fields just need to be repeated. 
+ index_writer.add_document(doc!( + title => "Frankenstein", + title => "The Modern Prometheus", + body => "You will rejoice to hear that no disaster has accompanied the commencement of an \ + enterprise which you have regarded with such evil forebodings. I arrived here \ + yesterday, and my first task is to assure my dear sister of my welfare and \ + increasing confidence in the success of my undertaking." + ))?; + + index_writer.commit()?; + + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::OnCommit) + .try_into()?; + + let searcher = reader.searcher(); + + let query_parser = QueryParser::for_index(&index, vec![title, body]); + // This will match documents containing the phrase "in the" + // followed by some word starting with "su", + // i.e. it will match "in the sunlight" and "in the success", + // but not "in the Gulf Stream". + let query = query_parser.parse_query("\"in the su\"*")?; + + let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; + let mut titles = top_docs + .into_iter() + .map(|(_score, doc_address)| { + let doc = searcher.doc(doc_address)?; + let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned(); + Ok(title) + }) + .collect::<Result<Vec<_>>>()?; + titles.sort_unstable(); + assert_eq!(titles, ["Frankenstein", "Of Mice and Men"]); + + Ok(()) +} diff --git a/query-grammar/src/query_grammar.rs b/query-grammar/src/query_grammar.rs index 41c5b2cb9a..c731b08a3c 100644 --- a/query-grammar/src/query_grammar.rs +++ b/query-grammar/src/query_grammar.rs @@ -162,14 +162,22 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> { } fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> { - (field_name(), term_val(), slop_val()).map(|(field_name, (delimiter, phrase), slop)| { - UserInputLiteral { + (field_name(), term_val(), slop_or_prefix_val()).map( + |(field_name, (delimiter, phrase), (slop, prefix))| UserInputLiteral { field_name: Some(field_name), phrase, delimiter, slop, - } - }) 
+ prefix, + }, + ) +} + +fn slop_or_prefix_val<'a>() -> impl Parser<&'a str, Output = (u32, bool)> { + let prefix_val = char('*').map(|_ast| (0, true)); + let slop_val = slop_val().map(|slop| (slop, false)); + + prefix_val.or(slop_val) } fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> { @@ -186,11 +194,14 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> { fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { let term_default_field = - (term_val(), slop_val()).map(|((delimiter, phrase), slop)| UserInputLiteral { - field_name: None, - phrase, - delimiter, - slop, + (term_val(), slop_or_prefix_val()).map(|((delimiter, phrase), (slop, prefix))| { + UserInputLiteral { + field_name: None, + phrase, + delimiter, + slop, + prefix, + } }); attempt(term_query()) @@ -872,6 +883,16 @@ mod test { test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2"); } + #[test] + fn test_phrase_prefix() { + test_parse_query_to_ast_helper("\"a b\"*", "\"a b\"*"); + test_parse_query_to_ast_helper("\"a\"*", "\"a\"*"); + test_parse_query_to_ast_helper("\"\"*", "\"\"*"); + test_parse_query_to_ast_helper("foo:\"a b\"*", "\"foo\":\"a b\"*"); + test_parse_query_to_ast_helper("foo:\"a\"*", "\"foo\":\"a\"*"); + test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*"); + } + #[test] fn test_not_queries_are_consistent() { test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)"); diff --git a/query-grammar/src/user_input_ast.rs b/query-grammar/src/user_input_ast.rs index ba4613a530..02d93336cb 100644 --- a/query-grammar/src/user_input_ast.rs +++ b/query-grammar/src/user_input_ast.rs @@ -66,6 +66,7 @@ pub struct UserInputLiteral { pub phrase: String, pub delimiter: Delimiter, pub slop: u32, + pub prefix: bool, } impl fmt::Debug for UserInputLiteral { @@ -86,6 +87,8 @@ impl fmt::Debug for UserInputLiteral { } if self.slop > 0 { write!(formatter, "~{}", self.slop)?; + } else if self.prefix { + write!(formatter, "*")?; } Ok(()) } diff --git 
a/src/query/phrase_prefix_query/phrase_prefix_query.rs b/src/query/phrase_prefix_query/phrase_prefix_query.rs index 2bb2d4cc2e..8cbbe637e5 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_query.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_query.rs @@ -88,9 +88,6 @@ impl PhrasePrefixQuery { /// a specialized type [`PhraseQueryWeight`] instead of a Boxed trait. /// If the query was only one term long, this returns `None` wherease [`Query::weight`] /// returns a boxed [`RangeWeight`] - /// - /// Returns `None`, if phrase_terms is empty, which happens if the phrase prefix query was - /// built with a single term. pub(crate) fn phrase_prefix_query_weight( &self, enable_scoring: EnableScoring<'_>, diff --git a/src/query/query_parser/logical_ast.rs b/src/query/query_parser/logical_ast.rs index 311377f18c..436b033d3e 100644 --- a/src/query/query_parser/logical_ast.rs +++ b/src/query/query_parser/logical_ast.rs @@ -8,7 +8,11 @@ use crate::Score; #[derive(Clone)] pub enum LogicalLiteral { Term(Term), - Phrase(Vec<(usize, Term)>, u32), + Phrase { + terms: Vec<(usize, Term)>, + slop: u32, + prefix: bool, + }, Range { field: String, value_type: Type, @@ -79,10 +83,16 @@ impl fmt::Debug for LogicalLiteral { fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { match *self { LogicalLiteral::Term(ref term) => write!(formatter, "{term:?}"), - LogicalLiteral::Phrase(ref terms, slop) => { + LogicalLiteral::Phrase { + ref terms, + slop, + prefix, + } => { write!(formatter, "\"{terms:?}\"")?; if slop > 0 { write!(formatter, "~{slop:?}") + } else if prefix { + write!(formatter, "*") } else { Ok(()) } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index e8f1c0f6e3..cfb7cbd5b7 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -15,21 +15,12 @@ use crate::core::json_utils::{ use crate::core::Index; use 
crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery}; use crate::query::{ - AllQuery, - BooleanQuery, - BoostQuery, - EmptyQuery, - FuzzyTermQuery, - Occur, - PhraseQuery, - Query, - // RangeQuery, - TermQuery, - TermSetQuery, + AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery, + PhraseQuery, Query, TermQuery, TermSetQuery, }; use crate::schema::{ Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions, - Schema, Term, Type, + Schema, Term, TextFieldIndexing, Type, }; use crate::time::format_description::well_known::Rfc3339; use crate::time::OffsetDateTime; @@ -79,6 +70,17 @@ pub enum QueryParserError { /// have any positions indexed. #[error("The field '{0}' does not have positions indexed")] FieldDoesNotHavePositionsIndexed(String), + /// A phrase-prefix query requires at least two terms + #[error( + "The phrase '{phrase:?}' does not produce at least two terms using the tokenizer \ + '{tokenizer:?}'" + )] + PhrasePrefixRequiresAtLeastTwoTerms { + /// The phrase which triggered the issue + phrase: String, + /// The tokenizer configured for the field + tokenizer: String, + }, /// The tokenizer for the given field is unknown /// The two argument strings are the name of the field, the name of the tokenizer #[error("The tokenizer '{tokenizer:?}' for the field '{field:?}' is unknown")] @@ -194,6 +196,10 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// /// Phrase terms support the `~` slop operator which allows to set the phrase's matching /// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`. +/// +/// Phrase terms also support the `*` prefix operator which switches the phrase's matching +/// to consider all documents which contain the last term as a prefix, e.g. `"big bad wo"*` will +/// match `"big bad wolf"`. 
#[derive(Clone)] pub struct QueryParser { schema: Schema, @@ -446,6 +452,7 @@ impl QueryParser { json_path: &str, phrase: &str, slop: u32, + prefix: bool, ) -> Result<Vec<LogicalLiteral>, QueryParserError> { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); @@ -486,25 +493,25 @@ impl QueryParser { Ok(vec![LogicalLiteral::Term(dt_term)]) } FieldType::Str(ref str_options) => { - let option = str_options.get_indexing_options().ok_or_else(|| { + let indexing_options = str_options.get_indexing_options().ok_or_else(|| { // This should have been seen earlier really. QueryParserError::FieldNotIndexed(field_name.to_string()) })?; - let text_analyzer = - self.tokenizer_manager - .get(option.tokenizer()) - .ok_or_else(|| QueryParserError::UnknownTokenizer { - field: field_name.to_string(), - tokenizer: option.tokenizer().to_string(), - })?; - let index_record_option = option.index_option(); + let text_analyzer = self + .tokenizer_manager + .get(indexing_options.tokenizer()) + .ok_or_else(|| QueryParserError::UnknownTokenizer { + field: field_name.to_string(), + tokenizer: indexing_options.tokenizer().to_string(), + })?; Ok(generate_literals_for_str( field_name, field, phrase, slop, + prefix, + indexing_options, &text_analyzer, - index_record_option, )? .into_iter() .collect()) @@ -661,9 +668,13 @@ impl QueryParser { self.compute_path_triplets_for_literal(&literal)?; let mut asts: Vec<LogicalAst> = Vec::new(); for (field, json_path, phrase) in term_phrases { - for ast in - self.compute_logical_ast_for_leaf(field, json_path, phrase, literal.slop)? - { + for ast in self.compute_logical_ast_for_leaf( + field, + json_path, + phrase, + literal.slop, + literal.prefix, + )? { // Apply some field specific boost defined at the query parser level. 
let boost = self.field_boost(field); asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost)); @@ -753,9 +764,17 @@ fn convert_literal_to_query( Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)) } } - LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new( - PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop), - ), + LogicalLiteral::Phrase { + terms, + slop, + prefix, + } => { + if prefix { + Box::new(PhrasePrefixQuery::new_with_offset(terms)) + } else { + Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop)) + } + } LogicalLiteral::Range { field, value_type, @@ -774,8 +793,9 @@ fn generate_literals_for_str( field: Field, phrase: &str, slop: u32, + prefix: bool, + indexing_options: &TextFieldIndexing, text_analyzer: &TextAnalyzer, - index_record_option: IndexRecordOption, ) -> Result<Option<LogicalLiteral>, QueryParserError> { let mut terms: Vec<(usize, Term)> = Vec::new(); let mut token_stream = text_analyzer.token_stream(phrase); @@ -784,18 +804,28 @@ fn generate_literals_for_str( terms.push((token.position, term)); }); if terms.len() <= 1 { + if prefix { + return Err(QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms { + phrase: phrase.to_owned(), + tokenizer: indexing_options.tokenizer().to_owned(), + }); + } let term_literal_opt = terms .into_iter() .next() .map(|(_, term)| LogicalLiteral::Term(term)); return Ok(term_literal_opt); } - if !index_record_option.has_positions() { + if !indexing_options.index_option().has_positions() { return Err(QueryParserError::FieldDoesNotHavePositionsIndexed( field_name.to_string(), )); } - Ok(Some(LogicalLiteral::Phrase(terms, slop))) + Ok(Some(LogicalLiteral::Phrase { + terms, + slop, + prefix, + })) } fn generate_literals_for_json_object( @@ -841,7 +871,11 @@ fn generate_literals_for_json_object( field_name.to_string(), )); } - logical_literals.push(LogicalLiteral::Phrase(terms, 0)); + logical_literals.push(LogicalLiteral::Phrase { + terms, + slop: 0, + prefix: false, + }); Ok(logical_literals) } @@ -1643,6 
+1677,48 @@ mod test { ); } + #[test] + pub fn test_phrase_prefix() { + test_parse_query_to_logical_ast_helper( + "\"big bad wo\"*", + r#"("[(0, Term(field=0, type=Str, "big")), (1, Term(field=0, type=Str, "bad")), (2, Term(field=0, type=Str, "wo"))]"* "[(0, Term(field=1, type=Str, "big")), (1, Term(field=1, type=Str, "bad")), (2, Term(field=1, type=Str, "wo"))]"*)"#, + false, + ); + + let query_parser = make_query_parser(); + let query = query_parser.parse_query("\"big bad wo\"*").unwrap(); + assert_eq!( + format!("{query:?}"), + "BooleanQuery { subqueries: [(Should, PhrasePrefixQuery { field: Field(0), \ + phrase_terms: [(0, Term(field=0, type=Str, \"big\")), (1, Term(field=0, type=Str, \ + \"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \ + (Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \ + type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \ + Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }" + ); + } + + #[test] + pub fn test_phrase_prefix_too_short() { + let err = parse_query_to_logical_ast("\"wo\"*", true).unwrap_err(); + assert_eq!( + err, + QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms { + phrase: "wo".to_owned(), + tokenizer: "default".to_owned() + } + ); + + let err = parse_query_to_logical_ast("\"\"*", true).unwrap_err(); + assert_eq!( + err, + QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms { + phrase: "".to_owned(), + tokenizer: "default".to_owned() + } + ); + } + #[test] pub fn test_term_set_query() { test_parse_query_to_logical_ast_helper(