diff --git a/parser/llguidance.h b/parser/llguidance.h
index 9ea56e8..88eef6c 100644
--- a/parser/llguidance.h
+++ b/parser/llguidance.h
@@ -28,6 +28,11 @@ typedef struct LlgParserLimits {
    * Default: 500_000 (~10ms)
    */
   uint64_t step_lexer_fuel;
+  /**
+   * Number of Earley items created for the whole token mask.
+   * Default: 100_000 (~3ms)
+   */
+  size_t step_max_items;
   /**
    * Maximum number of lexer states.
    * Default: 10_000
diff --git a/parser/src/api.rs b/parser/src/api.rs
index c1b6c26..24c7610 100644
--- a/parser/src/api.rs
+++ b/parser/src/api.rs
@@ -378,6 +378,10 @@ pub struct ParserLimits {
     /// Default: 500_000 (~10ms)
     pub step_lexer_fuel: u64,
 
+    /// Number of Earley items created for the whole token mask.
+    /// Default: 100_000 (~3ms)
+    pub step_max_items: usize,
+
     /// Maximum number of lexer states.
     /// Default: 10_000
     pub max_lexer_states: usize,
@@ -395,6 +399,7 @@ impl Default for ParserLimits {
             step_lexer_fuel: 500_000,  // 500k => 10ms
             max_lexer_states: 10_000,  // ?
             max_grammar_size: 500_000, // fhir schema => 200k
+            step_max_items: 100_000,   //
         }
     }
 }
diff --git a/parser/src/earley/mod.rs b/parser/src/earley/mod.rs
index ad555db..9f6e59c 100644
--- a/parser/src/earley/mod.rs
+++ b/parser/src/earley/mod.rs
@@ -9,4 +9,6 @@ pub mod regexvec;
 pub use from_guidance::grammars_from_json;
 #[allow(unused_imports)]
 pub use grammar::{CGrammar, CSymIdx, Grammar};
-pub use parser::{BiasComputer, DefaultBiasComputer, Parser, ParserRecognizer, ParserStats};
+pub use parser::{
+    BiasComputer, DefaultBiasComputer, Parser, ParserError, ParserRecognizer, ParserStats,
+};
diff --git a/parser/src/earley/parser.rs b/parser/src/earley/parser.rs
index 265f3b4..d38011a 100644
--- a/parser/src/earley/parser.rs
+++ b/parser/src/earley/parser.rs
@@ -20,7 +20,7 @@ use serde::{Deserialize, Serialize};
 use toktrie::{Recognizer, SimpleVob, SpecialToken, TokEnv, TokTrie, TokenId};
 
 use crate::{
-    api::{GenGrammarOptions, ParserLimits},
+    api::{GenGrammarOptions, ParserLimits, StopReason},
     earley::lexer::Lexer,
 };
@@ -237,6 +237,8 @@ struct ParserState {
     trie_gen_grammar: Option<CSymIdx>,
     trie_gen_grammar_accepting: bool,
     limits: ParserLimits,
+    max_all_items: usize,
+    parser_error: Option<String>,
 }
 
 #[derive(Clone)]
@@ -371,6 +373,7 @@
             last_collapse: 0,
             token_idx: 0,
             byte_idx: 0,
+            max_all_items: usize::MAX,
             options,
             trie_gen_grammar: None,
             trie_gen_grammar_accepting: false,
@@ -380,6 +383,7 @@
                 lexer_state,
                 byte: None,
             }],
+            parser_error: None,
         };
 
         // Initialize the Earley table with the predictions in
@@ -426,12 +430,23 @@
         dfa.set_fuel(self.limits.step_lexer_fuel);
         dfa.set_max_states(self.limits.max_lexer_states);
 
+        self.max_all_items = self.stats.all_items + self.limits.step_max_items as usize;
+
         let mut r = ParserRecognizer {
             shared,
             state: self,
         };
         let mut set = computer.compute_bias(&mut r, start);
 
+        if self.stats.all_items > self.max_all_items && self.parser_error.is_none() {
+            self.parser_error = Some(format!(
+                "Too many items (limit {}); try avoiding single-byte/short lexemes",
+                self.limits.step_max_items
+            ));
+        }
+
+        self.max_all_items = usize::MAX;
+
         self.stats.lexer_cost = shared.lexer.dfa.total_fuel_spent();
 
         // The SPECIAL_TOKEN_PREFIX_BYTE should never be allowed by itself
@@ -1528,6 +1543,10 @@ impl ParserState {
     // This is never inlined anyways, so better make it formal
     #[inline(never)]
     fn advance_parser(&mut self, shared: &mut SharedState, pre_lexeme: PreLexeme) -> bool {
+        if self.stats.all_items > self.max_all_items {
+            return false;
+        }
+
         // this byte will be applied to the next lexeme
         let transition_byte = if pre_lexeme.byte_next_row {
             pre_lexeme.byte
@@ -1732,6 +1751,27 @@ fn item_to_string(g: &CGrammar, item: &Item) -> String {
     )
 }
 
+pub enum ParserError {
+    LexerError(String),
+    ParserError(String),
+}
+
+impl ParserError {
+    pub fn stop_reason(&self) -> StopReason {
+        match self {
+            ParserError::LexerError(_) => StopReason::LexerTooComplex,
+            ParserError::ParserError(_) => StopReason::ParserTooComplex,
+        }
+    }
+
+    pub fn message(&self) -> String {
+        match self {
+            ParserError::LexerError(s) => format!("lexer error: {}", s),
+            ParserError::ParserError(s) => format!("parser error: {}", s),
+        }
+    }
+}
+
 impl Parser {
     pub fn new(
         grammar: Arc<CGrammar>,
@@ -1792,9 +1832,15 @@
         shared.lexer.dfa.stats()
     }
 
-    pub fn lexer_error(&self) -> Option<String> {
+    pub fn get_error(&self) -> Option<ParserError> {
         let shared = self.shared.lock().unwrap();
-        shared.lexer.dfa.get_error()
+        if let Some(e) = shared.lexer.dfa.get_error() {
+            return Some(ParserError::LexerError(e));
+        }
+        if let Some(e) = &self.state.parser_error {
+            return Some(ParserError::ParserError(e.clone()));
+        }
+        None
     }
 
     pub fn with_recognizer<T>(&mut self, f: impl FnOnce(&mut ParserRecognizer) -> T) -> T {
diff --git a/parser/src/tokenparser.rs b/parser/src/tokenparser.rs
index f8e413a..214f271 100644
--- a/parser/src/tokenparser.rs
+++ b/parser/src/tokenparser.rs
@@ -4,7 +4,7 @@ use crate::{
     api::{GenGrammarOptions, ParserLimits, StopReason, TopLevelGrammar},
     earley::{
         grammars_from_json, BiasComputer, CGrammar, CSymIdx, DefaultBiasComputer, Parser,
-        ParserStats,
+        ParserError, ParserStats,
     },
     infoln, warn, Logger,
 };
@@ -384,6 +384,10 @@ impl TokenParser {
         r
     }
 
+    fn stop_for_parser_error(&mut self, pref: &str, err: ParserError) -> StepResult {
+        self.stop(&format!("{}{}", pref, err.message()), err.stop_reason())
+    }
+
     fn mid_process_inner(&mut self, mut arg: StepArg) -> StepResult {
         let start_time = instant::Instant::now();
 
@@ -650,14 +654,14 @@
             .parser
             .compute_bias(&*self.bias_computer, &token_prefix);
         let p_stats = self.parser.stats().delta(&pre_stats);
-        if let Some(err) = self.parser.lexer_error() {
-            let err = format!("lexer error: {}", err);
-            return self.stop(&err, StopReason::LexerTooComplex);
-        }
         self.last_bias_time = Duration::from_micros(p_stats.compute_time_us);
         self.last_step_stats = p_stats.clone();
         self.max_step_stats = self.max_step_stats.max(&p_stats);
 
+        if let Some(err) = self.parser.get_error() {
+            return self.stop_for_parser_error("", err);
+        }
+
         if inner_accepting {
             let mut all_accepting = true;
             if self.parser_stack.len() > 0 {
@@ -668,9 +672,8 @@
                 let (is_accepting, mask) = pentry
                     .parser
                     .compute_bias_after_gen_grammar(&*self.bias_computer, pentry.symidx);
-                if let Some(err) = pentry.parser.lexer_error() {
-                    let err = format!("lexer error (inner): {}", err);
-                    return self.stop(&err, StopReason::LexerTooComplex);
+                if let Some(err) = pentry.parser.get_error() {
+                    return self.stop_for_parser_error("inner parser: ", err);
                 }
                 infoln!(self, "bias for upper parser: {}", trie.token_set_dbg(&mask));
                 pentry.mask = Some(mask);
diff --git a/sample_parser/run.sh b/sample_parser/run.sh
index 9e1fb41..88742c7 100755
--- a/sample_parser/run.sh
+++ b/sample_parser/run.sh
@@ -1,9 +1,9 @@
 #!/bin/sh
 
 # cargo run data/blog.schema.ll.json data/blog.sample.json
-# cargo run data/blog.schema.json data/blog.sample.json
+cargo run --release data/blog.schema.json data/blog.sample.json
 # cargo run --release --bin minimal data/blog.schema.json data/blog.sample.json
-cargo run --release data/rfc.lark data/rfc.xml
+# cargo run --release data/rfc.lark data/rfc.xml
 # mkdir -p tmp
 # strip -o tmp/minimal ../../target/release/minimal
 # ls -l ../../target/release/minimal tmp/minimal
diff --git a/sample_parser/src/sample_parser.rs b/sample_parser/src/sample_parser.rs
index 0550ffb..208cb28 100644
--- a/sample_parser/src/sample_parser.rs
+++ b/sample_parser/src/sample_parser.rs
@@ -85,7 +85,7 @@ fn main() {
         let p_stats = constraint.parser.last_step_stats();
         println!(
-            "SAMPLE {}: {} {}; stats: {} lex, {} rows, {} us",
+            "SAMPLE {}: {} {}; stats: {} lex, {} items, {} us",
             idx,
             sampled_token,
             tok_env.tok_trie().token_dbg(sampled_token),