add parser item limit
mmoskal committed Nov 15, 2024
1 parent f49791c commit b5ca97b
Showing 7 changed files with 76 additions and 15 deletions.
5 changes: 5 additions & 0 deletions parser/llguidance.h
@@ -28,6 +28,11 @@ typedef struct LlgParserLimits {
* Default: 500_000 (~10ms)
*/
uint64_t step_lexer_fuel;
/**
* Number of Earley items created for the whole token mask.
* Default: 100_000 (~3ms)
*/
size_t step_max_items;
/**
* Maximum number of lexer states.
* Default: 10_000
5 changes: 5 additions & 0 deletions parser/src/api.rs
@@ -378,6 +378,10 @@ pub struct ParserLimits {
/// Default: 500_000 (~10ms)
pub step_lexer_fuel: u64,

/// Number of Earley items created for the whole token mask.
/// Default: 100_000 (~3ms)
pub step_max_items: usize,

/// Maximum number of lexer states.
/// Default: 10_000
pub max_lexer_states: usize,
@@ -395,6 +399,7 @@ impl Default for ParserLimits {
step_lexer_fuel: 500_000, // 500k => 10ms
max_lexer_states: 10_000, // ?
max_grammar_size: 500_000, // fhir schema => 200k
step_max_items: 100_000,  // 100k => ~3ms
}
}
}
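The new `step_max_items` field bounds how many Earley items the parser may create while computing a single token mask. A minimal sketch of overriding it while keeping the other defaults (the `llguidance_parser` crate path is an assumption; the field names are the ones declared above):

```rust
// Sketch only: the crate path is an assumption; field names match the
// `ParserLimits` struct above.
use llguidance_parser::api::ParserLimits;

fn tighter_limits() -> ParserLimits {
    ParserLimits {
        // Allow fewer Earley items per token mask than the default 100_000.
        step_max_items: 50_000,
        ..ParserLimits::default()
    }
}
```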
4 changes: 3 additions & 1 deletion parser/src/earley/mod.rs
@@ -9,4 +9,6 @@ pub mod regexvec;
pub use from_guidance::grammars_from_json;
#[allow(unused_imports)]
pub use grammar::{CGrammar, CSymIdx, Grammar};
pub use parser::{BiasComputer, DefaultBiasComputer, Parser, ParserRecognizer, ParserStats};
pub use parser::{
BiasComputer, DefaultBiasComputer, Parser, ParserError, ParserRecognizer, ParserStats,
};
52 changes: 49 additions & 3 deletions parser/src/earley/parser.rs
@@ -20,7 +20,7 @@ use serde::{Deserialize, Serialize};
use toktrie::{Recognizer, SimpleVob, SpecialToken, TokEnv, TokTrie, TokenId};

use crate::{
api::{GenGrammarOptions, ParserLimits},
api::{GenGrammarOptions, ParserLimits, StopReason},
earley::lexer::Lexer,
};

@@ -237,6 +237,8 @@ struct ParserState {
trie_gen_grammar: Option<CSymIdx>,
trie_gen_grammar_accepting: bool,
limits: ParserLimits,
max_all_items: usize,
parser_error: Option<String>,
}

#[derive(Clone)]
Expand Down Expand Up @@ -371,6 +373,7 @@ impl ParserState {
last_collapse: 0,
token_idx: 0,
byte_idx: 0,
max_all_items: usize::MAX,
options,
trie_gen_grammar: None,
trie_gen_grammar_accepting: false,
@@ -380,6 +383,7 @@
lexer_state,
byte: None,
}],
parser_error: None,
};

// Initialize the Earley table with the predictions in
@@ -426,12 +430,23 @@ impl ParserState {
dfa.set_fuel(self.limits.step_lexer_fuel);
dfa.set_max_states(self.limits.max_lexer_states);

self.max_all_items = self.stats.all_items + self.limits.step_max_items as usize;

let mut r = ParserRecognizer {
shared,
state: self,
};
let mut set = computer.compute_bias(&mut r, start);

if self.stats.all_items > self.max_all_items && self.parser_error.is_none() {
self.parser_error = Some(format!(
"Too many items (limit {}); try avoiding single-byte/short lexemes",
self.limits.step_max_items
));
}

self.max_all_items = usize::MAX;

self.stats.lexer_cost = shared.lexer.dfa.total_fuel_spent();

// The SPECIAL_TOKEN_PREFIX_BYTE should never be allowed by itself
@@ -1528,6 +1543,10 @@ impl ParserState {
// This is never inlined anyways, so better make it formal
#[inline(never)]
fn advance_parser(&mut self, shared: &mut SharedState, pre_lexeme: PreLexeme) -> bool {
if self.stats.all_items > self.max_all_items {
return false;
}

// this byte will be applied to the next lexeme
let transition_byte = if pre_lexeme.byte_next_row {
pre_lexeme.byte
@@ -1732,6 +1751,27 @@ fn item_to_string(g: &CGrammar, item: &Item) -> String {
)
}

pub enum ParserError {
LexerError(String),
ParserError(String),
}

impl ParserError {
pub fn stop_reason(&self) -> StopReason {
match self {
ParserError::LexerError(_) => StopReason::LexerTooComplex,
ParserError::ParserError(_) => StopReason::ParserTooComplex,
}
}

pub fn message(&self) -> String {
match self {
ParserError::LexerError(s) => format!("lexer error: {}", s),
ParserError::ParserError(s) => format!("parser error: {}", s),
}
}
}

impl Parser {
pub fn new(
grammar: Arc<CGrammar>,
@@ -1792,9 +1832,15 @@ impl Parser {
shared.lexer.dfa.stats()
}

pub fn lexer_error(&self) -> Option<String> {
pub fn get_error(&self) -> Option<ParserError> {
let shared = self.shared.lock().unwrap();
shared.lexer.dfa.get_error()
if let Some(e) = shared.lexer.dfa.get_error() {
return Some(ParserError::LexerError(e));
}
if let Some(e) = &self.state.parser_error {
return Some(ParserError::ParserError(e.clone()));
}
None
}

pub fn with_recognizer<T>(&mut self, f: impl FnOnce(&mut ParserRecognizer) -> T) -> T {
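With `get_error()`, callers see both failure modes (lexer fuel/state exhaustion and the new item limit) through one accessor instead of the old `lexer_error()`. A hedged sketch of consuming it (crate and module paths are assumptions; `Parser`, `ParserError`, and `StopReason` are the types above):

```rust
// Sketch only: crate/module paths are assumptions; get_error(),
// stop_reason(), and message() are the methods introduced in this commit.
use llguidance_parser::{api::StopReason, earley::{Parser, ParserError}};

fn check_parser(parser: &Parser) -> Result<(), (StopReason, String)> {
    match parser.get_error() {
        // Either the lexer blew its fuel/state budget or the parser hit
        // step_max_items while computing the mask.
        Some(err) => Err((err.stop_reason(), err.message())),
        None => Ok(()),
    }
}
```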
19 changes: 11 additions & 8 deletions parser/src/tokenparser.rs
@@ -4,7 +4,7 @@ use crate::{
api::{GenGrammarOptions, ParserLimits, StopReason, TopLevelGrammar},
earley::{
grammars_from_json, BiasComputer, CGrammar, CSymIdx, DefaultBiasComputer, Parser,
ParserStats,
ParserError, ParserStats,
},
infoln, warn, Logger,
};
@@ -384,6 +384,10 @@ impl TokenParser {
r
}

fn stop_for_parser_error(&mut self, pref: &str, err: ParserError) -> StepResult {
self.stop(&format!("{}{}", pref, err.message()), err.stop_reason())
}

fn mid_process_inner(&mut self, mut arg: StepArg) -> StepResult {
let start_time = instant::Instant::now();

@@ -650,14 +654,14 @@ impl TokenParser {
.parser
.compute_bias(&*self.bias_computer, &token_prefix);
let p_stats = self.parser.stats().delta(&pre_stats);
if let Some(err) = self.parser.lexer_error() {
let err = format!("lexer error: {}", err);
return self.stop(&err, StopReason::LexerTooComplex);
}
self.last_bias_time = Duration::from_micros(p_stats.compute_time_us);
self.last_step_stats = p_stats.clone();
self.max_step_stats = self.max_step_stats.max(&p_stats);

if let Some(err) = self.parser.get_error() {
return self.stop_for_parser_error("", err);
}

if inner_accepting {
let mut all_accepting = true;
if self.parser_stack.len() > 0 {
Expand All @@ -668,9 +672,8 @@ impl TokenParser {
let (is_accepting, mask) = pentry
.parser
.compute_bias_after_gen_grammar(&*self.bias_computer, pentry.symidx);
if let Some(err) = pentry.parser.lexer_error() {
let err = format!("lexer error (inner): {}", err);
return self.stop(&err, StopReason::LexerTooComplex);
if let Some(err) = pentry.parser.get_error() {
return self.stop_for_parser_error("inner parser: ", err);
}
infoln!(self, "bias for upper parser: {}", trie.token_set_dbg(&mask));
pentry.mask = Some(mask);
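The helper makes both error sources stop with a uniform message: `err.message()` already carries the `lexer error:` / `parser error:` tag, and `pref` adds the `inner parser:` context for the nested parser. A small illustrative sketch (the free function, `main`, and the sample error text are illustrative; only `ParserError`, `StopReason`, and the format logic come from the commit):

```rust
// Sketch only: mirrors the format!("{}{}", pref, err.message()) call in
// TokenParser::stop_for_parser_error; crate paths are assumptions.
use llguidance_parser::{api::StopReason, earley::ParserError};

fn stop_message(pref: &str, err: &ParserError) -> (String, StopReason) {
    (format!("{}{}", pref, err.message()), err.stop_reason())
}

fn main() {
    // Hypothetical error text in the shape produced by compute_bias above.
    let err = ParserError::ParserError("Too many items (limit 100000)".to_string());
    let (msg, reason) = stop_message("inner parser: ", &err);
    assert_eq!(msg, "inner parser: parser error: Too many items (limit 100000)");
    assert!(matches!(reason, StopReason::ParserTooComplex));
}
```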
4 changes: 2 additions & 2 deletions sample_parser/run.sh
@@ -1,9 +1,9 @@
#!/bin/sh

# cargo run data/blog.schema.ll.json data/blog.sample.json
# cargo run data/blog.schema.json data/blog.sample.json
cargo run --release data/blog.schema.json data/blog.sample.json
# cargo run --release --bin minimal data/blog.schema.json data/blog.sample.json
cargo run --release data/rfc.lark data/rfc.xml
# cargo run --release data/rfc.lark data/rfc.xml
# mkdir -p tmp
# strip -o tmp/minimal ../../target/release/minimal
# ls -l ../../target/release/minimal tmp/minimal
2 changes: 1 addition & 1 deletion sample_parser/src/sample_parser.rs
@@ -85,7 +85,7 @@ fn main() {

let p_stats = constraint.parser.last_step_stats();
println!(
"SAMPLE {}: {} {}; stats: {} lex, {} rows, {} us",
"SAMPLE {}: {} {}; stats: {} lex, {} items, {} us",
idx,
sampled_token,
tok_env.tok_trie().token_dbg(sampled_token),
