From de92b7d3934fd2fdfdc33788f3b03e830ea45cb7 Mon Sep 17 00:00:00 2001 From: Michal Moskal Date: Tue, 12 Nov 2024 14:53:14 -0800 Subject: [PATCH] add nested grammars and special tokens to lark syntax --- parser/src/api.rs | 34 +++++++++++++++++++++++++++--- parser/src/earley/from_guidance.rs | 26 +++++++++++++++++------ parser/src/earley/grammar.rs | 23 ++++++++++++++------ parser/src/earley/parser.rs | 6 ++++-- parser/src/grammar_builder.rs | 13 +++++++++++- parser/src/lark/README.md | 11 +++++++++- parser/src/lark/ast.rs | 2 ++ parser/src/lark/compiler.rs | 22 ++++++++++++++++++- parser/src/lark/lexer.rs | 4 ++++ parser/src/lark/parser.rs | 4 ++++ parser/src/tokenparser.rs | 2 +- 11 files changed, 125 insertions(+), 22 deletions(-) diff --git a/parser/src/api.rs b/parser/src/api.rs index ec44552..c1b6c26 100644 --- a/parser/src/api.rs +++ b/parser/src/api.rs @@ -1,4 +1,4 @@ -use std::fmt::Debug; +use std::fmt::{Debug, Display}; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -19,6 +19,9 @@ pub const DEFAULT_CONTEXTUAL: bool = true; #[derive(Serialize, Deserialize, Clone, Default)] pub struct GrammarWithLexer { + /// The name of this grammar, can be used in GenGrammar nodes. + pub name: Option, + /// The start symbol is at nodes[0] /// When nodes is empty, then one of json_schema or lark_grammar must be set. #[serde(default)] @@ -257,6 +260,31 @@ impl RegexSpec { } } +#[derive(Serialize, Deserialize, Hash, PartialEq, Eq, Clone, Debug)] +#[serde(untagged)] +pub enum GrammarId { + Index(usize), + Name(String), +} + +impl GrammarId { + pub fn to_index(&self) -> Option { + match self { + GrammarId::Index(i) => Some(*i), + GrammarId::Name(_) => None, + } + } +} + +impl Display for GrammarId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + GrammarId::Index(i) => write!(f, "@#{}", i), + GrammarId::Name(s) => write!(f, "@{}", s), + } + } +} + macro_rules! 
id_type { ($name:ident) => { #[derive(Serialize, Deserialize, Hash, PartialEq, Eq, Clone, Copy, Debug)] @@ -265,7 +293,6 @@ macro_rules! id_type { }; } -id_type!(GrammarId); id_type!(NodeId); id_type!(RegexId); @@ -286,7 +313,7 @@ impl Node { impl Default for GenGrammarOptions { fn default() -> Self { GenGrammarOptions { - grammar: GrammarId(0), + grammar: GrammarId::Index(0), temperature: None, max_tokens_grm: usize::MAX, } @@ -376,6 +403,7 @@ impl TopLevelGrammar { pub fn from_regex(rx: RegexNode) -> Self { TopLevelGrammar { grammars: vec![GrammarWithLexer { + name: Some("regex_grammar".to_string()), nodes: vec![Node::Lexeme { rx: RegexSpec::RegexId(RegexId(0)), contextual: None, diff --git a/parser/src/earley/from_guidance.rs b/parser/src/earley/from_guidance.rs index f4b82ac..68c879c 100644 --- a/parser/src/earley/from_guidance.rs +++ b/parser/src/earley/from_guidance.rs @@ -1,10 +1,11 @@ +use std::collections::HashMap; use std::fmt::Write; use std::{sync::Arc, vec}; use super::{grammar::SymbolProps, lexerspec::LexerSpec, CGrammar, Grammar}; use crate::api::{ - GrammarWithLexer, Node, ParserLimits, RegexId, RegexNode, RegexSpec, TopLevelGrammar, - DEFAULT_CONTEXTUAL, + GrammarId, GrammarWithLexer, Node, ParserLimits, RegexId, RegexNode, RegexSpec, + TopLevelGrammar, DEFAULT_CONTEXTUAL, }; use crate::{lark_to_llguidance, loginfo, JsonCompileOptions, Logger}; use anyhow::{bail, ensure, Result}; @@ -304,14 +305,26 @@ pub fn grammars_from_json( extra_lexemes: Vec, ) -> Result>> { let t0 = Instant::now(); - let grammars = input + let mut grammars = input .grammars .into_iter() .map(|g| grammar_from_json(tok_env, &mut limits, g)) .collect::>>()?; - for (_, g) in &grammars { - g.validate_grammar_refs(&grammars)?; + let mut grammar_by_idx = HashMap::new(); + for (idx, (_, g)) in grammars.iter().enumerate() { + grammar_by_idx.insert(GrammarId::Index(idx), idx); + if let Some(n) = g.name() { + let n = GrammarId::Name(n.to_string()); + if grammar_by_idx.contains_key(&n) { 
+ bail!("duplicate grammar name: {}", n); + } + grammar_by_idx.insert(n, idx); + } + } + + for (_, g) in grammars.iter_mut() { + g.validate_grammar_refs(&grammar_by_idx)?; } let t1 = Instant::now(); @@ -327,8 +340,9 @@ pub fn grammars_from_json( if log_grammar { writeln!( logger.info_logger(), - "Grammar #{}:\n{:?}\n{:?}\n", + "Grammar #{} {}:\n{:?}\n{:?}\n", idx, + grm.name().unwrap_or(""), lex, grm ) diff --git a/parser/src/earley/grammar.rs b/parser/src/earley/grammar.rs index cbbb8bf..e80e41d 100644 --- a/parser/src/earley/grammar.rs +++ b/parser/src/earley/grammar.rs @@ -1,8 +1,8 @@ -use std::{fmt::Debug, hash::Hash}; +use std::{collections::HashMap, fmt::Debug, hash::Hash}; use anyhow::{bail, ensure, Result}; -use crate::api::GenGrammarOptions; +use crate::api::{GenGrammarOptions, GrammarId}; use super::lexerspec::{LexemeIdx, LexerSpec}; use rustc_hash::FxHashMap; @@ -126,6 +126,7 @@ impl Rule { } pub struct Grammar { + name: Option, symbols: Vec, symbol_by_name: FxHashMap, } @@ -133,6 +134,7 @@ pub struct Grammar { impl Grammar { pub fn new() -> Self { Grammar { + name: None, symbols: vec![], symbol_by_name: FxHashMap::default(), } @@ -338,6 +340,7 @@ impl Grammar { } let mut outp = Grammar::new(); + outp.name = self.name.clone(); let start_data = self.sym_data(self.start()); if start_data.is_terminal() || start_data.rules.iter().any(|r| r.rhs.is_empty()) { @@ -370,16 +373,22 @@ impl Grammar { r } + pub fn name(&self) -> Option<&str> { + self.name.as_deref() + } + pub fn compile(&self, lexer_spec: LexerSpec) -> CGrammar { CGrammar::from_grammar(self, lexer_spec) } - pub fn validate_grammar_refs(&self, grammars: &[(LexerSpec, Grammar)]) -> Result<()> { - for sym in &self.symbols { + pub fn validate_grammar_refs(&mut self, ctx: &HashMap) -> Result<()> { + for sym in &mut self.symbols { match sym.gen_grammar { - Some(ref opts) => { - if opts.grammar.0 >= grammars.len() { - bail!("unknown grammar {}", opts.grammar.0); + Some(ref mut opts) => { + if let Some(idx) 
= ctx.get(&opts.grammar) { + opts.grammar = GrammarId::Index(*idx); + } else { + bail!("unknown grammar {}", opts.grammar); } } None => {} diff --git a/parser/src/earley/parser.rs b/parser/src/earley/parser.rs index 60f7b1c..1c7de11 100644 --- a/parser/src/earley/parser.rs +++ b/parser/src/earley/parser.rs @@ -963,7 +963,10 @@ impl ParserState { let sym_data = self.grammar.sym_data_dot(pos); if let Some(ref gg) = sym_data.gen_grammar { // break ties by preferring the one with the lowest grammar number - if res.is_none() || res.as_ref().unwrap().grammar.0 > gg.grammar.0 { + if res.is_none() + || res.as_ref().unwrap().grammar.to_index().unwrap() + > gg.grammar.to_index().unwrap() + { res = Some(gg.clone()); res_idx = Some(idx); } @@ -1496,7 +1499,6 @@ impl ParserState { } } - /// Advance the parser with given 'pre_lexeme'. /// On return, the lexer_state will be the state *after* consuming /// 'pre_lexeme'. As a special case, a following single byte lexeme diff --git a/parser/src/grammar_builder.rs b/parser/src/grammar_builder.rs index 79c2f98..99ad23b 100644 --- a/parser/src/grammar_builder.rs +++ b/parser/src/grammar_builder.rs @@ -3,7 +3,8 @@ use std::{collections::HashMap, sync::atomic::AtomicU32}; use anyhow::{ensure, Result}; use crate::api::{ - GrammarWithLexer, Node, NodeId, NodeProps, RegexId, RegexNode, RegexSpec, TopLevelGrammar, + GenGrammarOptions, GrammarId, GrammarWithLexer, Node, NodeId, NodeProps, RegexId, RegexNode, + RegexSpec, TopLevelGrammar, }; #[derive(Clone, Copy, PartialEq, Eq, Debug)] @@ -192,6 +193,16 @@ impl GrammarBuilder { }) } + pub fn gen_grammar(&mut self, name: GrammarId) -> NodeRef { + self.add_node(Node::GenGrammar { + data: GenGrammarOptions { + grammar: name, + ..GenGrammarOptions::default() + }, + props: NodeProps::default(), + }) + } + pub fn lexeme(&mut self, rx: RegexSpec, json_quoted: bool) -> NodeRef { self.add_node(Node::Lexeme { rx, diff --git a/parser/src/lark/README.md b/parser/src/lark/README.md index 45e7536..3f032e7 
100644 --- a/parser/src/lark/README.md +++ b/parser/src/lark/README.md @@ -4,6 +4,16 @@ This module converts from [Lark-like](https://github.com/lark-parser/lark) synta It makes it easier to get started with a new grammar, and provides a familiar syntax, however is not a drop-in replacement for Lark. +Following are the extensions to Lark syntax: + +- when several grammars are passed in one request (`grammars` field), + the ones using Lark can reference others using syntax like `@17` referring + to grammar at index 17 in the `grammars` list, or `@my_grammar` referring to grammar + with `"name": "my_grammar"`. +- special tokens can be referenced via `<token_name>` syntax, for example `<|ENDOFTEXT|>`; + they cannot be used inside of terminals, but can be used in regular rules; + the exact syntax depends on the tokenizer + Following are currently not supported: - lookarounds in lexer regexes @@ -15,7 +25,6 @@ Following features of llguidance are currently not exposed in Lark syntax: -- composite/nested grammars - `max_tokens` limits - hiding of `stop=...` - per-lexeme contextual and lazy flags diff --git a/parser/src/lark/ast.rs b/parser/src/lark/ast.rs index be728ac..9dde28a 100644 --- a/parser/src/lark/ast.rs +++ b/parser/src/lark/ast.rs @@ -107,6 +107,8 @@ pub enum Value { Name(String), LiteralString(String, String), LiteralRegex(String, String), + GrammarRef(String), + SpecialToken(String), #[allow(dead_code)] TemplateUsage { name: String, diff --git a/parser/src/lark/compiler.rs b/parser/src/lark/compiler.rs index 3c48a99..21e2902 100644 --- a/parser/src/lark/compiler.rs +++ b/parser/src/lark/compiler.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::{anyhow, bail, ensure, Result}; use crate::{ - api::{GrammarWithLexer, RegexId, RegexSpec, TopLevelGrammar}, + api::{GrammarId, GrammarWithLexer, RegexId, RegexSpec, TopLevelGrammar}, GrammarBuilder, NodeRef, }; @@ -130,6 +130,15 @@ impl Compiler { }; self.mk_regex("regex", rx) }
Value::SpecialToken(s) => { + bail!("special tokens (like {:?}) cannot be used as terminals", s); + } + Value::GrammarRef(g) => { + bail!( + "grammar references (like {:?}) cannot be used as terminals", + g + ); + } Value::TemplateUsage { .. } => bail!("template usage not supported yet"), }, } @@ -200,6 +209,17 @@ impl Compiler { bail!("unknown name: {:?}", n); } } + Value::SpecialToken(s) => return Ok(self.builder.special_token(s)), + Value::GrammarRef(g) => { + assert!(g.starts_with("@")); + // see if g[1..] is an integer + let id = if let Ok(id) = g[1..].parse::() { + GrammarId::Index(id) + } else { + GrammarId::Name(g[1..].to_string()) + }; + return Ok(self.builder.gen_grammar(id)); + } Value::LiteralRange(_, _) | Value::LiteralString(_, _) | Value::LiteralRegex(_, _) => { diff --git a/parser/src/lark/lexer.rs b/parser/src/lark/lexer.rs index d025774..36bc358 100644 --- a/parser/src/lark/lexer.rs +++ b/parser/src/lark/lexer.rs @@ -35,6 +35,8 @@ pub enum Token { Number, Newline, VBar, + SpecialToken, // + GrammarRef, // @grammar_id or @7 // special SKIP, EOF, @@ -100,6 +102,8 @@ impl Token { (Token::Regexp, r#"/(\\.|[^/\\])+/[imslux]*"#), (Token::Number, r#"[+-]?[0-9]+"#), (Token::Newline, r"(\r?\n)+[ \t]*"), + (Token::SpecialToken, r"<[^<>\s]+>"), + (Token::GrammarRef, r"@[a-zA-Z0-9_\-]+"), ]; } diff --git a/parser/src/lark/parser.rs b/parser/src/lark/parser.rs index 030934f..80471a0 100644 --- a/parser/src/lark/parser.rs +++ b/parser/src/lark/parser.rs @@ -318,6 +318,10 @@ impl Parser { let flags = inner[last_slash_idx + 1..].to_string(); let regex = inner[1..last_slash_idx].to_string(); Ok(Value::LiteralRegex(regex, flags)) + } else if let Some(grammar_ref) = self.match_token_with_value(Token::GrammarRef) { + Ok(Value::GrammarRef(grammar_ref.value.clone())) + } else if let Some(special_token) = self.match_token_with_value(Token::SpecialToken) { + Ok(Value::SpecialToken(special_token.value.clone())) } else if let Some(name_token) = self 
.match_token_with_value(Token::Rule) .or_else(|| self.match_token_with_value(Token::Token)) diff --git a/parser/src/tokenparser.rs b/parser/src/tokenparser.rs index ff7d904..225956a 100644 --- a/parser/src/tokenparser.rs +++ b/parser/src/tokenparser.rs @@ -709,7 +709,7 @@ impl TokenParser { if msg.len() > 0 { warn!(self, "{}", msg); } - let grm = Arc::clone(&self.compiled_grammars[gen_grammar.grammar.0]); + let grm = Arc::clone(&self.compiled_grammars[gen_grammar.grammar.to_index().unwrap()]); let max_tokens = self.parser.grammar().sym_data(symidx).props.max_tokens; let parser = Parser::new(grm, gen_grammar, self.limits.clone())?; let mut old_parser = std::mem::replace(&mut self.parser, parser);