From de92b7d3934fd2fdfdc33788f3b03e830ea45cb7 Mon Sep 17 00:00:00 2001 From: Michal Moskal Date: Tue, 12 Nov 2024 14:53:14 -0800 Subject: [PATCH] add nested grammars and special tokens to lark syntax --- parser/src/api.rs | 34 +++++++++++++++++++++++++++--- parser/src/earley/from_guidance.rs | 26 +++++++++++++++++------ parser/src/earley/grammar.rs | 23 ++++++++++++++------ parser/src/earley/parser.rs | 6 ++++-- parser/src/grammar_builder.rs | 13 +++++++++++- parser/src/lark/README.md | 11 +++++++++- parser/src/lark/ast.rs | 2 ++ parser/src/lark/compiler.rs | 22 ++++++++++++++++++- parser/src/lark/lexer.rs | 4 ++++ parser/src/lark/parser.rs | 4 ++++ parser/src/tokenparser.rs | 2 +- 11 files changed, 125 insertions(+), 22 deletions(-) diff --git a/parser/src/api.rs b/parser/src/api.rs index ec44552..c1b6c26 100644 --- a/parser/src/api.rs +++ b/parser/src/api.rs @@ -1,4 +1,4 @@ -use std::fmt::Debug; +use std::fmt::{Debug, Display}; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -19,6 +19,9 @@ pub const DEFAULT_CONTEXTUAL: bool = true; #[derive(Serialize, Deserialize, Clone, Default)] pub struct GrammarWithLexer { + /// The name of this grammar, can be used in GenGrammar nodes. + pub name: Option, + /// The start symbol is at nodes[0] /// When nodes is empty, then one of json_schema or lark_grammar must be set. #[serde(default)] @@ -257,6 +260,31 @@ impl RegexSpec { } } +#[derive(Serialize, Deserialize, Hash, PartialEq, Eq, Clone, Debug)] +#[serde(untagged)] +pub enum GrammarId { + Index(usize), + Name(String), +} + +impl GrammarId { + pub fn to_index(&self) -> Option { + match self { + GrammarId::Index(i) => Some(*i), + GrammarId::Name(_) => None, + } + } +} + +impl Display for GrammarId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + GrammarId::Index(i) => write!(f, "@#{}", i), + GrammarId::Name(s) => write!(f, "@{}", s), + } + } +} + macro_rules! 
id_type { ($name:ident) => { #[derive(Serialize, Deserialize, Hash, PartialEq, Eq, Clone, Copy, Debug)] @@ -265,7 +293,6 @@ macro_rules! id_type { }; } -id_type!(GrammarId); id_type!(NodeId); id_type!(RegexId); @@ -286,7 +313,7 @@ impl Node { impl Default for GenGrammarOptions { fn default() -> Self { GenGrammarOptions { - grammar: GrammarId(0), + grammar: GrammarId::Index(0), temperature: None, max_tokens_grm: usize::MAX, } @@ -376,6 +403,7 @@ impl TopLevelGrammar { pub fn from_regex(rx: RegexNode) -> Self { TopLevelGrammar { grammars: vec![GrammarWithLexer { + name: Some("regex_grammar".to_string()), nodes: vec![Node::Lexeme { rx: RegexSpec::RegexId(RegexId(0)), contextual: None, diff --git a/parser/src/earley/from_guidance.rs b/parser/src/earley/from_guidance.rs index f4b82ac..68c879c 100644 --- a/parser/src/earley/from_guidance.rs +++ b/parser/src/earley/from_guidance.rs @@ -1,10 +1,11 @@ +use std::collections::HashMap; use std::fmt::Write; use std::{sync::Arc, vec}; use super::{grammar::SymbolProps, lexerspec::LexerSpec, CGrammar, Grammar}; use crate::api::{ - GrammarWithLexer, Node, ParserLimits, RegexId, RegexNode, RegexSpec, TopLevelGrammar, - DEFAULT_CONTEXTUAL, + GrammarId, GrammarWithLexer, Node, ParserLimits, RegexId, RegexNode, RegexSpec, + TopLevelGrammar, DEFAULT_CONTEXTUAL, }; use crate::{lark_to_llguidance, loginfo, JsonCompileOptions, Logger}; use anyhow::{bail, ensure, Result}; @@ -304,14 +305,26 @@ pub fn grammars_from_json( extra_lexemes: Vec, ) -> Result>> { let t0 = Instant::now(); - let grammars = input + let mut grammars = input .grammars .into_iter() .map(|g| grammar_from_json(tok_env, &mut limits, g)) .collect::>>()?; - for (_, g) in &grammars { - g.validate_grammar_refs(&grammars)?; + let mut grammar_by_idx = HashMap::new(); + for (idx, (_, g)) in grammars.iter().enumerate() { + grammar_by_idx.insert(GrammarId::Index(idx), idx); + if let Some(n) = g.name() { + let n = GrammarId::Name(n.to_string()); + if grammar_by_idx.contains_key(&n) { 
+ bail!("duplicate grammar name: {}", n); + } + grammar_by_idx.insert(n, idx); + } + } + + for (_, g) in grammars.iter_mut() { + g.validate_grammar_refs(&grammar_by_idx)?; } let t1 = Instant::now(); @@ -327,8 +340,9 @@ pub fn grammars_from_json( if log_grammar { writeln!( logger.info_logger(), - "Grammar #{}:\n{:?}\n{:?}\n", + "Grammar #{} {}:\n{:?}\n{:?}\n", idx, + grm.name().unwrap_or(""), lex, grm ) diff --git a/parser/src/earley/grammar.rs b/parser/src/earley/grammar.rs index cbbb8bf..e80e41d 100644 --- a/parser/src/earley/grammar.rs +++ b/parser/src/earley/grammar.rs @@ -1,8 +1,8 @@ -use std::{fmt::Debug, hash::Hash}; +use std::{collections::HashMap, fmt::Debug, hash::Hash}; use anyhow::{bail, ensure, Result}; -use crate::api::GenGrammarOptions; +use crate::api::{GenGrammarOptions, GrammarId}; use super::lexerspec::{LexemeIdx, LexerSpec}; use rustc_hash::FxHashMap; @@ -126,6 +126,7 @@ impl Rule { } pub struct Grammar { + name: Option, symbols: Vec, symbol_by_name: FxHashMap, } @@ -133,6 +134,7 @@ pub struct Grammar { impl Grammar { pub fn new() -> Self { Grammar { + name: None, symbols: vec![], symbol_by_name: FxHashMap::default(), } @@ -338,6 +340,7 @@ impl Grammar { } let mut outp = Grammar::new(); + outp.name = self.name.clone(); let start_data = self.sym_data(self.start()); if start_data.is_terminal() || start_data.rules.iter().any(|r| r.rhs.is_empty()) { @@ -370,16 +373,22 @@ impl Grammar { r } + pub fn name(&self) -> Option<&str> { + self.name.as_deref() + } + pub fn compile(&self, lexer_spec: LexerSpec) -> CGrammar { CGrammar::from_grammar(self, lexer_spec) } - pub fn validate_grammar_refs(&self, grammars: &[(LexerSpec, Grammar)]) -> Result<()> { - for sym in &self.symbols { + pub fn validate_grammar_refs(&mut self, ctx: &HashMap) -> Result<()> { + for sym in &mut self.symbols { match sym.gen_grammar { - Some(ref opts) => { - if opts.grammar.0 >= grammars.len() { - bail!("unknown grammar {}", opts.grammar.0); + Some(ref mut opts) => { + if let Some(idx) 
= ctx.get(&opts.grammar) { + opts.grammar = GrammarId::Index(*idx); + } else { + bail!("unknown grammar {}", opts.grammar); } } None => {} diff --git a/parser/src/earley/parser.rs b/parser/src/earley/parser.rs index 60f7b1c..1c7de11 100644 --- a/parser/src/earley/parser.rs +++ b/parser/src/earley/parser.rs @@ -963,7 +963,10 @@ impl ParserState { let sym_data = self.grammar.sym_data_dot(pos); if let Some(ref gg) = sym_data.gen_grammar { // break ties by preferring the one with the lowest grammar number - if res.is_none() || res.as_ref().unwrap().grammar.0 > gg.grammar.0 { + if res.is_none() + || res.as_ref().unwrap().grammar.to_index().unwrap() + > gg.grammar.to_index().unwrap() + { res = Some(gg.clone()); res_idx = Some(idx); } @@ -1496,7 +1499,6 @@ impl ParserState { } } - /// Advance the parser with given 'pre_lexeme'. /// On return, the lexer_state will be the state *after* consuming /// 'pre_lexeme'. As a special case, a following single byte lexeme diff --git a/parser/src/grammar_builder.rs b/parser/src/grammar_builder.rs index 79c2f98..99ad23b 100644 --- a/parser/src/grammar_builder.rs +++ b/parser/src/grammar_builder.rs @@ -3,7 +3,8 @@ use std::{collections::HashMap, sync::atomic::AtomicU32}; use anyhow::{ensure, Result}; use crate::api::{ - GrammarWithLexer, Node, NodeId, NodeProps, RegexId, RegexNode, RegexSpec, TopLevelGrammar, + GenGrammarOptions, GrammarId, GrammarWithLexer, Node, NodeId, NodeProps, RegexId, RegexNode, + RegexSpec, TopLevelGrammar, }; #[derive(Clone, Copy, PartialEq, Eq, Debug)] @@ -192,6 +193,16 @@ impl GrammarBuilder { }) } + pub fn gen_grammar(&mut self, name: GrammarId) -> NodeRef { + self.add_node(Node::GenGrammar { + data: GenGrammarOptions { + grammar: name, + ..GenGrammarOptions::default() + }, + props: NodeProps::default(), + }) + } + pub fn lexeme(&mut self, rx: RegexSpec, json_quoted: bool) -> NodeRef { self.add_node(Node::Lexeme { rx, diff --git a/parser/src/lark/README.md b/parser/src/lark/README.md index 45e7536..3f032e7 
100644 --- a/parser/src/lark/README.md +++ b/parser/src/lark/README.md @@ -4,6 +4,16 @@ This module converts from [Lark-like](https://github.com/lark-parser/lark) synta It makes it easier to get started with a new grammar, and provides a familiar syntax, however is not a drop-in replacement for Lark. +Following are the extensions to Lark syntax: + +- when several grammars are passed in one request (`grammars` field), + the ones using Lark can reference others using syntax like `@17` referring + to grammar at index 17 in the `grammars` list, or `@my_grammar` referring to grammar + with `"name": "my_grammar"`. +- special tokens can be referenced via `<token_name>` syntax, for example `<|ENDOFTEXT|>`; + they cannot be used inside of terminals, but can be used in regular rules; + the exact syntax depends on the tokenizer + Following are currently not supported: - lookarounds in lexer regexes @@ -15,7 +25,6 @@ Following features of llguidance are currently not exposed in Lark syntax: -- composite/nested grammars - `max_tokens` limits - hiding of `stop=...` - per-lexeme contextual and lazy flags diff --git a/parser/src/lark/ast.rs b/parser/src/lark/ast.rs index be728ac..9dde28a 100644 --- a/parser/src/lark/ast.rs +++ b/parser/src/lark/ast.rs @@ -107,6 +107,8 @@ pub enum Value { Name(String), LiteralString(String, String), LiteralRegex(String, String), + GrammarRef(String), + SpecialToken(String), #[allow(dead_code)] TemplateUsage { name: String, diff --git a/parser/src/lark/compiler.rs b/parser/src/lark/compiler.rs index 3c48a99..21e2902 100644 --- a/parser/src/lark/compiler.rs +++ b/parser/src/lark/compiler.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::{anyhow, bail, ensure, Result}; use crate::{ - api::{GrammarWithLexer, RegexId, RegexSpec, TopLevelGrammar}, + api::{GrammarId, GrammarWithLexer, RegexId, RegexSpec, TopLevelGrammar}, GrammarBuilder, NodeRef, }; @@ -130,6 +130,15 @@ impl Compiler { }; self.mk_regex("regex", rx) }
Value::SpecialToken(s) => { + bail!("special tokens (like {:?}) cannot be used as terminals", s); + } + Value::GrammarRef(g) => { + bail!( + "grammar references (like {:?}) cannot be used as terminals", + g + ); + } Value::TemplateUsage { .. } => bail!("template usage not supported yet"), }, } @@ -200,6 +209,17 @@ impl Compiler { bail!("unknown name: {:?}", n); } } + Value::SpecialToken(s) => return Ok(self.builder.special_token(s)), + Value::GrammarRef(g) => { + assert!(g.starts_with("@")); + // see if g[1..] is an integer + let id = if let Ok(id) = g[1..].parse::() { + GrammarId::Index(id) + } else { + GrammarId::Name(g[1..].to_string()) + }; + return Ok(self.builder.gen_grammar(id)); + } Value::LiteralRange(_, _) | Value::LiteralString(_, _) | Value::LiteralRegex(_, _) => { diff --git a/parser/src/lark/lexer.rs b/parser/src/lark/lexer.rs index d025774..36bc358 100644 --- a/parser/src/lark/lexer.rs +++ b/parser/src/lark/lexer.rs @@ -35,6 +35,8 @@ pub enum Token { Number, Newline, VBar, + SpecialToken, // + GrammarRef, // @grammar_id or @7 // special SKIP, EOF, @@ -100,6 +102,8 @@ impl Token { (Token::Regexp, r#"/(\\.|[^/\\])+/[imslux]*"#), (Token::Number, r#"[+-]?[0-9]+"#), (Token::Newline, r"(\r?\n)+[ \t]*"), + (Token::SpecialToken, r"<[^<>\s]+>"), + (Token::GrammarRef, r"@[a-zA-Z0-9_\-]+"), ]; } diff --git a/parser/src/lark/parser.rs b/parser/src/lark/parser.rs index 030934f..80471a0 100644 --- a/parser/src/lark/parser.rs +++ b/parser/src/lark/parser.rs @@ -318,6 +318,10 @@ impl Parser { let flags = inner[last_slash_idx + 1..].to_string(); let regex = inner[1..last_slash_idx].to_string(); Ok(Value::LiteralRegex(regex, flags)) + } else if let Some(grammar_ref) = self.match_token_with_value(Token::GrammarRef) { + Ok(Value::GrammarRef(grammar_ref.value.clone())) + } else if let Some(special_token) = self.match_token_with_value(Token::SpecialToken) { + Ok(Value::SpecialToken(special_token.value.clone())) } else if let Some(name_token) = self 
.match_token_with_value(Token::Rule) .or_else(|| self.match_token_with_value(Token::Token)) diff --git a/parser/src/tokenparser.rs b/parser/src/tokenparser.rs index ff7d904..225956a 100644 --- a/parser/src/tokenparser.rs +++ b/parser/src/tokenparser.rs @@ -709,7 +709,7 @@ impl TokenParser { if msg.len() > 0 { warn!(self, "{}", msg); } - let grm = Arc::clone(&self.compiled_grammars[gen_grammar.grammar.0]); + let grm = Arc::clone(&self.compiled_grammars[gen_grammar.grammar.to_index().unwrap()]); let max_tokens = self.parser.grammar().sym_data(symidx).props.max_tokens; let parser = Parser::new(grm, gen_grammar, self.limits.clone())?; let mut old_parser = std::mem::replace(&mut self.parser, parser);