add nested grammars and special tokens to lark syntax
mmoskal committed Nov 12, 2024
1 parent 84e5f41 commit de92b7d
Showing 11 changed files with 125 additions and 22 deletions.
34 changes: 31 additions & 3 deletions parser/src/api.rs
@@ -1,4 +1,4 @@
-use std::fmt::Debug;
+use std::fmt::{Debug, Display};

use serde::{Deserialize, Serialize};
use serde_json::Value;
@@ -19,6 +19,9 @@ pub const DEFAULT_CONTEXTUAL: bool = true;

#[derive(Serialize, Deserialize, Clone, Default)]
pub struct GrammarWithLexer {
/// The name of this grammar, can be used in GenGrammar nodes.
pub name: Option<String>,

/// The start symbol is at nodes[0]
/// When nodes is empty, then one of json_schema or lark_grammar must be set.
#[serde(default)]
@@ -257,6 +260,31 @@ impl RegexSpec {
}
}

#[derive(Serialize, Deserialize, Hash, PartialEq, Eq, Clone, Debug)]
#[serde(untagged)]
pub enum GrammarId {
Index(usize),
Name(String),
}

impl GrammarId {
pub fn to_index(&self) -> Option<usize> {
match self {
GrammarId::Index(i) => Some(*i),
GrammarId::Name(_) => None,
}
}
}

impl Display for GrammarId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
GrammarId::Index(i) => write!(f, "@#{}", i),
GrammarId::Name(s) => write!(f, "@{}", s),
}
}
}

macro_rules! id_type {
($name:ident) => {
#[derive(Serialize, Deserialize, Hash, PartialEq, Eq, Clone, Copy, Debug)]
@@ -265,7 +293,6 @@ macro_rules! id_type {
};
}

-id_type!(GrammarId);
id_type!(NodeId);
id_type!(RegexId);

@@ -286,7 +313,7 @@ impl Node {
impl Default for GenGrammarOptions {
fn default() -> Self {
GenGrammarOptions {
-grammar: GrammarId(0),
+grammar: GrammarId::Index(0),
temperature: None,
max_tokens_grm: usize::MAX,
}
@@ -376,6 +403,7 @@ impl TopLevelGrammar {
pub fn from_regex(rx: RegexNode) -> Self {
TopLevelGrammar {
grammars: vec![GrammarWithLexer {
name: Some("regex_grammar".to_string()),
nodes: vec![Node::Lexeme {
rx: RegexSpec::RegexId(RegexId(0)),
contextual: None,
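
As an illustration of the new `GrammarId` (not part of the commit; the import path is an assumption), the untagged serde representation accepts either a JSON number or a JSON string, and `Display` produces the `@#index` / `@name` forms used in error messages:

```rust
use serde_json::json;

use llguidance_parser::api::GrammarId; // assumed crate/module path

fn main() {
    // #[serde(untagged)]: a JSON number deserializes to Index,
    // a JSON string deserializes to Name.
    let by_index: GrammarId = serde_json::from_value(json!(17)).unwrap();
    let by_name: GrammarId = serde_json::from_value(json!("my_grammar")).unwrap();
    assert_eq!(by_index, GrammarId::Index(17));
    assert_eq!(by_name, GrammarId::Name("my_grammar".to_string()));

    // Display mirrors the fmt impl above.
    assert_eq!(by_index.to_string(), "@#17");
    assert_eq!(by_name.to_string(), "@my_grammar");
}
```
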
26 changes: 20 additions & 6 deletions parser/src/earley/from_guidance.rs
@@ -1,10 +1,11 @@
use std::collections::HashMap;
use std::fmt::Write;
use std::{sync::Arc, vec};

use super::{grammar::SymbolProps, lexerspec::LexerSpec, CGrammar, Grammar};
use crate::api::{
-GrammarWithLexer, Node, ParserLimits, RegexId, RegexNode, RegexSpec, TopLevelGrammar,
-DEFAULT_CONTEXTUAL,
+GrammarId, GrammarWithLexer, Node, ParserLimits, RegexId, RegexNode, RegexSpec,
+TopLevelGrammar, DEFAULT_CONTEXTUAL,
};
use crate::{lark_to_llguidance, loginfo, JsonCompileOptions, Logger};
use anyhow::{bail, ensure, Result};
@@ -304,14 +305,26 @@ pub fn grammars_from_json(
extra_lexemes: Vec<String>,
) -> Result<Vec<Arc<CGrammar>>> {
let t0 = Instant::now();
-let grammars = input
+let mut grammars = input
.grammars
.into_iter()
.map(|g| grammar_from_json(tok_env, &mut limits, g))
.collect::<Result<Vec<_>>>()?;

-for (_, g) in &grammars {
-g.validate_grammar_refs(&grammars)?;
+let mut grammar_by_idx = HashMap::new();
+for (idx, (_, g)) in grammars.iter().enumerate() {
+grammar_by_idx.insert(GrammarId::Index(idx), idx);
+if let Some(n) = g.name() {
+let n = GrammarId::Name(n.to_string());
+if grammar_by_idx.contains_key(&n) {
+bail!("duplicate grammar name: {}", n);
+}
+grammar_by_idx.insert(n, idx);
+}
+}
+
+for (_, g) in grammars.iter_mut() {
+g.validate_grammar_refs(&grammar_by_idx)?;
}

let t1 = Instant::now();
@@ -327,8 +340,9 @@
if log_grammar {
writeln!(
logger.info_logger(),
"Grammar #{}:\n{:?}\n{:?}\n",
"Grammar #{} {}:\n{:?}\n{:?}\n",
idx,
grm.name().unwrap_or(""),
lex,
grm
)
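
To see what the new `grammar_by_idx` map enables, here is a hypothetical two-grammar request (the `grammars`, `name`, and `lark_grammar` fields follow `GrammarWithLexer` above; the grammar bodies are invented). Grammar 0 references grammar 1 by name, and `validate_grammar_refs` later rewrites that reference to `GrammarId::Index(1)`:

```rust
use serde_json::json;

fn main() {
    let request = json!({
        "grammars": [
            // resolves via grammar_by_idx: Name("item") => 1
            { "name": "main", "lark_grammar": "start: \"value: \" @item" },
            { "name": "item", "lark_grammar": "start: /[0-9]+/" }
        ]
    });
    println!("{}", serde_json::to_string_pretty(&request).unwrap());
}
```
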
23 changes: 16 additions & 7 deletions parser/src/earley/grammar.rs
@@ -1,8 +1,8 @@
-use std::{fmt::Debug, hash::Hash};
+use std::{collections::HashMap, fmt::Debug, hash::Hash};

use anyhow::{bail, ensure, Result};

-use crate::api::GenGrammarOptions;
+use crate::api::{GenGrammarOptions, GrammarId};

use super::lexerspec::{LexemeIdx, LexerSpec};
use rustc_hash::FxHashMap;
@@ -126,13 +126,15 @@ impl Rule {
}

pub struct Grammar {
name: Option<String>,
symbols: Vec<Symbol>,
symbol_by_name: FxHashMap<String, SymIdx>,
}

impl Grammar {
pub fn new() -> Self {
Grammar {
name: None,
symbols: vec![],
symbol_by_name: FxHashMap::default(),
}
@@ -338,6 +340,7 @@
}

let mut outp = Grammar::new();
outp.name = self.name.clone();

let start_data = self.sym_data(self.start());
if start_data.is_terminal() || start_data.rules.iter().any(|r| r.rhs.is_empty()) {
@@ -370,16 +373,22 @@
r
}

pub fn name(&self) -> Option<&str> {
self.name.as_deref()
}

pub fn compile(&self, lexer_spec: LexerSpec) -> CGrammar {
CGrammar::from_grammar(self, lexer_spec)
}

-pub fn validate_grammar_refs(&self, grammars: &[(LexerSpec, Grammar)]) -> Result<()> {
-for sym in &self.symbols {
+pub fn validate_grammar_refs(&mut self, ctx: &HashMap<GrammarId, usize>) -> Result<()> {
+for sym in &mut self.symbols {
match sym.gen_grammar {
-Some(ref opts) => {
-if opts.grammar.0 >= grammars.len() {
-bail!("unknown grammar {}", opts.grammar.0);
+Some(ref mut opts) => {
+if let Some(idx) = ctx.get(&opts.grammar) {
+opts.grammar = GrammarId::Index(*idx);
+} else {
+bail!("unknown grammar {}", opts.grammar);
}
}
None => {}
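
In isolation, the contract of the reworked `validate_grammar_refs` is: every referenced `GrammarId` must be a key of the map, and name references are canonicalized to indices in place. After this pass only `Index` values remain, which is why `parser.rs` and `tokenparser.rs` below can call `to_index().unwrap()`. A minimal sketch (the map contents and import path are assumptions):

```rust
use std::collections::HashMap;

use llguidance_parser::api::GrammarId; // assumed crate/module path

fn main() {
    // Same shape as the ctx map built in grammars_from_json.
    let mut ctx: HashMap<GrammarId, usize> = HashMap::new();
    ctx.insert(GrammarId::Index(0), 0);
    ctx.insert(GrammarId::Index(1), 1);
    ctx.insert(GrammarId::Name("item".to_string()), 1);

    let mut r = GrammarId::Name("item".to_string());
    match ctx.get(&r) {
        Some(&idx) => r = GrammarId::Index(idx), // canonicalize to index
        None => panic!("unknown grammar {}", r), // the real code bails
    }
    assert_eq!(r, GrammarId::Index(1));
}
```
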
6 changes: 4 additions & 2 deletions parser/src/earley/parser.rs
@@ -963,7 +963,10 @@ impl ParserState {
let sym_data = self.grammar.sym_data_dot(pos);
if let Some(ref gg) = sym_data.gen_grammar {
// break ties by preferring the one with the lowest grammar number
-if res.is_none() || res.as_ref().unwrap().grammar.0 > gg.grammar.0 {
+if res.is_none()
+|| res.as_ref().unwrap().grammar.to_index().unwrap()
+> gg.grammar.to_index().unwrap()
+{
res = Some(gg.clone());
res_idx = Some(idx);
}
@@ -1496,7 +1499,6 @@ impl ParserState {
}
}


/// Advance the parser with given 'pre_lexeme'.
/// On return, the lexer_state will be the state *after* consuming
/// 'pre_lexeme'. As a special case, a following single byte lexeme
13 changes: 12 additions & 1 deletion parser/src/grammar_builder.rs
@@ -3,7 +3,8 @@ use std::{collections::HashMap, sync::atomic::AtomicU32};
use anyhow::{ensure, Result};

use crate::api::{
-GrammarWithLexer, Node, NodeId, NodeProps, RegexId, RegexNode, RegexSpec, TopLevelGrammar,
+GenGrammarOptions, GrammarId, GrammarWithLexer, Node, NodeId, NodeProps, RegexId, RegexNode,
+RegexSpec, TopLevelGrammar,
};

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
@@ -192,6 +193,16 @@ impl GrammarBuilder {
})
}

pub fn gen_grammar(&mut self, name: GrammarId) -> NodeRef {
self.add_node(Node::GenGrammar {
data: GenGrammarOptions {
grammar: name,
..GenGrammarOptions::default()
},
props: NodeProps::default(),
})
}

pub fn lexeme(&mut self, rx: RegexSpec, json_quoted: bool) -> NodeRef {
self.add_node(Node::Lexeme {
rx,
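
A small usage sketch for the new helper (not from the commit; the wrapper function, the grammar name, and the import paths are invented for illustration):

```rust
use llguidance_parser::{GrammarBuilder, NodeRef}; // assumed crate path
use llguidance_parser::api::GrammarId;

// Wire nodes that delegate to another grammar, by index or by name.
fn delegate_nodes(b: &mut GrammarBuilder) -> (NodeRef, NodeRef) {
    let by_index = b.gen_grammar(GrammarId::Index(1));
    let by_name = b.gen_grammar(GrammarId::Name("json_request".to_string()));
    (by_index, by_name)
}
```
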
11 changes: 10 additions & 1 deletion parser/src/lark/README.md
@@ -4,6 +4,16 @@ This module converts from [Lark-like](https://github.com/lark-parser/lark) syntax
It makes it easier to get started with a new grammar and provides a familiar
syntax; however, it is not a drop-in replacement for Lark.

The following are extensions to the Lark syntax:

- when several grammars are passed in one request (`grammars` field),
  the ones using Lark syntax can reference the others with `@17`, referring
  to the grammar at index 17 in the `grammars` list, or with `@my_grammar`,
  referring to the grammar with `"name": "my_grammar"` (see the example below)
- special tokens can be referenced with the `<token_name>` syntax, for example
  `<|ENDOFTEXT|>`; they cannot be used inside terminals, but can be used in
  regular rules; the exact token names depend on the tokenizer
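
For example, a rule can use both extensions at once (a hypothetical sketch: the grammar name `json_request` and the exact special-token spelling depend on the request and the tokenizer):

```lark
start: "Answer: " @json_request <|ENDOFTEXT|>
```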

The following are currently not supported:

- lookarounds in lexer regexes
@@ -15,7 +25,6 @@

The following llguidance features are currently not exposed in the Lark syntax:

-- composite/nested grammars
- `max_tokens` limits
- hiding of `stop=...`
- per-lexeme contextual and lazy flags
2 changes: 2 additions & 0 deletions parser/src/lark/ast.rs
@@ -107,6 +107,8 @@ pub enum Value {
Name(String),
LiteralString(String, String),
LiteralRegex(String, String),
GrammarRef(String),
SpecialToken(String),
#[allow(dead_code)]
TemplateUsage {
name: String,
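
For illustration (not from the commit), the parser below stores the matched text in these variants with the sigils kept, so the compiler sees values like:

```rust
// `@my_grammar` in the grammar source becomes:
let _ = Value::GrammarRef("@my_grammar".to_string());
// `<|ENDOFTEXT|>` becomes (angle brackets included):
let _ = Value::SpecialToken("<|ENDOFTEXT|>".to_string());
```
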
22 changes: 21 additions & 1 deletion parser/src/lark/compiler.rs
@@ -6,7 +6,7 @@ use std::{
use anyhow::{anyhow, bail, ensure, Result};

use crate::{
-api::{GrammarWithLexer, RegexId, RegexSpec, TopLevelGrammar},
+api::{GrammarId, GrammarWithLexer, RegexId, RegexSpec, TopLevelGrammar},
GrammarBuilder, NodeRef,
};

@@ -130,6 +130,15 @@ impl Compiler {
};
self.mk_regex("regex", rx)
}
Value::SpecialToken(s) => {
bail!("special tokens (like {:?}) cannot be used as terminals", s);
}
Value::GrammarRef(g) => {
bail!(
"grammar references (like {:?}) cannot be used as terminals",
g
);
}
Value::TemplateUsage { .. } => bail!("template usage not supported yet"),
},
}
@@ -200,6 +209,17 @@
bail!("unknown name: {:?}", n);
}
}
Value::SpecialToken(s) => return Ok(self.builder.special_token(s)),
Value::GrammarRef(g) => {
assert!(g.starts_with("@"));
// see if g[1..] is an integer
let id = if let Ok(id) = g[1..].parse::<usize>() {
GrammarId::Index(id)
} else {
GrammarId::Name(g[1..].to_string())
};
return Ok(self.builder.gen_grammar(id));
}
Value::LiteralRange(_, _)
| Value::LiteralString(_, _)
| Value::LiteralRegex(_, _) => {
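
Distilled from the `Value::GrammarRef` arm above: strip the leading `@`, try to parse an index, otherwise treat the rest as a name. As a standalone sketch (import path assumed):

```rust
use llguidance_parser::api::GrammarId; // assumed crate/module path

fn parse_grammar_ref(g: &str) -> GrammarId {
    assert!(g.starts_with('@'));
    match g[1..].parse::<usize>() {
        Ok(idx) => GrammarId::Index(idx),              // "@17"
        Err(_) => GrammarId::Name(g[1..].to_string()), // "@my_grammar"
    }
}

fn main() {
    assert_eq!(parse_grammar_ref("@17"), GrammarId::Index(17));
    assert_eq!(
        parse_grammar_ref("@my_grammar"),
        GrammarId::Name("my_grammar".to_string())
    );
}
```
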
4 changes: 4 additions & 0 deletions parser/src/lark/lexer.rs
@@ -35,6 +35,8 @@ pub enum Token {
Number,
Newline,
VBar,
SpecialToken, // <something>
GrammarRef, // @grammar_id or @7
// special
SKIP,
EOF,
@@ -100,6 +102,8 @@ impl Token {
(Token::Regexp, r#"/(\\.|[^/\\])+/[imslux]*"#),
(Token::Number, r#"[+-]?[0-9]+"#),
(Token::Newline, r"(\r?\n)+[ \t]*"),
(Token::SpecialToken, r"<[^<>\s]+>"),
(Token::GrammarRef, r"@[a-zA-Z0-9_\-]+"),
];
}

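
Illustrative checks of the two new token patterns (a sketch using the `regex` crate, with the patterns anchored for the test; the real lexer applies them as part of its combined token set):

```rust
use regex::Regex;

fn main() {
    // SpecialToken: a non-empty run without '<', '>' or whitespace,
    // wrapped in angle brackets.
    let special = Regex::new(r"^<[^<>\s]+>$").unwrap();
    assert!(special.is_match("<|ENDOFTEXT|>"));
    assert!(!special.is_match("<two words>")); // whitespace rejected

    // GrammarRef: '@' followed by identifier characters or '-'.
    let gref = Regex::new(r"^@[a-zA-Z0-9_\-]+$").unwrap();
    assert!(gref.is_match("@17"));
    assert!(gref.is_match("@my_grammar"));
}
```
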
4 changes: 4 additions & 0 deletions parser/src/lark/parser.rs
@@ -318,6 +318,10 @@ impl Parser {
let flags = inner[last_slash_idx + 1..].to_string();
let regex = inner[1..last_slash_idx].to_string();
Ok(Value::LiteralRegex(regex, flags))
} else if let Some(grammar_ref) = self.match_token_with_value(Token::GrammarRef) {
Ok(Value::GrammarRef(grammar_ref.value.clone()))
} else if let Some(special_token) = self.match_token_with_value(Token::SpecialToken) {
Ok(Value::SpecialToken(special_token.value.clone()))
} else if let Some(name_token) = self
.match_token_with_value(Token::Rule)
.or_else(|| self.match_token_with_value(Token::Token))
2 changes: 1 addition & 1 deletion parser/src/tokenparser.rs
@@ -709,7 +709,7 @@ impl TokenParser {
if msg.len() > 0 {
warn!(self, "{}", msg);
}
-let grm = Arc::clone(&self.compiled_grammars[gen_grammar.grammar.0]);
+let grm = Arc::clone(&self.compiled_grammars[gen_grammar.grammar.to_index().unwrap()]);
let max_tokens = self.parser.grammar().sym_data(symidx).props.max_tokens;
let parser = Parser::new(grm, gen_grammar, self.limits.clone())?;
let mut old_parser = std::mem::replace(&mut self.parser, parser);
