From 09d8d590504b5ef8eb9aa03324805327ec89cd2b Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Fri, 25 Oct 2024 11:43:35 -0700
Subject: [PATCH 01/73] implement schema JSON -> llgrammar JSON

---
 python/llguidance/__init__.py |  3 ++-
 python/llguidance/_lib.pyi    | 21 ++++++++++++++++++++-
 rust/src/py.rs                | 30 +++++++++++++++++++++++++++++-
 3 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/python/llguidance/__init__.py b/python/llguidance/__init__.py
index d171c8d..0726170 100644
--- a/python/llguidance/__init__.py
+++ b/python/llguidance/__init__.py
@@ -1,8 +1,9 @@
-from ._lib import LLTokenizer, LLInterpreter
+from ._lib import LLTokenizer, LLInterpreter, JsonCompiler
 from ._tokenizer import TokenizerWrapper
 
 __all__ = [
     "LLTokenizer",
     "LLInterpreter",
+    "JsonCompiler",
     "TokenizerWrapper",
 ]
diff --git a/python/llguidance/_lib.pyi b/python/llguidance/_lib.pyi
index c05bc61..ee7f776 100644
--- a/python/llguidance/_lib.pyi
+++ b/python/llguidance/_lib.pyi
@@ -138,4 +138,23 @@
     def has_pending_stop(self) -> bool:
         """
         If true, next mid_process() call will return stop
-        """
\ No newline at end of file
+        """
+
+class JsonCompiler:
+    def __new__(
+        cls,
+        compact: bool = False,
+    ) -> "JsonCompiler":
+        """
+        Create a new JSON compiler.
+        Args:
+            compact: bool - whether to use compact JSON representation
+        """
+
+    def compile(
+        self,
+        schema: str,
+    ) -> str:
+        """
+        Compile the JSON representation of the AG2 grammar/constraint.
+        """
diff --git a/rust/src/py.rs b/rust/src/py.rs
index 2c423e9..d49e06a 100644
--- a/rust/src/py.rs
+++ b/rust/src/py.rs
@@ -6,9 +6,10 @@ use llguidance_parser::toktrie::{
     self, InferenceCapabilities, TokRxInfo, TokTrie, TokenId, TokenizerEnv,
 };
 use llguidance_parser::{api::TopLevelGrammar, output::ParserOutput, TokenParser};
-use llguidance_parser::{Constraint, Logger};
+use llguidance_parser::{Constraint, JsonCompileOptions, Logger};
 use pyo3::{exceptions::PyValueError, prelude::*};
 use serde::{Deserialize, Serialize};
+use serde_json::Value;
 
 #[derive(Clone)]
 #[pyclass]
@@ -240,9 +241,36 @@ impl TokenizerEnv for LLTokenizer {
     }
 }
 
+#[derive(Clone)]
+#[pyclass]
+struct JsonCompiler {
+    #[pyo3(get, set)]
+    compact: bool,
+}
+
+#[pymethods]
+impl JsonCompiler {
+    #[new]
+    fn py_new(compact: Option<bool>) -> Self {
+        JsonCompiler {
+            compact: compact.unwrap_or(false),
+        }
+    }
+    fn compile(&self, schema: &str) -> PyResult<String> {
+        let schema: Value = serde_json::from_str(schema).map_err(val_error)?;
+        let compile_options = JsonCompileOptions {
+            compact: self.compact,
+        };
+        let grammar = compile_options.json_to_llg(&schema).map_err(val_error)?;
+        Ok(serde_json::to_string(&grammar).map_err(val_error)?)
+    }
+
+}
+
 pub(crate) fn init(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<LLTokenizer>()?;
     m.add_class::<LLInterpreter>()?;
+    m.add_class::<JsonCompiler>()?;
     Ok(())
 }
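A minimal usage sketch of the Python API added above, assuming an installed build of this revision of llguidance; the example schema is illustrative, and compile() surfaces failures as ValueError (via val_error):

    import json
    from llguidance import JsonCompiler

    compiler = JsonCompiler(compact=True)
    schema = json.dumps({
        "type": "object",
        "properties": {"name": {"type": "string"}},
    })
    grammar_json = compiler.compile(schema)  # serialized grammar/constraint JSON
    print(json.loads(grammar_json))
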
From 78ccc30e7a9ff28d4478c76833379c660278c85b Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Thu, 7 Nov 2024 12:32:57 -0800
Subject: [PATCH 02/73] Return GrammarWithLexer instead of TopLevelGrammar

---
 rust/src/py.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/rust/src/py.rs b/rust/src/py.rs
index d49e06a..a0d38b6 100644
--- a/rust/src/py.rs
+++ b/rust/src/py.rs
@@ -261,8 +261,9 @@ impl JsonCompiler {
         let schema: Value = serde_json::from_str(schema).map_err(val_error)?;
         let compile_options = JsonCompileOptions {
             compact: self.compact,
         };
-        let grammar = compile_options.json_to_llg(&schema).map_err(val_error)?;
-        Ok(serde_json::to_string(&grammar).map_err(val_error)?)
+        let tlg = compile_options.json_to_llg(&schema).map_err(val_error)?;
+        let grammar = &tlg.grammars[0];
+        Ok(serde_json::to_string(grammar).map_err(val_error)?)
     }
 }

From b30d5e85c992e6c929d41de450bff289018b3da3 Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Thu, 7 Nov 2024 13:11:19 -0800
Subject: [PATCH 03/73] refactor json into mod

---
 parser/src/{json.rs => json/compiler.rs} | 0
 parser/src/json/mod.rs                   | 1 +
 parser/src/lib.rs                        | 2 +-
 3 files changed, 2 insertions(+), 1 deletion(-)
 rename parser/src/{json.rs => json/compiler.rs} (100%)
 create mode 100644 parser/src/json/mod.rs

diff --git a/parser/src/json.rs b/parser/src/json/compiler.rs
similarity index 100%
rename from parser/src/json.rs
rename to parser/src/json/compiler.rs
diff --git a/parser/src/json/mod.rs b/parser/src/json/mod.rs
new file mode 100644
index 0000000..59d8df7
--- /dev/null
+++ b/parser/src/json/mod.rs
@@ -0,0 +1 @@
+pub mod compiler;
diff --git a/parser/src/lib.rs b/parser/src/lib.rs
index c33c4ad..5b1d84f 100644
--- a/parser/src/lib.rs
+++ b/parser/src/lib.rs
@@ -27,7 +27,7 @@ pub mod lark;
 mod grammar_builder;
 mod json;
 pub use grammar_builder::{GrammarBuilder, NodeRef};
-pub use json::JsonCompileOptions;
+pub use json::compiler::JsonCompileOptions;
 pub use tokenizer_json::token_bytes_from_tokenizer_json;
 
 #[macro_export]

From 5f836aef4ebbd6da901eac6f9a0c6029f39b1534 Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Thu, 7 Nov 2024 16:25:11 -0800
Subject: [PATCH 04/73] default to all types if type unspecified

---
 parser/src/json/compiler.rs | 55 +++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/parser/src/json/compiler.rs b/parser/src/json/compiler.rs
index cd09798..9d266f5 100644
--- a/parser/src/json/compiler.rs
+++ b/parser/src/json/compiler.rs
@@ -22,6 +22,7 @@ fn to_compact_json(target: &serde_json::Value) -> String {
     serde_json::to_string(target).unwrap()
 }
 
+const TYPES: [&str; 7] = ["null", "boolean", "integer", "number", "string", "array", "object"];
 const KEYWORDS: [&str; 10] = [
     "anyOf",
     "oneOf",
@@ -60,6 +61,7 @@
 const DEFS_KEYS: [&str; 4] = ["$defs", "definitions", "defs", "refs"];
 const ARRAY_KEYS: [&str; 4] = ["items", "prefixItems", "minItems", "maxItems"];
 const OBJECT_KEYS: [&str; 2] = ["properties", "additionalProperties"];
+const STRING_KEYS: [&str; 4] = ["minLength", "maxLength", "pattern", "format"];
 
 const CHAR_REGEX: &str = r#"(\\([\"\\\/bfnrt]|u[a-fA-F0-9]{4})|[^\"\\\x00-\x1F\x7F])"#;
 
@@ -78,17 +80,9 @@ fn validate_json_node_keys(node: &Value) -> Result<()> {
         .ok_or_else(|| anyhow!("Expected object as json schema, got: {}", limited_str(node)))
         .unwrap();
 
-    let typ = node.get("type").and_then(|v| v.as_str()).unwrap_or("");
-
     for key in node.keys() {
         let key = &key.as_str();
-        if KEYWORDS.contains(key) || IGNORED_KEYS.contains(key) || DEFS_KEYS.contains(key) {
-            continue;
-        }
-        if typ == "array" && ARRAY_KEYS.contains(key) {
-            continue;
-        }
-        if typ == "object" && OBJECT_KEYS.contains(key) {
+        if KEYWORDS.contains(key) || IGNORED_KEYS.contains(key) || DEFS_KEYS.contains(key) || ARRAY_KEYS.contains(key) || OBJECT_KEYS.contains(key) || STRING_KEYS.contains(key) {
             continue;
         }
         if key.starts_with("x-") || key.starts_with("$xsd-") {
@@ -299,6 +293,13 @@
         bail!("'false' not supported as schema here");
     }
 
+    if let Some(json_schema) = json_schema.as_object() {
+        // TODO: should be sufficient to have only ignored keys here
+        if json_schema.is_empty() {
+            return Ok(self.gen_json_any());
+        }
+    }
+
     // eprintln!("gen_json: {}", limited_str(json_schema));
     validate_json_node_keys(json_schema)?;
 
@@ -343,26 +344,28 @@
         return Ok(self.builder.select(&options));
     }
 
-    // Process type-specific keywords
-    if let Some(arr) = json_schema["type"].as_array() {
-        let nodes = arr
-            .iter()
-            .map(|v| {
-                let tp = v.as_str().ok_or_else(|| {
-                    anyhow!("Expected string in type list, got: {}", limited_str(v))
-                })?;
-                self.gen_json_type(tp, json_schema)
-            })
-            .collect::<Result<Vec<_>>>()?;
-        return Ok(self.builder.select(&nodes));
+    // Process type
+    if let Some(tp) = json_schema.opt_str("type")? {
+        return self.gen_json_type(tp, json_schema)
     }
 
-    if let Some(target_type_str) = json_schema.opt_str("type")? {
-        return self.gen_json_type(target_type_str, json_schema);
-    }
+    let types = match json_schema.opt_array("type")? {
+        Some(types) => types,
+        None => {
+            &TYPES.iter().map(|s| Value::String(s.to_string())).collect::<Vec<Value>>()
+        }
+    };
 
-    // Fallback to "any" type
-    Ok(self.gen_json_any())
+    let nodes = types
+        .iter()
+        .map(|v| {
+            let tp = v.as_str().ok_or_else(|| {
+                anyhow!("Expected string in type list, got: {}", limited_str(v))
+            })?;
+            self.gen_json_type(tp, json_schema)
+        })
+        .collect::<Result<Vec<_>>>()?;
+    Ok(self.builder.select(&nodes))
 }
 
 fn gen_json_type(&mut self, target_type_str: &str, json_schema: &Value) -> Result<NodeRef> {
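The net effect of the patch above: a schema without a "type" key now compiles to a select over all seven entries of TYPES, instead of falling through to a single "any" node (an entirely empty schema {} still short-circuits to gen_json_any()). A sketch of how this surfaces through the Python bindings, assuming a build containing this patch:

    import json
    from llguidance import JsonCompiler

    compiler = JsonCompiler()

    # No "type": compiles as a union over null/boolean/integer/number/string/array/object;
    # the string branch still honors "minLength" via the new STRING_KEYS allowance.
    print(compiler.compile(json.dumps({"minLength": 2})))

    # A "type" list behaves as before: one branch per listed type.
    print(compiler.compile(json.dumps({"type": ["string", "integer"]})))
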
From e67ed0fc68aea50289d2a68a742434eb33b94005 Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Thu, 7 Nov 2024 16:26:09 -0800
Subject: [PATCH 05/73] basic formats

---
 parser/src/json/compiler.rs | 19 ++++++++++++++++++-
 parser/src/json/formats.rs  | 17 +++++++++++++++++
 parser/src/json/mod.rs      |  1 +
 3 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 parser/src/json/formats.rs

diff --git a/parser/src/json/compiler.rs b/parser/src/json/compiler.rs
index 9d266f5..e7ab928 100644
--- a/parser/src/json/compiler.rs
+++ b/parser/src/json/compiler.rs
@@ -4,6 +4,7 @@ use lazy_static::lazy_static;
 use serde_json::{json, Value};
 use std::{collections::HashMap, vec};
 
+use super::formats::lookup_format;
 use crate::{
     api::{GrammarWithLexer, RegexSpec, TopLevelGrammar},
     GrammarBuilder, NodeRef,
@@ -378,7 +379,8 @@ "string" => {
                 let min_length = json_schema.opt_u64("minLength")?.unwrap_or(0);
                 let max_length = json_schema.opt_u64("maxLength")?;
                 let pattern = json_schema.opt_str("pattern")?;
-                return self.gen_json_string(min_length, max_length, pattern);
+                let format = json_schema.opt_str("format")?;
+                return self.gen_json_string(min_length, max_length, pattern, format);
             }
             "array" => {
                 let empty = vec![];
@@ -520,7 +522,22 @@
         min_length: u64,
         max_length: Option<u64>,
         regex: Option<&str>,
+        format: Option<&str>,
     ) -> Result<NodeRef> {
+
+        let mut regex = regex;
+
+        if let Some(format) = format {
+            if regex.is_some() {
+                bail!("Cannot specify both a regex and a format for a JSON string");
+            }
+            if let Some(r) = lookup_format(format) {
+                regex = Some(r);
+            } else {
+                bail!("Unknown format: {}", format)
+            };
+        }
+
         if min_length == 0 && max_length.is_none() && regex.is_none() {
             return Ok(self.json_simple_string());
         }
diff --git a/parser/src/json/formats.rs b/parser/src/json/formats.rs
new file mode 100644
index 0000000..bce6173
--- /dev/null
+++ b/parser/src/json/formats.rs
@@ -0,0 +1,17 @@
+use lazy_static::lazy_static;
+use std::collections::HashMap;
+
+lazy_static! {
+    static ref FORMAT_PATTERNS: HashMap<&'static str, &'static str> = {
+        let mut m = HashMap::new();
+        m.insert("email", r"^[^\s@]+@[^\s@]+\.[^\s@]+$");
+        m.insert("ipv4", r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$");
+        m.insert("date", r"^\d{4}-\d{2}-\d{2}$");
+        // Add more patterns as needed
+        m
+    };
+}
+
+pub fn lookup_format(name: &str) -> Option<&str> {
+    FORMAT_PATTERNS.get(name).copied()
+}
diff --git a/parser/src/json/mod.rs b/parser/src/json/mod.rs
index 59d8df7..3c89888 100644
--- a/parser/src/json/mod.rs
+++ b/parser/src/json/mod.rs
@@ -1 +1,2 @@
 pub mod compiler;
+mod formats;
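With the patch above, a "format" keyword is resolved through lookup_format() into one of the FORMAT_PATTERNS regexes and then handled exactly like a "pattern" constraint; supplying both "pattern" and "format", or an unknown format name, is rejected in gen_json_string(). Roughly, via the Python bindings (schema values illustrative):

    import json
    from llguidance import JsonCompiler

    compiler = JsonCompiler()

    # "format" lowers to the registered regex (here the "date" pattern).
    print(compiler.compile(json.dumps({"type": "string", "format": "date"})))

    # "pattern" and "format" together is an error, surfaced as ValueError.
    try:
        compiler.compile(json.dumps({"type": "string", "format": "date", "pattern": "^x"}))
    except ValueError as e:
        print(e)  # Cannot specify both a regex and a format for a JSON string
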
From 261d27ebb4a635d9d1fc052f9cd9a975de6c312f Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Thu, 7 Nov 2024 16:52:34 -0800
Subject: [PATCH 06/73] format parity

---
 parser/src/json/formats.rs | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/parser/src/json/formats.rs b/parser/src/json/formats.rs
index bce6173..e8924d3 100644
--- a/parser/src/json/formats.rs
+++ b/parser/src/json/formats.rs
@@ -3,12 +3,18 @@ use std::collections::HashMap;
 
 lazy_static! {
     static ref FORMAT_PATTERNS: HashMap<&'static str, &'static str> = {
-        let mut m = HashMap::new();
-        m.insert("email", r"^[^\s@]+@[^\s@]+\.[^\s@]+$");
-        m.insert("ipv4", r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$");
-        m.insert("date", r"^\d{4}-\d{2}-\d{2}$");
-        // Add more patterns as needed
-        m
+        HashMap::from([
+            ("date-time", r"(?P<date>[0-9]{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12][0-9]|3[01]))[tT](?P