Commit

add one more test program; run them all
mmoskal committed Nov 20, 2024
1 parent 6e199da commit 203e710
Showing 4 changed files with 125 additions and 7 deletions.
4 changes: 4 additions & 0 deletions sample_parser/Cargo.toml
@@ -25,3 +25,7 @@ path = "src/minimal.rs"
 [[bin]]
 name = "lark_test"
 path = "src/lark_test.rs"
+
+[[bin]]
+name = "grammar_tester"
+path = "src/grammar_tester.rs"
11 changes: 5 additions & 6 deletions sample_parser/run.sh
@@ -1,9 +1,8 @@
 #!/bin/sh
 
-# cargo run data/blog.schema.ll.json data/blog.sample.json
+set -e
+cargo run data/blog.schema.ll.json data/blog.sample.json
 cargo run --release data/blog.schema.json data/blog.sample.json
-# cargo run --release --bin minimal data/blog.schema.json data/blog.sample.json
-# cargo run --release data/rfc.lark data/rfc.xml
-# mkdir -p tmp
-# strip -o tmp/minimal ../../target/release/minimal
-# ls -l ../../target/release/minimal tmp/minimal
+cargo run --bin grammar_tester
+cargo run --release --bin minimal data/blog.schema.json data/blog.sample.json
+cargo run --release data/rfc.lark data/rfc.xml
115 changes: 115 additions & 0 deletions sample_parser/src/grammar_tester.rs
@@ -0,0 +1,115 @@
use std::{hint::black_box, vec};

use llguidance_parser::{
    api::{GrammarWithLexer, ParserLimits},
    toktrie::{InferenceCapabilities, TokEnv},
    Constraint, GrammarBuilder, TokenParser,
};

fn main() {
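    // Build a small grammar: a gen node over the regex ".*" (with "\n" as its
    // stop pattern), followed by a literal newline.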
    let mut builder = GrammarBuilder::new();

    builder.add_grammar(GrammarWithLexer::default());
    let n0 = builder.gen_rx(".*", "\n");
    let n1 = builder.string("\n");
    let n2 = builder.join(&[n0, n1]);
    builder.set_start_node(n2);

    let schema = builder.finalize().unwrap();
    let obj_str = "this is\na test";

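    // Load a HuggingFace tokenizer and wrap it as a TokEnv (tokenizer + token trie).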
    let tok_env: TokEnv =
        toktrie_hf_tokenizers::ByteTokenizerEnv::from_name("microsoft/Phi-3.5-mini-instruct", None)
            .unwrap()
            .to_env();

    let tokens = tok_env.tokenize(&obj_str);

    let stderr_log_level = 2;
    let buffer_log_level = 0;

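    // Build the token parser from the grammar, declaring what the inference
    // engine supports (fast-forward tokens: yes, backtracking: no).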
    let parser = TokenParser::from_llguidance_json(
        tok_env.clone(),
        schema,
        llguidance_parser::Logger::new(buffer_log_level, stderr_log_level),
        InferenceCapabilities {
            ff_tokens: true,  // can the engine append multiple tokens?
            backtrack: false, // can the engine remove generated tokens?

            conditional_ff_tokens: false, // not used
            fork: false,                  // not used
        },
        ParserLimits::default(),
        vec![],
    )
    .unwrap();
    let mut constraint = Constraint::new(parser);

    // enable sending parser results back via the logs (constraint.flush_logs())
    constraint.log_json_progress = true;

    let trie = tok_env.tok_trie();

    eprintln!("Parsing tokens: {}", trie.tokens_dbg(&tokens));

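    // Drive the constraint over the pre-tokenized input: compute a token mask,
    // "sample" the next token (here simply taken from the known token list),
    // and commit it; a commit may fast-forward over several tokens at once.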
    let mut idx = 0;
    while idx < tokens.len() {
        let res = constraint.compute_mask().unwrap();

        if res.is_stop() {
            // stop sequence
            break;
        }

        let sampled_token = if let Some(mask) = &res.sample_mask {
            // Simulate sampling - it should use the mask and temperature
            black_box(mask);
            black_box(constraint.temperature);
            let sampled_token = tokens[idx];

            let p_stats = constraint.parser.last_step_stats();
            println!(
                "SAMPLE {}: {} {}; stats: {} lex, {} items, {} us",
                idx,
                sampled_token,
                tok_env.tok_trie().token_dbg(sampled_token),
                p_stats.lexer_cost,
                p_stats.all_items,
                p_stats.compute_time_us,
            );
            Some(sampled_token)
        } else {
            // sampling not required
            println!("NO SAMPLE");
            None
        };

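        // Commit the (possibly absent) sampled token; the returned splice describes
        // the tokens actually consumed, which are cross-checked against the expected
        // token stream below.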
        let splice = constraint.commit_token(sampled_token).unwrap();
        if splice.stop {
            // stop sequence
            break;
        }

        assert!(splice.backtrack == 0); // we didn't allow backtracking in InferenceCaps

        // if this fails, our test data is broken
        if tokens[idx..idx + splice.ff_tokens.len()] != splice.ff_tokens {
            panic!(
                "BAD TEST: ff_tokens mismatch:\n{}\n{}",
                trie.tokens_dbg(&tokens[idx..idx + splice.ff_tokens.len()]),
                trie.tokens_dbg(&splice.ff_tokens)
            );
        }

        if splice.ff_tokens.len() > 1 {
            println!("FF: {}", trie.tokens_dbg(&splice.ff_tokens));
        }

        idx += splice.ff_tokens.len();
    }

    // the stop reason would likely also be reported back to the user
    println!("Stop reason: {:?}", constraint.parser.stop_reason());

    println!("Max step stats: {:?}", constraint.parser.max_step_stats());
}
2 changes: 1 addition & 1 deletion scripts/test-guidance.sh
@@ -39,4 +39,4 @@ else
 fi
 
 python -m pytest $PYTEST_FLAGS tests/unit/test_ll.py # main test
-python -m pytest $PYTEST_FLAGS tests/unit/test_[lgmp]*.py tests/unit/library
+python -m pytest $PYTEST_FLAGS tests/unit/test_[lgmp]*.py tests/unit/library "$@"
