Skip to content

Commit

Permalink
Fix #609 to ensure token consistency
Browse files Browse the repository at this point in the history
  • Loading branch information
slundberg committed Jan 31, 2024
1 parent 513966f commit 4187419
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 7 deletions.
22 changes: 16 additions & 6 deletions guidance/_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,29 @@
from . import _serialization_pb2
from . import _parser

# To support the embedding of guidance functions inside Python f-strings we
# represent a call as an opaque tag in the string, delimited as {{G|...|G}}.
tag_start = "{{G|"  # start of a call tag
tag_end = "|G}}"    # end of a call tag
_call_pool = {}     # the functions associated with the call tags, keyed by the tag's inner id
# Pattern matching a full call tag; group(1) captures the tag's inner id
# (anything except "|", which would be ambiguous with the delimiters).
_tag_pattern = re.compile(re.escape(tag_start) + r"([^\|]+)" + re.escape(tag_end))

class StatefulException(Exception):
    '''This is raised when we try and use the state of a grammar object like it was a live model.

    Note that eventually it would be nice to support stateful parser/grammar constructs directly, but
    such "parser combinators" cannot be run efficiently in Python. So we use a traditional parser and
    grammar separation (hence the need for this exception).'''
    pass

class Function():
''' This is the abstract class representing all guidance functions.
There are two main subclasses: GrammarFunction and RawFunction. GrammarFunctions
represent guidance grammars that can be serialized and sent across the wire, while
RawFunctions represent unconstrained native Python functions.
'''

def __init__(self, name, value=None) -> None:
self.name = name
self.value = value
Expand Down
3 changes: 2 additions & 1 deletion guidance/models/transformers/_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def _model_and_tokenizer(self, model, tokenizer, **kwargs):
return model, tokenizer

def _joint_tokenize(self, token_ids):
first_decode = self.tokenizer._orig_tokenizer.decode(token_ids)
# first_decode = self.tokenizer._orig_tokenizer.decode(token_ids)
first_decode = b''.join([self.tokenizer.tokens[id] for id in token_ids]).decode("utf8")
new_ids = self.tokenizer._orig_tokenizer(first_decode, add_special_tokens=False)["input_ids"]

# HACK: check for a bug in the HuggingFace tokenizer (that will just add extra spaces during an encode-decode cycle)
Expand Down

0 comments on commit 4187419

Please sign in to comment.