Skip to content

Commit

Permalink
whisper : add whisper_token_count helper
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov committed Mar 25, 2024
1 parent 5c2c07d commit ba69578
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
4 changes: 4 additions & 0 deletions whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3731,6 +3731,10 @@ int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_to
return res.size();
}

// Count the tokens that `text` tokenizes to, without storing any of them.
// The work is delegated to whisper_tokenize() with a NULL output buffer and a
// capacity of 0; the sign of its result is flipped before it is returned.
int whisper_token_count(struct whisper_context * ctx, const char * text) {
    const int n_reported = whisper_tokenize(ctx, text, NULL, 0);

    return -n_reported;
}

int whisper_lang_max_id() {
auto max_id = 0;
for (const auto & kv : g_lang) {
Expand Down
6 changes: 5 additions & 1 deletion whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,10 @@ extern "C" {
whisper_token * tokens,
int n_max_tokens);

// Return the number of tokens in the provided text
// Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
//   ctx  - context forwarded unchanged to whisper_tokenize()
//   text - text whose tokens are counted; no tokens are written anywhere
WHISPER_API int whisper_token_count(struct whisper_context * ctx, const char * text);

// Largest language id (i.e. number of available languages - 1)
WHISPER_API int whisper_lang_max_id();

Expand Down Expand Up @@ -504,7 +508,7 @@ extern "C" {
// tokens to provide to the whisper decoder as initial prompt
// these are prepended to any existing text context from a previous call
// use whisper_tokenize() to convert text to tokens
// maximum of whisper_n_text_ctx()/2 tokens are used
// maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
const char * initial_prompt;
const whisper_token * prompt_tokens;
int prompt_n_tokens;
Expand Down

0 comments on commit ba69578

Please sign in to comment.