From 2676819cb599072d83b0d4d09d1d773967e3c7b4 Mon Sep 17 00:00:00 2001
From: bobqianic <129547291+bobqianic@users.noreply.github.com>
Date: Tue, 16 Jan 2024 23:32:55 +0000
Subject: [PATCH] edit some comments

---
 whisper.cpp | 27 ++++++++++++---------------
 whisper.h   |  2 +-
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/whisper.cpp b/whisper.cpp
index 69844f0759d..dcf47f2bdb3 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -372,17 +372,17 @@ struct whisper_vocab {
     std::map<id, token> id_to_token;
 
     // reference: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L334-L349
-    id token_eot        = 50256; // anything before that are normal tokens
-    id token_sot        = 50257;
+    id token_eot        = 50256; // end of text, anything before that are normal tokens
+    id token_sot        = 50257; // start of transcript
     // task tokens (used only for multilingual models)
     id token_translate  = 50357;
     id token_transcribe = 50358;
     // other special tokens
     id token_solm       = 50359; // [TDRZ] used by tinydiarize models to indicate speaker turn
-    id token_prev       = 50360;
-    id token_nosp       = 50361;
+    id token_prev       = 50360; // TODO: I don't understand the meaning of this token
+    id token_nosp       = 50361; // no speech
     id token_not        = 50362; // no timestamps
-    id token_beg        = 50363; // begin timestamps
+    id token_beg        = 50363; // the first timestamp token <|0.00|>
 
     bool is_multilingual() const {
         return n_vocab >= 51865;
@@ -1261,23 +1261,23 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 if (i > vocab.token_beg) {
                     word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
                 } else if (i == vocab.token_eot) {
-                    word = "[_EOT_]";
+                    word = "[_EOT_]"; // <|endoftext|>
                 } else if (i == vocab.token_sot) {
-                    word = "[_SOT_]";
+                    word = "[_SOT_]"; // <|startoftranscript|>
                 } else if (i == vocab.token_translate) {
                     word = "[_TRANSLATE_]";
                 } else if (i == vocab.token_transcribe) {
                     word = "[_TRANSCRIBE_]";
                 } else if (i == vocab.token_solm) {
-                    word = "[_SOLM_]";
+                    word = "[_SOLM_]"; // <|startoflm|>
                 } else if (i == vocab.token_prev) {
-                    word = "[_PREV_]";
+                    word = "[_PREV_]"; // <|startofprev|>
                 } else if (i == vocab.token_nosp) {
-                    word = "[_NOSP_]";
+                    word = "[_NOSP_]"; // <|nospeech|>
                 } else if (i == vocab.token_not) {
-                    word = "[_NOT_]";
+                    word = "[_NOT_]"; // <|notimestamps|>
                 } else if (i == vocab.token_beg) {
-                    word = "[_BEG_]";
+                    word = "[_BEG_]"; // the first timestamp token <|0.00|>
                 } else if (i > vocab.token_sot && i <= vocab.token_sot + vocab.num_languages()) {
                     word = "[_LANG_" + std::string(whisper_lang_str(i - vocab.token_sot - 1)) + "]";
                 } else {
@@ -4533,7 +4533,6 @@ static void whisper_no_speech_probs(
     const auto & vocab= ctx.vocab;
     const int n_logits = vocab.id_to_token.size();
     const int logits_offset = decoder.i_batch - (prompt_init.size() - 1);
-    printf("[%i, %i, %i]\n", decoder.i_batch, (prompt_init.size() - 1), logits_offset);
     double no_speech_probs = 0.0;
 
     WHISPER_ASSERT(n_logits == ctx.vocab.n_vocab);
@@ -5353,8 +5352,6 @@ int whisper_full_with_state(
 
                 whisper_process_logits(*ctx, *state, state->decoders[0], params, t_cur);
 
-                printf("Non-speach: %f\n", state->decoders[0].sequence.no_speech_probs);
-
                 for (int j = 1; j < n_decoders_cur; ++j) {
                     auto & decoder = state->decoders[j];
 
diff --git a/whisper.h b/whisper.h
index 8a0262f594a..a321e76044a 100644
--- a/whisper.h
+++ b/whisper.h
@@ -487,7 +487,7 @@ extern "C" {
         float temperature_inc;
         float entropy_thold;   // similar to OpenAI's "compression_ratio_threshold"
         float logprob_thold;
-        float no_speech_thold; // probability of silence
+        float no_speech_thold;
 
         struct {
             int best_of;      // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264