From bf385356d1ebdfe9ab8b17fe63f3e51aac8d5627 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Thu, 31 Oct 2024 01:01:27 +0000 Subject: [PATCH 01/79] add llguidance based logits processor --- CMakeLists.txt | 6 ++ src/generators.cpp | 5 ++ src/generators.h | 4 ++ src/models/decoder_only.cpp | 2 +- src/models/logits.cpp | 36 +++++++++- src/models/logits.h | 9 ++- src/models/logits_processor.cpp | 112 ++++++++++++++++++++++++++++++++ src/models/logits_processor.h | 41 ++++++++++++ src/models/model.h | 2 + src/ort_genai.h | 4 ++ src/ort_genai_c.cpp | 8 +++ src/ort_genai_c.h | 3 + src/python/python.cpp | 6 +- test/CMakeLists.txt | 3 + 14 files changed, 237 insertions(+), 4 deletions(-) create mode 100644 src/models/logits_processor.cpp create mode 100644 src/models/logits_processor.h diff --git a/CMakeLists.txt b/CMakeLists.txt index db64f6958..588362e44 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,12 @@ target_include_directories(onnxruntime-genai-static PRIVATE ${onnxruntime_extens target_include_directories(onnxruntime-genai-static PUBLIC ${onnxruntime_extensions_SOURCE_DIR}/shared/api/) target_link_libraries(onnxruntime-genai PRIVATE onnxruntime_extensions) target_link_libraries(onnxruntime-genai-static PUBLIC onnxruntime_extensions) +target_include_directories(onnxruntime-genai PUBLIC "/home/yingxiong/projects/llguidance/parser") +target_include_directories(onnxruntime-genai-static PUBLIC "/home/yingxiong/projects/llguidance/parser") +target_link_directories(onnxruntime-genai PRIVATE "/home/yingxiong/projects/llguidance/parser/target/release") +target_link_directories(onnxruntime-genai-static PUBLIC "/home/yingxiong/projects/llguidance/parser/target/release") +target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser) +target_link_libraries(onnxruntime-genai-static PUBLIC llguidance_parser) target_link_directories(onnxruntime-genai PRIVATE ${ORT_LIB_DIR}) # we keep the shared libraries disconnected on Android as they will come from separate AARs and we don't want to force diff --git a/src/generators.cpp b/src/generators.cpp index 3ab83f988..93cc51323 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -244,6 +244,11 @@ void GeneratorParams::SetInputs(const NamedTensors& named_tensors) { } } +void GeneratorParams::SetGuidance(std::string_view type, std::string_view data) { + guidance_type = type; + guidance_data = data; +} + std::unique_ptr CreateGenerator(const Model& model, const GeneratorParams& params) { return std::make_unique(model, params); } diff --git a/src/generators.h b/src/generators.h index 2dde56c32..366877b25 100644 --- a/src/generators.h +++ b/src/generators.h @@ -98,6 +98,10 @@ struct GeneratorParams : std::enable_shared_from_this, LeakChec void SetInputs(const NamedTensors& inputs); + std::string guidance_type; + std::string guidance_data; + void SetGuidance(std::string_view type, std::string_view data); + private: bool is_cuda_graph_enabled_{}; }; diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp index e91ea14cf..6baf8712a 100644 --- a/src/models/decoder_only.cpp +++ b/src/models/decoder_only.cpp @@ -40,7 +40,7 @@ void DecoderOnly_State::UpdateInputsOutputs(const RoamingArray& next_to input_ids_.Update(next_tokens_unk); position_inputs_.Update(current_length); kv_cache_.Update(beam_indices.GetCPU(), current_length); - logits_.Update(); + logits_.Update(next_tokens_unk); } } // namespace Generators diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 0e333e150..5414835bb 100644 --- a/src/models/logits.cpp 
+++ b/src/models/logits.cpp @@ -31,6 +31,11 @@ Logits::Logits(State& state) cudaMemcpyAsync(cuda_eos_token_ids_.data(), cpu_ids.data(), cpu_ids.size() * sizeof(int32_t), ::cudaMemcpyHostToDevice, model_.cuda_stream_); } #endif + if (!state_.params_->guidance_type.empty() && !state_.params_->guidance_data.empty()) { + auto tokenizer = model_.CreateTokenizer(); + constrained_logits_processor_ = std::make_unique(model_.config_->model.vocab_size, model_.config_->model.eos_token_id, + state_.params_->guidance_type, state_.params_->guidance_data, tokenizer); + } } #pragma warning(push) @@ -157,6 +162,11 @@ RoamingArray Logits::Get() { assert(shape_[1] == 1); + std::vector logits_mask; + if (constrained_logits_processor_) { + logits_mask = constrained_logits_processor_->ComputeMask(); + } + #if USE_CUDA if (model_.device_type_ == DeviceType::CUDA) { auto batched_logits_gpu = gpu_span{logits_of_last_token->GetTensorMutableData(), element_count}; @@ -194,12 +204,20 @@ RoamingArray Logits::Get() { auto batched_logits_cpu = cpu_span{logits_of_last_token->GetTensorMutableData(), element_count}; HandleEOSArray(batched_logits_cpu); + if (!logits_mask.empty()) { + AddMask(batched_logits_cpu, logits_mask); + } return batched_logits_cpu; } #pragma warning(pop) -void Logits::Update() { +void Logits::Update(RoamingArray next_tokens_unk) { + if (constrained_logits_processor_) { + auto next_tokens = next_tokens_unk.GetCPU(); + constrained_logits_processor_->CommitTokens(static_cast(next_tokens[0])); + } + if (output_raw_.get()->GetTensorTypeAndShapeInfo()->GetShape()[1] == 1) { return; } @@ -230,6 +248,22 @@ void Logits::HandleEOSArray(cpu_span batched_logits) { } } +void Logits::AddMask(cpu_span logits, std::vector mask) { + size_t vocab_size = shape_[2]; + size_t vocab_index = 0; + + for (int index = 0; index < shape_[0]; index++) { + auto logits_span = logits.subspan(vocab_index, vocab_size); + for (size_t i = 0; i < vocab_size; i++) { + // if (mask[i / 32] & (1 << (i % 32))){ + // printf("TEST: allowed token %d \n", i); + // } + logits_span[i] = mask[i / 32] & (1 << (i % 32)) ? logits_span[i] : std::numeric_limits::lowest(); + } + vocab_index += vocab_size; + } +} + void Logits::Add() { output_index_ = state_.outputs_.size(); diff --git a/src/models/logits.h b/src/models/logits.h index 49b3a827f..0d5a9e970 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -2,7 +2,10 @@ // Licensed under the MIT License. 
#pragma once +#include +#include "model.h" #include "static_buffer.h" +#include "logits_processor.h" namespace Generators { @@ -12,11 +15,13 @@ struct Logits { void Add(); RoamingArray Get(); - void Update(); + void Update(RoamingArray next_tokens_unk=RoamingArray{}); private: void HandleEOSArray(cpu_span logits); + void AddMask(cpu_span logits, std::vector mask); + State& state_; const Model& model_{state_.model_}; size_t output_index_{~0U}; @@ -35,6 +40,8 @@ struct Logits { StaticBuffer* sb_logits32_{}; StaticBuffer* sb_logits16_{}; + std::unique_ptr constrained_logits_processor_; + #if USE_CUDA cuda_unique_ptr cuda_eos_token_ids_ptr_; // eos_token_ids from params, but in cuda accessible memory gpu_span cuda_eos_token_ids_; diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp new file mode 100644 index 000000000..0b84fa157 --- /dev/null +++ b/src/models/logits_processor.cpp @@ -0,0 +1,112 @@ +#include +#include +#include +#include +#include +#include + +#include "llguidance.h" + +#include "logits_processor.h" + +namespace Generators { + +ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t eos_token, + const std::string& guidance_type, const std::string& guidance_data, + std::shared_ptr tokenizer) + : tokenizer_(std::move(tokenizer)), vocab_size_(vocab_size) { + if (guidance_type.empty() || guidance_data.empty()) { + throw std::runtime_error("Guidance type and data must be provided"); + } + + if (guidance_type != "json_schema" && guidance_type != "regex" && guidance_type != "grammar") { + throw std::runtime_error("Unsupported guidance type: " + guidance_type); + } + + std::vector tokens; + std::vector token_lens; + for (int i = 0; i < vocab_size; i++) { + std::vector ids = {i}; + std::string token = tokenizer_->Decode(ids); + token_lens.push_back(token.size()); + for (char c : token) { + tokens.push_back(c); + } + } + + LlgTokenizeFn tokenizer_fn = [](const void* user_data, const uint8_t* bytes, + size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) -> unsigned long { + std::string input_string = "\x02"; + input_string.reserve(bytes_len); + for (size_t i = 0; i < bytes_len; i++) { + input_string.push_back(bytes[i]); + } + const Tokenizer* tokenizer = reinterpret_cast(user_data); + std::vector output_ids = tokenizer->Encode(input_string.c_str()); + std::string prefix = "\x02"; + std::vector prefix_ids = tokenizer->Encode(prefix.c_str()); + auto prefix_len = prefix_ids.size(); + + size_t output_size = std::min(output_tokens_len, output_ids.size() - prefix_len); + for (size_t i = 0; i < output_size; i++) { + output_tokens[i] = output_ids[prefix_len + i]; + } + return output_ids.size(); + }; + + LlgTokenizerInit tokenizer_init = { + .vocab_size = static_cast(vocab_size), + .tok_eos = eos_token, + .token_lens = token_lens.data(), + .token_bytes = tokens.data(), + .tokenize_assumes_string = true, + .tokenize_fn = tokenizer_fn, + .tokenize_user_data = tokenizer_.get(), + }; + + llg_tokenizer_ = std::unique_ptr(llg_new_tokenizer(&tokenizer_init)); + + LlgConstraintInit constraint_init; + llg_constraint_init_set_defaults(&constraint_init, llg_tokenizer_.get()); + LlgConstraint* constraint_ptr; + if (guidance_type == "json_schema") { + constraint_ptr = llg_new_constraint_json(&constraint_init, guidance_data.c_str()); + } else if (guidance_type == "regex") { + constraint_ptr = llg_new_constraint_regex(&constraint_init, guidance_data.c_str()); + } else { + constraint_ptr = llg_new_constraint(&constraint_init, 
guidance_data.c_str()); + } + if (llg_get_error(constraint_ptr) != nullptr) { + std::string error_message = llg_get_error(constraint_ptr); + llg_free_constraint(constraint_ptr); + throw std::runtime_error("Error creating grammar: " + error_message); + } + llg_constraint_ = std::unique_ptr(constraint_ptr); +} + +std::vector ConstrainedLogitsProcessor::ComputeMask() { + // LlgMaskResult mask_result; + auto error = llg_compute_mask(llg_constraint_.get(), &mask_result_); + if (error != 0) { + std::string error_message = llg_get_error(llg_constraint_.get()); + throw std::runtime_error("Error computing mask: " + error_message); + } + + std::vector mask; + mask.reserve((vocab_size_ - 1) / 32 + 1); + for (int i = 0; i < (vocab_size_ - 1) / 32 + 1; i++) { + mask.push_back(mask_result_.sample_mask[i]); + } + return mask; +} + +void ConstrainedLogitsProcessor::CommitTokens(uint32_t token) { + LlgCommitResult commit_result; + auto error = llg_commit_token(llg_constraint_.get(), token, &commit_result); + if (error != 0) { + std::string error_message = llg_get_error(llg_constraint_.get()); + throw std::runtime_error("Error committing tokens: " + error_message); + } +} + +} // namespace Generators \ No newline at end of file diff --git a/src/models/logits_processor.h b/src/models/logits_processor.h new file mode 100644 index 000000000..d7abb0274 --- /dev/null +++ b/src/models/logits_processor.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include "model.h" + +namespace Generators { + +struct LogitsProcessor { + virtual std::vector ComputeMask() = 0; + virtual void CommitTokens(uint32_t token) = 0; +}; + +struct LlgConstraintDeleter { + void operator()(LlgConstraint* lc) const { + llg_free_constraint(lc); + } +}; + +struct LlgTokenizerDeleter { + void operator()(LlgTokenizer* lt) const { + llg_free_tokenizer(lt); + } +}; + +struct ConstrainedLogitsProcessor : public LogitsProcessor { + ConstrainedLogitsProcessor(int vocab_size, uint32_t eos_token, const std::string& guidance_type, const std::string& guidance_data, std::shared_ptr tokenizer); + std::vector ComputeMask() override; + void CommitTokens(uint32_t token) override; + + size_t vocab_size_; + std::unique_ptr llg_constraint_; + std::unique_ptr llg_tokenizer_; + std::shared_ptr tokenizer_; + LlgMaskResult mask_result_; +}; +} // namespace Generators \ No newline at end of file diff --git a/src/models/model.h b/src/models/model.h index 1a6b1b0f0..a296d3bfe 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -48,6 +48,8 @@ struct State { std::vector input_names_, output_names_; std::vector adapter_names_; + std::string guided_data_; + std::string guided_type_; std::vector inputs_, outputs_; protected: diff --git a/src/ort_genai.h b/src/ort_genai.h index beffd0730..53e048879 100644 --- a/src/ort_genai.h +++ b/src/ort_genai.h @@ -222,6 +222,10 @@ struct OgaGeneratorParams : OgaAbstract { OgaCheckResult(OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(this, max_batch_size)); } + void SetGuidance(const char* type, const char* data) { + OgaCheckResult(OgaGeneratorParamsSetGuidance(this, type, data)); + } + static void operator delete(void* p) { OgaDestroyGeneratorParams(reinterpret_cast(p)); } }; diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 629d6e962..a4381bf63 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -232,6 +232,14 @@ OgaResult* OGA_API_CALL OgaGeneratorParamsSetWhisperInputFeatures(OgaGeneratorPa OGA_CATCH } +OgaResult* OGA_API_CALL 
OgaGeneratorParamsSetGuidance(OgaGeneratorParams* oga_params, const char* type, const char* data) { + OGA_TRY + auto& params = *reinterpret_cast(oga_params); + params.SetGuidance(type, data); + return nullptr; + OGA_CATCH +} + OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { OGA_TRY auto result = Generators::Generate(*reinterpret_cast(model), *reinterpret_cast(generator_params)); diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index e4e72fe6f..304104bad 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -225,6 +225,9 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetModelInput(OgaGeneratorP OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetWhisperInputFeatures(OgaGeneratorParams*, OgaTensor* tensor); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetGuidance(OgaGeneratorParams*, const char* type, const char* data); + + /* * \brief Creates a generator from the given model and generator params. * \param[in] model The model to use for generation. diff --git a/src/python/python.cpp b/src/python/python.cpp index 9bf4836f9..c86bbec49 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -294,6 +294,9 @@ struct PyGeneratorParams { params_->TryGraphCapture(max_batch_size.cast()); } + void SetGuidance(const std::string& type, const std::string& data) { + params_->SetGuidance(type, data); + } pybind11::array_t py_input_ids_; pybind11::array py_whisper_input_features_; pybind11::array py_alignment_heads_; @@ -425,7 +428,8 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def("set_model_input", &PyGeneratorParams::SetModelInput) .def("set_search_options", &PyGeneratorParams::SetSearchOptions) // See config.h 'struct Search' for the options .def("try_use_cuda_graph_with_max_batch_size", &PyGeneratorParams::TryUseCudaGraphWithMaxBatchSize) // will be deprecated - .def("try_graph_capture_with_max_batch_size", &PyGeneratorParams::TryGraphCaptureWithMaxBatchSize); + .def("try_graph_capture_with_max_batch_size", &PyGeneratorParams::TryGraphCaptureWithMaxBatchSize) + .def("set_guidance", &PyGeneratorParams::SetGuidance); pybind11::class_(m, "TokenizerStream") .def("decode", [](TokenizerStream& t, int32_t token) { return t.Decode(token); }); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 75070cbd7..0789459c4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -13,6 +13,7 @@ add_executable(unit_tests model_tests.cpp sampling_tests.cpp sampling_benchmark.cpp + logits_processor_tests.cpp ) target_include_directories(unit_tests PRIVATE @@ -21,6 +22,7 @@ target_include_directories(unit_tests PRIVATE ) target_link_directories(unit_tests PRIVATE ${ORT_LIB_DIR}) +target_link_directories(unit_tests PRIVATE "/home/yingxiong/projects/llguidance/parser/target/release") target_link_libraries(unit_tests PRIVATE onnxruntime-genai-static GTest::gtest_main @@ -56,3 +58,4 @@ if (NOT MSVC) endif() include(GoogleTest) + From c151d52a32cf3585d96623d2bd3622fdf8bb4f82 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Thu, 31 Oct 2024 01:12:08 +0000 Subject: [PATCH 02/79] add unit test --- test/logits_processor_tests.cpp | 68 +++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 test/logits_processor_tests.cpp diff --git a/test/logits_processor_tests.cpp b/test/logits_processor_tests.cpp new file mode 100644 index 000000000..37ec418e7 --- /dev/null +++ b/test/logits_processor_tests.cpp @@ -0,0 +1,68 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef MODEL_PATH +#define MODEL_PATH "../../test/test_models/" +#endif +#ifndef PHI2_PATH +#if USE_CUDA +#define PHI2_PATH MODEL_PATH "phi-2/int4/cuda" +#else +#define PHI2_PATH MODEL_PATH "phi-2/int4/cpu" +#endif +#endif +#ifndef SCHEMA_PATH +#define SCHEMA_PATH MODEL_PATH "grammars/blog.schema.json" +#endif + +std::string read_file(const char* filePath) { + std::ifstream file(filePath); + std::stringstream buffer; + buffer << file.rdbuf(); + return buffer.str(); +} + +TEST(LogitsProcessorTests, TestRegex) { + std::string regex = "answer: .*"; + std::string text = "\x02" + std::string("answer: I am a robot"); + // auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), "/home/yingxiong/projects/onnxruntime-genai/phi3_cpu_test"); + auto tokenizer = model->CreateTokenizer(); + auto processor = std::make_unique(model->config_->model.vocab_size, + model->config_->model.eos_token_id, "regex", + regex, tokenizer); + auto target_ids = tokenizer->Encode(text.c_str()); + std::vector tids(target_ids.begin() + 2, target_ids.end()); + for (auto id : tids) { + auto mask = processor->ComputeMask(); + processor->CommitTokens(id); + } +} + +TEST(LogitsProcessorTests, TestJsonSchema) { + std::string json_schema = read_file(MODEL_PATH "grammars/blog.schema.json"); + std::string text = "\x02" + read_file(MODEL_PATH "grammars/blog.sample.json"); + // auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), "/home/yingxiong/projects/onnxruntime-genai/phi3_cpu_test"); + auto tokenizer = model->CreateTokenizer(); + auto processor = std::make_unique(model->config_->model.vocab_size, + model->config_->model.eos_token_id, "json_schema", + json_schema, tokenizer); + auto target_ids = tokenizer->Encode(text.c_str()); + std::vector tids(target_ids.begin() + 2, target_ids.end()); + for (auto id : tids) { + std::cout << id << std::endl; + auto mask = processor->ComputeMask(); + processor->CommitTokens(id); + } +} From 9d5a8a023509028f84014066a663c4b645944e30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Moskal?= Date: Thu, 31 Oct 2024 20:44:52 -0700 Subject: [PATCH 03/79] constrained decoding fixes (#1023) @Taka152 --- src/models/logits_processor.cpp | 58 ++++++++++++++++++++++----------- src/models/logits_processor.h | 3 ++ test/logits_processor_tests.cpp | 16 ++++----- 3 files changed, 50 insertions(+), 27 deletions(-) diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp index 0b84fa157..5bc228cbd 100644 --- a/src/models/logits_processor.cpp +++ b/src/models/logits_processor.cpp @@ -11,6 +11,19 @@ namespace Generators { +std::vector tokenize_partial(const Tokenizer* tokenizer, const uint8_t* bytes, + size_t bytes_len) { + std::string input_string = "\x02"; + input_string.reserve(bytes_len + 2); + for (size_t i = 0; i < bytes_len; i++) { + input_string.push_back(bytes[i]); + } + std::vector output_ids = tokenizer->Encode(input_string.c_str()); + std::vector prefix_ids = tokenizer->Encode("\x02"); + auto prefix_len = prefix_ids.size(); // TODO cache this somewhere? 
+ return std::vector(output_ids.begin() + prefix_len, output_ids.end()); +} + ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t eos_token, const std::string& guidance_type, const std::string& guidance_data, std::shared_ptr tokenizer) @@ -23,37 +36,42 @@ ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t throw std::runtime_error("Unsupported guidance type: " + guidance_type); } + std::unordered_map token_id_to_byte; + for (int i = 0x00; i <= 0xFF; ++i) { + char byte_str[10]; + snprintf(byte_str, sizeof(byte_str), "<0x%02X>", i); + auto token = tokenizer_->TokenToTokenId(byte_str); + if (token > 0) + token_id_to_byte[token] = static_cast(i); + } + std::vector tokens; std::vector token_lens; for (int i = 0; i < vocab_size; i++) { std::vector ids = {i}; - std::string token = tokenizer_->Decode(ids); - token_lens.push_back(token.size()); - for (char c : token) { - tokens.push_back(c); + if (token_id_to_byte.find(i) != token_id_to_byte.end()) { + tokens.push_back(token_id_to_byte[i]); + token_lens.push_back(1); + } else { + std::string token = tokenizer_->Decode(ids); + token_lens.push_back(token.size()); + for (char c : token) { + tokens.push_back(c); + } } } LlgTokenizeFn tokenizer_fn = [](const void* user_data, const uint8_t* bytes, size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) -> unsigned long { - std::string input_string = "\x02"; - input_string.reserve(bytes_len); - for (size_t i = 0; i < bytes_len; i++) { - input_string.push_back(bytes[i]); - } - const Tokenizer* tokenizer = reinterpret_cast(user_data); - std::vector output_ids = tokenizer->Encode(input_string.c_str()); - std::string prefix = "\x02"; - std::vector prefix_ids = tokenizer->Encode(prefix.c_str()); - auto prefix_len = prefix_ids.size(); - - size_t output_size = std::min(output_tokens_len, output_ids.size() - prefix_len); + auto output_ids = tokenize_partial(reinterpret_cast(user_data), bytes, bytes_len); + size_t output_size = std::min(output_tokens_len, output_ids.size()); for (size_t i = 0; i < output_size; i++) { - output_tokens[i] = output_ids[prefix_len + i]; + output_tokens[i] = output_ids[i]; } return output_ids.size(); }; + // TODO reuse the tokenizer between constraints LlgTokenizerInit tokenizer_init = { .vocab_size = static_cast(vocab_size), .tok_eos = eos_token, @@ -68,6 +86,7 @@ ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t LlgConstraintInit constraint_init; llg_constraint_init_set_defaults(&constraint_init, llg_tokenizer_.get()); + // constraint_init.log_stderr_level = 2; LlgConstraint* constraint_ptr; if (guidance_type == "json_schema") { constraint_ptr = llg_new_constraint_json(&constraint_init, guidance_data.c_str()); @@ -78,8 +97,9 @@ ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t } if (llg_get_error(constraint_ptr) != nullptr) { std::string error_message = llg_get_error(constraint_ptr); - llg_free_constraint(constraint_ptr); - throw std::runtime_error("Error creating grammar: " + error_message); + auto error = std::runtime_error("Error creating grammar: " + error_message); + llg_free_constraint(constraint_ptr); // only free constraint, after we have saved the error message + throw error; } llg_constraint_ = std::unique_ptr(constraint_ptr); } diff --git a/src/models/logits_processor.h b/src/models/logits_processor.h index d7abb0274..f0d2447db 100644 --- a/src/models/logits_processor.h +++ b/src/models/logits_processor.h @@ -10,6 +10,9 @@ namespace Generators { 
+std::vector tokenize_partial(const Tokenizer* tokenizer, const uint8_t* bytes, + size_t bytes_len); + struct LogitsProcessor { virtual std::vector ComputeMask() = 0; virtual void CommitTokens(uint32_t token) = 0; diff --git a/test/logits_processor_tests.cpp b/test/logits_processor_tests.cpp index 37ec418e7..63313a13c 100644 --- a/test/logits_processor_tests.cpp +++ b/test/logits_processor_tests.cpp @@ -34,16 +34,15 @@ std::string read_file(const char* filePath) { TEST(LogitsProcessorTests, TestRegex) { std::string regex = "answer: .*"; - std::string text = "\x02" + std::string("answer: I am a robot"); + std::string text = "answer: I am a robot"; // auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto model = Generators::CreateModel(Generators::GetOrtEnv(), "/home/yingxiong/projects/onnxruntime-genai/phi3_cpu_test"); auto tokenizer = model->CreateTokenizer(); auto processor = std::make_unique(model->config_->model.vocab_size, model->config_->model.eos_token_id, "regex", regex, tokenizer); - auto target_ids = tokenizer->Encode(text.c_str()); - std::vector tids(target_ids.begin() + 2, target_ids.end()); - for (auto id : tids) { + auto target_ids = tokenize_partial(tokenizer.get(), reinterpret_cast(text.c_str()), text.size()); + for (auto id : target_ids) { auto mask = processor->ComputeMask(); processor->CommitTokens(id); } @@ -51,16 +50,17 @@ TEST(LogitsProcessorTests, TestRegex) { TEST(LogitsProcessorTests, TestJsonSchema) { std::string json_schema = read_file(MODEL_PATH "grammars/blog.schema.json"); - std::string text = "\x02" + read_file(MODEL_PATH "grammars/blog.sample.json"); + std::string text = read_file(MODEL_PATH "grammars/blog.sample.json"); // auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto model = Generators::CreateModel(Generators::GetOrtEnv(), "/home/yingxiong/projects/onnxruntime-genai/phi3_cpu_test"); + // auto model = Generators::CreateModel(Generators::GetOrtEnv(), "/Users/mimoskal/ai/onnxruntime-genai/examples/c/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"); + auto tokenizer = model->CreateTokenizer(); auto processor = std::make_unique(model->config_->model.vocab_size, model->config_->model.eos_token_id, "json_schema", json_schema, tokenizer); - auto target_ids = tokenizer->Encode(text.c_str()); - std::vector tids(target_ids.begin() + 2, target_ids.end()); - for (auto id : tids) { + auto target_ids = tokenize_partial(tokenizer.get(), reinterpret_cast(text.c_str()), text.size()); + for (auto id : target_ids) { std::cout << id << std::endl; auto mask = processor->ComputeMask(); processor->CommitTokens(id); From 48c3e96a508575f3d636e2a2edac63a57c3b5930 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 1 Nov 2024 07:05:38 +0000 Subject: [PATCH 04/79] add test grammars --- .gitignore | 1 + test/test_models/grammars/blog.sample.json | 20 ++++++++ test/test_models/grammars/blog.schema.json | 54 ++++++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 test/test_models/grammars/blog.sample.json create mode 100644 test/test_models/grammars/blog.schema.json diff --git a/.gitignore b/.gitignore index 5ee2887de..eb916f66a 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ examples/csharp/HelloPhi/models !test/test_models/hf-internal-testing/ !test/test_models/hf-internal-testing/tiny-random-gpt2*/*.onnx +!test/test_models/grammars/ .ipynb_checkpoints/ /src/java/.gradle diff --git 
a/test/test_models/grammars/blog.sample.json b/test/test_models/grammars/blog.sample.json new file mode 100644 index 000000000..592499a6e --- /dev/null +++ b/test/test_models/grammars/blog.sample.json @@ -0,0 +1,20 @@ +{ + "title": "New Blog Post", + "content": "This is the content of the blog post...", + "publishedDate": "2023-08-25T15:00:00Z", + "author": { + "username": "authoruser", + "email": "author@example.com", + "fullName": "Author User", + "age": 30, + "location": "Earth", + "interests": [ + "Technology", + "Foo" + ] + }, + "tags": [ + "Technology", + "Programming" + ] +} \ No newline at end of file diff --git a/test/test_models/grammars/blog.schema.json b/test/test_models/grammars/blog.schema.json new file mode 100644 index 000000000..11e042c29 --- /dev/null +++ b/test/test_models/grammars/blog.schema.json @@ -0,0 +1,54 @@ +{ + "description": "A representation of a blog post", + "type": "object", + "required": [ + "title", + "content", + "author" + ], + "additionalProperties": false, + "properties": { + "title": { + "type": "string" + }, + "content": { + "type": "string" + }, + "publishedDate": { + "type": "string" + }, + "author": { + "type": "object", + "properties": { + "username": { + "type": "string" + }, + "email": { + "type": "string" + }, + "fullName": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "location": { + "type": "string" + }, + "interests": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + } + } +} \ No newline at end of file From d70b849a5a15b7472bb824c6410ade539dc0ff19 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 1 Nov 2024 09:59:53 +0000 Subject: [PATCH 05/79] support cuda --- src/cuda/interface.cpp | 4 ++++ src/cuda/interface.h | 1 + src/cuda/model_kernels.cu | 16 +++++++++++++++ src/generators.cpp | 1 + src/models/kernels.h | 1 + src/models/logits.cpp | 35 ++++++++++++++++++++++++--------- src/models/logits.h | 6 +++++- src/models/logits_processor.cpp | 15 ++++++++------ src/models/logits_processor.h | 2 +- test/logits_processor_tests.cpp | 1 - 10 files changed, 64 insertions(+), 18 deletions(-) diff --git a/src/cuda/interface.cpp b/src/cuda/interface.cpp index e1a38f0a9..8f38655d5 100644 --- a/src/cuda/interface.cpp +++ b/src/cuda/interface.cpp @@ -118,6 +118,10 @@ struct CudaInterfaceImpl : CudaInterface { cuda::LaunchHandleEOSArray(batch_logits, batch_beam_size, vocab_size, eos_token_ids, eos_token_ids_count, stream); } + void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream) override { + cuda::LaunchAddLogitsMask(batch_logits, batch_beam_size, vocab_size, logits_mask, stream); + } + void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, int beam_width, int input_seq_length, int max_seq_length, int current_length, cudaStream_t stream) override { cuda::UpdateCacheIndirectionKernelLauncher(tgt_indir_cache, src_indir_cache, beam_ids, batch_size, beam_width, input_seq_length, max_seq_length, current_length, stream); } diff --git a/src/cuda/interface.h b/src/cuda/interface.h index 346e72041..47903eea2 100644 --- a/src/cuda/interface.h +++ b/src/cuda/interface.h @@ -31,6 +31,7 @@ struct CudaInterface : DeviceInterface { virtual void Launch_UpdateAttentionMask(int32_t* mask_data, const int32_t* old_mask_data, int batch_beam_size, int current_length, int 
max_length, bool update_only, cudaStream_t stream) = 0; virtual void Launch_UpdateAttentionMask(int64_t* mask_data, const int64_t* old_mask_data, int batch_beam_size, int current_length, int max_length, bool update_only, cudaStream_t stream) = 0; virtual void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_size, const int32_t* eos_token_ids, int eos_token_ids_count, cudaStream_t stream) = 0; + virtual void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream) = 0; virtual void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, int beam_width, int input_seq_length, int max_seq_length, int current_length, cudaStream_t stream) = 0; virtual void ReorderPastStatesKernelLauncher(void* out_buffer, const void* in_buffer, int batch_size, int num_heads, int max_length, int head_size, int chunk_size, cudaStream_t stream) = 0; virtual void LaunchCopyCrossQKSingleDecodeStep(cudaStream_t stream, float* cross_qk_buffer_data, float** qk_layer_pointers, int token_index, int batch_beam_size, int num_layers, int num_heads, int num_alignment_heads, const int* alignment_heads, int frames, int max_length) = 0; diff --git a/src/cuda/model_kernels.cu b/src/cuda/model_kernels.cu index c93e20497..4b2bf14db 100644 --- a/src/cuda/model_kernels.cu +++ b/src/cuda/model_kernels.cu @@ -84,6 +84,22 @@ void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_si HandleEOSArray<<<(batch_beam_size + 255) / 256, 256, 0, stream>>>(batch_logits, batch_beam_size, vocab_size, eos_token_ids, eos_token_ids_count); } +__global__ void AddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= batch_beam_size * vocab_size) + return; + int batch_index = index / vocab_size; + int vocab_index = index % vocab_size; + if (!(logits_mask[vocab_index / 32] & (1 << (vocab_index % 32)))) + batch_logits[index] = std::numeric_limits::lowest(); +} + +void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream){ + int block_size = 256; + int num_blocks = (batch_beam_size * vocab_size + block_size - 1) / block_size; + AddLogitsMask<<<num_blocks, block_size, 0, stream>>>(batch_logits, batch_beam_size, vocab_size, logits_mask); +} + __global__ void ConvertFp16ToFp32(const half* src, float* dst, int count) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < count) diff --git a/src/generators.cpp b/src/generators.cpp index 93cc51323..3e3785e39 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -176,6 +176,7 @@ void Launch_UpdateAttentionMask(int32_t* mask_data, const int32_t* old_ template <> void Launch_UpdateAttentionMask(int64_t* mask_data, const int64_t* old_mask_data, int batch_beam_size, int current_length, int max_length, bool update_only, cudaStream_t stream) { GetCudaInterface()->Launch_UpdateAttentionMask(mask_data, old_mask_data, batch_beam_size, current_length, max_length, update_only, stream); } void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_size, const int32_t* eos_token_ids, int eos_token_ids_count, cudaStream_t stream) { GetCudaInterface()->LaunchHandleEOSArray(batch_logits, batch_beam_size, vocab_size, eos_token_ids, eos_token_ids_count, stream); } +void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const
uint32_t* logits_mask, cudaStream_t stream){ GetCudaInterface()->LaunchAddLogitsMask(batch_logits, batch_beam_size, vocab_size, logits_mask, stream); } void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, int beam_width, int input_seq_length, int max_seq_length, int current_length, cudaStream_t stream) { GetCudaInterface()->UpdateCacheIndirectionKernelLauncher(tgt_indir_cache, src_indir_cache, beam_ids, batch_size, beam_width, input_seq_length, max_seq_length, current_length, stream); } void ReorderPastStatesKernelLauncher(void* out_buffer, const void* in_buffer, int batch_size, int num_heads, int max_length, int head_size, int chunk_size, cudaStream_t stream) { GetCudaInterface()->ReorderPastStatesKernelLauncher(out_buffer, in_buffer, batch_size, num_heads, max_length, head_size, chunk_size, stream); } template <> diff --git a/src/models/kernels.h b/src/models/kernels.h index 442172696..251268624 100644 --- a/src/models/kernels.h +++ b/src/models/kernels.h @@ -12,6 +12,7 @@ void Launch_UpdateAttentionMask(T* mask_data, const T* old_mask_data, int batch_ int max_length, bool update_only, cudaStream_t stream); void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_size, const int32_t* eos_token_ids, int eos_token_ids_count, cudaStream_t stream); +void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream); void LaunchFp16ToFp32(const uint16_t* fp16, float* fp32, int count, cudaStream_t stream); void LaunchFp32ToFp16(const float* fp32, uint16_t* fp16, int count, cudaStream_t stream); diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 5414835bb..34cbfa3c7 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -2,11 +2,12 @@ // Licensed under the MIT License. 
#include "../generators.h" #include "model.h" -#include "logits.h" + #if USE_CUDA #include "kernels.h" #endif +#include "logits.h" namespace Generators { Logits::Logits(State& state) @@ -35,6 +36,11 @@ Logits::Logits(State& state) auto tokenizer = model_.CreateTokenizer(); constrained_logits_processor_ = std::make_unique(model_.config_->model.vocab_size, model_.config_->model.eos_token_id, state_.params_->guidance_type, state_.params_->guidance_data, tokenizer); +#if USE_CUDA + if (model_.device_type_ == DeviceType::CUDA) { + cuda_logits_mask_ptr_ = CudaMallocArray(shape_[0] * shape_[2]); + } +#endif } } @@ -162,9 +168,8 @@ RoamingArray Logits::Get() { assert(shape_[1] == 1); - std::vector logits_mask; if (constrained_logits_processor_) { - logits_mask = constrained_logits_processor_->ComputeMask(); + logits_mask_ = constrained_logits_processor_->ComputeMask(); } #if USE_CUDA @@ -178,6 +183,11 @@ RoamingArray Logits::Get() { cuda_eos_token_ids_.data(), static_cast(cuda_eos_token_ids_.size()), model_.cuda_stream_); + + if (!logits_mask_.empty()) { + cudaMemcpyAsync(cuda_logits_mask_ptr_.get(), logits_mask_.data(), logits_mask_.size() * sizeof(uint32_t), ::cudaMemcpyHostToDevice, model_.cuda_stream_); + AddMask(batched_logits_gpu, cuda_logits_mask_ptr_.get()); + } return batched_logits_gpu; } #elif USE_DML @@ -198,14 +208,17 @@ RoamingArray Logits::Get() { auto batched_logits_cpu = cpu_span{cpu_tensor, element_count}; HandleEOSArray(batched_logits_cpu); + if (!logits_mask_.empty()) { + AddMask(batched_logits_cpu, logits_mask_); + } return batched_logits_cpu; } #endif auto batched_logits_cpu = cpu_span{logits_of_last_token->GetTensorMutableData(), element_count}; HandleEOSArray(batched_logits_cpu); - if (!logits_mask.empty()) { - AddMask(batched_logits_cpu, logits_mask); + if (!logits_mask_.empty()) { + AddMask(batched_logits_cpu, logits_mask_); } return batched_logits_cpu; } @@ -248,22 +261,26 @@ void Logits::HandleEOSArray(cpu_span batched_logits) { } } -void Logits::AddMask(cpu_span logits, std::vector mask) { +void Logits::AddMask(cpu_span logits, std::vector& mask) { size_t vocab_size = shape_[2]; size_t vocab_index = 0; for (int index = 0; index < shape_[0]; index++) { auto logits_span = logits.subspan(vocab_index, vocab_size); for (size_t i = 0; i < vocab_size; i++) { - // if (mask[i / 32] & (1 << (i % 32))){ - // printf("TEST: allowed token %d \n", i); - // } logits_span[i] = mask[i / 32] & (1 << (i % 32)) ? 
logits_span[i] : std::numeric_limits::lowest(); } vocab_index += vocab_size; } } +#if USE_CUDA +void Logits::AddMask(gpu_span logits, const uint32_t* mask) { + cuda::LaunchAddLogitsMask(logits.data(), static_cast(shape_[0]), + static_cast(shape_[2]), mask, model_.cuda_stream_); +} +#endif + void Logits::Add() { output_index_ = state_.outputs_.size(); diff --git a/src/models/logits.h b/src/models/logits.h index 0d5a9e970..a4d89446e 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -20,7 +20,7 @@ struct Logits { private: void HandleEOSArray(cpu_span logits); - void AddMask(cpu_span logits, std::vector mask); + void AddMask(cpu_span logits, std::vector& mask); State& state_; const Model& model_{state_.model_}; @@ -41,10 +41,14 @@ struct Logits { StaticBuffer* sb_logits16_{}; std::unique_ptr constrained_logits_processor_; + std::vector logits_mask_; #if USE_CUDA cuda_unique_ptr cuda_eos_token_ids_ptr_; // eos_token_ids from params, but in cuda accessible memory gpu_span cuda_eos_token_ids_; + cuda_unique_ptr cuda_logits_mask_ptr_; + void AddMask(gpu_span logits, const uint32_t* mask); + #endif #if USE_DML diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp index 5bc228cbd..ae858b0aa 100644 --- a/src/models/logits_processor.cpp +++ b/src/models/logits_processor.cpp @@ -82,11 +82,14 @@ ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t .tokenize_user_data = tokenizer_.get(), }; - llg_tokenizer_ = std::unique_ptr(llg_new_tokenizer(&tokenizer_init)); + char error_buf[128]; + llg_tokenizer_ = std::unique_ptr(llg_new_tokenizer(&tokenizer_init, error_buf, sizeof(error_buf))); + if (!llg_tokenizer_) { + throw std::runtime_error("Error creating tokenizer: " + std::string(error_buf)); + } LlgConstraintInit constraint_init; llg_constraint_init_set_defaults(&constraint_init, llg_tokenizer_.get()); - // constraint_init.log_stderr_level = 2; LlgConstraint* constraint_ptr; if (guidance_type == "json_schema") { constraint_ptr = llg_new_constraint_json(&constraint_init, guidance_data.c_str()); @@ -98,15 +101,15 @@ ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t if (llg_get_error(constraint_ptr) != nullptr) { std::string error_message = llg_get_error(constraint_ptr); auto error = std::runtime_error("Error creating grammar: " + error_message); - llg_free_constraint(constraint_ptr); // only free constraint, after we have saved the error message + llg_free_constraint(constraint_ptr); // only free constraint, after we have saved the error message throw error; } llg_constraint_ = std::unique_ptr(constraint_ptr); } std::vector ConstrainedLogitsProcessor::ComputeMask() { - // LlgMaskResult mask_result; - auto error = llg_compute_mask(llg_constraint_.get(), &mask_result_); + LlgMaskResult mask_result; + auto error = llg_compute_mask(llg_constraint_.get(), &mask_result); if (error != 0) { std::string error_message = llg_get_error(llg_constraint_.get()); throw std::runtime_error("Error computing mask: " + error_message); @@ -115,7 +118,7 @@ std::vector ConstrainedLogitsProcessor::ComputeMask() { std::vector mask; mask.reserve((vocab_size_ - 1) / 32 + 1); for (int i = 0; i < (vocab_size_ - 1) / 32 + 1; i++) { - mask.push_back(mask_result_.sample_mask[i]); + mask.push_back(mask_result.sample_mask[i]); } return mask; } diff --git a/src/models/logits_processor.h b/src/models/logits_processor.h index f0d2447db..d3d3c02c6 100644 --- a/src/models/logits_processor.h +++ b/src/models/logits_processor.h @@ -6,6 +6,7 @@ #include 
#include + #include "model.h" namespace Generators { @@ -39,6 +40,5 @@ struct ConstrainedLogitsProcessor : public LogitsProcessor { std::unique_ptr llg_constraint_; std::unique_ptr llg_tokenizer_; std::shared_ptr tokenizer_; - LlgMaskResult mask_result_; }; } // namespace Generators \ No newline at end of file diff --git a/test/logits_processor_tests.cpp b/test/logits_processor_tests.cpp index 63313a13c..5888b9496 100644 --- a/test/logits_processor_tests.cpp +++ b/test/logits_processor_tests.cpp @@ -61,7 +61,6 @@ TEST(LogitsProcessorTests, TestJsonSchema) { json_schema, tokenizer); auto target_ids = tokenize_partial(tokenizer.get(), reinterpret_cast(text.c_str()), text.size()); for (auto id : target_ids) { - std::cout << id << std::endl; auto mask = processor->ComputeMask(); processor->CommitTokens(id); } From 6b90c1cd2044a57d71de98e18f28b33a97ba357e Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Mon, 4 Nov 2024 10:01:12 +0000 Subject: [PATCH 06/79] use tokenizer.json to generate token_bytes --- src/models/logits.cpp | 3 +- src/models/logits.h | 3 +- src/models/logits_processor.cpp | 82 +++++++++++++-------------------- src/models/logits_processor.h | 18 ++++++-- test/logits_processor_tests.cpp | 17 ++++--- 5 files changed, 56 insertions(+), 67 deletions(-) diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 34cbfa3c7..6bac3c865 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -35,7 +35,8 @@ Logits::Logits(State& state) if (!state_.params_->guidance_type.empty() && !state_.params_->guidance_data.empty()) { auto tokenizer = model_.CreateTokenizer(); constrained_logits_processor_ = std::make_unique(model_.config_->model.vocab_size, model_.config_->model.eos_token_id, - state_.params_->guidance_type, state_.params_->guidance_data, tokenizer); + state_.params_->guidance_type, state_.params_->guidance_data, tokenizer, + model_.config_->config_path.string().c_str()); #if USE_CUDA if (model_.device_type_ == DeviceType::CUDA) { cuda_logits_mask_ptr_ = CudaMallocArray(shape_[0] * shape_[2]); diff --git a/src/models/logits.h b/src/models/logits.h index a4d89446e..1233cdd98 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -15,7 +15,7 @@ struct Logits { void Add(); RoamingArray Get(); - void Update(RoamingArray next_tokens_unk=RoamingArray{}); + void Update(RoamingArray next_tokens_unk = RoamingArray{}); private: void HandleEOSArray(cpu_span logits); diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp index ae858b0aa..470f968d1 100644 --- a/src/models/logits_processor.cpp +++ b/src/models/logits_processor.cpp @@ -1,9 +1,10 @@ #include #include +#include #include +#include #include #include -#include #include "llguidance.h" #include "logits_processor.h" namespace Generators { -std::vector tokenize_partial(const Tokenizer* tokenizer, const uint8_t* bytes, - size_t bytes_len) { - std::string input_string = "\x02"; - input_string.reserve(bytes_len + 2); - for (size_t i = 0; i < bytes_len; i++) { - input_string.push_back(bytes[i]); - } - std::vector output_ids = tokenizer->Encode(input_string.c_str()); - std::vector prefix_ids = tokenizer->Encode("\x02"); - auto prefix_len = prefix_ids.size(); // TODO cache this somewhere?
- return std::vector(output_ids.begin() + prefix_len, output_ids.end()); -} - ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t eos_token, const std::string& guidance_type, const std::string& guidance_data, - std::shared_ptr tokenizer) + std::shared_ptr tokenizer, const std::string& tokenizer_path) : tokenizer_(std::move(tokenizer)), vocab_size_(vocab_size) { if (guidance_type.empty() || guidance_data.empty()) { throw std::runtime_error("Guidance type and data must be provided"); @@ -36,34 +24,10 @@ ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t throw std::runtime_error("Unsupported guidance type: " + guidance_type); } - std::unordered_map token_id_to_byte; - for (int i = 0x00; i <= 0xFF; ++i) { - char byte_str[10]; - snprintf(byte_str, sizeof(byte_str), "<0x%02X>", i); - auto token = tokenizer_->TokenToTokenId(byte_str); - if (token > 0) - token_id_to_byte[token] = static_cast(i); - } - - std::vector tokens; - std::vector token_lens; - for (int i = 0; i < vocab_size; i++) { - std::vector ids = {i}; - if (token_id_to_byte.find(i) != token_id_to_byte.end()) { - tokens.push_back(token_id_to_byte[i]); - token_lens.push_back(1); - } else { - std::string token = tokenizer_->Decode(ids); - token_lens.push_back(token.size()); - for (char c : token) { - tokens.push_back(c); - } - } - } - - LlgTokenizeFn tokenizer_fn = [](const void* user_data, const uint8_t* bytes, - size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) -> unsigned long { - auto output_ids = tokenize_partial(reinterpret_cast(user_data), bytes, bytes_len); + LlgTokenizeFn tokenize_fn = [](const void* user_data, const uint8_t* bytes, + size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) -> unsigned long { + const TokenizeData* tokenize_data = reinterpret_cast(user_data); + auto output_ids = tokenize_partial(reinterpret_cast(tokenize_data->tokenizer), tokenize_data->prefix_len, bytes, bytes_len); size_t output_size = std::min(output_tokens_len, output_ids.size()); for (size_t i = 0; i < output_size; i++) { output_tokens[i] = output_ids[i]; @@ -72,20 +36,25 @@ ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t }; // TODO reuse the tokenizer between constraints + fs::path tokenizer_path_fs(tokenizer_path); + fs::path json_path(tokenizer_path_fs / kDefaultVocabFile); + std::ifstream json_file(json_path.string()); + std::stringstream json_buffer; + json_buffer << json_file.rdbuf(); + std::string json_data = json_buffer.str(); + auto prefix_len = tokenizer_->Encode(kTokenizePrefixStr).size(); + tokenize_data_ = {tokenizer_.get(), prefix_len}; LlgTokenizerInit tokenizer_init = { - .vocab_size = static_cast(vocab_size), .tok_eos = eos_token, - .token_lens = token_lens.data(), - .token_bytes = tokens.data(), - .tokenize_assumes_string = true, - .tokenize_fn = tokenizer_fn, - .tokenize_user_data = tokenizer_.get(), + .tokenizer_json = json_data.c_str(), + .tokenize_fn = tokenize_fn, + .tokenize_user_data = &tokenize_data_, }; char error_buf[128]; llg_tokenizer_ = std::unique_ptr(llg_new_tokenizer(&tokenizer_init, error_buf, sizeof(error_buf))); if (!llg_tokenizer_) { - throw std::runtime_error("Error creating tokenizer: " + std::string(error_buf)); + throw std::runtime_error("Error creating llg_tokenizer: " + std::string(error_buf)); } LlgConstraintInit constraint_init; @@ -101,7 +70,7 @@ ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t if (llg_get_error(constraint_ptr) != nullptr) { 
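+    // copy the message out first: llg_free_constraint() below also releases the buffer that llg_get_error() points into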
std::string error_message = llg_get_error(constraint_ptr); auto error = std::runtime_error("Error creating grammar: " + error_message); - llg_free_constraint(constraint_ptr); // only free constraint, after we have saved the error message + llg_free_constraint(constraint_ptr); throw error; } llg_constraint_ = std::unique_ptr(constraint_ptr); @@ -132,4 +101,15 @@ void ConstrainedLogitsProcessor::CommitTokens(uint32_t token) { } } +std::vector ConstrainedLogitsProcessor::tokenize_partial(const Tokenizer* tokenizer, const size_t prefix_len, + const uint8_t* bytes, size_t bytes_len) { + std::string input_string = kTokenizePrefixStr; + input_string.reserve(bytes_len + 2); + for (size_t i = 0; i < bytes_len; i++) { + input_string.push_back(bytes[i]); + } + std::vector output_ids = tokenizer->Encode(input_string.c_str()); + return std::vector(output_ids.begin() + prefix_len, output_ids.end()); +} + } // namespace Generators \ No newline at end of file diff --git a/src/models/logits_processor.h b/src/models/logits_processor.h index d3d3c02c6..7c32488bd 100644 --- a/src/models/logits_processor.h +++ b/src/models/logits_processor.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -11,9 +12,6 @@ namespace Generators { -std::vector tokenize_partial(const Tokenizer* tokenizer, const uint8_t* bytes, - size_t bytes_len); - struct LogitsProcessor { virtual std::vector ComputeMask() = 0; virtual void CommitTokens(uint32_t token) = 0; @@ -32,7 +30,10 @@ struct LlgTokenizerDeleter { }; struct ConstrainedLogitsProcessor : public LogitsProcessor { - ConstrainedLogitsProcessor(int vocab_size, uint32_t eos_token, const std::string& guidance_type, const std::string& guidance_data, std::shared_ptr tokenizer); + static constexpr const char* kDefaultVocabFile = "tokenizer.json"; + static constexpr const char* kTokenizePrefixStr = "\x02"; + + ConstrainedLogitsProcessor(int vocab_size, uint32_t eos_token, const std::string& guidance_type, const std::string& guidance_data, std::shared_ptr tokenizer, const std::string& tokenizer_path); std::vector ComputeMask() override; void CommitTokens(uint32_t token) override; @@ -40,5 +41,14 @@ struct ConstrainedLogitsProcessor : public LogitsProcessor { std::unique_ptr llg_constraint_; std::unique_ptr llg_tokenizer_; std::shared_ptr tokenizer_; + + static std::vector tokenize_partial(const Tokenizer* tokenizer, const size_t prefix_len, + const uint8_t* bytes, size_t bytes_len); + + struct TokenizeData { + Tokenizer* tokenizer; + size_t prefix_len; + }; + TokenizeData tokenize_data_; }; } // namespace Generators \ No newline at end of file diff --git a/test/logits_processor_tests.cpp b/test/logits_processor_tests.cpp index 5888b9496..81312746d 100644 --- a/test/logits_processor_tests.cpp +++ b/test/logits_processor_tests.cpp @@ -35,13 +35,13 @@ std::string read_file(const char* filePath) { TEST(LogitsProcessorTests, TestRegex) { std::string regex = "answer: .*"; std::string text = "answer: I am a robot"; - // auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); - auto model = Generators::CreateModel(Generators::GetOrtEnv(), "/home/yingxiong/projects/onnxruntime-genai/phi3_cpu_test"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto tokenizer = model->CreateTokenizer(); auto processor = std::make_unique(model->config_->model.vocab_size, model->config_->model.eos_token_id, "regex", - regex, tokenizer); - auto target_ids 
= tokenize_partial(tokenizer.get(), reinterpret_cast(text.c_str()), text.size()); + regex, tokenizer, model->config_->config_path.string().c_str()); + auto target_ids = Generators::ConstrainedLogitsProcessor::tokenize_partial(tokenizer.get(), tokenizer->Encode(Generators::ConstrainedLogitsProcessor::kTokenizePrefixStr).size(), + reinterpret_cast(text.c_str()), text.size()); for (auto id : target_ids) { auto mask = processor->ComputeMask(); processor->CommitTokens(id); @@ -51,15 +51,14 @@ TEST(LogitsProcessorTests, TestRegex) { TEST(LogitsProcessorTests, TestJsonSchema) { std::string json_schema = read_file(MODEL_PATH "grammars/blog.schema.json"); std::string text = read_file(MODEL_PATH "grammars/blog.sample.json"); - // auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); - auto model = Generators::CreateModel(Generators::GetOrtEnv(), "/home/yingxiong/projects/onnxruntime-genai/phi3_cpu_test"); - // auto model = Generators::CreateModel(Generators::GetOrtEnv(), "/Users/mimoskal/ai/onnxruntime-genai/examples/c/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto tokenizer = model->CreateTokenizer(); auto processor = std::make_unique(model->config_->model.vocab_size, model->config_->model.eos_token_id, "json_schema", - json_schema, tokenizer); - auto target_ids = tokenize_partial(tokenizer.get(), reinterpret_cast(text.c_str()), text.size()); + json_schema, tokenizer, model->config_->config_path.string().c_str()); + auto target_ids = Generators::ConstrainedLogitsProcessor::tokenize_partial(tokenizer.get(), tokenizer->Encode(Generators::ConstrainedLogitsProcessor::kTokenizePrefixStr).size(), + reinterpret_cast(text.c_str()), text.size()); for (auto id : target_ids) { auto mask = processor->ComputeMask(); processor->CommitTokens(id); From bdb9ca4d7408e12628df7a5e2506b560b51cc6e1 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Tue, 5 Nov 2024 10:27:11 +0000 Subject: [PATCH 07/79] fix win build --- CMakeLists.txt | 4 ++++ src/cuda/model_kernels.cu | 4 ++-- src/models/logits_processor.cpp | 21 +++++++++++++-------- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 588362e44..dafb0f35f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,6 +102,10 @@ target_link_directories(onnxruntime-genai PRIVATE "/home/yingxiong/projects/llgu target_link_directories(onnxruntime-genai-static PUBLIC "/home/yingxiong/projects/llguidance/parser/target/release") target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser) target_link_libraries(onnxruntime-genai-static PUBLIC llguidance_parser) +if (WIN32) + target_link_libraries(onnxruntime-genai PRIVATE ws2_32 ntdll bcrypt userenv) + target_link_libraries(onnxruntime-genai-static PRIVATE ws2_32 ntdll bcrypt userenv) +endif() target_link_directories(onnxruntime-genai PRIVATE ${ORT_LIB_DIR}) # we keep the shared libraries disconnected on Android as they will come from separate AARs and we don't want to force diff --git a/src/cuda/model_kernels.cu b/src/cuda/model_kernels.cu index 4b2bf14db..84178f16f 100644 --- a/src/cuda/model_kernels.cu +++ b/src/cuda/model_kernels.cu @@ -90,11 +90,11 @@ __global__ void AddLogitsMask(float* batch_logits, int batch_beam_size, int voca return; int batch_index = index / vocab_size; int vocab_index = index % vocab_size; - if (!(logits_mask[vocab_index / 32] & (1 << (vocab_index % 32)))) + 
if (!(logits_mask[vocab_index / 32] & (1 << (vocab_index % 32)))) + if (!(logits_mask[(batch_index * vocab_size + vocab_index) / 32] & (1 << (vocab_index % 32)))) batch_logits[index] = std::numeric_limits::lowest(); } -void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream){ +void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream) { int block_size = 256; int num_blocks = (batch_beam_size * vocab_size + block_size - 1) / block_size; AddLogitsMask<<<num_blocks, block_size, 0, stream>>>(batch_logits, batch_beam_size, vocab_size, logits_mask); diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp index 470f968d1..12b17a86e 100644 --- a/src/models/logits_processor.cpp +++ b/src/models/logits_processor.cpp @@ -15,7 +15,7 @@ namespace Generators { ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t eos_token, const std::string& guidance_type, const std::string& guidance_data, std::shared_ptr tokenizer, const std::string& tokenizer_path) - : tokenizer_(std::move(tokenizer)), vocab_size_(vocab_size) { + : vocab_size_(vocab_size), tokenizer_(std::move(tokenizer)) { if (guidance_type.empty() || guidance_data.empty()) { throw std::runtime_error("Guidance type and data must be provided"); } @@ -24,15 +24,15 @@ ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t throw std::runtime_error("Unsupported guidance type: " + guidance_type); } - LlgTokenizeFn tokenize_fn = [](const void* user_data, const uint8_t* bytes, - size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) -> unsigned long { + auto tokenize_fn = (LlgTokenizeFn) + [](const void* user_data, const uint8_t* bytes, + size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) -> unsigned long { const TokenizeData* tokenize_data = reinterpret_cast(user_data); auto output_ids = tokenize_partial(reinterpret_cast(tokenize_data->tokenizer), tokenize_data->prefix_len, bytes, bytes_len); size_t output_size = std::min(output_tokens_len, output_ids.size()); for (size_t i = 0; i < output_size; i++) { output_tokens[i] = output_ids[i]; } - return output_ids.size(); + return static_cast(output_ids.size()); }; // TODO reuse the tokenizer between constraints @@ -45,10 +45,15 @@ ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t auto prefix_len = tokenizer_->Encode(kTokenizePrefixStr).size(); tokenize_data_ = {tokenizer_.get(), prefix_len}; LlgTokenizerInit tokenizer_init = { - .tok_eos = eos_token, - .tokenizer_json = json_data.c_str(), - .tokenize_fn = tokenize_fn, - .tokenize_user_data = &tokenize_data_, + static_cast(vocab_size_), + eos_token, + nullptr, + nullptr, + json_data.c_str(), + false, + tokenize_fn, + false, + &tokenize_data_, }; char error_buf[128]; From a25de8ef44edf5d196c9b71e35376251ec405970 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 06:36:36 +0000 Subject: [PATCH 08/79] async compute mask --- src/models/logits.cpp | 4 +++- src/models/logits.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 6bac3c865..25ba4e7ae 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -37,6 +37,7 @@ Logits::Logits(State& state) constrained_logits_processor_ = std::make_unique(model_.config_->model.vocab_size, model_.config_->model.eos_token_id, state_.params_->guidance_type, state_.params_->guidance_data, tokenizer, model_.config_->config_path.string().c_str()); + mask_future_ =
std::async(std::launch::async, [&]() { return constrained_logits_processor_->ComputeMask(); }); #if USE_CUDA if (model_.device_type_ == DeviceType::CUDA) { cuda_logits_mask_ptr_ = CudaMallocArray(shape_[0] * shape_[2]); @@ -170,7 +171,7 @@ RoamingArray Logits::Get() { assert(shape_[1] == 1); if (constrained_logits_processor_) { - logits_mask_ = constrained_logits_processor_->ComputeMask(); + logits_mask_ = mask_future_.get(); } #if USE_CUDA @@ -230,6 +231,7 @@ void Logits::Update(RoamingArray next_tokens_unk) { if (constrained_logits_processor_) { auto next_tokens = next_tokens_unk.GetCPU(); constrained_logits_processor_->CommitTokens(static_cast(next_tokens[0])); + mask_future_ = std::async(std::launch::async, [&]() { return constrained_logits_processor_->ComputeMask(); }); } if (output_raw_.get()->GetTensorTypeAndShapeInfo()->GetShape()[1] == 1) { diff --git a/src/models/logits.h b/src/models/logits.h index 1233cdd98..382f3b88b 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -3,6 +3,7 @@ #pragma once #include +#include #include "model.h" #include "static_buffer.h" #include "logits_processor.h" @@ -41,6 +42,7 @@ struct Logits { StaticBuffer* sb_logits16_{}; std::unique_ptr constrained_logits_processor_; + std::future> mask_future_; std::vector logits_mask_; #if USE_CUDA From edc0baed7905b7ab43b7a1ccb76ad3ec05c925d0 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 10:48:02 +0000 Subject: [PATCH 09/79] add llguidance build in cmake --- CMakeLists.txt | 12 ++++++++---- cmake/deps.txt | 1 + cmake/external/onnxruntime_external_deps.cmake | 15 +++++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dafb0f35f..a453e4f93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,10 +96,14 @@ target_include_directories(onnxruntime-genai-static PRIVATE ${onnxruntime_extens target_include_directories(onnxruntime-genai-static PUBLIC ${onnxruntime_extensions_SOURCE_DIR}/shared/api/) target_link_libraries(onnxruntime-genai PRIVATE onnxruntime_extensions) target_link_libraries(onnxruntime-genai-static PUBLIC onnxruntime_extensions) -target_include_directories(onnxruntime-genai PUBLIC "/home/yingxiong/projects/llguidance/parser") -target_include_directories(onnxruntime-genai-static PUBLIC "/home/yingxiong/projects/llguidance/parser") -target_link_directories(onnxruntime-genai PRIVATE "/home/yingxiong/projects/llguidance/parser/target/release") -target_link_directories(onnxruntime-genai-static PUBLIC "/home/yingxiong/projects/llguidance/parser/target/release") +# target_include_directories(onnxruntime-genai PUBLIC "/home/yingxiong/projects/llguidance/parser") +# target_include_directories(onnxruntime-genai-static PUBLIC "/home/yingxiong/projects/llguidance/parser") +# target_link_directories(onnxruntime-genai PRIVATE "/home/yingxiong/projects/llguidance/parser/target/release") +# target_link_directories(onnxruntime-genai-static PUBLIC "/home/yingxiong/projects/llguidance/parser/target/release") +# add_dependencies(onnxruntime-genai llguidance_parser) +# add_dependencies(onnxruntime-genai-static llguidance_parser) +target_include_directories(onnxruntime-genai PUBLIC ${llguidance_SOURCE_DIR}/parser/) +target_include_directories(onnxruntime-genai-static PUBLIC ${llguidance_SOURCE_DIR}/parser/) target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser) target_link_libraries(onnxruntime-genai-static PUBLIC llguidance_parser) if (WIN32) diff --git a/cmake/deps.txt b/cmake/deps.txt index 6e504c9a7..454e2eb8e 100644 --- 
a/cmake/deps.txt +++ b/cmake/deps.txt @@ -15,3 +15,4 @@ googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583e microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;2c3e936cfc3401ba7ebb79d02b9e52a50439ffc3 +llguidance;https://github.com/microsoft/llguidance.git;b298c148b5cd79407d5b17484ae87b77264b379a \ No newline at end of file diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 8dde244c7..aebe1016e 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -93,3 +93,18 @@ list(APPEND EXTERNAL_LIBRARIES ocos_operators noexcep_operators ) + +FetchContent_Declare( + Corrosion + GIT_REPOSITORY https://github.com/corrosion-rs/corrosion.git + GIT_TAG v0.5 + ) +# Set any global configuration variables such as `Rust_TOOLCHAIN` before this line! +onnxruntime_fetchcontent_makeavailable(Corrosion) +FetchContent_Declare( + llguidance + GIT_REPOSITORY ${DEP_URL_llguidance} + GIT_TAG ${DEP_SHA1_llguidance} +) +onnxruntime_fetchcontent_makeavailable(llguidance) +corrosion_import_crate(MANIFEST_PATH ${llguidance_SOURCE_DIR}/parser/Cargo.toml) From 09861d7135427899e41f31081534d6c3c7c569cd Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 19:12:28 +0800 Subject: [PATCH 10/79] update windows build --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a453e4f93..3058c54aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,8 +107,8 @@ target_include_directories(onnxruntime-genai-static PUBLIC ${llguidance_SOURCE_D target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser) target_link_libraries(onnxruntime-genai-static PUBLIC llguidance_parser) if (WIN32) - target_link_libraries(onnxruntime-genai PRIVATE ws2_32 ntdll bcrypt userenv) - target_link_libraries(onnxruntime-genai-static PRIVATE ws2_32 ntdll bcrypt userenv) + target_link_libraries(onnxruntime-genai PRIVATE bcrypt) + target_link_libraries(onnxruntime-genai-static PRIVATE bcrypt) endif() target_link_directories(onnxruntime-genai PRIVATE ${ORT_LIB_DIR}) From ee94df8d3eaf43dfbb6b6416415ae6c2c980c6f1 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 11:22:45 +0000 Subject: [PATCH 11/79] clean cmake --- CMakeLists.txt | 6 ------ test/CMakeLists.txt | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3058c54aa..f31b4a176 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,12 +96,6 @@ target_include_directories(onnxruntime-genai-static PRIVATE ${onnxruntime_extens target_include_directories(onnxruntime-genai-static PUBLIC ${onnxruntime_extensions_SOURCE_DIR}/shared/api/) target_link_libraries(onnxruntime-genai PRIVATE onnxruntime_extensions) target_link_libraries(onnxruntime-genai-static PUBLIC onnxruntime_extensions) -# target_include_directories(onnxruntime-genai PUBLIC "/home/yingxiong/projects/llguidance/parser") -# target_include_directories(onnxruntime-genai-static PUBLIC "/home/yingxiong/projects/llguidance/parser") -# target_link_directories(onnxruntime-genai PRIVATE "/home/yingxiong/projects/llguidance/parser/target/release") -# 
target_link_directories(onnxruntime-genai-static PUBLIC "/home/yingxiong/projects/llguidance/parser/target/release") -# add_dependencies(onnxruntime-genai llguidance_parser) -# add_dependencies(onnxruntime-genai-static llguidance_parser) target_include_directories(onnxruntime-genai PUBLIC ${llguidance_SOURCE_DIR}/parser/) target_include_directories(onnxruntime-genai-static PUBLIC ${llguidance_SOURCE_DIR}/parser/) target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0789459c4..c9834338b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -22,7 +22,7 @@ target_include_directories(unit_tests PRIVATE ) target_link_directories(unit_tests PRIVATE ${ORT_LIB_DIR}) -target_link_directories(unit_tests PRIVATE "/home/yingxiong/projects/llguidance/parser/target/release") +target_link_libraries(unit_tests PRIVATE llguidance_parser) target_link_libraries(unit_tests PRIVATE onnxruntime-genai-static GTest::gtest_main From 4d077cfe17c65a876d3b89cf3139e998ef71c265 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 11:45:23 +0000 Subject: [PATCH 12/79] add install rust to GHA --- .github/workflows/linux-cpu-arm64-build.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index 362331591..28006ff5c 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -29,6 +29,9 @@ jobs: with: dotnet-version: '8.0.x' + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + - name: Get the Latest OnnxRuntime Nightly Version shell: pwsh run: | From 15b20b80eb2997a1206c2ed6cc88e565af5ffb7b Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 11:52:14 +0000 Subject: [PATCH 13/79] test action --- .github/workflows/linux-cpu-arm64-build.yml | 2 -- .github/workflows/linux-cpu-x64-build.yml | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index 28006ff5c..b2bb3e618 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -29,8 +29,6 @@ jobs: with: dotnet-version: '8.0.x' - - name: Install Rust - uses: dtolnay/rust-toolchain@stable - name: Get the Latest OnnxRuntime Nightly Version shell: pwsh diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 8141bd27c..97022025b 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -27,6 +27,9 @@ jobs: with: dotnet-version: '8.0.x' + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + - name: Get the Latest OnnxRuntime Nightly Version shell: pwsh run: | From 6029510513d1c80ad169923f8433a1a25bf26e9d Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 12:00:25 +0000 Subject: [PATCH 14/79] test win cpu build action --- .github/workflows/win-cpu-x64-build.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index 131b58862..9a0fcf38a 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -41,6 +41,9 @@ jobs: with: dotnet-version: '8.0.x' + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + - name: Download OnnxRuntime Nightly shell: pwsh run: | From 4d8d8a60d7705d339fca987bd51ae6247e758955 Mon Sep 17 00:00:00 2001 
From: Ying Xiong Date: Wed, 6 Nov 2024 12:15:44 +0000 Subject: [PATCH 15/79] update win build action --- .github/workflows/win-cpu-x64-build.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index 9a0fcf38a..48d149fba 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -42,7 +42,14 @@ jobs: dotnet-version: '8.0.x' - name: Install Rust - uses: dtolnay/rust-toolchain@stable + run: | + $exePath = "$env:TEMP\rustup-init.exe" + (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) + cmd /c start /wait $exePath -y + Remove-Item $exePath + cargo --version + rustup --version + rustc --version - name: Download OnnxRuntime Nightly shell: pwsh From 346f88c00b25f9b8d1c4e7d0d959c7a561d53f6d Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 12:28:11 +0000 Subject: [PATCH 16/79] update win build action --- .github/workflows/win-cpu-x64-build.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index 48d149fba..ddbac9ad2 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -45,11 +45,8 @@ jobs: run: | $exePath = "$env:TEMP\rustup-init.exe" (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) - cmd /c start /wait $exePath -y + & $exePath -y Remove-Item $exePath - cargo --version - rustup --version - rustc --version - name: Download OnnxRuntime Nightly shell: pwsh From c00d8fa824b3a68b01dbc12453d703d529ff08f7 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 13:04:50 +0000 Subject: [PATCH 17/79] update win build action --- .github/workflows/win-cpu-x64-build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index ddbac9ad2..afe2b9bbf 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -47,6 +47,7 @@ jobs: (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) & $exePath -y Remove-Item $exePath + $env:Path = %USERPROFILE%\.cargo\bin; + $env:Path - name: Download OnnxRuntime Nightly shell: pwsh From 39fb7ed233c6381c8e64141722df07554382fb0f Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 13:30:43 +0000 Subject: [PATCH 18/79] update win build action --- .github/workflows/win-cpu-x64-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index afe2b9bbf..19ed9edf1 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -47,7 +47,7 @@ jobs: (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) & $exePath -y Remove-Item $exePath - $env:Path = %USERPROFILE%\.cargo\bin; + $env:Path + $env:Path = $env:USERPROFILE\.cargo\bin; + $env:Path - name: Download OnnxRuntime Nightly shell: pwsh From 8038723c1fc196a4e79965f744072cbfcac8d993 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 13:39:41 +0000 Subject: [PATCH 19/79] update win build action --- 
.github/workflows/win-cpu-x64-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index 19ed9edf1..bbc3fe0cd 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -47,7 +47,7 @@ jobs: (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) & $exePath -y Remove-Item $exePath - $env:Path = $env:USERPROFILE\.cargo\bin; + $env:Path + $env:Path = $env:USERPROFILE+ "\.cargo\bin;" + $env:Path - name: Download OnnxRuntime Nightly shell: pwsh From 324f550a0b1f63f356ad6c5c158bd5176be32e38 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 13:56:42 +0000 Subject: [PATCH 20/79] update win build action --- .github/workflows/win-cpu-x64-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index bbc3fe0cd..2e6efaa48 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -47,7 +47,7 @@ jobs: (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) & $exePath -y Remove-Item $exePath - $env:Path = $env:USERPROFILE+ "\.cargo\bin;" + $env:Path + Add-Content $env:GITHUB_PATH $env:USERPROFILE + "\.cargo\bin;" - name: Download OnnxRuntime Nightly shell: pwsh From 8722727721ea015914e12234d0c4ff9c7529c06f Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 6 Nov 2024 14:18:36 +0000 Subject: [PATCH 21/79] update win build action --- .github/workflows/win-cpu-x64-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index 2e6efaa48..ada2898cd 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -47,7 +47,7 @@ jobs: (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) & $exePath -y Remove-Item $exePath - Add-Content $env:GITHUB_PATH $env:USERPROFILE + "\.cargo\bin;" + Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" - name: Download OnnxRuntime Nightly shell: pwsh From d6204223a1001d20a364c0125facad7a1894cb78 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Thu, 7 Nov 2024 03:45:26 +0000 Subject: [PATCH 22/79] add rust install to workflows --- .github/workflows/android-build.yml | 3 +++ .github/workflows/linux-cpu-arm64-build.yml | 2 ++ .github/workflows/linux-cpu-x64-nightly-build.yml | 3 ++- .github/workflows/linux-gpu-x64-build.yml | 3 +++ .github/workflows/mac-cpu-arm64-build.yml | 3 +++ .github/workflows/win-cpu-arm64-build.yml | 8 ++++++++ .github/workflows/win-cuda-x64-build.yml | 10 +++++++++- .github/workflows/win-directml-x64-build.yml | 8 ++++++++ 8 files changed, 38 insertions(+), 2 deletions(-) diff --git a/.github/workflows/android-build.yml b/.github/workflows/android-build.yml index b814f4939..962483d00 100644 --- a/.github/workflows/android-build.yml +++ b/.github/workflows/android-build.yml @@ -82,6 +82,9 @@ jobs: unzip microsoft.ml.onnxruntime/${{ env.ORT_NIGHTLY_VERSION }}/runtimes/android/native/onnxruntime.aar -d ort ls -lR ort + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + - name: Create Android build run: | set -e -x diff --git a/.github/workflows/linux-cpu-arm64-build.yml 
b/.github/workflows/linux-cpu-arm64-build.yml index b2bb3e618..28006ff5c 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -29,6 +29,8 @@ jobs: with: dotnet-version: '8.0.x' + - name: Install Rust + uses: dtolnay/rust-toolchain@stable - name: Get the Latest OnnxRuntime Nightly Version shell: pwsh diff --git a/.github/workflows/linux-cpu-x64-nightly-build.yml b/.github/workflows/linux-cpu-x64-nightly-build.yml index 65573778c..6c9e32bbe 100644 --- a/.github/workflows/linux-cpu-x64-nightly-build.yml +++ b/.github/workflows/linux-cpu-x64-nightly-build.yml @@ -22,7 +22,8 @@ jobs: - name: Checkout OnnxRuntime GenAI repo uses: actions/checkout@v2 - + - name: Install Rust + uses: dtolnay/rust-toolchain@stable - name: Download OnnxRuntime run: | diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 040ddb712..b2358a02f 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -41,6 +41,9 @@ jobs: with: dotnet-version: '8.0.x' + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + - name: Get the Latest OnnxRuntime Nightly Version shell: pwsh run: | diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index 79f03240a..8531ab226 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -36,6 +36,9 @@ jobs: mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/osx-arm64/native/* ort/lib/ + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + - name: Configure CMake run: | cmake --preset macos_arm64_cpu_release diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index 9450e10df..9ad9cef7e 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -52,6 +52,14 @@ jobs: move ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-arm64/native/* ort/lib/ + - name: Install Rust + run: | + $exePath = "$env:TEMP\rustup-init.exe" + (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) + & $exePath -y + Remove-Item $exePath + Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" + - name: Configure CMake run: | python -m pip install wheel diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index bd7d53e3b..0f71e771d 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -59,7 +59,15 @@ jobs: run: | mkdir ort/lib move ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ - move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ + + - name: Install Rust + run: | + $exePath = "$env:TEMP\rustup-init.exe" + (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) + & $exePath -y + Remove-Item $exePath + Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" - name: Configure CMake run: | diff --git a/.github/workflows/win-directml-x64-build.yml b/.github/workflows/win-directml-x64-build.yml index 53cf74ed3..2600e5d23 100644 --- a/.github/workflows/win-directml-x64-build.yml +++ b/.github/workflows/win-directml-x64-build.yml @@ 
-78,6 +78,14 @@ jobs: mv $env:d3d12_dir\build\native\bin\x64\D3D12Core.dll ort\lib mv $env:dml_dir\include\DirectML.h ort\include + - name: Install Rust + run: | + $exePath = "$env:TEMP\rustup-init.exe" + (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) + & $exePath -y + Remove-Item $exePath + Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" + - name: Configure CMake run: | cmake --preset windows_x64_directml_release -DTEST_PHI2=False From c1ede01530c8cd46951108e859306ff475a45f0c Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Thu, 7 Nov 2024 08:49:37 +0000 Subject: [PATCH 23/79] support batch infer --- src/models/logits.cpp | 57 +++++++++++++++++++++++++++++++------------ src/models/logits.h | 8 +++--- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 25ba4e7ae..fa8ffcb0c 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -34,13 +34,24 @@ Logits::Logits(State& state) #endif if (!state_.params_->guidance_type.empty() && !state_.params_->guidance_data.empty()) { auto tokenizer = model_.CreateTokenizer(); - constrained_logits_processor_ = std::make_unique(model_.config_->model.vocab_size, model_.config_->model.eos_token_id, - state_.params_->guidance_type, state_.params_->guidance_data, tokenizer, - model_.config_->config_path.string().c_str()); - mask_future_ = std::async(std::launch::async, [&]() { return constrained_logits_processor_->ComputeMask(); }); + constrained_logits_processors_.resize(shape_[0]); + for (int i = 0; i < shape_[0]; i++) { + constrained_logits_processors_[i] = std::make_unique(model_.config_->model.vocab_size, model_.config_->model.eos_token_id, + state_.params_->guidance_type, state_.params_->guidance_data, tokenizer, + model_.config_->config_path.string().c_str()); + } + mask_future_ = std::async(std::launch::async, [&]() { + std::vector> result; + for (int i = 0; i < shape_[0]; i++) { + auto processor = constrained_logits_processors_.at(i).get(); + auto mask = processor->ComputeMask(); + result.push_back(std::move(mask)); + } + return result; + }); #if USE_CUDA if (model_.device_type_ == DeviceType::CUDA) { - cuda_logits_mask_ptr_ = CudaMallocArray(shape_[0] * shape_[2]); + cuda_logits_mask_ptr_ = CudaMallocArray(shape_[0] * shape_[2] / 32); } #endif } @@ -170,8 +181,8 @@ RoamingArray Logits::Get() { assert(shape_[1] == 1); - if (constrained_logits_processor_) { - logits_mask_ = mask_future_.get(); + if (!constrained_logits_processors_.empty()) { + logits_masks_ = mask_future_.get(); } #if USE_CUDA @@ -186,8 +197,11 @@ RoamingArray Logits::Get() { static_cast(cuda_eos_token_ids_.size()), model_.cuda_stream_); - if (!logits_mask_.empty()) { - cudaMemcpyAsync(cuda_logits_mask_ptr_.get(), logits_mask_.data(), logits_mask_.size() * sizeof(uint32_t), ::cudaMemcpyHostToDevice, model_.cuda_stream_); + if (!logits_masks_.empty()) { + for (int i = 0; i < logits_masks_.size(); i++) { + cudaMemcpyAsync(cuda_logits_mask_ptr_.get() + (i * shape_[2] / 32), logits_masks_.at(i).data(), + logits_masks_.at(i).size() * sizeof(uint32_t), ::cudaMemcpyHostToDevice, model_.cuda_stream_); + } AddMask(batched_logits_gpu, cuda_logits_mask_ptr_.get()); } return batched_logits_gpu; @@ -211,7 +225,7 @@ RoamingArray Logits::Get() { auto batched_logits_cpu = cpu_span{cpu_tensor, element_count}; HandleEOSArray(batched_logits_cpu); if (!logits_mask_.empty()) { - AddMask(batched_logits_cpu, logits_mask_); + 
AddMask(batched_logits_cpu, logits_masks_); } return batched_logits_cpu; } @@ -219,8 +233,8 @@ RoamingArray Logits::Get() { auto batched_logits_cpu = cpu_span{logits_of_last_token->GetTensorMutableData(), element_count}; HandleEOSArray(batched_logits_cpu); - if (!logits_mask_.empty()) { - AddMask(batched_logits_cpu, logits_mask_); + if (!logits_masks_.empty()) { + AddMask(batched_logits_cpu, logits_masks_); } return batched_logits_cpu; } @@ -228,10 +242,20 @@ RoamingArray Logits::Get() { #pragma warning(pop) void Logits::Update(RoamingArray next_tokens_unk) { - if (constrained_logits_processor_) { + if (!constrained_logits_processors_.empty()) { auto next_tokens = next_tokens_unk.GetCPU(); - constrained_logits_processor_->CommitTokens(static_cast(next_tokens[0])); - mask_future_ = std::async(std::launch::async, [&]() { return constrained_logits_processor_->ComputeMask(); }); + for (int i = 0; i < next_tokens.size(); i++) { + constrained_logits_processors_[i]->CommitTokens(static_cast(next_tokens[i])); + } + mask_future_ = std::async(std::launch::async, [&]() { + std::vector> result; + for (int i = 0; i < shape_[0]; i++) { + auto processor = constrained_logits_processors_.at(i).get(); + auto mask = processor->ComputeMask(); + result.push_back(mask); + } + return result; + }); } if (output_raw_.get()->GetTensorTypeAndShapeInfo()->GetShape()[1] == 1) { @@ -264,12 +288,13 @@ void Logits::HandleEOSArray(cpu_span batched_logits) { } } -void Logits::AddMask(cpu_span logits, std::vector& mask) { +void Logits::AddMask(cpu_span logits, std::vector>& masks) { size_t vocab_size = shape_[2]; size_t vocab_index = 0; for (int index = 0; index < shape_[0]; index++) { auto logits_span = logits.subspan(vocab_index, vocab_size); + auto& mask = masks[index]; for (size_t i = 0; i < vocab_size; i++) { logits_span[i] = mask[i / 32] & (1 << (i % 32)) ? 
logits_span[i] : std::numeric_limits::lowest(); } diff --git a/src/models/logits.h b/src/models/logits.h index 382f3b88b..df10194a1 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -21,7 +21,7 @@ struct Logits { private: void HandleEOSArray(cpu_span logits); - void AddMask(cpu_span logits, std::vector& mask); + void AddMask(cpu_span logits, std::vector>& mask); State& state_; const Model& model_{state_.model_}; @@ -41,9 +41,9 @@ struct Logits { StaticBuffer* sb_logits32_{}; StaticBuffer* sb_logits16_{}; - std::unique_ptr constrained_logits_processor_; - std::future> mask_future_; - std::vector logits_mask_; + std::vector> constrained_logits_processors_; + std::future>> mask_future_; + std::vector> logits_masks_; #if USE_CUDA cuda_unique_ptr cuda_eos_token_ids_ptr_; // eos_token_ids from params, but in cuda accessible memory From d2f47e20a7a722a1e308323e624d4eb922dbe084 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 8 Nov 2024 05:15:23 +0000 Subject: [PATCH 24/79] add corrosion to deps.txt --- cmake/deps.txt | 3 ++- cmake/external/onnxruntime_external_deps.cmake | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 454e2eb8e..de11f4d2a 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -15,4 +15,5 @@ googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583e microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;2c3e936cfc3401ba7ebb79d02b9e52a50439ffc3 -llguidance;https://github.com/microsoft/llguidance.git;b298c148b5cd79407d5b17484ae87b77264b379a \ No newline at end of file +llguidance;https://github.com/microsoft/llguidance.git;b298c148b5cd79407d5b17484ae87b77264b379a +corrosion;https://github.com/corrosion-rs/corrosion.git;64289b1d79d6d19cd2e241db515381a086bb8407 \ No newline at end of file diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index aebe1016e..c0033fd21 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -96,8 +96,8 @@ list(APPEND EXTERNAL_LIBRARIES FetchContent_Declare( Corrosion - GIT_REPOSITORY https://github.com/corrosion-rs/corrosion.git - GIT_TAG v0.5 + GIT_REPOSITORY ${DEP_URL_corrosion} + GIT_TAG ${DEP_SHA1_corrosion} ) # Set any global configuration variables such as `Rust_TOOLCHAIN` before this line! 
onnxruntime_fetchcontent_makeavailable(Corrosion) From b256e6de69382481fd703544e60bc58990797b93 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 8 Nov 2024 05:44:55 +0000 Subject: [PATCH 25/79] fix merge --- src/models/logits.cpp | 18 +++++++++--------- src/models/logits.h | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/models/logits.cpp b/src/models/logits.cpp index ac874a804..19b127b58 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -53,7 +53,7 @@ Logits::Logits(State& state) }); #if USE_CUDA if (model_.device_type_ == DeviceType::CUDA) { - cuda_logits_mask_ptr_ = CudaMallocArray(shape_[0] * shape_[2] / 32); + cuda_logits_mask_ptr_ = state_.params_->p_device->Allocate(shape_[0] * shape_[2] / 32); } #endif } @@ -198,10 +198,10 @@ DeviceSpan Logits::Get() { if (!logits_masks_.empty()) { for (int i = 0; i < logits_masks_.size(); i++) { - cudaMemcpyAsync(cuda_logits_mask_ptr_.get() + (i * shape_[2] / 32), logits_masks_.at(i).data(), + cudaMemcpyAsync(cuda_logits_mask_ptr_.Span().data() + (i * shape_[2] / 32), logits_masks_.at(i).data(), logits_masks_.at(i).size() * sizeof(uint32_t), ::cudaMemcpyHostToDevice, model_.cuda_stream_); } - AddMask(logits_.Span().data(), cuda_logits_mask_ptr_.get()); + AddMask(logits_, cuda_logits_mask_ptr_); } return logits_; } @@ -242,9 +242,9 @@ DeviceSpan Logits::Get() { #pragma warning(pop) -void Logits::Update(RoamingArray next_tokens_unk) { +void Logits::Update(DeviceSpan next_tokens_unk) { if (!constrained_logits_processors_.empty()) { - auto next_tokens = next_tokens_unk.GetCPU(); + auto next_tokens = next_tokens_unk.Span(); for (int i = 0; i < next_tokens.size(); i++) { constrained_logits_processors_[i]->CommitTokens(static_cast(next_tokens[i])); } @@ -289,7 +289,7 @@ void Logits::HandleEOSArray(std::span batched_logits) { } } -void Logits::AddMask(cpu_span logits, std::vector>& masks) { +void Logits::AddMask(std::span logits, std::vector>& masks) { size_t vocab_size = shape_[2]; size_t vocab_index = 0; @@ -304,9 +304,9 @@ void Logits::AddMask(cpu_span logits, std::vector>& } #if USE_CUDA -void Logits::AddMask(gpu_span logits, const uint32_t* mask) { - cuda::LaunchAddLogitsMask(logits.data(), static_cast(shape_[0]), - static_cast(shape_[2]), mask, model_.cuda_stream_); +void Logits::AddMask(DeviceSpan logits, DeviceSpan mask) { + cuda::LaunchAddLogitsMask(logits.Span().data(), static_cast(shape_[0]), + static_cast(shape_[2]), mask.Span().data(), model_.cuda_stream_); } #endif diff --git a/src/models/logits.h b/src/models/logits.h index 83ebd51dc..2d7ce0120 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -16,12 +16,12 @@ struct Logits { void Add(); DeviceSpan Get(); - void Update(RoamingArray next_tokens_unk = RoamingArray{}); + void Update(DeviceSpan next_tokens_unk = DeviceSpan{}); private: void HandleEOSArray(std::span logits); - void AddMask(cpu_span logits, std::vector>& mask); + void AddMask(std::span logits, std::vector>& mask); State& state_; const Model& model_{state_.model_}; @@ -51,8 +51,8 @@ struct Logits { #if USE_CUDA DeviceSpan cuda_eos_token_ids_; // eos_token_ids from params, but in cuda accessible memory - cuda_unique_ptr cuda_logits_mask_ptr_; - void AddMask(gpu_span logits, const uint32_t* mask); + DeviceSpan cuda_logits_mask_ptr_; + void AddMask(DeviceSpan logits, DeviceSpan mask); #endif #if USE_DML From e5d6dadbf5e15698c43c696239ad849a107b60a9 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 8 Nov 2024 06:50:40 +0000 Subject: [PATCH 26/79] fix bugs --- 
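Note: everything in this series assumes the llguidance mask layout of one allow-bit per vocab token, packed 32 tokens per uint32_t word. Token i is allowed iff bit (i % 32) of word (i / 32) is set; disallowed tokens get their logit forced to the lowest float so sampling can never pick them. This is the same arithmetic used by Logits::AddMask on the CPU path and by the AddLogitsMask CUDA kernel. A minimal single-row sketch of the convention (apply_packed_mask is an illustrative name, not an API added by this patch):

#include <cstdint>
#include <limits>
#include <vector>

// mask must hold at least (logits.size() + 31) / 32 words; token i survives
// iff bit (i % 32) of mask[i / 32] is set, otherwise its logit is forced to
// std::numeric_limits<float>::lowest() so it can never be sampled.
void apply_packed_mask(std::vector<float>& logits, const std::vector<uint32_t>& mask) {
  for (size_t i = 0; i < logits.size(); ++i) {
    if (!(mask[i / 32] & (1u << (i % 32))))
      logits[i] = std::numeric_limits<float>::lowest();
  }
}

For batches, the CUDA kernel indexes the packed buffer as (batch * vocab_size + token) / 32, which assumes each row starts on a 32-bit word boundary, i.e. vocab_size divisible by 32.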
.github/workflows/android-build.yml | 4 ++++ .github/workflows/linux-gpu-x64-build.yml | 4 ++++ .github/workflows/win-cpu-arm64-build.yml | 1 - .github/workflows/win-cpu-x64-build.yml | 1 - .github/workflows/win-cuda-x64-build.yml | 1 - .github/workflows/win-directml-x64-build.yml | 1 - src/generators.cpp | 2 +- src/models/logits.cpp | 2 +- src/models/model.h | 2 +- src/ort_genai_c.h | 1 - 10 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/android-build.yml b/.github/workflows/android-build.yml index 962483d00..174bff28c 100644 --- a/.github/workflows/android-build.yml +++ b/.github/workflows/android-build.yml @@ -85,6 +85,10 @@ jobs: - name: Install Rust uses: dtolnay/rust-toolchain@stable + - name: Install Rust Toolchain + run: | + rustup target add --toolchain stable-x86_64-unknown-linux-gnu x86_64-linux-android + - name: Create Android build run: | set -e -x diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 1adf8ca4f..0258b0772 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -95,8 +95,10 @@ jobs: --gpus all \ --rm \ --volume $GITHUB_WORKSPACE:/ort_genai_src \ + --volume $$CARGO_HOME:/root/.cargo \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ + export PATH=/root/.cargo/bin:$PATH && \ /usr/bin/cmake --preset linux_gcc_cuda_release \ -DMANYLINUX=ON \ -DPYTHON_EXECUTABLE=${{ env.PYTHON_EXECUTABLE }} " @@ -108,8 +110,10 @@ jobs: --gpus all \ --rm \ --volume $GITHUB_WORKSPACE:/ort_genai_src \ + --volume $$CARGO_HOME:/root/.cargo \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ + export PATH=/root/.cargo/bin:$PATH && \ /usr/bin/cmake --build --preset linux_gcc_cuda_release" - name: Use Dummy HuggingFace Token diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index 9ad9cef7e..16cc1e9cf 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -57,7 +57,6 @@ jobs: $exePath = "$env:TEMP\rustup-init.exe" (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) & $exePath -y - Remove-Item $exePath Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" - name: Configure CMake diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index c465c48df..7e03dc11b 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -46,7 +46,6 @@ jobs: $exePath = "$env:TEMP\rustup-init.exe" (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) & $exePath -y - Remove-Item $exePath Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" - name: Download OnnxRuntime Nightly diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index b16df90ea..8614c1009 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -66,7 +66,6 @@ jobs: $exePath = "$env:TEMP\rustup-init.exe" (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) & $exePath -y - Remove-Item $exePath Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" - name: Configure CMake diff --git a/.github/workflows/win-directml-x64-build.yml b/.github/workflows/win-directml-x64-build.yml index 
d991a0916..2bd019c2a 100644 --- a/.github/workflows/win-directml-x64-build.yml +++ b/.github/workflows/win-directml-x64-build.yml @@ -83,7 +83,6 @@ jobs: $exePath = "$env:TEMP\rustup-init.exe" (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) & $exePath -y - Remove-Item $exePath Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" - name: Configure CMake diff --git a/src/generators.cpp b/src/generators.cpp index 191bb0487..b3a4fe0c5 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -146,7 +146,7 @@ void Launch_UpdateAttentionMask(int32_t* mask_data, const int32_t* old_ template <> void Launch_UpdateAttentionMask(int64_t* mask_data, const int64_t* old_mask_data, int batch_beam_size, int current_length, int max_length, bool update_only, cudaStream_t stream) { GetCudaInterface()->Launch_UpdateAttentionMask(mask_data, old_mask_data, batch_beam_size, current_length, max_length, update_only, stream); } void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_size, const int32_t* eos_token_ids, int eos_token_ids_count, cudaStream_t stream) { GetCudaInterface()->LaunchHandleEOSArray(batch_logits, batch_beam_size, vocab_size, eos_token_ids, eos_token_ids_count, stream); } -void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream){ GetCudaInterface()->LaunchAddLogitsMask(batch_logits, batch_beam_size, vocab_size, logits_mask, stream); } +void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream) { GetCudaInterface()->LaunchAddLogitsMask(batch_logits, batch_beam_size, vocab_size, logits_mask, stream); } void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, int beam_width, int input_seq_length, int max_seq_length, int current_length, cudaStream_t stream) { GetCudaInterface()->UpdateCacheIndirectionKernelLauncher(tgt_indir_cache, src_indir_cache, beam_ids, batch_size, beam_width, input_seq_length, max_seq_length, current_length, stream); } void ReorderPastStatesKernelLauncher(void* out_buffer, const void* in_buffer, int batch_size, int num_heads, int max_length, int head_size, int chunk_size, cudaStream_t stream) { GetCudaInterface()->ReorderPastStatesKernelLauncher(out_buffer, in_buffer, batch_size, num_heads, max_length, head_size, chunk_size, stream); } template <> diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 19b127b58..c43470682 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -224,7 +224,7 @@ DeviceSpan Logits::Get() { auto batched_logits_cpu = cpu_span{cpu_tensor, element_count}; HandleEOSArray(batched_logits_cpu); - if (!logits_mask_.empty()) { + if (!logits_masks_.empty()) { AddMask(batched_logits_cpu, logits_masks_); } diff --git a/src/models/model.h b/src/models/model.h index 583bd06d5..5c5a3d679 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -52,7 +52,7 @@ struct State { std::vector input_names_, output_names_; std::vector adapter_names_; std::string guided_data_; - std::string guided_type_; + std::string guided_type_; std::vector inputs_, outputs_; protected: diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index bcc1d5d09..81a1f4ad7 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -259,7 +259,6 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetWhisperInputFeatures(Oga 
OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetGuidance(OgaGeneratorParams*, const char* type, const char* data); - /* * \brief Creates a generator from the given model and generator params. * \param[in] model The model to use for generation. From 2fd52d29b81a8ab8d220c1fd4103d7fe9c4ff0da Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 8 Nov 2024 07:08:25 +0000 Subject: [PATCH 27/79] update linux gpu workflow --- .github/workflows/linux-gpu-x64-build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 0258b0772..a257b7461 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -95,7 +95,7 @@ jobs: --gpus all \ --rm \ --volume $GITHUB_WORKSPACE:/ort_genai_src \ - --volume $$CARGO_HOME:/root/.cargo \ + --volume $CARGO_HOME:/root/.cargo \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ @@ -110,7 +110,7 @@ jobs: --gpus all \ --rm \ --volume $GITHUB_WORKSPACE:/ort_genai_src \ - --volume $$CARGO_HOME:/root/.cargo \ + --volume $CARGO_HOME:/root/.cargo \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ export PATH=/root/.cargo/bin:$PATH && \ From a11684b731922ea3596f0c18241fd682feacf8b6 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 8 Nov 2024 08:04:21 +0000 Subject: [PATCH 28/79] update linux gpu workflow --- .github/workflows/linux-gpu-x64-build.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index a257b7461..38f178697 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -95,10 +95,11 @@ jobs: --gpus all \ --rm \ --volume $GITHUB_WORKSPACE:/ort_genai_src \ - --volume $CARGO_HOME:/.cargo \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ export CARGO_HOME=/.cargo && \ curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y && \ /usr/bin/cmake --preset linux_gcc_cuda_release \ -DRust_COMPILER=/.cargo/bin/rustc \ -DMANYLINUX=ON \ -DPYTHON_EXECUTABLE=${{ env.PYTHON_EXECUTABLE }} " From 56663c0b6bbdc8322e5bb941cc21bf88b414b4e0 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 8 Nov 2024 08:31:54 +0000 Subject: [PATCH 29/79] update linux gpu workflow --- .github/workflows/linux-gpu-x64-build.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 38f178697..830409826 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -95,12 +95,12 @@ jobs: --gpus all \ --rm \ --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ export CARGO_HOME=/.cargo && \ curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y && \ /usr/bin/cmake --preset linux_gcc_cuda_release \ + -DRust_COMPILER=/.cargo/bin/rustc \ -DMANYLINUX=ON \ -DPYTHON_EXECUTABLE=${{ env.PYTHON_EXECUTABLE }} " @@ -115,7 +115,9 @@ jobs: -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ export PATH=/root/.cargo/bin:$PATH && \ - /usr/bin/cmake --build --preset linux_gcc_cuda_release" + curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y && \ /usr/bin/cmake --build --preset linux_gcc_cuda_release \
-DRust_COMPILER=/.cargo/bin/rustc " - name: Use Dummy HuggingFace Token run: | From 89970646ba8717f9180e8d6aa2dc44c39f87bec1 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 8 Nov 2024 08:53:53 +0000 Subject: [PATCH 30/79] update workflow --- .github/workflows/linux-gpu-x64-build.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 830409826..42c3f1bfe 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -97,10 +97,10 @@ jobs: --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - export CARGO_HOME=/.cargo && \ + export CARGO_HOME=/ort_genai_src/.cargo && \ curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y && \ /usr/bin/cmake --preset linux_gcc_cuda_release \ - -DRust_COMPILER=/.cargo/bin/rustc \ + -DRust_COMPILER=/ort_genai_src/.cargo/bin/rustc \ -DMANYLINUX=ON \ -DPYTHON_EXECUTABLE=${{ env.PYTHON_EXECUTABLE }} " @@ -111,13 +111,11 @@ jobs: --gpus all \ --rm \ --volume $GITHUB_WORKSPACE:/ort_genai_src \ - --volume $CARGO_HOME:/root/.cargo \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - export PATH=/root/.cargo/bin:$PATH && \ - curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y && \ + export CARGO_HOME=/ort_genai_src/.cargo && \ /usr/bin/cmake --build --preset linux_gcc_cuda_release \ - -DRust_COMPILER=/.cargo/bin/rustc " + -DRust_COMPILER=/ort_genai_src/.cargo/bin/rustc " - name: Use Dummy HuggingFace Token run: | From ddda727753787244a0e6f6417a6cc03d3b35562a Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 8 Nov 2024 09:15:42 +0000 Subject: [PATCH 31/79] update workflow --- .github/workflows/linux-gpu-x64-build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 42c3f1bfe..0e953111b 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -97,7 +97,7 @@ jobs: --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - export CARGO_HOME=/ort_genai_src/.cargo && \ + export CARGO_HOME=/ort_genai_src/.cargo RUSTUP_HOME=/ort_genai_src/.rustup && \ curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y && \ /usr/bin/cmake --preset linux_gcc_cuda_release \ -DRust_COMPILER=/ort_genai_src/.cargo/bin/rustc \ @@ -113,7 +113,7 @@ jobs: --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - export CARGO_HOME=/ort_genai_src/.cargo && \ + export CARGO_HOME=/ort_genai_src/.cargo RUSTUP_HOME=/ort_genai_src/.rustup && \ /usr/bin/cmake --build --preset linux_gcc_cuda_release \ -DRust_COMPILER=/ort_genai_src/.cargo/bin/rustc " From cb5577803bc88caedfd0a22edd35a037c55de4c0 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 8 Nov 2024 10:04:43 +0000 Subject: [PATCH 32/79] update workflows --- .github/workflows/linux-cpu-arm64-build.yml | 13 ++++++++----- .github/workflows/linux-gpu-x64-build.yml | 6 +----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index 28006ff5c..2abdbeb81 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -29,9 +29,6 @@ jobs: with: dotnet-version: '8.0.x' - - name: Install Rust - 
uses: dtolnay/rust-toolchain@stable - - name: Get the Latest OnnxRuntime Nightly Version shell: pwsh run: | @@ -77,13 +74,19 @@ jobs: run: | docker run --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "/usr/bin/cmake --preset linux_gcc_cpu_release" + -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c " \ + export CARGO_HOME=/onnxruntime_src/.cargo RUSTUP_HOME=/onnxruntime_src/.rustup && \ + curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y && \ + /usr/bin/cmake --preset linux_gcc_cpu_release \ + -DRust_COMPILER=/onnxruntime_src/.cargo/bin/rustc " - name: Docker -- Build with CMake and GCC run: | docker run --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "/usr/bin/cmake --build --preset linux_gcc_cpu_release" + -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c " \ + export CARGO_HOME=/onnxruntime_src/.cargo RUSTUP_HOME=/onnxruntime_src/.rustup && \ + /usr/bin/cmake --build --preset linux_gcc_cpu_release" - name: Docker -- Check test directory run: | diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 0e953111b..c34920ad0 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -40,9 +40,6 @@ jobs: - uses: actions/setup-dotnet@v4 with: dotnet-version: '8.0.x' - - - name: Install Rust - uses: dtolnay/rust-toolchain@stable - name: Get the Latest OnnxRuntime Nightly Version shell: pwsh @@ -114,8 +111,7 @@ jobs: -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ export CARGO_HOME=/ort_genai_src/.cargo RUSTUP_HOME=/ort_genai_src/.rustup && \ - /usr/bin/cmake --build --preset linux_gcc_cuda_release \ - -DRust_COMPILER=/ort_genai_src/.cargo/bin/rustc " + /usr/bin/cmake --build --preset linux_gcc_cuda_release" - name: Use Dummy HuggingFace Token run: | From 4cf5b5f768c7702ce22a0a40160c09f082bed16d Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 15 Nov 2024 03:58:31 +0000 Subject: [PATCH 33/79] add shared lib of llguidance --- CMakeLists.txt | 2 +- cmake/deps.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 73ccdc2e6..6429c3f61 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,7 +105,7 @@ target_link_libraries(onnxruntime-genai PRIVATE onnxruntime_extensions) target_link_libraries(onnxruntime-genai-static PUBLIC onnxruntime_extensions) target_include_directories(onnxruntime-genai PUBLIC ${llguidance_SOURCE_DIR}/parser/) target_include_directories(onnxruntime-genai-static PUBLIC ${llguidance_SOURCE_DIR}/parser/) -target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser) +target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser-shared) target_link_libraries(onnxruntime-genai-static PUBLIC llguidance_parser) if (WIN32) target_link_libraries(onnxruntime-genai PRIVATE bcrypt) diff --git a/cmake/deps.txt b/cmake/deps.txt index de11f4d2a..49058600a 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -15,5 +15,5 @@ googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583e microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;2c3e936cfc3401ba7ebb79d02b9e52a50439ffc3 
-llguidance;https://github.com/microsoft/llguidance.git;b298c148b5cd79407d5b17484ae87b77264b379a +llguidance;https://github.com/microsoft/llguidance.git;4dc358feef3cdf0542a5f95b5f4e92761887a25d corrosion;https://github.com/corrosion-rs/corrosion.git;64289b1d79d6d19cd2e241db515381a086bb8407 \ No newline at end of file From 18c2f6cbe000bbbcf5301b7a8d785b512b2bd483 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 15 Nov 2024 09:01:13 +0000 Subject: [PATCH 34/79] add disable_guidance option --- CMakeLists.txt | 9 +++++- build.py | 3 ++ cmake/options.cmake | 1 + src/models/logits.cpp | 52 ++++++++++++++++++++------------- src/models/logits.h | 2 +- src/models/logits_processor.cpp | 30 ++++++++++++++----- src/models/logits_processor.h | 24 +++++++++++++-- test/logits_processor_tests.cpp | 45 +++++++++++++++++++++------- 8 files changed, 123 insertions(+), 43 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6429c3f61..a7bb0ce70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,13 @@ include(cmake/check_webgpu.cmake) include(cmake/cxx_standard.cmake) add_compile_definitions(BUILDING_ORT_GENAI_C) + +if(USE_GUIDANCE) + add_compile_definitions(USE_GUIDANCE=1) +else() + add_compile_definitions(USE_GUIDANCE=0) +endif() + if(MSVC) # set updated value for __cplusplus macro instead of 199711L add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/Zc:__cplusplus>) @@ -105,7 +112,7 @@ target_link_libraries(onnxruntime-genai PRIVATE onnxruntime_extensions) target_link_libraries(onnxruntime-genai-static PUBLIC onnxruntime_extensions) target_include_directories(onnxruntime-genai PUBLIC ${llguidance_SOURCE_DIR}/parser/) target_include_directories(onnxruntime-genai-static PUBLIC ${llguidance_SOURCE_DIR}/parser/) -target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser-shared) +target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser) target_link_libraries(onnxruntime-genai-static PUBLIC llguidance_parser) if (WIN32) target_link_libraries(onnxruntime-genai PRIVATE bcrypt) diff --git a/build.py b/build.py index d8fb42867..6b7770982 100644 --- a/build.py +++ b/build.py @@ -130,6 +130,8 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.") + parser.add_argument("--disable_guidance", action="store_true", help="Whether to disable guidance (constrained decoding) support. Default is to build with guidance enabled.") + # The following options are mutually exclusive (cross compiling options such as android, ios, etc.) platform_group = parser.add_mutually_exclusive_group() platform_group.add_argument("--android", action="store_true", help="Build for Android") @@ -477,6 +479,7 @@ def update(args: argparse.Namespace, env: dict[str, str]): f"-DUSE_DML={'ON' if args.use_dml else 'OFF'}", f"-DENABLE_JAVA={'ON' if args.build_java else 'OFF'}", f"-DBUILD_WHEEL={build_wheel}", + f"-DUSE_GUIDANCE={'ON' if not args.disable_guidance else 'OFF'}", ] if args.ort_home: diff --git a/cmake/options.cmake b/cmake/options.cmake index fcb8454bb..d57ad6ac7 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -5,6 +5,7 @@ option(USE_CUDA "Build with CUDA support" ON) option(USE_ROCM "Build with ROCm support" ON) option(USE_DML "Build with DML support" OFF) option(USE_WEBGPU "Build with WEBGPU support" ON) +option(USE_GUIDANCE "Build with guidance support" ON) # bindings option(ENABLE_JAVA "Build the Java API."
OFF) diff --git a/src/models/logits.cpp b/src/models/logits.cpp index c43470682..7bea28e3e 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -36,26 +36,38 @@ Logits::Logits(State& state) #endif if (!state_.params_->guidance_type.empty() && !state_.params_->guidance_data.empty()) { auto tokenizer = model_.CreateTokenizer(); - constrained_logits_processors_.resize(shape_[0]); + LogitsProcessorConfig config = { + model_.config_->model.vocab_size, + static_cast<uint32_t>(model_.config_->model.eos_token_id), + state_.params_->guidance_type, + state_.params_->guidance_data, + tokenizer, + model_.config_->config_path.string()}; + logits_processors_.resize(shape_[0]); for (int i = 0; i < shape_[0]; i++) { - constrained_logits_processors_[i] = std::make_unique<ConstrainedLogitsProcessor>(model_.config_->model.vocab_size, model_.config_->model.eos_token_id, - state_.params_->guidance_type, state_.params_->guidance_data, tokenizer, - model_.config_->config_path.string().c_str()); + logits_processors_[i] = CreateLogitsProcessor(config); } - mask_future_ = std::async(std::launch::async, [&]() { - std::vector<std::vector<uint32_t>> result; - for (int i = 0; i < shape_[0]; i++) { - auto processor = constrained_logits_processors_.at(i).get(); - auto mask = processor->ComputeMask(); - result.push_back(std::move(mask)); - } - return result; - }); + if (logits_processors_.at(0)) { + // Computing the mask may be time-consuming, so we do it in a separate thread + mask_future_ = std::async(std::launch::async, [&]() { + std::vector<std::vector<uint32_t>> result; + for (int i = 0; i < shape_[0]; i++) { + auto processor = logits_processors_.at(i).get(); + if (processor == nullptr) { + result.push_back({}); + continue; + } + auto mask = processor->ComputeMask(); + result.push_back(std::move(mask)); + } + return result; + }); #if USE_CUDA - if (model_.device_type_ == DeviceType::CUDA) { - cuda_logits_mask_ptr_ = state_.params_->p_device->Allocate<uint32_t>(shape_[0] * shape_[2] / 32); - } + if (model_.device_type_ == DeviceType::CUDA) { + cuda_logits_mask_ptr_ = state_.params_->p_device->Allocate<uint32_t>(shape_[0] * shape_[2] / 32); + } #endif + } } } @@ -181,7 +193,7 @@ DeviceSpan<float> Logits::Get() { if (logits_.empty() || logits_of_last_token->GetTensorMutableRawData() != logits_.Span().data()) logits_ = WrapTensor<float>(*state_.params_->p_device, *logits_of_last_token); - if (!constrained_logits_processors_.empty()) { + if (!logits_processors_.empty() && logits_processors_.at(0)) { logits_masks_ = mask_future_.get(); } @@ -243,15 +255,15 @@ DeviceSpan<float> Logits::Get() { #pragma warning(pop) void Logits::Update(DeviceSpan<int32_t> next_tokens_unk) { - if (!constrained_logits_processors_.empty()) { + if (!logits_processors_.empty() && logits_processors_.at(0)) { auto next_tokens = next_tokens_unk.Span(); for (int i = 0; i < next_tokens.size(); i++) { - constrained_logits_processors_[i]->CommitTokens(static_cast<uint32_t>(next_tokens[i])); + logits_processors_[i]->CommitTokens(static_cast<uint32_t>(next_tokens[i])); } mask_future_ = std::async(std::launch::async, [&]() { std::vector<std::vector<uint32_t>> result; for (int i = 0; i < shape_[0]; i++) { - auto processor = constrained_logits_processors_.at(i).get(); + auto processor = logits_processors_.at(i).get(); auto mask = processor->ComputeMask(); result.push_back(mask); } diff --git a/src/models/logits.h b/src/models/logits.h index 2d7ce0120..0d3fe7db8 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -45,7 +45,7 @@ struct Logits { StaticBuffer* sb_logits32_{}; StaticBuffer* sb_logits16_{}; - std::vector<std::unique_ptr<ConstrainedLogitsProcessor>> constrained_logits_processors_; + std::vector<std::unique_ptr<LogitsProcessor>> logits_processors_; std::future<std::vector<std::vector<uint32_t>>> mask_future_;
std::vector> logits_masks_; diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp index 12b17a86e..b365356c9 100644 --- a/src/models/logits_processor.cpp +++ b/src/models/logits_processor.cpp @@ -12,9 +12,10 @@ namespace Generators { -ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t eos_token, - const std::string& guidance_type, const std::string& guidance_data, - std::shared_ptr tokenizer, const std::string& tokenizer_path) +#if USE_GUIDANCE +GuidanceLogitsProcessor::GuidanceLogitsProcessor(int vocab_size, uint32_t eos_token, + const std::string& guidance_type, const std::string& guidance_data, + std::shared_ptr tokenizer, const std::string& tokenizer_path) : vocab_size_(vocab_size), tokenizer_(std::move(tokenizer)) { if (guidance_type.empty() || guidance_data.empty()) { throw std::runtime_error("Guidance type and data must be provided"); @@ -81,7 +82,7 @@ ConstrainedLogitsProcessor::ConstrainedLogitsProcessor(int vocab_size, uint32_t llg_constraint_ = std::unique_ptr(constraint_ptr); } -std::vector ConstrainedLogitsProcessor::ComputeMask() { +std::vector GuidanceLogitsProcessor::ComputeMask() { LlgMaskResult mask_result; auto error = llg_compute_mask(llg_constraint_.get(), &mask_result); if (error != 0) { @@ -97,7 +98,7 @@ std::vector ConstrainedLogitsProcessor::ComputeMask() { return mask; } -void ConstrainedLogitsProcessor::CommitTokens(uint32_t token) { +void GuidanceLogitsProcessor::CommitTokens(uint32_t token) { LlgCommitResult commit_result; auto error = llg_commit_token(llg_constraint_.get(), token, &commit_result); if (error != 0) { @@ -106,8 +107,9 @@ } } -std::vector ConstrainedLogitsProcessor::tokenize_partial(const Tokenizer* tokenizer, const size_t prefix_len, - const uint8_t* bytes, size_t bytes_len) { +std::vector GuidanceLogitsProcessor::tokenize_partial(const Tokenizer* tokenizer, const size_t prefix_len, + const uint8_t* bytes, size_t bytes_len) { + // Prepend a prefix when tokenizing partial byte sequences; it produces more stable token ids std::string input_string = kTokenizePrefixStr; input_string.reserve(bytes_len + 2); for (size_t i = 0; i < bytes_len; i++) { @@ -116,5 +118,17 @@ std::vector ConstrainedLogitsProcessor::tokenize_partial(const Tokenize std::vector output_ids = tokenizer->Encode(input_string.c_str()); return std::vector(output_ids.begin() + prefix_len, output_ids.end()); } +#endif -} // namespace Generators \ No newline at end of file +std::unique_ptr CreateLogitsProcessor(const LogitsProcessorConfig& config) { +#if USE_GUIDANCE + if (!config.guidance_type.empty() && !config.guidance_data.empty()) { + return std::make_unique(config.vocab_size, config.eos_token, config.guidance_type, config.guidance_data, config.tokenizer, config.tokenizer_path); + } + +#endif + + Log("warning", "No supported LogitsProcessor found. e.g. 
to use guidance, build with USE_GUIDANCE=1"); + return nullptr; +} +} // namespace Generators diff --git a/src/models/logits_processor.h b/src/models/logits_processor.h index 7c32488bd..ad7edffc2 100644 --- a/src/models/logits_processor.h +++ b/src/models/logits_processor.h @@ -6,17 +6,31 @@ #include #include +#if USE_GUIDANCE #include +#endif #include "model.h" namespace Generators { +struct LogitsProcessorConfig { + int vocab_size; + uint32_t eos_token; + std::string guidance_type; + std::string guidance_data; + std::shared_ptr tokenizer; + std::string tokenizer_path; +}; + struct LogitsProcessor { + LogitsProcessor() = default; virtual std::vector ComputeMask() = 0; virtual void CommitTokens(uint32_t token) = 0; + }; +#if USE_GUIDANCE struct LlgConstraintDeleter { void operator()(LlgConstraint* lc) const { llg_free_constraint(lc); @@ -29,11 +43,13 @@ struct LlgTokenizerDeleter { } }; -struct ConstrainedLogitsProcessor : public LogitsProcessor { +struct GuidanceLogitsProcessor : public LogitsProcessor { static constexpr const char* kDefaultVocabFile = "tokenizer.json"; static constexpr const char* kTokenizePrefixStr = "\x02"; - ConstrainedLogitsProcessor(int vocab_size, uint32_t eos_token, const std::string& guidance_type, const std::string& guidance_data, std::shared_ptr tokenizer, const std::string& tokenizer_path); + GuidanceLogitsProcessor(int vocab_size, uint32_t eos_token, const std::string& guidance_type, + const std::string& guidance_data, std::shared_ptr tokenizer, + const std::string& tokenizer_path); std::vector ComputeMask() override; void CommitTokens(uint32_t token) override; @@ -51,4 +67,8 @@ struct ConstrainedLogitsProcessor : public LogitsProcessor { }; TokenizeData tokenize_data_; }; +#endif + +std::unique_ptr CreateLogitsProcessor(const LogitsProcessorConfig& config); + } // namespace Generators \ No newline at end of file diff --git a/test/logits_processor_tests.cpp b/test/logits_processor_tests.cpp index 81312746d..cd6f52ff3 100644 --- a/test/logits_processor_tests.cpp +++ b/test/logits_processor_tests.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #ifndef MODEL_PATH #define MODEL_PATH "../../test/test_models/" @@ -31,17 +32,17 @@ std::string read_file(const char* filePath) { buffer << file.rdbuf(); return buffer.str(); } - +#if USE_GUIDANCE TEST(LogitsProcessorTests, TestRegex) { std::string regex = "answer: .*"; std::string text = "answer: I am a robot"; auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto tokenizer = model->CreateTokenizer(); - auto processor = std::make_unique(model->config_->model.vocab_size, - model->config_->model.eos_token_id, "regex", - regex, tokenizer, model->config_->config_path.string().c_str()); - auto target_ids = Generators::ConstrainedLogitsProcessor::tokenize_partial(tokenizer.get(), tokenizer->Encode(Generators::ConstrainedLogitsProcessor::kTokenizePrefixStr).size(), - reinterpret_cast(text.c_str()), text.size()); + auto processor = std::make_unique(model->config_->model.vocab_size, + model->config_->model.eos_token_id, "regex", + regex, tokenizer, model->config_->config_path.string().c_str()); + auto target_ids = Generators::GuidanceLogitsProcessor::tokenize_partial(tokenizer.get(), tokenizer->Encode(Generators::GuidanceLogitsProcessor::kTokenizePrefixStr).size(), + reinterpret_cast(text.c_str()), text.size()); for (auto id : target_ids) { auto mask = processor->ComputeMask(); processor->CommitTokens(id); @@ -54,13 +55,35 @@ 
TEST(LogitsProcessorTests, TestJsonSchema) { auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto tokenizer = model->CreateTokenizer(); - auto processor = std::make_unique(model->config_->model.vocab_size, - model->config_->model.eos_token_id, "json_schema", - json_schema, tokenizer, model->config_->config_path.string().c_str()); - auto target_ids = Generators::ConstrainedLogitsProcessor::tokenize_partial(tokenizer.get(), tokenizer->Encode(Generators::ConstrainedLogitsProcessor::kTokenizePrefixStr).size(), - reinterpret_cast(text.c_str()), text.size()); + auto processor = std::make_unique(model->config_->model.vocab_size, + model->config_->model.eos_token_id, "json_schema", + json_schema, tokenizer, model->config_->config_path.string().c_str()); + auto target_ids = Generators::GuidanceLogitsProcessor::tokenize_partial(tokenizer.get(), tokenizer->Encode(Generators::GuidanceLogitsProcessor::kTokenizePrefixStr).size(), + reinterpret_cast(text.c_str()), text.size()); for (auto id : target_ids) { auto mask = processor->ComputeMask(); processor->CommitTokens(id); } } + +TEST(LogitsProcessorTests, TestModel) { +#if TEST_PHI2 + auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH); + auto tokenizer = model->CreateTokenizer(); + auto test_input = "who are you?"; + std::string regex = "answer: .*"; + auto input_ids = tokenizer->Encode(test_input); + + auto params = Generators::CreateGeneratorParams(*model); + params->SetGuidance("regex", regex); + params->search.max_length = 10; + params->search.do_sample = true; + params->input_ids = input_ids; + // Verify outputs match expected outputs + // High level version + auto result = Generators::Generate(*model, *params); + auto output = tokenizer->Decode(result[0]); + EXPECT_TRUE(std::regex_match(output, std::regex("answer: .*"))); +#endif +} +#endif \ No newline at end of file From 65340f25379f6f1f18f52069402e747fdc27f5d9 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 15 Nov 2024 09:05:06 +0000 Subject: [PATCH 35/79] fix format --- src/models/logits_processor.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/models/logits_processor.h b/src/models/logits_processor.h index ad7edffc2..4322bd411 100644 --- a/src/models/logits_processor.h +++ b/src/models/logits_processor.h @@ -27,7 +27,6 @@ struct LogitsProcessor { LogitsProcessor() = default; virtual std::vector ComputeMask() = 0; virtual void CommitTokens(uint32_t token) = 0; - }; #if USE_GUIDANCE From eca06f5091f30ca82fd188939dfe82da9299a7f4 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 15 Nov 2024 17:31:31 +0800 Subject: [PATCH 36/79] fix win error --- src/models/logits_processor.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/models/logits_processor.h b/src/models/logits_processor.h index 4322bd411..7ac5925c7 100644 --- a/src/models/logits_processor.h +++ b/src/models/logits_processor.h @@ -25,6 +25,7 @@ struct LogitsProcessorConfig { struct LogitsProcessor { LogitsProcessor() = default; + virtual ~LogitsProcessor() = default; virtual std::vector ComputeMask() = 0; virtual void CommitTokens(uint32_t token) = 0; }; From b3064288523d40700222682982fd203c40a0d0bf Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Mon, 18 Nov 2024 04:32:23 +0000 Subject: [PATCH 37/79] fix segfault --- src/models/logits.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 7bea28e3e..59182a988 100644 --- a/src/models/logits.cpp +++ 
b/src/models/logits.cpp @@ -256,7 +256,7 @@ DeviceSpan Logits::Get() { void Logits::Update(DeviceSpan next_tokens_unk) { if (!logits_processors_.empty() && logits_processors_.at(0)) { - auto next_tokens = next_tokens_unk.Span(); + auto next_tokens = next_tokens_unk.CpuSpan(); for (int i = 0; i < next_tokens.size(); i++) { logits_processors_[i]->CommitTokens(static_cast(next_tokens[i])); } From df34b1e775ae53784e5f0354f880784b0b930793 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Mon, 18 Nov 2024 07:30:03 +0000 Subject: [PATCH 38/79] fix segfault and move test --- src/models/logits.cpp | 2 +- test/c_api_tests.cpp | 35 ++++++++++++++++++++++++++------- test/logits_processor_tests.cpp | 20 ------------------- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 59182a988..e4a0c65af 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -256,7 +256,7 @@ DeviceSpan Logits::Get() { void Logits::Update(DeviceSpan next_tokens_unk) { if (!logits_processors_.empty() && logits_processors_.at(0)) { - auto next_tokens = next_tokens_unk.CpuSpan(); + auto next_tokens = next_tokens_unk.CopyDeviceToCpu(); for (int i = 0; i < next_tokens.size(); i++) { logits_processors_[i]->CommitTokens(static_cast(next_tokens[i])); } diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index a93f7c645..de066215d 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -7,6 +7,7 @@ #include "../src/span.h" #include #include +#include #ifndef MODEL_PATH #define MODEL_PATH "../../test/test_models/" @@ -299,8 +300,7 @@ TEST(CAPITests, SetTerminate) { generator->ComputeLogits(); generator->GenerateNextToken(); } - } - catch (const std::exception& e) { + } catch (const std::exception& e) { EXPECT_EQ(generator->IsSessionTerminated(), true); std::cout << "Session Terminated: " << e.what() << std::endl; } @@ -427,14 +427,12 @@ TEST(CAPITests, TopKTopPCAPI) { #if TEST_PHI2 TEST(CAPITests, AdaptersTest) { - #ifdef USE_CUDA -using OutputType = Ort::Float16_t; + using OutputType = Ort::Float16_t; #else -using OutputType = float; + using OutputType = float; #endif - // The python unit tests create the adapter model. // In order to run this test, the python unit test must have been run first. 
auto model = OgaModel::Create(MODEL_PATH "adapters"); @@ -498,7 +496,7 @@ using OutputType = float; ASSERT_TRUE(std::equal(output_shape.begin(), output_shape.end(), shape.begin(), shape.end())); const auto size = static_cast(std::accumulate(shape.begin(), shape.end(), 1LL, - std::multiplies())); + std::multiplies())); ASSERT_EQ(output_size, size); std::span src(reinterpret_cast(logits->Data()), size); ASSERT_FALSE(std::equal(base_output.begin(), base_output.end(), src.begin(), src.end())); @@ -560,3 +558,26 @@ void CheckResult(OgaResult* result) { throw std::runtime_error(string); } } + +TEST(CAPITests, SetGuidance) { +#if TEST_PHI2 + + auto model = OgaModel::Create(PHI2_PATH); + auto tokenizer = OgaTokenizer::Create(*model); + auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer); + + const char* input_string = "who are you?"; + auto input_sequences = OgaSequences::Create(); + tokenizer->Encode(input_string, *input_sequences); + auto params = OgaGeneratorParams::Create(*model); + params->SetInputSequences(*input_sequences); + params->SetSearchOption("max_length", 32); + params->SetGuidance("regex", "answer: .*"); + + auto sequences = model->Generate(*params); + auto out_string = tokenizer->Decode(sequences->SequenceData(0), sequences->SequenceCount(0)); + auto output = std::string(out_string).substr(std::string(input_string).size()); + EXPECT_TRUE(std::regex_match(output, std::regex("answer: .*"))); + +#endif +} diff --git a/test/logits_processor_tests.cpp b/test/logits_processor_tests.cpp index cd6f52ff3..074272c93 100644 --- a/test/logits_processor_tests.cpp +++ b/test/logits_processor_tests.cpp @@ -66,24 +66,4 @@ TEST(LogitsProcessorTests, TestJsonSchema) { } } -TEST(LogitsProcessorTests, TestModel) { -#if TEST_PHI2 - auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH); - auto tokenizer = model->CreateTokenizer(); - auto test_input = "who are you?"; - std::string regex = "answer: .*"; - auto input_ids = tokenizer->Encode(test_input); - - auto params = Generators::CreateGeneratorParams(*model); - params->SetGuidance("regex", regex); - params->search.max_length = 10; - params->search.do_sample = true; - params->input_ids = input_ids; - // Verify outputs match expected outputs - // High level version - auto result = Generators::Generate(*model, *params); - auto output = tokenizer->Decode(result[0]); - EXPECT_TRUE(std::regex_match(output, std::regex("answer: .*"))); -#endif -} #endif \ No newline at end of file From 92251d42c0ee0557ad8e6591db049a09399c1471 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Mon, 18 Nov 2024 09:20:52 +0000 Subject: [PATCH 39/79] minor fixes --- .github/workflows/linux-gpu-x64-build.yml | 2 +- .github/workflows/win-cuda-x64-build.yml | 2 +- build.py | 2 +- cmake/external/onnxruntime_external_deps.cmake | 1 - src/models/decoder_only_pipeline.cpp | 2 +- src/models/gpt.cpp | 2 +- 6 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index c34920ad0..e53880b50 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -40,7 +40,7 @@ jobs: - uses: actions/setup-dotnet@v4 with: dotnet-version: '8.0.x' - + - name: Get the Latest OnnxRuntime Nightly Version shell: pwsh run: | diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index 8614c1009..5d574f6a4 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ 
-59,7 +59,7 @@ jobs: run: | mkdir ort/lib move ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ - move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Install Rust run: | diff --git a/build.py b/build.py index 6b7770982..256219fa0 100644 --- a/build.py +++ b/build.py @@ -130,7 +130,7 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.") - parser.add_argument("--disable_guidance", action="store_true", help="Whether to use DML. Default is to not use DML.") + parser.add_argument("--disable_guidance", action="store_true", help="Whether to add guidance support. Default is True.") # The following options are mutually exclusive (cross compiling options such as android, ios, etc.) platform_group = parser.add_mutually_exclusive_group() diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index c0033fd21..e42a76a53 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -99,7 +99,6 @@ FetchContent_Declare( GIT_REPOSITORY ${DEP_URL_corrosion} GIT_TAG ${DEP_SHA1_corrosion} ) -# Set any global configuration variables such as `Rust_TOOLCHAIN` before this line! onnxruntime_fetchcontent_makeavailable(Corrosion) FetchContent_Declare( llguidance diff --git a/src/models/decoder_only_pipeline.cpp b/src/models/decoder_only_pipeline.cpp index 35702c292..56460df06 100644 --- a/src/models/decoder_only_pipeline.cpp +++ b/src/models/decoder_only_pipeline.cpp @@ -244,7 +244,7 @@ void DecoderOnlyPipelineState::UpdateInputsOutputs(const DeviceSpan& ne input_ids_.Update(next_tokens_unk); position_inputs_.Update(current_length); if (kv_cache_) kv_cache_->Update(beam_indices, current_length); - logits_.Update(); + logits_.Update(next_tokens_unk); } OrtValue* DecoderOnlyPipelineState::GetOutput(const char* name) { diff --git a/src/models/gpt.cpp b/src/models/gpt.cpp index b10f0ec2d..75f97d9ca 100644 --- a/src/models/gpt.cpp +++ b/src/models/gpt.cpp @@ -39,7 +39,7 @@ void Gpt_State::UpdateInputsOutputs(DeviceSpan& next_tokens, DeviceSpan input_ids_.Update(next_tokens); position_inputs_.Update(current_length); kv_cache_.Update(beam_indices, current_length); - logits_.Update(); + logits_.Update(next_tokens); } } // namespace Generators From 56ab9ee597c95662d005d61163f4293031d9516c Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 20 Nov 2024 03:17:13 +0000 Subject: [PATCH 40/79] fix bug when is_stop --- src/models/logits_processor.cpp | 14 ++++++++++---- src/models/logits_processor.h | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp index b365356c9..9d669c618 100644 --- a/src/models/logits_processor.cpp +++ b/src/models/logits_processor.cpp @@ -16,7 +16,7 @@ namespace Generators { GuidanceLogitsProcessor::GuidanceLogitsProcessor(int vocab_size, uint32_t eos_token, const std::string& guidance_type, const std::string& guidance_data, std::shared_ptr tokenizer, const std::string& tokenizer_path) - : vocab_size_(vocab_size), tokenizer_(std::move(tokenizer)) { + : vocab_size_(vocab_size), tokenizer_(std::move(tokenizer)), eos_token_(eos_token) { if (guidance_type.empty() || guidance_data.empty()) { throw std::runtime_error("Guidance type and data must be provided"); } @@ -91,9 +91,15 @@ 
std::vector GuidanceLogitsProcessor::ComputeMask() { } std::vector mask; - mask.reserve((vocab_size_ - 1) / 32 + 1); - for (int i = 0; i < (vocab_size_ - 1) / 32 + 1; i++) { - mask.push_back(mask_result.sample_mask[i]); + if (mask_result.is_stop) { + mask = std::vector((vocab_size_ - 1) / 32 + 1, 0); + uint32_t eos_mask32 = 1 << (eos_token_ % 32); + mask[eos_token_ / 32] = eos_mask32; + } else { + mask.reserve((vocab_size_ - 1) / 32 + 1); + for (int i = 0; i < (vocab_size_ - 1) / 32 + 1; i++) { + mask.push_back(mask_result.sample_mask[i]); + } } return mask; } diff --git a/src/models/logits_processor.h b/src/models/logits_processor.h index 7ac5925c7..05fd9ea01 100644 --- a/src/models/logits_processor.h +++ b/src/models/logits_processor.h @@ -54,6 +54,7 @@ struct GuidanceLogitsProcessor : public LogitsProcessor { void CommitTokens(uint32_t token) override; size_t vocab_size_; + uint32_t eos_token_; std::unique_ptr llg_constraint_; std::unique_ptr llg_tokenizer_; std::shared_ptr tokenizer_; From 03a6bb7acdc9b7facdeab62158eca1f8caa9a921 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 20 Nov 2024 03:56:02 +0000 Subject: [PATCH 41/79] fixes for reviews --- .github/workflows/linux-cpu-arm64-build.yml | 3 --- .github/workflows/linux-gpu-x64-build.yml | 4 --- CMakeLists.txt | 18 ++++++++----- build.py | 17 ++++++++++-- .../external/onnxruntime_external_deps.cmake | 26 ++++++++++--------- src/models/logits_processor.cpp | 6 ++++- src/models/logits_processor.h | 2 ++ test/CMakeLists.txt | 2 +- 8 files changed, 48 insertions(+), 30 deletions(-) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index 2abdbeb81..6073d8e70 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -75,8 +75,6 @@ jobs: docker run --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c " \ - export CARGO_HOME=/onnxruntime_src/.cargo RUSTUP_HOME=/onnxruntime_src/.rustup && \ - curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y && \ /usr/bin/cmake --preset linux_gcc_cpu_release \ -DRust_COMPILER=/onnxruntime_src/.cargo/bin/rustc " @@ -85,7 +83,6 @@ jobs: docker run --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c " \ - export CARGO_HOME=/onnxruntime_src/.cargo RUSTUP_HOME=/onnxruntime_src/.rustup && \ /usr/bin/cmake --build --preset linux_gcc_cpu_release" - name: Docker -- Check test directory diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index e53880b50..843485fa9 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -94,10 +94,7 @@ jobs: --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - export CARGO_HOME=/ort_genai_src/.cargo RUSTUP_HOME=/ort_genai_src/.rustup && \ - curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y && \ /usr/bin/cmake --preset linux_gcc_cuda_release \ - -DRust_COMPILER=/ort_genai_src/.cargo/bin/rustc \ -DMANYLINUX=ON \ -DPYTHON_EXECUTABLE=${{ env.PYTHON_EXECUTABLE }} " @@ -110,7 +107,6 @@ jobs: --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - export CARGO_HOME=/ort_genai_src/.cargo RUSTUP_HOME=/ort_genai_src/.rustup && \ /usr/bin/cmake --build --preset linux_gcc_cuda_release" - name: Use Dummy HuggingFace Token diff --git a/CMakeLists.txt 
b/CMakeLists.txt index a7bb0ce70..52e3b7074 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,13 +110,17 @@ target_include_directories(onnxruntime-genai-static PRIVATE ${onnxruntime_extens target_include_directories(onnxruntime-genai-static PUBLIC ${onnxruntime_extensions_SOURCE_DIR}/shared/api/) target_link_libraries(onnxruntime-genai PRIVATE onnxruntime_extensions) target_link_libraries(onnxruntime-genai-static PUBLIC onnxruntime_extensions) -target_include_directories(onnxruntime-genai PUBLIC ${llguidance_SOURCE_DIR}/parser/) -target_include_directories(onnxruntime-genai-static PUBLIC ${llguidance_SOURCE_DIR}/parser/) -target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser) -target_link_libraries(onnxruntime-genai-static PUBLIC llguidance_parser) -if (WIN32) - target_link_libraries(onnxruntime-genai PRIVATE bcrypt) - target_link_libraries(onnxruntime-genai-static PRIVATE bcrypt) + +if(USE_GUIDANCE) + target_include_directories(onnxruntime-genai PUBLIC ${llguidance_SOURCE_DIR}/parser/) + target_include_directories(onnxruntime-genai-static PUBLIC ${llguidance_SOURCE_DIR}/parser/) + target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser) + target_link_libraries(onnxruntime-genai-static PUBLIC llguidance_parser) + if (WIN32) + # bcrypt is needed for the rust std lib + target_link_libraries(onnxruntime-genai PRIVATE bcrypt) + target_link_libraries(onnxruntime-genai-static PRIVATE bcrypt) + endif() endif() target_link_directories(onnxruntime-genai PRIVATE ${ORT_LIB_DIR}) diff --git a/build.py b/build.py index 256219fa0..171a10c28 100644 --- a/build.py +++ b/build.py @@ -26,6 +26,19 @@ def _path_from_env_var(env_var: str): env_var_value = os.environ.get(env_var) return Path(env_var_value) if env_var_value is not None else None +def strtobool (val): + """Convert a string representation of truth to true (1) or false (0). + True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values + are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if + 'val' is anything else. + """ + val = str(val).lower() + if val in ('y', 'yes', 't', 'true', 'on', '1'): + return True + elif val in ('n', 'no', 'f', 'false', 'off', '0'): + return False + else: + raise ValueError("invalid truth value %r" % (val,)) def _parse_args(): class Parser(argparse.ArgumentParser): @@ -130,7 +143,7 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.") - parser.add_argument("--disable_guidance", action="store_true", help="Whether to add guidance support. Default is True.") + parser.add_argument("--use_guidance", default=True, type=strtobool, help="Whether to add guidance support. Default is True.") # The following options are mutually exclusive (cross compiling options such as android, ios, etc.) 
platform_group = parser.add_mutually_exclusive_group() @@ -479,7 +492,7 @@ def update(args: argparse.Namespace, env: dict[str, str]): f"-DUSE_DML={'ON' if args.use_dml else 'OFF'}", f"-DENABLE_JAVA={'ON' if args.build_java else 'OFF'}", f"-DBUILD_WHEEL={build_wheel}", - f"-DUSE_GUIDANCE={'ON' if not args.disable_guidance else 'OFF'}", + f"-DUSE_GUIDANCE={'ON' if args.use_guidance else 'OFF'}", ] if args.ort_home: diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index e42a76a53..aeb561c1e 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -94,16 +94,18 @@ list(APPEND EXTERNAL_LIBRARIES noexcep_operators ) -FetchContent_Declare( - Corrosion - GIT_REPOSITORY ${DEP_URL_corrosion} - GIT_TAG ${DEP_SHA1_corrosion} +if(USE_GUIDANCE) + FetchContent_Declare( + Corrosion + GIT_REPOSITORY ${DEP_URL_corrosion} + GIT_TAG ${DEP_SHA1_corrosion} + ) + onnxruntime_fetchcontent_makeavailable(Corrosion) + FetchContent_Declare( + llguidance + GIT_REPOSITORY ${DEP_URL_llguidance} + GIT_TAG ${DEP_SHA1_llguidance} ) -onnxruntime_fetchcontent_makeavailable(Corrosion) -FetchContent_Declare( - llguidance - GIT_REPOSITORY ${DEP_URL_llguidance} - GIT_TAG ${DEP_SHA1_llguidance} -) -onnxruntime_fetchcontent_makeavailable(llguidance) -corrosion_import_crate(MANIFEST_PATH ${llguidance_SOURCE_DIR}/parser/Cargo.toml) + onnxruntime_fetchcontent_makeavailable(llguidance) + corrosion_import_crate(MANIFEST_PATH ${llguidance_SOURCE_DIR}/parser/Cargo.toml) +endif() \ No newline at end of file diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp index 9d669c618..618f46e10 100644 --- a/src/models/logits_processor.cpp +++ b/src/models/logits_processor.cpp @@ -1,3 +1,5 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. #include #include #include @@ -6,7 +8,9 @@ #include #include +#if USE_GUIDANCE #include "llguidance.h" +#endif #include "logits_processor.h" @@ -134,7 +138,7 @@ std::unique_ptr CreateLogitsProcessor(const LogitsProcessorConf #endif - Log("warning", "No supported LogitsProcessor found. e.g. to use guidance, build with USE_GUIDANCE=1"); + Log("warning", "No supported LogitsProcessor found. e.g. to use guidance, build with use_guidance=true"); return nullptr; } } // namespace Generators diff --git a/src/models/logits_processor.h b/src/models/logits_processor.h index 05fd9ea01..5d4e7924b 100644 --- a/src/models/logits_processor.h +++ b/src/models/logits_processor.h @@ -1,3 +1,5 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
#pragma once #include diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c9834338b..40ef5e15e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -22,7 +22,7 @@ target_include_directories(unit_tests PRIVATE ) target_link_directories(unit_tests PRIVATE ${ORT_LIB_DIR}) -target_link_libraries(unit_tests PRIVATE llguidance_parser) + target_link_libraries(unit_tests PRIVATE onnxruntime-genai-static GTest::gtest_main From 29fc868c018e00321b5304c24c302e7882dff222 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 20 Nov 2024 04:10:09 +0000 Subject: [PATCH 42/79] fix --- .../github/linux/docker/manylinux/scripts/install_deps.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh index 2a4fd31f2..8d7b5b10d 100755 --- a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh @@ -80,5 +80,8 @@ cmake --build build-cmake mv ./build-cmake/ninja /usr/bin popd +# Install Rust +curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y\ + cd / rm -rf /tmp/src From 3b046c3fc827bd6eebaa7b75c87e7948ddc7ee5d Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 20 Nov 2024 04:52:27 +0000 Subject: [PATCH 43/79] fix win error --- src/models/logits_processor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp index 618f46e10..fd9bc7c26 100644 --- a/src/models/logits_processor.cpp +++ b/src/models/logits_processor.cpp @@ -20,7 +20,7 @@ namespace Generators { GuidanceLogitsProcessor::GuidanceLogitsProcessor(int vocab_size, uint32_t eos_token, const std::string& guidance_type, const std::string& guidance_data, std::shared_ptr tokenizer, const std::string& tokenizer_path) - : vocab_size_(vocab_size), tokenizer_(std::move(tokenizer)), eos_token_(eos_token) { + : vocab_size_(vocab_size), eos_token_(eos_token), tokenizer_(std::move(tokenizer)) { if (guidance_type.empty() || guidance_data.empty()) { throw std::runtime_error("Guidance type and data must be provided"); } From e9c818ee215f3ed394b4fb4d8e0a1144d9880da5 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 20 Nov 2024 14:40:36 +0800 Subject: [PATCH 44/79] add rust env to dockerfile --- .../linux/docker/inference/aarch64/default/cpu/Dockerfile | 3 +++ .../linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 | 3 +++ 2 files changed, 6 insertions(+) diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index efb417e51..7d4e15dec 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -9,5 +9,8 @@ RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_de ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER +ENV PATH=$HOME/.cargo/bin:$PATH +ENV RUSTUP_HOME=$HOME/.rustup +ENV CARGO_HOME=$HOME/.cargo WORKDIR /home/$BUILD_USER USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 index c865579bd..7e05f5cb8 100644 --- a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 +++ 
b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 @@ -16,5 +16,8 @@ RUN adduser --uid $BUILD_UID $BUILD_USER WORKDIR /home/$BUILD_USER USER $BUILD_USER ENV PATH /usr/local/dotnet:$PATH +ENV PATH=$HOME/.cargo/bin:$PATH +ENV RUSTUP_HOME=$HOME/.rustup +ENV CARGO_HOME=$HOME/.cargo ENV CUDAHOSTCXX /opt/rh/gcc-toolset-12/root/usr/bin/g++ ENV CUDA_MODULE_LOADING "LAZY" \ No newline at end of file From e06fb0acb8189acd9ebdc38c759464d23fce0cf2 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 20 Nov 2024 14:58:06 +0800 Subject: [PATCH 45/79] fix dockerfile env --- .github/workflows/linux-cpu-arm64-build.yml | 3 +-- .../linux/docker/inference/aarch64/default/cpu/Dockerfile | 6 +++--- .../docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index 6073d8e70..8e2e091bc 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -75,8 +75,7 @@ jobs: docker run --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c " \ - /usr/bin/cmake --preset linux_gcc_cpu_release \ - -DRust_COMPILER=/onnxruntime_src/.cargo/bin/rustc " + /usr/bin/cmake --preset linux_gcc_cpu_release " - name: Docker -- Build with CMake and GCC run: | diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index 7d4e15dec..a7e3a3018 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -6,11 +6,11 @@ FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aar ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER ENV PATH=$HOME/.cargo/bin:$PATH ENV RUSTUP_HOME=$HOME/.rustup ENV CARGO_HOME=$HOME/.cargo +ARG BUILD_UID=1001 +ARG BUILD_USER=onnxruntimedev +RUN adduser --uid $BUILD_UID $BUILD_USER WORKDIR /home/$BUILD_USER USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 index 7e05f5cb8..b20f761bd 100644 --- a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 +++ b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 @@ -10,14 +10,14 @@ else \ echo "Using default gcc because CUDA version is less than 12"; \ cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts; \ fi +ENV PATH=$HOME/.cargo/bin:$PATH +ENV RUSTUP_HOME=$HOME/.rustup +ENV CARGO_HOME=$HOME/.cargo ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER WORKDIR /home/$BUILD_USER USER $BUILD_USER ENV PATH /usr/local/dotnet:$PATH -ENV PATH=$HOME/.cargo/bin:$PATH -ENV RUSTUP_HOME=$HOME/.rustup -ENV CARGO_HOME=$HOME/.cargo ENV CUDAHOSTCXX /opt/rh/gcc-toolset-12/root/usr/bin/g++ ENV CUDA_MODULE_LOADING "LAZY" \ No newline at end of file From ef141f2457d3b9c61539b3dc0160880067fdf656 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 20 Nov 2024 15:18:16 +0800 Subject: [PATCH 46/79] update workflows 
--- .../linux/docker/inference/aarch64/default/cpu/Dockerfile | 6 +++--- .../inference/aarch64/default/cpu/scripts/install_deps.sh | 3 +++ .../docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 | 6 +++--- .../github/linux/docker/manylinux/scripts/install_deps.sh | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index a7e3a3018..6f75f2464 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -6,9 +6,9 @@ FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aar ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts -ENV PATH=$HOME/.cargo/bin:$PATH -ENV RUSTUP_HOME=$HOME/.rustup -ENV CARGO_HOME=$HOME/.cargo +ENV PATH="$HOME/.cargo/bin:$PATH" +ENV RUSTUP_HOME="$HOME/.rustup" +ENV CARGO_HOME="$HOME/.cargo" ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh index efbe3ef40..e1cedfe12 100755 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh @@ -65,5 +65,8 @@ fi GetFile https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz tar --strip 1 -xf /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz -C /usr +# Install Rust +curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y + cd / rm -rf /tmp/src diff --git a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 index b20f761bd..b261efe10 100644 --- a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 +++ b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 @@ -10,9 +10,9 @@ else \ echo "Using default gcc because CUDA version is less than 12"; \ cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts; \ fi -ENV PATH=$HOME/.cargo/bin:$PATH -ENV RUSTUP_HOME=$HOME/.rustup -ENV CARGO_HOME=$HOME/.cargo +ENV PATH="$HOME/.cargo/bin:$PATH" +ENV RUSTUP_HOME="$HOME/.rustup" +ENV CARGO_HOME="$HOME/.cargo" ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh index 8d7b5b10d..e5856554c 100755 --- a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh @@ -81,7 +81,7 @@ mv ./build-cmake/ninja /usr/bin popd # Install Rust -curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y\ +curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y cd / rm -rf /tmp/src From 13056b6bd7d69dc1eeb08ca0536a6ccedeeca2ee Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 20 Nov 2024 15:34:13 +0800 Subject: [PATCH 47/79] Update 
Rust environment in Dockerfiles --- .../linux/docker/inference/aarch64/default/cpu/Dockerfile | 6 +++--- .../inference/aarch64/default/cpu/scripts/install_deps.sh | 2 ++ .../docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 | 6 +++--- .../github/linux/docker/manylinux/scripts/install_deps.sh | 2 ++ 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index 6f75f2464..a1405b097 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -6,9 +6,9 @@ FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aar ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts -ENV PATH="$HOME/.cargo/bin:$PATH" -ENV RUSTUP_HOME="$HOME/.rustup" -ENV CARGO_HOME="$HOME/.cargo" +ENV PATH="/usr/.cargo/bin:$PATH" +ENV RUSTUP_HOME="/usr/.rustup" +ENV CARGO_HOME="/usr/.cargo" ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh index e1cedfe12..dc3085d8b 100755 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh @@ -66,6 +66,8 @@ GetFile https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-${NODEJS_ARCH}.tar. tar --strip 1 -xf /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz -C /usr # Install Rust +export RUSTUP_HOME=/usr/.rustup +export CARGO_HOME=/usr/.cargo curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y cd / diff --git a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 index b261efe10..add1e4ef2 100644 --- a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 +++ b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_12.2 @@ -10,9 +10,9 @@ else \ echo "Using default gcc because CUDA version is less than 12"; \ cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts; \ fi -ENV PATH="$HOME/.cargo/bin:$PATH" -ENV RUSTUP_HOME="$HOME/.rustup" -ENV CARGO_HOME="$HOME/.cargo" +ENV PATH="/usr/.cargo/bin:$PATH" +ENV RUSTUP_HOME="/usr/.rustup" +ENV CARGO_HOME="/usr/.cargo" ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh index e5856554c..4a186544a 100755 --- a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh @@ -81,6 +81,8 @@ mv ./build-cmake/ninja /usr/bin popd # Install Rust +export RUSTUP_HOME=/usr/.rustup +export CARGO_HOME=/usr/.cargo curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y cd / From 22c7c37cde2050f41b4406ae0f792a04e1f21cad Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 20 Nov 2024 15:57:40 +0800 Subject: [PATCH 48/79] Update 
Rust environment permissions in Dockerfiles --- .../inference/aarch64/default/cpu/scripts/install_deps.sh | 2 ++ .../github/linux/docker/manylinux/scripts/install_deps.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh index dc3085d8b..0bad07204 100755 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh @@ -68,6 +68,8 @@ tar --strip 1 -xf /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz -C /usr # Install Rust export RUSTUP_HOME=/usr/.rustup export CARGO_HOME=/usr/.cargo +chmod -R 777 /usr/.rustup +chmod -R 777 /usr/.cargo curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y cd / diff --git a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh index 4a186544a..d21e45f73 100755 --- a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh @@ -83,6 +83,8 @@ popd # Install Rust export RUSTUP_HOME=/usr/.rustup export CARGO_HOME=/usr/.cargo +chmod -R 777 /usr/.rustup +chmod -R 777 /usr/.cargo curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y cd / From a1186a5e34b10791dc608d7ab5a0caa2b25871cc Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 20 Nov 2024 16:04:11 +0800 Subject: [PATCH 49/79] Update Rust installation in Dockerfiles --- .../inference/aarch64/default/cpu/scripts/install_deps.sh | 2 +- .../github/linux/docker/manylinux/scripts/install_deps.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh index 0bad07204..31056c4e0 100755 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh @@ -68,9 +68,9 @@ tar --strip 1 -xf /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz -C /usr # Install Rust export RUSTUP_HOME=/usr/.rustup export CARGO_HOME=/usr/.cargo +curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y chmod -R 777 /usr/.rustup chmod -R 777 /usr/.cargo -curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y cd / rm -rf /tmp/src diff --git a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh index d21e45f73..03b5ae720 100755 --- a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh @@ -83,9 +83,9 @@ popd # Install Rust export RUSTUP_HOME=/usr/.rustup export CARGO_HOME=/usr/.cargo +curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y chmod -R 777 /usr/.rustup chmod -R 777 /usr/.cargo -curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y cd / rm -rf /tmp/src From 2c9b02c0ae69507d78f43669051919b319ae8d5d Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 20 Nov 2024 10:12:13 +0000 Subject: [PATCH 50/79] revert linux arm workflow --- .github/workflows/linux-cpu-arm64-build.yml 
| 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index 8e2e091bc..362331591 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -74,15 +74,13 @@ jobs: run: | docker run --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c " \ - /usr/bin/cmake --preset linux_gcc_cpu_release " + -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "/usr/bin/cmake --preset linux_gcc_cpu_release" - name: Docker -- Build with CMake and GCC run: | docker run --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c " \ - /usr/bin/cmake --build --preset linux_gcc_cpu_release" + -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "/usr/bin/cmake --build --preset linux_gcc_cpu_release" - name: Docker -- Check test directory run: | From 899edf9b73a4b1259cdb889d0de4297aca507feb Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 22 Nov 2024 03:45:06 +0000 Subject: [PATCH 51/79] Update Rust installation with specific version --- .github/workflows/android-build.yml | 4 ++-- .github/workflows/linux-cpu-x64-build.yml | 4 ++-- .github/workflows/linux-cpu-x64-nightly-build.yml | 4 ++-- .github/workflows/mac-cpu-arm64-build.yml | 4 ++-- .github/workflows/win-cpu-arm64-build.yml | 4 ++-- .github/workflows/win-cpu-x64-build.yml | 4 ++-- .github/workflows/win-cuda-x64-build.yml | 4 ++-- .github/workflows/win-directml-x64-build.yml | 4 ++-- .../inference/aarch64/default/cpu/scripts/install_deps.sh | 2 +- .../github/linux/docker/manylinux/scripts/install_deps.sh | 2 +- 10 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/android-build.yml b/.github/workflows/android-build.yml index 174bff28c..be545705d 100644 --- a/.github/workflows/android-build.yml +++ b/.github/workflows/android-build.yml @@ -82,8 +82,8 @@ jobs: unzip microsoft.ml.onnxruntime/${{ env.ORT_NIGHTLY_VERSION }}/runtimes/android/native/onnxruntime.aar -d ort ls -lR ort - - name: Install Rust - uses: dtolnay/rust-toolchain@stable + - name: Install Rust Toolchain + uses: dtolnay/rust-toolchain@1.82.0 - name: Install Rust Toolchain run: | diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index fd6f109bd..b07d16ed1 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -27,8 +27,8 @@ jobs: with: dotnet-version: '8.0.x' - - name: Install Rust - uses: dtolnay/rust-toolchain@stable + - name: Install Rust Toolchain + uses: dtolnay/rust-toolchain@1.82.0 - name: Get the Latest OnnxRuntime Nightly Version shell: pwsh diff --git a/.github/workflows/linux-cpu-x64-nightly-build.yml b/.github/workflows/linux-cpu-x64-nightly-build.yml index d58774f6d..1b8bbb151 100644 --- a/.github/workflows/linux-cpu-x64-nightly-build.yml +++ b/.github/workflows/linux-cpu-x64-nightly-build.yml @@ -22,8 +22,8 @@ jobs: - name: Checkout OnnxRuntime GenAI repo uses: actions/checkout@v2 - - name: Install Rust - uses: dtolnay/rust-toolchain@stable + - name: Install Rust Toolchain + uses: dtolnay/rust-toolchain@1.82.0 - name: Download OnnxRuntime run: | diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index c25648bed..3f153107d 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -40,8 +40,8 @@ 
jobs: mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/osx-arm64/native/* ort/lib/ - - name: Install Rust - uses: dtolnay/rust-toolchain@stable + - name: Install Rust Toolchain + uses: dtolnay/rust-toolchain@1.82.0 - name: Configure CMake run: | diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index 16cc1e9cf..8df626819 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -52,11 +52,11 @@ jobs: move ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-arm64/native/* ort/lib/ - - name: Install Rust + - name: Install Rust Toolchain run: | $exePath = "$env:TEMP\rustup-init.exe" (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) - & $exePath -y + & $exePath -y --default-toolchain=1.82.0 Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" - name: Configure CMake diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index 6fe5d2f79..ba2fd11c0 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -41,11 +41,11 @@ jobs: with: dotnet-version: '8.0.x' - - name: Install Rust + - name: Install Rust Toolchain run: | $exePath = "$env:TEMP\rustup-init.exe" (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) - & $exePath -y + & $exePath -y --default-toolchain=1.82.0 Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" - name: Download OnnxRuntime Nightly diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index 5d574f6a4..dc9294f8e 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -61,11 +61,11 @@ jobs: move ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - - name: Install Rust + - name: Install Rust Toolchain run: | $exePath = "$env:TEMP\rustup-init.exe" (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) - & $exePath -y + & $exePath -y --default-toolchain=1.82.0 Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" - name: Configure CMake diff --git a/.github/workflows/win-directml-x64-build.yml b/.github/workflows/win-directml-x64-build.yml index 2bd019c2a..04ac024b5 100644 --- a/.github/workflows/win-directml-x64-build.yml +++ b/.github/workflows/win-directml-x64-build.yml @@ -78,11 +78,11 @@ jobs: mv $env:d3d12_dir\build\native\bin\x64\D3D12Core.dll ort\lib mv $env:dml_dir\include\DirectML.h ort\include - - name: Install Rust + - name: Install Rust Toolchain run: | $exePath = "$env:TEMP\rustup-init.exe" (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) - & $exePath -y + & $exePath -y --default-toolchain=1.82.0 Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" - name: Configure CMake diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh index 31056c4e0..9718081d7 100755 --- 
a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh @@ -68,7 +68,7 @@ tar --strip 1 -xf /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz -C /usr # Install Rust export RUSTUP_HOME=/usr/.rustup export CARGO_HOME=/usr/.cargo -curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y +curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=1.82.0 chmod -R 777 /usr/.rustup chmod -R 777 /usr/.cargo diff --git a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh index 03b5ae720..fa488debe 100755 --- a/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/manylinux/scripts/install_deps.sh @@ -83,7 +83,7 @@ popd # Install Rust export RUSTUP_HOME=/usr/.rustup export CARGO_HOME=/usr/.cargo -curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y +curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=1.82.0 chmod -R 777 /usr/.rustup chmod -R 777 /usr/.cargo cd / rm -rf /tmp/src From 2d47c204437968b3c521f48030e5fe4cb5091f32 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 22 Nov 2024 04:01:19 +0000 Subject: [PATCH 52/79] fix android error --- .github/workflows/android-build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/android-build.yml b/.github/workflows/android-build.yml index be545705d..c9787abc0 100644 --- a/.github/workflows/android-build.yml +++ b/.github/workflows/android-build.yml @@ -85,9 +85,9 @@ jobs: - name: Install Rust Toolchain uses: dtolnay/rust-toolchain@1.82.0 - - name: Install Rust Toolchain + - name: Install Rust Android Toolchain run: | - rustup target add --toolchain stable-x86_64-unknown-linux-gnu x86_64-linux-android + rustup target add --toolchain 1.82.0-x86_64-unknown-linux-gnu x86_64-linux-android - name: Create Android build run: | From 9a153850efeb9ab03e7b8833d4094706669f719a Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Mon, 25 Nov 2024 15:36:25 +0800 Subject: [PATCH 53/79] fix for review --- src/models/logits.cpp | 4 +++- src/models/logits_processor.cpp | 20 ++++++++++---------- src/models/logits_processor.h | 4 ++-- src/models/model.h | 2 -- test/logits_processor_tests.cpp | 4 ++-- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/models/logits.cpp b/src/models/logits.cpp index e4a0c65af..aca9282b1 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -258,7 +258,7 @@ void Logits::Update(DeviceSpan next_tokens_unk) { if (!logits_processors_.empty() && logits_processors_.at(0)) { auto next_tokens = next_tokens_unk.CopyDeviceToCpu(); for (int i = 0; i < next_tokens.size(); i++) { - logits_processors_[i]->CommitTokens(static_cast(next_tokens[i])); + logits_processors_[i]->CommitToken(static_cast(next_tokens[i])); } mask_future_ = std::async(std::launch::async, [&]() { std::vector> result; @@ -309,6 +309,8 @@ void Logits::AddMask(std::span logits, std::vector> auto logits_span = logits.subspan(vocab_index, vocab_size); auto& mask = masks[index]; for (size_t i = 0; i < vocab_size; i++) { + // mask is a vector of 32-bit words, where each bit corresponds to a token in the vocabulary. + // If the bit is clear, the corresponding token is masked (i.e., its logit is set to the lowest possible value). logits_span[i] = mask[i / 32] & (1 << (i % 32)) ? 
logits_span[i] : std::numeric_limits::lowest(); } vocab_index += vocab_size; diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp index fd9bc7c26..724f769d1 100644 --- a/src/models/logits_processor.cpp +++ b/src/models/logits_processor.cpp @@ -50,15 +50,15 @@ GuidanceLogitsProcessor::GuidanceLogitsProcessor(int vocab_size, uint32_t eos_to auto prefix_len = tokenizer_->Encode(kTokenizePrefixStr).size(); tokenize_data_ = {tokenizer_.get(), prefix_len}; LlgTokenizerInit tokenizer_init = { - static_cast(vocab_size_), - eos_token, - nullptr, - nullptr, - json_data.c_str(), - false, - tokenize_fn, - false, - &tokenize_data_, + static_cast(vocab_size_), // vocab_size + eos_token, // eos_token + nullptr, // token_lens + nullptr, // token_bytes + json_data.c_str(), // tokenizer_json config data + false, // tokenize_assumes_string + tokenize_fn, // tokenize_fn + false, // use_approximate_greedy_tokenize_fn + &tokenize_data_, // user_data }; char error_buf[128]; @@ -108,7 +108,7 @@ std::vector GuidanceLogitsProcessor::ComputeMask() { return mask; } -void GuidanceLogitsProcessor::CommitTokens(uint32_t token) { +void GuidanceLogitsProcessor::CommitToken(uint32_t token) { LlgCommitResult commit_result; auto error = llg_commit_token(llg_constraint_.get(), token, &commit_result); if (error != 0) { diff --git a/src/models/logits_processor.h b/src/models/logits_processor.h index 5d4e7924b..158c2849f 100644 --- a/src/models/logits_processor.h +++ b/src/models/logits_processor.h @@ -29,7 +29,7 @@ struct LogitsProcessor { LogitsProcessor() = default; virtual ~LogitsProcessor() = default; virtual std::vector ComputeMask() = 0; - virtual void CommitTokens(uint32_t token) = 0; + virtual void CommitToken(uint32_t token) = 0; }; #if USE_GUIDANCE @@ -53,7 +53,7 @@ struct GuidanceLogitsProcessor : public LogitsProcessor { const std::string& guidance_data, std::shared_ptr tokenizer, const std::string& tokenizer_path); std::vector ComputeMask() override; - void CommitTokens(uint32_t token) override; + void CommitToken(uint32_t token) override; size_t vocab_size_; uint32_t eos_token_; diff --git a/src/models/model.h b/src/models/model.h index 5c5a3d679..03d973afa 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -51,8 +51,6 @@ struct State { std::vector input_names_, output_names_; std::vector adapter_names_; - std::string guided_data_; - std::string guided_type_; std::vector inputs_, outputs_; protected: diff --git a/test/logits_processor_tests.cpp b/test/logits_processor_tests.cpp index 074272c93..6847d6594 100644 --- a/test/logits_processor_tests.cpp +++ b/test/logits_processor_tests.cpp @@ -45,7 +45,7 @@ TEST(LogitsProcessorTests, TestRegex) { reinterpret_cast(text.c_str()), text.size()); for (auto id : target_ids) { auto mask = processor->ComputeMask(); - processor->CommitTokens(id); + processor->CommitToken(id); } } @@ -62,7 +62,7 @@ TEST(LogitsProcessorTests, TestJsonSchema) { reinterpret_cast(text.c_str()), text.size()); for (auto id : target_ids) { auto mask = processor->ComputeMask(); - processor->CommitTokens(id); + processor->CommitToken(id); } } From ff94fe1909c9b788851b8bfcdf8a26728f9deaec Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 6 Dec 2024 15:36:57 +0800 Subject: [PATCH 54/79] fix SetGuidance unit test --- test/c_api_tests.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index 654d4cf88..2b123dc16 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -680,12 +680,15 @@ 
TEST(CAPITests, SetGuidance) { auto input_sequences = OgaSequences::Create(); tokenizer->Encode(input_string, *input_sequences); auto params = OgaGeneratorParams::Create(*model); - params->SetInputSequences(*input_sequences); params->SetSearchOption("max_length", 32); params->SetGuidance("regex", "answer: .*"); - auto sequences = model->Generate(*params); - auto out_string = tokenizer->Decode(sequences->SequenceData(0), sequences->SequenceCount(0)); + auto generator = OgaGenerator::Create(*model, *params); + generator->AppendTokenSequences(*input_sequences); + while (!generator->IsDone()) { + generator->GenerateNextToken(); + } + auto out_string = tokenizer->Decode(generator->GetSequenceData(0), generator->GetSequenceCount(0)); auto output = std::string(out_string).substr(std::string(input_string).size()); EXPECT_TRUE(std::regex_match(output, std::regex("answer: .*"))); From 4ca4075812dcd657590846711c77a8dc8327bffd Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 6 Dec 2024 15:41:29 +0800 Subject: [PATCH 55/79] fix format --- src/models/logits_processor.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp index 724f769d1..83c362685 100644 --- a/src/models/logits_processor.cpp +++ b/src/models/logits_processor.cpp @@ -50,15 +50,15 @@ GuidanceLogitsProcessor::GuidanceLogitsProcessor(int vocab_size, uint32_t eos_to auto prefix_len = tokenizer_->Encode(kTokenizePrefixStr).size(); tokenize_data_ = {tokenizer_.get(), prefix_len}; LlgTokenizerInit tokenizer_init = { - static_cast(vocab_size_), // vocab_size - eos_token, // eos_token - nullptr, // token_lens - nullptr, // token_bytes - json_data.c_str(), // tokenizer_json config data - false, // tokenize_assumes_string - tokenize_fn, // tokenize_fn - false, // use_approximate_greedy_tokenize_fn - &tokenize_data_, // user_data + static_cast(vocab_size_), // vocab_size + eos_token, // eos_token + nullptr, // token_lens + nullptr, // token_bytes + json_data.c_str(), // tokenizer_json config data + false, // tokenize_assumes_string + tokenize_fn, // tokenize_fn + false, // use_approximate_greedy_tokenize_fn + &tokenize_data_, // user_data }; char error_buf[128]; From 9849f65970af32a6b247b4a2d9afa40756ba05d7 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 11 Dec 2024 09:13:17 +0000 Subject: [PATCH 56/79] fix to new continuous decoding api --- CMakeLists.txt | 26 +-- src/generators.cpp | 12 +- src/generators.h | 2 + src/logits_processor.cpp | 240 +++++++++++++++++++++++++++ src/{models => }/logits_processor.h | 53 +++--- src/models/decoder_only.cpp | 2 +- src/models/logits.cpp | 246 ++++++++++++++-------------- src/models/logits.h | 19 +-- src/models/logits_processor.cpp | 144 ---------------- src/ort_genai_c.cpp | 1 + src/python/python.cpp | 1 + test/logits_processor_tests.cpp | 26 +-- test/model_tests.cpp | 1 + test/sampling_benchmark.cpp | 1 + test/sampling_tests.cpp | 1 + 15 files changed, 452 insertions(+), 323 deletions(-) create mode 100644 src/logits_processor.cpp rename src/{models => }/logits_processor.h (54%) delete mode 100644 src/models/logits_processor.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 52e3b7074..6cae8af2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,17 +111,6 @@ target_include_directories(onnxruntime-genai-static PUBLIC ${onnxruntime_extensi target_link_libraries(onnxruntime-genai PRIVATE onnxruntime_extensions) target_link_libraries(onnxruntime-genai-static PUBLIC onnxruntime_extensions) 
-if(USE_GUIDANCE) - target_include_directories(onnxruntime-genai PUBLIC ${llguidance_SOURCE_DIR}/parser/) - target_include_directories(onnxruntime-genai-static PUBLIC ${llguidance_SOURCE_DIR}/parser/) - target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser) - target_link_libraries(onnxruntime-genai-static PUBLIC llguidance_parser) - if (WIN32) - # bcrypt is needed for the rust std lib - target_link_libraries(onnxruntime-genai PRIVATE bcrypt) - target_link_libraries(onnxruntime-genai-static PRIVATE bcrypt) - endif() -endif() target_link_directories(onnxruntime-genai PRIVATE ${ORT_LIB_DIR}) # we keep the shared libraries disconnected on Android as they will come from separate AARs and we don't want to force @@ -142,6 +131,8 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER) add_library(onnxruntime-genai-cuda SHARED ${generator_cudalib_srcs}) target_include_directories(onnxruntime-genai-cuda PRIVATE ${ORT_HEADER_DIR}) target_include_directories(onnxruntime-genai-cuda PRIVATE ${GENERATORS_ROOT}) + # target_include_directories(onnxruntime-genai-cuda PRIVATE ${onnxruntime_extensions_SOURCE_DIR}/include) + # target_include_directories(onnxruntime-genai-cuda PRIVATE ${onnxruntime_extensions_SOURCE_DIR}/shared/api/) target_link_libraries(onnxruntime-genai-cuda PRIVATE cublasLt cublas curand cufft cudart) set_target_properties(onnxruntime-genai-cuda PROPERTIES LINKER_LANGUAGE CUDA) add_dependencies(onnxruntime-genai onnxruntime-genai-cuda) @@ -158,6 +149,19 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER) endif() endif() + +if(USE_GUIDANCE) + target_include_directories(onnxruntime-genai PUBLIC ${llguidance_SOURCE_DIR}/parser/) + target_include_directories(onnxruntime-genai-static PUBLIC ${llguidance_SOURCE_DIR}/parser/) + target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser) + target_link_libraries(onnxruntime-genai-static PUBLIC llguidance_parser) + if (WIN32) + # bcrypt is needed for the rust std lib + target_link_libraries(onnxruntime-genai PRIVATE bcrypt) + target_link_libraries(onnxruntime-genai-static PRIVATE bcrypt) + endif() +endif() + if(CMAKE_GENERATOR_TOOLSET MATCHES "Visual Studio") target_link_options(onnxruntime-genai PRIVATE "/CETCOMPAT") target_compile_options(onnxruntime-genai PRIVATE "/sdl") diff --git a/src/generators.cpp b/src/generators.cpp index 0b2489e87..7a61a3bd5 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -5,6 +5,7 @@ #include "sequences.h" #include "models/model.h" #include "models/decoder_only.h" +#include "logits_processor.h" #include "search.h" #include "cpu/interface.h" #include "cuda/interface.h" @@ -266,6 +267,7 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_ search_ = CreateSearch(params); state_ = model.CreateState(search_->GetSequenceLengths(), params); // Search sequence lengths set when creating state + logits_processor_ = CreateLogitsProcessor(*state_); // Temporary solution for multimodal and whisper models if (!params.aux_input_ids.empty() && params.aux_input_ids.data() != nullptr) { AppendTokens(params.aux_input_ids); @@ -302,7 +304,10 @@ void Generator::AppendTokens(const cpu_span input_ids) { void Generator::ComputeLogits(DeviceSpan next_tokens) { if (computed_logits_) throw std::runtime_error("ComputeLogits called again without calling AppendTokens or GenerateNextToken first"); - + if (last_action_ == Action::generated && logits_processor_) { + auto next_tokens_span = next_tokens.CopyDeviceToCpu(); + logits_processor_->CommitTokens(next_tokens_span); + } auto logits = 
state_->Run(search_->GetSequenceLength(), next_tokens, search_->GetNextIndices()); if (g_log.enabled && g_log.model_logits) { auto& stream = Log("model_logits"); @@ -364,6 +369,10 @@ void Generator::GenerateNextToken() { search_->AppendTokens(next_tokens); ComputeLogits(next_tokens); } + if (logits_processor_) { + auto logits = GetLogits(); + logits_processor_->ProcessLogits(logits); + } computed_logits_ = false; auto& search = search_->params_->search; search_->ApplyMinLength(search.min_length); @@ -417,6 +426,7 @@ void Generator::RewindToLength(size_t new_length) { throw std::runtime_error("RewindToLength must be called with new_length=0 when batch_size > 1"); search_->RewindTo(new_length); state_->RewindTo(new_length); + logits_processor_->Reset(); computed_logits_ = false; last_action_ = Action::rewound; } diff --git a/src/generators.h b/src/generators.h index 7f54c918f..0eae8d187 100644 --- a/src/generators.h +++ b/src/generators.h @@ -46,6 +46,7 @@ struct Model; struct State; struct Search; struct Tokenizer; +struct LogitsProcessor; template DeviceSpan WrapTensor(DeviceInterface& device, OrtValue& value) { @@ -128,6 +129,7 @@ struct Generator : LeakChecked { std::shared_ptr model_; std::unique_ptr state_; std::unique_ptr search_; + std::unique_ptr logits_processor_; bool computed_logits_{}; // Set to true in ComputeLogits() and false after appending a token to ensure a 1 to 1 call ratio private: diff --git a/src/logits_processor.cpp b/src/logits_processor.cpp new file mode 100644 index 000000000..fceff783e --- /dev/null +++ b/src/logits_processor.cpp @@ -0,0 +1,240 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#include +#include +#include +#include +#include +#include +#include + +#include "generators.h" +#if USE_GUIDANCE +#include "llguidance.h" +#endif + +#if USE_CUDA +#include "cuda/cuda_common.h" +#include "models/kernels.h" +#endif + +#include "logits_processor.h" + +namespace Generators { + +#if USE_GUIDANCE +GuidanceLogitsProcessor::GuidanceLogitsProcessor(const State& state) + : vocab_size_(state.params_->config.model.vocab_size), eos_token_(state.params_->config.model.eos_token_id), device_type_(state.params_->device_type), batch_size_(state.params_->search.batch_size) { + guidance_type_ = state.params_->guidance_type; + guidance_data_ = state.params_->guidance_data; + if (guidance_type_.empty() || guidance_data_.empty()) { + throw std::runtime_error("Guidance type and data must be provided"); + } + + if (guidance_type_ != "json_schema" && guidance_type_ != "regex" && guidance_type_ != "grammar") { + throw std::runtime_error("Unsupported guidance type: " + std::string(guidance_type_)); + } + + auto tokenize_fn = (LlgTokenizeFn) + [](const void* user_data, const uint8_t* bytes, + size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) -> unsigned long { + const TokenizeData* tokenize_data = reinterpret_cast(user_data); + auto output_ids = tokenize_partial(reinterpret_cast(tokenize_data->tokenizer), tokenize_data->prefix_len, bytes, bytes_len); + size_t output_size = std::min(output_tokens_len, output_ids.size()); + for (size_t i = 0; i < output_size; i++) { + output_tokens[i] = output_ids[i]; + } + return static_cast(output_ids.size()); + }; + + auto tokenizer_path = state.params_->config.config_path.string(); + fs::path tokenizer_path_fs(tokenizer_path); + fs::path json_path(tokenizer_path_fs / kDefaultVocabFile); + std::ifstream json_file(json_path.string()); + std::stringstream json_buffer; + json_buffer 
<< json_file.rdbuf();
+  std::string json_data = json_buffer.str();
+  tokenizer_ = state.model_.CreateTokenizer();
+  auto prefix_len = tokenizer_->Encode(kTokenizePrefixStr).size();
+  tokenize_data_ = {tokenizer_.get(), prefix_len};
+  LlgTokenizerInit tokenizer_init = {
+      static_cast<uint32_t>(vocab_size_),  // vocab_size
+      eos_token_,                          // eos_token
+      nullptr,                             // token_lens
+      nullptr,                             // token_bytes
+      json_data.c_str(),                   // tokenizer_json config data
+      false,                               // tokenize_assumes_string
+      tokenize_fn,                         // tokenize_fn
+      false,                               // use_approximate_greedy_tokenize_fn
+      &tokenize_data_,                     // user_data
+  };
+
+  char error_buf[128];
+  llg_tokenizer_ = std::unique_ptr<LlgTokenizer, LlgTokenizerDeleter>(llg_new_tokenizer(&tokenizer_init, error_buf, sizeof(error_buf)));
+  if (!llg_tokenizer_) {
+    throw std::runtime_error("Error creating llg_tokenizer: " + std::string(error_buf));
+  }
+
+  llg_constraints_.resize(batch_size_);
+  for (int i = 0; i < batch_size_; i++) {
+    LlgConstraintInit constraint_init;
+    llg_constraint_init_set_defaults(&constraint_init, llg_tokenizer_.get());
+    LlgConstraint* constraint_ptr;
+    if (guidance_type_ == "json_schema") {
+      constraint_ptr = llg_new_constraint_json(&constraint_init, guidance_data_.data());
+    } else if (guidance_type_ == "regex") {
+      constraint_ptr = llg_new_constraint_regex(&constraint_init, guidance_data_.data());
+    } else {
+      constraint_ptr = llg_new_constraint(&constraint_init, guidance_data_.data());
+    }
+    if (llg_get_error(constraint_ptr) != nullptr) {
+      std::string error_message = llg_get_error(constraint_ptr);
+      llg_free_constraint(constraint_ptr);
+      throw std::runtime_error("Error creating grammar: " + error_message);
+    }
+    llg_constraints_[i] = std::unique_ptr<LlgConstraint, LlgConstraintDeleter>(constraint_ptr);
+  }
+
+  mask_future_ = std::async(std::launch::async, [&]() {
+    return ComputeMask();
+  });
+
+#if USE_CUDA
+  if (state.params_->device_type == DeviceType::CUDA) {
+    cuda_logits_mask_ptr_ = state.params_->p_device->Allocate<uint32_t>(batch_size_ * vocab_size_ / 32);
+  }
+  cuda_stream_ = state.params_->cuda_stream;
+#endif
+}
+
+std::vector<std::vector<uint32_t>> GuidanceLogitsProcessor::ComputeMask() {
+  std::vector<std::vector<uint32_t>> masks;
+  for (int i = 0; i < batch_size_; i++) {
+    LlgMaskResult mask_result;
+    auto error = llg_compute_mask(llg_constraints_[i].get(), &mask_result);
+    if (error != 0) {
+      std::string error_message = llg_get_error(llg_constraints_[i].get());
+      throw std::runtime_error("Error computing mask: " + error_message);
+    }
+
+    std::vector<uint32_t> mask;
+    if (mask_result.is_stop) {
+      // The constraint is satisfied; only the EOS token remains allowed.
+      mask = std::vector<uint32_t>((vocab_size_ - 1) / 32 + 1, 0);
+      uint32_t eos_mask32 = 1 << (eos_token_ % 32);
+      mask[eos_token_ / 32] = eos_mask32;
+    } else {
+      mask.reserve((vocab_size_ - 1) / 32 + 1);
+      for (int i = 0; i < (vocab_size_ - 1) / 32 + 1; i++) {
+        mask.push_back(mask_result.sample_mask[i]);
+      }
+    }
+    masks.push_back(mask);
+  }
+  return masks;
+}
+
+void GuidanceLogitsProcessor::CommitTokens(std::span<int32_t> tokens) {
+  for (int i = 0; i < batch_size_; i++) {
+    LlgCommitResult commit_result;
+    auto error = llg_commit_token(llg_constraints_[i].get(), static_cast<uint32_t>(tokens[i]), &commit_result);
+    if (error != 0) {
+      std::string error_message = llg_get_error(llg_constraints_[i].get());
+      throw std::runtime_error("Error committing tokens: " + error_message);
+    }
+  }
+  mask_future_ = std::async(std::launch::async, [&]() {
+    return ComputeMask();
+  });
+  masks_.clear();
+}
+
+std::vector<std::vector<uint32_t>> GuidanceLogitsProcessor::GetMask() {
+  if (masks_.empty()) {
+    masks_ = mask_future_.get();
+  }
+  return masks_;
+}
+
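The packed masks returned by GetMask() above use one bit per vocabulary entry. A minimal, self-contained sketch of how such a mask is read (the helper name is illustrative, not part of this patch):

#include <cstdint>
#include <vector>

// Bit (i % 32) of word (i / 32) corresponds to token i; a set bit means the
// token is allowed by the constraint, a clear bit means it must be suppressed.
inline bool IsTokenAllowed(const std::vector<uint32_t>& mask, uint32_t token_id) {
  return (mask[token_id / 32] & (1u << (token_id % 32))) != 0;
}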
+void GuidanceLogitsProcessor::ProcessLogits(DeviceSpan<float> logits) {
+  auto masks = GetMask();
+
+#if USE_CUDA
+  if (device_type_ == DeviceType::CUDA) {
+    for (int i = 0; i < masks.size(); i++) {
+      cudaMemcpyAsync(cuda_logits_mask_ptr_.Span().data() + (i * vocab_size_ / 32), masks.at(i).data(),
+                      masks.at(i).size() * sizeof(uint32_t), ::cudaMemcpyHostToDevice, cuda_stream_);
+    }
+    cuda::LaunchAddLogitsMask(logits.Span().data(), batch_size_, vocab_size_, cuda_logits_mask_ptr_.Span().data(), cuda_stream_);
+    return;
+  }
+#endif
+
+  size_t vocab_index = 0;
+
+  auto logits_span = logits.Span();
+  for (int index = 0; index < batch_size_; index++) {
+    auto subspan = logits_span.subspan(vocab_index, vocab_size_);
+    auto& mask = masks[index];
+    for (size_t i = 0; i < vocab_size_; i++) {
+      // The mask is a packed bitset: bit (i % 32) of word (i / 32) corresponds to token i.
+      // If the bit is set, the token is allowed; otherwise its logit is set to the lowest possible value.
+      subspan[i] = mask[i / 32] & (1 << (i % 32)) ? subspan[i] : std::numeric_limits<float>::lowest();
+    }
+    vocab_index += vocab_size_;
+  }
+}
+
+void GuidanceLogitsProcessor::Reset() {
+  masks_.clear();
+  llg_constraints_.clear();
+  llg_constraints_.resize(batch_size_);
+  for (int i = 0; i < batch_size_; i++) {
+    LlgConstraintInit constraint_init;
+    llg_constraint_init_set_defaults(&constraint_init, llg_tokenizer_.get());
+    LlgConstraint* constraint_ptr;
+    if (guidance_type_ == "json_schema") {
+      constraint_ptr = llg_new_constraint_json(&constraint_init, guidance_data_.data());
+    } else if (guidance_type_ == "regex") {
+      constraint_ptr = llg_new_constraint_regex(&constraint_init, guidance_data_.data());
+    } else {
+      constraint_ptr = llg_new_constraint(&constraint_init, guidance_data_.data());
+    }
+    if (llg_get_error(constraint_ptr) != nullptr) {
+      std::string error_message = llg_get_error(constraint_ptr);
+      llg_free_constraint(constraint_ptr);
+      throw std::runtime_error("Error creating grammar: " + error_message);
+    }
+    llg_constraints_[i] = std::unique_ptr<LlgConstraint, LlgConstraintDeleter>(constraint_ptr);
+  }
+
+  mask_future_ = std::async(std::launch::async, [&]() {
+    return ComputeMask();
+  });
+}
+
+std::vector<int32_t> GuidanceLogitsProcessor::tokenize_partial(const Tokenizer* tokenizer, const size_t prefix_len,
+                                                               const uint8_t* bytes, size_t bytes_len) {
+  // Prepend a fixed prefix before tokenizing so that partial byte sequences produce more stable token ids
+  std::string input_string = kTokenizePrefixStr;
+  input_string.reserve(bytes_len + 2);
+  for (size_t i = 0; i < bytes_len; i++) {
+    input_string.push_back(bytes[i]);
+  }
+  std::vector<int32_t> output_ids = tokenizer->Encode(input_string.c_str());
+  return std::vector<int32_t>(output_ids.begin() + prefix_len, output_ids.end());
+}
+
+#endif
+
+std::unique_ptr<LogitsProcessor> CreateLogitsProcessor(const State& state) {
+#if USE_GUIDANCE
+  if (!state.params_->guidance_type.empty() && !state.params_->guidance_data.empty()) {
+    return std::make_unique<GuidanceLogitsProcessor>(state);
+  }
+
+#endif
+
+  Log("warning", "No supported LogitsProcessor found. e.g.
to use guidance, build with use_guidance=true"); + return nullptr; +} +} // namespace Generators diff --git a/src/models/logits_processor.h b/src/logits_processor.h similarity index 54% rename from src/models/logits_processor.h rename to src/logits_processor.h index 158c2849f..cb9241ec5 100644 --- a/src/models/logits_processor.h +++ b/src/logits_processor.h @@ -7,29 +7,22 @@ #include #include #include +#include #if USE_GUIDANCE #include #endif -#include "model.h" +#include "models/model.h" namespace Generators { -struct LogitsProcessorConfig { - int vocab_size; - uint32_t eos_token; - std::string guidance_type; - std::string guidance_data; - std::shared_ptr tokenizer; - std::string tokenizer_path; -}; - struct LogitsProcessor { LogitsProcessor() = default; virtual ~LogitsProcessor() = default; - virtual std::vector ComputeMask() = 0; - virtual void CommitToken(uint32_t token) = 0; + virtual void CommitTokens(std::span tokens) = 0; + virtual void ProcessLogits(DeviceSpan logits) = 0; + virtual void Reset() = 0; }; #if USE_GUIDANCE @@ -49,20 +42,36 @@ struct GuidanceLogitsProcessor : public LogitsProcessor { static constexpr const char* kDefaultVocabFile = "tokenizer.json"; static constexpr const char* kTokenizePrefixStr = "\x02"; - GuidanceLogitsProcessor(int vocab_size, uint32_t eos_token, const std::string& guidance_type, - const std::string& guidance_data, std::shared_ptr tokenizer, - const std::string& tokenizer_path); - std::vector ComputeMask() override; - void CommitToken(uint32_t token) override; + GuidanceLogitsProcessor(const State& state); + + void ProcessLogits(DeviceSpan logits) override; + void CommitTokens(std::span tokens) override; + void Reset() override; + std::vector> GetMask(); + static std::vector tokenize_partial(const Tokenizer* tokenizer, const size_t prefix_len, + const uint8_t* bytes, size_t bytes_len); + + private: + std::vector> ComputeMask(); size_t vocab_size_; uint32_t eos_token_; - std::unique_ptr llg_constraint_; + int batch_size_; + DeviceType device_type_; + std::string_view guidance_type_; + std::string_view guidance_data_; + std::vector> masks_; + std::vector> llg_constraints_; std::unique_ptr llg_tokenizer_; std::shared_ptr tokenizer_; - static std::vector tokenize_partial(const Tokenizer* tokenizer, const size_t prefix_len, - const uint8_t* bytes, size_t bytes_len); + std::future>> mask_future_; + std::vector> logits_masks_; + +#if USE_CUDA + DeviceSpan cuda_logits_mask_ptr_; + cudaStream_t cuda_stream_; +#endif struct TokenizeData { Tokenizer* tokenizer; @@ -72,6 +81,6 @@ struct GuidanceLogitsProcessor : public LogitsProcessor { }; #endif -std::unique_ptr CreateLogitsProcessor(const LogitsProcessorConfig& config); +std::unique_ptr CreateLogitsProcessor(const State& state); -} // namespace Generators \ No newline at end of file +} // namespace Generators diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp index d394b4491..2c61121c0 100644 --- a/src/models/decoder_only.cpp +++ b/src/models/decoder_only.cpp @@ -37,7 +37,7 @@ DeviceSpan DecoderOnly_State::Run(int total_length, DeviceSpan& void DecoderOnly_State::RewindTo(size_t index) { position_inputs_.RewindTo(index); kv_cache_.RewindTo(index); - logits_.ResetProcessors(); + // logits_.ResetProcessors(); } void DecoderOnly_State::UpdateInputsOutputs(DeviceSpan& next_tokens, DeviceSpan beam_indices, int total_length) { diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 589760eba..8852c48c1 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -34,41 +34,41 @@ 
Logits::Logits(State& state) cuda_eos_token_ids_.CopyCpuToDevice(); } #endif - if (!state_.params_->guidance_type.empty() && !state_.params_->guidance_data.empty()) { - auto tokenizer = model_.CreateTokenizer(); - LogitsProcessorConfig config = { - model_.config_->model.vocab_size, - static_cast(model_.config_->model.eos_token_id), - state_.params_->guidance_type, - state_.params_->guidance_data, - tokenizer, - model_.config_->config_path.string()}; - logits_processors_.resize(shape_[0]); - for (int i = 0; i < shape_[0]; i++) { - logits_processors_[i] = CreateLogitsProcessor(config); - } - if (logits_processors_.at(0)) { - // Compute the mask maybe time consuming, so we do it in a separate thread - mask_future_ = std::async(std::launch::async, [&]() { - std::vector> result; - for (int i = 0; i < shape_[0]; i++) { - auto processor = logits_processors_.at(i).get(); - if (processor == nullptr) { - result.push_back({}); - continue; - } - auto mask = processor->ComputeMask(); - result.push_back(std::move(mask)); - } - return result; - }); -#if USE_CUDA - if (model_.device_type_ == DeviceType::CUDA) { - cuda_logits_mask_ptr_ = state_.params_->p_device->Allocate(shape_[0] * shape_[2] / 32); - } -#endif - } - } +// if (!state_.params_->guidance_type.empty() && !state_.params_->guidance_data.empty()) { +// auto tokenizer = model_.CreateTokenizer(); +// LogitsProcessorConfig config = { +// model_.config_->model.vocab_size, +// static_cast(model_.config_->model.eos_token_id), +// state_.params_->guidance_type, +// state_.params_->guidance_data, +// tokenizer, +// model_.config_->config_path.string()}; +// logits_processors_.resize(shape_[0]); +// for (int i = 0; i < shape_[0]; i++) { +// logits_processors_[i] = CreateLogitsProcessor(config); +// } +// if (logits_processors_.at(0)) { +// // Compute the mask maybe time consuming, so we do it in a separate thread +// mask_future_ = std::async(std::launch::async, [&]() { +// std::vector> result; +// for (int i = 0; i < shape_[0]; i++) { +// auto processor = logits_processors_.at(i).get(); +// if (processor == nullptr) { +// result.push_back({}); +// continue; +// } +// auto mask = processor->ComputeMask(); +// result.push_back(std::move(mask)); +// } +// return result; +// }); +// #if USE_CUDA +// if (model_.device_type_ == DeviceType::CUDA) { +// cuda_logits_mask_ptr_ = state_.params_->p_device->Allocate(shape_[0] * shape_[2] / 32); +// } +// #endif +// } +// } input_sequence_lengths.resize(state_.params_->search.batch_size); } @@ -183,9 +183,9 @@ DeviceSpan Logits::Get() { if (logits_.empty() || logits_of_last_token->GetTensorMutableRawData() != logits_.Span().data()) logits_ = WrapTensor(*state_.params_->p_device, *logits_of_last_token); - if (!logits_processors_.empty() && logits_processors_.at(0) && logits_masks_.empty()) { - logits_masks_ = mask_future_.get(); - } + // if (!logits_processors_.empty() && logits_processors_.at(0) && logits_masks_.empty()) { + // logits_masks_ = mask_future_.get(); + // } #if USE_CUDA if (model_.device_type_ == DeviceType::CUDA) { @@ -198,13 +198,13 @@ DeviceSpan Logits::Get() { static_cast(cuda_eos_token_ids_.size()), model_.cuda_stream_); - if (!logits_masks_.empty()) { - for (int i = 0; i < logits_masks_.size(); i++) { - cudaMemcpyAsync(cuda_logits_mask_ptr_.Span().data() + (i * shape_[2] / 32), logits_masks_.at(i).data(), - logits_masks_.at(i).size() * sizeof(uint32_t), ::cudaMemcpyHostToDevice, model_.cuda_stream_); - } - AddMask(logits_, cuda_logits_mask_ptr_); - } + // if (!logits_masks_.empty()) { + // for 
(int i = 0; i < logits_masks_.size(); i++) { + // cudaMemcpyAsync(cuda_logits_mask_ptr_.Span().data() + (i * shape_[2] / 32), logits_masks_.at(i).data(), + // logits_masks_.at(i).size() * sizeof(uint32_t), ::cudaMemcpyHostToDevice, model_.cuda_stream_); + // } + // AddMask(logits_, cuda_logits_mask_ptr_); + // } return logits_; } #endif @@ -226,9 +226,9 @@ DeviceSpan Logits::Get() { auto batched_logits_cpu = cpu_span{cpu_tensor, element_count}; HandleEOSArray(batched_logits_cpu); - if (!logits_masks_.empty()) { - AddMask(batched_logits_cpu, logits_masks_); - } + // if (!logits_masks_.empty()) { + // AddMask(batched_logits_cpu, logits_masks_); + // } logits_ = WrapTensor(*state_.params_->p_device, *value32_cpu_); return logits_; @@ -236,30 +236,30 @@ DeviceSpan Logits::Get() { #endif HandleEOSArray(logits_.Span()); - if (!logits_masks_.empty()) { - AddMask(logits_.Span(), logits_masks_); - } + // if (!logits_masks_.empty()) { + // AddMask(logits_.Span(), logits_masks_); + // } return logits_; } #pragma warning(pop) void Logits::Update(const DeviceSpan& next_tokens, size_t new_kv_length) { - if (!logits_processors_.empty() && logits_processors_.at(0) && new_kv_length == 1) { - auto next_tokens_cpu = next_tokens.CopyDeviceToCpu(); - for (int i = 0; i < next_tokens_cpu.size(); i++) { - logits_processors_[i]->CommitToken(static_cast(next_tokens_cpu[i])); - } - mask_future_ = std::async(std::launch::async, [&]() { - std::vector> result; - for (int i = 0; i < shape_[0]; i++) { - auto processor = logits_processors_.at(i).get(); - auto mask = processor->ComputeMask(); - result.push_back(mask); - } - return result; - }); - } + // if (!logits_processors_.empty() && logits_processors_.at(0) && new_kv_length == 1) { + // auto next_tokens_cpu = next_tokens.CopyDeviceToCpu(); + // for (int i = 0; i < next_tokens_cpu.size(); i++) { + // logits_processors_[i]->CommitToken(static_cast(next_tokens_cpu[i])); + // } + // mask_future_ = std::async(std::launch::async, [&]() { + // std::vector> result; + // for (int i = 0; i < shape_[0]; i++) { + // auto processor = logits_processors_.at(i).get(); + // auto mask = processor->ComputeMask(); + // result.push_back(mask); + // } + // return result; + // }); + // } if (static_cast(output_raw_.get()->GetTensorTypeAndShapeInfo()->GetShape()[1]) == new_kv_length && new_kv_length == 1) { return; @@ -308,63 +308,63 @@ void Logits::HandleEOSArray(std::span batched_logits) { } } -void Logits::AddMask(std::span logits, std::vector>& masks) { - size_t vocab_size = shape_[2]; - size_t vocab_index = 0; - - for (int index = 0; index < shape_[0]; index++) { - auto logits_span = logits.subspan(vocab_index, vocab_size); - auto& mask = masks[index]; - for (size_t i = 0; i < vocab_size; i++) { - // mask is a 32-bit integer, where each bit corresponds to a token in the vocabulary. - // If the bit is set, the corresponding token is masked (i.e., its logit is set to the lowest possible value). - logits_span[i] = mask[i / 32] & (1 << (i % 32)) ? 
logits_span[i] : std::numeric_limits::lowest(); - } - vocab_index += vocab_size; - } -} - -#if USE_CUDA -void Logits::AddMask(DeviceSpan logits, DeviceSpan mask) { - cuda::LaunchAddLogitsMask(logits.Span().data(), static_cast(shape_[0]), - static_cast(shape_[2]), mask.Span().data(), model_.cuda_stream_); -} -#endif - -void Logits::ResetProcessors() { - if (!state_.params_->guidance_type.empty() && !state_.params_->guidance_data.empty()) { - logits_processors_.clear(); - logits_masks_.clear(); - auto tokenizer = model_.CreateTokenizer(); - LogitsProcessorConfig config = { - model_.config_->model.vocab_size, - static_cast(model_.config_->model.eos_token_id), - state_.params_->guidance_type, - state_.params_->guidance_data, - tokenizer, - model_.config_->config_path.string()}; - logits_processors_.resize(shape_[0]); - for (int i = 0; i < shape_[0]; i++) { - logits_processors_[i] = CreateLogitsProcessor(config); - } - if (logits_processors_.at(0)) { - // Compute the mask maybe time consuming, so we do it in a separate thread - mask_future_ = std::async(std::launch::async, [&]() { - std::vector> result; - for (int i = 0; i < shape_[0]; i++) { - auto processor = logits_processors_.at(i).get(); - if (processor == nullptr) { - result.push_back({}); - continue; - } - auto mask = processor->ComputeMask(); - result.push_back(std::move(mask)); - } - return result; - }); - } - } -} +// void Logits::AddMask(std::span logits, std::vector>& masks) { +// size_t vocab_size = shape_[2]; +// size_t vocab_index = 0; + +// for (int index = 0; index < shape_[0]; index++) { +// auto logits_span = logits.subspan(vocab_index, vocab_size); +// auto& mask = masks[index]; +// for (size_t i = 0; i < vocab_size; i++) { +// // mask is a 32-bit integer, where each bit corresponds to a token in the vocabulary. +// // If the bit is set, the corresponding token is masked (i.e., its logit is set to the lowest possible value). +// logits_span[i] = mask[i / 32] & (1 << (i % 32)) ? 
logits_span[i] : std::numeric_limits::lowest(); +// } +// vocab_index += vocab_size; +// } +// } + +// #if USE_CUDA +// void Logits::AddMask(DeviceSpan logits, DeviceSpan mask) { +// cuda::LaunchAddLogitsMask(logits.Span().data(), static_cast(shape_[0]), +// static_cast(shape_[2]), mask.Span().data(), model_.cuda_stream_); +// } +// #endif + +// void Logits::ResetProcessors() { +// if (!state_.params_->guidance_type.empty() && !state_.params_->guidance_data.empty()) { +// logits_processors_.clear(); +// logits_masks_.clear(); +// auto tokenizer = model_.CreateTokenizer(); +// LogitsProcessorConfig config = { +// model_.config_->model.vocab_size, +// static_cast(model_.config_->model.eos_token_id), +// state_.params_->guidance_type, +// state_.params_->guidance_data, +// tokenizer, +// model_.config_->config_path.string()}; +// logits_processors_.resize(shape_[0]); +// for (int i = 0; i < shape_[0]; i++) { +// logits_processors_[i] = CreateLogitsProcessor(config); +// } +// if (logits_processors_.at(0)) { +// // Compute the mask maybe time consuming, so we do it in a separate thread +// mask_future_ = std::async(std::launch::async, [&]() { +// std::vector> result; +// for (int i = 0; i < shape_[0]; i++) { +// auto processor = logits_processors_.at(i).get(); +// if (processor == nullptr) { +// result.push_back({}); +// continue; +// } +// auto mask = processor->ComputeMask(); +// result.push_back(std::move(mask)); +// } +// return result; +// }); +// } +// } +// } void Logits::Add() { output_index_ = state_.outputs_.size(); diff --git a/src/models/logits.h b/src/models/logits.h index 4f4b9a300..6fb4ba32c 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -3,10 +3,9 @@ #pragma once #include -#include #include "model.h" #include "static_buffer.h" -#include "logits_processor.h" +// #include "logits_processor.h" namespace Generators { @@ -21,13 +20,13 @@ struct Logits { // Resize logits to [bz, token_count, vocab_size] if necessary. void Update(const DeviceSpan& next_tokens, size_t new_kv_length); - // Reset logits processors to support rewind. - void ResetProcessors(); + // // Reset logits processors to support rewind. + // void ResetProcessors(); private: void HandleEOSArray(std::span logits); - void AddMask(std::span logits, std::vector>& mask); + // void AddMask(std::span logits, std::vector>& mask); State& state_; const Model& model_{state_.model_}; @@ -52,14 +51,14 @@ struct Logits { StaticBuffer* sb_logits32_{}; StaticBuffer* sb_logits16_{}; - std::vector> logits_processors_; - std::future>> mask_future_; - std::vector> logits_masks_; + // std::vector> logits_processors_; + // std::future>> mask_future_; + // std::vector> logits_masks_; #if USE_CUDA DeviceSpan cuda_eos_token_ids_; // eos_token_ids from params, but in cuda accessible memory - DeviceSpan cuda_logits_mask_ptr_; - void AddMask(DeviceSpan logits, DeviceSpan mask); + // DeviceSpan cuda_logits_mask_ptr_; + // void AddMask(DeviceSpan logits, DeviceSpan mask); #endif #if USE_DML diff --git a/src/models/logits_processor.cpp b/src/models/logits_processor.cpp deleted file mode 100644 index 83c362685..000000000 --- a/src/models/logits_processor.cpp +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
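For orientation before the deleted listing that follows: the per-sequence interface below (one constraint object per batch entry, driven token by token) is superseded by the batch-oriented processor added above. A rough, self-contained sketch of the shape of the two interfaces (names abbreviated, not the real declarations):

#include <cstdint>
#include <span>
#include <vector>

// Old shape (the file deleted below): one object per sequence, one token at a time.
struct PerSequenceProcessor {
  std::vector<uint32_t> ComputeMask() { return {}; }  // mask for this sequence only
  void CommitToken(uint32_t /*token*/) {}             // advance this sequence only
};

// New shape (src/logits_processor.cpp above): one object owns the whole batch.
struct BatchProcessor {
  void ProcessLogits(std::span<float> /*batched_logits*/) {}  // mask every sequence at once
  void CommitTokens(std::span<int32_t> /*tokens*/) {}         // one committed token per sequence
  void Reset() {}                                             // supports RewindToLength
};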
-#include -#include -#include -#include -#include -#include -#include - -#if USE_GUIDANCE -#include "llguidance.h" -#endif - -#include "logits_processor.h" - -namespace Generators { - -#if USE_GUIDANCE -GuidanceLogitsProcessor::GuidanceLogitsProcessor(int vocab_size, uint32_t eos_token, - const std::string& guidance_type, const std::string& guidance_data, - std::shared_ptr tokenizer, const std::string& tokenizer_path) - : vocab_size_(vocab_size), eos_token_(eos_token), tokenizer_(std::move(tokenizer)) { - if (guidance_type.empty() || guidance_data.empty()) { - throw std::runtime_error("Guidance type and data must be provided"); - } - - if (guidance_type != "json_schema" && guidance_type != "regex" && guidance_type != "grammar") { - throw std::runtime_error("Unsupported guidance type: " + guidance_type); - } - - auto tokenize_fn = (LlgTokenizeFn) + [](const void* user_data, const uint8_t* bytes, - size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) -> unsigned long { - const TokenizeData* tokenize_data = reinterpret_cast(user_data); - auto output_ids = tokenize_partial(reinterpret_cast(tokenize_data->tokenizer), tokenize_data->prefix_len, bytes, bytes_len); - size_t output_size = std::min(output_tokens_len, output_ids.size()); - for (size_t i = 0; i < output_size; i++) { - output_tokens[i] = output_ids[i]; - } - return static_cast(output_ids.size()); - }; - - // TODO reuse the tokenizer between constraints - fs::path tokenizer_path_fs(tokenizer_path); - fs::path json_path(tokenizer_path_fs / kDefaultVocabFile); - std::ifstream json_file(json_path.string()); - std::stringstream json_buffer; - json_buffer << json_file.rdbuf(); - std::string json_data = json_buffer.str(); - auto prefix_len = tokenizer_->Encode(kTokenizePrefixStr).size(); - tokenize_data_ = {tokenizer_.get(), prefix_len}; - LlgTokenizerInit tokenizer_init = { - static_cast(vocab_size_), // vocab_size - eos_token, // eos_token - nullptr, // token_lens - nullptr, // token_bytes - json_data.c_str(), // tokenizer_json config data - false, // tokenize_assumes_string - tokenize_fn, // tokenize_fn - false, // use_approximate_greedy_tokenize_fn - &tokenize_data_, // user_data - }; - - char error_buf[128]; - llg_tokenizer_ = std::unique_ptr(llg_new_tokenizer(&tokenizer_init, error_buf, sizeof(error_buf))); - if (!llg_tokenizer_) { - throw std::runtime_error("Error creating llg_tokenizer: " + std::string(error_buf)); - } - - LlgConstraintInit constraint_init; - llg_constraint_init_set_defaults(&constraint_init, llg_tokenizer_.get()); - LlgConstraint* constraint_ptr; - if (guidance_type == "json_schema") { - constraint_ptr = llg_new_constraint_json(&constraint_init, guidance_data.c_str()); - } else if (guidance_type == "regex") { - constraint_ptr = llg_new_constraint_regex(&constraint_init, guidance_data.c_str()); - } else { - constraint_ptr = llg_new_constraint(&constraint_init, guidance_data.c_str()); - } - if (llg_get_error(constraint_ptr) != nullptr) { - std::string error_message = llg_get_error(constraint_ptr); - auto error = std::runtime_error("Error creating grammar: " + error_message); - llg_free_constraint(constraint_ptr); - throw error; - } - llg_constraint_ = std::unique_ptr(constraint_ptr); -} - -std::vector GuidanceLogitsProcessor::ComputeMask() { - LlgMaskResult mask_result; - auto error = llg_compute_mask(llg_constraint_.get(), &mask_result); - if (error != 0) { - std::string error_message = llg_get_error(llg_constraint_.get()); - throw std::runtime_error("Error computing mask: " + error_message); - } - - 
std::vector mask; - if (mask_result.is_stop) { - mask = std::vector((vocab_size_ - 1) / 32 + 1, 0); - uint32_t eos_mask32 = 1 << (eos_token_ % 32); - mask[eos_token_ / 32] = eos_mask32; - } else { - mask.reserve((vocab_size_ - 1) / 32 + 1); - for (int i = 0; i < (vocab_size_ - 1) / 32 + 1; i++) { - mask.push_back(mask_result.sample_mask[i]); - } - } - return mask; -} - -void GuidanceLogitsProcessor::CommitToken(uint32_t token) { - LlgCommitResult commit_result; - auto error = llg_commit_token(llg_constraint_.get(), token, &commit_result); - if (error != 0) { - std::string error_message = llg_get_error(llg_constraint_.get()); - throw std::runtime_error("Error committing tokens: " + error_message); - } -} - -std::vector GuidanceLogitsProcessor::tokenize_partial(const Tokenizer* tokenizer, const size_t prefix_len, - const uint8_t* bytes, size_t bytes_len) { - // add prefix to tokenize for partial tokenization, it will produce ids more stable - std::string input_string = kTokenizePrefixStr; - input_string.reserve(bytes_len + 2); - for (size_t i = 0; i < bytes_len; i++) { - input_string.push_back(bytes[i]); - } - std::vector output_ids = tokenizer->Encode(input_string.c_str()); - return std::vector(output_ids.begin() + prefix_len, output_ids.end()); -} -#endif - -std::unique_ptr CreateLogitsProcessor(const LogitsProcessorConfig& config) { -#if USE_GUIDANCE - if (!config.guidance_type.empty() && !config.guidance_data.empty()) { - return std::make_unique(config.vocab_size, config.eos_token, config.guidance_type, config.guidance_data, config.tokenizer, config.tokenizer_path); - } - -#endif - - Log("warning", "No supported LogitsProcessor found. e.g. to use guidance, build with use_guidance=true"); - return nullptr; -} -} // namespace Generators diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index f31719e0b..a068f8a3d 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -7,6 +7,7 @@ #include "span.h" #include "ort_genai_c.h" #include "generators.h" +#include "logits_processor.h" #include "models/model.h" #include "runtime_settings.h" #include "search.h" diff --git a/src/python/python.cpp b/src/python/python.cpp index bc964a978..2ac7ce877 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -8,6 +8,7 @@ #include "../ort_genai.h" #include "../json.h" #include "../search.h" +#include "../logits_processor.h" #include "../models/model.h" #include "../logging.h" #include "../smartptrs.h" diff --git a/test/logits_processor_tests.cpp b/test/logits_processor_tests.cpp index 6847d6594..b2aef4826 100644 --- a/test/logits_processor_tests.cpp +++ b/test/logits_processor_tests.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include @@ -38,14 +38,16 @@ TEST(LogitsProcessorTests, TestRegex) { std::string text = "answer: I am a robot"; auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto tokenizer = model->CreateTokenizer(); - auto processor = std::make_unique(model->config_->model.vocab_size, - model->config_->model.eos_token_id, "regex", - regex, tokenizer, model->config_->config_path.string().c_str()); + auto params = Generators::CreateGeneratorParams(*model); + params->SetGuidance("regex", regex); + auto generator = Generators::CreateGenerator(*model, *params); + auto processor = std::make_unique(*generator->state_); auto target_ids = Generators::GuidanceLogitsProcessor::tokenize_partial(tokenizer.get(), 
tokenizer->Encode(Generators::GuidanceLogitsProcessor::kTokenizePrefixStr).size(), reinterpret_cast(text.c_str()), text.size()); for (auto id : target_ids) { - auto mask = processor->ComputeMask(); - processor->CommitToken(id); + auto mask = processor->GetMask(); + auto tokens = std::vector{static_cast(id)}; + processor->CommitTokens(std::span(tokens)); } } @@ -55,14 +57,16 @@ TEST(LogitsProcessorTests, TestJsonSchema) { auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto tokenizer = model->CreateTokenizer(); - auto processor = std::make_unique(model->config_->model.vocab_size, - model->config_->model.eos_token_id, "json_schema", - json_schema, tokenizer, model->config_->config_path.string().c_str()); + auto params = Generators::CreateGeneratorParams(*model); + params->SetGuidance("json_schema", json_schema); + auto generator = Generators::CreateGenerator(*model, *params); + auto processor = std::make_unique(*generator->state_); auto target_ids = Generators::GuidanceLogitsProcessor::tokenize_partial(tokenizer.get(), tokenizer->Encode(Generators::GuidanceLogitsProcessor::kTokenizePrefixStr).size(), reinterpret_cast(text.c_str()), text.size()); for (auto id : target_ids) { - auto mask = processor->ComputeMask(); - processor->CommitToken(id); + auto mask = processor->GetMask(); + auto tokens = std::vector{static_cast(id)}; + processor->CommitTokens(std::span(tokens)); } } diff --git a/test/model_tests.cpp b/test/model_tests.cpp index 321d1ac46..35e4b9b53 100644 --- a/test/model_tests.cpp +++ b/test/model_tests.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/test/sampling_benchmark.cpp b/test/sampling_benchmark.cpp index eb7f04cd9..cc1b151e6 100644 --- a/test/sampling_benchmark.cpp +++ b/test/sampling_benchmark.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp index a71910b15..6d95875f1 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include From 13e51008ade2dd9ee90309261774b7c813faed50 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Thu, 12 Dec 2024 01:54:04 +0000 Subject: [PATCH 57/79] remove comments --- CMakeLists.txt | 3 - src/models/decoder_only.cpp | 1 - src/models/logits.cpp | 130 +----------------------------------- src/models/logits.h | 12 ---- 4 files changed, 2 insertions(+), 144 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6cae8af2c..343c2b0f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,7 +110,6 @@ target_include_directories(onnxruntime-genai-static PRIVATE ${onnxruntime_extens target_include_directories(onnxruntime-genai-static PUBLIC ${onnxruntime_extensions_SOURCE_DIR}/shared/api/) target_link_libraries(onnxruntime-genai PRIVATE onnxruntime_extensions) target_link_libraries(onnxruntime-genai-static PUBLIC onnxruntime_extensions) - target_link_directories(onnxruntime-genai PRIVATE ${ORT_LIB_DIR}) # we keep the shared libraries disconnected on Android as they will come from separate AARs and we don't want to force @@ -131,8 +130,6 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER) add_library(onnxruntime-genai-cuda SHARED ${generator_cudalib_srcs}) target_include_directories(onnxruntime-genai-cuda PRIVATE ${ORT_HEADER_DIR}) target_include_directories(onnxruntime-genai-cuda PRIVATE ${GENERATORS_ROOT}) - # target_include_directories(onnxruntime-genai-cuda PRIVATE 
${onnxruntime_extensions_SOURCE_DIR}/include) - # target_include_directories(onnxruntime-genai-cuda PRIVATE ${onnxruntime_extensions_SOURCE_DIR}/shared/api/) target_link_libraries(onnxruntime-genai-cuda PRIVATE cublasLt cublas curand cufft cudart) set_target_properties(onnxruntime-genai-cuda PROPERTIES LINKER_LANGUAGE CUDA) add_dependencies(onnxruntime-genai onnxruntime-genai-cuda) diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp index 2c61121c0..93d861f91 100644 --- a/src/models/decoder_only.cpp +++ b/src/models/decoder_only.cpp @@ -37,7 +37,6 @@ DeviceSpan DecoderOnly_State::Run(int total_length, DeviceSpan& void DecoderOnly_State::RewindTo(size_t index) { position_inputs_.RewindTo(index); kv_cache_.RewindTo(index); - // logits_.ResetProcessors(); } void DecoderOnly_State::UpdateInputsOutputs(DeviceSpan& next_tokens, DeviceSpan beam_indices, int total_length) { diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 8852c48c1..23c5458ac 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -2,13 +2,12 @@ // Licensed under the MIT License. #include "../generators.h" #include "model.h" - +#include "logits.h" #if USE_CUDA #include "../cuda/cuda_common.h" #include "kernels.h" #endif -#include "logits.h" namespace Generators { Logits::Logits(State& state) @@ -34,41 +33,6 @@ Logits::Logits(State& state) cuda_eos_token_ids_.CopyCpuToDevice(); } #endif -// if (!state_.params_->guidance_type.empty() && !state_.params_->guidance_data.empty()) { -// auto tokenizer = model_.CreateTokenizer(); -// LogitsProcessorConfig config = { -// model_.config_->model.vocab_size, -// static_cast(model_.config_->model.eos_token_id), -// state_.params_->guidance_type, -// state_.params_->guidance_data, -// tokenizer, -// model_.config_->config_path.string()}; -// logits_processors_.resize(shape_[0]); -// for (int i = 0; i < shape_[0]; i++) { -// logits_processors_[i] = CreateLogitsProcessor(config); -// } -// if (logits_processors_.at(0)) { -// // Compute the mask maybe time consuming, so we do it in a separate thread -// mask_future_ = std::async(std::launch::async, [&]() { -// std::vector> result; -// for (int i = 0; i < shape_[0]; i++) { -// auto processor = logits_processors_.at(i).get(); -// if (processor == nullptr) { -// result.push_back({}); -// continue; -// } -// auto mask = processor->ComputeMask(); -// result.push_back(std::move(mask)); -// } -// return result; -// }); -// #if USE_CUDA -// if (model_.device_type_ == DeviceType::CUDA) { -// cuda_logits_mask_ptr_ = state_.params_->p_device->Allocate(shape_[0] * shape_[2] / 32); -// } -// #endif -// } -// } input_sequence_lengths.resize(state_.params_->search.batch_size); } @@ -183,10 +147,6 @@ DeviceSpan Logits::Get() { if (logits_.empty() || logits_of_last_token->GetTensorMutableRawData() != logits_.Span().data()) logits_ = WrapTensor(*state_.params_->p_device, *logits_of_last_token); - // if (!logits_processors_.empty() && logits_processors_.at(0) && logits_masks_.empty()) { - // logits_masks_ = mask_future_.get(); - // } - #if USE_CUDA if (model_.device_type_ == DeviceType::CUDA) { if (!cuda_eos_token_ids_.empty()) @@ -198,13 +158,6 @@ DeviceSpan Logits::Get() { static_cast(cuda_eos_token_ids_.size()), model_.cuda_stream_); - // if (!logits_masks_.empty()) { - // for (int i = 0; i < logits_masks_.size(); i++) { - // cudaMemcpyAsync(cuda_logits_mask_ptr_.Span().data() + (i * shape_[2] / 32), logits_masks_.at(i).data(), - // logits_masks_.at(i).size() * sizeof(uint32_t), ::cudaMemcpyHostToDevice, 
model_.cuda_stream_); - // } - // AddMask(logits_, cuda_logits_mask_ptr_); - // } return logits_; } #endif @@ -226,9 +179,6 @@ DeviceSpan Logits::Get() { auto batched_logits_cpu = cpu_span{cpu_tensor, element_count}; HandleEOSArray(batched_logits_cpu); - // if (!logits_masks_.empty()) { - // AddMask(batched_logits_cpu, logits_masks_); - // } logits_ = WrapTensor(*state_.params_->p_device, *value32_cpu_); return logits_; @@ -236,31 +186,13 @@ DeviceSpan Logits::Get() { #endif HandleEOSArray(logits_.Span()); - // if (!logits_masks_.empty()) { - // AddMask(logits_.Span(), logits_masks_); - // } + return logits_; } #pragma warning(pop) void Logits::Update(const DeviceSpan& next_tokens, size_t new_kv_length) { - // if (!logits_processors_.empty() && logits_processors_.at(0) && new_kv_length == 1) { - // auto next_tokens_cpu = next_tokens.CopyDeviceToCpu(); - // for (int i = 0; i < next_tokens_cpu.size(); i++) { - // logits_processors_[i]->CommitToken(static_cast(next_tokens_cpu[i])); - // } - // mask_future_ = std::async(std::launch::async, [&]() { - // std::vector> result; - // for (int i = 0; i < shape_[0]; i++) { - // auto processor = logits_processors_.at(i).get(); - // auto mask = processor->ComputeMask(); - // result.push_back(mask); - // } - // return result; - // }); - // } - if (static_cast(output_raw_.get()->GetTensorTypeAndShapeInfo()->GetShape()[1]) == new_kv_length && new_kv_length == 1) { return; } @@ -308,64 +240,6 @@ void Logits::HandleEOSArray(std::span batched_logits) { } } -// void Logits::AddMask(std::span logits, std::vector>& masks) { -// size_t vocab_size = shape_[2]; -// size_t vocab_index = 0; - -// for (int index = 0; index < shape_[0]; index++) { -// auto logits_span = logits.subspan(vocab_index, vocab_size); -// auto& mask = masks[index]; -// for (size_t i = 0; i < vocab_size; i++) { -// // mask is a 32-bit integer, where each bit corresponds to a token in the vocabulary. -// // If the bit is set, the corresponding token is masked (i.e., its logit is set to the lowest possible value). -// logits_span[i] = mask[i / 32] & (1 << (i % 32)) ? 
logits_span[i] : std::numeric_limits::lowest(); -// } -// vocab_index += vocab_size; -// } -// } - -// #if USE_CUDA -// void Logits::AddMask(DeviceSpan logits, DeviceSpan mask) { -// cuda::LaunchAddLogitsMask(logits.Span().data(), static_cast(shape_[0]), -// static_cast(shape_[2]), mask.Span().data(), model_.cuda_stream_); -// } -// #endif - -// void Logits::ResetProcessors() { -// if (!state_.params_->guidance_type.empty() && !state_.params_->guidance_data.empty()) { -// logits_processors_.clear(); -// logits_masks_.clear(); -// auto tokenizer = model_.CreateTokenizer(); -// LogitsProcessorConfig config = { -// model_.config_->model.vocab_size, -// static_cast(model_.config_->model.eos_token_id), -// state_.params_->guidance_type, -// state_.params_->guidance_data, -// tokenizer, -// model_.config_->config_path.string()}; -// logits_processors_.resize(shape_[0]); -// for (int i = 0; i < shape_[0]; i++) { -// logits_processors_[i] = CreateLogitsProcessor(config); -// } -// if (logits_processors_.at(0)) { -// // Compute the mask maybe time consuming, so we do it in a separate thread -// mask_future_ = std::async(std::launch::async, [&]() { -// std::vector> result; -// for (int i = 0; i < shape_[0]; i++) { -// auto processor = logits_processors_.at(i).get(); -// if (processor == nullptr) { -// result.push_back({}); -// continue; -// } -// auto mask = processor->ComputeMask(); -// result.push_back(std::move(mask)); -// } -// return result; -// }); -// } -// } -// } - void Logits::Add() { output_index_ = state_.outputs_.size(); diff --git a/src/models/logits.h b/src/models/logits.h index 6fb4ba32c..ff164a990 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -5,7 +5,6 @@ #include #include "model.h" #include "static_buffer.h" -// #include "logits_processor.h" namespace Generators { @@ -20,14 +19,9 @@ struct Logits { // Resize logits to [bz, token_count, vocab_size] if necessary. void Update(const DeviceSpan& next_tokens, size_t new_kv_length); - // // Reset logits processors to support rewind. - // void ResetProcessors(); - private: void HandleEOSArray(std::span logits); - // void AddMask(std::span logits, std::vector>& mask); - State& state_; const Model& model_{state_.model_}; size_t output_index_{~0U}; @@ -51,14 +45,8 @@ struct Logits { StaticBuffer* sb_logits32_{}; StaticBuffer* sb_logits16_{}; - // std::vector> logits_processors_; - // std::future>> mask_future_; - // std::vector> logits_masks_; - #if USE_CUDA DeviceSpan cuda_eos_token_ids_; // eos_token_ids from params, but in cuda accessible memory - // DeviceSpan cuda_logits_mask_ptr_; - // void AddMask(DeviceSpan logits, DeviceSpan mask); #endif #if USE_DML From 9b5a6ceda19a2adbb57cbeb1acde31213fc4306c Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Thu, 12 Dec 2024 08:10:33 +0000 Subject: [PATCH 58/79] fix --- src/logits_processor.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/logits_processor.cpp b/src/logits_processor.cpp index fceff783e..84d428109 100644 --- a/src/logits_processor.cpp +++ b/src/logits_processor.cpp @@ -227,14 +227,13 @@ std::vector GuidanceLogitsProcessor::tokenize_partial(const Tokenizer* #endif std::unique_ptr CreateLogitsProcessor(const State& state) { -#if USE_GUIDANCE if (!state.params_->guidance_type.empty() && !state.params_->guidance_data.empty()) { +#if USE_GUIDANCE return std::make_unique(state); - } - #endif - Log("warning", "No supported LogitsProcessor found. e.g. 
to use guidance, build with use_guidance=true"); + Log("warning", "No supported LogitsProcessor found. e.g. to use guidance, build with use_guidance=true"); + } return nullptr; } } // namespace Generators From a9390e3869757d47a87a4717d4b19982764cb18a Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Thu, 12 Dec 2024 09:06:57 +0000 Subject: [PATCH 59/79] fix segfault --- src/generators.cpp | 4 +++- src/logits_processor.cpp | 1 - src/models/logits.cpp | 2 -- src/models/logits.h | 2 -- 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/generators.cpp b/src/generators.cpp index 7a61a3bd5..2896d5449 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -426,7 +426,9 @@ void Generator::RewindToLength(size_t new_length) { throw std::runtime_error("RewindToLength must be called with new_length=0 when batch_size > 1"); search_->RewindTo(new_length); state_->RewindTo(new_length); - logits_processor_->Reset(); + if (logits_processor_) { + logits_processor_->Reset(); + } computed_logits_ = false; last_action_ = Action::rewound; } diff --git a/src/logits_processor.cpp b/src/logits_processor.cpp index 84d428109..99be17a9d 100644 --- a/src/logits_processor.cpp +++ b/src/logits_processor.cpp @@ -231,7 +231,6 @@ std::unique_ptr CreateLogitsProcessor(const State& state) { #if USE_GUIDANCE return std::make_unique(state); #endif - Log("warning", "No supported LogitsProcessor found. e.g. to use guidance, build with use_guidance=true"); } return nullptr; diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 23c5458ac..9e5be5b57 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -157,7 +157,6 @@ DeviceSpan Logits::Get() { cuda_eos_token_ids_.Span().data(), static_cast(cuda_eos_token_ids_.size()), model_.cuda_stream_); - return logits_; } #endif @@ -186,7 +185,6 @@ DeviceSpan Logits::Get() { #endif HandleEOSArray(logits_.Span()); - return logits_; } diff --git a/src/models/logits.h b/src/models/logits.h index ff164a990..f723c48ee 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -2,8 +2,6 @@ // Licensed under the MIT License. 
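The segfault fixed in this patch comes from RewindToLength dereferencing logits_processor_ unconditionally, while CreateLogitsProcessor deliberately returns nullptr when no guidance is configured. A minimal, self-contained sketch of that contract (type and function names hypothetical):

#include <memory>

// Hypothetical stand-in for the real LogitsProcessor, to keep the sketch self-contained.
struct Processor {
  void Reset() {}
};

// Mirrors the CreateLogitsProcessor contract: returns null when no guidance is configured.
std::unique_ptr<Processor> MaybeCreateProcessor(bool guidance_configured) {
  return guidance_configured ? std::make_unique<Processor>() : nullptr;
}

int main() {
  auto processor = MaybeCreateProcessor(false);
  if (processor) {  // every caller must guard, as RewindToLength now does
    processor->Reset();
  }
}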
#pragma once -#include -#include "model.h" #include "static_buffer.h" namespace Generators { From fc4b7e95dda9a112950c86d9dbf87626a391a9cb Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 13 Dec 2024 01:27:42 +0000 Subject: [PATCH 60/79] fix win build --- src/logits_processor.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/logits_processor.cpp b/src/logits_processor.cpp index 99be17a9d..c90619f3d 100644 --- a/src/logits_processor.cpp +++ b/src/logits_processor.cpp @@ -24,7 +24,10 @@ namespace Generators { #if USE_GUIDANCE GuidanceLogitsProcessor::GuidanceLogitsProcessor(const State& state) - : vocab_size_(state.params_->config.model.vocab_size), eos_token_(state.params_->config.model.eos_token_id), device_type_(state.params_->device_type), batch_size_(state.params_->search.batch_size) { + : vocab_size_(state.params_->config.model.vocab_size), + eos_token_(state.params_->config.model.eos_token_id), + batch_size_(state.params_->search.batch_size), // moved before device_type_ + device_type_(state.params_->device_type) { // moved after batch_size guidance_type_ = state.params_->guidance_type; guidance_data_ = state.params_->guidance_data; if (guidance_type_.empty() || guidance_data_.empty()) { @@ -108,11 +111,11 @@ GuidanceLogitsProcessor::GuidanceLogitsProcessor(const State& state) std::vector> GuidanceLogitsProcessor::ComputeMask() { std::vector> masks; - for (int i = 0; i < batch_size_; i++) { + for (int batch_idx = 0; batch_idx < batch_size_; batch_idx++) { // renamed 'i' to 'batch_idx' LlgMaskResult mask_result; - auto error = llg_compute_mask(llg_constraints_[i].get(), &mask_result); + auto error = llg_compute_mask(llg_constraints_[batch_idx].get(), &mask_result); if (error != 0) { - std::string error_message = llg_get_error(llg_constraints_[i].get()); + std::string error_message = llg_get_error(llg_constraints_[batch_idx].get()); throw std::runtime_error("Error computing mask: " + error_message); } From a0710d5aa3dab5ea49a81b9bb52d83158aeabf47 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Fri, 13 Dec 2024 06:19:55 +0000 Subject: [PATCH 61/79] fix win error --- src/logits_processor.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/logits_processor.cpp b/src/logits_processor.cpp index c90619f3d..84a8f1411 100644 --- a/src/logits_processor.cpp +++ b/src/logits_processor.cpp @@ -24,10 +24,10 @@ namespace Generators { #if USE_GUIDANCE GuidanceLogitsProcessor::GuidanceLogitsProcessor(const State& state) - : vocab_size_(state.params_->config.model.vocab_size), - eos_token_(state.params_->config.model.eos_token_id), - batch_size_(state.params_->search.batch_size), // moved before device_type_ - device_type_(state.params_->device_type) { // moved after batch_size + : vocab_size_(state.params_->config.model.vocab_size), + eos_token_(state.params_->config.model.eos_token_id), + batch_size_(state.params_->search.batch_size), // moved before device_type_ + device_type_(state.params_->device_type) { // moved after batch_size guidance_type_ = state.params_->guidance_type; guidance_data_ = state.params_->guidance_data; if (guidance_type_.empty() || guidance_data_.empty()) { @@ -165,7 +165,7 @@ void GuidanceLogitsProcessor::ProcessLogits(DeviceSpan logits) { if (device_type_ == DeviceType::CUDA) { for (int i = 0; i < masks.size(); i++) { cudaMemcpyAsync(cuda_logits_mask_ptr_.Span().data() + (i * vocab_size_ / 32), masks.at(i).data(), - masks.at(i).size() * sizeof(uint32_t), ::cudaMemcpyHostToDevice, cuda_stream_); + 
static_cast(masks.at(i).size() * sizeof(uint32_t)), ::cudaMemcpyHostToDevice, cuda_stream_); } cuda::LaunchAddLogitsMask(logits.Span().data(), batch_size_, vocab_size_, cuda_logits_mask_ptr_.Span().data(), cuda_stream_); return; From 7d4d6bb96355d8845c7d938f07b22237edf10119 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Mon, 16 Dec 2024 03:28:07 +0000 Subject: [PATCH 62/79] fix win error --- src/logits_processor.cpp | 2 +- src/logits_processor.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/logits_processor.cpp b/src/logits_processor.cpp index 84a8f1411..1bf17c828 100644 --- a/src/logits_processor.cpp +++ b/src/logits_processor.cpp @@ -163,7 +163,7 @@ void GuidanceLogitsProcessor::ProcessLogits(DeviceSpan logits) { #if USE_CUDA if (device_type_ == DeviceType::CUDA) { - for (int i = 0; i < masks.size(); i++) { + for (int i = 0; i < static_cast(masks.size()); i++) { cudaMemcpyAsync(cuda_logits_mask_ptr_.Span().data() + (i * vocab_size_ / 32), masks.at(i).data(), static_cast(masks.at(i).size() * sizeof(uint32_t)), ::cudaMemcpyHostToDevice, cuda_stream_); } diff --git a/src/logits_processor.h b/src/logits_processor.h index cb9241ec5..ee44d0078 100644 --- a/src/logits_processor.h +++ b/src/logits_processor.h @@ -54,7 +54,7 @@ struct GuidanceLogitsProcessor : public LogitsProcessor { private: std::vector> ComputeMask(); - size_t vocab_size_; + int vocab_size_; uint32_t eos_token_; int batch_size_; DeviceType device_type_; From 5d175a762feea63dcc3d398fbe8f9efbab49a4e7 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Mon, 16 Dec 2024 06:58:32 +0000 Subject: [PATCH 63/79] add comments --- src/generators.cpp | 2 +- src/generators.h | 4 ++-- src/logits_processor.cpp | 15 ++++++++------- src/logits_processor.h | 11 +++++++++-- 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/generators.cpp b/src/generators.cpp index 2896d5449..7ff3373fe 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -267,7 +267,7 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_ search_ = CreateSearch(params); state_ = model.CreateState(search_->GetSequenceLengths(), params); // Search sequence lengths set when creating state - logits_processor_ = CreateLogitsProcessor(*state_); + logits_processor_ = CreateLogitsProcessor(*state_); // Could be nullptr if no logits processor is used // Temporary solution for multimodal and whisper models if (!params.aux_input_ids.empty() && params.aux_input_ids.data() != nullptr) { AppendTokens(params.aux_input_ids); diff --git a/src/generators.h b/src/generators.h index 0eae8d187..d366b5fb9 100644 --- a/src/generators.h +++ b/src/generators.h @@ -104,8 +104,8 @@ struct GeneratorParams : std::enable_shared_from_this, LeakChec void SetInputs(const NamedTensors& inputs); - std::string guidance_type; - std::string guidance_data; + std::string guidance_type; // e.g. json_schema or regex + std::string guidance_data; // e.g. 
rules data in json_schema or regex void SetGuidance(std::string_view type, std::string_view data); private: diff --git a/src/logits_processor.cpp b/src/logits_processor.cpp index 1bf17c828..500645bf1 100644 --- a/src/logits_processor.cpp +++ b/src/logits_processor.cpp @@ -26,16 +26,16 @@ namespace Generators { GuidanceLogitsProcessor::GuidanceLogitsProcessor(const State& state) : vocab_size_(state.params_->config.model.vocab_size), eos_token_(state.params_->config.model.eos_token_id), - batch_size_(state.params_->search.batch_size), // moved before device_type_ - device_type_(state.params_->device_type) { // moved after batch_size + batch_size_(state.params_->search.batch_size), + device_type_(state.params_->device_type) { guidance_type_ = state.params_->guidance_type; guidance_data_ = state.params_->guidance_data; if (guidance_type_.empty() || guidance_data_.empty()) { - throw std::runtime_error("Guidance type and data must be provided"); + throw std::runtime_error("Guidance type and data must be provided together"); } - if (guidance_type_ != "json_schema" && guidance_type_ != "regex" && guidance_type_ != "grammar") { - throw std::runtime_error("Unsupported guidance type: " + std::string(guidance_type_)); + if (guidance_type_ != "json_schema" && guidance_type_ != "regex") { + throw std::runtime_error("Unsupported guidance type: " + std::string(guidance_type_) + " (only json_schema and regex are supported)"); } auto tokenize_fn = (LlgTokenizeFn) + [](const void* user_data, const uint8_t* bytes, @@ -87,7 +87,7 @@ GuidanceLogitsProcessor::GuidanceLogitsProcessor(const State& state) } else if (guidance_type_ == "regex") { constraint_ptr = llg_new_constraint_regex(&constraint_init, guidance_data_.data()); } else { - constraint_ptr = llg_new_constraint(&constraint_init, guidance_data_.data()); + throw std::runtime_error("Unsupported guidance type: " + std::string(guidance_type_) + " (only json_schema and regex are supported)"); } if (llg_get_error(constraint_ptr) != nullptr) { std::string error_message = llg_get_error(constraint_ptr); @@ -97,6 +97,7 @@ GuidanceLogitsProcessor::GuidanceLogitsProcessor(const State& state) llg_constraints_[i] = std::unique_ptr(constraint_ptr); } + // Compute the mask asynchronously to avoid blocking the model inference on device mask_future_ = std::async(std::launch::async, [&]() { return ComputeMask(); }); @@ -200,7 +201,7 @@ void GuidanceLogitsProcessor::Reset() { } else if (guidance_type_ == "regex") { constraint_ptr = llg_new_constraint_regex(&constraint_init, guidance_data_.data()); } else { - constraint_ptr = llg_new_constraint(&constraint_init, guidance_data_.data()); + throw std::runtime_error("Unsupported guidance type: " + std::string(guidance_type_) + " (only json_schema and regex are supported)"); } if (llg_get_error(constraint_ptr) != nullptr) { std::string error_message = llg_get_error(constraint_ptr); diff --git a/src/logits_processor.h b/src/logits_processor.h index ee44d0078..83e593dd1 100644 --- a/src/logits_processor.h +++ b/src/logits_processor.h @@ -20,8 +20,11 @@ namespace Generators { struct LogitsProcessor { LogitsProcessor() = default; virtual ~LogitsProcessor() = default; + // CommitTokens is used to commit the generated tokens to the logits processor virtual void CommitTokens(std::span tokens) = 0; + // ProcessLogits is used to add logits mask to the logits virtual void ProcessLogits(DeviceSpan logits) = 0; + // Reset is used to reset the logits processor after rewinding virtual void Reset() = 0; }; @@ -39,15 +42,19 @@ struct 
LlgTokenizerDeleter {
 };
 
 struct GuidanceLogitsProcessor : public LogitsProcessor {
-  static constexpr const char* kDefaultVocabFile = "tokenizer.json";
+  // llguidance needs tokenizer.json to add special tokens
+  static constexpr const char* kDefaultVocabFile = "tokenizer.json"; 
+  // the tokenizer needs to tokenize tokens with a special prefix
   static constexpr const char* kTokenizePrefixStr = "\x02";
 
   GuidanceLogitsProcessor(const State& state);
-
   void ProcessLogits(DeviceSpan<float> logits) override;
   void CommitTokens(std::span<int32_t> tokens) override;
   void Reset() override;
+  // GetMask is used to get the logits mask
   std::vector<std::vector<uint32_t>> GetMask();
+  // tokenize_partial is used to tokenize the input tokens with a special prefix; this yields stable
+  // token ids. 

From 52ccc8b4da3c61e3b5d6abf4988474ae376c6db1 Mon Sep 17 00:00:00 2001
From: Ying Xiong
Date: Mon, 16 Dec 2024 07:29:59 +0000
Subject: [PATCH 64/79] fix format

---
 src/generators.cpp     | 2 +-
 src/generators.h       | 4 ++--
 src/logits_processor.h | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/generators.cpp b/src/generators.cpp
index 7ff3373fe..1aedc8a5a 100644
--- a/src/generators.cpp
+++ b/src/generators.cpp
@@ -267,7 +267,7 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_
   search_ = CreateSearch(params);
   state_ = model.CreateState(search_->GetSequenceLengths(), params);  // Search sequence lengths set when creating state
-  logits_processor_ = CreateLogitsProcessor(*state_); // Could be nullptr if no logits processor is used
+  logits_processor_ = CreateLogitsProcessor(*state_);  // Could be nullptr if no logits processor is used
   // Temporary solution for multimodal and whisper models
   if (!params.aux_input_ids.empty() && params.aux_input_ids.data() != nullptr) {
     AppendTokens(params.aux_input_ids);
diff --git a/src/generators.h b/src/generators.h
index d366b5fb9..430d6197b 100644
--- a/src/generators.h
+++ b/src/generators.h
@@ -104,8 +104,8 @@ struct GeneratorParams : std::enable_shared_from_this<GeneratorParams>, LeakChecked<GeneratorParams> {
 
   void SetInputs(const NamedTensors& inputs);
 
-  std::string guidance_type; // e.g. json_schema or regex
-  std::string guidance_data; // e.g. rules data in json_schema or regex
+  std::string guidance_type;  // e.g. json_schema or regex
+  std::string guidance_data;  // e.g. rules data in json_schema or regex
   void SetGuidance(std::string_view type, std::string_view data);
 
 private:
diff --git a/src/logits_processor.h b/src/logits_processor.h
index 83e593dd1..66f08e779 100644
--- a/src/logits_processor.h
+++ b/src/logits_processor.h
@@ -43,7 +43,7 @@ struct LlgTokenizerDeleter {
 
 struct GuidanceLogitsProcessor : public LogitsProcessor {
   // llguidance needs tokenizer.json to add special tokens
-  static constexpr const char* kDefaultVocabFile = "tokenizer.json"; 
+  static constexpr const char* kDefaultVocabFile = "tokenizer.json";
   // the tokenizer needs to tokenize tokens with a special prefix
   static constexpr const char* kTokenizePrefixStr = "\x02";
 
@@ -53,8 +53,8 @@
   void Reset() override;
   // GetMask is used to get the logits mask
   std::vector<std::vector<uint32_t>> GetMask();
-  // tokenize_partial is used to tokenize the input tokens with a special prefix; this yields stable
-  // token ids. 
+  // tokenize_partial is used to tokenize the input tokens with a special prefix; this yields stable
+  // token ids.
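
// A hedged sketch of the stable-prefix trick behind the tokenize_partial
// helper declared just below. A std::function stands in for the real
// Tokenizer API (an assumption, not the library's signature): encode the
// fixed "\x02" prefix plus the raw bytes, then drop the prefix's tokens so
// the returned ids do not depend on where in the stream the bytes start.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

using EncodeFn = std::function<std::vector<int32_t>(const std::string&)>;

std::vector<int32_t> TokenizePartialSketch(const EncodeFn& encode, size_t prefix_token_len,
                                           const uint8_t* bytes, size_t bytes_len) {
  std::string text = "\x02";  // fixed prefix, matching kTokenizePrefixStr
  text.append(reinterpret_cast<const char*>(bytes), bytes_len);
  std::vector<int32_t> ids = encode(text);
  // Assumes the encoder always emits at least prefix_token_len ids for the prefix.
  ids.erase(ids.begin(), ids.begin() + static_cast<std::ptrdiff_t>(prefix_token_len));
  return ids;
}
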
static std::vector<int32_t> tokenize_partial(const Tokenizer* tokenizer, const size_t prefix_len,
                                              const uint8_t* bytes, size_t bytes_len);

From 161fcfc7d1c6cea49fd3d9eb1574540a540aacf3 Mon Sep 17 00:00:00 2001
From: Ying Xiong
Date: Tue, 17 Dec 2024 07:39:09 +0000
Subject: [PATCH 65/79] fix bug

---
 src/logits_processor.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/logits_processor.cpp b/src/logits_processor.cpp
index 500645bf1..759f3d726 100644
--- a/src/logits_processor.cpp
+++ b/src/logits_processor.cpp
@@ -122,7 +122,6 @@ std::vector<std::vector<uint32_t>> GuidanceLogitsProcessor::ComputeMask() {
 
     std::vector<uint32_t> mask;
     if (mask_result.is_stop) {
-      std::cout << "should stop" << std::endl;
       mask = std::vector<uint32_t>((vocab_size_ - 1) / 32 + 1, 0);
       uint32_t eos_mask32 = 1 << (eos_token_ % 32);
       mask[eos_token_ / 32] = eos_mask32;
@@ -171,7 +170,7 @@ void GuidanceLogitsProcessor::ProcessLogits(DeviceSpan<float> logits) {
       cuda::LaunchAddLogitsMask(logits.Span().data(), batch_size_, vocab_size_, cuda_logits_mask_ptr_.Span().data(), cuda_stream_);
       return;
     }
-#else
+#endif
 
   size_t vocab_index = 0;
   auto logits_span = logits.Span();
@@ -185,7 +184,6 @@
     }
     vocab_index += vocab_size_;
   }
-#endif
 }

From 8e037358e0919a8e2f396a1e622d24a46883b444 Mon Sep 17 00:00:00 2001
From: Ying Xiong
Date: Tue, 17 Dec 2024 08:11:22 +0000
Subject: [PATCH 66/79] support build in iOS GHA

---
 .github/workflows/ios-build.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ios-build.yml b/.github/workflows/ios-build.yml
index 71c51267b..ade86d82e 100644
--- a/.github/workflows/ios-build.yml
+++ b/.github/workflows/ios-build.yml
@@ -3,8 +3,8 @@ on:
   workflow_dispatch:
   push:
     branches:
-    - main
-    - rel-*
+      - main
+      - rel-*
   pull_request:
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -20,7 +20,7 @@ jobs:
 
     - uses: actions/setup-python@v5
       with:
-        python-version: '3.12.x'
+        python-version: "3.12.x"
 
     - name: Install the python wheel and dependencies
       run: |
@@ -28,6 +28,9 @@ jobs:
         source genai-macos-venv/bin/activate
         python3 -m pip install requests
 
+    - name: Install Rust Toolchain
+      uses: dtolnay/rust-toolchain@1.82.0
+
     - name: Run iOS Build
       run: |
         set -e -x
@@ -38,4 +41,4 @@ jobs:
           --osx_arch arm64 \
           --apple_deploy_target 15.4 \
           --cmake_generator 'Xcode' \
-          --build_dir build_iphonesimulator
\ No newline at end of file
+          --build_dir build_iphonesimulator

From db3062b22b6a2c3286dccf7ebe3418037f8c58e8 Mon Sep 17 00:00:00 2001
From: Ying Xiong
Date: Tue, 17 Dec 2024 08:44:09 +0000
Subject: [PATCH 67/79] update win azure ci

---
 .pipelines/stages/jobs/steps/capi-win-step.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml
index 168fa47c7..fcbaaa398 100644
--- a/.pipelines/stages/jobs/steps/capi-win-step.yml
+++ b/.pipelines/stages/jobs/steps/capi-win-step.yml
@@ -46,6 +46,13 @@ steps:
     condition: eq(variables['ep'], 'cuda')
     workingDirectory: '$(Build.Repository.LocalPath)'
 
+- powershell: |
+    $exePath = "$env:TEMP\rustup-init.exe"
+    (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath)
+    & $exePath -y --default-toolchain=1.82.0
+    Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"
+  displayName: 'Install Rust Toolchain'
+
 - powershell: |
     cmake --preset
windows_$(arch)_$(ep)_$(build_config) -T cuda='$(Build.Repository.LocalPath)\cuda_sdk\v$(cuda_version)' displayName: 'Configure CMake C API with CUDA' From 0cdb4ac7cc62affa8db24e52e453f5ab9e53a611 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Tue, 17 Dec 2024 08:47:31 +0000 Subject: [PATCH 68/79] update linux ci --- .../linux/docker/manylinux/Dockerfile.manylinux2_28_cpu | 4 +++- .../linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_11.8 | 4 +++- .../linux/docker/manylinux/Dockerfile.manylinux2_28_rocm | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cpu index 6af5a8a21..af71099df 100644 --- a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cpu @@ -2,7 +2,9 @@ FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos_gcc12.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts - +ENV PATH="/usr/.cargo/bin:$PATH" +ENV RUSTUP_HOME="/usr/.rustup" +ENV CARGO_HOME="/usr/.cargo" ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_11.8 b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_11.8 index 6df955c02..673ba0300 100644 --- a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_11.8 +++ b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_cuda_11.8 @@ -10,7 +10,9 @@ else \ echo "Using default gcc because CUDA version is less than 12"; \ cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts; \ fi - +ENV PATH="/usr/.cargo/bin:$PATH" +ENV RUSTUP_HOME="/usr/.rustup" +ENV CARGO_HOME="/usr/.cargo" ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_rocm index 6af5a8a21..af71099df 100644 --- a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_rocm +++ b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_rocm @@ -2,7 +2,9 @@ FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos_gcc12.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts - +ENV PATH="/usr/.cargo/bin:$PATH" +ENV RUSTUP_HOME="/usr/.rustup" +ENV CARGO_HOME="/usr/.cargo" ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER From 991a012535c303c2b8b6fedf2b15ec4a64f377b2 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Tue, 17 Dec 2024 09:06:09 +0000 Subject: [PATCH 69/79] fix win ci --- .pipelines/stages/jobs/steps/capi-win-step.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml index fcbaaa398..f3d4fdbc5 100644 --- a/.pipelines/stages/jobs/steps/capi-win-step.yml +++ b/.pipelines/stages/jobs/steps/capi-win-step.yml @@ -50,7 +50,8 @@ steps: $exePath = "$env:TEMP\rustup-init.exe" (New-Object 
Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) & $exePath -y --default-toolchain=1.82.0 - Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin" + $env:Path = "$env:USERPROFILE\.cargo\bin;$env:Path" + Write-Host $env:Path displayName: 'Install Rust Toolchain' - powershell: | From 4fdbea624e32cf87917e1839b86b20966060500c Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Tue, 17 Dec 2024 09:47:02 +0000 Subject: [PATCH 70/79] fix win ci --- .pipelines/stages/jobs/steps/capi-win-step.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml index f3d4fdbc5..cb5ca72a5 100644 --- a/.pipelines/stages/jobs/steps/capi-win-step.yml +++ b/.pipelines/stages/jobs/steps/capi-win-step.yml @@ -50,8 +50,7 @@ steps: $exePath = "$env:TEMP\rustup-init.exe" (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) & $exePath -y --default-toolchain=1.82.0 - $env:Path = "$env:USERPROFILE\.cargo\bin;$env:Path" - Write-Host $env:Path + Write-Host "##vso[task.prependpath]$env:USERPROFILE\.cargo\bin" displayName: 'Install Rust Toolchain' - powershell: | From e99ae657423249f1de9dbadc314f7bec07c7c590 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Tue, 17 Dec 2024 12:31:40 +0000 Subject: [PATCH 71/79] fix macos arm --- .pipelines/stages/jobs/steps/capi-macos-step.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.pipelines/stages/jobs/steps/capi-macos-step.yml b/.pipelines/stages/jobs/steps/capi-macos-step.yml index e90b3bcaf..7a04bc769 100644 --- a/.pipelines/stages/jobs/steps/capi-macos-step.yml +++ b/.pipelines/stages/jobs/steps/capi-macos-step.yml @@ -29,6 +29,18 @@ steps: echo "build_config=${{ parameters.build_config }}" displayName: 'Print Parameters' +- bash: | + rustup toolchain install 1.82.0-x86_64-apple-darwin + rustup default 1.82.0-x86_64-apple-darwin + + if ("$(arch)" -eq "arm64") { + rustup target add --toolchain 1.82.0-x86_64-apple-darwin aarch64-apple-darwin + } + else { + rustup target add --toolchain 1.82.0-x86_64-apple-darwin x86_64-apple-darwin + } + displayName: 'Install Rust Toolchain' + - powershell: | $env:MACOSX_DEPLOYMENT_TARGET = "12.0" # Monterey cmake --preset macos_$(arch)_$(ep)_$(build_config) From ed96504df1a3e00b9a8b4b2738decc97dcddd51e Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Tue, 17 Dec 2024 12:36:01 +0000 Subject: [PATCH 72/79] fix macos azure ci --- .pipelines/stages/jobs/steps/capi-macos-step.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/stages/jobs/steps/capi-macos-step.yml b/.pipelines/stages/jobs/steps/capi-macos-step.yml index 7a04bc769..e2275f4ef 100644 --- a/.pipelines/stages/jobs/steps/capi-macos-step.yml +++ b/.pipelines/stages/jobs/steps/capi-macos-step.yml @@ -29,7 +29,7 @@ steps: echo "build_config=${{ parameters.build_config }}" displayName: 'Print Parameters' -- bash: | +- powershell: | rustup toolchain install 1.82.0-x86_64-apple-darwin rustup default 1.82.0-x86_64-apple-darwin From 4cb9e553a32fc35c781c771e7ff3ddb473342e4a Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 18 Dec 2024 07:29:42 +0000 Subject: [PATCH 73/79] fix for review --- src/logits_processor.cpp | 14 ++++++++------ src/smartptrs.h | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/logits_processor.cpp b/src/logits_processor.cpp index 759f3d726..0352f8174 
100644 --- a/src/logits_processor.cpp +++ b/src/logits_processor.cpp @@ -27,9 +27,9 @@ GuidanceLogitsProcessor::GuidanceLogitsProcessor(const State& state) : vocab_size_(state.params_->config.model.vocab_size), eos_token_(state.params_->config.model.eos_token_id), batch_size_(state.params_->search.batch_size), - device_type_(state.params_->device_type) { - guidance_type_ = state.params_->guidance_type; - guidance_data_ = state.params_->guidance_data; + device_type_(state.params_->device_type), + guidance_type_(state.params_->guidance_type), + guidance_data_(state.params_->guidance_data) { if (guidance_type_.empty() || guidance_data_.empty()) { throw std::runtime_error("Guidance type and data must be provided together"); } @@ -38,8 +38,9 @@ GuidanceLogitsProcessor::GuidanceLogitsProcessor(const State& state) throw std::runtime_error("Unsupported guidance type: " + std::string(guidance_type_) + " (only json_schema and regex are supported)"); } - auto tokenize_fn = (LlgTokenizeFn) + [](const void* user_data, const uint8_t* bytes, - size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) -> unsigned long { + auto tokenize_fn = (LlgTokenizeFn)[](const void* user_data, const uint8_t* bytes, + size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) + ->unsigned long { const TokenizeData* tokenize_data = reinterpret_cast(user_data); auto output_ids = tokenize_partial(reinterpret_cast(tokenize_data->tokenizer), tokenize_data->prefix_len, bytes, bytes_len); size_t output_size = std::min(output_tokens_len, output_ids.size()); @@ -122,6 +123,7 @@ std::vector> GuidanceLogitsProcessor::ComputeMask() { std::vector mask; if (mask_result.is_stop) { + // when logits processor decides to stop, we mask all tokens except the EOS token mask = std::vector((vocab_size_ - 1) / 32 + 1, 0); uint32_t eos_mask32 = 1 << (eos_token_ % 32); mask[eos_token_ / 32] = eos_mask32; @@ -173,7 +175,7 @@ void GuidanceLogitsProcessor::ProcessLogits(DeviceSpan logits) { #endif size_t vocab_index = 0; - auto logits_span = logits.Span(); + auto logits_span = logits.CpuSpan(); for (int index = 0; index < batch_size_; index++) { auto subspan = logits_span.subspan(vocab_index, vocab_size_); auto& mask = masks[index]; diff --git a/src/smartptrs.h b/src/smartptrs.h index 68dc83fb9..ac7037957 100644 --- a/src/smartptrs.h +++ b/src/smartptrs.h @@ -50,7 +50,7 @@ struct DeviceSpan { } // Copy device memory to CPU memory and return the CPU accessible memory - std::span CopyDeviceToCpu() const { + std::span CopyDeviceToCpu() { p_device_memory_->CopyDeviceToCpu(); return std::span{reinterpret_cast(p_device_memory_->p_cpu_) + begin_, length_}; } From e75166a56aff7341c545b74c6801328d2c5385b9 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 18 Dec 2024 07:42:16 +0000 Subject: [PATCH 74/79] fix --- .github/workflows/ios-build.yml | 2 ++ src/logits_processor.cpp | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ios-build.yml b/.github/workflows/ios-build.yml index ade86d82e..7bbfcf165 100644 --- a/.github/workflows/ios-build.yml +++ b/.github/workflows/ios-build.yml @@ -35,6 +35,8 @@ jobs: run: | set -e -x source genai-macos-venv/bin/activate + export PATH=/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib:$PATH + export LIBRARY_PATH="$LIBRARY_PATH:/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib" # Needed for Rust compilation and linking python3 build.py --ios \ --parallel \ --apple_sysroot iphonesimulator \ diff --git a/src/logits_processor.cpp 
b/src/logits_processor.cpp index 0352f8174..3768aa93d 100644 --- a/src/logits_processor.cpp +++ b/src/logits_processor.cpp @@ -38,9 +38,9 @@ GuidanceLogitsProcessor::GuidanceLogitsProcessor(const State& state) throw std::runtime_error("Unsupported guidance type: " + std::string(guidance_type_) + " (only json_schema and regex are supported)"); } - auto tokenize_fn = (LlgTokenizeFn)[](const void* user_data, const uint8_t* bytes, - size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) - ->unsigned long { + auto tokenize_fn = (LlgTokenizeFn) + [](const void* user_data, const uint8_t* bytes, + size_t bytes_len, uint32_t* output_tokens, size_t output_tokens_len) + -> unsigned long { const TokenizeData* tokenize_data = reinterpret_cast(user_data); auto output_ids = tokenize_partial(reinterpret_cast(tokenize_data->tokenizer), tokenize_data->prefix_len, bytes, bytes_len); size_t output_size = std::min(output_tokens_len, output_ids.size()); From 82d33e55e169344a4dcda69c89e9551ff1631ab9 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 18 Dec 2024 07:59:02 +0000 Subject: [PATCH 75/79] fix ios ci --- .github/workflows/ios-build.yml | 4 ++++ build.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/ios-build.yml b/.github/workflows/ios-build.yml index 7bbfcf165..14efdedcc 100644 --- a/.github/workflows/ios-build.yml +++ b/.github/workflows/ios-build.yml @@ -31,6 +31,10 @@ jobs: - name: Install Rust Toolchain uses: dtolnay/rust-toolchain@1.82.0 + - name: Install Rust target + run: | + rustup target add aarch64-apple-ios-sim + - name: Run iOS Build run: | set -e -x diff --git a/build.py b/build.py index afe9efd74..a65769479 100644 --- a/build.py +++ b/build.py @@ -551,6 +551,8 @@ def _get_opencv_toolchain_file(): # The following arguments are specific to the OpenCV toolchain file f"-DCMAKE_TOOLCHAIN_FILE={_get_opencv_toolchain_file()}", ] + if args.use_guidance: + command += ["-DRust_CARGO_TARGET=aarch64-apple-ios-sim"] if args.macos == "Catalyst": if args.cmake_generator == "Xcode": From 3498e0efd4d4e53442dba751e31b00d09fc14734 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Wed, 18 Dec 2024 08:07:07 +0000 Subject: [PATCH 76/79] disable on ios ci --- .github/workflows/ios-build.yml | 18 +++++------------- test/c_api_tests.cpp | 2 ++ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ios-build.yml b/.github/workflows/ios-build.yml index 14efdedcc..a1fb9d9eb 100644 --- a/.github/workflows/ios-build.yml +++ b/.github/workflows/ios-build.yml @@ -3,8 +3,8 @@ on: workflow_dispatch: push: branches: - - main - - rel-* + - main + - rel-* pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -20,7 +20,7 @@ jobs: - uses: actions/setup-python@v5 with: - python-version: "3.12.x" + python-version: '3.12.x' - name: Install the python wheel and dependencies run: | @@ -28,23 +28,15 @@ jobs: source genai-macos-venv/bin/activate python3 -m pip install requests - - name: Install Rust Toolchain - uses: dtolnay/rust-toolchain@1.82.0 - - - name: Install Rust target - run: | - rustup target add aarch64-apple-ios-sim - - name: Run iOS Build run: | set -e -x source genai-macos-venv/bin/activate - export PATH=/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib:$PATH - export LIBRARY_PATH="$LIBRARY_PATH:/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib" # Needed for Rust compilation and linking python3 build.py --ios \ --parallel \ --apple_sysroot iphonesimulator \ --osx_arch arm64 \ 
--apple_deploy_target 15.4 \ --cmake_generator 'Xcode' \ - --build_dir build_iphonesimulator + --build_dir build_iphonesimulator \ + --use_guidance=false \ No newline at end of file diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index 92047539f..485ea5656 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -745,6 +745,7 @@ TEST(CAPITests, RewindGptFp32CAPI) { EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t))); } +#if USE_GUIDANCE TEST(CAPITests, SetGuidance) { #if TEST_PHI2 @@ -770,3 +771,4 @@ TEST(CAPITests, SetGuidance) { #endif } +#endif \ No newline at end of file From ce846c95ee97587c5e31ca91679d1e3f7f51a47d Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Thu, 19 Dec 2024 04:37:30 +0000 Subject: [PATCH 77/79] disable by default --- build.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/build.py b/build.py index 7fc83a75a..87c2c1a39 100644 --- a/build.py +++ b/build.py @@ -26,19 +26,6 @@ def _path_from_env_var(env_var: str): env_var_value = os.environ.get(env_var) return Path(env_var_value) if env_var_value is not None else None -def strtobool (val): - """Convert a string representation of truth to true (1) or false (0). - True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values - are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if - 'val' is anything else. - """ - val = str(val).lower() - if val in ('y', 'yes', 't', 'true', 'on', '1'): - return True - elif val in ('n', 'no', 'f', 'false', 'off', '0'): - return False - else: - raise ValueError("invalid truth value %r" % (val,)) def _parse_args(): class Parser(argparse.ArgumentParser): @@ -143,7 +130,7 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.") - parser.add_argument("--use_guidance", default=True, type=strtobool, help="Whether to add guidance support. Default is True.") + parser.add_argument("--use_guidance", action="store_true", help="Whether to add guidance support. Default is False.") # The following options are mutually exclusive (cross compiling options such as android, ios, etc.) 
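
A short aside on the pattern patches 76 and 77 converge on: guidance is now
strictly opt-in, gated at compile time by the USE_GUIDANCE define that
--use_guidance turns on, so a default build carries no llguidance (Rust)
dependency and the SetGuidance test above simply compiles away. A minimal,
hedged sketch of that gating (illustrative names, not the library's code):

#include <memory>

struct ProcessorStub {
  virtual ~ProcessorStub() = default;
};

std::unique_ptr<ProcessorStub> CreateProcessorStub() {
#if USE_GUIDANCE
  return std::make_unique<ProcessorStub>();  // guidance path compiled in
#else
  return nullptr;  // default build: feature compiled out, callers must null-check
#endif
}
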
platform_group = parser.add_mutually_exclusive_group() From c644b2621fedddf10f8233d194b22a739219bc1d Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Thu, 19 Dec 2024 05:41:18 +0000 Subject: [PATCH 78/79] remove azure ci code --- .github/workflows/ios-build.yml | 3 +-- .pipelines/stages/jobs/steps/capi-macos-step.yml | 12 ------------ .pipelines/stages/jobs/steps/capi-win-step.yml | 7 ------- 3 files changed, 1 insertion(+), 21 deletions(-) diff --git a/.github/workflows/ios-build.yml b/.github/workflows/ios-build.yml index a1fb9d9eb..71c51267b 100644 --- a/.github/workflows/ios-build.yml +++ b/.github/workflows/ios-build.yml @@ -38,5 +38,4 @@ jobs: --osx_arch arm64 \ --apple_deploy_target 15.4 \ --cmake_generator 'Xcode' \ - --build_dir build_iphonesimulator \ - --use_guidance=false \ No newline at end of file + --build_dir build_iphonesimulator \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/capi-macos-step.yml b/.pipelines/stages/jobs/steps/capi-macos-step.yml index e2275f4ef..e90b3bcaf 100644 --- a/.pipelines/stages/jobs/steps/capi-macos-step.yml +++ b/.pipelines/stages/jobs/steps/capi-macos-step.yml @@ -29,18 +29,6 @@ steps: echo "build_config=${{ parameters.build_config }}" displayName: 'Print Parameters' -- powershell: | - rustup toolchain install 1.82.0-x86_64-apple-darwin - rustup default 1.82.0-x86_64-apple-darwin - - if ("$(arch)" -eq "arm64") { - rustup target add --toolchain 1.82.0-x86_64-apple-darwin aarch64-apple-darwin - } - else { - rustup target add --toolchain 1.82.0-x86_64-apple-darwin x86_64-apple-darwin - } - displayName: 'Install Rust Toolchain' - - powershell: | $env:MACOSX_DEPLOYMENT_TARGET = "12.0" # Monterey cmake --preset macos_$(arch)_$(ep)_$(build_config) diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml index cb5ca72a5..168fa47c7 100644 --- a/.pipelines/stages/jobs/steps/capi-win-step.yml +++ b/.pipelines/stages/jobs/steps/capi-win-step.yml @@ -46,13 +46,6 @@ steps: condition: eq(variables['ep'], 'cuda') workingDirectory: '$(Build.Repository.LocalPath)' -- powershell: | - $exePath = "$env:TEMP\rustup-init.exe" - (New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath) - & $exePath -y --default-toolchain=1.82.0 - Write-Host "##vso[task.prependpath]$env:USERPROFILE\.cargo\bin" - displayName: 'Install Rust Toolchain' - - powershell: | cmake --preset windows_$(arch)_$(ep)_$(build_config) -T cuda='$(Build.Repository.LocalPath)\cuda_sdk\v$(cuda_version)' displayName: 'Configure CMake C API with CUDA' From 422af28e6b636baef22fd1650fdee28c41a0ff20 Mon Sep 17 00:00:00 2001 From: Ying Xiong Date: Thu, 19 Dec 2024 07:13:45 +0000 Subject: [PATCH 79/79] build and test with use_guidance --- .github/workflows/android-build.yml | 4 ++-- .github/workflows/linux-cpu-x64-build.yml | 4 ++-- .github/workflows/linux-cpu-x64-nightly-build.yml | 4 ++-- .github/workflows/mac-cpu-arm64-build.yml | 2 +- .github/workflows/win-cpu-arm64-build.yml | 4 ++-- .github/workflows/win-cpu-x64-build.yml | 4 ++-- .github/workflows/win-cuda-x64-build.yml | 4 ++-- .github/workflows/win-directml-x64-build.yml | 4 ++-- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/android-build.yml b/.github/workflows/android-build.yml index 580e7fa43..34b844359 100644 --- a/.github/workflows/android-build.yml +++ b/.github/workflows/android-build.yml @@ -83,12 +83,12 @@ jobs: run: | set -e -x rm -rf build - ./build.sh --android 
--android_api=27 --android_ndk_path=${ANDROID_NDK_LATEST_HOME} --config=RelWithDebInfo --android_abi=${{ env.ANDROID_ABI }} --parallel --build_java --update + ./build.sh --android --android_api=27 --android_ndk_path=${ANDROID_NDK_LATEST_HOME} --config=RelWithDebInfo --android_abi=${{ env.ANDROID_ABI }} --parallel --build_java --update --use_guidance - name: Run Android build run: | set -e -x - ./build.sh --android --android_api=27 --android_ndk_path=${ANDROID_NDK_LATEST_HOME} --config=RelWithDebInfo --android_abi=${{ env.ANDROID_ABI }} --parallel --build_java --build + ./build.sh --android --android_api=27 --android_ndk_path=${ANDROID_NDK_LATEST_HOME} --config=RelWithDebInfo --android_abi=${{ env.ANDROID_ABI }} --parallel --build_java --build --use_guidance - name: Enable KVM group perms so Android emulator can run run: | diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 2ac09fcbf..3e41ba070 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -77,8 +77,8 @@ jobs: run: | set -e -x rm -rf build - cmake --preset linux_gcc_cpu_release - cmake --build --preset linux_gcc_cpu_release + cmake --preset linux_gcc_cpu_release -DUSE_GUIDANCE=ON + cmake --build --preset linux_gcc_cpu_release -DUSE_GUIDANCE=ON - name: Install the python wheel and test dependencies run: | diff --git a/.github/workflows/linux-cpu-x64-nightly-build.yml b/.github/workflows/linux-cpu-x64-nightly-build.yml index 1b8bbb151..c78e8bc8d 100644 --- a/.github/workflows/linux-cpu-x64-nightly-build.yml +++ b/.github/workflows/linux-cpu-x64-nightly-build.yml @@ -46,8 +46,8 @@ jobs: run: | set -e -x rm -rf build - cmake --preset linux_gcc_cpu_release - cmake --build --preset linux_gcc_cpu_release + cmake --preset linux_gcc_cpu_release -DUSE_GUIDANCE=ON + cmake --build --preset linux_gcc_cpu_release -DUSE_GUIDANCE=ON - name: Install the python wheel and test dependencies run: | diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index 937a6ca13..86fdde374 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -61,7 +61,7 @@ jobs: - name: Build with CMake run: | - cmake --build --preset macos_arm64_cpu_release --parallel + cmake --build --preset macos_arm64_cpu_release --parallel -DUSE_GUIDANCE=ON continue-on-error: false - name: Install the python wheel and test dependencies diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index e44f6c79a..fca5599cd 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -75,11 +75,11 @@ jobs: run: | python -m pip install wheel requests - cmake --preset windows_arm64_cpu_release + cmake --preset windows_arm64_cpu_release -DUSE_GUIDANCE=ON - name: Build with CMake run: | - cmake --build --preset windows_arm64_cpu_release --parallel + cmake --build --preset windows_arm64_cpu_release --parallel -DUSE_GUIDANCE=ON - name: Install the Python Wheel and Test Dependencies run: | diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index bc13020d4..0a801237c 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -85,11 +85,11 @@ jobs: - name: Configure CMake run: | - cmake --preset windows_x64_cpu_release + cmake --preset windows_x64_cpu_release -DUSE_GUIDANCE=ON - name: Build with CMake run: | - cmake --build --preset 
windows_x64_cpu_release --parallel + cmake --build --preset windows_x64_cpu_release --parallel -DUSE_GUIDANCE=ON - name: Install the python wheel and test dependencies run: | diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index 5769b1710..acc2ec95a 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -70,11 +70,11 @@ jobs: - name: Configure CMake run: | - cmake --preset windows_x64_cuda_release -T cuda=${{ env.cuda_dir }}\\v${{ env.cuda_version }} + cmake --preset windows_x64_cuda_release -T cuda=${{ env.cuda_dir }}\\v${{ env.cuda_version }} -DUSE_GUIDANCE=ON - name: Build with CMake run: | - cmake --build --preset windows_x64_cuda_release --parallel + cmake --build --preset windows_x64_cuda_release --parallel -DUSE_GUIDANCE=ON - name: Add CUDA to PATH run: | diff --git a/.github/workflows/win-directml-x64-build.yml b/.github/workflows/win-directml-x64-build.yml index 04ac024b5..37b213e8b 100644 --- a/.github/workflows/win-directml-x64-build.yml +++ b/.github/workflows/win-directml-x64-build.yml @@ -87,11 +87,11 @@ jobs: - name: Configure CMake run: | - cmake --preset windows_x64_directml_release -DTEST_PHI2=False + cmake --preset windows_x64_directml_release -DTEST_PHI2=False -DUSE_GUIDANCE=ON - name: Build with CMake run: | - cmake --build --preset windows_x64_directml_release --parallel + cmake --build --preset windows_x64_directml_release --parallel -DUSE_GUIDANCE=ON - name: Install the Python Wheel and Test Dependencies run: |
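
To close out the series: with a guidance-capable build in place (--use_guidance
at build time, USE_GUIDANCE=ON in CMake), generation can be constrained end to
end. The hedged sketch below uses the C++ wrapper types from ort_genai.h as
exercised by the SetGuidance test; treat the model path, the schema, and the
elided prompt handling as placeholders rather than a verified recipe:

#include "ort_genai.h"

void GenerateConstrainedJson() {
  auto model = OgaModel::Create("phi-2");  // placeholder model directory
  auto params = OgaGeneratorParams::Create(*model);
  // Constrain decoding so the output must conform to this JSON schema.
  const char* schema = R"({"type": "object", "properties": {"answer": {"type": "string"}}})";
  params->SetGuidance("json_schema", schema);
  auto generator = OgaGenerator::Create(*model, *params);
  // ... append the prompt's tokens, then run the usual generation loop;
  // each step's logits are masked so only schema-legal tokens can be sampled.
}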