Support constrained decoding #1038

Open
wants to merge 86 commits into base: main
Changes from 78 commits (86 commits total)
bf38535
add llguidance based logits processor
Taka152 Oct 31, 2024
c151d52
add unit test
Taka152 Oct 31, 2024
9d5a8a0
constrained decoding fixes (#1023)
mmoskal Nov 1, 2024
48c3e96
add test grammars
Taka152 Nov 1, 2024
d70b849
support cuda
Taka152 Nov 1, 2024
6b90c1c
use tokenize.json to generate token_bytes
Taka152 Nov 4, 2024
bdb9ca4
fix win build
Taka152 Nov 5, 2024
a25de8e
async compute mask
Taka152 Nov 6, 2024
edc0bae
add llguidance build in cmake
Taka152 Nov 6, 2024
09861d7
update windows build
Taka152 Nov 6, 2024
ee94df8
clean cmake
Taka152 Nov 6, 2024
4d077cf
add install rust to GHA
Taka152 Nov 6, 2024
15b20b8
test action
Taka152 Nov 6, 2024
6029510
test win cpu build action
Taka152 Nov 6, 2024
4d8d8a6
update win build action
Taka152 Nov 6, 2024
346f88c
update win build action
Taka152 Nov 6, 2024
c00d8fa
update win build action
Taka152 Nov 6, 2024
39fb7ed
update win build action
Taka152 Nov 6, 2024
8038723
update win build action
Taka152 Nov 6, 2024
324f550
update win build action
Taka152 Nov 6, 2024
8722727
update win build action
Taka152 Nov 6, 2024
d620422
add rust install to workflows
Taka152 Nov 7, 2024
c1ede01
support batch infer
Taka152 Nov 7, 2024
d2f47e2
add corrosion to deps.txt
Taka152 Nov 8, 2024
8deba60
Merge branch 'main' into yingxiong/constrained_decoding
Taka152 Nov 8, 2024
b256e6d
fix merge
Taka152 Nov 8, 2024
e5d6dad
fix bugs
Taka152 Nov 8, 2024
2fd52d2
update linux gpu workflow
Taka152 Nov 8, 2024
a11684b
update linux gpu workfow
Taka152 Nov 8, 2024
56663c0
update linux gpu workflow
Taka152 Nov 8, 2024
8997064
update workflow
Taka152 Nov 8, 2024
ddda727
update workflow
Taka152 Nov 8, 2024
cb55778
update workflows
Taka152 Nov 8, 2024
4cf5b5f
add shared lib of llguidance
Taka152 Nov 15, 2024
18c2f6c
add disable_guidance option
Taka152 Nov 15, 2024
65340f2
fix format
Taka152 Nov 15, 2024
eca06f5
fix win error
Taka152 Nov 15, 2024
b306428
fix segfault
Taka152 Nov 18, 2024
df34b1e
fix segfault and move test
Taka152 Nov 18, 2024
2a5efe1
Merge remote-tracking branch 'origin/main' into yingxiong/constrained…
Taka152 Nov 18, 2024
92251d4
minor fixes
Taka152 Nov 18, 2024
56ab9ee
fix bug when is_stop
Taka152 Nov 20, 2024
03a6bb7
fixes for reviews
Taka152 Nov 20, 2024
29fc868
fix
Taka152 Nov 20, 2024
3b046c3
fix win error
Taka152 Nov 20, 2024
e9c818e
add rust env to dockerfile
Taka152 Nov 20, 2024
e06fb0a
fix dockerfile env
Taka152 Nov 20, 2024
ef141f2
update workflows
Taka152 Nov 20, 2024
13056b6
Update Rust environment in Dockerfiles
Taka152 Nov 20, 2024
22c7c37
Update Rust environment permissions in Dockerfiles
Taka152 Nov 20, 2024
a1186a5
Update Rust installation in Dockerfiles
Taka152 Nov 20, 2024
2c9b02c
revert linux arm workflow
Taka152 Nov 20, 2024
899edf9
Update Rust installation with specific version
Taka152 Nov 22, 2024
2d47c20
fix android error
Taka152 Nov 22, 2024
9a15385
fix for review
Taka152 Nov 25, 2024
ec09868
Merge remote-tracking branch 'origin/main' into yingxiong/constrained…
Taka152 Nov 27, 2024
ff94fe1
fix SetGuidance unit test
Taka152 Dec 6, 2024
88f8ef4
Merge remote-tracking branch 'origin/main' into yingxiong/constrained…
Taka152 Dec 6, 2024
4ca4075
fix format
Taka152 Dec 6, 2024
9849f65
fix to new continuous decoding api
Taka152 Dec 11, 2024
13e5100
remove comments
Taka152 Dec 12, 2024
aea5323
Merge remote-tracking branch 'origin/main' into yingxiong/constrained…
Taka152 Dec 12, 2024
9b5a6ce
fix
Taka152 Dec 12, 2024
a9390e3
fix segfault
Taka152 Dec 12, 2024
fc4b7e9
fix win build
Taka152 Dec 13, 2024
a0710d5
fix win error
Taka152 Dec 13, 2024
7d4d6bb
fix win error
Taka152 Dec 16, 2024
5d175a7
add comments
Taka152 Dec 16, 2024
52ccc8b
fix format
Taka152 Dec 16, 2024
161fcfc
fix bug
Taka152 Dec 17, 2024
ca6d86c
Merge branch 'main' into yingxiong/constrained_decoding
Taka152 Dec 17, 2024
8e03735
suuport build in ios GHA
Taka152 Dec 17, 2024
db3062b
update win azure ci
Taka152 Dec 17, 2024
0cdb4ac
update linux ci
Taka152 Dec 17, 2024
991a012
fix win ci
Taka152 Dec 17, 2024
4fdbea6
fix win ci
Taka152 Dec 17, 2024
e99ae65
fix macos arm
Taka152 Dec 17, 2024
ed96504
fix macos azure ci
Taka152 Dec 17, 2024
4cb9e55
fix for review
Taka152 Dec 18, 2024
e75166a
fix
Taka152 Dec 18, 2024
82d33e5
fix ios ci
Taka152 Dec 18, 2024
3498e0e
disable on ios ci
Taka152 Dec 18, 2024
2dbc6ee
Merge remote-tracking branch 'origin/main' into yingxiong/constrained…
Taka152 Dec 19, 2024
ce846c9
disable by default
Taka152 Dec 19, 2024
c644b26
remove azure ci code
Taka152 Dec 19, 2024
422af28
build and test with use_guidance
Taka152 Dec 19, 2024
7 changes: 7 additions & 0 deletions .github/workflows/android-build.yml
@@ -82,6 +82,13 @@ jobs:
unzip microsoft.ml.onnxruntime/${{ env.ORT_NIGHTLY_VERSION }}/runtimes/android/native/onnxruntime.aar -d ort
ls -lR ort

- name: Install Rust Toolchain
uses: dtolnay/[email protected]

- name: Install Rust Android Toolchain
run: |
rustup target add --toolchain 1.82.0-x86_64-unknown-linux-gnu x86_64-linux-android

- name: Create Android build
run: |
set -e -x
11 changes: 7 additions & 4 deletions .github/workflows/ios-build.yml
@@ -3,8 +3,8 @@ on:
workflow_dispatch:
push:
branches:
- main
- rel-*
- main
- rel-*
pull_request:
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -20,14 +20,17 @@ jobs:

- uses: actions/setup-python@v5
with:
python-version: '3.12.x'
python-version: "3.12.x"

- name: Install the python wheel and dependencies
run: |
python3 -m venv genai-macos-venv
source genai-macos-venv/bin/activate
python3 -m pip install requests

- name: Install Rust Toolchain
uses: dtolnay/[email protected]

- name: Run iOS Build
run: |
set -e -x
@@ -38,4 +41,4 @@
--osx_arch arm64 \
--apple_deploy_target 15.4 \
--cmake_generator 'Xcode' \
--build_dir build_iphonesimulator
--build_dir build_iphonesimulator
3 changes: 3 additions & 0 deletions .github/workflows/linux-cpu-x64-build.yml
@@ -39,6 +39,9 @@ jobs:
with:
gradle-version: '8.6'

- name: Install Rust Toolchain
uses: dtolnay/[email protected]

- name: Get the Latest OnnxRuntime Nightly Version
shell: pwsh
run: |
3 changes: 2 additions & 1 deletion .github/workflows/linux-cpu-x64-nightly-build.yml
@@ -22,7 +22,8 @@ jobs:
- name: Checkout OnnxRuntime GenAI repo
uses: actions/checkout@v2


- name: Install Rust Toolchain
uses: dtolnay/[email protected]

- name: Download OnnxRuntime
run: |
3 changes: 3 additions & 0 deletions .github/workflows/mac-cpu-arm64-build.yml
@@ -52,6 +52,9 @@ jobs:
mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/
mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/osx-arm64/native/* ort/lib/

- name: Install Rust Toolchain
uses: dtolnay/[email protected]

- name: Configure CMake
run: |
cmake --preset macos_arm64_cpu_release
7 changes: 7 additions & 0 deletions .github/workflows/win-cpu-arm64-build.yml
@@ -64,6 +64,13 @@ jobs:
move ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/
move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-arm64/native/* ort/lib/

- name: Install Rust Toolchain
run: |
$exePath = "$env:TEMP\rustup-init.exe"
(New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath)
& $exePath -y --default-toolchain=1.82.0
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"

- name: Configure CMake
run: |
python -m pip install wheel requests
7 changes: 7 additions & 0 deletions .github/workflows/win-cpu-x64-build.yml
@@ -53,6 +53,13 @@ jobs:
with:
gradle-version: '8.6'

- name: Install Rust Toolchain
run: |
$exePath = "$env:TEMP\rustup-init.exe"
(New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath)
& $exePath -y --default-toolchain=1.82.0
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"

- name: Download OnnxRuntime Nightly
shell: pwsh
run: |
9 changes: 8 additions & 1 deletion .github/workflows/win-cuda-x64-build.yml
@@ -59,7 +59,14 @@ jobs:
run: |
mkdir ort/lib
move ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/
move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/
move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/

- name: Install Rust Toolchain
run: |
$exePath = "$env:TEMP\rustup-init.exe"
(New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath)
& $exePath -y --default-toolchain=1.82.0
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"

- name: Configure CMake
run: |
7 changes: 7 additions & 0 deletions .github/workflows/win-directml-x64-build.yml
@@ -78,6 +78,13 @@ jobs:
mv $env:d3d12_dir\build\native\bin\x64\D3D12Core.dll ort\lib
mv $env:dml_dir\include\DirectML.h ort\include

- name: Install Rust Toolchain
run: |
$exePath = "$env:TEMP\rustup-init.exe"
(New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath)
& $exePath -y --default-toolchain=1.82.0
Add-Content $env:GITHUB_PATH "$env:USERPROFILE\.cargo\bin"

- name: Configure CMake
run: |
cmake --preset windows_x64_directml_release -DTEST_PHI2=False
1 change: 1 addition & 0 deletions .gitignore
@@ -30,6 +30,7 @@ examples/csharp/HelloPhi/models

!test/test_models/hf-internal-testing/
!test/test_models/hf-internal-testing/tiny-random-gpt2*/*.onnx
!test/test_models/grammars/

.ipynb_checkpoints/
/src/java/.gradle
12 changes: 12 additions & 0 deletions .pipelines/stages/jobs/steps/capi-macos-step.yml
@@ -29,6 +29,18 @@ steps:
echo "build_config=${{ parameters.build_config }}"
displayName: 'Print Parameters'

- powershell: |
rustup toolchain install 1.82.0-x86_64-apple-darwin
rustup default 1.82.0-x86_64-apple-darwin

if ("$(arch)" -eq "arm64") {
rustup target add --toolchain 1.82.0-x86_64-apple-darwin aarch64-apple-darwin
}
else {
rustup target add --toolchain 1.82.0-x86_64-apple-darwin x86_64-apple-darwin
}
displayName: 'Install Rust Toolchain'

- powershell: |
$env:MACOSX_DEPLOYMENT_TARGET = "12.0" # Monterey
cmake --preset macos_$(arch)_$(ep)_$(build_config)
7 changes: 7 additions & 0 deletions .pipelines/stages/jobs/steps/capi-win-step.yml
@@ -46,6 +46,13 @@ steps:
condition: eq(variables['ep'], 'cuda')
workingDirectory: '$(Build.Repository.LocalPath)'

- powershell: |
$exePath = "$env:TEMP\rustup-init.exe"
(New-Object Net.WebClient).DownloadFile('https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe', $exePath)
& $exePath -y --default-toolchain=1.82.0
Write-Host "##vso[task.prependpath]$env:USERPROFILE\.cargo\bin"
displayName: 'Install Rust Toolchain'

- powershell: |
cmake --preset windows_$(arch)_$(ep)_$(build_config) -T cuda='$(Build.Repository.LocalPath)\cuda_sdk\v$(cuda_version)'
displayName: 'Configure CMake C API with CUDA'
Expand Down
20 changes: 20 additions & 0 deletions CMakeLists.txt
@@ -47,6 +47,13 @@ include(cmake/check_webgpu.cmake)
include(cmake/cxx_standard.cmake)

add_compile_definitions(BUILDING_ORT_GENAI_C)

if(USE_GUIDANCE)
add_compile_definitions(USE_GUIDANCE=1)
else()
add_compile_definitions(USE_GUIDANCE=0)
endif()

if(MSVC)
# set updated value for __cplusplus macro instead of 199711L
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/Zc:__cplusplus>)
@@ -139,6 +146,19 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER)
endif()
endif()


if(USE_GUIDANCE)
target_include_directories(onnxruntime-genai PUBLIC ${llguidance_SOURCE_DIR}/parser/)
target_include_directories(onnxruntime-genai-static PUBLIC ${llguidance_SOURCE_DIR}/parser/)
target_link_libraries(onnxruntime-genai PRIVATE llguidance_parser)
target_link_libraries(onnxruntime-genai-static PUBLIC llguidance_parser)
if (WIN32)
# bcrypt is needed for the rust std lib
target_link_libraries(onnxruntime-genai PRIVATE bcrypt)
target_link_libraries(onnxruntime-genai-static PRIVATE bcrypt)
endif()
endif()

if(CMAKE_GENERATOR_TOOLSET MATCHES "Visual Studio")
target_link_options(onnxruntime-genai PRIVATE "/CETCOMPAT")
target_compile_options(onnxruntime-genai PRIVATE "/sdl")
16 changes: 16 additions & 0 deletions build.py
@@ -26,6 +26,19 @@ def _path_from_env_var(env_var: str):
env_var_value = os.environ.get(env_var)
return Path(env_var_value) if env_var_value is not None else None

def strtobool (val):
"""Convert a string representation of truth to true (1) or false (0).
True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if
'val' is anything else.
"""
val = str(val).lower()
if val in ('y', 'yes', 't', 'true', 'on', '1'):
return True
elif val in ('n', 'no', 'f', 'false', 'off', '0'):
return False
else:
raise ValueError("invalid truth value %r" % (val,))
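The strtobool helper added here appears to mirror the old distutils.util.strtobool API (distutils was removed in Python 3.12, which would explain build.py carrying its own copy). A quick standalone check of the accepted spellings, with the helper reproduced verbatim:

```python
def strtobool(val):
    """Copy of the build.py helper, reproduced for a standalone check."""
    val = str(val).lower()
    if val in ('y', 'yes', 't', 'true', 'on', '1'):
        return True
    elif val in ('n', 'no', 'f', 'false', 'off', '0'):
        return False
    raise ValueError("invalid truth value %r" % (val,))

# Matching is case-insensitive, so the flag accepts e.g.
# --use_guidance OFF or --use_guidance 0.
assert strtobool("ON") is True
assert strtobool("0") is False
```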

def _parse_args():
class Parser(argparse.ArgumentParser):
@@ -130,6 +143,8 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript

parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.")

parser.add_argument("--use_guidance", default=True, type=strtobool, help="Whether to add guidance support. Default is True.")

# The following options are mutually exclusive (cross compiling options such as android, ios, etc.)
platform_group = parser.add_mutually_exclusive_group()
platform_group.add_argument("--android", action="store_true", help="Build for Android")
@@ -477,6 +492,7 @@ def update(args: argparse.Namespace, env: dict[str, str]):
f"-DUSE_DML={'ON' if args.use_dml else 'OFF'}",
f"-DENABLE_JAVA={'ON' if args.build_java else 'OFF'}",
f"-DBUILD_WHEEL={build_wheel}",
f"-DUSE_GUIDANCE={'ON' if args.use_guidance else 'OFF'}",
]

if args.ort_home:
2 changes: 2 additions & 0 deletions cmake/deps.txt
@@ -15,3 +15,5 @@ googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583e
microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;2c3e936cfc3401ba7ebb79d02b9e52a50439ffc3
llguidance;https://github.com/microsoft/llguidance.git;4dc358feef3cdf0542a5f95b5f4e92761887a25d
corrosion;https://github.com/corrosion-rs/corrosion.git;64289b1d79d6d19cd2e241db515381a086bb8407
16 changes: 16 additions & 0 deletions cmake/external/onnxruntime_external_deps.cmake
@@ -93,3 +93,19 @@ list(APPEND EXTERNAL_LIBRARIES
ocos_operators
noexcep_operators
)

if(USE_GUIDANCE)
FetchContent_Declare(
Corrosion
GIT_REPOSITORY ${DEP_URL_corrosion}
GIT_TAG ${DEP_SHA1_corrosion}
)
onnxruntime_fetchcontent_makeavailable(Corrosion)
FetchContent_Declare(
llguidance
GIT_REPOSITORY ${DEP_URL_llguidance}
GIT_TAG ${DEP_SHA1_llguidance}
)
onnxruntime_fetchcontent_makeavailable(llguidance)
corrosion_import_crate(MANIFEST_PATH ${llguidance_SOURCE_DIR}/parser/Cargo.toml)
endif()
1 change: 1 addition & 0 deletions cmake/options.cmake
@@ -5,6 +5,7 @@ option(USE_CUDA "Build with CUDA support" ON)
option(USE_ROCM "Build with ROCm support" ON)
option(USE_DML "Build with DML support" OFF)
option(USE_WEBGPU "Build with WEBGPU support" ON)
option(USE_GUIDANCE "Build with guidance support" ON)

# bindings
option(ENABLE_JAVA "Build the Java API." OFF)
4 changes: 4 additions & 0 deletions src/cuda/interface.cpp
@@ -160,6 +160,10 @@ struct CudaInterfaceImpl : CudaInterface {
cuda::LaunchHandleEOSArray(batch_logits, batch_beam_size, vocab_size, eos_token_ids, eos_token_ids_count, stream);
}

void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream) override {
cuda::LaunchAddLogitsMask(batch_logits, batch_beam_size, vocab_size, logits_mask, stream);
}

void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, int beam_width, int input_seq_length, int max_seq_length, int current_length, cudaStream_t stream) override {
cuda::UpdateCacheIndirectionKernelLauncher(tgt_indir_cache, src_indir_cache, beam_ids, batch_size, beam_width, input_seq_length, max_seq_length, current_length, stream);
}
1 change: 1 addition & 0 deletions src/cuda/interface.h
@@ -36,6 +36,7 @@ struct CudaInterface : DeviceInterface {
virtual void Launch_UpdateAttentionMask(int32_t* mask_data, const int32_t* old_data, int batch_beam_size, int new_kv_length, int total_length, int max_length, bool update_only, cudaStream_t stream) = 0;
virtual void Launch_UpdateAttentionMask(int64_t* mask_data, const int64_t* old_data, int batch_beam_size, int new_kv_length, int total_length, int max_length, bool update_only, cudaStream_t stream) = 0;
virtual void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_size, const int32_t* eos_token_ids, int eos_token_ids_count, cudaStream_t stream) = 0;
virtual void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream) = 0;
virtual void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, int beam_width, int input_seq_length, int max_seq_length, int current_length, cudaStream_t stream) = 0;
virtual void ReorderPastStatesKernelLauncher(void* out_buffer, const void* in_buffer, int batch_size, int num_heads, int max_length, int head_size, int chunk_size, cudaStream_t stream) = 0;
virtual void LaunchCopyCrossQKSingleDecodeStep(cudaStream_t stream, float* cross_qk_buffer_data, float** qk_layer_pointers, int token_index, int batch_beam_size, int num_layers, int num_heads, int num_alignment_heads, const int* alignment_heads, int frames, int max_length) = 0;
16 changes: 16 additions & 0 deletions src/cuda/model_kernels.cu
@@ -100,6 +100,22 @@ void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_si
HandleEOSArray<<<(batch_beam_size + 255) / 256, 256, 0, stream>>>(batch_logits, batch_beam_size, vocab_size, eos_token_ids, eos_token_ids_count);
}

__global__ void AddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= batch_beam_size * vocab_size)
return;
int batch_index = index / vocab_size;
int vocab_index = index % vocab_size;
if (!(logits_mask[(batch_index * vocab_size + vocab_index) / 32] & (1 << (vocab_index % 32))))
batch_logits[index] = std::numeric_limits<float>::lowest();
}

void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream) {
int block_size = 256;
int num_blocks = (batch_beam_size * vocab_size + block_size - 1) / block_size;
AddLogitsMask<<<num_blocks, block_size, 0, stream>>>(batch_logits, batch_beam_size, vocab_size, logits_mask);
}
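The AddLogitsMask kernel above reads one allow-bit per vocabulary entry from a packed uint32 mask and floors disallowed logits so they can never be sampled. A CPU reference in Python (an illustrative sketch, not shipped code) makes the bit layout explicit:

```python
def add_logits_mask(batch_logits, batch_beam_size, vocab_size, logits_mask):
    """CPU reference for the AddLogitsMask CUDA kernel: logits_mask packs one
    allow-bit per vocab entry into 32-bit words; entries whose bit is clear
    are set to float's lowest value."""
    FLOAT_LOWEST = -3.4028234663852886e+38  # std::numeric_limits<float>::lowest()
    for index in range(batch_beam_size * vocab_size):
        batch_index, vocab_index = divmod(index, vocab_size)
        # Word index and bit position match the kernel's addressing.
        word = logits_mask[(batch_index * vocab_size + vocab_index) // 32]
        if not ((word >> (vocab_index % 32)) & 1):
            batch_logits[index] = FLOAT_LOWEST
    return batch_logits
```

For example, with vocab_size 4 and mask word 0b0101, only tokens 0 and 2 keep their logits; tokens 1 and 3 are floored.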

__global__ void ConvertFp16ToFp32(const half* src, float* dst, int count) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < count)
20 changes: 19 additions & 1 deletion src/generators.cpp
@@ -5,6 +5,7 @@
#include "sequences.h"
#include "models/model.h"
#include "models/decoder_only.h"
#include "logits_processor.h"
#include "search.h"
#include "cpu/interface.h"
#include "cuda/interface.h"
@@ -151,6 +152,7 @@ void Launch_UpdateAttentionMask<int32_t>(int32_t* mask_data, const int32_t* old_
template <>
void Launch_UpdateAttentionMask<int64_t>(int64_t* mask_data, const int64_t* old_data, int batch_beam_size, int new_kv_length, int total_length, int max_length, bool update_only, cudaStream_t stream) { GetCudaInterface()->Launch_UpdateAttentionMask(mask_data, old_data, batch_beam_size, new_kv_length, total_length, max_length, update_only, stream); }
void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_size, const int32_t* eos_token_ids, int eos_token_ids_count, cudaStream_t stream) { GetCudaInterface()->LaunchHandleEOSArray(batch_logits, batch_beam_size, vocab_size, eos_token_ids, eos_token_ids_count, stream); }
void LaunchAddLogitsMask(float* batch_logits, int batch_beam_size, int vocab_size, const uint32_t* logits_mask, cudaStream_t stream) { GetCudaInterface()->LaunchAddLogitsMask(batch_logits, batch_beam_size, vocab_size, logits_mask, stream); }
void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, int beam_width, int input_seq_length, int max_seq_length, int current_length, cudaStream_t stream) { GetCudaInterface()->UpdateCacheIndirectionKernelLauncher(tgt_indir_cache, src_indir_cache, beam_ids, batch_size, beam_width, input_seq_length, max_seq_length, current_length, stream); }
void ReorderPastStatesKernelLauncher(void* out_buffer, const void* in_buffer, int batch_size, int num_heads, int max_length, int head_size, int chunk_size, cudaStream_t stream) { GetCudaInterface()->ReorderPastStatesKernelLauncher(out_buffer, in_buffer, batch_size, num_heads, max_length, head_size, chunk_size, stream); }
template <>
@@ -237,6 +239,11 @@ void GeneratorParams::SetInputs(const NamedTensors& named_tensors) {
}
}

void GeneratorParams::SetGuidance(std::string_view type, std::string_view data) {
guidance_type = type;
guidance_data = data;
}
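The new SetGuidance(type, data) entry point takes a guidance type and its payload (for example a JSON schema or grammar string). A hedged sketch of how this might look from the Python side — the binding and package names below are assumptions, only GeneratorParams::SetGuidance itself is confirmed by the diff:

```python
import json

# Build a JSON schema payload for constrained decoding.
schema = json.dumps({
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
})

# Hypothetical usage (names assumed, not the confirmed Python API):
# import onnxruntime_genai as og
# model = og.Model("path/to/model")
# params = og.GeneratorParams(model)
# params.set_guidance("json_schema", schema)  # mirrors SetGuidance(type, data)
# generator = og.Generator(model, params)
print(schema)
```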

std::unique_ptr<Generator> CreateGenerator(const Model& model, const GeneratorParams& params) {
return std::make_unique<Generator>(model, params);
}
@@ -260,6 +267,7 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_
search_ = CreateSearch(params);
state_ = model.CreateState(search_->GetSequenceLengths(), params); // Search sequence lengths set when creating state

logits_processor_ = CreateLogitsProcessor(*state_); // Could be nullptr if no logits processor is used
// Temporary solution for multimodal and whisper models
if (!params.aux_input_ids.empty() && params.aux_input_ids.data() != nullptr) {
AppendTokens(params.aux_input_ids);
@@ -296,7 +304,10 @@ void Generator::AppendTokens(const cpu_span<int32_t> input_ids) {
void Generator::ComputeLogits(DeviceSpan<int32_t> next_tokens) {
if (computed_logits_)
throw std::runtime_error("ComputeLogits called again without calling AppendTokens or GenerateNextToken first");

if (last_action_ == Action::generated && logits_processor_) {
auto next_tokens_span = next_tokens.CopyDeviceToCpu();
logits_processor_->CommitTokens(next_tokens_span);
}
auto logits = state_->Run(search_->GetSequenceLength(), next_tokens, search_->GetNextIndices());
if (g_log.enabled && g_log.model_logits) {
auto& stream = Log("model_logits");
@@ -358,6 +369,10 @@ void Generator::GenerateNextToken() {
search_->AppendTokens(next_tokens);
ComputeLogits(next_tokens);
}
if (logits_processor_) {
auto logits = GetLogits();
logits_processor_->ProcessLogits(logits);
}
computed_logits_ = false;
auto& search = search_->params_->search;
search_->ApplyMinLength(search.min_length);
@@ -411,6 +426,9 @@ void Generator::RewindToLength(size_t new_length) {
throw std::runtime_error("RewindToLength must be called with new_length=0 when batch_size > 1");
search_->RewindTo(new_length);
state_->RewindTo(new_length);
if (logits_processor_) {
logits_processor_->Reset();
}
computed_logits_ = false;
last_action_ = Action::rewound;
}
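The control flow this diff adds to Generator — CommitTokens after a token is accepted, ProcessLogits before sampling, Reset on rewind — can be sketched as a minimal Python loop. Names here are hypothetical stand-ins for the C++ members, not the real API:

```python
class ConstrainedLoop:
    """Sketch of the generate loop with a logits processor in the path:
    each step masks the logits, samples, then commits the token so the
    grammar state advances before the next step."""
    def __init__(self, processor, model_step, sampler):
        self.processor = processor    # e.g. an llguidance-backed matcher
        self.model_step = model_step  # tokens -> logits
        self.sampler = sampler        # logits -> token id

    def generate(self, prompt_tokens, max_new):
        tokens = list(prompt_tokens)
        out = []
        for _ in range(max_new):
            logits = self.model_step(tokens)
            logits = self.processor.process_logits(logits)  # mask disallowed ids
            tok = self.sampler(logits)
            self.processor.commit_tokens([tok])  # advance the grammar state
            tokens.append(tok)
            out.append(tok)
        return out
```

A toy processor that only ever allows token 1 would force the loop to emit token 1 on every step, which is exactly the guarantee constrained decoding provides.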