allow updating of doc mapper #4928

Closed · wants to merge 4 commits
2 changes: 2 additions & 0 deletions quickwit/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions quickwit/quickwit-config/Cargo.toml
@@ -31,6 +31,7 @@ serde_with = { workspace = true }
serde_yaml = { workspace = true }
toml = { workspace = true }
tracing = { workspace = true }
ulid = { workspace = true }
utoipa = { workspace = true }
vrl = { workspace = true, optional = true }

7 changes: 7 additions & 0 deletions quickwit/quickwit-config/src/index_config/mod.rs
@@ -39,6 +39,7 @@ use quickwit_proto::types::IndexId;
use serde::{Deserialize, Serialize};
pub use serialize::load_index_config_from_user_config;
use tracing::warn;
use ulid::Ulid;

use crate::index_config::serialize::VersionedIndexConfig;
use crate::merge_policy_config::{MergePolicyConfig, StableLogMergePolicyConfig};
@@ -93,6 +94,10 @@ pub struct DocMapping {
/// Record document length
#[serde(default)]
pub document_length: bool,
/// Version of the doc mapper
#[serde(default)]
#[schema(value_type = String)]
pub version: Ulid,
}

#[derive(Clone, Debug, Serialize, Deserialize, utoipa::ToSchema)]
@@ -458,6 +463,7 @@ impl TestableForRegression for IndexConfig {
timestamp_field: Some("timestamp".to_string()),
tokenizers: vec![tokenizer],
document_length: false,
version: Ulid::nil(),
};
let retention_policy = Some(RetentionPolicy {
retention_period: "90 days".to_string(),
@@ -536,6 +542,7 @@ pub fn build_doc_mapper(
max_num_partitions: doc_mapping.max_num_partitions,
tokenizers: doc_mapping.tokenizers.clone(),
document_length: doc_mapping.document_length,
version: doc_mapping.version,
};
Ok(Arc::new(builder.try_build()?))
}
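The new `version` field is marked `#[serde(default)]`, so index configs written before this change still deserialize: the fallback relies on `Ulid::default()` being the nil ULID, the same value the regression fixture above pins with `Ulid::nil()`. A minimal sketch of that fallback, assuming the `ulid` crate's `serde` feature (`DocMappingSketch` is an illustrative stand-in, not the real `DocMapping`):

use serde::Deserialize;
use ulid::Ulid;

// Illustrative stand-in for the real `DocMapping` struct.
#[derive(Debug, Deserialize)]
struct DocMappingSketch {
    #[serde(default)]
    version: Ulid,
}

fn main() {
    // A config written before this PR carries no `version` field and
    // falls back to `Ulid::default()`, i.e. the nil ULID.
    let old: DocMappingSketch = serde_json::from_str("{}").unwrap();
    assert_eq!(old.version, Ulid::nil());

    // A newer config can carry an explicit version (any valid ULID string).
    let new: DocMappingSketch =
        serde_json::from_str(r#"{"version": "01ARZ3NDEKTSV4RRFFQ69G5FAV"}"#).unwrap();
    assert_ne!(new.version, Ulid::nil());
}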
1 change: 1 addition & 0 deletions quickwit/quickwit-doc-mapper/Cargo.toml
@@ -28,6 +28,7 @@ tantivy = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
typetag = { workspace = true }
ulid = { workspace = true }
utoipa = { workspace = true }

quickwit-common = { workspace = true }
@@ -33,6 +33,7 @@ use tantivy::schema::{
Field, FieldType, FieldValue, OwnedValue as TantivyValue, Schema, INDEXED, STORED,
};
use tantivy::TantivyDocument as Document;
use ulid::Ulid;

use super::field_mapping_entry::RAW_TOKENIZER_NAME;
use super::DefaultDocMapperBuilder;
@@ -82,6 +83,8 @@ pub struct DefaultDocMapper {
concatenate_dynamic_fields: Vec<Field>,
/// Schema generated by the store source and field mappings parameters.
schema: Schema,
/// Version of the doc mapper
version: Ulid,
/// List of field names used for tagging.
tag_field_names: BTreeSet<String>,
/// The partition key is a DSL used to route documents
@@ -186,6 +189,7 @@ impl TryFrom<DefaultDocMapperBuilder> for DefaultDocMapper {
};

let schema = schema_builder.build();
let version = builder.version;

let tokenizer_manager = create_default_quickwit_tokenizer_manager();
let mut custom_tokenizer_names = HashSet::new();
@@ -267,6 +271,7 @@ impl TryFrom<DefaultDocMapperBuilder> for DefaultDocMapper {
let required_fields = Vec::new();
Ok(DefaultDocMapper {
schema,
version,
index_field_presence: builder.index_field_presence,
source_field,
dynamic_field,
@@ -388,6 +393,7 @@ impl From<DefaultDocMapper> for DefaultDocMapperBuilder {
max_num_partitions: default_doc_mapper.max_num_partitions,
tokenizers: default_doc_mapper.tokenizer_entries,
document_length: false,
version: default_doc_mapper.version,
}
}
}
@@ -712,6 +718,10 @@ impl DocMapper for DefaultDocMapper {
self.schema.clone()
}

fn version(&self) -> Ulid {
self.version
}

fn timestamp_field_name(&self) -> Option<&str> {
self.timestamp_field_name.as_deref()
}
@@ -20,6 +20,7 @@
use std::num::NonZeroU32;

use serde::{Deserialize, Serialize};
use ulid::Ulid;

use super::tokenizer_entry::TokenizerEntry;
use super::FieldMappingEntry;
@@ -30,7 +31,7 @@ use crate::DefaultDocMapper;
/// to create a valid DocMapper.
///
/// It is also used to serialize/deserialize a DocMapper.
- /// note that this is not the way is the DocMapping is deserialized
/// note that this is not the way the DocMapping is deserialized
/// from the configuration.
#[quickwit_macros::serde_multikey]
#[derive(Serialize, Deserialize, Clone)]
@@ -83,6 +84,9 @@ pub struct DefaultDocMapperBuilder {
/// Record document length
#[serde(default)]
pub document_length: bool,
/// Version of the doc mapper
#[serde(default)]
pub version: Ulid,
}

/// Defines how an unmapped field should be handled.
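Because the `From<DefaultDocMapper> for DefaultDocMapperBuilder` impl earlier in this diff copies `version` back into the builder, a mapper that is serialized through its builder and rebuilt keeps its version. A toy round-trip sketch of that property (the `Mapper`/`Builder` types here are illustrative, not the real ones):

use ulid::Ulid;

#[derive(Clone)]
struct Mapper {
    version: Ulid,
}

struct Builder {
    version: Ulid,
}

// Mirrors `From<DefaultDocMapper> for DefaultDocMapperBuilder`: the
// version is carried back into the builder.
impl From<Mapper> for Builder {
    fn from(mapper: Mapper) -> Builder {
        Builder { version: mapper.version }
    }
}

impl Builder {
    // Mirrors `try_build` on the real builder, minus all other fields.
    fn try_build(self) -> anyhow::Result<Mapper> {
        Ok(Mapper { version: self.version })
    }
}

fn main() -> anyhow::Result<()> {
    let original = Mapper { version: Ulid::new() };
    let rebuilt = Builder::from(original.clone()).try_build()?;
    assert_eq!(original.version, rebuilt.version);
    Ok(())
}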
6 changes: 6 additions & 0 deletions quickwit/quickwit-doc-mapper/src/doc_mapper.rs
@@ -30,6 +30,7 @@ use serde_json::Value as JsonValue;
use tantivy::query::Query;
use tantivy::schema::{Field, FieldType, OwnedValue as Value, Schema};
use tantivy::{TantivyDocument as Document, Term};
use ulid::Ulid;

pub type Partition = u64;

@@ -99,6 +100,11 @@ pub trait DocMapper: Send + Sync + Debug + DynClone + 'static {
/// over time. The schema returned here represents the most up-to-date schema of the index.
fn schema(&self) -> Schema;

/// Returns the version of the doc mapper
///
/// Splits with the same doc mapper version should use the same schema
fn version(&self) -> Ulid;

/// Returns the query.
///
/// Considering schema evolution, splits within an index can have different schema
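The contract stated on `version()` is that version equality implies schema compatibility. A consumer can therefore bucket splits by doc mapper version and open each bucket with a single schema. A hypothetical sketch (`SplitInfo` and `group_by_doc_mapper_version` are illustrative helpers, not part of this PR):

use std::collections::HashMap;

use ulid::Ulid;

// Illustrative split record; the real type would be `SplitMetadata`.
#[allow(dead_code)]
struct SplitInfo {
    split_id: String,
    doc_mapper_version: Ulid,
}

// Splits sharing a doc mapper version are guaranteed to share a schema,
// so each bucket can be handled with one schema instance.
fn group_by_doc_mapper_version(splits: Vec<SplitInfo>) -> HashMap<Ulid, Vec<SplitInfo>> {
    let mut groups: HashMap<Ulid, Vec<SplitInfo>> = HashMap::new();
    for split in splits {
        groups.entry(split.doc_mapper_version).or_default().push(split);
    }
    groups
}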
4 changes: 4 additions & 0 deletions quickwit/quickwit-indexing/src/actors/indexer.rs
@@ -98,6 +98,7 @@ struct IndexerState {
publish_lock: PublishLock,
publish_token_opt: Option<PublishToken>,
schema: Schema,
doc_mapper_version: Ulid,
tokenizer_manager: TokenizerManager,
max_num_partitions: NonZeroU32,
index_settings: IndexSettings,
@@ -130,6 +131,7 @@ impl IndexerState {
self.pipeline_id.clone(),
partition_id,
last_delete_opstamp,
self.doc_mapper_version,
self.indexing_directory.clone(),
index_builder,
io_controls,
@@ -537,6 +539,7 @@ impl Indexer {
index_serializer_mailbox: Mailbox<IndexSerializer>,
) -> Self {
let schema = doc_mapper.schema();
let doc_mapper_version = doc_mapper.version();
let tokenizer_manager = doc_mapper.tokenizer_manager().clone();
let docstore_compression = Compressor::Zstd(ZstdCompressor {
compression_level: Some(indexing_settings.docstore_compression_level),
@@ -564,6 +567,7 @@
publish_lock: PublishLock::default(),
publish_token_opt: None,
schema,
doc_mapper_version,
tokenizer_manager: tokenizer_manager.tantivy_manager().clone(),
index_settings,
max_num_partitions: doc_mapper.max_num_partitions(),
20 changes: 16 additions & 4 deletions quickwit/quickwit-indexing/src/actors/merge_executor.rs
@@ -236,7 +236,7 @@ pub fn merge_split_attrs(
pipeline_id: MergePipelineId,
merge_split_id: SplitId,
splits: &[SplitMetadata],
- ) -> SplitAttrs {
) -> anyhow::Result<SplitAttrs> {
let partition_id = combine_partition_ids_aux(splits.iter().map(|split| split.partition_id));
let time_range: Option<RangeInclusive<DateTime>> = merge_time_range(splits);
let uncompressed_docs_size_in_bytes = sum_doc_sizes_in_bytes(splits);
@@ -250,7 +250,17 @@
.map(|split| split.delete_opstamp)
.min()
.unwrap_or(0);
- SplitAttrs {
let doc_mapper_version = splits
.first()
.ok_or_else(|| anyhow::anyhow!("attempted to merge zero splits"))?
.doc_mapper_version;
if splits
.iter()
.any(|split| split.doc_mapper_version != doc_mapper_version)
{
anyhow::bail!("attempted to merge splits with different doc mapper version");
}
Ok(SplitAttrs {
node_id: pipeline_id.node_id.clone(),
index_uid: pipeline_id.index_uid.clone(),
source_id: pipeline_id.source_id.clone(),
Expand All @@ -262,7 +272,8 @@ pub fn merge_split_attrs(
uncompressed_docs_size_in_bytes,
delete_opstamp,
num_merge_ops: max_merge_ops(splits) + 1,
- }
doc_mapper_version,
})
}

fn max_merge_ops(splits: &[SplitMetadata]) -> usize {
@@ -324,7 +335,7 @@ impl MergeExecutor {
)?;
ctx.record_progress();

- let split_attrs = merge_split_attrs(self.pipeline_id.clone(), merge_split_id, &splits);
let split_attrs = merge_split_attrs(self.pipeline_id.clone(), merge_split_id, &splits)?;
Ok(IndexedSplit {
split_attrs,
index: merged_index,
@@ -444,6 +455,7 @@ impl MergeExecutor {
uncompressed_docs_size_in_bytes,
delete_opstamp: last_delete_opstamp,
num_merge_ops: split.num_merge_ops,
doc_mapper_version: split.doc_mapper_version,
},
index: merged_index,
split_scratch_directory: merge_scratch_directory,
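Extracted from the hunk above, the guard in `merge_split_attrs` reduces to: take the first split's doc mapper version, then refuse the merge if any sibling split disagrees. A standalone sketch of that check (the function name is illustrative):

use ulid::Ulid;

// Returns the shared doc mapper version, or an error if the slice is
// empty or mixes versions, mirroring the check in `merge_split_attrs`.
fn check_same_doc_mapper_version(versions: &[Ulid]) -> anyhow::Result<Ulid> {
    let first = *versions
        .first()
        .ok_or_else(|| anyhow::anyhow!("attempted to merge zero splits"))?;
    if versions.iter().any(|&version| version != first) {
        anyhow::bail!("attempted to merge splits with different doc mapper version");
    }
    Ok(first)
}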