From 3da08e92c78e25a1ac67581718b2c495ad0420fc Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Fri, 10 Feb 2023 11:47:46 +0800
Subject: [PATCH] fix: doc store for files larger than 4GB

Fixes an issue in the skip list deserialization, which incorrectly
deserialized the byte start offset as u32.

`get_doc` will fail for any doc that lives in a block with a start offset
larger than u32::MAX (~4GB).

This causes index corruption if a segment with a doc store larger than
4GB is merged.

tantivy version 0.19 is affected.
---
 src/store/index/block.rs | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/store/index/block.rs b/src/store/index/block.rs
index 1d86d68aca..ae969110cf 100644
--- a/src/store/index/block.rs
+++ b/src/store/index/block.rs
@@ -90,7 +90,7 @@ impl CheckpointBlock {
             return Ok(());
         }
         let mut doc = read_u32_vint(data);
-        let mut start_offset = read_u32_vint(data) as usize;
+        let mut start_offset = VInt::deserialize_u64(data)? as usize;
         for _ in 0..len {
             let num_docs = read_u32_vint(data);
             let block_num_bytes = read_u32_vint(data) as usize;
@@ -147,6 +147,15 @@ mod tests {
         test_aux_ser_deser(&checkpoints)
     }
 
+    #[test]
+    fn test_block_serialize_large_byte_range() -> io::Result<()> {
+        let checkpoints = vec![Checkpoint {
+            doc_range: 10..12,
+            byte_range: 8_000_000_000..9_000_000_000,
+        }];
+        test_aux_ser_deser(&checkpoints)
+    }
+
     #[test]
     fn test_block_serialize() -> io::Result<()> {
         let offsets: Vec<u64> = (0..11).map(|i| i * i * i).collect();
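
For readers unfamiliar with the failure mode, the sketch below illustrates it in isolation. The helpers (write_vint, read_vint_u32, read_vint_u64) are invented for illustration and are not tantivy's vint API; they only model how narrowing a variable-length-encoded start offset to u32 corrupts any offset above u32::MAX, which is the narrowing the one-line change in the CheckpointBlock deserialization removes.

// Minimal, self-contained sketch (assumed helpers, not tantivy internals).

/// Encode `val` as a LEB128-style variable-length integer.
fn write_vint(mut val: u64, out: &mut Vec<u8>) {
    loop {
        let byte = (val & 0x7f) as u8;
        val >>= 7;
        if val == 0 {
            out.push(byte);
            return;
        }
        out.push(byte | 0x80);
    }
}

/// Decode a vint into a u64.
fn read_vint_u64(data: &mut &[u8]) -> u64 {
    let mut result: u64 = 0;
    let mut shift = 0;
    loop {
        let byte = data[0];
        *data = &data[1..];
        result |= u64::from(byte & 0x7f) << shift;
        if byte & 0x80 == 0 {
            return result;
        }
        shift += 7;
    }
}

/// Decode a vint but narrow it to u32, silently dropping the high bits.
/// This models the kind of truncation the patch removes.
fn read_vint_u32(data: &mut &[u8]) -> u32 {
    read_vint_u64(data) as u32
}

fn main() {
    // A doc store byte offset beyond the 4GB boundary.
    let start_offset: u64 = 8_000_000_000;
    let mut buf = Vec::new();
    write_vint(start_offset, &mut buf);

    // A 64-bit-wide decode recovers the original offset.
    assert_eq!(read_vint_u64(&mut &buf[..]), 8_000_000_000);

    // A 32-bit-wide decode wraps modulo 2^32, so a reader would seek to the
    // wrong position in the doc store file.
    assert_eq!(u64::from(read_vint_u32(&mut &buf[..])), 3_705_032_704);
}

The added test_block_serialize_large_byte_range test exercises the same boundary by round-tripping a checkpoint whose byte_range starts at 8_000_000_000, which only deserializes correctly with the widened read.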