Skip to content

Commit

Permalink
Hotfix #681
Browse files Browse the repository at this point in the history
Fixing the construction of the DeleteBitset.
Cherry picked from master.

Changing version to 0.10.3
  • Loading branch information
fulmicoton committed Nov 10, 2019
1 parent 7989465 commit b3903e6
Show file tree
Hide file tree
Showing 8 changed files with 99 additions and 32 deletions.
9 changes: 4 additions & 5 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
Tantivy 0.11.0
=====================

- Added f64 field. Internally reuse u64 code the same way i64 does (@fdb-hiroshima)
Tantivy 0.10.3
==========================

- Fix crash when committing multiple times with deleted documents. #681 (@brainlock)

Tantivy 0.10.2
Tantivy 0.10.2
=====================

- Closes #656. Solving memory leak.
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.10.2"
version = "0.10.3"
authors = ["Paul Masurel <[email protected]>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
Expand Down
15 changes: 15 additions & 0 deletions src/core/index_meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,21 @@ impl SegmentMeta {
self.num_deleted_docs() > 0
}

/// Updates the max_doc value from the `SegmentMeta`.
///
/// This method is only used when updating `max_doc` from 0
/// as we finalize a fresh new segment.
pub(crate) fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
assert_eq!(self.tracked.max_doc, 0);
assert!(self.tracked.deletes.is_none());
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
segment_id: inner_meta.segment_id,
max_doc,
deletes: None,
});
SegmentMeta { tracked }
}

#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
let delete_meta = DeleteMeta {
Expand Down
11 changes: 11 additions & 0 deletions src/core/segment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,17 @@ impl Segment {
&self.meta
}

/// Updates the max_doc value from the `SegmentMeta`.
///
/// This method is only used when updating `max_doc` from 0
/// as we finalize a fresh new segment.
pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment {
Segment {
index: self.index,
meta: self.meta.with_max_doc(max_doc),
}
}

#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
Segment {
Expand Down
20 changes: 11 additions & 9 deletions src/fastfield/delete.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@ use std::io::Write;
/// Write a delete `BitSet`
///
/// where `delete_bitset` is the set of deleted `DocId`.
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> {
let max_doc = delete_bitset.capacity();
pub fn write_delete_bitset(
delete_bitset: &BitSet,
max_doc: u32,
writer: &mut WritePtr,
) -> io::Result<()> {
let mut byte = 0u8;
let mut shift = 0u8;
for doc in 0..max_doc {
for doc in 0..(max_doc as usize) {
if delete_bitset.contains(doc) {
byte |= 1 << shift;
}
Expand Down Expand Up @@ -86,18 +89,17 @@ mod tests {
use bit_set::BitSet;
use std::path::PathBuf;

fn test_delete_bitset_helper(bitset: &BitSet) {
fn test_delete_bitset_helper(bitset: &BitSet, max_doc: u32) {
let test_path = PathBuf::from("test");
let mut directory = RAMDirectory::create();
{
let mut writer = directory.open_write(&*test_path).unwrap();
write_delete_bitset(bitset, &mut writer).unwrap();
write_delete_bitset(bitset, max_doc, &mut writer).unwrap();
}
{
let source = directory.open_read(&test_path).unwrap();
let delete_bitset = DeleteBitSet::open(source);
let n = bitset.capacity();
for doc in 0..n {
for doc in 0..max_doc as usize {
assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId));
}
assert_eq!(delete_bitset.len(), bitset.len());
Expand All @@ -110,7 +112,7 @@ mod tests {
let mut bitset = BitSet::with_capacity(10);
bitset.insert(1);
bitset.insert(9);
test_delete_bitset_helper(&bitset);
test_delete_bitset_helper(&bitset, 10);
}
{
let mut bitset = BitSet::with_capacity(8);
Expand All @@ -119,7 +121,7 @@ mod tests {
bitset.insert(3);
bitset.insert(5);
bitset.insert(7);
test_delete_bitset_helper(&bitset);
test_delete_bitset_helper(&bitset, 8);
}
}
}
39 changes: 22 additions & 17 deletions src/indexer/index_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ pub(crate) fn advance_deletes(
};

let delete_cursor = segment_entry.delete_cursor();

compute_deleted_bitset(
&mut delete_bitset,
&segment_reader,
Expand All @@ -167,22 +166,22 @@ pub(crate) fn advance_deletes(
if num_deleted_docs > 0 {
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
write_delete_bitset(&delete_bitset, &mut delete_file)?;
}
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
}
}
segment_entry.set_meta(segment.meta().clone());
Ok(())
}

fn index_documents(
memory_budget: usize,
segment: &Segment,
segment: Segment,
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> Result<bool> {
let schema = segment.schema();
let segment_id = segment.id();

let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
for document_group in grouped_document_iterator {
for doc in document_group {
Expand All @@ -202,24 +201,28 @@ fn index_documents(
return Ok(false);
}

let num_docs = segment_writer.max_doc();
let max_doc = segment_writer.max_doc();

// this is ensured by the call to peek before starting
// the worker thread.
assert!(num_docs > 0);
assert!(max_doc > 0);

let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
let segment_meta = segment
.index()
.inventory()
.new_segment_meta(segment_id, num_docs);

let segment_with_max_doc = segment.with_max_doc(max_doc);
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());

let delete_bitset_opt =
apply_deletes(&segment, &mut delete_cursor, &doc_opstamps, last_docstamp)?;
let delete_bitset_opt = apply_deletes(
&segment_with_max_doc,
&mut delete_cursor,
&doc_opstamps,
last_docstamp,
)?;

let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
let segment_entry = SegmentEntry::new(
segment_with_max_doc.meta().clone(),
delete_cursor,
delete_bitset_opt,
);
Ok(segment_updater.add_segment(segment_entry))
}

Expand All @@ -236,7 +239,9 @@ fn apply_deletes(
}
let segment_reader = SegmentReader::open(segment)?;
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let mut deleted_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize);

let max_doc = segment.meta().max_doc();
let mut deleted_bitset = BitSet::with_capacity(max_doc as usize);
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,
Expand Down Expand Up @@ -408,7 +413,7 @@ impl IndexWriter {
let segment = index.new_segment();
index_documents(
mem_budget,
&segment,
segment,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone(),
Expand Down
22 changes: 22 additions & 0 deletions src/indexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,25 @@ pub use self::segment_writer::SegmentWriter;

/// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy;

#[cfg(test)]
mod tests {
use crate::schema::{self, Schema};
use crate::{Index, Term};
#[test]
fn test_advance_delete_bug() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"));
index_writer.delete_term(Term::from_field_text(text_field, "b"));
// we need enough data to trigger the bug (at least 32 documents)
for _ in 0..32 {
index_writer.add_document(doc!(text_field=>"c"));
}
index_writer.commit().unwrap();
index_writer.commit().unwrap();
}
}
13 changes: 13 additions & 0 deletions src/schema/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,19 @@ use std::io::Write;
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
pub struct Field(pub u32);

impl Field {
/// Create a new field object for the given FieldId.
pub fn from_field_id(field_id: u32) -> Field {
Field(field_id)
}

/// Returns a u32 identifying uniquely a field within a schema.
#[allow(clippy::trivially_copy_pass_by_ref)]
pub fn field_id(&self) -> u32 {
self.0
}
}

impl BinarySerializable for Field {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
self.0.serialize(writer)
Expand Down

0 comments on commit b3903e6

Please sign in to comment.