Use a memory limited hashset with LocalVocab #1570

Status: Open. Wants to merge 16 commits into base: master.
3 changes: 2 additions & 1 deletion src/engine/LocalVocab.cpp
@@ -71,7 +71,8 @@ const LocalVocab::LiteralOrIri& LocalVocab::getWord(
std::vector<LocalVocab::LiteralOrIri> LocalVocab::getAllWordsForTesting()
const {
std::vector<LiteralOrIri> result;
std::ranges::copy(primaryWordSet(), std::back_inserter(result));
std::ranges::copy(primaryWordSet().begin(), primaryWordSet().end(),
std::back_inserter(result));
for (const auto& previous : otherWordSets_) {
std::ranges::copy(*previous, std::back_inserter(result));
}
27 changes: 24 additions & 3 deletions src/engine/LocalVocab.h
@@ -4,6 +4,7 @@

#pragma once

#include <cstddef>
#include <cstdlib>
#include <memory>
#include <span>
@@ -13,7 +14,10 @@
#include "absl/container/node_hash_set.h"
#include "global/Id.h"
#include "parser/LiteralOrIri.h"
#include "util/AllocatorWithLimit.h"
#include "util/BlankNodeManager.h"
#include "util/HashSet.h"
#include "util/MemorySize/MemorySize.h"

// A class for maintaining a local vocabulary with contiguous (local) IDs. This
// is meant for words that are not part of the normal vocabulary (constructed
@@ -24,12 +28,26 @@ class LocalVocab {
private:
using Entry = LocalVocabEntry;
using LiteralOrIri = LocalVocabEntry;

// A functor that calculates the memory size of an IRI or Literal.
// This struct defines an operator() that takes a `LiteralOrIri` object and
// returns its dynamic memory usage in bytes.
struct IriSizeGetter {
ad_utility::MemorySize operator()(
const ad_utility::triple_component::LiteralOrIri& literalOrIri) {
return ad_utility::MemorySize::bytes(
literalOrIri.getDynamicMemoryUsage());
}
};

// A map of the words in the local vocabulary to their local IDs. This is a
// node hash map because we need the addresses of the words (which are of type
// `LiteralOrIri`) to remain stable over their lifetime in the hash map
// because we hand out pointers to them.
using Set = absl::node_hash_set<LiteralOrIri>;
std::shared_ptr<Set> primaryWordSet_ = std::make_shared<Set>();
using Set =
ad_utility::NodeHashSetWithMemoryLimit<LiteralOrIri, IriSizeGetter>;
ad_utility::detail::AllocationMemoryLeftThreadsafe limit_;
std::shared_ptr<Set> primaryWordSet_;

// Local vocabularies from child operations that were merged into this
// vocabulary s.t. the pointers are kept alive. They have to be `const`
@@ -44,7 +62,10 @@

public:
// Create a new, empty local vocabulary.
LocalVocab() = default;
LocalVocab(ad_utility::detail::AllocationMemoryLeftThreadsafe memoryLimit =
ad_utility::makeAllocationMemoryLeftThreadsafeObject(
ad_utility::MemorySize::max()))
: limit_(memoryLimit), primaryWordSet_(std::make_shared<Set>(limit_)) {}

// Prevent accidental copying of a local vocabulary.
LocalVocab(const LocalVocab&) = delete;
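
For illustration, a hedged usage sketch of the new `LocalVocab` constructor from the diff above. `makeAllocationMemoryLeftThreadsafeObject` is the factory used in the default argument; `MemorySize::megabytes` is assumed to be one of `MemorySize`'s named constructors:

// Illustrative only: a LocalVocab whose word set may use at most ~500 MB.
auto memoryLeft = ad_utility::makeAllocationMemoryLeftThreadsafeObject(
    ad_utility::MemorySize::megabytes(500));
LocalVocab vocab{memoryLeft};
// A default-constructed LocalVocab{} falls back to MemorySize::max(),
// i.e. effectively no limit.
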
5 changes: 5 additions & 0 deletions src/parser/Iri.h
@@ -48,6 +48,11 @@ class Iri {
// Return the string value of the iri object without any leading or trailing
// angled brackets.
NormalizedStringView getContent() const;

// Calculate the memory usage of the `Iri` string. This might overestimate the
// memory usage, as it does not currently take the small string optimization
// of `std::string` into account.
size_t getDynamicMemoryUsage() const { return iri_.capacity(); }
};

} // namespace ad_utility::triple_component
6 changes: 6 additions & 0 deletions src/parser/Literal.h
@@ -4,6 +4,7 @@

#pragma once

#include <cstddef>
#include <optional>
#include <variant>

@@ -90,5 +91,10 @@ class Literal {
static Literal literalWithoutQuotes(
std::string_view rdfContentWithoutQuotes,
std::optional<std::variant<Iri, std::string>> descriptor = std::nullopt);

// Calculate the memory usage of the `Literal` string. This might overestimate
// the memory usage, as it does not currently take the small string
// optimization of `std::string` into account.
size_t getDynamicMemoryUsage() const { return content_.capacity(); }

Review comment (Member):
Actually you can figure out if the small buffer optimization applies
(basically check if `&content <= content.data() < (&content + sizeof(content))`).
(But again, not super important.)
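
A minimal sketch of that check (a hypothetical free function, not part of this PR; `std::less`/`std::less_equal` keep the pointer comparison well defined):

#include <functional>
#include <string>

// Hypothetical helper: true if `s` currently keeps its characters in the
// small-string buffer inside the std::string object itself, i.e. there is no
// heap allocation that getDynamicMemoryUsage() would need to report.
inline bool usesSmallStringOptimization(const std::string& s) {
  const char* objBegin = reinterpret_cast<const char*>(&s);
  const char* objEnd = objBegin + sizeof(s);
  std::less_equal<const char*> le;
  std::less<const char*> lt;
  return le(objBegin, s.data()) && lt(s.data(), objEnd);
}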

};
} // namespace ad_utility::triple_component
6 changes: 6 additions & 0 deletions src/parser/LiteralOrIri.h
@@ -128,6 +128,12 @@ class alignas(16) LiteralOrIri {
auto& s = *os;
s << literalOrIri.toStringRepresentation();
}

// Return the memory usage of the `LiteralOrIri` variant.
size_t getDynamicMemoryUsage() const {
return std::visit(
[](const auto& val) { return val.getDynamicMemoryUsage(); }, data_);
}
};

} // namespace ad_utility::triple_component
153 changes: 153 additions & 0 deletions src/util/HashSet.h
@@ -6,14 +6,22 @@

#pragma once

#include <absl/container/node_hash_set.h>

#include <cstddef>
#include <cstdlib>
#include <string>
#include <utility>

#include "absl/container/flat_hash_set.h"
#include "util/AllocatorWithLimit.h"
#include "util/MemorySize/MemorySize.h"
#include "util/ValueSizeGetters.h"

using std::string;

namespace ad_utility {

// Wrapper for HashSets (with elements of type T) to be used everywhere
// throughout code for the semantic search. This wrapper interface is not
// designed to be complete from the beginning. Feel free to extend it at need.
@@ -32,4 +40,149 @@
using HashSetWithMemoryLimit =
absl::flat_hash_set<T, HashFct, EqualElem, Alloc>;

// Wrapper around absl::node_hash_set with a memory limit. All operations that
// may change the allocated memory of the hash set are tracked using an
// `AllocationMemoryLeftThreadsafe` object.
template <class T, class SizeGetter = DefaultValueSizeGetter<T>,
class HashFct = absl::container_internal::hash_default_hash<T>,
class EqualElem = absl::container_internal::hash_default_eq<T>>
class NodeHashSetWithMemoryLimit {
private:
using HashSet = absl::node_hash_set<T, HashFct, EqualElem>;
HashSet hashSet_;
detail::AllocationMemoryLeftThreadsafe memoryLeft_;
MemorySize memoryUsed_{MemorySize::bytes(0)};
SizeGetter sizeGetter_{};
size_t currentNumSlots_{0};

// `slotMemoryCost` represents the per-slot memory cost of a node hash set.
// It accounts for the memory used by a slot in the hash table, which
// typically consists of a pointer (used for node storage) plus any additional
// control bytes required for maintaining the hash set's structure and state.
// This value helps estimate and manage memory consumption for operations that
// involve slots, such as insertion and rehashing.
//
// The value is defined as `sizeof(void*) + 1` bytes, where:
// - `sizeof(void*)` represents the size of a pointer on the platform (usually
// 4 bytes for 32-bit and 8 bytes for 64-bit systems).
// - `+ 1` accounts for an extra control byte used for state management in the
// hash set.
constexpr static MemorySize slotMemoryCost =
MemorySize::bytes(sizeof(void*) + 1);

public:
NodeHashSetWithMemoryLimit(detail::AllocationMemoryLeftThreadsafe memoryLeft)
: memoryLeft_{memoryLeft} {
// Once the hash set is initialized, calculate the initial memory
// used by the slots of the hash set
updateSlotArrayMemoryUsage();
}

~NodeHashSetWithMemoryLimit() { decreaseMemoryUsed(memoryUsed_); }

// Try to allocate the amount of memory requested
void increaseMemoryUsed(ad_utility::MemorySize amount) {
memoryLeft_.ptr()->wlock()->decrease_if_enough_left_or_throw(amount);

Review comment (Member):
This is a little wasteful, as we always have to obtain a global mutex.
I think for a follow-up (or preparation) PR, you could implement a wrapper around the memoryLeftThreadsafe object that stores a (single-threaded) small pool of memory and only goes to the global wlock() when that pool is exhausted.
Currently increase(1), increase(1), increase(1) needs a global synchronization for each of the three inserts, which seems wasteful.
(But as you have abstracted away all the memory handling, that is easy to integrate.)
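
A rough sketch of that pooling idea (hypothetical, not part of this PR); it reuses `decrease_if_enough_left_or_throw` from the code above and assumes `MemorySize` supports comparison, `+`, `+=`/`-=`, and a `kilobytes` constructor:

// Hypothetical wrapper (illustrative only): keeps a small local reserve so
// that most acquisitions do not touch the global lock; only a refill of the
// pool goes through wlock().
class PooledMemoryLeft {
  ad_utility::detail::AllocationMemoryLeftThreadsafe global_;
  ad_utility::MemorySize localPool_ = ad_utility::MemorySize::bytes(0);

 public:
  explicit PooledMemoryLeft(
      ad_utility::detail::AllocationMemoryLeftThreadsafe global)
      : global_{global} {}

  // Acquire `amount`, refilling the local pool with some headroom if needed.
  void decrease(ad_utility::MemorySize amount) {
    if (localPool_ < amount) {
      auto request = amount + ad_utility::MemorySize::kilobytes(64);
      global_.ptr()->wlock()->decrease_if_enough_left_or_throw(request);
      localPool_ += request;
    }
    localPool_ -= amount;
  }

  // Give `amount` back to the local pool (returning it globally, e.g. once
  // the pool grows large, is omitted here for brevity).
  void increase(ad_utility::MemorySize amount) { localPool_ += amount; }
};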

memoryUsed_ += amount;
}

// Decrease the amount of memory used
void decreaseMemoryUsed(ad_utility::MemorySize amount) {
memoryLeft_.ptr()->wlock()->increase(amount);
memoryUsed_ -= amount;
}

// Update the memory usage for the slot array if the bucket count changes.
// This function should be called after any operation that could cause
// rehashing. When the slot count increases, it reserves additional memory,
// and if the slot count decreases, it releases the unused memory back to the
// memory tracker.
void updateSlotArrayMemoryUsage() {
size_t newNumSlots = hashSet_.bucket_count();
if (newNumSlots != currentNumSlots_) {
if (newNumSlots > currentNumSlots_) {
ad_utility::MemorySize sizeIncrease =
slotMemoryCost * (newNumSlots - currentNumSlots_);
increaseMemoryUsed(sizeIncrease);
} else {
ad_utility::MemorySize sizeDecrease =
slotMemoryCost * (currentNumSlots_ - newNumSlots);

decreaseMemoryUsed(sizeDecrease);
}

}
currentNumSlots_ = newNumSlots;
}

// Insert an element into the hash set. If the memory limit is exceeded, the
// insert operation fails with a runtime error.
std::pair<typename HashSet::iterator, bool> insert(const T& value) {
MemorySize size =
sizeGetter_(value) + ad_utility::MemorySize::bytes(sizeof(T));
increaseMemoryUsed(size);

const auto& [it, wasInserted] = hashSet_.insert(value);

if (!wasInserted) {
decreaseMemoryUsed(size);
}

updateSlotArrayMemoryUsage();

Review comment (Member):
Technically we might violate the memory limit before this call.
And additionally, until this updateSlotArray... call happens, you keep holding too much memory.

So probably you have to do something like "check if inserting would cause a rehash" (needs some more intrinsic checking of abseil's mechanisms, i.e. when the rehashing threshold is and how much larger the hash table becomes afterwards, but it is doable).

Review comment (Member):
And for the correct counting of all the space for slots etc., you can just add an allocator to your node_hash_set that doesn't throw but just counts the bytes somewhere, so you can guess before the insert and check exactly after the insert. Such an allocator can also be very useful to analyze the memory usage of the node_hash_set when writing your code.
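
A hedged sketch of such a counting allocator (illustrative only; the names are made up):

#include <cstddef>
#include <memory>

// Hypothetical allocator: it never throws because of a memory limit, it only
// records how many bytes the container currently has allocated.
template <typename T>
struct CountingAllocator {
  using value_type = T;
  std::size_t* bytesAllocated_;  // counter owned by the surrounding class

  explicit CountingAllocator(std::size_t* counter) : bytesAllocated_{counter} {}
  template <typename U>
  CountingAllocator(const CountingAllocator<U>& other)
      : bytesAllocated_{other.bytesAllocated_} {}

  T* allocate(std::size_t n) {
    *bytesAllocated_ += n * sizeof(T);
    return std::allocator<T>{}.allocate(n);
  }
  void deallocate(T* p, std::size_t n) {
    *bytesAllocated_ -= n * sizeof(T);
    std::allocator<T>{}.deallocate(p, n);
  }

  template <typename U>
  bool operator==(const CountingAllocator<U>& other) const {
    return bytesAllocated_ == other.bytesAllocated_;
  }
};

// Used as absl::node_hash_set<T, HashFct, EqualElem, CountingAllocator<T>>,
// this makes the exact slot and node memory observable around each insert.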

Review comment (Member):
What about the following (easy but very effective) approach:

  1. Assume that each insert doubles the amount of memory that is occupied by the slot array.
  2. After the insert (same as after erase), revert back to the actual correct size.
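
A rough sketch of that approach as a hypothetical variant of `insert` (the node-size accounting and duplicate handling from the actual `insert` above are omitted for brevity):

// Hypothetical variant of insert(): charge memory pessimistically before the
// insert so a rehash can never silently exceed the limit, then settle on the
// real bucket count afterwards.
std::pair<typename HashSet::iterator, bool> insertPessimistic(const T& value) {
  // 1. Assume the insert doubles the slot array and reserve that much.
  MemorySize pessimisticSlotGrowth = slotMemoryCost * currentNumSlots_;
  increaseMemoryUsed(pessimisticSlotGrowth);

  // 2. Do the insert; a possible rehash has already been paid for.
  const auto& [it, wasInserted] = hashSet_.insert(value);

  // 3. Give back the pessimistic reservation and book the actual growth
  //    (or shrinkage) of the slot array.
  decreaseMemoryUsed(pessimisticSlotGrowth);
  updateSlotArrayMemoryUsage();
  return {it, wasInserted};
}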

return std::pair{it, wasInserted};
}

// _____________________________________________________________________________
void erase(const T& value) {
auto it = hashSet_.find(value);
if (it != hashSet_.end()) {
MemorySize size =
sizeGetter_(*it) + ad_utility::MemorySize::bytes(sizeof(T));

Review comment (Member) on lines +138 to +139:
Same question here as above: probably our size getter should only account for the additionally allocated memory, because everything else we can handle using the internal memory tracking of the absl classes.

hashSet_.erase(it);
decreaseMemoryUsed(size);
updateSlotArrayMemoryUsage();
}
}

// _____________________________________________________________________________
void clear() {
hashSet_.clear();
// Release all node memory
decreaseMemoryUsed(memoryUsed_);

// Update slot memory usage based on the new bucket count after clearing
size_t newNumSlots = hashSet_.bucket_count();
ad_utility::MemorySize slotMemoryAfterClear = slotMemoryCost * newNumSlots;
// After clearing it only tracks the slot memory as nodes are gone
increaseMemoryUsed(slotMemoryAfterClear);

currentNumSlots_ = newNumSlots;
}

// _____________________________________________________________________________
size_t size() const { return hashSet_.size(); }

// _____________________________________________________________________________
bool empty() const { return hashSet_.empty(); }

// _____________________________________________________________________________
size_t count(const T& value) const { return hashSet_.count(value); }

// _____________________________________________________________________________
HashSet::const_iterator find(const T& value) const {
return hashSet_.find(value);
}

// _____________________________________________________________________________
bool contains(const T& key) { return hashSet_.contains(key); }

// _____________________________________________________________________________
HashSet::const_iterator begin() const { return hashSet_.begin(); }

// _____________________________________________________________________________
HashSet::const_iterator end() const { return hashSet_.end(); }

// _____________________________________________________________________________
MemorySize getCurrentMemoryUsage() const { return memoryUsed_; }
};

} // namespace ad_utility
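
For illustration, a hedged usage sketch of the new `NodeHashSetWithMemoryLimit` (assuming the default `DefaultValueSizeGetter<std::string>` from `ValueSizeGetters.h` handles `std::string`, and that `MemorySize::megabytes` exists as a named constructor):

#include <string>

#include "util/HashSet.h"

// Illustrative only: a node hash set of strings whose tracked memory
// (nodes plus slot array) may not exceed 1 MB.
void exampleUsage() {
  auto memoryLeft = ad_utility::makeAllocationMemoryLeftThreadsafeObject(
      ad_utility::MemorySize::megabytes(1));
  ad_utility::NodeHashSetWithMemoryLimit<std::string> set{memoryLeft};

  set.insert("hello");  // charges node size + string capacity + slot memory
  set.insert("hello");  // duplicate: its charge is released again
  // An insert that would push the tracked memory past 1 MB makes
  // decrease_if_enough_left_or_throw() throw, and the element is not inserted.
}
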
2 changes: 2 additions & 0 deletions test/CMakeLists.txt
@@ -132,6 +132,8 @@ addLinkAndDiscoverTest(HashMapTest)

addLinkAndDiscoverTest(HashSetTest)

addLinkAndDiscoverTest(CustomHashSetWithMemoryLimitTest)

addLinkAndDiscoverTestSerial(GroupByTest engine)

addLinkAndDiscoverTest(VocabularyGeneratorTest index)