Use a memory limited hashset with LocalVocab #1570

Status: Open. Wants to merge 16 commits into base: master.
3 changes: 2 additions & 1 deletion src/engine/LocalVocab.cpp
@@ -71,7 +71,8 @@ const LocalVocab::LiteralOrIri& LocalVocab::getWord(
std::vector<LocalVocab::LiteralOrIri> LocalVocab::getAllWordsForTesting()
const {
std::vector<LiteralOrIri> result;
std::ranges::copy(primaryWordSet(), std::back_inserter(result));
std::ranges::copy(primaryWordSet().begin(), primaryWordSet().end(),
std::back_inserter(result));
for (const auto& previous : otherWordSets_) {
std::ranges::copy(*previous, std::back_inserter(result));
}
27 changes: 24 additions & 3 deletions src/engine/LocalVocab.h
@@ -4,6 +4,7 @@

#pragma once

#include <cstddef>
#include <cstdlib>
#include <memory>
#include <span>
@@ -13,7 +14,10 @@
#include "absl/container/node_hash_set.h"
#include "global/Id.h"
#include "parser/LiteralOrIri.h"
#include "util/AllocatorWithLimit.h"
#include "util/BlankNodeManager.h"
#include "util/HashSet.h"
#include "util/MemorySize/MemorySize.h"

// A class for maintaining a local vocabulary with contiguous (local) IDs. This
// is meant for words that are not part of the normal vocabulary (constructed
@@ -24,12 +28,26 @@ class LocalVocab {
private:
using Entry = LocalVocabEntry;
using LiteralOrIri = LocalVocabEntry;

// A functor that calculates the memory size of an IRI or Literal.
// This struct defines an operator() that takes a `LiteralOrIri` object and
// returns its dynamic memory usage in bytes.
struct IriSizeGetter {
ad_utility::MemorySize operator()(
const ad_utility::triple_component::LiteralOrIri& literalOrIri) {
return ad_utility::MemorySize::bytes(
literalOrIri.getDynamicMemoryUsage());
}
};

// A map of the words in the local vocabulary to their local IDs. This is a
// node hash map because we need the addresses of the words (which are of type
// `LiteralOrIri`) to remain stable over their lifetime in the hash map
// because we hand out pointers to them.
using Set = absl::node_hash_set<LiteralOrIri>;
std::shared_ptr<Set> primaryWordSet_ = std::make_shared<Set>();
using Set =
ad_utility::NodeHashSetWithMemoryLimit<LiteralOrIri, IriSizeGetter>;
ad_utility::detail::AllocationMemoryLeftThreadsafe limit_;
std::shared_ptr<Set> primaryWordSet_;

// Local vocabularies from child operations that were merged into this
// vocabulary s.t. the pointers are kept alive. They have to be `const`
@@ -44,7 +62,10 @@

public:
// Create a new, empty local vocabulary.
LocalVocab() = default;
LocalVocab(ad_utility::detail::AllocationMemoryLeftThreadsafe memoryLimit =
ad_utility::makeAllocationMemoryLeftThreadsafeObject(
ad_utility::MemorySize::max()))
: limit_(memoryLimit), primaryWordSet_(std::make_shared<Set>(limit_)) {}

// Prevent accidental copying of a local vocabulary.
LocalVocab(const LocalVocab&) = delete;
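
For illustration, a hedged usage sketch of the new `LocalVocab` constructor from the diff above. `makeAllocationMemoryLeftThreadsafeObject` is the factory used in the default argument; `MemorySize::megabytes` is assumed to be one of `MemorySize`'s named constructors:

// Illustrative only: a LocalVocab whose word set may use at most ~500 MB.
auto memoryLeft = ad_utility::makeAllocationMemoryLeftThreadsafeObject(
    ad_utility::MemorySize::megabytes(500));
LocalVocab vocab{memoryLeft};
// A default-constructed LocalVocab{} falls back to MemorySize::max(),
// i.e. effectively no limit.
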
5 changes: 5 additions & 0 deletions src/parser/Iri.h
@@ -48,6 +48,11 @@ class Iri {
// Return the string value of the iri object without any leading or trailing
// angled brackets.
NormalizedStringView getContent() const;

// Calculate the memory usage of the `Iri` string. This might overestimate the
// memory usage, as it does not currently take the small string optimization
// of `std::string` into account.
size_t getDynamicMemoryUsage() const { return iri_.capacity(); }
};

} // namespace ad_utility::triple_component
6 changes: 6 additions & 0 deletions src/parser/Literal.h
@@ -4,6 +4,7 @@

#pragma once

#include <cstddef>
#include <optional>
#include <variant>

@@ -90,5 +91,10 @@ class Literal {
static Literal literalWithoutQuotes(
std::string_view rdfContentWithoutQuotes,
std::optional<std::variant<Iri, std::string>> descriptor = std::nullopt);

// Calculate the memory usage of the `Literal` string. This might overestimate
// the memory usage, as it does not currently take the small string
// optimization of `std::string` into account.
size_t getDynamicMemoryUsage() const { return content_.capacity(); }

Review comment (Member):
Actually you can figure out if the small buffer optimization applies
(basically check if `&content <= content.data() < (&content + sizeof(content))`).
(But again, not super important.)
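
A minimal sketch of that check (a hypothetical free function, not part of this PR; `std::less`/`std::less_equal` keep the pointer comparison well defined):

#include <functional>
#include <string>

// Hypothetical helper: true if `s` currently keeps its characters in the
// small-string buffer inside the std::string object itself, i.e. there is no
// heap allocation that getDynamicMemoryUsage() would need to report.
inline bool usesSmallStringOptimization(const std::string& s) {
  const char* objBegin = reinterpret_cast<const char*>(&s);
  const char* objEnd = objBegin + sizeof(s);
  std::less_equal<const char*> le;
  std::less<const char*> lt;
  return le(objBegin, s.data()) && lt(s.data(), objEnd);
}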

};
} // namespace ad_utility::triple_component
6 changes: 6 additions & 0 deletions src/parser/LiteralOrIri.h
@@ -128,6 +128,12 @@ class alignas(16) LiteralOrIri {
auto& s = *os;
s << literalOrIri.toStringRepresentation();
}

// Return the memory usage of the `LiteralOrIri` variant.
size_t getDynamicMemoryUsage() const {
return std::visit(
[](const auto& val) { return val.getDynamicMemoryUsage(); }, data_);
}
};

} // namespace ad_utility::triple_component
153 changes: 153 additions & 0 deletions src/util/HashSet.h
@@ -6,14 +6,22 @@

#pragma once

#include <absl/container/node_hash_set.h>

#include <cstddef>
#include <cstdlib>
#include <string>
#include <utility>

#include "absl/container/flat_hash_set.h"
#include "util/AllocatorWithLimit.h"
#include "util/MemorySize/MemorySize.h"
#include "util/ValueSizeGetters.h"

using std::string;

namespace ad_utility {

// Wrapper for HashSets (with elements of type T) to be used everywhere
// throughout code for the semantic search. This wrapper interface is not
// designed to be complete from the beginning. Feel free to extend it at need.
@@ -32,4 +40,149 @@
using HashSetWithMemoryLimit =
absl::flat_hash_set<T, HashFct, EqualElem, Alloc>;

// Wrapper around absl::node_hash_set with a memory limit. All operations that
// may change the allocated memory of the hash set are tracked using an
// `AllocationMemoryLeftThreadsafe` object.
template <class T, class SizeGetter = DefaultValueSizeGetter<T>,
class HashFct = absl::container_internal::hash_default_hash<T>,
class EqualElem = absl::container_internal::hash_default_eq<T>>
class NodeHashSetWithMemoryLimit {
private:
using HashSet = absl::node_hash_set<T, HashFct, EqualElem>;
HashSet hashSet_;
detail::AllocationMemoryLeftThreadsafe memoryLeft_;
MemorySize memoryUsed_{MemorySize::bytes(0)};
SizeGetter sizeGetter_{};
size_t currentNumSlots_{0};

// `slotMemoryCost` represents the per-slot memory cost of a node hash set.
// It accounts for the memory used by a slot in the hash table, which
// typically consists of a pointer (used for node storage) plus any additional
// control bytes required for maintaining the hash set's structure and state.
// This value helps estimate and manage memory consumption for operations that
// involve slots, such as insertion and rehashing.
//
// The value is defined as `sizeof(void*) + 1` bytes, where:
// - `sizeof(void*)` represents the size of a pointer on the platform (usually
// 4 bytes for 32-bit and 8 bytes for 64-bit systems).
// - `+ 1` accounts for an extra control byte used for state management in the
// hash set.
constexpr static MemorySize slotMemoryCost =
MemorySize::bytes(sizeof(void*) + 1);

public:
NodeHashSetWithMemoryLimit(detail::AllocationMemoryLeftThreadsafe memoryLeft)
: memoryLeft_{memoryLeft} {
// Once the hash set is initialized, calculate the initial memory
// used by the slots of the hash set
updateSlotArrayMemoryUsage();
}

~NodeHashSetWithMemoryLimit() { decreaseMemoryUsed(memoryUsed_); }

// Try to allocate the amount of memory requested
void increaseMemoryUsed(ad_utility::MemorySize amount) {
memoryLeft_.ptr()->wlock()->decrease_if_enough_left_or_throw(amount);

Review comment (Member):
This is a little wasteful, as we always have to obtain a global mutex.
I think for a follow-up (or preparation) PR, you could implement a wrapper around the memoryLeftThreadsafe object that stores a (single-threaded) small pool of memory and only goes to the global wlock() when that pool is exhausted.
Currently increase(1), increase(1), increase(1) needs a global synchronization for each of the three inserts, which seems wasteful.
(But as you have abstracted away all the memory handling, that is easy to integrate.)
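
A rough sketch of that pooling idea (hypothetical, not part of this PR); it reuses `decrease_if_enough_left_or_throw` from the code above and assumes `MemorySize` supports comparison, `+`, `+=`/`-=`, and a `kilobytes` constructor:

// Hypothetical wrapper (illustrative only): keeps a small local reserve so
// that most acquisitions do not touch the global lock; only a refill of the
// pool goes through wlock().
class PooledMemoryLeft {
  ad_utility::detail::AllocationMemoryLeftThreadsafe global_;
  ad_utility::MemorySize localPool_ = ad_utility::MemorySize::bytes(0);

 public:
  explicit PooledMemoryLeft(
      ad_utility::detail::AllocationMemoryLeftThreadsafe global)
      : global_{global} {}

  // Acquire `amount`, refilling the local pool with some headroom if needed.
  void decrease(ad_utility::MemorySize amount) {
    if (localPool_ < amount) {
      auto request = amount + ad_utility::MemorySize::kilobytes(64);
      global_.ptr()->wlock()->decrease_if_enough_left_or_throw(request);
      localPool_ += request;
    }
    localPool_ -= amount;
  }

  // Give `amount` back to the local pool (returning it globally, e.g. once
  // the pool grows large, is omitted here for brevity).
  void increase(ad_utility::MemorySize amount) { localPool_ += amount; }
};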

memoryUsed_ += amount;
}

// Decrease the amount of memory used
void decreaseMemoryUsed(ad_utility::MemorySize amount) {
memoryLeft_.ptr()->wlock()->increase(amount);
memoryUsed_ -= amount;
}

// Update the memory usage for the slot array if the bucket count changes.
// This function should be called after any operation that could cause
// rehashing. When the slot count increases, it reserves additional memory,
// and if the slot count decreases, it releases the unused memory back to the
// memory tracker.
void updateSlotArrayMemoryUsage() {
size_t newNumSlots = hashSet_.bucket_count();
if (newNumSlots != currentNumSlots_) {
if (newNumSlots > currentNumSlots_) {
ad_utility::MemorySize sizeIncrease =
slotMemoryCost * (newNumSlots - currentNumSlots_);
increaseMemoryUsed(sizeIncrease);
} else {
ad_utility::MemorySize sizeDecrease =
slotMemoryCost * (currentNumSlots_ - newNumSlots);

decreaseMemoryUsed(sizeDecrease);
}

}
currentNumSlots_ = newNumSlots;
}

// Insert an element into the hash set. If the memory limit is exceeded, the
// insert operation fails with a runtime error.
std::pair<typename HashSet::iterator, bool> insert(const T& value) {
MemorySize size =
sizeGetter_(value) + ad_utility::MemorySize::bytes(sizeof(T));
increaseMemoryUsed(size);

const auto& [it, wasInserted] = hashSet_.insert(value);

if (!wasInserted) {
decreaseMemoryUsed(size);
}

updateSlotArrayMemoryUsage();

Review comment (Member):
Technically we might violate the memory limit before this call.
And additionally, until this updateSlotArray... call happens, you keep holding too much memory.

So probably you have to do something like "check if inserting would cause a rehash" (needs some more intrinsic checking of abseil's mechanisms, i.e. when the rehashing threshold is and how much larger the hash table becomes afterwards, but it is doable).

Review comment (Member):
And for the correct counting of all the space for slots etc., you can just add an allocator to your node_hash_set that doesn't throw but just counts the bytes somewhere, so you can guess before the insert and check exactly after the insert. Such an allocator can also be very useful to analyze the memory usage of the node_hash_set when writing your code.
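
A hedged sketch of such a counting allocator (illustrative only; the names are made up):

#include <cstddef>
#include <memory>

// Hypothetical allocator: it never throws because of a memory limit, it only
// records how many bytes the container currently has allocated.
template <typename T>
struct CountingAllocator {
  using value_type = T;
  std::size_t* bytesAllocated_;  // counter owned by the surrounding class

  explicit CountingAllocator(std::size_t* counter) : bytesAllocated_{counter} {}
  template <typename U>
  CountingAllocator(const CountingAllocator<U>& other)
      : bytesAllocated_{other.bytesAllocated_} {}

  T* allocate(std::size_t n) {
    *bytesAllocated_ += n * sizeof(T);
    return std::allocator<T>{}.allocate(n);
  }
  void deallocate(T* p, std::size_t n) {
    *bytesAllocated_ -= n * sizeof(T);
    std::allocator<T>{}.deallocate(p, n);
  }

  template <typename U>
  bool operator==(const CountingAllocator<U>& other) const {
    return bytesAllocated_ == other.bytesAllocated_;
  }
};

// Used as absl::node_hash_set<T, HashFct, EqualElem, CountingAllocator<T>>,
// this makes the exact slot and node memory observable around each insert.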

Review comment (Member):
What about the following (easy but very effective) approach:

  1. Assume that each insert doubles the amount of memory that is occupied by the slot array.
  2. After the insert (same as after erase), revert back to the actual correct size.
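
A rough sketch of that approach as a hypothetical variant of `insert` (the node-size accounting and duplicate handling from the actual `insert` above are omitted for brevity):

// Hypothetical variant of insert(): charge memory pessimistically before the
// insert so a rehash can never silently exceed the limit, then settle on the
// real bucket count afterwards.
std::pair<typename HashSet::iterator, bool> insertPessimistic(const T& value) {
  // 1. Assume the insert doubles the slot array and reserve that much.
  MemorySize pessimisticSlotGrowth = slotMemoryCost * currentNumSlots_;
  increaseMemoryUsed(pessimisticSlotGrowth);

  // 2. Do the insert; a possible rehash has already been paid for.
  const auto& [it, wasInserted] = hashSet_.insert(value);

  // 3. Give back the pessimistic reservation and book the actual growth
  //    (or shrinkage) of the slot array.
  decreaseMemoryUsed(pessimisticSlotGrowth);
  updateSlotArrayMemoryUsage();
  return {it, wasInserted};
}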

return std::pair{it, wasInserted};
}

// _____________________________________________________________________________
void erase(const T& value) {
auto it = hashSet_.find(value);
if (it != hashSet_.end()) {
MemorySize size =
sizeGetter_(*it) + ad_utility::MemorySize::bytes(sizeof(T));

Review comment (Member) on lines +138 to +139:
Same question here as above: probably our size getter should only account for the additionally allocated memory, because everything else we can handle using the internal memory tracking of the absl classes.

hashSet_.erase(it);
decreaseMemoryUsed(size);
updateSlotArrayMemoryUsage();
}
}

// _____________________________________________________________________________
void clear() {
hashSet_.clear();
// Release all node memory
decreaseMemoryUsed(memoryUsed_);

// Update slot memory usage based on the new bucket count after clearing
size_t newNumSlots = hashSet_.bucket_count();
ad_utility::MemorySize slotMemoryAfterClear = slotMemoryCost * newNumSlots;
// After clearing it only tracks the slot memory as nodes are gone
increaseMemoryUsed(slotMemoryAfterClear);

currentNumSlots_ = newNumSlots;
}

// _____________________________________________________________________________
size_t size() const { return hashSet_.size(); }

// _____________________________________________________________________________
bool empty() const { return hashSet_.empty(); }

// _____________________________________________________________________________
size_t count(const T& value) const { return hashSet_.count(value); }

// _____________________________________________________________________________
HashSet::const_iterator find(const T& value) const {
return hashSet_.find(value);
}

// _____________________________________________________________________________
bool contains(const T& key) { return hashSet_.contains(key); }

// _____________________________________________________________________________
HashSet::const_iterator begin() const { return hashSet_.begin(); }

// _____________________________________________________________________________
HashSet::const_iterator end() const { return hashSet_.end(); }

// _____________________________________________________________________________
MemorySize getCurrentMemoryUsage() const { return memoryUsed_; }
};

} // namespace ad_utility
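
For illustration, a hedged usage sketch of the new `NodeHashSetWithMemoryLimit` (assuming the default `DefaultValueSizeGetter<std::string>` from `ValueSizeGetters.h` handles `std::string`, and that `MemorySize::megabytes` exists as a named constructor):

#include <string>

#include "util/HashSet.h"

// Illustrative only: a node hash set of strings whose tracked memory
// (nodes plus slot array) may not exceed 1 MB.
void exampleUsage() {
  auto memoryLeft = ad_utility::makeAllocationMemoryLeftThreadsafeObject(
      ad_utility::MemorySize::megabytes(1));
  ad_utility::NodeHashSetWithMemoryLimit<std::string> set{memoryLeft};

  set.insert("hello");  // charges node size + string capacity + slot memory
  set.insert("hello");  // duplicate: its charge is released again
  // An insert that would push the tracked memory past 1 MB makes
  // decrease_if_enough_left_or_throw() throw, and the element is not inserted.
}
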
2 changes: 2 additions & 0 deletions test/CMakeLists.txt
@@ -132,6 +132,8 @@ addLinkAndDiscoverTest(HashMapTest)

addLinkAndDiscoverTest(HashSetTest)

addLinkAndDiscoverTest(CustomHashSetWithMemoryLimitTest)

addLinkAndDiscoverTestSerial(GroupByTest engine)

addLinkAndDiscoverTest(VocabularyGeneratorTest index)