From 386fed7e482a6fb8e249d9f3bdb75e6c6aeaff03 Mon Sep 17 00:00:00 2001 From: Jialiang Tan Date: Mon, 9 Dec 2024 18:51:10 -0800 Subject: [PATCH] fix: Fix dead lock in MemoryReclaimer (#11806) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/11806 Fixed a deadlock caused by MemoryReclaimer. We skip adding non-reclaimable pools when reclaiming. This can cause the skipped pool's destructor to be triggered in some cases, causing deadlock. This change fixed it Reviewed By: kevinwilfong, xiaoxmeng Differential Revision: D66994269 fbshipit-source-id: 2735f394e971f2a8eded6238e923ee378e240f09 --- velox/common/memory/MemoryArbitrator.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/velox/common/memory/MemoryArbitrator.cpp b/velox/common/memory/MemoryArbitrator.cpp index 769630bf8bd3..a98ecd395788 100644 --- a/velox/common/memory/MemoryArbitrator.cpp +++ b/velox/common/memory/MemoryArbitrator.cpp @@ -241,15 +241,24 @@ uint64_t MemoryReclaimer::reclaim( std::shared_ptr pool; int64_t reclaimableBytes; }; + + // NOTE: We hold candidate reference for non-reclaimable pools as well. This + // is to make sure child shared pointer is stored to keep child alive, + // avoiding destruction of child pool within below parents' 'poolMutex_' lock. + // Otherwise a double acquisition of 'poolMutex_' can happen in destructor, + // which creates deadlock. + std::vector nonReclaimableCandidates; std::vector candidates; { std::shared_lock guard{pool->poolMutex_}; candidates.reserve(pool->children_.size()); + nonReclaimableCandidates.reserve(pool->children_.size()); for (auto& entry : pool->children_) { auto child = entry.second.lock(); if (child != nullptr) { const auto reclaimableBytesOpt = child->reclaimableBytes(); if (!reclaimableBytesOpt.has_value()) { + nonReclaimableCandidates.push_back(Candidate{std::move(child), 0}); continue; } candidates.push_back(Candidate{