From 7b584eca8a36dea3d82338343fd522981245cb47 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Mon, 17 Jun 2024 11:14:26 +0300
Subject: [PATCH 1/7] fair_queue: Scale fixed-point factor

The value is used to convert request tokens from a floating-point number
into some "sane" integer. Currently, on a ~200k IOPS, ~2GB/s disk it
produces the following token values for requests of different sizes:

  512:    150k
  1024:   160k
  2048:   170k
  ...
  131072: 1.4M

These values are pretty large and, when accumulated even in a 64-bit
counter, can overflow it on a time scale of months. The current code
sort of accounts for that by checking for the overflow and resetting the
counters, but in the future counters will need to be reset on different
shards, and that's going to be problematic.

This patch reduces the factor 8 times, so that the costs are now:

  512:    19k
  1024:   20k
  2048:   21k
  ...
  131072: 170k

That's much more friendly to accumulating counters (the overflow now
happens on a time scale of years, which is pretty comfortable). Reducing
the factor even further is problematic, here's why. In order to provide
cross-class fairness the costs are divided by class shares for
accumulation. Given a class with 1000 shares, a 512-byte request would
become indistinguishable from a 1k one with a smaller factor.

That said, even with the new factor it's worth taking more care when
dividing the cost by shares: use round-up division.

Signed-off-by: Pavel Emelyanov
---
 include/seastar/core/fair_queue.hh | 2 +-
 src/core/fair_queue.cc             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/seastar/core/fair_queue.hh b/include/seastar/core/fair_queue.hh
index 274a680c..cd6e6dad 100644
--- a/include/seastar/core/fair_queue.hh
+++ b/include/seastar/core/fair_queue.hh
@@ -189,7 +189,7 @@ public:
      * time period for which the speeds from F (in above formula) are taken.
      */

-    static constexpr float fixed_point_factor = float(1 << 24);
+    static constexpr float fixed_point_factor = float(1 << 21);
     using rate_resolution = std::milli;
     using token_bucket_t = internal::shared_token_bucket;

diff --git a/src/core/fair_queue.cc b/src/core/fair_queue.cc
index 57e9171a..a5ea2888 100644
--- a/src/core/fair_queue.cc
+++ b/src/core/fair_queue.cc
@@ -378,7 +378,7 @@ void fair_queue::dispatch_requests(std::function<void(fair_queue_entry&)> cb) {
         // has chances to be translated into zero cost which, in turn, will make the
         // class show no progress and monopolize the queue.
         auto req_cap = req._capacity;
-        auto req_cost = std::max(req_cap / h._shares, (capacity_t)1);
+        auto req_cost = std::max((req_cap + h._shares - 1) / h._shares, (capacity_t)1);
         // signed overflow check to make push_priority_class_from_idle math work
         if (h._accumulated >= std::numeric_limits<signed_capacity_t>::max() - req_cost) {
             for (auto& pc : _priority_classes) {
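The effect of the round-up division can be seen in a standalone sketch
(illustration only, not Seastar code; the token values are the ones quoted
above for the 1 << 21 factor):

    // cost.cc -- compare truncating vs round-up division of a request's
    // cost by class shares, as dispatch_requests() does after this patch
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    using capacity_t = uint64_t;

    int main() {
        const capacity_t shares = 1000;  // a high-shares class
        const capacity_t caps[] = {19000, 20000, 170000};  // ~512B, ~1k, ~128k requests

        for (capacity_t cap : caps) {
            capacity_t trunc = std::max(cap / shares, capacity_t(1));
            capacity_t rup = std::max((cap + shares - 1) / shares, capacity_t(1));
            std::printf("cap=%ju truncated=%ju rounded-up=%ju\n",
                        (uintmax_t)cap, (uintmax_t)trunc, (uintmax_t)rup);
        }
        return 0;
    }

Truncation systematically under-charges classes whose per-request cost is
not a multiple of their shares; rounding up keeps the accumulated error
bounded at one token per request.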
From 7f1ade152f2b388eb149150b2010db929005079d Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Mon, 17 Jun 2024 11:02:22 +0300
Subject: [PATCH 2/7] fair_queue tests: Remember it is time-based

The current fair queue tests try to make the queue submit requests in an
extremely controllable way -- one by one. However, the fair queue
nowadays is driven by a rated token bucket and is very sensitive to time
and durations. It's better to teach the test to accept the fact that it
cannot control fair-queue request submission at per-request granularity,
and to tune its accounting instead.

The change affects two places.

Main loop. Before the change it called fair_queue::dispatch_requests()
as many times as the number of requests the test case wants to pass,
then performed the necessary checks. Now the method is called
indefinitely, and the handler only processes the requested number of
requests; the rest are ignored.

Drain. Before the change it called dispatch_requests() in a loop for as
long as it dispatched anything. Now it's called in a loop until the fair
queue explicitly reports that it's empty.

Signed-off-by: Pavel Emelyanov
---
 tests/unit/fair_queue_test.cc | 37 ++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/tests/unit/fair_queue_test.cc b/tests/unit/fair_queue_test.cc
index bbcf5f79..12ca5aec 100644
--- a/tests/unit/fair_queue_test.cc
+++ b/tests/unit/fair_queue_test.cc
@@ -75,7 +75,16 @@ class test_env {
     }

     void drain() {
-        do {} while (tick() != 0);
+        while (true) {
+            if (tick()) {
+                continue;
+            }
+            auto ts = _fq.next_pending_aio();
+            if (ts == std::chrono::steady_clock::time_point::max()) {
+                break;
+            }
+            sleep(ts - std::chrono::steady_clock::now()).get();
+        }
     }
 public:
     test_env(unsigned capacity)
@@ -90,27 +99,27 @@ class test_env {
     // Because of this property, one useful use of tick() is to implement a drain()
     // method (see above) in which all requests currently sent to the queue are drained
     // before the queue is destroyed.
-    unsigned tick(unsigned n = 1) {
+    unsigned tick(unsigned n = 0) {
         unsigned processed = 0;
-        _fg.replenish_capacity(_fg.replenished_ts() + std::chrono::microseconds(1));
-        _fq.dispatch_requests([] (fair_queue_entry& ent) {
-            boost::intrusive::get_parent_from_member(&ent, &request::fqent)->submit();
-        });
+        while (true) {
+            _fg.replenish_capacity(_fg.replenished_ts() + std::chrono::microseconds(1));
+            _fq.dispatch_requests([] (fair_queue_entry& ent) {
+                boost::intrusive::get_parent_from_member(&ent, &request::fqent)->submit();
+            });

-        for (unsigned i = 0; i < n; ++i) {
             std::vector<request> curr;
             curr.swap(_inflight);

             for (auto& req : curr) {
-                processed++;
-                _results[req.index]++;
+                if (processed < n) {
+                    _results[req.index]++;
+                }
                 _fq.notify_request_finished(req.fqent.capacity());
+                processed++;
+            }
+
+            if (processed >= n) {
+                break;
             }
-
-            _fg.replenish_capacity(_fg.replenished_ts() + std::chrono::microseconds(1));
-            _fq.dispatch_requests([] (fair_queue_entry& ent) {
-                boost::intrusive::get_parent_from_member(&ent, &request::fqent)->submit();
-            });
         }
         return processed;
     }
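The loop shape the test now relies on can be shown with a self-contained
toy model (the sim_queue type is hypothetical, not the Seastar API): a
token-bucket-driven queue dispatches whatever the replenished tokens
allow, so a caller can only loop and count completions, never force one
submission per call.

    // sim.cc -- polling a rate-driven queue: each poll replenishes some
    // tokens and dispatches whatever fits, completing 0, 1, or more requests
    #include <cstdio>
    #include <deque>

    struct sim_queue {
        std::deque<unsigned> pending;  // queued request sizes, in tokens
        unsigned tokens = 0;

        void replenish(unsigned t) { tokens += t; }

        // Dispatch everything the current token balance allows.
        unsigned dispatch() {
            unsigned done = 0;
            while (!pending.empty() && pending.front() <= tokens) {
                tokens -= pending.front();
                pending.pop_front();
                done++;
            }
            return done;
        }
    };

    int main() {
        sim_queue q;
        for (int i = 0; i < 5; i++) {
            q.pending.push_back(3);
        }

        unsigned processed = 0, target = 5;
        while (processed < target) {   // the test's new main-loop shape
            q.replenish(2);            // time-based replenishment per poll
            processed += q.dispatch(); // may complete 0, 1, or more
        }
        std::printf("processed %u requests\n", processed);
        return 0;
    }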
From 223e0c6be78d65bff75583203f8972ea1ed3e360 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Thu, 13 Jun 2024 15:22:06 +0300
Subject: [PATCH 3/7] fair_queue: Define signed_capacity_t type in fair_group

For convenience.

Signed-off-by: Pavel Emelyanov
---
 include/seastar/core/fair_queue.hh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/seastar/core/fair_queue.hh b/include/seastar/core/fair_queue.hh
index cd6e6dad..67c1496b 100644
--- a/include/seastar/core/fair_queue.hh
+++ b/include/seastar/core/fair_queue.hh
@@ -108,6 +108,7 @@ public:
     // a 'normalized' form -- converted from floating-point to fixed-point number
     // and scaled accrding to fair-group's token-bucket duration
     using capacity_t = uint64_t;
+    using signed_capacity_t = std::make_signed<capacity_t>::type;
     friend class fair_queue;
 private:
@@ -138,6 +139,7 @@ public:
 class fair_group {
 public:
     using capacity_t = fair_queue_entry::capacity_t;
+    using signed_capacity_t = fair_queue_entry::signed_capacity_t;
     using clock_type = std::chrono::steady_clock;

     /*
@@ -301,7 +303,7 @@ public:
     using class_id = unsigned int;
     class priority_class_data;
     using capacity_t = fair_group::capacity_t;
-    using signed_capacity_t = std::make_signed<capacity_t>::type;
+    using signed_capacity_t = fair_queue_entry::signed_capacity_t;
 private:
     using clock_type = std::chrono::steady_clock;
From bcf6d8653fa901686bf3aaae924e0292d41d0138 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Thu, 13 Jun 2024 16:33:58 +0300
Subject: [PATCH 4/7] fair_queue: Introduce group-wide capacity balancing

On each shard, classes compete with each other by accumulating the sum
of the request costs that have been dispatched from them so far. The
cost is the request capacity divided by the class shares. The dispatch
loop then selects the class with the smallest accumulated value, thus
providing shares-aware fairness -- the larger the shares value, the
slower the accumulator grows, and the more requests are picked from the
class for dispatch.

This patch implements a similar approach across shards. For that, each
shard accumulates the dispatched cost from all classes, and the IO group
keeps a vector of the accumulated costs, one entry per shard. When a
shard wants to dispatch, it first checks whether it has run too far
ahead of all the other shards, and if it has, it skips the dispatch
loop.

Corner case -- when a queue gets drained, it "withdraws" itself from
other shards' decisions by advancing its group counter to infinity.
Conversely, when a queue comes back, it may fast-forward its accumulator
so as not to gain too large an advantage over the other shards.

When scheduling classes, a shard has exclusive access to them and uses a
logarithmic-complexity heap to pick the one with the smallest
consumption counter. Cross-shard balancing cannot afford that. Instead,
each shard manipulates its own counter only, and to compare it with the
other shards' it scans the whole vector, which is not very
cache-friendly and is race-prone.

Signed-off-by: Pavel Emelyanov
---
 include/seastar/core/fair_queue.hh | 27 ++++++++++++++++
 src/core/fair_queue.cc             | 50 ++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/include/seastar/core/fair_queue.hh b/include/seastar/core/fair_queue.hh
index 67c1496b..3dbe0f04 100644
--- a/include/seastar/core/fair_queue.hh
+++ b/include/seastar/core/fair_queue.hh
@@ -219,8 +219,20 @@ private:
     token_bucket_t _token_bucket;
     const capacity_t _per_tick_threshold;

+    // Capacities accumulated by queues in this group. Each queue tries not
+    // to run too far ahead of the others; if it does, it skips the dispatch
+    // loop until the next tick, in the hope that other shards will grab the
+    // unused disk capacity and move their counters forward.
+    std::vector<capacity_t> _balance;
+
 public:
+    // Maximum value the _balance entry can get.
+    // It's also set when a queue goes idle and doesn't need to participate
+    // in accumulated races. This value is still suitable for comparisons
+    // of "active" queues
+    static constexpr capacity_t max_balance = std::numeric_limits<signed_capacity_t>::max();
+
     // Convert internal capacity value back into the real token
     static double capacity_tokens(capacity_t cap) noexcept {
         return (double)cap / fixed_point_factor / token_bucket_t::rate_cast(std::chrono::seconds(1)).count();
@@ -268,6 +280,10 @@ public:
     }

     const token_bucket_t& token_bucket() const noexcept { return _token_bucket; }
+
+    capacity_t current_balance() const noexcept;
+    void update_balance(capacity_t) noexcept;
+    void reset_balance() noexcept;
 };

 /// \brief Fair queuing class
@@ -333,6 +349,15 @@ private:
     std::vector<std::unique_ptr<priority_class_data>> _priority_classes;
     size_t _nr_classes = 0;
     capacity_t _last_accumulated = 0;
+    capacity_t _total_accumulated = 0;
+    // Maximum capacity that a queue can stay behind other shards
+    //
+    // This is similar to the priority classes' fall-back deviation and is
+    // calculated as the number of capacity points a group with 1 share
+    // accumulates over tau
+    //
+    // Check the max_deviation math in push_priority_class_from_idle()
+    const capacity_t _max_imbalance;

     /*
      * When the shared capacity os over the local queue delays
@@ -363,6 +388,8 @@ private:
     enum class grab_result { grabbed, cant_preempt, pending };
     grab_result grab_capacity(const fair_queue_entry& ent) noexcept;
     grab_result grab_pending_capacity(const fair_queue_entry& ent) noexcept;
+
+    bool balanced() noexcept;
 public:
     /// Constructs a fair queue with configuration parameters \c cfg.
     ///
diff --git a/src/core/fair_queue.cc b/src/core/fair_queue.cc
index a5ea2888..51cfd29f 100644
--- a/src/core/fair_queue.cc
+++ b/src/core/fair_queue.cc
@@ -48,6 +48,8 @@ module seastar;

 namespace seastar {

+logger fq_log("fair_queue");
+
 static_assert(sizeof(fair_queue_ticket) == sizeof(uint64_t), "unexpected fair_queue_ticket size");
 static_assert(sizeof(fair_queue_entry) <= 3 * sizeof(void*), "unexpected fair_queue_entry::_hook size");
 static_assert(sizeof(fair_queue_entry::container_list_t) == 2 * sizeof(void*), "unexpected priority_class::_queue size");
@@ -108,6 +110,7 @@ fair_group::fair_group(config cfg, unsigned nr_queues)
         tokens_capacity(cfg.min_tokens)
     )
     , _per_tick_threshold(_token_bucket.limit() / nr_queues)
+    , _balance(smp::count, max_balance)
 {
     if (cfg.rate_factor * fixed_point_factor > _token_bucket.max_rate) {
         throw std::runtime_error("Fair-group rate_factor is too large");
@@ -141,6 +144,21 @@ auto fair_group::capacity_deficiency(capacity_t from) const noexcept -> capacity_t {
     return _token_bucket.deficiency(from);
 }

+auto fair_group::current_balance() const noexcept -> capacity_t {
+    return *std::min_element(_balance.begin(), _balance.end());
+}
+
+void fair_group::update_balance(capacity_t acc) noexcept {
+    _balance[this_shard_id()] = acc;
+}
+
+void fair_group::reset_balance() noexcept {
+    // Request cost can be up to half a million. Given a 100K IOPS disk and a
+    // class with 1 share, the 64-bit accumulating counter would overflow
+    // once in a few years.
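A stripped-down model of the balancing idea (names like group_balance and
idle_balance are illustrative, not the patch's API) shows the whole
mechanism: publish your accumulator into a shared vector, scan for the
minimum, and dispatch only while you are within the allowed imbalance of
the slowest active shard.

    // balance.cc -- single-threaded sketch of group-wide balancing
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <limits>
    #include <vector>

    using capacity_t = uint64_t;

    // Idle shards park at a value that never wins the min-scan.
    constexpr capacity_t idle_balance = std::numeric_limits<capacity_t>::max();

    struct group_balance {
        std::vector<capacity_t> per_shard;

        explicit group_balance(unsigned shards) : per_shard(shards, idle_balance) {}

        capacity_t min() const {
            return *std::min_element(per_shard.begin(), per_shard.end());
        }
    };

    // True when this shard may dispatch more work. Note that the real patch
    // guards min() + max_imbalance against overflow at construction time
    // (the "Too large tau parameter" check).
    bool balanced(const group_balance& g, capacity_t my_accumulated, capacity_t max_imbalance) {
        return my_accumulated <= g.min() + max_imbalance;
    }

    int main() {
        group_balance g(2);
        g.per_shard[0] = 1000;  // shard 0 has dispatched 1000 cost units
        g.per_shard[1] = 100;   // shard 1 lags behind
        // Shard 0 must skip its dispatch loop: 1000 > 100 + 500.
        std::printf("shard 0 may dispatch: %s\n",
                    balanced(g, g.per_shard[0], 500) ? "yes" : "no");
        return 0;
    }

In the real queue the vector is shared between shards and each shard
writes only its own slot, which is why the commit message calls the
min-scan cache-unfriendly and race-prone.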
+    on_internal_error_noexcept(fq_log, "cannot reset group balance");
+}
+
 // Priority class, to be used with a given fair_queue
 class fair_queue::priority_class_data {
     friend class fair_queue;
@@ -169,7 +187,11 @@ fair_queue::fair_queue(fair_group& group, config cfg)
     : _config(std::move(cfg))
     , _group(group)
     , _group_replenish(clock_type::now())
+    , _max_imbalance(fair_group::fixed_point_factor * fair_group::token_bucket_t::rate_cast(_config.tau).count())
 {
+    if (fair_group::max_balance > std::numeric_limits<capacity_t>::max() - _max_imbalance) {
+        throw std::runtime_error("Too large tau parameter");
+    }
 }

 fair_queue::fair_queue(fair_queue&& other)
@@ -181,6 +203,8 @@ fair_queue::fair_queue(fair_queue&& other)
     , _handles(std::move(other._handles))
     , _priority_classes(std::move(other._priority_classes))
     , _last_accumulated(other._last_accumulated)
+    , _total_accumulated(other._total_accumulated)
+    , _max_imbalance(other._max_imbalance)
 {
 }
@@ -210,6 +234,12 @@ void fair_queue::push_priority_class_from_idle(priority_class_data& pc) noexcept
     // over signed maximum (see overflow check below)
     pc._accumulated = std::max(_last_accumulated - max_deviation, pc._accumulated);
     _handles.assert_enough_capacity();
+    if (_handles.empty()) {
+        capacity_t balance = _group.current_balance();
+        if (balance != fair_group::max_balance) {
+            _total_accumulated = std::max(balance - _max_imbalance, _total_accumulated);
+        }
+    }
     _handles.push(&pc);
     pc._queued = true;
 }
@@ -346,10 +376,23 @@ fair_queue::clock_type::time_point fair_queue::next_pending_aio() const noexcept
     return std::chrono::steady_clock::time_point::max();
 }

+bool fair_queue::balanced() noexcept {
+    capacity_t balance = _group.current_balance();
+    if (_total_accumulated > balance + _max_imbalance) {
+        return false;
+    }
+
+    return true;
+}
+
 void fair_queue::dispatch_requests(std::function<void(fair_queue_entry&)> cb) {
     capacity_t dispatched = 0;
     boost::container::small_vector preempt;

+    if (!balanced()) {
+        return;
+    }
+
     while (!_handles.empty() && (dispatched < _group.per_tick_grab_threshold())) {
         priority_class_data& h = *_handles.top();
         if (h._queue.empty() || !h._plugged) {
@@ -392,6 +435,7 @@ void fair_queue::dispatch_requests(std::function<void(fair_queue_entry&)> cb) {
             }
             _last_accumulated = 0;
         }
+        _total_accumulated += req_cost;
         h._accumulated += req_cost;
         h._pure_accumulated += req_cap;
         dispatched += req_cap;
@@ -406,6 +450,12 @@ void fair_queue::dispatch_requests(std::function<void(fair_queue_entry&)> cb) {
     for (auto&& h : preempt) {
         push_priority_class(*h);
     }
+
+    if (_total_accumulated >= fair_group::max_balance) [[unlikely]] {
+        _group.reset_balance();
+        _total_accumulated = 0;
+    }
+    _group.update_balance(_handles.empty() ? fair_group::max_balance : _total_accumulated);
 }

 std::vector fair_queue::metrics(class_id c) {

From 2520e9c34909dfaea06ffa244141d28d5cd41328 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Thu, 13 Jun 2024 15:23:30 +0300
Subject: [PATCH 5/7] fair_queue: Drop per-dispatch-loop threshold

The value was used to limit the number of requests one shard is allowed
to dispatch in one poll, preventing it from consuming the whole capacity
in one go so that other shards get their portion. Group-wide balancing
(previous patch) made this fuse obsolete.
Signed-off-by: Pavel Emelyanov
---
 apps/io_tester/ioinfo.cc           | 1 -
 include/seastar/core/fair_queue.hh | 2 --
 src/core/fair_queue.cc             | 5 +----
 3 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/apps/io_tester/ioinfo.cc b/apps/io_tester/ioinfo.cc
index d23f265e..d2393e6e 100644
--- a/apps/io_tester/ioinfo.cc
+++ b/apps/io_tester/ioinfo.cc
@@ -75,7 +75,6 @@ int main(int ac, char** av) {
     out << YAML::EndMap;

     const auto& fg = internal::get_fair_group(ioq, internal::io_direction_and_length::write_idx);
-    out << YAML::Key << "per_tick_grab_threshold" << YAML::Value << fg.per_tick_grab_threshold();

     const auto& tb = fg.token_bucket();
     out << YAML::Key << "token_bucket" << YAML::BeginMap;
diff --git a/include/seastar/core/fair_queue.hh b/include/seastar/core/fair_queue.hh
index 3dbe0f04..570c4437 100644
--- a/include/seastar/core/fair_queue.hh
+++ b/include/seastar/core/fair_queue.hh
@@ -217,7 +217,6 @@ private:
      */
     token_bucket_t _token_bucket;
-    const capacity_t _per_tick_threshold;

     // Capacities accumulated by queues in this group. Each queue tries not
@@ -266,7 +265,6 @@ public:
     fair_group(fair_group&&) = delete;

     capacity_t maximum_capacity() const noexcept { return _token_bucket.limit(); }
-    capacity_t per_tick_grab_threshold() const noexcept { return _per_tick_threshold; }
     capacity_t grab_capacity(capacity_t cap) noexcept;
     clock_type::time_point replenished_ts() const noexcept { return _token_bucket.replenished_ts(); }
     void replenish_capacity(clock_type::time_point now) noexcept;
diff --git a/src/core/fair_queue.cc b/src/core/fair_queue.cc
index 51cfd29f..16fc05d8 100644
--- a/src/core/fair_queue.cc
+++ b/src/core/fair_queue.cc
@@ -109,7 +109,6 @@ fair_group::fair_group(config cfg, unsigned nr_queues)
         std::max(cfg.rate_factor * fixed_point_factor * token_bucket_t::rate_cast(cfg.rate_limit_duration).count(), tokens_capacity(cfg.limit_min_tokens)),
         tokens_capacity(cfg.min_tokens)
     )
-    , _per_tick_threshold(_token_bucket.limit() / nr_queues)
     , _balance(smp::count, max_balance)
 {
     if (cfg.rate_factor * fixed_point_factor > _token_bucket.max_rate) {
@@ -386,14 +385,13 @@ bool fair_queue::balanced() noexcept {
 }

 void fair_queue::dispatch_requests(std::function<void(fair_queue_entry&)> cb) {
-    capacity_t dispatched = 0;
     boost::container::small_vector preempt;

     if (!balanced()) {
         return;
     }

-    while (!_handles.empty() && (dispatched < _group.per_tick_grab_threshold())) {
+    while (!_handles.empty()) {
         priority_class_data& h = *_handles.top();
         if (h._queue.empty() || !h._plugged) {
             pop_priority_class(h);
@@ -438,7 +436,6 @@ void fair_queue::dispatch_requests(std::function<void(fair_queue_entry&)> cb) {
         _total_accumulated += req_cost;
         h._accumulated += req_cost;
         h._pure_accumulated += req_cap;
-        dispatched += req_cap;

         cb(req);

From 7819c7a187f78eca2f5927b9e20206b35acd2cd1 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Thu, 20 Jun 2024 18:27:58 +0300
Subject: [PATCH 6/7] fair_queue: Amortize cross-shard balance checking

Looking at the group balance counters is not very lightweight and is
better avoided when possible. For that, when balance is achieved, arm a
timer for a quiescent period and skip checks until it expires. When the
group is not balanced, check the balance more frequently.
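The back-off policy reads naturally as a small state machine (a sketch
under assumed names; the patch itself performs the doubling in a timer
callback rather than inline, but the behavior is the same):

    // backoff.cc -- amortized balance checking: skip the expensive scan
    // while the quiescent timer is armed; grow the quiescent period
    // (capped at tau) while balance holds, reset it when balance is lost
    #include <algorithm>
    #include <chrono>

    using namespace std::chrono;

    struct balance_checker {
        static constexpr microseconds min_quiescent{100};
        microseconds tau{100 * 1000};  // cap; the patch uses config.tau
        microseconds quiescent = min_quiescent;
        steady_clock::time_point armed_until{};

        bool balanced(steady_clock::time_point now, bool scan_says_balanced) {
            if (now < armed_until) {
                return true;  // timer still armed: trust the last check
            }
            if (!scan_says_balanced) {
                quiescent = min_quiescent;  // imbalance: frequent checks again
                return false;
            }
            quiescent = std::min(quiescent * 2, tau);  // assumption held: back off
            armed_until = now + quiescent;
            return true;
        }
    };

    int main() {
        balance_checker bc;
        auto now = steady_clock::now();
        bc.balanced(now, true);                  // check passed: arm the timer
        return bc.balanced(now, false) ? 0 : 1;  // still armed: returns true
    }

The doubling-with-cap shape means a persistently balanced group converges
to one vector scan per tau, while any detected imbalance immediately
drops back to scanning every 100 microseconds.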
Signed-off-by: Pavel Emelyanov
---
 include/seastar/core/fair_queue.hh |  9 +++++++++
 src/core/fair_queue.cc             | 13 +++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/include/seastar/core/fair_queue.hh b/include/seastar/core/fair_queue.hh
index 570c4437..f470756d 100644
--- a/include/seastar/core/fair_queue.hh
+++ b/include/seastar/core/fair_queue.hh
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include <seastar/core/timer.hh>
 #include
 #include
@@ -348,6 +349,14 @@ private:
     size_t _nr_classes = 0;
     capacity_t _last_accumulated = 0;
     capacity_t _total_accumulated = 0;
+
+    // Amortize balance checking by assuming that once balance is achieved,
+    // it will remain so for the "quiescent" duration. Increase this
+    // duration every time the assumption holds, but not to more than
+    // tau. When the balance is lost, reset back to frequent checks.
+    static constexpr auto minimal_quiscent_duration = std::chrono::microseconds(100);
+    std::chrono::microseconds _balance_quiscent_duration = minimal_quiscent_duration;
+    timer<> _balance_timer;
     // Maximum capacity that a queue can stay behind other shards
     //
     // This is similar to the priority classes' fall-back deviation and is
diff --git a/src/core/fair_queue.cc b/src/core/fair_queue.cc
index 16fc05d8..9c14a88b 100644
--- a/src/core/fair_queue.cc
+++ b/src/core/fair_queue.cc
@@ -186,6 +186,10 @@ fair_queue::fair_queue(fair_group& group, config cfg)
     : _config(std::move(cfg))
     , _group(group)
     , _group_replenish(clock_type::now())
+    , _balance_timer([this] {
+        auto new_qd = _balance_quiscent_duration * 2;
+        _balance_quiscent_duration = std::min(new_qd, _config.tau);
+    })
     , _max_imbalance(fair_group::fixed_point_factor * fair_group::token_bucket_t::rate_cast(_config.tau).count())
 {
     if (fair_group::max_balance > std::numeric_limits<capacity_t>::max() - _max_imbalance) {
@@ -203,6 +207,8 @@ fair_queue::fair_queue(fair_queue&& other)
     , _priority_classes(std::move(other._priority_classes))
     , _last_accumulated(other._last_accumulated)
     , _total_accumulated(other._total_accumulated)
+    , _balance_quiscent_duration(other._balance_quiscent_duration)
+    , _balance_timer(std::move(other._balance_timer))
     , _max_imbalance(other._max_imbalance)
 {
 }
@@ -376,11 +382,17 @@ fair_queue::clock_type::time_point fair_queue::next_pending_aio() const noexcept
 }

 bool fair_queue::balanced() noexcept {
+    if (_balance_timer.armed()) {
+        return true;
+    }
+
     capacity_t balance = _group.current_balance();
     if (_total_accumulated > balance + _max_imbalance) {
+        _balance_quiscent_duration = minimal_quiscent_duration;
         return false;
     }

+    _balance_timer.arm(_balance_quiscent_duration);
     return true;
 }
@@ -451,6 +463,7 @@ void fair_queue::dispatch_requests(std::function<void(fair_queue_entry&)> cb) {
     if (_total_accumulated >= fair_group::max_balance) [[unlikely]] {
         _group.reset_balance();
         _total_accumulated = 0;
+        _balance_quiscent_duration = minimal_quiscent_duration;
     }
     _group.update_balance(_handles.empty() ? fair_group::max_balance : _total_accumulated);
 }

From fb850f9757792498166fb2608e5f1f5b229947ac Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Mon, 17 Jun 2024 13:27:55 +0300
Subject: [PATCH 7/7] test: Add manual test for cross-shard balancing

It's pretty long, so it is not meant for automatic execution.

2-shard tests:

{'shard_0': {'iops': 88204.3828, 'shares': 100}, 'shard_1': {'iops': 89686.5156, 'shares': 100}}
IOPS ratio 1.02, expected 1.0, deviation 1%

{'shard_0': {'iops': 60321.3125, 'shares': 100}, 'shard_1': {'iops': 117566.406, 'shares': 200}}
IOPS ratio 1.95, expected 2.0, deviation 2%

{'shard_0': {'iops': 37326.2422, 'shares': 100}, 'shard_1': {'iops': 140555.062, 'shares': 400}}
IOPS ratio 3.77, expected 4.0, deviation 5%

{'shard_0': {'iops': 21547.6152, 'shares': 100}, 'shard_1': {'iops': 156309.891, 'shares': 800}}
IOPS ratio 7.25, expected 8.0, deviation 9%

3-shard tests:

{'shard_0': {'iops': 45211.9336, 'shares': 100}, 'shard_1': {'iops': 45211.9766, 'shares': 100}, 'shard_2': {'iops': 87412.9453, 'shares': 200}}
shard-1 IOPS ratio 1.0, expected 1.0, deviation 0%
shard-2 IOPS ratio 1.93, expected 2.0, deviation 3%

{'shard_0': {'iops': 30992.2188, 'shares': 100}, 'shard_1': {'iops': 30992.2812, 'shares': 100}, 'shard_2': {'iops': 115887.609, 'shares': 400}}
shard-1 IOPS ratio 1.0, expected 1.0, deviation 0%
shard-2 IOPS ratio 3.74, expected 4.0, deviation 6%

{'shard_0': {'iops': 19279.6348, 'shares': 100}, 'shard_1': {'iops': 19279.6934, 'shares': 100}, 'shard_2': {'iops': 139316.828, 'shares': 800}}
shard-1 IOPS ratio 1.0, expected 1.0, deviation 0%
shard-2 IOPS ratio 7.23, expected 8.0, deviation 9%

{'shard_0': {'iops': 26505.9082, 'shares': 100}, 'shard_1': {'iops': 53011.9922, 'shares': 200}, 'shard_2': {'iops': 98369.4453, 'shares': 400}}
shard-1 IOPS ratio 2.0, expected 2.0, deviation 0%
shard-2 IOPS ratio 3.71, expected 4.0, deviation 7%

{'shard_0': {'iops': 17461.8145, 'shares': 100}, 'shard_1': {'iops': 34923.8438, 'shares': 200}, 'shard_2': {'iops': 125470.43, 'shares': 800}}
shard-1 IOPS ratio 2.0, expected 2.0, deviation 0%
shard-2 IOPS ratio 7.19, expected 8.0, deviation 10%

{'shard_0': {'iops': 14812.3037, 'shares': 100}, 'shard_1': {'iops': 58262, 'shares': 400}, 'shard_2': {'iops': 104794.633, 'shares': 800}}
shard-1 IOPS ratio 3.93, expected 4.0, deviation 1%
shard-2 IOPS ratio 7.07, expected 8.0, deviation 11%

Signed-off-by: Pavel Emelyanov
---
 tests/manual/iosched-smp.py | 167 ++++++++++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100755 tests/manual/iosched-smp.py

diff --git a/tests/manual/iosched-smp.py b/tests/manual/iosched-smp.py
new file mode 100755
index 00000000..a9b58ee4
--- /dev/null
+++ b/tests/manual/iosched-smp.py
@@ -0,0 +1,167 @@
+#!/bin/env python3
+
+import os.path
+import yaml
+import shutil
+import subprocess
+import argparse
+from functools import reduce
+
+parser = argparse.ArgumentParser(description='IO scheduler tester')
+parser.add_argument('--directory', help='Directory to run on', default='/mnt')
+parser.add_argument('--seastar-build-dir', help='Path to seastar build directory', default='./build/dev/', dest='bdir')
+parser.add_argument('--duration', help='One run duration', default=60)
+parser.add_argument('--test', help='Test name to run')
+args = parser.parse_args()
+
+
+class iotune:
+    def __init__(self, args):
+        self._iotune = args.bdir + '/apps/iotune/iotune'
+        self._dir = args.directory
+
+    def ensure_io_properties(self):
+        if os.path.exists('io_properties.yaml'):
+            print('Using existing io_properties file')
+        else:
+            print('Running iotune')
+            subprocess.check_call([self._iotune, '--evaluation-directory', self._dir, '-c1', '--properties-file', 'io_properties.yaml'])
+
+
+class ioinfo:
+    def __init__(self, args):
+        self._ioinfo = args.bdir + '/apps/io_tester/ioinfo'
+        self._dir = args.directory
+        res = subprocess.check_output([self._ioinfo, '--directory', self._dir, '--io-properties-file', 'io_properties.yaml'])
+        self._info = yaml.safe_load(res)
+
+    def min_read_length(self):
+        return 4
+
+    def max_read_length(self):
+        return min(self._info['disk_read_max_length'] / 1024, 128)
+
+    def min_write_length(self):
+        return 4
+
+    def max_write_length(self):
+        return min(self._info['disk_write_max_length'] / 1024, 64)
+
+
+class job:
+    def __init__(self, typ, req_size_kb, prl, shards, shares, rps=None):
+        self._typ = typ
+        self._req_size = req_size_kb
+        self._prl = prl
+        self._shares = shares
+        self._shards = shards
+        self._rps = rps
+
+    def to_conf_entry(self, name):
+        ret = {
+            'name': name,
+            'shards': self._shards,
+            'type': self._typ,
+            'shard_info': {
+                'parallelism': self._prl,
+                'reqsize': f'{self._req_size}kB',
+                'shares': self._shares
+            }
+        }
+        if self._rps is not None:
+            ret['shard_info']['rps'] = self._rps
+        return ret
+
+    def shares(self):
+        return self._shares
+
+
+class io_tester:
+    def __init__(self, args, smp):
+        self._jobs = []
+        self._duration = args.duration
+        self._io_tester = args.bdir + '/apps/io_tester/io_tester'
+        self._dir = args.directory
+        self._use_fraction = 0.1
+        self._smp = smp
+
+    def add_job(self, name, job):
+        self._jobs.append(job.to_conf_entry(name))
+
+    def _setup_data_sizes(self):
+        du = shutil.disk_usage(self._dir)
+        one_job_space_mb = int(du.free * self._use_fraction / len(self._jobs) / (100*1024*1024)) * 100  # round down to 100MB
+        for j in self._jobs:
+            j['data_size'] = f'{one_job_space_mb}MB'
+
+    def run(self):
+        if not self._jobs:
+            raise RuntimeError('Empty jobs')
+
+        self._setup_data_sizes()
+        yaml.dump(self._jobs, open('conf.yaml', 'w'))
+        self._proc = subprocess.Popen([self._io_tester, '--storage', self._dir, f'-c{self._smp}', '--num-io-groups', '1', '--conf', 'conf.yaml', '--duration', f'{self._duration}', '--io-properties-file', 'io_properties.yaml'], stdout=subprocess.PIPE)
+        res = self._proc.communicate()
+        res = res[0].split(b'---\n')[1]
+        return yaml.safe_load(res)
+
+
+def run_jobs(jobs, args, smp):
+    iot = io_tester(args, smp)
+    results = {}
+    for j in jobs:
+        iot.add_job(j, jobs[j])
+        results[j] = { 'iops': 0, 'shares': jobs[j].shares() }
+
+    out = iot.run()
+    statuses = {}
+
+    for j in results:
+        for shard in out:
+            if j in shard:
+                results[j]['iops'] += shard[j]['IOPS']
+
+    return results
+
+
+iotune(args).ensure_io_properties()
+ioinf = ioinfo(args)
+
+
+def run_shares_balance_test():
+    for s in [ 100, 200, 400, 800 ]:
+        def ratios(res, idx):
+            shares_ratio = float(res[f'shard_{idx}']['shares']) / float(res['shard_0']['shares'])
+            iops_ratio = float(res[f'shard_{idx}']['iops']) / float(res['shard_0']['iops'])
+            return (shares_ratio, iops_ratio)
+
+        res = run_jobs({
+            'shard_0': job('randread', ioinf.min_read_length(), 16, ['0'], 100),
+            'shard_1': job('randread', ioinf.min_read_length(), 24, ['1'], s),
+        }, args, 2)
+        print(f'{res}')
+        shares_ratio, iops_ratio = ratios(res, 1)
+        print(f'IOPS ratio {iops_ratio:.3}, expected {shares_ratio:.3}, deviation {int(abs(shares_ratio - iops_ratio)*100.0/shares_ratio)}%')
+
+        for s2 in [ 200, 400, 800 ]:
+            if s2 <= s:
+                continue
+
+            res = run_jobs({
+                'shard_0': job('randread', ioinf.min_read_length(), 16, ['0'], 100),
+                'shard_1': job('randread', ioinf.min_read_length(), 24, ['1'], s),
+                'shard_2': job('randread', ioinf.min_read_length(), 32, ['2'], s2),
+            }, args, 3)
+            print(f'{res}')
+            shares_ratio, iops_ratio = ratios(res, 1)
+            print(f'shard-1 IOPS ratio {iops_ratio:.3}, expected {shares_ratio:.3}, deviation {int(abs(shares_ratio - iops_ratio)*100.0/shares_ratio)}%')
+            shares_ratio, iops_ratio = ratios(res, 2)
+            print(f'shard-2 IOPS ratio {iops_ratio:.3}, expected {shares_ratio:.3}, deviation {int(abs(shares_ratio - iops_ratio)*100.0/shares_ratio)}%')
+
+
+def run_rps_balance_test():
+    for rps in [ 1000, 2000, 4000 ]:
+        res = run_jobs({
+            'shard_0': job('randread', ioinf.min_read_length(), 1, ['0'], 100, rps=1000),
+            'shard_1': job('randread', ioinf.min_read_length(), 1, ['1'], 100, rps=rps),
+        }, args, 2)
+        print(f'iops = {res["shard_0"]["iops"]}:{res["shard_1"]["iops"]} want 1000:{rps}')
+
+
+run_shares_balance_test()
+run_rps_balance_test()
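For reference, given the argparse defaults defined at the top of the
script, a typical invocation looks like this (paths depend on the local
setup):

    ./tests/manual/iosched-smp.py --directory /mnt --seastar-build-dir ./build/dev/

The iotune calibration results are cached in io_properties.yaml in the
working directory, so only the first run pays the calibration cost.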