diff --git a/changes/bug34088 b/changes/bug34088 new file mode 100644 index 00000000000..172d890898f --- /dev/null +++ b/changes/bug34088 @@ -0,0 +1,4 @@ + o Minor bugfixes (circuit build timeout): + - Remove max_time calculation and associated warn from circuit build + timeout 'alpha' parameter estimation, as this is no longer needed + by our new estimator from 40168. Fixes bug 34088; bugfix on 0.2.2.9-alpha. diff --git a/changes/bug40168 b/changes/bug40168 new file mode 100644 index 00000000000..c52a0352c28 --- /dev/null +++ b/changes/bug40168 @@ -0,0 +1,16 @@ + o Minor bugfixes (circuit build timeout): + - Improve the accuracy of our circuit build timeout calculation for 60%, + 70%, and 80% build rates for various guard choices. We now use a maximum + likelihood estimator for Pareto parameters of the circuit build time + distribution, instead of a "right-censored estimator". This causes + clients to ignore circuits that never finish building in their timeout + calculations. Previously, clients were counting such unfinished circuits + as having the highest possible build time value, when in reality these + circuits most likely just contain relays that are offline. We also now + wait a bit longer to let circuits complete for measurement purposes, + lower the minimum possible effective timeout from 1.5 seconds to 10ms, + and increase the resolution of the circuit build time histogram from + 50ms bin widths to 10ms bin widths. Additionally, we alter our estimate + Xm by taking the maximum of the top 10 most common build time values + of the 10ms histogram, and compute Xm as the average of these. + Fixes bug 40168; bugfix on 0.2.2.14-alpha. diff --git a/src/core/or/circuitstats.c b/src/core/or/circuitstats.c index 2cde21fa1fa..e23b11ccf99 100644 --- a/src/core/or/circuitstats.c +++ b/src/core/or/circuitstats.c @@ -205,10 +205,10 @@ circuit_build_times_max_timeouts(void) * Retrieve and bounds-check the cbtnummodes consensus parameter. * * Effect: This value governs how many modes to use in the weighted - * average calculation of Pareto parameter Xm. A value of 3 introduces - * some bias (2-5% of CDF) under ideal conditions, but allows for better - * performance in the event that a client chooses guard nodes of radically - * different performance characteristics. + * average calculation of Pareto parameter Xm. Analysis of pairs of + * geographically near, far, and mixed guaeds has shown that a value of + * 10 introduces some allows for the actual timeout rate to be within + * 2-7% of the cutoff quantile, for quantiles between 60-80%. */ static int32_t circuit_build_times_default_num_xm_modes(void) @@ -853,58 +853,49 @@ circuit_build_times_create_histogram(const circuit_build_times_t *cbt, * Return the Pareto start-of-curve parameter Xm. * * Because we are not a true Pareto curve, we compute this as the - * weighted average of the N most frequent build time bins. N is either - * 1 if we don't have enough circuit build time data collected, or - * determined by the consensus parameter cbtnummodes (default 3). + * weighted average of the 10 most frequent build time bins. This + * heuristic allowed for the actual timeout rate to be closest + * to the chosen quantile cutoff, for quantiles 60-80%, out of + * many variant approaches (see #40157 for analysis). */ -static build_time_t +STATIC build_time_t circuit_build_times_get_xm(circuit_build_times_t *cbt) { - build_time_t i, nbins; + build_time_t nbins = 0; build_time_t *nth_max_bin; - int32_t bin_counts=0; - build_time_t ret = 0; - uint32_t *histogram = circuit_build_times_create_histogram(cbt, &nbins); - int n=0; + build_time_t xm_total = 0; + build_time_t Xm = 0; + int32_t xm_counts=0; int num_modes = circuit_build_times_default_num_xm_modes(); + uint32_t *histogram = circuit_build_times_create_histogram(cbt, &nbins); tor_assert(nbins > 0); tor_assert(num_modes > 0); - // Only use one mode if < 1000 buildtimes. Not enough data - // for multiple. - if (cbt->total_build_times < CBT_NCIRCUITS_TO_OBSERVE) - num_modes = 1; - nth_max_bin = tor_calloc(num_modes, sizeof(build_time_t)); - /* Determine the N most common build times */ - for (i = 0; i < nbins; i++) { - if (histogram[i] >= histogram[nth_max_bin[0]]) { - nth_max_bin[0] = i; - } - - for (n = 1; n < num_modes; n++) { - if (histogram[i] >= histogram[nth_max_bin[n]] && - (!histogram[nth_max_bin[n-1]] - || histogram[i] < histogram[nth_max_bin[n-1]])) { + /* Determine the N most common build times, by selecting the + * nth largest mode, counting it, and removing it from the histogram. */ + for (int n = 0; n < num_modes; n++) { + /* Get nth mode */ + for (build_time_t i = 0; i < nbins; i++) { + if (histogram[i] > histogram[nth_max_bin[n]]) { nth_max_bin[n] = i; } } - } - for (n = 0; n < num_modes; n++) { - bin_counts += histogram[nth_max_bin[n]]; - ret += CBT_BIN_TO_MS(nth_max_bin[n])*histogram[nth_max_bin[n]]; - log_info(LD_CIRC, "Xm mode #%d: %u %u", n, CBT_BIN_TO_MS(nth_max_bin[n]), - histogram[nth_max_bin[n]]); + /* Update average */ + xm_counts += histogram[nth_max_bin[n]]; + xm_total += CBT_BIN_TO_MS(nth_max_bin[n])*histogram[nth_max_bin[n]]; + + /* Prevent from re-counting this value */ + histogram[nth_max_bin[n]] = 0; } - /* bin_counts can become zero if all of our last CBT_NCIRCUITS_TO_OBSERVE + /* xm_counts can become zero if all of our last CBT_NCIRCUITS_TO_OBSERVE * circuits were abandoned before they completed. This shouldn't happen, * though. We should have reset/re-learned a lower timeout first. */ - if (bin_counts == 0) { - ret = 0; + if (xm_counts == 0) { log_warn(LD_CIRC, "No valid circuit build time data out of %d times, %u modes, " "have_timeout=%d, %lfms", cbt->total_build_times, num_modes, @@ -912,15 +903,13 @@ circuit_build_times_get_xm(circuit_build_times_t *cbt) goto done; } - tor_assert(bin_counts > 0); - - ret /= bin_counts; + Xm = xm_total / xm_counts; done: tor_free(histogram); tor_free(nth_max_bin); - return ret; + return Xm; } /** @@ -1009,43 +998,6 @@ circuit_build_times_shuffle_and_store_array(circuit_build_times_t *cbt, } } -/** - * Filter old synthetic timeouts that were created before the - * new right-censored Pareto calculation was deployed. - * - * Once all clients before 0.2.1.13-alpha are gone, this code - * will be unused. - */ -static int -circuit_build_times_filter_timeouts(circuit_build_times_t *cbt) -{ - int num_filtered=0, i=0; - double timeout_rate = 0; - build_time_t max_timeout = 0; - - timeout_rate = circuit_build_times_timeout_rate(cbt); - max_timeout = (build_time_t)cbt->close_ms; - - for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { - if (cbt->circuit_build_times[i] > max_timeout) { - build_time_t replaced = cbt->circuit_build_times[i]; - num_filtered++; - cbt->circuit_build_times[i] = CBT_BUILD_ABANDONED; - - log_debug(LD_CIRC, "Replaced timeout %d with %d", replaced, - cbt->circuit_build_times[i]); - } - } - - log_info(LD_CIRC, - "We had %d timeouts out of %d build times, " - "and filtered %d above the max of %u", - (int)(cbt->total_build_times*timeout_rate), - cbt->total_build_times, num_filtered, max_timeout); - - return num_filtered; -} - /** * Load histogram from state, shuffling the resulting array * after we do so. Use this result to estimate parameters and @@ -1171,10 +1123,6 @@ circuit_build_times_parse_state(circuit_build_times_t *cbt, circuit_build_times_set_timeout(cbt); - if (!state->CircuitBuildAbandonedCount && cbt->total_build_times) { - circuit_build_times_filter_timeouts(cbt); - } - done: tor_free(loaded_times); return err ? -1 : 0; @@ -1195,7 +1143,6 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt) build_time_t *x=cbt->circuit_build_times; double a = 0; int n=0,i=0,abandoned_count=0; - build_time_t max_time=0; /* http://en.wikipedia.org/wiki/Pareto_distribution#Parameter_estimation */ /* We sort of cheat here and make our samples slightly more pareto-like @@ -1215,14 +1162,13 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt) if (x[i] < cbt->Xm) { a += tor_mathlog(cbt->Xm); + n++; } else if (x[i] == CBT_BUILD_ABANDONED) { abandoned_count++; } else { a += tor_mathlog(x[i]); - if (x[i] > max_time) - max_time = x[i]; + n++; } - n++; } /* @@ -1231,30 +1177,23 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt) * performs this same check, and resets state if it hits it. If we * hit it at runtime, something serious has gone wrong. */ - if (n!=cbt->total_build_times) { + if (n!=cbt->total_build_times-abandoned_count) { log_err(LD_CIRC, "Discrepancy in build times count: %d vs %d", n, cbt->total_build_times); } - tor_assert(n==cbt->total_build_times); - - if (max_time <= 0) { - /* This can happen if Xm is actually the *maximum* value in the set. - * It can also happen if we've abandoned every single circuit somehow. - * In either case, tell the caller not to compute a new build timeout. */ - log_warn(LD_BUG, - "Could not determine largest build time (%d). " - "Xm is %dms and we've abandoned %d out of %d circuits.", max_time, - cbt->Xm, abandoned_count, n); - return 0; - } - - a += abandoned_count*tor_mathlog(max_time); + tor_assert_nonfatal(n==cbt->total_build_times-abandoned_count); + /* This is the "Maximum Likelihood Estimator" for parameter alpha of a Pareto + * Distribution. See: + * https://en.wikipedia.org/wiki/Pareto_distribution#Estimation_of_parameters + * + * The division in the estimator is done with subtraction outside the ln(), + * with the sum occurring in the for loop above. + * + * This done is to avoid the precision issues of logs of small values. + */ a -= n*tor_mathlog(cbt->Xm); - // Estimator comes from Eq #4 in: - // "Bayesian estimation based on trimmed samples from Pareto populations" - // by Arturo J. Fernández. We are right-censored only. - a = (n-abandoned_count)/a; + a = n/a; cbt->alpha = a; @@ -1662,9 +1601,8 @@ circuit_build_times_network_check_changed(circuit_build_times_t *cbt) log_notice(LD_CIRC, "Your network connection speed appears to have changed. Resetting " - "timeout to %lds after %d timeouts and %d buildtimes.", - tor_lround(cbt->timeout_ms/1000), timeout_count, - total_build_times); + "timeout to %ldms after %d timeouts and %d buildtimes.", + tor_lround(cbt->timeout_ms), timeout_count, total_build_times); return 1; } @@ -1830,7 +1768,7 @@ circuit_build_times_set_timeout(circuit_build_times_t *cbt) return; if (cbt->timeout_ms < circuit_build_times_min_timeout()) { - log_info(LD_CIRC, "Set buildtimeout to low value %fms. Setting to %dms", + log_notice(LD_CIRC, "Set buildtimeout to low value %fms. Setting to %dms", cbt->timeout_ms, circuit_build_times_min_timeout()); cbt->timeout_ms = circuit_build_times_min_timeout(); if (cbt->close_ms < cbt->timeout_ms) { @@ -1848,9 +1786,9 @@ circuit_build_times_set_timeout(circuit_build_times_t *cbt) log_info(LD_CIRC, "Based on %d circuit times, it looks like we don't need to " "wait so long for circuits to finish. We will now assume a " - "circuit is too slow to use after waiting %ld seconds.", + "circuit is too slow to use after waiting %ld milliseconds.", cbt->total_build_times, - tor_lround(cbt->timeout_ms/1000)); + tor_lround(cbt->timeout_ms)); log_info(LD_CIRC, "Circuit timeout data: %fms, %fms, Xm: %d, a: %f, r: %f", cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha, @@ -1859,18 +1797,18 @@ circuit_build_times_set_timeout(circuit_build_times_t *cbt) log_info(LD_CIRC, "Based on %d circuit times, it looks like we need to wait " "longer for circuits to finish. We will now assume a " - "circuit is too slow to use after waiting %ld seconds.", + "circuit is too slow to use after waiting %ld milliseconds.", cbt->total_build_times, - tor_lround(cbt->timeout_ms/1000)); + tor_lround(cbt->timeout_ms)); log_info(LD_CIRC, "Circuit timeout data: %fms, %fms, Xm: %d, a: %f, r: %f", cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha, timeout_rate); } else { log_info(LD_CIRC, - "Set circuit build timeout to %lds (%fms, %fms, Xm: %d, a: %f," + "Set circuit build timeout to %ldms (%fms, %fms, Xm: %d, a: %f," " r: %f) based on %d circuit times", - tor_lround(cbt->timeout_ms/1000), + tor_lround(cbt->timeout_ms), cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha, timeout_rate, cbt->total_build_times); } diff --git a/src/core/or/circuitstats.h b/src/core/or/circuitstats.h index 845d7b67229..cd5d1a0c603 100644 --- a/src/core/or/circuitstats.h +++ b/src/core/or/circuitstats.h @@ -55,10 +55,10 @@ void circuit_build_times_mark_circ_as_measurement_only(origin_circuit_t *circ); #define CBT_NCIRCUITS_TO_OBSERVE 1000 /** Width of the histogram bins in milliseconds */ -#define CBT_BIN_WIDTH ((build_time_t)50) +#define CBT_BIN_WIDTH ((build_time_t)10) /** Number of modes to use in the weighted-avg computation of Xm */ -#define CBT_DEFAULT_NUM_XM_MODES 3 +#define CBT_DEFAULT_NUM_XM_MODES 10 #define CBT_MIN_NUM_XM_MODES 1 #define CBT_MAX_NUM_XM_MODES 20 @@ -78,7 +78,7 @@ void circuit_build_times_mark_circ_as_measurement_only(origin_circuit_t *circ); * How long to wait before actually closing circuits that take too long to * build in terms of CDF quantile. */ -#define CBT_DEFAULT_CLOSE_QUANTILE 95 +#define CBT_DEFAULT_CLOSE_QUANTILE 99 #define CBT_MIN_CLOSE_QUANTILE CBT_MIN_QUANTILE_CUTOFF #define CBT_MAX_CLOSE_QUANTILE CBT_MAX_QUANTILE_CUTOFF @@ -119,8 +119,8 @@ double circuit_build_times_quantile_cutoff(void); #define CBT_MAX_TEST_FREQUENCY INT32_MAX /** Lowest allowable value for CircuitBuildTimeout in milliseconds */ -#define CBT_DEFAULT_TIMEOUT_MIN_VALUE (1500) -#define CBT_MIN_TIMEOUT_MIN_VALUE 500 +#define CBT_DEFAULT_TIMEOUT_MIN_VALUE (CBT_BIN_WIDTH) +#define CBT_MIN_TIMEOUT_MIN_VALUE CBT_BIN_WIDTH #define CBT_MAX_TIMEOUT_MIN_VALUE INT32_MAX /** Initial circuit build timeout in milliseconds */ @@ -142,6 +142,7 @@ STATIC void circuit_build_times_reset(circuit_build_times_t *cbt); /* Network liveness functions */ STATIC int circuit_build_times_network_check_changed( circuit_build_times_t *cbt); +STATIC build_time_t circuit_build_times_get_xm(circuit_build_times_t *cbt); #endif /* defined(CIRCUITSTATS_PRIVATE) */ #ifdef TOR_UNIT_TESTS diff --git a/src/test/test.c b/src/test/test.c index 58b468775cb..f4a37b0ac4d 100644 --- a/src/test/test.c +++ b/src/test/test.c @@ -57,6 +57,7 @@ #include "feature/stats/rephist.h" #include "app/config/statefile.h" #include "lib/crypt_ops/crypto_curve25519.h" +#include "feature/nodelist/networkstatus.h" #include "core/or/extend_info_st.h" #include "core/or/or_circuit_st.h" @@ -366,6 +367,106 @@ crypto_rand_deterministic_aes(char *out, size_t n) crypto_cipher_crypt_inplace(crypto_rand_aes_cipher, out, n); } +static int32_t cbtnummodes = 10; + +static int32_t +mock_xm_networkstatus_get_param( + const networkstatus_t *ns, const char *param_name, int32_t default_val, + int32_t min_val, int32_t max_val) +{ + (void)ns; + (void)default_val; + (void)min_val; + (void)max_val; + // only support cbtnummodes right now + tor_assert(strcmp(param_name, "cbtnummodes")==0); + return cbtnummodes; +} + +static void +test_circuit_timeout_xm_alpha(void *arg) +{ + circuit_build_times_t cbt; + build_time_t Xm; + int alpha_ret; + circuit_build_times_init(&cbt); + (void)arg; + + /* Plan: + * 1. Create array of build times with 10 modes. + * 2. Make sure Xm calc is sane for 1,3,5,10,15,20 modes. + * 3. Make sure alpha calc is sane for 1,3,5,10,15,20 modes. + */ + + /* 110 build times, 9 modes, 8 mode ties, 10 abandoned */ + build_time_t circuit_build_times[] = { + 100, 20, 1000, 500, 200, 5000, 30, 600, 200, 300, CBT_BUILD_ABANDONED, + 101, 21, 1001, 501, 201, 5001, 31, 601, 201, 301, CBT_BUILD_ABANDONED, + 102, 22, 1002, 502, 202, 5002, 32, 602, 202, 302, CBT_BUILD_ABANDONED, + 103, 23, 1003, 503, 203, 5003, 33, 603, 203, 303, CBT_BUILD_ABANDONED, + 104, 24, 1004, 504, 204, 5004, 34, 604, 204, 304, CBT_BUILD_ABANDONED, + 105, 25, 1005, 505, 205, 5005, 35, 605, 205, 305, CBT_BUILD_ABANDONED, + 106, 26, 1006, 506, 206, 5006, 36, 606, 206, 306, CBT_BUILD_ABANDONED, + 107, 27, 1007, 507, 207, 5007, 37, 607, 207, 307, CBT_BUILD_ABANDONED, + 108, 28, 1008, 508, 208, 5008, 38, 608, 208, 308, CBT_BUILD_ABANDONED, + 109, 29, 1009, 509, 209, 5009, 39, 609, 209, 309, CBT_BUILD_ABANDONED + }; + + memcpy(cbt.circuit_build_times, circuit_build_times, + sizeof(circuit_build_times)); + cbt.total_build_times = 110; + + MOCK(networkstatus_get_param, mock_xm_networkstatus_get_param); + +#define CBT_ALPHA_PRECISION 0.00001 + cbtnummodes = 1; + Xm = circuit_build_times_get_xm(&cbt); + alpha_ret = circuit_build_times_update_alpha(&cbt); + tt_int_op(alpha_ret, OP_EQ, 1); + tt_int_op(Xm, OP_EQ, 205); + tt_assert(fabs(cbt.alpha - 1.394401) < CBT_ALPHA_PRECISION); + + cbtnummodes = 3; + Xm = circuit_build_times_get_xm(&cbt); + alpha_ret = circuit_build_times_update_alpha(&cbt); + tt_int_op(alpha_ret, OP_EQ, 1); + tt_int_op(Xm, OP_EQ, 117); + tt_assert(fabs(cbt.alpha - 0.902313) < CBT_ALPHA_PRECISION); + + cbtnummodes = 5; + Xm = circuit_build_times_get_xm(&cbt); + alpha_ret = circuit_build_times_update_alpha(&cbt); + tt_int_op(alpha_ret, OP_EQ, 1); + tt_int_op(Xm, OP_EQ, 146); + tt_assert(fabs(cbt.alpha - 1.049032) < CBT_ALPHA_PRECISION); + + cbtnummodes = 10; + Xm = circuit_build_times_get_xm(&cbt); + alpha_ret = circuit_build_times_update_alpha(&cbt); + tt_int_op(alpha_ret, OP_EQ, 1); + tt_int_op(Xm, OP_EQ, 800); + tt_assert(fabs(cbt.alpha - 4.851754) < CBT_ALPHA_PRECISION); + + cbtnummodes = 15; + Xm = circuit_build_times_get_xm(&cbt); + alpha_ret = circuit_build_times_update_alpha(&cbt); + tt_int_op(alpha_ret, OP_EQ, 1); + tt_int_op(Xm, OP_EQ, 800); + tt_assert(fabs(cbt.alpha - 4.851754) < CBT_ALPHA_PRECISION); + + cbtnummodes = 20; + Xm = circuit_build_times_get_xm(&cbt); + alpha_ret = circuit_build_times_update_alpha(&cbt); + tt_int_op(alpha_ret, OP_EQ, 1); + tt_int_op(Xm, OP_EQ, 800); + tt_assert(fabs(cbt.alpha - 4.851754) < CBT_ALPHA_PRECISION); + + done: +#undef CBT_ALPHA_PRECISION + UNMOCK(networkstatus_get_param); + circuit_build_times_free_timeouts(&cbt); +} + static void test_circuit_timeout(void *arg) { @@ -384,7 +485,6 @@ test_circuit_timeout(void *arg) double timeout1, timeout2; or_state_t *state=NULL; int i, runs; - double close_ms; (void)arg; initialize_periodic_events(); @@ -406,18 +506,11 @@ test_circuit_timeout(void *arg) circuit_build_times_initial_alpha(&initial, CBT_DEFAULT_QUANTILE_CUTOFF/100.0, timeout0); - close_ms = MAX(circuit_build_times_calculate_timeout(&initial, - CBT_DEFAULT_CLOSE_QUANTILE/100.0), - CBT_DEFAULT_TIMEOUT_INITIAL_VALUE); do { for (i=0; i < CBT_DEFAULT_MIN_CIRCUITS_TO_OBSERVE; i++) { build_time_t sample = circuit_build_times_generate_sample(&initial,0,1); - if (sample > close_ms) { - circuit_build_times_add_time(&estimate, CBT_BUILD_ABANDONED); - } else { - circuit_build_times_add_time(&estimate, sample); - } + circuit_build_times_add_time(&estimate, sample); } circuit_build_times_update_alpha(&estimate); timeout1 = circuit_build_times_calculate_timeout(&estimate, @@ -823,6 +916,7 @@ static struct testcase_t test_array[] = { { "ntor_handshake", test_ntor_handshake, 0, NULL, NULL }, { "fast_handshake", test_fast_handshake, 0, NULL, NULL }, FORK(circuit_timeout), + FORK(circuit_timeout_xm_alpha), FORK(rend_fns), FORK(stats),