From 8d1c55748b43a2e0d0c31e6d33a37e08041473e3 Mon Sep 17 00:00:00 2001 From: Differential Privacy Team Date: Fri, 30 Jul 2021 03:42:53 -0700 Subject: [PATCH] Add quantile trees and deprecate order statistics in C++ C++: - Add an interface for multiple quantiles, supported by Quantile Trees - Mark OrderStatistics as deprecated in favor of the new quantile implementation Go & Privacy on Beam: - Various fixes and improvements for CI with the `go` tool Privacy on Beam: - Improve instructions for depending on the library with Bazel - Fix typos in DistinctPerKey tests Accounting: - Add a function to conveniently create PLD for Discrete Laplace, Discrete Gaussian and Gaussian Mechanisms GitOrigin-RevId: 958acb2199902126d3d333dd8d1da150fa281911 Change-Id: I89de361be98dcc6da9675d5730f5b06bb7c59a31 --- .github/workflows/go.yml | 47 ++ README.md | 4 +- cc/accounting/BUILD | 2 + cc/accounting/privacy_loss_distribution.cc | 34 + cc/accounting/privacy_loss_distribution.h | 69 ++ .../privacy_loss_distribution_test.cc | 302 ++++++- cc/accounting/privacy_loss_mechanism.cc | 26 +- cc/accounting/privacy_loss_mechanism.h | 12 +- cc/accounting/privacy_loss_mechanism_test.cc | 11 +- cc/algorithms/BUILD | 55 ++ cc/algorithms/internal/BUILD | 45 + cc/algorithms/internal/count-tree.cc | 134 +++ cc/algorithms/internal/count-tree.h | 116 +++ cc/algorithms/internal/count-tree_test.cc | 178 ++++ cc/algorithms/order-statistics.h | 27 +- cc/algorithms/quantile-tree.h | 329 ++++++++ cc/algorithms/quantile-tree_test.cc | 793 ++++++++++++++++++ cc/algorithms/quantiles.h | 209 +++++ cc/algorithms/quantiles_test.cc | 401 +++++++++ cc/base/testing/BUILD | 1 + cc/base/testing/proto_matchers.h | 1 + cc/docs/algorithms/order-statistics.md | 3 + cc/docs/algorithms/quantiles.md | 167 ++++ cc/testing/BUILD | 50 ++ cc/testing/quantile_tree_dp_test.cc | 168 ++++ cc/testing/statistical_tests_utils.cc | 190 +++++ cc/testing/statistical_tests_utils.h | 116 +++ cc/testing/statistical_tests_utils_test.cc | 299 
+++++++ examples/go/main/main.go | 2 +- privacy-on-beam/README.md | 54 +- .../pbeam/distinct_per_key_test.go | 28 +- proto/testing/BUILD | 8 + 32 files changed, 3773 insertions(+), 108 deletions(-) create mode 100644 .github/workflows/go.yml create mode 100644 cc/algorithms/internal/BUILD create mode 100644 cc/algorithms/internal/count-tree.cc create mode 100644 cc/algorithms/internal/count-tree.h create mode 100644 cc/algorithms/internal/count-tree_test.cc create mode 100644 cc/algorithms/quantile-tree.h create mode 100644 cc/algorithms/quantile-tree_test.cc create mode 100644 cc/algorithms/quantiles.h create mode 100644 cc/algorithms/quantiles_test.cc create mode 100644 cc/docs/algorithms/quantiles.md create mode 100644 cc/testing/quantile_tree_dp_test.cc create mode 100644 cc/testing/statistical_tests_utils.cc create mode 100644 cc/testing/statistical_tests_utils.h create mode 100644 cc/testing/statistical_tests_utils_test.cc diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml new file mode 100644 index 00000000..fef8474d --- /dev/null +++ b/.github/workflows/go.yml @@ -0,0 +1,47 @@ +name: Go + +on: + push: + branches-ignore: + # Version 1.0 does not work with the "go" tool properly. + - "1.0" + pull_request: + branches-ignore: + # Version 1.0 does not work with the "go" tool properly. + - "1.0" + schedule: + # Every Thursday at 1PM UTC + - cron: "0 13 * * 4" + +jobs: + + build: + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: 1.16 + + - name: Build go + run: go build -mod=mod -v ./... + working-directory: ./go + + - name: Test go + run: go test -mod=mod -v ./... + working-directory: ./go + + - name: Build examples/go + run: go build -mod=mod -v ./... + working-directory: ./examples/go + + - name: Build privacy-on-beam + run: go build -mod=mod -v ./... 
+ working-directory: ./privacy-on-beam + + - name: Test privacy-on-beam + run: go test -mod=mod -v ./... + working-directory: ./privacy-on-beam diff --git a/README.md b/README.md index 66ec23d3..ceb332ce 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,8 @@ Currently, the DP building block libraries support the following algorithms: | Count | Supported | Supported | Supported | | Sum | Supported | Supported | Supported | | Mean | Supported | Supported | Supported | -| Variance | Supported | Planned | Planned | -| Standard deviation | Supported | Planned | Planned | +| Variance | Supported | Supported | Planned | +| Standard deviation | Supported | Supported | Planned | | Quantiles | Supported | Supported | Supported | | Automatic bounds approximation | Supported | Planned | Planned | | Truncated geometric thresholding | Supported | Supported | Supported | diff --git a/cc/accounting/BUILD b/cc/accounting/BUILD index 252c4d34..04806caa 100644 --- a/cc/accounting/BUILD +++ b/cc/accounting/BUILD @@ -47,6 +47,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/status", "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", "@com_google_cc_differential_privacy//base:status", "@com_google_cc_differential_privacy//base:statusor", ], @@ -81,6 +82,7 @@ cc_test( "//accounting/common:test_util", "@com_google_differential_privacy//proto/accounting:privacy_loss_distribution_cc_proto", "@com_google_absl//absl/status", + "@com_google_absl//absl/types:optional", "@com_google_cc_differential_privacy//base:statusor", ], ) diff --git a/cc/accounting/privacy_loss_distribution.cc b/cc/accounting/privacy_loss_distribution.cc index 6ef9fe34..8d697164 100644 --- a/cc/accounting/privacy_loss_distribution.cc +++ b/cc/accounting/privacy_loss_distribution.cc @@ -22,6 +22,7 @@ #include "absl/strings/str_format.h" #include "accounting/common/common.h" #include "accounting/convolution.h" +#include 
"accounting/privacy_loss_mechanism.h" #include "proto/accounting/privacy-loss-distribution.pb.h" #include "base/status_macros.h" @@ -204,6 +205,39 @@ PrivacyLossDistribution::CreateForLaplaceMechanism( discretization_interval); } +base::StatusOr> +PrivacyLossDistribution::CreateForDiscreteLaplaceMechanism( + double parameter, int sensitivity, EstimateType estimate_type, + double discretization_interval) { + ASSIGN_OR_RETURN(std::unique_ptr privacy_loss, + DiscreteLaplacePrivacyLoss::Create(parameter, sensitivity)); + return CreateForAdditiveNoise(*privacy_loss, estimate_type, + discretization_interval); +} + +base::StatusOr> +PrivacyLossDistribution::CreateForGaussianMechanism( + double standard_deviation, double sensitivity, EstimateType estimate_type, + double discretization_interval, double mass_truncation_bound) { + ASSIGN_OR_RETURN( + std::unique_ptr privacy_loss, + GaussianPrivacyLoss::Create(standard_deviation, sensitivity, + estimate_type, mass_truncation_bound)); + return CreateForAdditiveNoise(*privacy_loss, estimate_type, + discretization_interval); +} + +base::StatusOr> +PrivacyLossDistribution::CreateForDiscreteGaussianMechanism( + double sigma, int sensitivity, EstimateType estimate_type, + double discretization_interval, absl::optional truncation_bound) { + ASSIGN_OR_RETURN(std::unique_ptr privacy_loss, + DiscreteGaussianPrivacyLoss::Create(sigma, sensitivity, + truncation_bound)); + return CreateForAdditiveNoise(*privacy_loss, estimate_type, + discretization_interval); +} + std::unique_ptr PrivacyLossDistribution::CreateForPrivacyParameters( EpsilonDelta epsilon_delta, double discretization_interval) { diff --git a/cc/accounting/privacy_loss_distribution.h b/cc/accounting/privacy_loss_distribution.h index ea27c54b..8a51272f 100644 --- a/cc/accounting/privacy_loss_distribution.h +++ b/cc/accounting/privacy_loss_distribution.h @@ -22,6 +22,7 @@ #include "absl/container/flat_hash_map.h" #include "base/statusor.h" +#include "absl/types/optional.h" 
#include "accounting/common/common.h" #include "accounting/privacy_loss_mechanism.h" #include "proto/accounting/privacy-loss-distribution.pb.h" @@ -122,12 +123,80 @@ class PrivacyLossDistribution { // estimate_type: kPessimistic denoting that the rounding is done in such a // way that the resulting epsilon-hockey stick divergence computation gives // an upper estimate to the real value. + // discretization_interval: the length of the discretization interval for the + // privacy loss distribution. The values will be rounded up/down to be + // integer multiples of this number. static base::StatusOr> CreateForLaplaceMechanism( double parameter, double sensitivity = 1, EstimateType estimate_type = EstimateType::kPessimistic, double discretization_interval = 1e-4); + // Creates {@link PrivacyLossDistribution} for the Discrete Laplace mechanism. + // + // parameter: the parameter of the Discrete Laplace distribution. + // sensitivity: the sensitivity of function f. (i.e. the maximum absolute + // change in f when an input to a single user changes.) + // estimate_type: kPessimistic denoting that the rounding is done in such a + // way that the resulting epsilon-hockey stick divergence computation gives + // an upper estimate to the real value. + // discretization_interval: the length of the discretization interval for the + // privacy loss distribution. The values will be rounded up/down to be + // integer multiples of this number. + static base::StatusOr> + CreateForDiscreteLaplaceMechanism( + double parameter, int sensitivity = 1, + EstimateType estimate_type = EstimateType::kPessimistic, + double discretization_interval = 1e-4); + + // Creates {@link PrivacyLossDistribution} for the Gaussian mechanism. + // + // standard_deviation: the standard deviation of the Gaussian distribution. + // sensitivity: the sensitivity of function f. (i.e. the maximum absolute + // change in f when an input to a single user changes.)
+ // estimate_type: kPessimistic denoting that the rounding is done in such a + // way that the resulting epsilon-hockey stick divergence computation gives + // an upper estimate to the real value. + // discretization_interval: the length of the discretization interval for the + // privacy loss distribution. The values will be rounded up/down to be + // integer multiples of this number. + // mass_truncation_bound: the natural log of the probability mass that might + // be discarded from the noise distribution. The larger this number, the + // more error it may introduce in divergence calculations. + static base::StatusOr> + CreateForGaussianMechanism( + double standard_deviation, double sensitivity = 1, + EstimateType estimate_type = EstimateType::kPessimistic, + double discretization_interval = 1e-4, + double mass_truncation_bound = -50); + + // Creates {@link PrivacyLossDistribution} for the Discrete Gaussian mechanism. + // + // sigma: the parameter of the discrete Gaussian distribution. Note that + // unlike the (continuous) Gaussian distribution this is not equal to the + // standard deviation of the noise. + // sensitivity: the sensitivity of function f. (i.e. the maximum absolute + // change in f when an input to a single user changes.) + // estimate_type: kPessimistic denoting that the rounding is done in such a + // way that the resulting epsilon-hockey stick divergence computation gives + // an upper estimate to the real value. + // discretization_interval: the length of the discretization interval for the + // privacy loss distribution. The values will be rounded up/down to be + // integer multiples of this number. + // mass_truncation_bound: the natural log of the probability mass that might + // be discarded from the noise distribution. The larger this number, the + // more error it may introduce in divergence calculations. + // truncation_bound: bound for truncating the noise, i.e. the noise will only + // have a support in [-truncation_bound, truncation_bound].
When not set, + // truncation_bound will be chosen in such a way that the mass of the noise + // outside of this range is at most 1e-30. + static base::StatusOr> + CreateForDiscreteGaussianMechanism( + double sigma, int sensitivity = 1, + EstimateType estimate_type = EstimateType::kPessimistic, + double discretization_interval = 1e-4, + absl::optional truncation_bound = absl::nullopt); + // Creates {@link PrivacyLossDistribution} from epsilon and delta parameters. // // When the mechanism is (epsilon, delta)-differentially private, the diff --git a/cc/accounting/privacy_loss_distribution_test.cc b/cc/accounting/privacy_loss_distribution_test.cc index 665e41ad..d749af1c 100644 --- a/cc/accounting/privacy_loss_distribution_test.cc +++ b/cc/accounting/privacy_loss_distribution_test.cc @@ -568,8 +568,11 @@ TEST_P(CreateForPrivacyParameters, Create) { } struct DiscreteLaplacePrivacyLossDistributionParam { + std::string test_name; double parameter; double sensitivity; + EstimateType estimate_type; + double discretization_interval; ProbabilityMassFunction expected_pmf; }; @@ -577,50 +580,80 @@ class DiscreteLaplacePrivacyLossDistribution : public testing::TestWithParam< DiscreteLaplacePrivacyLossDistributionParam> {}; -INSTANTIATE_TEST_SUITE_P(DiscreteLaplacePrivacyLossDistributionParam, - DiscreteLaplacePrivacyLossDistribution, - Values( - DiscreteLaplacePrivacyLossDistributionParam{ - .parameter = 1.0, - .sensitivity = 1, - .expected_pmf = {{1, 0.73105858}, - {-1, 0.26894142}}}, - DiscreteLaplacePrivacyLossDistributionParam{ - .parameter = 1.0, - .sensitivity = 2, - .expected_pmf = {{2, 0.73105858}, - {0, 0.17000340}, - {-2, 0.09893802}}}, - DiscreteLaplacePrivacyLossDistributionParam{ - .parameter = 0.8, - .sensitivity = 2, - .expected_pmf = {{2, 0.68997448}, - {0, 0.17072207}, - {-1, 0.13930345}}}, - DiscreteLaplacePrivacyLossDistributionParam{ - .parameter = 0.8, - .sensitivity = 3, - .expected_pmf = {{3, 0.68997448}, - {1, 0.17072207}, - {0, 0.07671037}, - {-2, 
0.06259307}}})); +INSTANTIATE_TEST_SUITE_P( + DiscreteLaplacePrivacyLossDistributionParam, + DiscreteLaplacePrivacyLossDistribution, + Values( + DiscreteLaplacePrivacyLossDistributionParam{ + .test_name = "basic", + .parameter = 1.0, + .sensitivity = 1, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 1, + .expected_pmf = {{1, 0.73105858}, {-1, 0.26894142}}}, + DiscreteLaplacePrivacyLossDistributionParam{ + .test_name = "varying_sensitivity", + .parameter = 1.0, + .sensitivity = 2, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 1, + .expected_pmf = {{2, 0.73105858}, + {0, 0.17000340}, + {-2, 0.09893802}}}, + DiscreteLaplacePrivacyLossDistributionParam{ + .test_name = "varying_sensitivity_and_parameter", + .parameter = 0.8, + .sensitivity = 2, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 1, + .expected_pmf = {{2, 0.68997448}, + {0, 0.17072207}, + {-1, 0.13930345}}}, + DiscreteLaplacePrivacyLossDistributionParam{ + .test_name = "varying_sensitivity_and_parameter_2", + .parameter = 0.8, + .sensitivity = 3, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 1, + .expected_pmf = {{3, 0.68997448}, + {1, 0.17072207}, + {0, 0.07671037}, + {-2, 0.06259307}}}, + DiscreteLaplacePrivacyLossDistributionParam{ + .test_name = "varying_discretization_interval", + .parameter = 1, + .sensitivity = 1, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 0.03, + .expected_pmf = {{34, 0.73105858}, {-33, 0.26894142}}}, + DiscreteLaplacePrivacyLossDistributionParam{ + .test_name = "optimistic", + .parameter = 1, + .sensitivity = 1, + .estimate_type = EstimateType::kOptimistic, + .discretization_interval = 0.03, + .expected_pmf = {{33, 0.73105858}, {-34, 0.26894142}}}), + [](const ::testing::TestParamInfo< + DiscreteLaplacePrivacyLossDistribution::ParamType>& info) { + return info.param.test_name; + }); TEST_P(DiscreteLaplacePrivacyLossDistribution, 
Create) { DiscreteLaplacePrivacyLossDistributionParam param = GetParam(); - base::StatusOr> - noise_privacy_loss = DiscreteLaplacePrivacyLoss::Create( - /*parameter=*/param.parameter, - /*sensitivity=*/param.sensitivity); - ASSERT_OK(noise_privacy_loss); + base::StatusOr> pld = + PrivacyLossDistribution::CreateForDiscreteLaplaceMechanism( + /*parameter=*/param.parameter, /*sensitivity=*/param.sensitivity, + /*estimate_type=*/param.estimate_type, + /*discretization_interval=*/param.discretization_interval); - std::unique_ptr pld = - PrivacyLossDistribution::CreateForAdditiveNoise( - *noise_privacy_loss.value(), EstimateType::kPessimistic, - /*discretization_interval=*/1); + ASSERT_OK(pld); - EXPECT_THAT(pld->Pmf(), PMFIsNear(param.expected_pmf)); - EXPECT_NEAR(pld->InfinityMass(), 0, kMaxError); + EXPECT_DOUBLE_EQ((*pld)->InfinityMass(), 0); + EXPECT_DOUBLE_EQ((*pld)->DiscretizationInterval(), + param.discretization_interval); + EXPECT_EQ((*pld)->GetEstimateType(), param.estimate_type); + EXPECT_THAT((*pld)->Pmf(), PMFIsNear(param.expected_pmf)); } struct LaplacePrivacyLossDistributionParam { @@ -706,6 +739,199 @@ TEST_P(LaplacePrivacyLossDistribution, CreatePLD) { EXPECT_THAT((*pld)->Pmf(), PMFIsNear(param.expected_pmf)); } +struct GaussianPrivacyLossDistributionParam { + std::string test_name; + double standard_deviation; + double sensitivity; + EstimateType estimate_type; + double discretization_interval; + ProbabilityMassFunction expected_pmf; + double expected_infinity_mass; +}; + +class GaussianPrivacyLossDistribution + : public testing::TestWithParam {}; + +INSTANTIATE_TEST_SUITE_P( + GaussianPrivacyLossDistributionParam, GaussianPrivacyLossDistribution, + Values( + GaussianPrivacyLossDistributionParam{ + .test_name = "basic", + .standard_deviation = 1, + .sensitivity = 1, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 1, + .expected_pmf = {{2, 0.12447741}, {1, 0.38292492}, {0, 0.30853754}}, + .expected_infinity_mass = 0.18406013, 
// = CDF_normal(-0.9) + }, + GaussianPrivacyLossDistributionParam{ + .test_name = "varying_standard_deviation_and_sensitivity", + .standard_deviation = 3, + .sensitivity = 6, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 1, + .expected_pmf = {{4, 0.12447741}, + {3, 0.19146246}, + {2, 0.19146246}, + {1, 0.30853754}}, + .expected_infinity_mass = 0.18406013, // = CDF_normal(-0.9) + }, + GaussianPrivacyLossDistributionParam{ + .test_name = "varying_discretization_interval", + .standard_deviation = 1, + .sensitivity = 1, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 0.3, + .expected_pmf = {{5, 0.05790353}, + {4, 0.10261461}, + {3, 0.11559390}, + {2, 0.11908755}, + {1, 0.11220275}, + {0, 0.09668214}, + {-1, 0.21185540}}, + .expected_infinity_mass = 0.18406013, // = CDF_normal(-0.9) + }, + GaussianPrivacyLossDistributionParam{ + .test_name = "optimistic_estimate", + .standard_deviation = 1, + .sensitivity = 1, + .estimate_type = EstimateType::kOptimistic, + .discretization_interval = 1, + .expected_pmf = {{1, 0.30853754}, + {0, 0.38292492}, + {-1, 0.12447741}}, + .expected_infinity_mass = 0}), + [](const ::testing::TestParamInfo< + GaussianPrivacyLossDistribution::ParamType>& info) { + return info.param.test_name; + }); + +TEST_P(GaussianPrivacyLossDistribution, CreatePLD) { + GaussianPrivacyLossDistributionParam param = GetParam(); + // mass_truncation_bound = ln(2) + log(CDF_normal(-0.9)). 
+ double mass_truncation_bound = -0.999345626001393; + + base::StatusOr> pld = + PrivacyLossDistribution::CreateForGaussianMechanism( + /*standard_deviation=*/param.standard_deviation, + /*sensitivity=*/param.sensitivity, + /*estimate_type=*/param.estimate_type, + /*discretization_interval=*/param.discretization_interval, + /*mass_truncation_bound=*/mass_truncation_bound); + + ASSERT_OK(pld); + + EXPECT_NEAR((*pld)->InfinityMass(), param.expected_infinity_mass, kMaxError); + EXPECT_DOUBLE_EQ((*pld)->DiscretizationInterval(), + param.discretization_interval); + EXPECT_EQ((*pld)->GetEstimateType(), param.estimate_type); + EXPECT_THAT((*pld)->Pmf(), PMFIsNear(param.expected_pmf)); +} + +struct DiscreteGaussianPrivacyLossDistributionParam { + std::string test_name; + double sigma; + double sensitivity; + EstimateType estimate_type; + double discretization_interval; + ProbabilityMassFunction expected_pmf; + double expected_infinity_mass; + absl::optional truncation_bound = absl::nullopt; +}; + +class DiscreteGaussianPrivacyLossDistribution + : public testing::TestWithParam< + DiscreteGaussianPrivacyLossDistributionParam> {}; + +INSTANTIATE_TEST_SUITE_P( + DiscreteGaussianPrivacyLossDistributionParam, + DiscreteGaussianPrivacyLossDistribution, + Values( + DiscreteGaussianPrivacyLossDistributionParam{ + .test_name = "basic", + .sigma = 1, + .sensitivity = 1, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 1e-2, + .expected_pmf = {{50, 0.45186276}, {-50, 0.27406862}}, + .expected_infinity_mass = 0.27406862, + .truncation_bound = 1, + }, + DiscreteGaussianPrivacyLossDistributionParam{ + .test_name = "varying_sensitivity", + .sigma = 1, + .sensitivity = 2, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 1e-2, + .expected_pmf = {{0, 0.27406862}}, + .expected_infinity_mass = 0.72593138, + .truncation_bound = 1, + }, + DiscreteGaussianPrivacyLossDistributionParam{ + .test_name = "varying_sigma", + .sigma = 3, + 
.sensitivity = 1, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 1e-2, + .expected_pmf = {{6, 0.34579116}, {-5, 0.32710442}}, + .expected_infinity_mass = 0.32710442, + .truncation_bound = 1, + }, + DiscreteGaussianPrivacyLossDistributionParam{ + .test_name = "varying_discretization_interval", + .sigma = 3, + .sensitivity = 1, + .estimate_type = EstimateType::kPessimistic, + .discretization_interval = 1e-3, + .expected_pmf = {{56, 0.34579116}, {-55, 0.32710442}}, + .expected_infinity_mass = 0.32710442, + .truncation_bound = 1, + }, + DiscreteGaussianPrivacyLossDistributionParam{ + .test_name = "optimistic", + .sigma = 3, + .sensitivity = 1, + .estimate_type = EstimateType::kOptimistic, + .discretization_interval = 1e-4, + .expected_pmf = {{555, 0.34579116}, {-556, 0.32710442}}, + .expected_infinity_mass = 0.32710442, + .truncation_bound = 1, + }, + DiscreteGaussianPrivacyLossDistributionParam{ + .test_name = "default_truncation", + .sigma = 3, + .sensitivity = 1, + .estimate_type = EstimateType::kOptimistic, + .discretization_interval = 1, + .expected_pmf = + {{1, 0.00221}, {0, 0.56428}, {-1, 0.43278}, {-2, 0.00073}}, + .expected_infinity_mass = 0, + }), + [](const ::testing::TestParamInfo< + DiscreteGaussianPrivacyLossDistribution::ParamType>& info) { + return info.param.test_name; + }); + +TEST_P(DiscreteGaussianPrivacyLossDistribution, CreatePLD) { + DiscreteGaussianPrivacyLossDistributionParam param = GetParam(); + + base::StatusOr> pld = + PrivacyLossDistribution::CreateForDiscreteGaussianMechanism( + /*sigma=*/param.sigma, + /*sensitivity=*/param.sensitivity, + /*estimate_type=*/param.estimate_type, + /*discretization_interval=*/param.discretization_interval, + /*truncation_bound=*/param.truncation_bound); + + ASSERT_OK(pld); + + EXPECT_NEAR((*pld)->InfinityMass(), param.expected_infinity_mass, kMaxError); + EXPECT_DOUBLE_EQ((*pld)->DiscretizationInterval(), + param.discretization_interval); + EXPECT_EQ((*pld)->GetEstimateType(), 
param.estimate_type); + EXPECT_THAT((*pld)->Pmf(), PMFIsNear(param.expected_pmf)); +} + TEST(PrivacyLossDistributionTest, AccurateComposition) { base::StatusOr> noise_privacy_loss = GaussianPrivacyLoss::Create( diff --git a/cc/accounting/privacy_loss_mechanism.cc b/cc/accounting/privacy_loss_mechanism.cc index ad0fab5b..65088cf7 100644 --- a/cc/accounting/privacy_loss_mechanism.cc +++ b/cc/accounting/privacy_loss_mechanism.cc @@ -228,34 +228,34 @@ PrivacyLossTail DiscreteLaplacePrivacyLoss::PrivacyLossDistributionTail() } base::StatusOr> DiscreteGaussianPrivacyLoss::Create(double sigma, int sensitivity, - int truncation_bound) { + absl::optional truncation_bound) { if (sigma <= 0) { return absl::InvalidArgumentError("sigma should be positive."); } - if (truncation_bound < 0) { - // Tail bound from Canonne et al. ensures that the mass that gets - // truncated is at most 1e-30. (See Proposition 1 in the supplementary - // material.) - truncation_bound = std::ceil(11.6 * sigma); - } + // Tail bound from Canonne et al. ensures that the mass that gets truncated at + // ceil(11.6 * sigma) is at most 1e-30. (See Proposition 1 in the + // supplementary material.)
+ int truncation_bound_value = + truncation_bound.value_or(std::ceil(11.6 * sigma)); ProbabilityMassFunction noise_pmf; CumulativeDensityFunction noise_cdf; - for (int x = -truncation_bound; x <= truncation_bound; ++x) { + for (int x = -truncation_bound_value; x <= truncation_bound_value; ++x) { noise_pmf[x] = std::exp(-0.5 * std::pow(x, 2) / std::pow(sigma, 2)); noise_cdf[x] = noise_cdf[x - 1] + noise_pmf[x]; } - for (int x = -truncation_bound; x <= truncation_bound; ++x) { - noise_pmf[x] /= noise_cdf[truncation_bound]; - noise_cdf[x] /= noise_cdf[truncation_bound]; + for (int x = -truncation_bound_value; x <= truncation_bound_value; ++x) { + noise_pmf[x] /= noise_cdf[truncation_bound_value]; + noise_cdf[x] /= noise_cdf[truncation_bound_value]; } return absl::WrapUnique(new DiscreteGaussianPrivacyLoss( - sigma, sensitivity, truncation_bound, noise_pmf, noise_cdf)); + sigma, sensitivity, truncation_bound_value, noise_pmf, noise_cdf)); } base::StatusOr> DiscreteGaussianPrivacyLoss::Create(const EpsilonDelta& epsilon_delta, - int sensitivity, int truncation_bound) { + int sensitivity, + absl::optional truncation_bound) { // Use binary search to find the smallest possible sigma of the Discrete // Gaussian noise for which the protocol is (epsilon, delta)-differentially // private. diff --git a/cc/accounting/privacy_loss_mechanism.h b/cc/accounting/privacy_loss_mechanism.h index 23da26d9..5974072f 100644 --- a/cc/accounting/privacy_loss_mechanism.h +++ b/cc/accounting/privacy_loss_mechanism.h @@ -18,6 +18,7 @@ #include "base/logging.h" #include "absl/status/status.h" #include "base/statusor.h" +#include "absl/types/optional.h" #include "boost/math/distributions/laplace.hpp" #include "boost/math/distributions/normal.hpp" #include "accounting/common/common.h" @@ -306,17 +307,18 @@ class DiscreteGaussianPrivacyLoss : public AdditiveNoisePrivacyLoss { // sensitivity: the sensitivity of function f. (i.e. 
the maximum absolute // change in f when an input to a single user changes.) // truncation_bound: bound for truncating the noise, i.e. the noise will only - // have a support in [-truncation_bound, truncation_bound]. When set to - // -1, truncation_bound will be chosen in such a way that the mass - // of the noise outside of this range is at most 1e-30. + // have a support in [-truncation_bound, truncation_bound]. When not set, + // truncation_bound will be chosen in such a way that the mass of the noise + // outside of this range is at most 1e-30. static base::StatusOr> Create( - double sigma, int sensitivity, int truncation_bound = -1); + double sigma, int sensitivity, + absl::optional truncation_bound = absl::nullopt); NoiseType Discrete() const override { return NoiseType::kDiscrete; } static base::StatusOr> Create( const EpsilonDelta& epsilon_delta, int sensitivity, - int truncation_bound = -1); + absl::optional truncation_bound); double InversePrivacyLoss(double privacy_loss) const override; diff --git a/cc/accounting/privacy_loss_mechanism_test.cc b/cc/accounting/privacy_loss_mechanism_test.cc index c263dd80..1fb45a94 100644 --- a/cc/accounting/privacy_loss_mechanism_test.cc +++ b/cc/accounting/privacy_loss_mechanism_test.cc @@ -18,6 +18,7 @@ #include "gtest/gtest.h" #include "absl/status/status.h" #include "base/statusor.h" +#include "absl/types/optional.h" #include "base/testing/status_matchers.h" namespace differential_privacy { @@ -551,7 +552,7 @@ struct DiscreteGaussianFromEpsilonDeltaParam { double epsilon; double delta; double expected_sigma; - int truncation_bound = -1; + absl::optional truncation_bound = absl::nullopt; }; class DiscreteGaussianFromEpsilonDeltaTest @@ -579,10 +580,10 @@ INSTANTIATE_TEST_SUITE_P( TEST_P(DiscreteGaussianFromEpsilonDeltaTest, CreateFromEpsilonDelta) { DiscreteGaussianFromEpsilonDeltaParam param = GetParam(); - base::StatusOr> mechanism = - DiscreteGaussianPrivacyLoss::Create( - EpsilonDelta{param.epsilon, param.delta}, 
param.sensitivity, - param.truncation_bound); + base::StatusOr> mechanism; + mechanism = DiscreteGaussianPrivacyLoss::Create( + EpsilonDelta{param.epsilon, param.delta}, param.sensitivity, + param.truncation_bound); ASSERT_OK(mechanism); double sigma = (*mechanism)->Sigma(); diff --git a/cc/algorithms/BUILD b/cc/algorithms/BUILD index 317709a7..97a26581 100644 --- a/cc/algorithms/BUILD +++ b/cc/algorithms/BUILD @@ -107,6 +107,7 @@ cc_library( ":bounded-algorithm", ":numerical-mechanisms", "//base:percentile", + "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/status", "@com_google_cc_differential_privacy//base:status", "@com_google_cc_differential_privacy//base:statusor", @@ -528,3 +529,57 @@ cc_test( "@com_google_cc_differential_privacy//base:statusor", ], ) + +cc_library( + name = "quantile-tree", + hdrs = ["quantile-tree.h"], + deps = [ + ":algorithm", + ":bounded-algorithm", + ":numerical-mechanisms", + "//algorithms/internal:count-tree", + "//proto:util-lib", + "@com_google_differential_privacy//proto:summary_cc_proto", + "@com_google_absl//absl/status", + "@com_google_cc_differential_privacy//base:status", + "@com_google_cc_differential_privacy//base:statusor", + ], +) + +cc_test( + name = "quantile-tree_test", + srcs = ["quantile-tree_test.cc"], + deps = [ + ":numerical-mechanisms-testing", + ":quantile-tree", + "//base/testing:proto_matchers", + "//base/testing:status_matchers", + "@com_google_googletest//:gtest_main", + "@com_google_absl//absl/random", + ], +) + +cc_library( + name = "quantiles", + hdrs = ["quantiles.h"], + deps = [ + ":algorithm", + ":bounded-algorithm", + ":quantile-tree", + "@com_google_absl//absl/status", + "@com_google_cc_differential_privacy//base:statusor", + ], +) + +cc_test( + name = "quantiles_test", + srcs = ["quantiles_test.cc"], + deps = [ + ":numerical-mechanisms-testing", + ":quantiles", + "//base/testing:proto_matchers", + "//base/testing:status_matchers", + "@com_google_googletest//:gtest_main", + 
"@com_google_absl//absl/random", + ], +) diff --git a/cc/algorithms/internal/BUILD b/cc/algorithms/internal/BUILD new file mode 100644 index 00000000..1f4b6b4f --- /dev/null +++ b/cc/algorithms/internal/BUILD @@ -0,0 +1,45 @@ +# +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +package( + default_visibility = ["//algorithms:__subpackages__"], +) + +cc_library( + name = "count-tree", + srcs = ["count-tree.cc"], + hdrs = ["count-tree.h"], + deps = [ + "//algorithms:algorithm", + "//algorithms:bounded-algorithm", + "//algorithms:numerical-mechanisms", + "//proto:util-lib", + "@com_google_differential_privacy//proto:summary_cc_proto", + "@com_google_absl//absl/status", + "@com_google_cc_differential_privacy//base:status", + "@com_google_cc_differential_privacy//base:statusor", + ], +) + +cc_test( + name = "count-tree_test", + srcs = ["count-tree_test.cc"], + deps = [ + ":count-tree", + "//base/testing:proto_matchers", + "//base/testing:status_matchers", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/cc/algorithms/internal/count-tree.cc b/cc/algorithms/internal/count-tree.cc new file mode 100644 index 00000000..190ad00a --- /dev/null +++ b/cc/algorithms/internal/count-tree.cc @@ -0,0 +1,134 @@ +// +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "algorithms/internal/count-tree.h" + +#include + +#include "absl/status/status.h" +#include "base/statusor.h" +#include "algorithms/algorithm.h" +#include "algorithms/bounded-algorithm.h" +#include "algorithms/numerical-mechanisms.h" +#include "proto/util.h" +#include "proto/summary.pb.h" +#include "base/canonical_errors.h" +#include "base/status_macros.h" + +namespace differential_privacy { +namespace internal { + +CountTree::CountTree(int height, int branching_factor) + : height_(height), + branching_factor_(branching_factor), + number_of_nodes_((std::pow(branching_factor_, height_ + 1) - 1) / + (branching_factor_ - 1)), + number_of_leaves_(std::pow(branching_factor_, height_)), + left_most_leaf_(number_of_nodes_ - number_of_leaves_) {} + +int CountTree::GetLeftMostLeaf() const { return left_most_leaf_; } +int CountTree::GetNthLeaf(int n) const { return GetLeftMostLeaf() + n; } +int CountTree::GetNumberOfNodes() const { return number_of_nodes_; } +int CountTree::GetNumberOfLeaves() const { return number_of_leaves_; } +int CountTree::GetBranchingFactor() const { return branching_factor_; } +int CountTree::GetHeight() const { return height_; } +int CountTree::GetRoot() const { return root_node_; } + +int CountTree::Parent(int nodeIndex) const { + return (nodeIndex - 1) / branching_factor_; +} +int CountTree::LeftMostChild(int nodeIndex) const { + return nodeIndex * branching_factor_ + 1; +} +int CountTree::RightMostChild(int nodeIndex) const { + return (nodeIndex + 1) * branching_factor_; +} + +bool CountTree::IsLeaf(int nodeIndex) 
const { + return nodeIndex >= GetLeftMostLeaf() && nodeIndex < GetNumberOfNodes(); +} + +int CountTree::LeftMostInSubtree(int nodeIndex) const { + while (!IsLeaf(nodeIndex)) { + nodeIndex = LeftMostChild(nodeIndex); + } + return nodeIndex; +} +int CountTree::RightMostInSubtree(int nodeIndex) const { + while (!IsLeaf(nodeIndex)) { + nodeIndex = RightMostChild(nodeIndex); + } + return nodeIndex; +} + +void CountTree::IncrementNode(int nodeIndex) { ++tree_[nodeIndex]; } +void CountTree::IncrementNodeBy(int nodeIndex, int64_t increment) { + tree_[nodeIndex] += increment; +} + +void CountTree::ClearNodes() { tree_.clear(); } + +int64_t CountTree::GetNodeCount(int nodeIndex) const { + auto node = tree_.find(nodeIndex); + if (node == tree_.end()) { + return 0; + } + return node->second; +} + +BoundedQuantilesSummary CountTree::Serialize() { + BoundedQuantilesSummary to_return; + to_return.mutable_quantile_tree()->insert(tree_.begin(), tree_.end()); + to_return.set_tree_height(height_); + to_return.set_branching_factor(branching_factor_); + return to_return; +} + +absl::Status CountTree::Merge(const BoundedQuantilesSummary& summary) { + if (!summary.has_tree_height() || !summary.has_branching_factor()) { + return absl::InternalError( + "Summary missing height and/or branching factor."); + } + if (summary.tree_height() != height_) { + return absl::InternalError( + absl::StrCat("Height mismatch. Tree had: ", height_, + " but summary had: ", summary.tree_height())); + } + if (summary.branching_factor() != branching_factor_) { + return absl::InternalError( + absl::StrCat("Branching factor mismatch. Tree had: ", branching_factor_, + " but summary had: ", summary.branching_factor())); + } + for (std::pair node : summary.quantile_tree()) { + tree_[node.first] += node.second; + } + return absl::OkStatus(); +} + +int64_t CountTree::MemoryUsed() { + int64_t size = sizeof(CountTree); + // Makes some guesses about how unordered_map is likely implemented. 
+ for (int i = 0; i < tree_.bucket_count(); ++i) { + // Pointer to array. + size += sizeof(int64_t*); + // Keys and values in an array. + size += tree_.bucket_size(i) * (sizeof(int64_t) + sizeof(int)); + } + return size; +} + +} // namespace internal +} // namespace differential_privacy diff --git a/cc/algorithms/internal/count-tree.h b/cc/algorithms/internal/count-tree.h new file mode 100644 index 00000000..5d2cb4fa --- /dev/null +++ b/cc/algorithms/internal/count-tree.h @@ -0,0 +1,116 @@ +// +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifndef DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_COUNT_TREE_H_ +#define DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_COUNT_TREE_H_ + +#include + +#include "absl/status/status.h" +#include "base/statusor.h" +#include "algorithms/algorithm.h" +#include "algorithms/bounded-algorithm.h" +#include "algorithms/numerical-mechanisms.h" +#include "proto/util.h" +#include "proto/summary.pb.h" +#include "base/canonical_errors.h" +#include "base/status_macros.h" + +namespace differential_privacy { +namespace internal { + +// Maintains a tree of specified height where each node has branching_factor +// children. Each node contains a count, which can be incremented and read out. +// Nodes are identified by index. Nodes are indexed in sequence, starting with +// the root at index 0, and the rightmost leaf node at the maximum index. All of +// a node's children will have sequential indices. 
Contains numerous methods to +// traverse the tree. Also has the ability to serialize the tree to a proto, and +// to merge the counts from a serialized tree with identical parameters into the +// current tree. +// +// This is used as the underlying data structure for implementing quantile +// trees. +class CountTree { + public: + // height is the number of levels in the tree, not including the root. + // branching_factor is the number of children each node will have. + CountTree(int height, int branching_factor); + + // Methods for finding a particular node in the tree. Note that the rightmost + // leaf will be at index GetLeftMostLeaf() + GetNumberOfLeaves() - 1. + int GetLeftMostLeaf() const; + int GetNthLeaf(int n) const; + int GetBranchingFactor() const; + int GetHeight() const; + int GetRoot() const; + + int GetNumberOfNodes() const; + int GetNumberOfLeaves() const; + + // Methods for navigating the tree from a given node. + int Parent(int nodeIndex) const; + int LeftMostChild(int nodeIndex) const; + int RightMostChild(int nodeIndex) const; + int LeftMostInSubtree(int nodeIndex) const; + int RightMostInSubtree(int nodeIndex) const; + + bool IsLeaf(int nodeIndex) const; + + // Modify the count of a specified node. + void IncrementNode(int nodeIndex); + void IncrementNodeBy(int nodeIndex, int64_t increment); + + // Sets the counts of all nodes to 0. + void ClearNodes(); + + // Returns the count of a specified node. + int64_t GetNodeCount(int nodeIndex) const; + + // Serializes the CountTree to a proto representation. + BoundedQuantilesSummary Serialize(); + + // Deserializes the proto representation and combines it with the current + // CountTree. This will add the counts of each node together. + // Returns an error if the summary is malformed, or if the parameters (height + // and branching factor) of the serialized tree do not match. 
+ absl::Status Merge(const BoundedQuantilesSummary& summary); + + // Returns an estimate of the current memory footprint of the CountTree, + // in bytes. + int64_t MemoryUsed(); + + private: + const int height_; + const int branching_factor_; + // Quantities are all calculated from height and branching factor. Cached + // to avoid re-calculation. + const int number_of_nodes_; + // The number of nodes in the tree that are in the lowest/largest level and + // have no children. + const int number_of_leaves_; + // The index of the leaf node with the smallest index. + const int left_most_leaf_; + // The index of the root. + static const int root_node_ = 0; + // We store the tree as an unordered map. This gives fast lookups, and means + // that we don't need space for empty nodes. + std::unordered_map tree_; +}; + +} // namespace internal +} // namespace differential_privacy + +#endif // DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_COUNT_TREE_H_ diff --git a/cc/algorithms/internal/count-tree_test.cc b/cc/algorithms/internal/count-tree_test.cc new file mode 100644 index 00000000..006d6f82 --- /dev/null +++ b/cc/algorithms/internal/count-tree_test.cc @@ -0,0 +1,178 @@ +// +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "algorithms/internal/count-tree.h" + +#include "base/testing/proto_matchers.h" +#include "base/testing/status_matchers.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace differential_privacy { +namespace internal { +namespace { + +using ::testing::HasSubstr; +using ::differential_privacy::base::testing::StatusIs; + +TEST(CountTreeTest, NumberOfNodes) { + CountTree test(3, 5); + EXPECT_EQ(test.GetNumberOfNodes(), 1 + 5 + 25 + 125); + CountTree test2 = CountTree(4, 9); + EXPECT_EQ(test2.GetNumberOfNodes(), 1 + 9 + 81 + 729 + 6561); +} + +TEST(CountTreeTest, NumberOfLeaves) { + CountTree test(3, 5); + EXPECT_EQ(test.GetNumberOfLeaves(), 125); + CountTree test2 = CountTree(4, 9); + EXPECT_EQ(test2.GetNumberOfLeaves(), 6561); +} + +TEST(CountTreeTest, GetNthLeaf) { + CountTree test(3, 5); + EXPECT_EQ(test.GetNthLeaf(0), 31); + EXPECT_EQ(test.GetNthLeaf(5), 36); + EXPECT_EQ(test.GetNthLeaf(18), 49); +} + +TEST(CountTreeTest, ParentChildInverse) { + CountTree test(5, 6); + for (int i = 0; i < test.GetLeftMostLeaf(); ++i) { + for (int child = test.LeftMostChild(i); child <= test.RightMostChild(i); + ++child) { // Inclusive: RightMostChild(i) is itself a child of i. + EXPECT_EQ(test.Parent(child), i); + } + } +} + +TEST(CountTreeTest, ParentChildExamples) { + CountTree test(3, 5); + EXPECT_EQ(test.LeftMostChild(0), 1); + EXPECT_EQ(test.RightMostChild(0), 5); + EXPECT_EQ(test.LeftMostChild(1), 6); + EXPECT_EQ(test.RightMostChild(1), 10); + EXPECT_EQ(test.LeftMostChild(8), 41); + EXPECT_EQ(test.RightMostChild(8), 45); + EXPECT_EQ(test.Parent(38), 7); + EXPECT_EQ(test.Parent(8), 1); + EXPECT_EQ(test.Parent(2), 0); +} + +TEST(CountTreeTest, IsLeaf) { + CountTree test(3, 5); + EXPECT_FALSE(test.IsLeaf(0)); + EXPECT_FALSE(test.IsLeaf(1)); + EXPECT_FALSE(test.IsLeaf(6)); + EXPECT_FALSE(test.IsLeaf(30)); + EXPECT_TRUE(test.IsLeaf(31)); + EXPECT_TRUE(test.IsLeaf(155)); +} + +TEST(CountTreeTest, SubtreeQueries) { + CountTree test(3, 5); + EXPECT_EQ(test.LeftMostInSubtree(0), 31); + 
EXPECT_EQ(test.RightMostInSubtree(0), 155); + EXPECT_EQ(test.LeftMostInSubtree(1), 31); + EXPECT_EQ(test.RightMostInSubtree(1), 55); + EXPECT_EQ(test.LeftMostInSubtree(3), 81); + EXPECT_EQ(test.RightMostInSubtree(3), 105); + EXPECT_EQ(test.LeftMostInSubtree(82), 82); + EXPECT_EQ(test.RightMostInSubtree(83), 83); +} + +TEST(CountTreeTest, IncrementGet) { + CountTree test(3, 5); + test.IncrementNode(1); + EXPECT_EQ(test.GetNodeCount(1), 1); + EXPECT_EQ(test.GetNodeCount(2), 0); + test.IncrementNode(8); + test.IncrementNode(8); + test.IncrementNode(8); + EXPECT_EQ(test.GetNodeCount(8), 3); +} + +TEST(CountTreeTest, IncrementNodeByGet) { + CountTree test(3, 5); + test.IncrementNode(1); + test.IncrementNodeBy(1, 3); + EXPECT_EQ(test.GetNodeCount(1), 4); + test.IncrementNodeBy(1, 5); + EXPECT_EQ(test.GetNodeCount(1), 9); + test.IncrementNode(1); + EXPECT_EQ(test.GetNodeCount(1), 10); +} + +TEST(CountTreeTest, SerializeMerge) { + CountTree test1(3, 5); + test1.IncrementNode(1); + test1.IncrementNode(8); + test1.IncrementNode(8); + + CountTree test2(3, 5); + EXPECT_OK(test2.Merge(test1.Serialize())); + test1.IncrementNode(8); + test2.IncrementNode(8); + test1.IncrementNode(10); + test2.IncrementNode(10); + + for (int i = test1.GetRoot(); i < test1.GetNumberOfNodes(); ++i) { + EXPECT_EQ(test1.GetNodeCount(i), test2.GetNodeCount(i)); + } +} + +TEST(CountTreeTest, MisatchMergeFails) { + CountTree standard(3, 5); + CountTree shorter(2, 5); + CountTree wider(3, 6); + EXPECT_THAT(shorter.Merge(standard.Serialize()), + StatusIs(absl::StatusCode::kInternal, HasSubstr("Height"))); + EXPECT_THAT(wider.Merge(standard.Serialize()), + StatusIs(absl::StatusCode::kInternal, HasSubstr("Branching"))); +} + +TEST(CountTreeTest, MemoryUsed) { + CountTree empty(3, 5); + CountTree single(3, 5); + CountTree twice(3, 5); + single.IncrementNode(1); + twice.IncrementNode(9); + twice.IncrementNode(9); + EXPECT_GT(single.MemoryUsed(), empty.MemoryUsed()); + EXPECT_EQ(twice.MemoryUsed(), 
single.MemoryUsed()); +} + +TEST(CountTreeTest, ClearNodes) { + CountTree test1(3, 5); + test1.IncrementNode(1); + test1.IncrementNode(8); + test1.IncrementNode(8); + + CountTree test2(3, 5); + test1.ClearNodes(); + test1.IncrementNode(8); + test2.IncrementNode(8); + test1.IncrementNode(10); + test2.IncrementNode(10); + + for (int i = test1.GetRoot(); i < test1.GetNumberOfNodes(); ++i) { + EXPECT_EQ(test1.GetNodeCount(i), test2.GetNodeCount(i)); + } +} + +} // namespace +} // namespace internal +} // namespace differential_privacy diff --git a/cc/algorithms/order-statistics.h b/cc/algorithms/order-statistics.h index 60a9eeb6..622fb762 100644 --- a/cc/algorithms/order-statistics.h +++ b/cc/algorithms/order-statistics.h @@ -18,6 +18,7 @@ #define DIFFERENTIAL_PRIVACY_ALGORITHMS_ORDER_STATISTICS_H_ #include "base/percentile.h" +#include "absl/base/attributes.h" #include "absl/status/status.h" #include "base/statusor.h" #include "algorithms/algorithm.h" @@ -26,6 +27,10 @@ #include "algorithms/numerical-mechanisms.h" #include "base/canonical_errors.h" +// Old classes for calculating order statistics (aka quantiles, aka +// percentiles). Deprecated, you should use Quantiles instead as it's more +// accurate. 
+ namespace differential_privacy { namespace continuous { @@ -77,9 +82,10 @@ class OrderStatisticsBuilder }; template -class Max : public BinarySearch { +class ABSL_DEPRECATED("Use Quantiles instead.") Max : public BinarySearch { public: - class Builder : public OrderStatisticsBuilder, Builder> { + class ABSL_DEPRECATED("Use Quantiles instead.") Builder + : public OrderStatisticsBuilder, Builder> { using AlgorithmBuilder = differential_privacy::AlgorithmBuilder, Builder>; using BoundedBuilder = BoundedAlgorithmBuilder, Builder>; @@ -105,9 +111,10 @@ class Max : public BinarySearch { }; template -class Min : public BinarySearch { +class ABSL_DEPRECATED("Use Quantiles instead.") Min : public BinarySearch { public: - class Builder : public OrderStatisticsBuilder, Builder> { + class ABSL_DEPRECATED("Use Quantiles instead.") Builder + : public OrderStatisticsBuilder, Builder> { using AlgorithmBuilder = differential_privacy::AlgorithmBuilder, Builder>; using BoundedBuilder = BoundedAlgorithmBuilder, Builder>; @@ -133,9 +140,11 @@ class Min : public BinarySearch { }; template -class Median : public BinarySearch { +class ABSL_DEPRECATED("Use Quantiles instead.") Median + : public BinarySearch { public: - class Builder : public OrderStatisticsBuilder, Builder> { + class ABSL_DEPRECATED("Use Quantiles instead.") Builder + : public OrderStatisticsBuilder, Builder> { using AlgorithmBuilder = differential_privacy::AlgorithmBuilder, Builder>; using BoundedBuilder = BoundedAlgorithmBuilder, Builder>; @@ -162,9 +171,11 @@ class Median : public BinarySearch { }; template -class Percentile : public BinarySearch { +class ABSL_DEPRECATED("Use Quantiles instead.") Percentile + : public BinarySearch { public: - class Builder : public OrderStatisticsBuilder, Builder> { + class ABSL_DEPRECATED("Use Quantiles instead.") Builder + : public OrderStatisticsBuilder, Builder> { using AlgorithmBuilder = differential_privacy::AlgorithmBuilder, Builder>; using BoundedBuilder = 
BoundedAlgorithmBuilder, Builder>; diff --git a/cc/algorithms/quantile-tree.h b/cc/algorithms/quantile-tree.h new file mode 100644 index 00000000..b9757b93 --- /dev/null +++ b/cc/algorithms/quantile-tree.h @@ -0,0 +1,329 @@ +// +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifndef DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_QUANTILE_TREE_H_ +#define DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_QUANTILE_TREE_H_ + +#include + +#include "absl/status/status.h" +#include "base/statusor.h" +#include "algorithms/algorithm.h" +#include "algorithms/bounded-algorithm.h" +#include "algorithms/internal/count-tree.h" +#include "algorithms/numerical-mechanisms.h" +#include "proto/util.h" +#include "proto/summary.pb.h" +#include "base/canonical_errors.h" +#include "base/status_macros.h" + +namespace differential_privacy { + +// A small tolerance on the quantile we're searching for. We'll be aiming to +// return a value that's within this tolerance of the chosen quantile. This is +// a post-processing parameter with no privacy implications. +constexpr double kNumericalTolerance = 1.0e-6; +// Default tree parameters. Will result in splitting the input space into 16^4 +// = 65536 equal buckets. Using a larger height or branching factor will +// split the input space more finely, resulting in greater precision but also +// increasing space used. Increasing the height will increase the amount of +// noise that is added. 
These parameters were selected based on experiments. +constexpr int kDefaultTreeHeight = 4; +constexpr int kDefaultBranchingFactor = 16; +// Fraction a node needs to contribute to the total count of itself and its +// siblings to be considered during the search for a particular quantile. The +// idea of alpha is to filter out noisy empty nodes. This is a post-processing +// parameter with no privacy implications. +constexpr double kAlpha = 0.005; + +// Calculates differentially private quantiles using a tree-based data +// structure. See a full writeup of the algorithm at: +// https://github.com/google/differential-privacy/blob/main/common_docs/Differentially_Private_Quantile_Trees.pdf +// +// This algorithm can be used to calculate an arbitrarily large number of +// quantiles with no loss in accuracy or additional expenditure of privacy +// budget. +// +// This is not an Algorithm, and does not behave in the same way as other +// algorithms. For a quantile implementation that follows the Algorithm +// interface, see quantiles.h. +template +class QuantileTree { + public: + class Builder; + class Privatized; + + void AddEntry(const T& input) { AddMultipleEntries(input, 1); } + + // Removes all input from the QuantileTree. After calling this method, the + // QuantileTree will be equivalent to one that is newly initialized with no + // input added. + void Reset() { tree_.ClearNodes(); } + + struct DPParams { + double epsilon; + double delta; + int max_contributions_per_partition; + int max_partitions_contributed_to; + std::unique_ptr mechanism_builder; + }; + + // Returns a private version of the quantile tree, which can be used to get + // differentially private quantiles. Each call to this method expends the + // epsilon and delta specified in the params. 
+ base::StatusOr MakePrivate(const DPParams& params) { + ASSIGN_OR_RETURN( + std::unique_ptr mech, + params.mechanism_builder->SetEpsilon(params.epsilon) + .SetDelta(params.delta) + .SetL0Sensitivity(params.max_partitions_contributed_to * + tree_.GetHeight()) + .SetLInfSensitivity(params.max_contributions_per_partition) + .Build()); + return Privatized(upper_, lower_, std::move(mech), tree_); + } + + BoundedQuantilesSummary Serialize() { + BoundedQuantilesSummary to_return = tree_.Serialize(); + to_return.set_lower(lower_); + to_return.set_upper(upper_); + return to_return; + } + + absl::Status Merge(const BoundedQuantilesSummary& summary) { + if (static_cast(lower_) != summary.lower() || + static_cast(upper_) != summary.upper()) { + return absl::InternalError(absl::StrCat( + "Bounds mismatch. Tree: [", lower_, ", ", upper_, "] ", + ", summary: [", summary.lower(), ", ", summary.upper(), "]")); + } + return tree_.Merge(summary); + } + + int64_t MemoryUsed() { + return sizeof(QuantileTree) - sizeof(internal::CountTree) + + tree_.MemoryUsed(); + } + + int GetHeight() { return tree_.GetHeight(); } + int GetBranchingFactor() { return tree_.GetBranchingFactor(); } + + private: + QuantileTree(T lower, T upper, int tree_height, int branching_factor) + : lower_(lower), upper_(upper), tree_(tree_height, branching_factor) {} + + int getLeafIndex(T input) { + double leaf_fraction = + static_cast(input - lower_) / (upper_ - lower_); + return tree_.GetNthLeaf(leaf_fraction * (tree_.GetNumberOfLeaves() - 1)); + } + + void AddMultipleEntries(const T& input, const int64_t times) { + // REF: + // https://stackoverflow.com/questions/61646166/how-to-resolve-fpclassify-ambiguous-call-to-overloaded-function + if (std::isnan(static_cast(input))) { + return; + } + if (times <= 0) { + return; + } + int currentNode = getLeafIndex(Clamp(lower_, upper_, input)); + while (currentNode > tree_.GetRoot()) { + tree_.IncrementNodeBy(currentNode, times); + currentNode = tree_.Parent(currentNode); + 
} + } + + T lower_; + T upper_; + internal::CountTree tree_; + + friend class QuantileTreeTestPeer; +}; + +// A private version of a quantile tree. Used for calculating differentially +// private quantiles. It will contain raw data internally, but only +// differentially private results can be accessed. +template +class QuantileTree::Privatized { + public: + base::StatusOr GetQuantile(double quantile) { + if (quantile < 0 || quantile > 1) { + return absl::InvalidArgumentError(absl::StrCat( + "Requested quantile must be in [0, 1] but was ", quantile)); + } + + quantile = ClampQuantile(quantile); + + int current_node = raw_tree_.GetRoot(); + while (!raw_tree_.IsLeaf(current_node)) { + int left_most_child = raw_tree_.LeftMostChild(current_node); + int right_most_child = raw_tree_.RightMostChild(current_node); + + double total_count = 0.0; + for (int i = left_most_child; i <= right_most_child; ++i) { + total_count += GetNoisedCount(i); + } + + // All child nodes appear to be empty. No need to continue down the tree. + if (total_count <= 0) break; + + // Remove nodes that make up less than an alpha fraction of the total - + // these are likely empty. + double corrected_total_count = 0.0; + for (int i = left_most_child; i <= right_most_child; ++i) { + corrected_total_count += + GetNoisedCount(i) >= total_count * kAlpha ? GetNoisedCount(i) : 0.0; + } + + // All child nodes have a negligible noisy count. We can't tell whether + // they have any elements in them, and if so how many, so we can stop + // and pick the middle of this range. + if (corrected_total_count <= 0) break; + + double partial_count = 0.0; + for (int i = left_most_child; i <= right_most_child; ++i) { + double count = GetNoisedCount(i); + // Ignore nodes we think are empty. + partial_count += count >= total_count * kAlpha ? 
count : 0.0; + if (partial_count / corrected_total_count >= + quantile - kNumericalTolerance) { + quantile = + (quantile - (partial_count - count) / corrected_total_count) / + (count / corrected_total_count); + quantile = std::min(std::max(quantile, 0.0), 1.0); + current_node = i; + break; + } + } + } + + double to_return = (1 - quantile) * GetSubtreeLowerBound(current_node) + + quantile * GetSubtreeUpperBound(current_node); + return to_return; + } + + private: + friend class QuantileTree; + + Privatized(T upper, T lower, std::unique_ptr mechanism, + internal::CountTree raw_tree) + : upper_(upper), + lower_(lower), + mechanism_(std::move(mechanism)), + raw_tree_(std::move(raw_tree)) {} // Initialized in declaration order. + + double GetNoisedCount(int index) { + if (noised_tree_.find(index) == noised_tree_.end()) { + noised_tree_[index] = mechanism_->AddNoise(raw_tree_.GetNodeCount(index)); + } + return noised_tree_[index]; + } + + double GetSubtreeLowerBound(int index) { + int leaf_index = + raw_tree_.LeftMostInSubtree(index) - raw_tree_.GetLeftMostLeaf(); + double quantile = + static_cast(leaf_index) / raw_tree_.GetNumberOfLeaves(); + return quantile * upper_ + (1 - quantile) * lower_; + } + + double GetSubtreeUpperBound(int index) { + int leaf_index = + raw_tree_.RightMostInSubtree(index) - raw_tree_.GetLeftMostLeaf() + 1; + double quantile = + static_cast(leaf_index) / raw_tree_.GetNumberOfLeaves(); + return quantile * upper_ + (1 - quantile) * lower_; + } + + // Clamps a quantile to a value between 0.005 and 0.995. This mitigates the + // inaccuracy of the quantile tree mechanism when finding a quantile close to + // 0 or 1. 
+ static double ClampQuantile(double quantile) { + return std::min(std::max(0.005, quantile), 0.995); + } + + const T upper_; + const T lower_; + std::unique_ptr mechanism_; + const internal::CountTree raw_tree_; + std::unordered_map noised_tree_; +}; + +template +class QuantileTree::Builder { + public: + Builder& SetTreeHeight(int tree_height) { + tree_height_ = tree_height; + return *static_cast(this); + } + + Builder& SetBranchingFactor(int branching_factor) { + branching_factor_ = branching_factor; + return *static_cast(this); + } + + Builder& SetLower(T lower) { + lower_ = lower; + return *static_cast(this); + } + + Builder& SetUpper(T upper) { + upper_ = upper; + return *static_cast(this); + } + + base::StatusOr>> Build() { + if (!tree_height_.has_value()) { + tree_height_ = kDefaultTreeHeight; + } + if (!branching_factor_.has_value()) { + branching_factor_ = kDefaultBranchingFactor; + } + if (!lower_.has_value() || !upper_.has_value()) { + return absl::InvalidArgumentError( + "Lower and upper bounds must both be set."); + } + + if (tree_height_.value() < 1) { + return absl::InvalidArgumentError(absl::StrCat( + "Tree height must be at least 1, but was ", tree_height_.value())); + } + if (branching_factor_.value() < 2) { + return absl::InvalidArgumentError( + absl::StrCat("Branching factor must be at least 2, but was ", + branching_factor_.value())); + } + if (lower_.value() >= upper_.value()) { + return absl::InvalidArgumentError( + absl::StrCat("Lower bound must be less than upper bound but lower: ", + lower_.value(), " >= upper: ", upper_.value())); + } + + return std::unique_ptr( + new QuantileTree(lower_.value(), upper_.value(), tree_height_.value(), + branching_factor_.value())); + } + + private: + absl::optional tree_height_; + absl::optional branching_factor_; + absl::optional lower_; + absl::optional upper_; +}; +} // namespace differential_privacy + +#endif // DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_QUANTILE_TREE_H_ diff --git 
a/cc/algorithms/quantile-tree_test.cc b/cc/algorithms/quantile-tree_test.cc new file mode 100644 index 00000000..937841b7 --- /dev/null +++ b/cc/algorithms/quantile-tree_test.cc @@ -0,0 +1,793 @@ +// +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "algorithms/quantile-tree.h" + +#include "base/testing/proto_matchers.h" +#include "base/testing/status_matchers.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/random/random.h" +#include "algorithms/numerical-mechanisms-testing.h" + +namespace differential_privacy { + +// Provides limited-scope static methods for interacting with a QuantileTree +// object for testing purposes. 
+class QuantileTreeTestPeer { + public: + template + static void AddMultipleEntries(const T& t, int64_t num_of_entries, + QuantileTree* qt) { + qt->AddMultipleEntries(t, num_of_entries); + } +}; + +namespace { + +using ::differential_privacy::test_utils::ZeroNoiseMechanism; +using ::testing::HasSubstr; +using ::differential_privacy::base::testing::StatusIs; + +const double kTestDefaultEpsilon = 0.5; +const double kDefaultDelta = 1e-5; +const int kDefaultMaxContributionsPerPartition = 5; +const int kDefaultMaxPartitionsContributed = 12; +const int kDefaultDatasetSize = 1001; +const int kNumRanksToTest = 10; + +template +class QuantileTreeTest : public ::testing::Test { + protected: + void SetUp() override {} + void TearDown() override {} +}; + +typedef ::testing::Types NumericTypes; +TYPED_TEST_SUITE(QuantileTreeTest, NumericTypes); + +TEST(QuantileTreeTest, InvalidParametersTest) { + EXPECT_THAT(QuantileTree::Builder() + .SetTreeHeight(0) + .SetLower(0) + .SetUpper(1) + .Build(), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Tree height must be at least 1"))); + EXPECT_THAT(QuantileTree::Builder() + .SetBranchingFactor(1) + .SetLower(0) + .SetUpper(1) + .Build(), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Branching factor must be at least 2"))); + EXPECT_THAT(QuantileTree::Builder().SetLower(2).SetUpper(1).Build(), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Lower bound must be less than upper"))); + EXPECT_THAT(QuantileTree::Builder().Build(), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Lower and upper bounds"))); +} + +// Input must be sorted. 
+template +double TrueQuantileFromSorted(std::vector inputs, double quantile) { + int rank = std::round(quantile * (inputs.size() - 1)); + return inputs[rank]; +} + +TYPED_TEST(QuantileTreeTest, ApproximatesTrueQuantile) { + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + std::vector inputs; + for (int i = 0; i < kDefaultDatasetSize; ++i) { + inputs.push_back(absl::Uniform(absl::BitGen(), -25, 25)); + } + + for (TypeParam input : inputs) { + test_quantiles->AddEntry(input); + } + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + double tolerance = 0.01; // > upper - lower / branchingFactor ^ treeHeight + std::sort(inputs.begin(), inputs.end()); + + for (int i = 0; i < kNumRanksToTest; ++i) { + double quantile = static_cast(i) / kNumRanksToTest; + + EXPECT_NEAR(results.GetQuantile(quantile).value(), + TrueQuantileFromSorted(inputs, quantile), tolerance); + } +} + +TYPED_TEST(QuantileTreeTest, EmptyLinearlyDistributed) { + double lower = -50; + double upper = 50; + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(upper) + .SetLower(lower) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + double tolerance = 0.01; // < (upper - lower) / branchingFactor ^ treeHeight + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + 
dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + for (int i = 0; i < kNumRanksToTest; ++i) { + double quantile = static_cast(i) / kNumRanksToTest; + // Avoid extreme quantiles, as they're special-cased. + if (quantile < 0.1 || quantile > 0.9) continue; + double expected = quantile * (upper - lower) + lower; + EXPECT_NEAR(results.GetQuantile(quantile).value(), expected, tolerance); + } +} + +TYPED_TEST(QuantileTreeTest, LowerBoundClamps) { + double lower = -50; + double upper = 50; + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(upper) + .SetLower(lower) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + for (int i = 0; i < kDefaultDatasetSize; ++i) { + test_quantiles->AddEntry(-100); + } + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + for (int i = 0; i < kNumRanksToTest; ++i) { + double quantile = static_cast(i) / kNumRanksToTest; + + EXPECT_GE(results.GetQuantile(quantile).value(), lower); + } +} + +TYPED_TEST(QuantileTreeTest, UpperBoundClamps) { + double lower = -50; + double upper = 50; + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(upper) + .SetLower(lower) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + for (int i = 0; i < kDefaultDatasetSize; ++i) { + test_quantiles->AddEntry(100); + } + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + 
dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + for (int i = 0; i < kNumRanksToTest; ++i) { + double quantile = static_cast(i) / kNumRanksToTest; + + EXPECT_LE(results.GetQuantile(quantile).value(), upper); + } +} + +TYPED_TEST(QuantileTreeTest, ApproximatesTrueQuantileNearUpperBound) { + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + for (int i = 0; i < kDefaultDatasetSize; ++i) { + test_quantiles->AddEntry(50); + } + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + double tolerance = 0.01; // < (upper - lower) / branchingFactor ^ treeHeight + EXPECT_NEAR(results.GetQuantile(0.5).value(), 50, tolerance); +} + +TYPED_TEST(QuantileTreeTest, ApproximatesTrueQuantileNearLowerBound) { + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + for (int i = 0; i < kDefaultDatasetSize; ++i) { + test_quantiles->AddEntry(-50); + } + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + 
dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + double tolerance = 0.01; // < (upper - lower) / branchingFactor ^ treeHeight + EXPECT_NEAR(results.GetQuantile(0.5).value(), -50, tolerance); +} + +TYPED_TEST(QuantileTreeTest, InputOrderInvariant) { + std::unique_ptr> test_quantiles1 = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + std::unique_ptr> test_quantiles2 = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + absl::BitGen gen; + + std::vector inputs; + for (int i = 0; i < kDefaultDatasetSize; ++i) { + inputs.push_back(absl::Uniform(gen, -25, 25)); + } + + for (TypeParam input : inputs) { + test_quantiles1->AddEntry(input); + } + std::shuffle(inputs.begin(), inputs.end(), gen); + for (TypeParam input : inputs) { + test_quantiles2->AddEntry(input); + } + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results1 = + test_quantiles1->MakePrivate(dp_params).value(); + + typename QuantileTree::Privatized results2 = + test_quantiles2->MakePrivate(dp_params).value(); + + for (int i = 0; i < kNumRanksToTest; ++i) { + double quantile = static_cast(i) / kNumRanksToTest; + + EXPECT_EQ(results1.GetQuantile(quantile).value(), + results2.GetQuantile(quantile).value()); + } +} + +// This should hold even with noise enabled. 
+TYPED_TEST(QuantileTreeTest, RepeatedResultsIdentical) { + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + absl::BitGen gen; + std::vector inputs; + for (int i = 0; i < kDefaultDatasetSize; ++i) { + inputs.push_back(absl::Uniform(gen, -25, 25)); + } + + for (TypeParam input : inputs) { + test_quantiles->AddEntry(input); + } + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + for (int i = 0; i < kNumRanksToTest; ++i) { + double quantile = static_cast(i) / kNumRanksToTest; + + EXPECT_EQ(results.GetQuantile(quantile).value(), + results.GetQuantile(quantile).value()); + } +} + +// This should hold even with noise enabled. 
+TYPED_TEST(QuantileTreeTest, ResultsIncreaseMonotonically) { + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + absl::BitGen gen; + std::vector inputs; + for (int i = 0; i < kDefaultDatasetSize; ++i) { + inputs.push_back(absl::Uniform(gen, -25, 25)); + } + + for (TypeParam input : inputs) { + test_quantiles->AddEntry(input); + } + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + double last_result = std::numeric_limits::lowest(); + for (int i = 0; i < kNumRanksToTest; ++i) { + double quantile = static_cast(i) / kNumRanksToTest; + + EXPECT_GE(results.GetQuantile(quantile).value(), last_result); + last_result = results.GetQuantile(quantile).value(); + } +} + +TYPED_TEST(QuantileTreeTest, InvalidRanksReturnErrors) { + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + EXPECT_THAT(results.GetQuantile(-0.5), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("quantile must be in [0, 1]"))); + 
EXPECT_THAT(results.GetQuantile(1.5), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("quantile must be in [0, 1]"))); +} + +TYPED_TEST(QuantileTreeTest, SerializeMergeTest) { + std::unique_ptr> test_quantiles1 = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + std::unique_ptr> test_quantiles2 = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + absl::BitGen gen; + std::vector first_inputs; + std::vector second_inputs; + for (int i = 0; i < kDefaultDatasetSize; ++i) { + TypeParam input = absl::Uniform(gen, -25, 25); + if (i < kDefaultDatasetSize / 2) { + first_inputs.push_back(input); + } else { + second_inputs.push_back(input); + } + } + + for (TypeParam input : first_inputs) { + test_quantiles1->AddEntry(input); + } + + EXPECT_OK(test_quantiles2->Merge(test_quantiles1->Serialize())); + + for (TypeParam input : second_inputs) { + test_quantiles1->AddEntry(input); + test_quantiles2->AddEntry(input); + } + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results1 = + test_quantiles1->MakePrivate(dp_params).value(); + + typename QuantileTree::Privatized results2 = + test_quantiles2->MakePrivate(dp_params).value(); + + for (int i = 0; i < kNumRanksToTest; ++i) { + double quantile = static_cast(i) / kNumRanksToTest; + + EXPECT_EQ(results1.GetQuantile(quantile).value(), + results2.GetQuantile(quantile).value()); + } +} + +TEST(QuantileTreeTest, MergeFailsWithBadBounds) { + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(50) + 
.SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + std::unique_ptr> wrong_lower = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-49) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + std::unique_ptr> wrong_upper = + typename QuantileTree::Builder() + .SetUpper(49) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + EXPECT_THAT(wrong_lower->Merge(test_quantiles->Serialize()), + StatusIs(absl::StatusCode::kInternal, HasSubstr("Bounds"))); + EXPECT_THAT(wrong_upper->Merge(test_quantiles->Serialize()), + StatusIs(absl::StatusCode::kInternal, HasSubstr("Bounds"))); +} + +TYPED_TEST(QuantileTreeTest, Reset) { + std::unique_ptr> test_quantiles1 = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + std::unique_ptr> test_quantiles2 = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + absl::BitGen gen; + std::vector first_inputs; + std::vector second_inputs; + for (int i = 0; i < kDefaultDatasetSize; ++i) { + TypeParam input = absl::Uniform(gen, -25, 25); + if (i < kDefaultDatasetSize / 2) { + first_inputs.push_back(input); + } else { + second_inputs.push_back(input); + } + } + + for (TypeParam input : first_inputs) { + test_quantiles1->AddEntry(input); + } + + test_quantiles1->Reset(); + + for (TypeParam input : second_inputs) { + test_quantiles1->AddEntry(input); + test_quantiles2->AddEntry(input); + } + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results1 = + 
test_quantiles1->MakePrivate(dp_params).value(); + + typename QuantileTree::Privatized results2 = + test_quantiles2->MakePrivate(dp_params).value(); + + for (int i = 0; i < kNumRanksToTest; ++i) { + double quantile = static_cast(i) / kNumRanksToTest; + + EXPECT_EQ(results1.GetQuantile(quantile).value(), + results2.GetQuantile(quantile).value()); + } +} + +TEST(QuantileTreeTest, IgnoresNaN) { + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + test_quantiles->AddEntry(5.0); + typename QuantileTree::Privatized results1 = + test_quantiles->MakePrivate(dp_params).value(); + + for (int i = 0; i < 100; ++i) { + test_quantiles->AddEntry(std::nan("")); + } + + typename QuantileTree::Privatized results2 = + test_quantiles->MakePrivate(dp_params).value(); + + for (int i = 0; i < kNumRanksToTest; ++i) { + double quantile = static_cast(i) / kNumRanksToTest; + + EXPECT_EQ(results1.GetQuantile(quantile).value(), + results2.GetQuantile(quantile).value()); + } +} + +TEST(QuantileTreeTest, TreeOverflowsWithInputs) { + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + QuantileTreeTestPeer::AddMultipleEntries( + 25, std::numeric_limits::max(), test_quantiles.get()); + test_quantiles->AddEntry(25); + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + 
kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + // With no noise and no overflow, we should always get 25. With overflow, + // those nodes should register as empty, meaning the whole tree will be empty, + // meaning the median should be 0 (middle of the range). + EXPECT_EQ(results.GetQuantile(0.5).value(), 0); +} + +TEST(QuantileTreeTest, TreeOverflowsWithNoise) { + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + QuantileTreeTestPeer::AddMultipleEntries( + 25, std::numeric_limits::max(), test_quantiles.get()); + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + // All the entries are in one leaf node. If the counts don't overflow, they + // should always be much larger than all the noisy zeroes, and so we should + // ~always get a median in that bucket. If the count can overflow, it should + // do so ~50% of the time. If the count overflows, the total count should + // be negative, and we should conclude the tree is empty. Therefore, we will + // pick the middle of the range (0). + // + // That means that if overflows can happen, we should get a 0 ~50% of the + // time. Given 10^3 tries, an event with p=.5 should ~always occur. 
+ for (int i = 0; i < 1e3; ++i) { + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + if (results.GetQuantile(0.5).value() == 0) { + // An overflow occurred, so we can return from the test with a success. + return; + } + } + FAIL() << "No overflow occurred after 1e3 iterations."; +} + +TYPED_TEST(QuantileTreeTest, PrivatizedConstantWithExtraInput) { + std::unique_ptr> test_quantiles = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + for (int i = 0; i < 100; ++i) { + test_quantiles->AddEntry(-25); + } + + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = kTestDefaultEpsilon; + dp_params.delta = kDefaultDelta; + dp_params.max_contributions_per_partition = + kDefaultMaxContributionsPerPartition; + dp_params.max_partitions_contributed_to = kDefaultMaxPartitionsContributed; + dp_params.mechanism_builder = absl::make_unique(); + + typename QuantileTree::Privatized results = + test_quantiles->MakePrivate(dp_params).value(); + + double tolerance = 0.01; // > upper - lower / branchingFactor ^ treeHeight + EXPECT_NEAR(results.GetQuantile(0.5).value(), -25, tolerance); + + for (int i = 0; i < 1000; ++i) { + test_quantiles->AddEntry(25); + } + + EXPECT_NEAR(results.GetQuantile(0.5).value(), -25, tolerance); +} + +TYPED_TEST(QuantileTreeTest, MemoryUsed) { + std::unique_ptr> empty = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + std::unique_ptr> once = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + std::unique_ptr> twice = + typename QuantileTree::Builder() + .SetUpper(50) + .SetLower(-50) + .SetTreeHeight(4) + .SetBranchingFactor(10) + .Build() + .value(); + + once->AddEntry(-49); + twice->AddEntry(49); + twice->AddEntry(49); + 
EXPECT_GT(once->MemoryUsed(), empty->MemoryUsed()); + EXPECT_EQ(once->MemoryUsed(), twice->MemoryUsed()); +} + +} // namespace +} // namespace differential_privacy diff --git a/cc/algorithms/quantiles.h b/cc/algorithms/quantiles.h new file mode 100644 index 00000000..39d40db9 --- /dev/null +++ b/cc/algorithms/quantiles.h @@ -0,0 +1,209 @@ +// +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifndef DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_QUANTILES_H_ +#define DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_QUANTILES_H_ + +#include "absl/status/status.h" +#include "base/statusor.h" +#include "algorithms/algorithm.h" +#include "algorithms/bounded-algorithm.h" +#include "algorithms/quantile-tree.h" + +namespace differential_privacy { + +// Calculates multiple differentially private quantiles. Currently implemented +// using the quantile tree mechanism, see quantile-tree.h for more about the +// mechanism. The set of quantiles to be calculated are specified when building +// the algorithm. +// +// Each element of the output represents the value of one of the requested +// quantiles, in the same order as they were requested when building the +// algorithm. +// +// When constructing a MultiQuantile, upper and lower bounds on the input +// must be explicitly specified. MultiQuantile does not support ApproxBounds. 
+template +class Quantiles : public Algorithm { + static_assert(std::is_arithmetic::value, + "BoundedSum can only be used for arithmetic types"); + + public: + class Builder; + + void AddEntry(const T& t) override { return tree_->AddEntry(t); } + + Summary Serialize() const override { + Summary to_return; + to_return.mutable_data()->PackFrom(tree_->Serialize()); + return to_return; + } + + absl::Status Merge(const Summary& summary) { + if (!summary.has_data()) { + return absl::InternalError( + "Cannot merge summary with no bounded quantiles data"); + } + + BoundedQuantilesSummary quantiles_summary; + if (!summary.data().UnpackTo(&quantiles_summary)) { + return absl::InternalError( + "Bounded quantiles summary could not be unpacked."); + } + return tree_->Merge(quantiles_summary); + } + + int64_t MemoryUsed() override { + return tree_->MemoryUsed() + sizeof(Quantiles) + + sizeof(NumericalMechanismBuilder) + + sizeof(double) * quantiles_.capacity(); + } + + protected: + base::StatusOr GenerateResult(double privacy_budget, + double noise_interval_level) override { + typename QuantileTree::DPParams dp_params; + dp_params.epsilon = Algorithm::GetEpsilon() * privacy_budget; + dp_params.delta = Algorithm::GetDelta() * privacy_budget; + dp_params.max_contributions_per_partition = + max_contributions_per_partition_; + dp_params.max_partitions_contributed_to = max_partitions_contributed_to_; + dp_params.mechanism_builder = mechanism_builder_->Clone(); + base::StatusOr::Privatized> result = + tree_->MakePrivate(dp_params); + if (!result.ok()) { + return result.status(); + } + typename QuantileTree::Privatized privatized_tree = + std::move(result.value()); + + Output output; + for (double quantile : quantiles_) { + double result; + ASSIGN_OR_RETURN(result, privatized_tree.GetQuantile(quantile)); + AddToOutput(&output, result); + } + return output; + } + + void ResetState() override { tree_->Reset(); } + + private: + Quantiles(std::unique_ptr> tree, + std::vector quantiles, 
double epsilon, double delta, + int max_contributions_per_partition, + int max_partitions_contributed_to, + std::unique_ptr mechanism_builder) + : Algorithm(epsilon, delta), + tree_(std::move(tree)), + quantiles_(quantiles), + max_contributions_per_partition_(max_contributions_per_partition), + max_partitions_contributed_to_(max_partitions_contributed_to), + mechanism_builder_(std::move(mechanism_builder)) {} + + std::unique_ptr> tree_; + int max_contributions_per_partition_; + int max_partitions_contributed_to_; + std::unique_ptr mechanism_builder_; + std::vector quantiles_; +}; + +template +class Quantiles::Builder + : public AlgorithmBuilder, Quantiles::Builder> { + using AlgorithmBuilder = + differential_privacy::AlgorithmBuilder, + Quantiles::Builder>; + + public: + Quantiles::Builder& SetLower(T lower) { + lower_ = lower; + return *this; + } + Quantiles::Builder& SetUpper(T upper) { + upper_ = upper; + return *this; + } + + // The list of quantiles to be produced. It is required; the algorithm will + // fail to build without a list of quantiles. If this method is called + // more than once, it will overwrite any previous list of quantiles rather + // than appending to it. 
+ Quantiles::Builder& SetQuantiles(const std::vector& quantiles) { + quantiles_ = quantiles; + return *this; + } + + protected: + base::StatusOr>> BuildAlgorithm() override { + typename QuantileTree::Builder tree_builder; + + if (lower_.has_value()) { + tree_builder.SetLower(lower_.value()); + } + if (upper_.has_value()) { + tree_builder.SetUpper(upper_.value()); + } + std::unique_ptr> tree; + ASSIGN_OR_RETURN(tree, tree_builder.Build()); + + if (quantiles_.empty()) { + return absl::InvalidArgumentError( + "You must specify at least one quantile to calculate."); + } + for (double quantile : quantiles_) { + if (quantile < 0 || quantile > 1) { + return absl::InvalidArgumentError( + "All quantiles to calculate must be in [0, 1]."); + } + } + + // Try building a numerical mechanism so we can return an error now if any + // parameters are invalid. Otherwise, the error wouldn't be returned until + // we call MakePrivate in GenerateResult. + std::unique_ptr mech_builder_clone = + AlgorithmBuilder::GetMechanismBuilderClone(); + if (AlgorithmBuilder::GetEpsilon().has_value()) { + mech_builder_clone->SetEpsilon(AlgorithmBuilder::GetEpsilon().value()); + } + if (AlgorithmBuilder::GetDelta().has_value()) { + mech_builder_clone->SetDelta(AlgorithmBuilder::GetDelta().value()); + } + mech_builder_clone + ->SetLInfSensitivity( + AlgorithmBuilder::GetMaxContributionsPerPartition().value_or(1)) + .SetL0Sensitivity( + AlgorithmBuilder::GetMaxPartitionsContributed().value_or(1) * + tree->GetHeight()); + RETURN_IF_ERROR(mech_builder_clone->Build().status()); + + return std::unique_ptr(new Quantiles( + std::move(tree), quantiles_, AlgorithmBuilder::GetEpsilon().value(), + AlgorithmBuilder::GetDelta().value_or(0), + AlgorithmBuilder::GetMaxContributionsPerPartition().value_or(1), + AlgorithmBuilder::GetMaxPartitionsContributed().value_or(1), + AlgorithmBuilder::GetMechanismBuilderClone())); + } + + private: + absl::optional lower_; + absl::optional upper_; + std::vector quantiles_; +}; + 
+} // namespace differential_privacy + +#endif // DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_QUANTILES_H_ diff --git a/cc/algorithms/quantiles_test.cc b/cc/algorithms/quantiles_test.cc new file mode 100644 index 00000000..f7d7bc22 --- /dev/null +++ b/cc/algorithms/quantiles_test.cc @@ -0,0 +1,401 @@ +// +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "algorithms/quantiles.h" + +#include + +#include "base/testing/proto_matchers.h" +#include "base/testing/status_matchers.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/random/random.h" +#include "algorithms/numerical-mechanisms-testing.h" + +namespace differential_privacy { +namespace { + +using ::differential_privacy::test_utils::ZeroNoiseMechanism; +using ::testing::HasSubstr; +using ::differential_privacy::base::testing::StatusIs; + +const int kDefaultDatasetSize = 1001; +const int kNumRanksToTest = 10; + +template +class QuantilesTest : public ::testing::Test { + protected: + void SetUp() override {} + void TearDown() override {} +}; + +typedef ::testing::Types NumericTypes; +TYPED_TEST_SUITE(QuantilesTest, NumericTypes); + +TEST(QuantilesTest, InvalidParametersTest) { + EXPECT_THAT(Quantiles::Builder() + .SetLower(0) + .SetUpper(1) + .SetQuantiles({}) + .Build(), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("must specify at least one quantile"))); + EXPECT_THAT(Quantiles::Builder() + .SetLower(0) + .SetUpper(1) + 
.SetQuantiles({-1}) + .Build(), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("quantiles to calculate must be in [0, 1]"))); + EXPECT_THAT(Quantiles::Builder() + .SetLower(0) + .SetUpper(1) + .SetQuantiles({1.5}) + .Build(), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("quantiles to calculate must be in [0, 1]"))); + EXPECT_THAT(Quantiles::Builder() + .SetLower(2) + .SetUpper(1) + .SetQuantiles({0.5}) + .Build(), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Lower bound must be less than upper"))); + EXPECT_THAT(Quantiles::Builder().SetQuantiles({0.5}).Build(), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Lower and upper bounds must both be set"))); +} + +// Input must be sorted. +template +double TrueQuantileFromSorted(std::vector inputs, double quantile) { + int rank = std::round(quantile * (inputs.size() - 1)); + return inputs[rank]; +} + +TYPED_TEST(QuantilesTest, ApproximatesTrueQuantile) { + std::vector quantiles; + for (int i = 0; i < kNumRanksToTest; ++i) { + quantiles.push_back(static_cast(i) / kNumRanksToTest); + } + + std::unique_ptr> test_quantiles = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles(quantiles) + .SetLaplaceMechanism(absl::make_unique()) + .Build() + .value(); + + std::vector inputs; + for (int i = 0; i < kDefaultDatasetSize; ++i) { + inputs.push_back(absl::Uniform(absl::BitGen(), -25, 25)); + } + + for (TypeParam input : inputs) { + test_quantiles->AddEntry(input); + } + + double tolerance = 0.01; // > upper - lower / branchingFactor ^ treeHeight + std::sort(inputs.begin(), inputs.end()); + + Output results = test_quantiles->PartialResult().value(); + + for (int i = 0; i < quantiles.size(); ++i) { + double quantile = quantiles[i]; + + EXPECT_NEAR(results.elements(i).value().float_value(), + TrueQuantileFromSorted(inputs, quantile), tolerance); + } +} + +TYPED_TEST(QuantilesTest, InputOrderInvariant) { + std::vector quantiles; + for (int i = 0; i < 
kNumRanksToTest; ++i) { + quantiles.push_back(static_cast(i) / kNumRanksToTest); + } + + std::unique_ptr> test_quantiles1 = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles(quantiles) + .SetLaplaceMechanism(absl::make_unique()) + .Build() + .value(); + std::unique_ptr> test_quantiles2 = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles(quantiles) + .SetLaplaceMechanism(absl::make_unique()) + .Build() + .value(); + + absl::BitGen gen; + std::vector inputs; + for (int i = 0; i < kDefaultDatasetSize; ++i) { + inputs.push_back(absl::Uniform(gen, -25, 25)); + } + + for (TypeParam input : inputs) { + test_quantiles1->AddEntry(input); + } + std::shuffle(inputs.begin(), inputs.end(), gen); + for (TypeParam input : inputs) { + test_quantiles2->AddEntry(input); + } + + Output results1 = test_quantiles1->PartialResult().value(); + Output results2 = test_quantiles2->PartialResult().value(); + + EXPECT_THAT(results1, ::differential_privacy::base::testing::EqualsProto(results2)); +} + +// This should hold even with noise enabled. 
+TYPED_TEST(QuantilesTest, ResultsIncreaseMonotonically) { + std::vector quantiles; + for (int i = 0; i < kNumRanksToTest; ++i) { + quantiles.push_back(static_cast(i) / kNumRanksToTest); + } + + std::unique_ptr> test_quantiles = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles(quantiles) + .SetLaplaceMechanism(absl::make_unique()) + .Build() + .value(); + + absl::BitGen gen; + std::vector inputs; + for (int i = 0; i < kDefaultDatasetSize; ++i) { + inputs.push_back(absl::Uniform(gen, -25, 25)); + } + + for (TypeParam input : inputs) { + test_quantiles->AddEntry(input); + } + + Output result = test_quantiles->PartialResult().value(); + + double last_result = std::numeric_limits::lowest(); + for (int i = 0; i < result.elements_size(); ++i) { + EXPECT_GE(result.elements(i).value().float_value(), last_result); + last_result = result.elements(i).value().float_value(); + } +} + +TYPED_TEST(QuantilesTest, SerializeMergeTest) { + std::vector quantiles; + for (int i = 0; i < kNumRanksToTest; ++i) { + quantiles.push_back(static_cast(i) / kNumRanksToTest); + } + + std::unique_ptr> test_quantiles1 = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles(quantiles) + .SetLaplaceMechanism(absl::make_unique()) + .Build() + .value(); + std::unique_ptr> test_quantiles2 = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles(quantiles) + .SetLaplaceMechanism(absl::make_unique()) + .Build() + .value(); + + absl::BitGen gen; + std::vector first_inputs; + std::vector second_inputs; + for (int i = 0; i < kDefaultDatasetSize; ++i) { + TypeParam input = absl::Uniform(gen, -25, 25); + if (i < kDefaultDatasetSize / 2) { + first_inputs.push_back(input); + } else { + second_inputs.push_back(input); + } + } + + for (TypeParam input : first_inputs) { + test_quantiles1->AddEntry(input); + } + + EXPECT_OK(test_quantiles2->Merge(test_quantiles1->Serialize())); + + for (TypeParam input : second_inputs) { + 
test_quantiles1->AddEntry(input); + test_quantiles2->AddEntry(input); + } + + Output results1 = test_quantiles1->PartialResult().value(); + + Output results2 = test_quantiles2->PartialResult().value(); + + EXPECT_THAT(results1, ::differential_privacy::base::testing::EqualsProto(results2)); +} + +TEST(QuantilesTest, MergeFailsWithBadBounds) { + std::unique_ptr> test_quantiles = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles({0.5}) + .Build() + .value(); + std::unique_ptr> wrong_lower = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-49) + .SetQuantiles({0.5}) + .Build() + .value(); + std::unique_ptr> wrong_upper = + typename Quantiles::Builder() + .SetUpper(49) + .SetLower(-50) + .SetQuantiles({0.5}) + .Build() + .value(); + + EXPECT_THAT(wrong_lower->Merge(test_quantiles->Serialize()), + StatusIs(absl::StatusCode::kInternal, HasSubstr("Bounds"))); + EXPECT_THAT(wrong_upper->Merge(test_quantiles->Serialize()), + StatusIs(absl::StatusCode::kInternal, HasSubstr("Bounds"))); +} + +TEST(QuantilesTest, IgnoresNaN) { + std::unique_ptr> test_quantiles1 = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles({0.5}) + .SetLaplaceMechanism(absl::make_unique()) + .Build() + .value(); + + std::unique_ptr> test_quantiles2 = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles({0.5}) + .SetLaplaceMechanism(absl::make_unique()) + .Build() + .value(); + + test_quantiles1->AddEntry(5.0); + test_quantiles2->AddEntry(5.0); + + for (int i = 0; i < 100; ++i) { + test_quantiles2->AddEntry(std::nan("")); + } + + Output results1 = test_quantiles1->PartialResult().value(); + Output results2 = test_quantiles2->PartialResult().value(); + + EXPECT_THAT(results1, ::differential_privacy::base::testing::EqualsProto(results2)); +} + +TYPED_TEST(QuantilesTest, MemoryUsed) { + std::unique_ptr> empty = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles({0.5}) + .Build() 
+ .value(); + std::unique_ptr> once = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles({0.5}) + .Build() + .value(); + std::unique_ptr> twice = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles({0.5}) + .Build() + .value(); + + once->AddEntry(-49); + twice->AddEntry(49); + twice->AddEntry(49); + EXPECT_GT(once->MemoryUsed(), empty->MemoryUsed()); + EXPECT_EQ(once->MemoryUsed(), twice->MemoryUsed()); +} + +TYPED_TEST(QuantilesTest, Reset) { + std::vector quantiles; + for (int i = 0; i < kNumRanksToTest; ++i) { + quantiles.push_back(static_cast(i) / kNumRanksToTest); + } + + std::unique_ptr> test_quantiles1 = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles(quantiles) + .SetLaplaceMechanism(absl::make_unique()) + .Build() + .value(); + std::unique_ptr> test_quantiles2 = + typename Quantiles::Builder() + .SetUpper(50) + .SetLower(-50) + .SetQuantiles(quantiles) + .SetLaplaceMechanism(absl::make_unique()) + .Build() + .value(); + + std::vector first_inputs; + std::vector second_inputs; + for (int i = 0; i < kDefaultDatasetSize; ++i) { + TypeParam input = absl::Uniform(absl::BitGen(), -25, 25); + if (i < kDefaultDatasetSize / 2) { + first_inputs.push_back(input); + } else { + second_inputs.push_back(input); + } + } + + for (TypeParam input : first_inputs) { + test_quantiles1->AddEntry(input); + } + + test_quantiles1->Reset(); + + for (TypeParam input : second_inputs) { + test_quantiles1->AddEntry(input); + test_quantiles2->AddEntry(input); + } + + Output results1 = test_quantiles1->PartialResult().value(); + + Output results2 = test_quantiles2->PartialResult().value(); + + EXPECT_THAT(results1, ::differential_privacy::base::testing::EqualsProto(results2)); +} + +} // namespace +} // namespace differential_privacy diff --git a/cc/base/testing/BUILD b/cc/base/testing/BUILD index 75366fef..4e9bf62f 100644 --- a/cc/base/testing/BUILD +++ b/cc/base/testing/BUILD @@ -29,6 +29,7 @@ 
cc_library( hdrs = ["proto_matchers.h"], deps = [ "@com_google_googletest//:gtest_main", + "//base:logging", "@com_google_absl//absl/strings", "@com_google_protobuf//:protobuf", ], diff --git a/cc/base/testing/proto_matchers.h b/cc/base/testing/proto_matchers.h index 4186371e..15f1a0c5 100644 --- a/cc/base/testing/proto_matchers.h +++ b/cc/base/testing/proto_matchers.h @@ -24,6 +24,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/strings/string_view.h" +#include "base/logging.h" namespace differential_privacy { namespace base { diff --git a/cc/docs/algorithms/order-statistics.md b/cc/docs/algorithms/order-statistics.md index a5942ecc..e7a79cc7 100644 --- a/cc/docs/algorithms/order-statistics.md +++ b/cc/docs/algorithms/order-statistics.md @@ -1,6 +1,9 @@ # Order Statistics +WARNING: These algorithms are deprecated and may be removed soon. Please use +[Quantiles](quantiles.md) (which is more accurate) instead. + We have a set of algorithms for calculating [order statistics](https://github.com/google/differential-privacy/blob/main/cc/algorithms/order-statistics.h) (aka quantiles, percentiles). The following are supported: diff --git a/cc/docs/algorithms/quantiles.md b/cc/docs/algorithms/quantiles.md new file mode 100644 index 00000000..df1b5d26 --- /dev/null +++ b/cc/docs/algorithms/quantiles.md @@ -0,0 +1,167 @@ + +# Quantiles + +We have two different pieces of code to calculate quantiles (aka percentiles, or +order statistics): +[QuantileTree](https://github.com/google/differential-privacy/blob/main/cc/algorithms/quantile-tree.h) +offers a tree-based differentially private algorithm with a distinctive +interface, and +[Quantiles](https://github.com/google/differential-privacy/blob/main/cc/algorithms/quantiles.h) +uses it to implement the `Algorithm` interface. Both offer the same accuracy, +performance, and privacy guarantees because they use the same underlying DP +mechanism. 
Both can be used to calculate any quantile (though they are least +accurate close to the maximum and minimum), and both can be used to calculate +any number of quantiles with no loss in accuracy (e.g. if you want to calculate +a median, your result will be equally accurate regardless of whether or not you +choose to calculate additional quantiles). The only difference between the two +is in their interface. + +Note: If you only want to calculate a single quantile, we recommend using +`Quantiles` and requesting only a single quantile. We do not currently have a +more efficient algorithm for single quantiles. If we add a more efficient +algorithm for single quantiles, we will use it whenever a `Quantiles` is created +to find a single quantile. + +## Quantiles + +### Input & Output + +`Quantiles` support any numeric type. Its `Output`s contain one element for each +requested quantile, in the same order as when the `Quantiles` was built. +`ConfidenceInterval` and `BoundingReport` are not provided. + +### Construction + +`Quantiles` is an [`Algorithm`](algorithm.md). It does require an upper and +lower bound, but it does not use [BoundedAlgorithmBuilder](bounded-algorithm.md) +and cannot determine the bounds automatically. Instead, the user must manually +set the bounds or the algorithm will fail to build. + +Note: You can run [`ApproxBounds`](approx-bounds.md) over your dataset to get +bounds if you do not know them in advance, but you must do so manually in a +separate pass over your data (before you construct your `Quantiles`) and you +must manually pass the bounds from your `ApproxBounds` to your `Quantiles`. + +In addition, you must specify a set of quantiles to calculate. Quantiles are +provided as a `vector`. Quantiles will be returned in the same order as +you specify them. If you call `SetQuantiles` multiple times, each call will +overwrite the set of quantiles from the previous call. 
+ +Here is the minimal set of arguments for constructing a Quantiles: + +``` +base::StatusOr<std::unique_ptr<Quantiles<T>>> quantile = + Quantiles<T>::Builder() + .SetLower(lower) + .SetUpper(upper) + .SetQuantiles(quantiles) + .Build(); +``` + +* `T`: The input type, for example `double` or `int64`. +* `T upper, lower`: The upper and lower bounds on each input element. If any + input elements are greater than `upper` or less than `lower`, they will be + replaced with `upper` or `lower` respectively. +* `std::vector<double> quantiles`: The list of quantiles you wish to + calculate. Each quantile should be in the range [0, 1] where 0 will find the + minimum, and 1 the maximum. Can be created inline, e.g. `.SetQuantiles({0.25, + 0.5, 0.75})`. + +### Use + +`Quantiles` is an [`Algorithm`](algorithm.md) and supports its full API, except +for the ability to provide confidence intervals. + +## QuantileTree + +QuantileTree implements a tree-based differentially private quantile calculation +algorithm. For details, see the +[full algorithm writeup](https://github.com/google/differential-privacy/blob/main/common_docs/Differentially_Private_Quantile_Trees.pdf). + +### Construction + +A `QuantileTree` is constructed via a `QuantileTree::Builder`. The builder +supports the following methods: + +``` +base::StatusOr<std::unique_ptr<QuantileTree<T>>> quantile_tree = + QuantileTree<T>::Builder() + .SetLower(lower) + .SetUpper(upper) + .SetTreeHeight(tree_height) // optional + .SetBranchingFactor(branching_factor) // optional + .Build(); +``` + +* `T`: The input type, for example `double` or `int64`. +* `T upper, lower`: The upper and lower bounds for each input element. If any + inputs are greater than `upper` or less than `lower` they will be replaced + with `upper` or `lower` respectively. +* `int tree_height, branching_factor`: These parameters specify the height and + width of the quantile tree. Each node will have `branching_factor` children, + and the tree will be of height `tree_height`. 
The quantile tree assigns each + leaf node to an equal sized portion of the domain [lower, upper], and there + will be `branching_factor` ^ `tree_height` leaf nodes. These parameters are + optional, and have reasonable default values. + +Note that privacy parameters are not specified when constructing the quantile +tree. + +### Input and Output + +Like `Quantiles`, `QuantileTree` supports any numeric type as input. + +To get results, call the `MakePrivate` method. The `MakePrivate` method takes a +struct that contains all of the privacy parameters (these are the same ones used +when constructing an [`Algorithm`](algorithm.md)): + +``` +base::StatusOr<QuantileTree<T>::Privatized> results = + quantile_tree.MakePrivate({ + .epsilon = epsilon, + .delta = delta, + .max_contributions_per_partition = max_contributions, + .max_partitions_contributed_to = max_partitions, + .mechanism_builder = absl::make_unique<LaplaceMechanism::Builder>()}); +``` + +* `double epsilon`: The `epsilon` differential privacy parameter. A smaller + number means more privacy but less accuracy. `epsilon` should be > 0. +* `double delta`: The `delta` differential privacy parameter. A smaller number + means more privacy but less accuracy. `delta` should be in (0, 1). +* `int max_partitions`: The number of aggregations, or 'partitions,' that each + user is allowed to contribute to. Defaults to 1 if unset. The caller must + guarantee that this limit is enforced on the input. The library cannot + enforce it because it cannot distinguish between users or aggregations. Note + that `Algorithm`s that will be merged together are considered part of the + same partition. +* `int max_contributions`: The number of pieces of input to this aggregation + that can belong to a single user. Defaults to 1 if unset. The caller must + guarantee that this limit is enforced on the input. The library cannot + enforce it because it does not know which inputs belong to which users. 
If + summaries from multiple `Algorithm`s are merged together, the total number + of inputs from a single user across all merged `Algorithm`s must not exceed + this limit. +* `std::unique_ptr<NumericalMechanismBuilder> mechanism_builder`: Used to + specify the type of numerical mechanism the algorithm will use to add noise + (e.g. Laplace, Gaussian). In most cases this should not be set (and a + default LaplaceMechanism will be used), but it can be used to remove or mock + noise during testing. + +`MakePrivate` returns a `QuantileTree::Privatized` (or an error status if an +error occurred). A `QuantileTree::Privatized` can be used to calculate any +quantile by calling the `GetQuantile` method. Calling `GetQuantile` consumes no +additional privacy budget. A `QuantileTree::Privatized` may contain internal +non-privatized information, but only privatized information can be accessed +through its API. A `QuantileTree::Privatized` answers questions about the +input that had been added to the `QuantileTree` at the time that +`MakePrivate` was called. After a `QuantileTree::Privatized` has been created +it is no longer tied to the `QuantileTree`, and modifying the +`QuantileTree` has no effect on it. + +## Result Performance + +For both `Quantiles` and `QuantileTree`, calling `Result` has a constant time +complexity (though the value of the constant will depend on the tree +parameters). Space complexity is also constant, and again depends on the tree +parameters. 
diff --git a/cc/testing/BUILD b/cc/testing/BUILD index fd839254..70989a11 100644 --- a/cc/testing/BUILD +++ b/cc/testing/BUILD @@ -99,3 +99,53 @@ cc_test( "@com_google_cc_differential_privacy//base:status", ], ) + +cc_library( + name = "statistical_tests_utils", + testonly = 1, + srcs = ["statistical_tests_utils.cc"], + hdrs = ["statistical_tests_utils.h"], + deps = [ + "//algorithms:rand", + "//algorithms:util", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/random:distributions", + "@com_google_absl//absl/strings", + "@com_google_protobuf//:protobuf", + ], +) + +cc_test( + name = "statistical_tests_utils_test", + srcs = [ + "statistical_tests_utils_test.cc", + ], + deps = [ + ":statistical_tests_utils", + "//base/testing:proto_matchers", + "@com_google_googletest//:gtest_main", + "//algorithms:util", + "@com_google_differential_privacy//proto/testing:statistical_tests_cc_proto", + "@com_google_protobuf//:protobuf", + ], +) + +cc_test( + name = "quantile_tree_dp_tests", + size = "enormous", + srcs = [ + "quantile_tree_dp_test.cc", + ], + data = [ + "@com_google_differential_privacy//proto/testing:bounded_quantiles_dp_test_cases.textproto", + ], + # The shard count must match the number of test cases defined in bounded_quantiles_dp_test_cases.textproto + shard_count = 39, + deps = [ + ":statistical_tests_utils", + "@com_google_googletest//:gtest_main", + "//algorithms:numerical-mechanisms", + "//algorithms:quantile-tree", + "@com_google_differential_privacy//proto/testing:statistical_tests_cc_proto", + ], +) diff --git a/cc/testing/quantile_tree_dp_test.cc b/cc/testing/quantile_tree_dp_test.cc new file mode 100644 index 00000000..3c621f18 --- /dev/null +++ b/cc/testing/quantile_tree_dp_test.cc @@ -0,0 +1,168 @@ +// +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "algorithms/numerical-mechanisms.h" +#include "algorithms/quantile-tree.h" +#include "testing/statistical_tests_utils.h" +#include "proto/testing/statistical_tests.pb.h" + +namespace differential_privacy { +namespace testing { +namespace { + +using differential_privacy::QuantileTree; +using ::testing::BoundedQuantilesDpTestCase; +using ::testing::BoundedQuantilesDpTestCaseCollection; +using ::testing::BoundedQuantilesSamplingParameters; +using ::testing::DpTestParameters; +using ::testing::Message; + +class QuantileTreeDpTest + : public ::testing::TestWithParam {}; + +constexpr char kTestCaseProtoPath[] = + "external/com_google_differential_privacy/proto/testing/" + "bounded_quantiles_dp_test_cases.textproto"; + +static bool GenerateVote( + std::function()> sample_generator_a, + std::function()> sample_generator_b, int num_samples, + int num_ranks, double lower, double upper, double epsilon, double delta, + double delta_tolerance, int num_buckets) { + std::vector> samples_a(num_ranks); + std::vector> samples_b(num_ranks); + + for (int i = 0; i < num_samples; ++i) { + std::vector sample_a = sample_generator_a(); + std::vector sample_b = sample_generator_b(); + for (int j = 0; j < num_ranks; ++j) { + samples_a[j].push_back(Bucketize(sample_a[j], lower, upper, num_buckets)); + samples_b[j].push_back(Bucketize(sample_b[j], lower, upper, num_buckets)); + } + } + + // Only vote to accept if all quantiles pass the test. 
+ for (int j = 0; j < num_ranks; ++j) { + if (!VerifyApproximateDp(samples_a[j], samples_b[j], epsilon, delta, + delta_tolerance)) { + return false; + } + } + return true; +} + +// Execute a test case from bounded_quantiles_dp_test_cases.textproto. +// We set up quantile trees as the parameters specify, add the specified inputs, +// and then check that the output distribution does not violate the differential +// privacy definition. +TEST_P(QuantileTreeDpTest, RunTestCasesAndCountVotes) { + BoundedQuantilesDpTestCase test_case = GetParam(); + SCOPED_TRACE(Message() << "Test case " << test_case.name()); + + BoundedQuantilesSamplingParameters sampling_params = + test_case.bounded_quantiles_sampling_parameters(); + DpTestParameters dp_test_params = test_case.dp_test_parameters(); + + QuantileTree::Builder quantile_builder; + quantile_builder.SetTreeHeight(sampling_params.tree_height()) + .SetBranchingFactor(sampling_params.branching_factor()) + .SetLower(sampling_params.lower_bound()) + .SetUpper(sampling_params.upper_bound()); + + std::unique_ptr> tree = quantile_builder.Build().value(); + for (double raw_entry : sampling_params.raw_entry()) { + tree->AddEntry(raw_entry); + } + + std::unique_ptr> neighbor_tree = + quantile_builder.Build().value(); + for (double neighbor_entry : sampling_params.neighbour_raw_entry()) { + neighbor_tree->AddEntry(neighbor_entry); + } + + QuantileTree::DPParams dp_params; + dp_params.epsilon = sampling_params.epsilon(); + dp_params.max_contributions_per_partition = + sampling_params.max_contributions_per_partition(); + dp_params.max_partitions_contributed_to = + sampling_params.max_partitions_contributed(); + + switch (sampling_params.noise_type()) { + case ::testing::NoiseType::LAPLACE: + dp_params.mechanism_builder = + absl::make_unique(); + dp_params.delta = 0; + break; + case ::testing::NoiseType::GAUSSIAN: + dp_params.mechanism_builder = + absl::make_unique(); + dp_params.delta = sampling_params.delta(); + break; + default: + FAIL() 
<< "Unknown noise type"; + } + + std::function()> sample_generator = [&tree, &dp_params, + &sampling_params]() { + QuantileTree::Privatized privatized_tree = + tree->MakePrivate(dp_params).value(); + std::vector results; + for (double rank : sampling_params.rank()) { + results.push_back(privatized_tree.GetQuantile(rank).value()); + } + return results; + }; + + std::function()> neighbor_generator = + [&neighbor_tree, &dp_params, &sampling_params]() { + QuantileTree::Privatized privatized_tree = + neighbor_tree->MakePrivate(dp_params).value(); + std::vector results; + for (double rank : sampling_params.rank()) { + results.push_back(privatized_tree.GetQuantile(rank).value()); + } + return results; + }; + + std::function vote_generator = [&sample_generator, + &neighbor_generator, &dp_test_params, + &sampling_params]() { + return GenerateVote( + sample_generator, neighbor_generator, + sampling_params.number_of_samples(), sampling_params.rank_size(), + sampling_params.lower_bound(), sampling_params.upper_bound(), + dp_test_params.epsilon(), dp_test_params.delta(), + dp_test_params.delta_tolerance(), dp_test_params.num_of_buckets()); + }; + + int number_of_votes = + ReadProto(kTestCaseProtoPath) + ->voting_parameters() + .number_of_votes(); + + EXPECT_TRUE(RunBallot(vote_generator, number_of_votes)); +} + +INSTANTIATE_TEST_SUITE_P( + ShardedStatTests, QuantileTreeDpTest, + ValuesIn(ReadProto(kTestCaseProtoPath) + ->bounded_quantiles_dp_test_case())); + +} // namespace +} // namespace testing +} // namespace differential_privacy diff --git a/cc/testing/statistical_tests_utils.cc b/cc/testing/statistical_tests_utils.cc new file mode 100644 index 00000000..1afcf58c --- /dev/null +++ b/cc/testing/statistical_tests_utils.cc @@ -0,0 +1,190 @@ +// +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "testing/statistical_tests_utils.h" + +#include +#include + +#include "absl/random/distributions.h" +#include "algorithms/rand.h" +#include "algorithms/util.h" + +namespace differential_privacy::testing { + +double SampleReferenceLaplacian(double mean, double variance, + SecureURBG* random) { + double b = std::sqrt(variance / 2); + double exp = absl::Exponential(*random, 1 / b); + double flip = absl::Bernoulli(*random, 0.5); + return mean + (flip ? -exp : exp); +} + +namespace { + +using Histogram = absl::flat_hash_map; + +Histogram BuildHistogram(const std::vector& samples) { + Histogram histogram; + for (double sample : samples) ++histogram[sample]; + + return histogram; +} + +// Decides whether two sets of random samples were likely drawn from similar +// discrete distributions up to tolerance l2_tolerance. +// +// The distributions are considered similar if the l2 distance between them is +// less than half the specified l2 tolerance t. Otherwise, if the distance is +// greater than t, they are considered dissimilar. The error probability is at +// most 4014 / (n * t^2), where n is the number of samples contained in one of +// the sets. See (broken link) for more information. 
+bool VerifyCloseness(const std::vector& samples_a, + const std::vector& samples_b, + double l2_tolerance) { + DCHECK(samples_a.size() == samples_b.size()) + << "The sample sets must be of equal size."; + DCHECK(!samples_a.empty()) << "The sample sets must not be empty"; + DCHECK(l2_tolerance > 0) << "The l2 tolerance must be positive"; + DCHECK(l2_tolerance < 1) << "The l2 tolerance should be less than 1"; + + absl::flat_hash_map histogram_a = BuildHistogram(samples_a); + absl::flat_hash_map histogram_b = BuildHistogram(samples_b); + + int64_t self_collision_count_a = 0; + int64_t self_collision_count_b = 0; + int64_t cross_collision_count = 0; + + for (const auto& key_count : histogram_a) { + int64_t count = key_count.second; + self_collision_count_a += (count * (count - 1)) / 2; + } + + for (const auto& key_count : histogram_b) { + int64_t count = key_count.second; + self_collision_count_b += (count * (count - 1)) / 2; + } + + for (const auto& key_count : histogram_a) { + auto it = histogram_b.find(key_count.first); + if (it == histogram_b.end()) continue; + int64_t count_a = key_count.second; + int64_t count_b = it->second; + + cross_collision_count += count_a * count_b; + } + + double test_value = + self_collision_count_a + self_collision_count_b - + ((samples_a.size() - 1.0) / samples_a.size()) * cross_collision_count; + double threshold = (l2_tolerance * (samples_a.size() - 1)) * + (l2_tolerance * samples_a.size()) / 4.0; + return test_value < threshold; +} + +double ComputeAproximateDpTestValue( + absl::flat_hash_map histogram_a, + absl::flat_hash_map histogram_b, double epsilon, + int num_of_samples) { + double test_value = 0; + for (auto it_a = histogram_a.begin(); it_a != histogram_a.end(); ++it_a) { + double sample_count_a = it_a->second; + auto it_b = histogram_b.find(it_a->first); + if (it_b != histogram_b.end()) { + double sample_count_b = it_b->second; + test_value += + std::max(0.0, (sample_count_a - std::exp(epsilon) * sample_count_b) / + 
num_of_samples); + } else { + test_value += sample_count_a / num_of_samples; + } + } + return test_value; +} + +} // namespace + +bool VerifyApproximateDp(const std::vector& samples_a, + const std::vector& samples_b, double epsilon, + double delta, double delta_tolerance) { + DCHECK(samples_a.size() == samples_b.size()) + << "The sample sets must be of equal size."; + DCHECK(!samples_a.empty()) << "The sample sets must not be empty"; + DCHECK(delta_tolerance > 0) << "The delta tolerance must be positive"; + DCHECK(delta_tolerance < 1) << "The delta tolerance should be less than 1"; + DCHECK(epsilon >= 0) << "Epsilon must not be negative"; + DCHECK(delta >= 0) << "Delta must not be negative"; + DCHECK(delta < 1) << "Delta should be less than 1"; + + absl::flat_hash_map histogram_a = BuildHistogram(samples_a); + absl::flat_hash_map histogram_b = BuildHistogram(samples_b); + + double test_value_a = ComputeAproximateDpTestValue(histogram_a, histogram_b, + epsilon, samples_a.size()); + double test_value_b = ComputeAproximateDpTestValue(histogram_b, histogram_a, + epsilon, samples_b.size()); + return test_value_a < delta + delta_tolerance && + test_value_b < delta + delta_tolerance; +} + +bool RunBallot(std::function vote_generator, int number_of_votes) { + DCHECK(number_of_votes > 0) << "The number of votes must be positive"; + int accept_votes = 0; + int reject_votes = 0; + while (std::max(accept_votes, reject_votes) <= number_of_votes / 2) + (vote_generator() ? 
accept_votes : reject_votes)++; + + return accept_votes > reject_votes; +} + +bool GenerateClosenessVote(std::function sample_generator_a, + std::function sample_generator_b, + int number_of_samples, double l2_tolerance, + double granularity) { + std::vector samples_a(number_of_samples); + std::vector samples_b(number_of_samples); + for (int i = 0; i < number_of_samples; i++) { + samples_a[i] = + RoundToNearestDoubleMultiple(sample_generator_a(), granularity); + samples_b[i] = + RoundToNearestDoubleMultiple(sample_generator_b(), granularity); + } + return VerifyCloseness(samples_a, samples_b, l2_tolerance); +} + +bool GenerateApproximateDpVote(std::function sample_generator_a, + std::function sample_generator_b, + int number_of_samples, double epsilon, + double delta, double delta_tolerance, + double granularity) { + std::vector samples_a(number_of_samples); + std::vector samples_b(number_of_samples); + for (int i = 0; i < number_of_samples; ++i) { + samples_a[i] = RoundToNearestMultiple(sample_generator_a(), granularity); + samples_b[i] = RoundToNearestMultiple(sample_generator_b(), granularity); + } + return VerifyApproximateDp(samples_a, samples_b, epsilon, delta, + delta_tolerance); +} + +int Bucketize(double sample, double lower, double upper, int num_buckets) { + return std::max( + 0, std::min(num_buckets - 1, + static_cast(floor(((sample - lower) / (upper - lower)) * + num_buckets)))); +} + +} // namespace differential_privacy::testing diff --git a/cc/testing/statistical_tests_utils.h b/cc/testing/statistical_tests_utils.h new file mode 100644 index 00000000..b952254a --- /dev/null +++ b/cc/testing/statistical_tests_utils.h @@ -0,0 +1,116 @@ +// +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifndef DIFFERENTIAL_PRIVACY_CPP_TESTING_STATISTICAL_TESTS_UTILS_H_ +#define DIFFERENTIAL_PRIVACY_CPP_TESTING_STATISTICAL_TESTS_UTILS_H_ + +#include +#include +#include + +#include "google/protobuf/text_format.h" +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" +#include "algorithms/rand.h" + +namespace differential_privacy::testing { + +// Sample a value from Laplace distribution, implemented with absl random +// generators (i.e. non-secure). +double SampleReferenceLaplacian(double mean, double variance, + SecureURBG* random); + +// Generates number_of_samples samples from both sample_generator_a and +// sample_generator_b generators and decides whether these 2 sets of random +// samples were likely drawn from similar discrete distributions. See +// (broken link) for more information. +bool GenerateClosenessVote(std::function sample_generator_a, + std::function sample_generator_b, + int number_of_samples, double l2_tolerance, + double granularity); + +// Decides whether two sets of random samples were likely drawn from a pair of +// discrete distributions that approximately satisfy (ε,δ) differential privacy. +// +// The two distributions are considered to be (ε,δ) differentially private if +// the likelihood of any event with respect to the first distribution is at most +// δ plus e^ε times the likelihood of the same event in the second distribution +// and vice versa. 
Moreover, the distributions are considered approximately +// (ε,δ) differentially private if there exists a δ' such that the distributions +// are (ε,δ') differentially private and |δ' - δ| is less than half of a given +// tolerance α. Otherwise if no δ' exists such that |δ' - δ| is less than α, the +// distributions are not considered approximately (ε, δ) differentially private. +// Assuming that α > (m / n)^0.5 * (1 + e^(2 * ε)), the error probability is at +// most (1 + e^(2 * ε)) / (n * (α - (m / n)^0.5 * (1 + e^(2 * ε)))^2), where m +// is the size of the support of the distributions and n is the expected value +// of a Poisson distribution from which the number of samples is drawn. See +// (broken link) for more information. +bool VerifyApproximateDp(const std::vector& samples_a, + const std::vector& samples_b, double epsilon, + double delta, double delta_tolerance); + +// Generates number_of_samples samples from both sample_generator_a and +// sample_generator_b generators and decides whether two sets of random samples +// were likely drawn from a pair of discrete distributions that approximately +// satisfy (ε,δ) differential privacy. See (broken link) for more +// information. The test will fail if this pair of samples do not satisfy +// (ε,δ + delta_tolerance) differential privacy. +bool GenerateApproximateDpVote(std::function sample_generator_a, + std::function sample_generator_b, + int number_of_samples, double epsilon, + double delta, double delta_tolerance, + double granularity); + +// Generates number_of_votes of votes from vote_generator to determine a +// majority. Stops early as soon as the majority is clear. Returns the majority. 
+bool RunBallot(std::function vote_generator, int number_of_votes); + +template +std::optional ReadProto(std::istream* proto_file) { + T tests; + std::string serialized_protobuf; + + std::string line; + while (getline(*proto_file, line)) { + absl::StrAppend(&serialized_protobuf, line, "\n"); + } + *proto_file >> serialized_protobuf; + if (!google::protobuf::TextFormat::ParseFromString(serialized_protobuf, &tests)) { + return std::optional(); + } + return tests; +} + +template +std::optional ReadProto(const std::string& path) { + std::ifstream proto_file(path); + if (!proto_file.is_open()) { + return std::optional(); + } + return ReadProto(&proto_file); +} + +// Partitions the interval between lower and upper into num_buckets subintervals +// of equal size and return the index (from 0 to num_buckets - 1) of the +// subinterval that contains the specified sample. +// Samples outside the bounds will be assigned to the lowest or highest bin as +// appropriate. Samples that are exactly equal to bin boundaries will be +// assigned to the higher bin. +int Bucketize(double sample, double lower, double upper, int num_buckets); + +} // namespace differential_privacy::testing + +#endif // DIFFERENTIAL_PRIVACY_CPP_TESTING_STATISTICAL_TESTS_UTILS_H_ diff --git a/cc/testing/statistical_tests_utils_test.cc b/cc/testing/statistical_tests_utils_test.cc new file mode 100644 index 00000000..87f9054f --- /dev/null +++ b/cc/testing/statistical_tests_utils_test.cc @@ -0,0 +1,299 @@ +// +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "testing/statistical_tests_utils.h" + +#include "google/protobuf/text_format.h" +#include "base/testing/proto_matchers.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "algorithms/util.h" +#include "proto/testing/statistical_tests.pb.h" + +namespace differential_privacy::testing { +namespace { + +const double kLowL2Tolerance = 0.000000001; +const double kDefaultL2Tolerance = 0.001; +const double kHighL2Tolerance = 0.5; + +const double kDefaultEpsilon = 1.0; +const double kDefaultDelta = 0.00001; +const double kLowDeltaTolerance = 0.0000000001; +const double kDefaultDeltaTolerance = 0.00001; +const double kHighDeltaTolerance = 0.5; + +const double kNumSamples = 1000000; + +// A callable object that will return the items in its input vector in order. +struct VectorGenerator { + std::vector samples_; + VectorGenerator(std::vector samples) : samples_(samples) {} + double operator()() { + double to_return = *samples_.begin(); + samples_.erase(samples_.begin()); + return to_return; + } +}; + +TEST(ClosenessVoteTest, AcceptsIdenticalSamples) { + std::vector samples = {1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, + 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0}; + + // We use granularity = 1.0 because the samples are already multiples of 1.0. 
+ EXPECT_TRUE(GenerateClosenessVote(VectorGenerator(samples), + VectorGenerator(samples), samples.size(), + kLowL2Tolerance, /*granularity=*/1.0)); + EXPECT_TRUE(GenerateClosenessVote(VectorGenerator(samples), + VectorGenerator(samples), samples.size(), + kHighL2Tolerance, /*granularity=*/1.0)); +} + +TEST(ClosenessVoteTest, RejectsDifferentSamples) { + std::vector samples_a = {1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, + 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0}; + std::vector samples_b = {1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, + 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0}; + + // We use granularity = 1.0 because the samples are already multiples of 1.0. + EXPECT_FALSE(GenerateClosenessVote( + VectorGenerator(samples_a), VectorGenerator(samples_b), samples_a.size(), + kDefaultL2Tolerance, /*granularity=*/1.0)); + EXPECT_FALSE(GenerateClosenessVote( + VectorGenerator(samples_a), VectorGenerator(samples_b), samples_a.size(), + kLowL2Tolerance, /*granularity=*/1.0)); +} + +TEST(ClosenessVoteTest, AcceptsDifferentSamplesWithHighTolernace) { + std::vector samples_a = {1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, + 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0}; + std::vector samples_b = {1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, + 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0}; + + // We use granularity = 1.0 because the samples are already multiples of 1.0. 
+ EXPECT_TRUE(GenerateClosenessVote( + VectorGenerator(samples_a), VectorGenerator(samples_b), samples_a.size(), + kHighL2Tolerance, /*granularity=*/1.0)); +} + +TEST(ClosenessVoteTest, InvariantToSampleOrder) { + std::vector samples_a = {1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, + 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0}; + std::vector samples_a_unsorted = {5.0, 3.0, 2.0, 3.0, 1.0, + 2.0, 5.0, 3.0, 4.0, 5.0, + 5.0, 4.0, 5.0, 4.0, 4.0}; + std::vector samples_b = {1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, + 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0}; + std::vector samples_b_unsorted = {4.0, 1.0, 5.0, 1.0, 4.0, + 2.0, 3.0, 3.0, 1.0, 2.0, + 2.0, 1.0, 3.0, 1.0, 2.0}; + + EXPECT_TRUE(GenerateClosenessVote( + VectorGenerator(samples_a), VectorGenerator(samples_a_unsorted), + samples_a.size(), kDefaultL2Tolerance, /*granularity=*/1.0)); + EXPECT_TRUE(GenerateClosenessVote( + VectorGenerator(samples_a_unsorted), VectorGenerator(samples_a), + samples_a_unsorted.size(), kDefaultL2Tolerance, /*granularity=*/1.0)); + EXPECT_FALSE(GenerateClosenessVote( + VectorGenerator(samples_a), VectorGenerator(samples_b_unsorted), + samples_a.size(), kDefaultL2Tolerance, /*granularity=*/1.0)); + EXPECT_FALSE(GenerateClosenessVote( + VectorGenerator(samples_a_unsorted), VectorGenerator(samples_b), + samples_a_unsorted.size(), kDefaultL2Tolerance, /*granularity=*/1.0)); + EXPECT_FALSE(GenerateClosenessVote( + VectorGenerator(samples_a_unsorted), VectorGenerator(samples_b_unsorted), + samples_a_unsorted.size(), kDefaultL2Tolerance, /*granularity=*/1.0)); +} + +TEST(ApproximateDpVoteTest, AcceptsIdenticalSamples) { + std::vector samples = {1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, + 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0}; + + // Identical sample sets should accept with epsilon and delta = 0, and almost + // any delta tolerance. We use granularity = 1.0 because the samples are + // already multiples of 1.0. 
+ EXPECT_TRUE(GenerateApproximateDpVote( + VectorGenerator(samples), VectorGenerator(samples), samples.size(), + /*epsilon=*/0.0, /*delta=*/0.0, kLowDeltaTolerance, /*granularity=*/1.0)); + EXPECT_TRUE(GenerateApproximateDpVote( + VectorGenerator(samples), VectorGenerator(samples), samples.size(), + /*epsilon=*/0.0, /*delta=*/0.0, kHighL2Tolerance, /*granularity=*/1.0)); +} + +TEST(ApproximateDpVoteTest, RejectsDifferentSamples) { + std::vector samples_a = {1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, + 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0}; + std::vector samples_b = {1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, + 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0}; + + // We use granularity = 1.0 because the samples are already multiples of 1.0. + EXPECT_FALSE(GenerateApproximateDpVote( + VectorGenerator(samples_a), VectorGenerator(samples_b), samples_a.size(), + kDefaultEpsilon, kDefaultDelta, kDefaultDeltaTolerance, + /*granularity=*/1.0)); + EXPECT_FALSE(GenerateApproximateDpVote( + VectorGenerator(samples_a), VectorGenerator(samples_b), samples_a.size(), + kDefaultEpsilon, kDefaultDelta, kLowDeltaTolerance, /*granularity=*/1.0)); +} + +TEST(ApproximateDpVoteTest, AcceptsDifferentSamplesWithHighTolerance) { + std::vector samples_a = {1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, + 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0}; + std::vector samples_b = {1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, + 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0}; + + // We use granularity = 1.0 because the samples are already multiples of 1.0. 
+ EXPECT_TRUE(GenerateApproximateDpVote( + VectorGenerator(samples_a), VectorGenerator(samples_b), samples_a.size(), + kDefaultEpsilon, kDefaultDelta, kHighDeltaTolerance, + /*granularity=*/1.0)); +} + +TEST(ApproximateDpVoteTest, InvariantToSampleOrder) { + std::vector samples_a = {1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, + 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0}; + std::vector samples_a_unsorted = {5.0, 3.0, 2.0, 3.0, 1.0, + 2.0, 5.0, 3.0, 4.0, 5.0, + 5.0, 4.0, 5.0, 4.0, 4.0}; + std::vector samples_b = {1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, + 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0}; + std::vector samples_b_unsorted = {4.0, 1.0, 5.0, 1.0, 4.0, + 2.0, 3.0, 3.0, 1.0, 2.0, + 2.0, 1.0, 3.0, 1.0, 2.0}; + + // Identical sample sets should accept with epsilon and delta = 0, and almost + // any delta tolerance. We use granularity = 1.0 because the samples are + // already multiples of 1.0. + EXPECT_TRUE(GenerateApproximateDpVote( + VectorGenerator(samples_a), VectorGenerator(samples_a_unsorted), + samples_a.size(), + /*epsilon=*/0.0, /*delta=*/0.0, kDefaultL2Tolerance, + /*granularity=*/1.0)); + EXPECT_TRUE(GenerateApproximateDpVote( + VectorGenerator(samples_a_unsorted), VectorGenerator(samples_a), + samples_a_unsorted.size(), + /*epsilon=*/0.0, /*delta=*/0.0, kDefaultL2Tolerance, + /*granularity=*/1.0)); + EXPECT_FALSE(GenerateApproximateDpVote( + VectorGenerator(samples_a), VectorGenerator(samples_b_unsorted), + samples_a.size(), kDefaultEpsilon, kDefaultDelta, kDefaultL2Tolerance, + /*granularity=*/1.0)); + EXPECT_FALSE(GenerateApproximateDpVote( + VectorGenerator(samples_a_unsorted), VectorGenerator(samples_b), + samples_a_unsorted.size(), kDefaultEpsilon, kDefaultDelta, + kDefaultL2Tolerance, /*granularity=*/1.0)); + EXPECT_FALSE(GenerateApproximateDpVote( + VectorGenerator(samples_a_unsorted), VectorGenerator(samples_b_unsorted), + samples_a_unsorted.size(), kDefaultEpsilon, kDefaultDelta, + kDefaultL2Tolerance, /*granularity=*/1.0)); +} + +TEST(RunBallotTest, 
AcceptsMajorityTrue) { + std::vector votes = {true, true, true, true, false, false, false}; + auto vote_it = votes.begin(); + std::function vote_generator = [&vote_it]() { return *(vote_it++); }; + EXPECT_TRUE(RunBallot(vote_generator, votes.size())); +} + +TEST(RunBallotTest, RejectsMajorityFalse) { + std::vector votes = {true, true, true, false, false, false, false}; + auto vote_it = votes.begin(); + std::function vote_generator = [&vote_it]() { return *(vote_it++); }; + EXPECT_FALSE(RunBallot(vote_generator, votes.size())); +} + +TEST(ReferenceLaplaceTest, HasAccurateStatisticalProperties) { + double mean = 0.0; + double variance = 2.0; + + std::vector samples; + for (int i = 0; i < kNumSamples; ++i) { + samples.push_back( + SampleReferenceLaplacian(mean, variance, &SecureURBG::GetSingleton())); + } + + EXPECT_NEAR(Mean(samples), mean, 0.1); + EXPECT_NEAR(Variance(samples), variance, 0.5); +} + +TEST(ReadProtoTest, ReadProtoFromFile) { + std::istringstream proto_file( + R"( + name: "Foo" + dp_test_parameters { + epsilon: 1.09861228866810969140 # = ln(3) + delta: 0.0 + delta_tolerance: 0.01125 + granularity: 0.015625 + } + noise_sampling_parameters { + number_of_samples: 1000000 + l0_sensitivity: 1 + linf_sensitivity: 1.0 + epsilon: 1.09861228866810969140 # = ln(3) + raw_input: 0.0 + } +)"); + + ::testing::DistributionDpTestCase expected; + google::protobuf::TextFormat::ParseFromString( + R"pb( + name: "Foo" + dp_test_parameters { + epsilon: 1.09861228866810969140 # = ln(3) + delta: 0.0 + delta_tolerance: 0.01125 + granularity: 0.015625 + } + noise_sampling_parameters { + number_of_samples: 1000000 + l0_sensitivity: 1 + linf_sensitivity: 1.0 + epsilon: 1.09861228866810969140 # = ln(3) + raw_input: 0.0 + } + )pb", + &expected); + + std::optional<::testing::DistributionDpTestCase> test_case = + ReadProto<::testing::DistributionDpTestCase>(&proto_file); + ASSERT_TRUE(test_case.has_value()); + EXPECT_THAT(test_case.value(), 
::differential_privacy::base::testing::EqualsProto(expected)); +} + +TEST(BucketizeTest, BucketizesCorrectly) { + EXPECT_EQ(Bucketize(0.5, 0, 10, 10), 0); + EXPECT_EQ(Bucketize(5.5, 0, 10, 10), 5); + EXPECT_EQ(Bucketize(9.6, 0, 10, 10), 9); + + EXPECT_EQ(Bucketize(-4.5, -5, 5, 10), 0); + EXPECT_EQ(Bucketize(4.5, -5, 5, 10), 9); + + EXPECT_EQ(Bucketize(8, 0, 35, 5), 1); + EXPECT_EQ(Bucketize(20, 0, 35, 5), 2); + + EXPECT_EQ(Bucketize(-5.5, -5, 5, 10), 0); + EXPECT_EQ(Bucketize(-5, -5, 5, 10), 0); + EXPECT_EQ(Bucketize(5, -5, 5, 10), 9); + EXPECT_EQ(Bucketize(5.5, -5, 5, 10), 9); + + EXPECT_EQ(Bucketize(-1, -5, 5, 10), 4); + EXPECT_EQ(Bucketize(0, -5, 5, 10), 5); + EXPECT_EQ(Bucketize(1, -5, 5, 10), 6); +} + +} // namespace +} // namespace differential_privacy::testing diff --git a/examples/go/main/main.go b/examples/go/main/main.go index cb3bd2f7..50104ba3 100644 --- a/examples/go/main/main.go +++ b/examples/go/main/main.go @@ -93,7 +93,7 @@ func main() { case countVisitsPerCertainDuration: sc = &examples.CountVisitsPerCertainDurationScenario{} default: - log.Exitf("There is no scenario with id = %d", id) + log.Exitf("There is no scenario with id = %s", id) } err = examples.RunScenario(sc, *inputFile, *nonPrivateResultsOutputFile, *privateResultsOutputFile) diff --git a/privacy-on-beam/README.md b/privacy-on-beam/README.md index 49d5f176..ab045956 100644 --- a/privacy-on-beam/README.md +++ b/privacy-on-beam/README.md @@ -61,19 +61,19 @@ you can omit `-mod=mod`. 
## Using with Bazel In order to include Privacy on Beam in your Bazel project, you need to add the -following to your `WORKSPACE` file (use the latest commit id or the id of the -commit you want to depend on): +following to your `WORKSPACE` file (change `dp_lib_version` to the version you +want to depend on, or alternatively you can depend on a specific commit; but +keep in mind that you have to update `dp_lib_tar_sha256` as well): ``` -load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") http_archive( name = "io_bazel_rules_go", - sha256 = "6f111c57fd50baf5b8ee9d63024874dd2a014b069426156c55adbf6d3d22cb7b", + sha256 = "7c10271940c6bce577d51a075ae77728964db285dac0a46614a7934dc34303e6", urls = [ - "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.25.0/rules_go-v0.25.0.tar.gz", - "https://github.com/bazelbuild/rules_go/releases/download/v0.25.0/rules_go-v0.25.0.tar.gz", + "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.26.0/rules_go-v0.26.0.tar.gz", + "https://github.com/bazelbuild/rules_go/releases/download/v0.26.0/rules_go-v0.26.0.tar.gz", ], ) @@ -81,14 +81,14 @@ load("@io_bazel_rules_go//go:deps.bzl", "go_register_toolchains", "go_rules_depe go_rules_dependencies() -go_register_toolchains(version = "1.15.5") +go_register_toolchains(version = "1.16") http_archive( name = "bazel_gazelle", - sha256 = "b85f48fa105c4403326e9525ad2b2cc437babaa6e15a3fc0b1dbab0ab064bc7c", + sha256 = "62ca106be173579c0a167deb23358fdfe71ffa1e4cfdddf5582af26520f1c66f", urls = [ - "https://mirror.bazel.build/github.com/bazelbuild/bazel-gazelle/releases/download/v0.22.2/bazel-gazelle-v0.22.2.tar.gz", - "https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.22.2/bazel-gazelle-v0.22.2.tar.gz", + "https://mirror.bazel.build/github.com/bazelbuild/bazel-gazelle/releases/download/v0.23.0/bazel-gazelle-v0.23.0.tar.gz", + 
"https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.23.0/bazel-gazelle-v0.23.0.tar.gz", ], ) @@ -96,10 +96,17 @@ load("@bazel_gazelle//:deps.bzl", "gazelle_dependencies", "go_repository") gazelle_dependencies() -git_repository( +dp_lib_version = "1.0.1" # Change to the version you want to use. +dp_lib_tar_sha256 = "c72422dc29b7307334f12b0ff95866002503e2c1d209d16cae0a6f849ebf07f4" # Change to the sha256 of the .tar.gz of the version you want to use. +dp_lib_url = "https://github.com/google/differential-privacy/archive/refs/tags/v" + dp_lib_version + ".tar.gz" + +http_archive( name = "com_github_google_differential_privacy", - remote = "https://github.com/google/differential-privacy.git", - commit = "de8460c9791de4c89a9dbb906b11a8f62e045f7b", + sha256 = dp_lib_tar_sha256, + urls = [ + dp_lib_url, + ], + strip_prefix = "differential-privacy-" + dp_lib_version, ) # Load dependencies for Google DP Library base workspace. @@ -110,22 +117,25 @@ differential_privacy_deps() load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps") protobuf_deps() -git_repository( +http_archive( name = "com_google_go_differential_privacy", - remote = "https://github.com/google/differential-privacy.git", - # Workaround from https://github.com/bazelbuild/bazel/issues/10062#issuecomment-642144553 - patch_cmds = ["mv (broken link) ."], - commit = "de8460c9791de4c89a9dbb906b11a8f62e045f7b", + sha256 = dp_lib_tar_sha256, + urls = [ + dp_lib_url, + ], + strip_prefix = "differential-privacy-" + dp_lib_version + "/go", ) load("@com_google_go_differential_privacy//:go_differential_privacy_deps.bzl", "go_differential_privacy_deps") go_differential_privacy_deps() -git_repository( +http_archive( name = "com_google_privacy_on_beam", - remote = "https://github.com/google/differential-privacy.git", - strip_prefix = "privacy-on-beam/", - commit = "de8460c9791de4c89a9dbb906b11a8f62e045f7b", + sha256 = dp_lib_tar_sha256, + urls = [ + dp_lib_url, + ], + strip_prefix = 
"differential-privacy-" + dp_lib_version + "/privacy-on-beam", ) load("@com_google_privacy_on_beam//:privacy_on_beam_deps.bzl", "privacy_on_beam_deps") diff --git a/privacy-on-beam/pbeam/distinct_per_key_test.go b/privacy-on-beam/pbeam/distinct_per_key_test.go index b4157ffc..cbdfd12c 100644 --- a/privacy-on-beam/pbeam/distinct_per_key_test.go +++ b/privacy-on-beam/pbeam/distinct_per_key_test.go @@ -26,9 +26,9 @@ import ( "github.com/apache/beam/sdks/go/pkg/beam/transforms/stats" ) -// Checks that DistinctPrivacyKey returns a correct answer, in particular that values +// Checks that DistinctPerKey returns a correct answer, in particular that values // are correctly counted (without duplicates). -func TestDistinctPrivacyKeyNoNoise(t *testing.T) { +func TestDistinctPerKeyNoNoise(t *testing.T) { var triples []testutils.TripleWithIntValue for i := 0; i < 100; i++ { // Add 200 distinct values to Partition 0. triples = append(triples, testutils.TripleWithIntValue{ID: i, Partition: 0, Value: i}) @@ -77,8 +77,8 @@ func TestDistinctPrivacyKeyNoNoise(t *testing.T) { } } -// Checks that DistinctPrivacyKey adds noise to its output. The logic mirrors TestDistinctPrivacyIDAddsNoise. -func TestDistinctPrivacyKeyAddsNoise(t *testing.T) { +// Checks that DistinctPerKey adds noise to its output. The logic mirrors TestDistinctPrivacyIDAddsNoise. +func TestDistinctPerKeyAddsNoise(t *testing.T) { for _, tc := range []struct { name string noiseKind NoiseKind @@ -187,7 +187,7 @@ func TestDistinctPerKeyPerKeyCrossPartitionContributionBounding(t *testing.T) { } } -// Checks that DistinctPrivacyKey bounds cross-partition contributions before doing deduplication of +// Checks that DistinctPerKey bounds cross-partition contributions before doing deduplication of // values. This is to ensure we don't run into a contribution bounding-related privacy bug in some // rare cases. 
func TestDistinctPerKeyPerKeyCrossPartitionContributionBounding_IsAppliedBeforeDeduplication(t *testing.T) { @@ -230,8 +230,8 @@ func TestDistinctPerKeyPerKeyCrossPartitionContributionBounding_IsAppliedBeforeD } } -// Checks that DistinctPrivacyKey bounds per-partition contributions correctly. -func TestDistinctPrivacyKeyPerPartitionContributionBounding(t *testing.T) { +// Checks that DistinctPerKey bounds per-partition contributions correctly. +func TestDistinctPerKeyPerPartitionContributionBounding(t *testing.T) { var triples []testutils.TripleWithIntValue for i := 0; i < 100; i++ { // Add 500 distinct values to Partition 0. // MaxContributionsPerPartition is set to 2, so 3 of these 5 contributions will be dropped for each user. @@ -272,17 +272,17 @@ func TestDistinctPrivacyKeyPerPartitionContributionBounding(t *testing.T) { got := DistinctPerKey(s, pcol, DistinctPerKeyParams{MaxPartitionsContributed: 3, NoiseKind: LaplaceNoise{}, MaxContributionsPerPartition: 2}) want = beam.ParDo(s, testutils.Int64MetricToKV, want) if err := testutils.ApproxEqualsKVInt64(s, got, want, testutils.LaplaceTolerance(k, l1Sensitivity, epsilon)); err != nil { - t.Fatalf("TestDistinctPrivacyKeyPerPartitionContributionBounding: %v", err) + t.Fatalf("TestDistinctPerKeyPerPartitionContributionBounding: %v", err) } if err := ptest.Run(p); err != nil { - t.Errorf("TestDistinctPrivacyKeyPerPartitionContributionBounding: DistinctPerKey(%v) = %v, expected %v: %v", col, got, want, err) + t.Errorf("TestDistinctPerKeyPerPartitionContributionBounding: DistinctPerKey(%v) = %v, expected %v: %v", col, got, want, err) } } -// Checks that DistinctPrivacyKey bounds per-partition contributions before doing deduplication of +// Checks that DistinctPerKey bounds per-partition contributions before doing deduplication of // values. This is to ensure we don't run into a contribution bounding-related privacy bug in some // rare cases. 
-func TestDistinctPrivacyKeyPerPartitionContributionBounding_IsAppliedBeforeDeduplication(t *testing.T) { +func TestDistinctPerKeyPerPartitionContributionBounding_IsAppliedBeforeDeduplication(t *testing.T) { var triples []testutils.TripleWithIntValue for i := 0; i < 100; i++ { // Add 100 distinct values to Partition 0. triples = append(triples, testutils.TripleWithIntValue{ID: i, Partition: 0, Value: i}) @@ -312,10 +312,10 @@ func TestDistinctPrivacyKeyPerPartitionContributionBounding_IsAppliedBeforeDedup got := DistinctPerKey(s, pcol, DistinctPerKeyParams{MaxPartitionsContributed: 1, NoiseKind: LaplaceNoise{}, MaxContributionsPerPartition: 1}) want = beam.ParDo(s, testutils.Int64MetricToKV, want) if err := testutils.ApproxEqualsKVInt64(s, got, want, testutils.LaplaceTolerance(k, l1Sensitivity, epsilon)); err != nil { - t.Fatalf("TestDistinctPrivacyKeyPerPartitionContributionBounding_IsAppliedBeforeDeduplication: %v", err) + t.Fatalf("TestDistinctPerKeyPerPartitionContributionBounding_IsAppliedBeforeDeduplication: %v", err) } if err := ptest.Run(p); err != nil { - t.Errorf("TestDistinctPrivacyKeyPerPartitionContributionBounding_IsAppliedBeforeDeduplication: DistinctPerKey(%v) = %v, expected %v: %v", col, got, want, err) + t.Errorf("TestDistinctPerKeyPerPartitionContributionBounding_IsAppliedBeforeDeduplication: DistinctPerKey(%v) = %v, expected %v: %v", col, got, want, err) } } @@ -401,7 +401,7 @@ func TestDistinctPerKeyPartitionSelection(t *testing.T) { } } -// Checks that DistinctPrivacyKey performs thresholding/partition selection +// Checks that DistinctPerKey performs thresholding/partition selection // on the number of privacy IDs in a partition and not the number of distinct // values. 
func TestDistinctPerKeyThresholdsOnPrivacyIDs(t *testing.T) { diff --git a/proto/testing/BUILD b/proto/testing/BUILD index e3f8c7c1..26c9c111 100644 --- a/proto/testing/BUILD +++ b/proto/testing/BUILD @@ -34,3 +34,11 @@ exports_files([ "laplace_closeness_test_cases.textproto", "laplace_dp_test_cases.textproto", ]) + +cc_proto_library( + name = "statistical_tests_cc_proto", + visibility = [ + "//visibility:public", + ], + deps = [":statistical_tests_proto"], +)