Add weights butler script. (#3941)

Simple script to allow querying fuzzer/job weights and computing some simple stats on them. To do in followup PRs: - support editing weights once #3937 is fixed - support `FuzzTargetJob` weights
google · Apr 16, 2024 · d3854a4 · d3854a4
1 parent f65983a
commit d3854a4
Show file tree

Hide file tree

Showing 2 changed files with 296 additions and 0 deletions.
diff --git a/butler.py b/butler.py
@@ -249,6 +249,42 @@ def main():
  subparsers.add_parser(
  'integration_tests', help='Run end-to-end integration tests.')
 
+ parser_weights = subparsers.add_parser(
+ 'weights', help='Interact with fuzzer/job weights.')
+ parser_weights.add_argument(
+ '-c', '--config-dir', required=True, help='Path to application config.')
+
+ weights_subparsers = parser_weights.add_subparsers(dest='weights_command')
+ weights_subparsers.add_parser('platforms', help='List platforms.')
+
+ weights_dump_parser = weights_subparsers.add_parser(
+ 'dump', help='Dump database entries.')
+ weights_dump_parser.add_argument(
+ 'type',
+ help='The type of entries to dump from the database.',
+ choices=['fuzzer_job', 'fuzzer_jobs'])
+
+ weights_list_parser = weights_subparsers.add_parser(
+ 'list', help='List FuzzerJob entries.')
+ weights_list_parser.add_argument(
+ '-p',
+ '--platforms',
+ help='Which platforms to list entries for.',
+ nargs='+')
+ weights_list_parser.add_argument(
+ '-f', '--fuzzers', help='Which fuzzers to list entries for.', nargs='+')
+ weights_list_parser.add_argument(
+ '-j', '--jobs', help='Which jobs to list entries for.', nargs='+')
+
+ weights_aggregate_parser = weights_subparsers.add_parser(
+ 'aggregate', help='Aggregate matching FuzzerJob entries.')
+ weights_aggregate_parser.add_argument(
+ '-p', '--platform', help='Which platform to query.', required=True)
+ weights_aggregate_parser.add_argument(
+ '-f', '--fuzzers', help='Which fuzzers to aggregate.', nargs='+')
+ weights_aggregate_parser.add_argument(
+ '-j', '--jobs', help='Which jobs to aggregate.', nargs='+')
+
  args = parser.parse_args()
  if not args.command:
  parser.print_help()

diff --git a/src/local/butler/weights.py b/src/local/butler/weights.py
@@ -0,0 +1,260 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Script to interact with fuzzer weights in the database.
+
+Usage:
+
+ python butler.py weights --help
+
+"""
+
+import csv
+import enum
+import os
+import statistics
+import sys
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Union
+
+from src.clusterfuzz._internal.config import local_config
+from src.clusterfuzz._internal.datastore import data_types
+from src.clusterfuzz._internal.datastore import ndb_init
+
+
+class EntryType(enum.Enum):
+ FUZZER_JOB = 'fuzzer_job'
+ FUZZER_JOBS = 'fuzzer_jobs'
+
+
+def _iter_weights(
+ fuzzer_jobs: Sequence[data_types.FuzzerJob]) -> Sequence[float]:
+ for fj in fuzzer_jobs:
+ yield fj.actual_weight
+
+
+def _sum_weights(fuzzer_jobs: Sequence[data_types.FuzzerJob]) -> float:
+ return sum(_iter_weights(fuzzer_jobs))
+
+
+def _display_prob(probability: float) -> str:
+ return f'{probability:0.04f} = {probability * 100:0.02f}%'
+
+
+def list_platforms() -> None:
+ # Query only distinct platform values from the database.
+ fuzzer_jobs = data_types.FuzzerJob.query(
+ projection=[data_types.FuzzerJob.platform], distinct=True)
+ for fuzzer_job in fuzzer_jobs:
+ print(fuzzer_job.platform)
+
+
+def _query_fuzzer_jobs_batches(platforms: Optional[Sequence[str]] = None
+ ) -> Sequence[data_types.FuzzerJobs]:
+ query = data_types.FuzzerJobs.query()
+
+ if platforms:
+ query = query.filter(
+ data_types.FuzzerJobs.platform.IN([p.upper() for p in platforms]))
+
+ return query
+
+
+def _query_fuzzer_jobs(
+ platforms: Optional[Sequence[str]] = None,
+ fuzzers: Optional[Sequence[str]] = None,
+ jobs: Optional[Sequence[str]] = None,
+) -> Sequence[data_types.FuzzerJob]:
+ """Queries Datastore for matching FuzzerJob entries."""
+ query = data_types.FuzzerJob.query()
+
+ if platforms:
+ query = query.filter(
+ data_types.FuzzerJob.platform.IN([p.upper() for p in platforms]))
+ if fuzzers:
+ query = query.filter(data_types.FuzzerJob.fuzzer.IN(fuzzers))
+ if jobs:
+ query = query.filter(data_types.FuzzerJob.job.IN(jobs))
+
+ return query
+
+
+def _list_fuzzer_jobs(fuzzer_jobs: Sequence[data_types.FuzzerJob]) -> None:
+ """Lists the given FuzzerJob entries on stdout."""
+ fuzzer_jobs = list(fuzzer_jobs)
+ fuzzer_jobs.sort(key=lambda fj: fj.actual_weight, reverse=True)
+
+ total_weight = _sum_weights(fuzzer_jobs)
+
+ for fuzzer_job in fuzzer_jobs:
+ probability = fuzzer_job.actual_weight / total_weight
+
+ print("FuzzerJob:")
+ print(f' Fuzzer: {fuzzer_job.fuzzer}')
+ print(f' Job: {fuzzer_job.job}')
+ print(f' Platform: {fuzzer_job.platform}')
+ print(f' Weight: {fuzzer_job.actual_weight} = ' +
+ f'{fuzzer_job.weight} * {fuzzer_job.multiplier}')
+ print(f' Probability: {_display_prob(probability)}')
+
+ print(f'Count: {len(fuzzer_jobs)}')
+ print(f'Total weight (for this query): {total_weight}')
+
+
+_FUZZER_JOB_FIELDS = [
+ 'fuzzer',
+ 'job',
+ 'platform',
+ 'weight',
+ 'multiplier',
+ 'actual_weight',
+]
+
+
+def _fuzzer_job_to_dict(
+ fuzzer_job: data_types.FuzzerJob) -> Dict[str, Union[str, float]]:
+ """Converts the given FuzzerJob to a dictionary of CSV column values."""
+ return {
+ 'fuzzer': fuzzer_job.fuzzer,
+ 'job': fuzzer_job.job,
+ 'platform': fuzzer_job.platform,
+ 'weight': fuzzer_job.weight,
+ 'multiplier': fuzzer_job.multiplier,
+ 'actual_weight': fuzzer_job.actual_weight,
+ }
+
+
+def _dump_fuzzer_jobs() -> None:
+ """Dumps FuzzerJob entries from the database to stdout in CSV format."""
+ fuzzer_jobs = _query_fuzzer_jobs()
+
+ writer = csv.DictWriter(sys.stdout, fieldnames=_FUZZER_JOB_FIELDS)
+ writer.writeheader()
+
+ for fuzzer_job in fuzzer_jobs:
+ writer.writerow(_fuzzer_job_to_dict(fuzzer_job))
+
+
+def _dump_fuzzer_jobs_batches() -> None:
+ """Dumps FuzzerJobs entries from the database to stdout in CSV format."""
+ batches = _query_fuzzer_jobs_batches()
+
+ writer = csv.DictWriter(sys.stdout, fieldnames=['batch'] + _FUZZER_JOB_FIELDS)
+ writer.writeheader()
+
+ for batch in batches:
+ for fuzzer_job in batch.fuzzer_jobs:
+ fields = _fuzzer_job_to_dict(fuzzer_job)
+ fields['batch'] = batch.key.id()
+ writer.writerow(fields)
+
+
+def _dump_entries(entry_type: EntryType) -> None:
+ """Dumps entries of the given type from the database to stdout."""
+ if entry_type == EntryType.FUZZER_JOB:
+ _dump_fuzzer_jobs()
+ elif entry_type == EntryType.FUZZER_JOBS:
+ _dump_fuzzer_jobs_batches()
+
+
+def _fuzzer_job_matches(
+ fuzzer_job: data_types.FuzzerJob,
+ fuzzers: Optional[Sequence[str]],
+ jobs: Optional[Sequence[str]],
+) -> bool:
+ """Returns whether the given FuzzerJob matches the given optional filters."""
+ if fuzzers and fuzzer_job.fuzzer not in fuzzers:
+ return False
+
+ if jobs and fuzzer_job.job not in jobs:
+ return False
+
+ return True
+
+
+def _print_stats(fuzzer_jobs: List[data_types.FuzzerJob],
+ total_weight: float) -> None:
+ """Helper for `_aggregate_fuzzer_jobs()`."""
+ weight = _sum_weights(fuzzer_jobs)
+ probability = weight / total_weight
+
+ print(f' Count: {len(fuzzer_jobs)}')
+ print(f' Total weight: {weight}')
+ print(f' Total probability: {_display_prob(probability)}')
+
+ # New in Python 3.8. We appease the linter by disabling `no-member` below.
+ if len(fuzzer_jobs) < 2 or not hasattr(statistics, 'quantiles'):
+ return
+
+ # `quantiles()` returns n-1 cut points between n quantiles.
+ # `weight_deciles[0]` separates the first from the second decile, i.e. it is
+ # the 10% percentile value. `weight_deciles[i]` is the (i+1)*10-th.
+ weight_deciles = statistics.quantiles(_iter_weights(fuzzer_jobs), n=10) # pylint: disable=no-member
+ weight_median = weight_deciles[4]
+ weight_90p = weight_deciles[8]
+
+ prob_median = weight_median / total_weight
+ prob_90p = weight_90p / total_weight
+
+ print(f' Median weight: {weight_median}')
+ print(f' Median probability: {_display_prob(prob_median)}')
+ print(f' 90th percentile weight: {weight_90p}')
+ print(f' 90th percentile probability: {_display_prob(prob_90p)}')
+
+
+def _aggregate_fuzzer_jobs(
+ platform: str,
+ fuzzers: Optional[Sequence[str]] = None,
+ jobs: Optional[Sequence[str]] = None,
+) -> None:
+ """Aggregates statistics for matching and non-matching FuzzerJob entries."""
+ fuzzer_jobs = list(_query_fuzzer_jobs(platforms=[platform.upper()]))
+ total_weight = _sum_weights(fuzzer_jobs)
+
+ matches = []
+ others = []
+ for fuzzer_job in fuzzer_jobs:
+ if _fuzzer_job_matches(fuzzer_job, fuzzers, jobs):
+ matches.append(fuzzer_job)
+ else:
+ others.append(fuzzer_job)
+
+ print('Matching FuzzerJob entries:')
+ _print_stats(matches, total_weight)
+ print('Other FuzzerJob entries:')
+ _print_stats(others, total_weight)
+
+
+def execute(args) -> None:
+ """Entrypoint from butler.py."""
+ os.environ['CONFIG_DIR_OVERRIDE'] = args.config_dir
+ local_config.ProjectConfig().set_environment()
+
+ with ndb_init.context():
+ cmd = args.weights_command
+ if cmd == 'platforms':
+ list_platforms()
+ elif cmd == 'dump':
+ _dump_entries(EntryType(args.type))
+ elif cmd == 'list':
+ _list_fuzzer_jobs(
+ _query_fuzzer_jobs(
+ platforms=args.platforms, fuzzers=args.fuzzers, jobs=args.jobs))
+ elif cmd == 'aggregate':
+ _aggregate_fuzzer_jobs(
+ args.platform, fuzzers=args.fuzzers, jobs=args.jobs)
+ else:
+ raise TypeError(f'weights command {repr(cmd)} unrecognized')