Skip to content

Commit

Permalink
[Monitoring] Enrich UNTRIAGED_TESTCASE_AGE metric to track testcases …
Browse files Browse the repository at this point in the history
…stuck in analyze (#4547)

### Motivation

We currently have no way to tell if analyze task was successfully
executed. The TESTCASE_UPLOAD_TRIAGE_DURATION metric from #4364 would
only track duration for tasks that did finish.

An analyze_pending field is added to the Testcase entity in datastore,
which is set to False by default, to True for manually uploaded
testcases, and to False once analyze task postprocess runs.

It also increments the UNTRIAGED_TESTCASE_AGE metric from #4381 with a
status label, so we can know at what step the testcase is stuck, thus
allowing us to alert if analyze is taking longer to finish than
expected.

The alert itself could be, for instance, P50 age of untriaged testcase
(status=analyze_pending) > 3h.

Also, this retroactively addresses comments from #4481:

* Fixes docstring for emit_testcase_triage_duration_metric
* Removes assertions
* Renames TESTCASE_UPLOAD_TRIAGE_DURATION to TESTCASE_TRIAGE_DURATION,
since it now accounts for fuzzer generated testcases
* Use a boolean "from_fuzzer" field, instead of "origin" string, in
TESTCASE_TRIAGE_DURATION
  • Loading branch information
vitorguidi authored Dec 23, 2024
1 parent 764e8fe commit 3daf2eb
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 55 deletions.
2 changes: 2 additions & 0 deletions src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,8 @@ def _update_testcase(output):
if analyze_task_output.platform_id:
testcase.platform_id = analyze_task_output.platform_id

testcase.analyze_pending = False

testcase.put()


Expand Down
40 changes: 10 additions & 30 deletions src/clusterfuzz/_internal/common/testcase_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,59 +31,39 @@


def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
'''Finds out if a testcase is fuzzer generated or manually uploaded,
and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
if not testcase_upload_metadata:
logs.warning(f'No upload metadata found for testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return
if not testcase_upload_metadata.timestamp:
logs.warning(
f'No timestamp for testcase {testcase_upload_metadata.testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return
assert step in [
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]

"""Finds out if a testcase is fuzzer generated or manually uploaded,
and emits the TESTCASE_TRIAGE_DURATION metric."""
testcase = data_handler.get_testcase_by_id(testcase_id)

if not testcase:
logs.warning(f'No testcase found with id {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
return

if not testcase.job_type:
logs.warning(f'No job_type associated to testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
return

from_fuzzer = not get_testcase_upload_metadata(testcase_id)

assert step in [
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]

if not testcase.get_age_in_seconds():
logs.warning(f'No timestamp associated to testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
return

testcase_age_in_hours = testcase.get_age_in_seconds() / 3600
testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60)

logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
logs.info('Emiting TESTCASE_TRIAGE_DURATION metric for testcase '
f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
'in step {step}.')
f'in step {step}, from_fuzzer: {from_fuzzer}.')

monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
monitoring_metrics.TESTCASE_TRIAGE_DURATION.add(
testcase_age_in_hours,
labels={
'job': testcase.job_type,
'step': step,
'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
'from_fuzzer': from_fuzzer
})


Expand Down
46 changes: 26 additions & 20 deletions src/clusterfuzz/_internal/cron/triage.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker):

def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase):
testcase_age = testcase.get_age_in_seconds()
if not testcase_age:
return
monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add(
testcase_age,
labels={
Expand Down Expand Up @@ -336,27 +338,30 @@ def _emit_untriaged_testcase_count_metric():
})


def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
PENDING_ANALYZE = 'pending_analyze'
PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
PENDING_PROGRESSION = 'pending_progression'
PENDING_GROUPING = 'pending_grouping'
PENDING_FILING = 'pending_filing'


def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase,
step: str):
"""Emmits a metric to track age of untriaged testcases."""
if not testcase.timestamp:
if not testcase.get_age_in_seconds():
return

logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} '
f'(age = {testcase.get_age_in_seconds()})')
f'(age = {testcase.get_age_in_seconds()}), step = {step}')
monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
testcase.get_age_in_seconds() / 3600,
labels={
'job': testcase.job_type,
'platform': testcase.platform,
'step': step,
})


PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
PENDING_PROGRESSION = 'pending_progression'
PENDING_GROUPING = 'pending_grouping'
PENDING_FILING = 'pending_filing'


def main():
"""Files bugs."""
try:
Expand Down Expand Up @@ -409,7 +414,7 @@ def main():
if testcase.get_metadata('progression_pending'):
_set_testcase_stuck_state(testcase, True)
logs.info(f'Skipping testcase {testcase_id}, progression pending')
_emit_untriaged_testcase_age_metric(testcase)
_emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION)
_increment_untriaged_testcase_count(testcase.job_type,
PENDING_PROGRESSION)
continue
Expand All @@ -432,10 +437,12 @@ def main():
# Require that all tasks like minimizaton, regression testing, etc have
# finished.
if not critical_tasks_completed:
_emit_untriaged_testcase_age_metric(testcase)
status = PENDING_CRITICAL_TASKS
if testcase.analyze_pending:
status = PENDING_ANALYZE
_emit_untriaged_testcase_age_metric(testcase, status)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type,
PENDING_CRITICAL_TASKS)
_increment_untriaged_testcase_count(testcase.job_type, status)
logs.info(
f'Skipping testcase {testcase_id}, critical tasks still pending.')
continue
Expand All @@ -452,15 +459,15 @@ def main():
# metadata works well.
if not testcase.group_id and not dates.time_has_expired(
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
_emit_untriaged_testcase_age_metric(testcase)
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue

if not testcase.get_metadata('ran_grouper'):
# Testcase should be considered by the grouper first before filing.
_emit_untriaged_testcase_age_metric(testcase)
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
Expand Down Expand Up @@ -490,16 +497,15 @@ def main():
# Clean up old triage messages that would be not applicable now.
testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)

# A testcase is untriaged, until immediately before a bug is opened
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, False)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)

# File the bug first and then create filed bug metadata.
if not _file_issue(testcase, issue_tracker, throttler):
_emit_untriaged_testcase_age_metric(testcase, PENDING_FILING)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
logs.info(f'Issue filing failed for testcase id {testcase_id}')
continue

_set_testcase_stuck_state(testcase, False)

_create_filed_bug_metadata(testcase)
issue_filer.notify_issue_update(testcase, 'new')

Expand Down
3 changes: 2 additions & 1 deletion src/clusterfuzz/_internal/datastore/data_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -921,7 +921,7 @@ def critical_tasks_completed(testcase):
return testcase.minimized_keys and testcase.regression

return bool(testcase.minimized_keys and testcase.regression and
testcase.is_impact_set_flag)
testcase.is_impact_set_flag and not testcase.analyze_pending)


# ------------------------------------------------------------------------------
Expand Down Expand Up @@ -1380,6 +1380,7 @@ def create_user_uploaded_testcase(key,

testcase.timestamp = utils.utcnow()
testcase.created = testcase.timestamp
testcase.analyze_pending = True
testcase.uploader_email = uploader_email
testcase.put()

Expand Down
4 changes: 4 additions & 0 deletions src/clusterfuzz/_internal/datastore/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,10 @@ class Testcase(Model):
# Tracks if a testcase is stuck during triage.
stuck_in_triage = ndb.BooleanProperty(default=False)

# Tracks if analyze task is pending.
# Defaults to false, since most testcases are fuzzer produced.
analyze_pending = ndb.BooleanProperty(default=False)

def is_chromium(self):
return self.project_name in ('chromium', 'chromium-testing')

Expand Down
9 changes: 5 additions & 4 deletions src/clusterfuzz/_internal/metrics/monitoring_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,17 +231,17 @@
],
)

TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
'uploaded_testcase_analysis/triage_duration_secs',
TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
'testcase_analysis/triage_duration_hours',
description=('Time elapsed between testcase upload and completion'
' of relevant tasks in the testcase upload lifecycle.'
' of relevant tasks in the testcase lifecycle.'
' Origin can be either from a fuzzer, or a manual'
' upload. Measured in hours.'),
bucketer=monitor.GeometricBucketer(),
field_spec=[
monitor.StringField('step'),
monitor.StringField('job'),
monitor.StringField('origin'),
monitor.BooleanField('from_fuzzer'),
],
)

Expand Down Expand Up @@ -365,6 +365,7 @@
field_spec=[
monitor.StringField('job'),
monitor.StringField('platform'),
monitor.StringField('step'),
])

UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(
Expand Down

0 comments on commit 3daf2eb

Please sign in to comment.