[Monitoring] Enrich UNTRIAGED_TESTCASE_AGE metric to track testcases …

…stuck in analyze (#4547) ### Motivation We currently have no way to tell if analyze task was successfully executed. The TESTCASE_UPLOAD_TRIAGE_DURATION metric from #4364 would only track duration for tasks that did finish. An analyze_pending field is added to the Testcase entity in datastore, which is set to False by default, to True for manually uploaded testcases, and to False once analyze task postprocess runs. It also increments the UNTRIAGED_TESTCASE_AGE metric from #4381 with a status label, so we can know at what step the testcase is stuck, thus allowing us to alert if analyze is taking longer to finish than expected. The alert itself could be, for instance, P50 age of untriaged testcase (status=analyze_pending) > 3h. Also, this retroactively addresses comments from #4481: * Fixes docstring for emit_testcase_triage_duration_metric * Removes assertions * Renames TESTCASE_UPLOAD_TRIAGE_DURATION to TESTCASE_TRIAGE_DURATION, since it now accounts for fuzzer generated testcases * Use a boolean "from_fuzzer" field, instead of "origin" string, in TESTCASE_TRIAGE_DURATION
google · Dec 23, 2024 · 3daf2eb · 3daf2eb
1 parent 764e8fe
commit 3daf2eb
Show file tree

Hide file tree

Showing 6 changed files with 49 additions and 55 deletions.
diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
@@ -552,6 +552,8 @@ def _update_testcase(output):
   if analyze_task_output.platform_id:
     testcase.platform_id = analyze_task_output.platform_id
 
+  testcase.analyze_pending = False
+
   testcase.put()
 
 

diff --git a/src/clusterfuzz/_internal/common/testcase_utils.py b/src/clusterfuzz/_internal/common/testcase_utils.py
@@ -31,59 +31,39 @@
 
 
 def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
-  '''Finds out if a testcase is fuzzer generated or manually uploaded,
-      and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
-  testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
-  if not testcase_upload_metadata:
-    logs.warning(f'No upload metadata found for testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
-    return
-  if not testcase_upload_metadata.timestamp:
-    logs.warning(
-        f'No timestamp for testcase {testcase_upload_metadata.testcase_id},'
-        ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
-    return
-  assert step in [
-      'analyze_launched', 'analyze_completed', 'minimize_completed',
-      'regression_completed', 'impact_completed', 'issue_updated'
-  ]
-
+  """Finds out if a testcase is fuzzer generated or manually uploaded,
+      and emits the TESTCASE_TRIAGE_DURATION metric."""
   testcase = data_handler.get_testcase_by_id(testcase_id)
 
   if not testcase:
     logs.warning(f'No testcase found with id {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
   if not testcase.job_type:
     logs.warning(f'No job_type associated to testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
   from_fuzzer = not get_testcase_upload_metadata(testcase_id)
 
-  assert step in [
-      'analyze_launched', 'analyze_completed', 'minimize_completed',
-      'regression_completed', 'impact_completed', 'issue_updated'
-  ]
-
   if not testcase.get_age_in_seconds():
     logs.warning(f'No timestamp associated to testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
-  testcase_age_in_hours = testcase.get_age_in_seconds() / 3600
+  testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60)
 
-  logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
+  logs.info('Emiting TESTCASE_TRIAGE_DURATION metric for testcase '
             f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
-            'in step {step}.')
+            f'in step {step}, from_fuzzer: {from_fuzzer}.')
 
-  monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
+  monitoring_metrics.TESTCASE_TRIAGE_DURATION.add(
       testcase_age_in_hours,
       labels={
           'job': testcase.job_type,
           'step': step,
-          'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
+          'from_fuzzer': from_fuzzer
       })
 
 

diff --git a/src/clusterfuzz/_internal/cron/triage.py b/src/clusterfuzz/_internal/cron/triage.py
@@ -256,6 +256,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker):
 
 def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase):
   testcase_age = testcase.get_age_in_seconds()
+  if not testcase_age:
+    return
   monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add(
       testcase_age,
       labels={
@@ -336,27 +338,30 @@ def _emit_untriaged_testcase_count_metric():
         })
 
 
-def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
+PENDING_ANALYZE = 'pending_analyze'
+PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
+PENDING_PROGRESSION = 'pending_progression'
+PENDING_GROUPING = 'pending_grouping'
+PENDING_FILING = 'pending_filing'
+
+
+def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase,
+                                        step: str):
   """Emmits a metric to track age of untriaged testcases."""
-  if not testcase.timestamp:
+  if not testcase.get_age_in_seconds():
     return
 
   logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} '
-            f'(age = {testcase.get_age_in_seconds()})')
+            f'(age = {testcase.get_age_in_seconds()}), step = {step}')
   monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
       testcase.get_age_in_seconds() / 3600,
       labels={
           'job': testcase.job_type,
           'platform': testcase.platform,
+          'step': step,
       })
 
 
-PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
-PENDING_PROGRESSION = 'pending_progression'
-PENDING_GROUPING = 'pending_grouping'
-PENDING_FILING = 'pending_filing'
-
-
 def main():
   """Files bugs."""
   try:
@@ -409,7 +414,7 @@ def main():
     if testcase.get_metadata('progression_pending'):
       _set_testcase_stuck_state(testcase, True)
       logs.info(f'Skipping testcase {testcase_id}, progression pending')
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION)
       _increment_untriaged_testcase_count(testcase.job_type,
                                           PENDING_PROGRESSION)
       continue
@@ -432,10 +437,12 @@ def main():
     # Require that all tasks like minimizaton, regression testing, etc have
     # finished.
     if not critical_tasks_completed:
-      _emit_untriaged_testcase_age_metric(testcase)
+      status = PENDING_CRITICAL_TASKS
+      if testcase.analyze_pending:
+        status = PENDING_ANALYZE
+      _emit_untriaged_testcase_age_metric(testcase, status)
       _set_testcase_stuck_state(testcase, True)
-      _increment_untriaged_testcase_count(testcase.job_type,
-                                          PENDING_CRITICAL_TASKS)
+      _increment_untriaged_testcase_count(testcase.job_type, status)
       logs.info(
           f'Skipping testcase {testcase_id}, critical tasks still pending.')
       continue
@@ -452,15 +459,15 @@ def main():
     # metadata works well.
     if not testcase.group_id and not dates.time_has_expired(
         testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
       _set_testcase_stuck_state(testcase, True)
       _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
       logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
       continue
 
     if not testcase.get_metadata('ran_grouper'):
       # Testcase should be considered by the grouper first before filing.
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
       _set_testcase_stuck_state(testcase, True)
       _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
       logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
@@ -490,16 +497,15 @@ def main():
     # Clean up old triage messages that would be not applicable now.
     testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)
 
-    # A testcase is untriaged, until immediately before a bug is opened
-    _emit_untriaged_testcase_age_metric(testcase)
-    _set_testcase_stuck_state(testcase, False)
-    _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
-
     # File the bug first and then create filed bug metadata.
     if not _file_issue(testcase, issue_tracker, throttler):
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_FILING)
+      _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
       logs.info(f'Issue filing failed for testcase id {testcase_id}')
       continue
 
+    _set_testcase_stuck_state(testcase, False)
+
     _create_filed_bug_metadata(testcase)
     issue_filer.notify_issue_update(testcase, 'new')
 

diff --git a/src/clusterfuzz/_internal/datastore/data_handler.py b/src/clusterfuzz/_internal/datastore/data_handler.py
@@ -921,7 +921,7 @@ def critical_tasks_completed(testcase):
     return testcase.minimized_keys and testcase.regression
 
   return bool(testcase.minimized_keys and testcase.regression and
-              testcase.is_impact_set_flag)
+              testcase.is_impact_set_flag and not testcase.analyze_pending)
 
 
 # ------------------------------------------------------------------------------
@@ -1380,6 +1380,7 @@ def create_user_uploaded_testcase(key,
 
   testcase.timestamp = utils.utcnow()
   testcase.created = testcase.timestamp
+  testcase.analyze_pending = True
   testcase.uploader_email = uploader_email
   testcase.put()
 

diff --git a/src/clusterfuzz/_internal/datastore/data_types.py b/src/clusterfuzz/_internal/datastore/data_types.py
@@ -583,6 +583,10 @@ class Testcase(Model):
   # Tracks if a testcase is stuck during triage.
   stuck_in_triage = ndb.BooleanProperty(default=False)
 
+  # Tracks if analyze task is pending.
+  # Defaults to false, since most testcases are fuzzer produced.
+  analyze_pending = ndb.BooleanProperty(default=False)
+
   def is_chromium(self):
     return self.project_name in ('chromium', 'chromium-testing')
 

diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
@@ -231,17 +231,17 @@
     ],
 )
 
-TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
-    'uploaded_testcase_analysis/triage_duration_secs',
+TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
+    'testcase_analysis/triage_duration_hours',
     description=('Time elapsed between testcase upload and completion'
-                 ' of relevant tasks in the testcase upload lifecycle.'
+                 ' of relevant tasks in the testcase lifecycle.'
                  ' Origin can be either from a fuzzer, or a manual'
                  ' upload. Measured in hours.'),
     bucketer=monitor.GeometricBucketer(),
     field_spec=[
         monitor.StringField('step'),
         monitor.StringField('job'),
-        monitor.StringField('origin'),
+        monitor.BooleanField('from_fuzzer'),
     ],
 )
 
@@ -365,6 +365,7 @@
     field_spec=[
         monitor.StringField('job'),
         monitor.StringField('platform'),
+        monitor.StringField('step'),
     ])
 
 UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(