Skip to content

Commit

Permalink
scheduler: expose scheduling errors to users
Browse files Browse the repository at this point in the history
Closes #643
  • Loading branch information
mdonadoni committed Nov 20, 2023
1 parent 34131ba commit 204b290
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 12 deletions.
5 changes: 5 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
Changes
=======

Version 0.9.2 (UNRELEASED)
--------------------------

- Changes the workflow scheduler's logging behaviour so that the main reason behind a scheduling error is also reported to users.

Version 0.9.1 (2023-09-27)
--------------------------

Expand Down
28 changes: 16 additions & 12 deletions reana_server/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,13 +149,18 @@ def _fail_workflow(self, workflow_id: str, logs: str = "") -> None:
logs=logs,
)

def _retry_submission(self, workflow_id: str, workflow_submission: Dict) -> None:
def _retry_submission(
self, workflow_id: str, workflow_submission: Dict, reason: Optional[str] = None
) -> None:
retry_count = workflow_submission.get("retry_count", 0)

if retry_count >= REANA_SCHEDULER_REQUEUE_COUNT:
error_message = (
f"Workflow {workflow_submission['workflow_id_or_name']} failed to schedule after "
f"{retry_count} retries. Giving up."
)
if reason:
error_message += f"\nReason: {reason}"
logging.error(error_message)
self._fail_workflow(workflow_id, logs=error_message)
else:
Expand Down Expand Up @@ -202,12 +207,12 @@ def on_message(self, body, message):
f'Workflow {http_response_json["workflow_id"]} successfully started.'
)

except HTTPBadGateway as api_e:
logging.error(
"Workflow failed to start because RWC got an error while calling"
f"an external service (i.e. DB):\n {api_e}",
exc_info=True,
except HTTPBadGateway:
error = (
"Workflow failed to start because reana-workflow-controller got an "
"error while calling an external service (i.e. database)."
)
logging.exception(error)
except HTTPNotFound as not_found_e:
# if workflow is not found, we cannot retry or report an error to workflow logs
retry = False
Expand All @@ -233,15 +238,14 @@ def on_message(self, body, message):
exc_info=True,
)
self._fail_workflow(workflow_id, logs=error_message)
except Exception as e:
logging.error(
f"Something went wrong while calling RWC:\n {e}", exc_info=True
)
except Exception:
error = "Something went wrong while calling reana-workflow-controller."
logging.exception(error)
finally:
sleep(REANA_SCHEDULER_REQUEUE_SLEEP)
if not started and retry:
message.reject()
self._retry_submission(workflow_id, workflow_submission_copy)
self._retry_submission(workflow_id, workflow_submission_copy, error)
else:
message.ack()
else:
Expand All @@ -251,4 +255,4 @@ def on_message(self, body, message):
)
sleep(REANA_SCHEDULER_REQUEUE_SLEEP)
message.reject()
self._retry_submission(workflow_id, workflow_submission_copy)
self._retry_submission(workflow_id, workflow_submission_copy, error)

0 comments on commit 204b290

Please sign in to comment.