feat: Implemented metrics exposure for job status with labels: name, namespace, uid, status, reason (#310)
13241308289 authored Mar 3, 2024
1 parent 434352b commit b93a2b4
Showing 2 changed files with 25 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pkg/job_controller/job.go
@@ -330,6 +330,9 @@ func (jc *JobController) ReconcileJobs(job client.Object, replicas map[apiv1.Rep
		return result, err
	}

	// Metering job status
	jc.Metrics.JobStatusMetrics(job, jobStatus)

	// Metering first pod launch delay when job state transit from created to running.
	if commonutil.IsCreated(*oldStatus) && commonutil.IsRunning(jobStatus) {
		jc.Metrics.FirstPodLaunchDelaySeconds(activePods, job, jobStatus)
22 changes: 22 additions & 0 deletions pkg/metrics/job_metrics.go
@@ -59,6 +59,10 @@ var (
		Name: "kubedl_jobs_all_pods_launch_delay_seconds",
		Help: "Histogram for recording sync launch delay duration(from job created to all pods running).",
	}, []string{"kind", "name", "namespace", "uid"})
	jobStatus = promauto.NewHistogramVec(prometheus.HistogramOpts{
		Name: "kubedl_job_status",
		Help: "Counts number of jobs with failed status",
	}, []string{"kind", "name", "namespace", "uid", "status", "reason"})
)
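A note on the collector choice visible in this hunk: kubedl_job_status is declared as a HistogramVec, but the JobStatusMetrics method added further down only ever calls Observe(1), so each labeled series effectively acts as a count of status observations. Below is a minimal, self-contained sketch of the same idea expressed with a CounterVec; the metric name kubedl_job_status_total and the sample label values are illustrative assumptions, not part of this commit.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// Hypothetical counter-based variant of the metric added in this commit.
	jobStatusTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "kubedl_job_status_total",
		Help: "Number of job status observations, labeled by status and reason.",
	}, []string{"kind", "name", "namespace", "uid", "status", "reason"})

	// One increment per observed status, matching the Observe(1) pattern used
	// by JobStatusMetrics.
	labels := prometheus.Labels{
		"kind": "TFJob", "name": "mnist", "namespace": "default",
		"uid": "0000-1111", "status": "Running", "reason": "JobRunning",
	}
	jobStatusTotal.With(labels).Inc()

	// The labeled series now carries the value 1.
	fmt.Println(testutil.ToFloat64(jobStatusTotal.With(labels)))
}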

// JobMetrics holds the kinds of metrics counter for some type of job workload.
@@ -71,6 +75,7 @@ type JobMetrics struct {
	restart             prometheus.Counter
	firstPodLaunchDelay *prometheus.HistogramVec
	allPodsLaunchDelay  *prometheus.HistogramVec
	jobStatus           *prometheus.HistogramVec
}

func NewJobMetrics(kind string, client client.Client) *JobMetrics {
@@ -85,6 +90,7 @@ func NewJobMetrics(kind string, client client.Client) *JobMetrics {
		restart:             restart.With(label),
		firstPodLaunchDelay: firstPodLaunchDelayHist,
		allPodsLaunchDelay:  allPodsLaunchDelayHist,
		jobStatus:           jobStatus,
	}
	// Register running gauge func on center prometheus demand pull.
	// Different kinds of workload metrics share the same metric name and help info,
@@ -137,6 +143,22 @@ func (m *JobMetrics) RestartInc() {
	m.restart.Inc()
}

func (m *JobMetrics) JobStatusMetrics(job metav1.Object, status v1.JobStatus) {
	for _, condition := range status.Conditions {
		if condition.Status == corev1.ConditionTrue {
			m.jobStatus.With(prometheus.Labels{
				"kind":      m.kind,
				"name":      job.GetName(),
				"namespace": job.GetNamespace(),
				"uid":       string(job.GetUID()),
				"status":    string(condition.Type),
				"reason":    condition.Reason,
			}).Observe(1)
			break
		}
	}
}
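The new JobStatusMetrics helper records one kubedl_job_status observation for the first condition in status.Conditions whose status is true, labeled with the job's kind, name, namespace, uid, and the condition's type and reason. The following self-contained sketch mirrors that loop against a stand-in condition type and prints what a scrape of the resulting series would expose; the TFJob/mnist label values and the throwaway registry are illustrative assumptions, not part of the commit.

package main

import (
	"os"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/expfmt"
)

// condition is a stand-in for the job condition type consumed by
// JobStatusMetrics (the real controller uses its own API types).
type condition struct {
	Type, Status, Reason string
}

func main() {
	// Same definition as the kubedl_job_status collector above, registered on a
	// throwaway registry instead of the global default.
	reg := prometheus.NewRegistry()
	jobStatus := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name: "kubedl_job_status",
		Help: "Counts number of jobs with failed status",
	}, []string{"kind", "name", "namespace", "uid", "status", "reason"})
	reg.MustRegister(jobStatus)

	// Mirror JobStatusMetrics: observe 1 for the first condition that is true.
	conditions := []condition{
		{Type: "Created", Status: "False", Reason: "JobCreated"},
		{Type: "Running", Status: "True", Reason: "JobRunning"},
	}
	for _, c := range conditions {
		if c.Status == "True" {
			jobStatus.With(prometheus.Labels{
				"kind": "TFJob", "name": "mnist", "namespace": "default",
				"uid": "0000-1111", "status": c.Type, "reason": c.Reason,
			}).Observe(1)
			break
		}
	}

	// Print the gathered samples in exposition format, as a scrape would see them.
	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		expfmt.MetricFamilyToText(os.Stdout, mf)
	}
}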

func (m *JobMetrics) FirstPodLaunchDelaySeconds(activePods []*corev1.Pod, job metav1.Object, status v1.JobStatus) {
	if !util.IsRunning(status) {
		return
