microsoft · M2skills · Oct 4, 2023
@@ -289,7 +289,7 @@
                   "content": {
                     "version": "KqlItem/1.0",
                     "query": "// of all currently running endpoints\r\n// find the most recent correlation id\r\nlet upstatus = Prometheus_HaClusterExporter_CL\r\n| where TimeGenerated > ago(10min)\r\n| where name_s == \"sapmon\"\r\n| summarize arg_max(TimeGenerated, correlation_id_g, value_d) by sid_s, clusterName_s, hostname_s\r\n| project correlation_id_g;\r\n//identify the master (status = DC)\r\nlet dcstatus = materialize(Prometheus_HaClusterExporter_CL\r\n| where correlation_id_g in (upstatus)\r\n| where name_s == 'ha_cluster_pacemaker_nodes'\r\n| where value_d == 1\r\n| extend node_status=parse_json(labels_s)\r\n| where node_status['status']=='dc'\r\n| where tostring(node_status['node']) == hostname_s\r\n| summarize arg_max(TimeGenerated, correlation_id_g) by sid_s, clusterName_s, hostname_s\r\n| project correlation_id_g);\r\n//find all pacemaker resources and their status from dc metrics\r\n( Prometheus_HaClusterExporter_CL\r\n| where correlation_id_g in (dcstatus)\r\n| where name_s == \"ha_cluster_pacemaker_resources\" \r\n| where value_d == 1\r\n| extend  resources = parse_json(labels_s)\r\n| summarize \r\n    resources_failed = countif(resources['status'] == 'failed' or resources['status'] == 'failed_ignored'), \r\n    resources_blocked = countif(resources['status'] == 'blocked' or resources['status'] == 'orphaned'), \r\n    resources_active = countif(resources['role'] == 'started' and resources['managed'] == 'true')\r\n    + countif(resources['role'] == 'master' and resources['managed'] == 'true')\r\n    + countif(resources['role'] == 'slave' and resources['managed'] == 'true'),\r\n    resources_unmanaged = countif(resources['managed'] == 'false'),\r\n    status_red = countif(resources['status'] == 'failed' or resources['status'] == 'failed_ignored'), \r\n    status_yellow = countif(resources['status'] == 'blocked' or resources['status'] == 'orphaned'), \r\n    status_green = countif(resources['status'] == 'active' and resources['managed'] == 'true'),\r\n    status_grey = countif(resources['managed'] == 'false')\r\n    by sid_s, clusterName_s)\r\n| union \r\n//find all pacemaker nodes and their status from dc metrics\r\n( Prometheus_HaClusterExporter_CL\r\n| where correlation_id_g in (dcstatus)\r\n| where name_s == \"ha_cluster_pacemaker_nodes\" \r\n| where value_d == 1\r\n| extend  nodes = parse_json(labels_s)\r\n| summarize \r\n    nodes_unclean = countif(nodes['status'] == 'unclean'), \r\n    nodes_pending_shutdown = countif(nodes['status'] == 'pending' or nodes['status'] == 'shutdown'  or nodes['status'] == 'standby_onfail'), \r\n    nodes_online = countif(nodes['status'] == 'online'),\r\n    nodes_maint_standby = countif(nodes['status'] == 'maintenance' or nodes['status'] == 'standby'),\r\n    status_red = countif(nodes['status'] == 'unclean'), \r\n    status_yellow = countif(nodes['status'] == 'pending' or nodes['status'] == 'shutdown'  or nodes['status'] == 'standby_onfail'), \r\n    status_green = countif(nodes['status'] == 'online' or nodes['status'] == 'dc'),\r\n    status_grey = countif(nodes['status'] == 'maintenance' or nodes['status'] == 'standby')\r\n    by sid_s, clusterName_s)   \r\n| union \r\n//find all exporter up-status as additional metric\r\n(Prometheus_HaClusterExporter_CL\r\n| where correlation_id_g in (upstatus)\r\n| where name_s == 'up'\r\n| summarize status_grey = case(countif(value_d==1) == 0, 1, 0)//only count grey status if there is no (0) endpoints up\r\n    by sid_s, clusterName_s)\r\n//summarize per cluster per sid\r\n| summarize sum(resources_failed),sum(resources_blocked),sum(resources_active),sum(resources_unmanaged),sum(nodes_unclean),sum(nodes_pending_shutdown),sum(nodes_online),sum(nodes_maint_standby),cluster_status = case(sum(status_red) > 0, 'red', sum(status_yellow) > 0, 'yellow', sum(status_grey) > 0, 'grey', sum(status_green) > 0, 'green', 'greyblue')  by sid_s, clusterName_s\r\n| project cluster_status,sum_resources_failed,sum_resources_blocked, sum_resources_active, sum_resources_unmanaged,sum_nodes_unclean, sum_nodes_pending_shutdown,sum_nodes_online,sum_nodes_maint_standby,sid_s,clusterName_s",
-                    "size": 4,
+                    "size": 3,
                     "exportedParameters": [
                       {
                         "fieldName": "sid_s",