From ded036ffecb3d8c3fd541bc8ac0a49ec3a249db7 Mon Sep 17 00:00:00 2001 From: Sagar Shinde Date: Fri, 25 Oct 2024 15:15:45 +0530 Subject: [PATCH 1/8] Add RTSL exporter alert rules to monitoring --- .../lib/rtsl_exporter_alerts.libsonnet | 42 +++++++++++++++++++ .../kube-prometheus/monitoring.jsonnet | 9 ++-- token.json | 10 +++++ 3 files changed, 57 insertions(+), 4 deletions(-) create mode 100644 k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet create mode 100644 token.json diff --git a/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet b/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet new file mode 100644 index 00000000..8d7cb668 --- /dev/null +++ b/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet @@ -0,0 +1,42 @@ + + +local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet'); + +local prometheusRules = { + prometheusRules+:: { + groups+: [ + { + name: 'rtsl_exporter.rules', + rules: [ + { + alert: 'RtslExporterDown', + expr: ||| + up{job="rtsl-exporter", namespace="rtsl-exporter", service="rtsl-exporter"} == 0 + |||, + 'for': '5m', + labels: { + severity: 'critical' + }, + annotations: { + summary: "RTSL Exporter service is down", + description: "The RTSL Exporter service is not reachable." 
+ } + } + ], + }, + ], + }, +}; + +local grafanaDashboards = {}; + +local rtslExporterMixin = addMixin({ + name: 'rtsl_exporter', + dashboardFolder: 'RTSL Exporter', + mixin: prometheusRules + grafanaDashboards, +}); + +{ + grafanaDashboards: rtslExporterMixin.grafanaDashboards, + prometheusRules: rtslExporterMixin.prometheusRules, +} \ No newline at end of file diff --git a/k8s/manifests/kube-prometheus/monitoring.jsonnet b/k8s/manifests/kube-prometheus/monitoring.jsonnet index 4fd99e42..fa00d044 100644 --- a/k8s/manifests/kube-prometheus/monitoring.jsonnet +++ b/k8s/manifests/kube-prometheus/monitoring.jsonnet @@ -8,6 +8,7 @@ local argocd = (import 'lib/argocd.libsonnet'); local ingress = (import 'lib/ingress.libsonnet'); local dhis2Server = (import 'lib/dhis2-server.libsonnet'); local alphasms = (import 'lib/alphasms.libsonnet'); +local rtslExporterAlerts = (import 'lib/rtsl_exporter_alerts.libsonnet'); local environment = std.extVar('ENVIRONMENT'); local namespace = 'monitoring'; @@ -36,8 +37,8 @@ local grafanaDashboards = redis.grafanaDashboards + ingressNginx.grafanaDashboards + simpleServer.grafanaDashboards + - (if enableDhis2Dashboards then dhis2Server.grafanaDashboards else {}); - + (if enableDhis2Dashboards then dhis2Server.grafanaDashboards else {}) + + rtslExporterAlerts.grafanaDashboards; local kp = (import 'kube-prometheus/main.libsonnet') + (import 'kube-prometheus/addons/all-namespaces.libsonnet') + @@ -112,6 +113,6 @@ local manifests = [postgres.prometheusRules] + postgres.monitors(config.postgresNamespaces).exporterServices + postgres.monitors(config.postgresNamespaces).serviceMonitors + - (if isEnvSandbox then [alphasms.prometheusRules] else []); - + (if isEnvSandbox then [alphasms.prometheusRules] else []) + + rtslExporterAlerts.prometheusRules; argocd.addArgoAnnotations(manifests, kp.values.common.namespace) diff --git a/token.json b/token.json new file mode 100644 index 00000000..e02b5771 --- /dev/null +++ b/token.json @@ -0,0 +1,10 @@ +{ + 
"kind": "ExecCredential", + "apiVersion": "client.authentication.k8s.io/v1beta1", + "spec": {}, + "status": { + "expirationTimestamp": "2024-10-14T10:09:37Z", + "token": "k8s-aws-v1.aHR0cHM6Ly9zdHMuYXAtc291dGgtMS5hbWF6b25hd3MuY29tLz9BY3Rpb249R2V0Q2FsbGVySWRlbnRpdHkmVmVyc2lvbj0yMDExLTA2LTE1JlgtQW16LUFsZ29yaXRobT1BV1M0LUhNQUMtU0hBMjU2JlgtQW16LUNyZWRlbnRpYWw9QUtJQTRKR0JRTDVaWFVFUEE3S0UlMkYyMDI0MTAxNCUyRmFwLXNvdXRoLTElMkZzdHMlMkZhd3M0X3JlcXVlc3QmWC1BbXotRGF0ZT0yMDI0MTAxNFQwOTU1MzdaJlgtQW16LUV4cGlyZXM9NjAmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JTNCeC1rOHMtYXdzLWlkJlgtQW16LVNpZ25hdHVyZT05MTY3ZTI3MDE1M2QwNTRjZDgyOWYxNDgyNzM3YjkzNWEwZWYwNDA0NTBiN2ZkYzU2MjIyNDU2NTkwYzk1Yjk0" + } +} + From b6ea2b01c18b9592f9ea37d59fd863ff175fca08 Mon Sep 17 00:00:00 2001 From: Sagar Shinde Date: Mon, 28 Oct 2024 12:00:08 +0530 Subject: [PATCH 2/8] Remove unused grafanaDashboards variable The grafanaDashboards variable was declared but not used anywhere in the code. Removing it helps to clean up the code and prevent any potential confusion for future maintenance. 
--- .../kube-prometheus/lib/rtsl_exporter_alerts.libsonnet | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet b/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet index 8d7cb668..86f8c1f0 100644 --- a/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet +++ b/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet @@ -28,15 +28,12 @@ local prometheusRules = { }, }; -local grafanaDashboards = {}; - local rtslExporterMixin = addMixin({ name: 'rtsl_exporter', dashboardFolder: 'RTSL Exporter', - mixin: prometheusRules + grafanaDashboards, + mixin: prometheusRules }); { - grafanaDashboards: rtslExporterMixin.grafanaDashboards, - prometheusRules: rtslExporterMixin.prometheusRules, + prometheusRules: rtslExporterMixin.prometheusRules } \ No newline at end of file From 72d074ba27f20b3ac6308927f9975c015967959c Mon Sep 17 00:00:00 2001 From: Sagar Shinde Date: Mon, 28 Oct 2024 12:03:26 +0530 Subject: [PATCH 3/8] Remove duplicate dhis2Server grafanaDashboards addition Eliminated a redundant addition of dhis2Server grafanaDashboards in monitoring.jsonnet to avoid duplicated configurations. This change also drops the leftover reference to rtslExporterAlerts.grafanaDashboards, which the library no longer exports, improving configuration clarity and correctness. 
--- k8s/manifests/kube-prometheus/monitoring.jsonnet | 2 -- 1 file changed, 2 deletions(-) diff --git a/k8s/manifests/kube-prometheus/monitoring.jsonnet b/k8s/manifests/kube-prometheus/monitoring.jsonnet index cf4db8cd..c15fbe0d 100644 --- a/k8s/manifests/kube-prometheus/monitoring.jsonnet +++ b/k8s/manifests/kube-prometheus/monitoring.jsonnet @@ -43,8 +43,6 @@ local grafanaDashboards = loki.grafanaDashboards + (if enableDhis2Dashboards then dhis2Server.grafanaDashboards else {}); - (if enableDhis2Dashboards then dhis2Server.grafanaDashboards else {}) + - rtslExporterAlerts.grafanaDashboards; local kp = (import 'kube-prometheus/main.libsonnet') + (import 'kube-prometheus/addons/all-namespaces.libsonnet') + From e5c463e8e39e2f3c3def080d2a9708d53e5d322c Mon Sep 17 00:00:00 2001 From: Sagar Shinde Date: Mon, 28 Oct 2024 12:06:51 +0530 Subject: [PATCH 4/8] Add rtslExporterAlerts rules to sandbox environment Integrated rtslExporterAlerts.prometheusRules into the monitoring configuration for the sandbox environment. This ensures that RTSL exporter alerts are active when isEnvSandbox is true. Additionally, removed redundant rtslExporterAlerts rule declaration outside the sandbox check. 
--- k8s/manifests/kube-prometheus/monitoring.jsonnet | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/k8s/manifests/kube-prometheus/monitoring.jsonnet b/k8s/manifests/kube-prometheus/monitoring.jsonnet index c15fbe0d..e5647ada 100644 --- a/k8s/manifests/kube-prometheus/monitoring.jsonnet +++ b/k8s/manifests/kube-prometheus/monitoring.jsonnet @@ -128,8 +128,7 @@ local manifests = [postgres.prometheusRules] + postgres.monitors(config.postgresNamespaces).exporterServices + postgres.monitors(config.postgresNamespaces).serviceMonitors + - (if isEnvSandbox then [alphasms.prometheusRules] + [sendgrid.prometheusRules] else []); + (if isEnvSandbox then [alphasms.prometheusRules] + [sendgrid.prometheusRules] + + [rtslExporterAlerts.prometheusRules] else []); - (if isEnvSandbox then [alphasms.prometheusRules] else []) + - rtslExporterAlerts.prometheusRules; argocd.addArgoAnnotations(manifests, kp.values.common.namespace) From 6ff509c0a45d6541937694d2d23ef2cda9f48106 Mon Sep 17 00:00:00 2001 From: Sagar Shinde Date: Mon, 28 Oct 2024 12:17:45 +0530 Subject: [PATCH 5/8] Fix naming convention in rtsl_exporter manifests Updated the naming convention from 'rtsl_exporter' to 'rtsl-exporter' in all relevant fields. This change aligns the naming format with other components and ensures consistency across the manifests. 
--- .../kube-prometheus/lib/rtsl_exporter_alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet b/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet index 86f8c1f0..45c9b51a 100644 --- a/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet +++ b/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet @@ -6,7 +6,7 @@ local prometheusRules = { prometheusRules+:: { groups+: [ { - name: 'rtsl_exporter.rules', + name: 'rtsl-exporter.rules', rules: [ { alert: 'RtslExporterDown', @@ -29,7 +29,7 @@ local prometheusRules = { }; local rtslExporterMixin = addMixin({ - name: 'rtsl_exporter', + name: 'rtsl-exporter', dashboardFolder: 'RTSL Exporter', mixin: prometheusRules }); From ef72c152702f06b481a89d09fb7636f641dbab84 Mon Sep 17 00:00:00 2001 From: Sagar Shinde Date: Mon, 28 Oct 2024 12:20:04 +0530 Subject: [PATCH 6/8] Reduce RTSL Exporter alert duration from 5m to 3s This change modifies the alert rule for the RTSL Exporter to trigger after 3 seconds of downtime instead of 5 minutes. This aims to provide quicker notifications for potential issues with the RTSL Exporter service. 
--- .../kube-prometheus/lib/rtsl_exporter_alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet b/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet index 45c9b51a..a1ad0fe9 100644 --- a/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet +++ b/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet @@ -13,7 +13,7 @@ local prometheusRules = { expr: ||| up{job="rtsl-exporter", namespace="rtsl-exporter", service="rtsl-exporter"} == 0 |||, - 'for': '5m', + 'for': '3s', labels: { severity: 'critical' }, From 06f4e1d85e6bbbba03b8578fc08b3ad2bb6655f5 Mon Sep 17 00:00:00 2001 From: Sagar Shinde Date: Wed, 30 Oct 2024 10:17:21 +0530 Subject: [PATCH 7/8] Update RTSL Exporter alert rules Replaced 'up' check with 'absent_over_time' for better accuracy. Extended alert duration from 3 seconds to 1 minute. Improved alert descriptions for clarity. --- .../kube-prometheus/lib/rtsl_exporter_alerts.libsonnet | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet b/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet index a1ad0fe9..d912370d 100644 --- a/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet +++ b/k8s/manifests/kube-prometheus/lib/rtsl_exporter_alerts.libsonnet @@ -1,5 +1,3 @@ - - local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet'); local prometheusRules = { @@ -11,15 +9,15 @@ local prometheusRules = { { alert: 'RtslExporterDown', expr: ||| - up{job="rtsl-exporter", namespace="rtsl-exporter", service="rtsl-exporter"} == 0 + absent_over_time(up{job="rtsl-exporter", namespace="rtsl-exporter", service="rtsl-exporter"}[1m]) |||, - 'for': '3s', + 'for': '1m', labels: { severity: 'critical' }, annotations: { - summary: "RTSL Exporter service is down", - description: "The RTSL Exporter service is not reachable." 
+ summary: "RTSL Exporter service down", + description: "No metrics have been received from the RTSL Exporter service for the past 1 minute." } } ], From 10c72772d12210b187007f1678d5b429e2074f7d Mon Sep 17 00:00:00 2001 From: Sagar Shinde Date: Wed, 30 Oct 2024 12:52:15 +0530 Subject: [PATCH 8/8] Fix sandbox environment monitoring rules Consolidate Prometheus monitoring rules for the sandbox environment. Ensure `rtslExporterAlerts.prometheusRules` is properly included along with other rules when `isEnvSandbox` is true. --- k8s/manifests/kube-prometheus/monitoring.jsonnet | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/k8s/manifests/kube-prometheus/monitoring.jsonnet b/k8s/manifests/kube-prometheus/monitoring.jsonnet index d6ba3fb8..d0694549 100644 --- a/k8s/manifests/kube-prometheus/monitoring.jsonnet +++ b/k8s/manifests/kube-prometheus/monitoring.jsonnet @@ -128,8 +128,6 @@ local manifests = [postgres.prometheusRules] + postgres.monitors(config.postgresNamespaces).exporterServices + postgres.monitors(config.postgresNamespaces).serviceMonitors + - (if isEnvSandbox then [alphasms.prometheusRules] + [sendgrid.prometheusRules] + [loki.prometheusRules] else []); - (if isEnvSandbox then [alphasms.prometheusRules] + [sendgrid.prometheusRules] + - [rtslExporterAlerts.prometheusRules] else []); + (if isEnvSandbox then [alphasms.prometheusRules] + [sendgrid.prometheusRules] + [loki.prometheusRules] + [rtslExporterAlerts.prometheusRules] else []); argocd.addArgoAnnotations(manifests, kp.values.common.namespace)