diff --git a/client/src/Hooks/useMonitorForm.ts b/client/src/Hooks/useMonitorForm.ts index 963409fc8a..9fce86727e 100644 --- a/client/src/Hooks/useMonitorForm.ts +++ b/client/src/Hooks/useMonitorForm.ts @@ -17,6 +17,7 @@ const getBaseDefaults = (data?: Monitor | null) => ({ geoCheckEnabled: data?.geoCheckEnabled ?? false, geoCheckLocations: data?.geoCheckLocations || [], geoCheckInterval: data?.geoCheckInterval || 300000, + escalationRules: data?.escalationRules || [], }); export const useMonitorForm = ({ diff --git a/client/src/Pages/CreateMonitor/index.tsx b/client/src/Pages/CreateMonitor/index.tsx index 15b76eab36..4f8726fb06 100644 --- a/client/src/Pages/CreateMonitor/index.tsx +++ b/client/src/Pages/CreateMonitor/index.tsx @@ -2,7 +2,7 @@ import { useMemo, useState } from "react"; import { useEffect } from "react"; import { logger } from "@/Utils/logger"; import { useParams, useLocation, useNavigate } from "react-router"; -import { useForm, Controller } from "react-hook-form"; +import { useForm, Controller, useFieldArray } from "react-hook-form"; import { zodResolver } from "@hookform/resolvers/zod"; import { useTheme } from "@mui/material"; import Stack from "@mui/material/Stack"; @@ -14,7 +14,7 @@ import Typography from "@mui/material/Typography"; import Link from "@mui/material/Link"; import Divider from "@mui/material/Divider"; import IconButton from "@mui/material/IconButton"; -import { Trash2 } from "lucide-react"; +import { Trash2, Plus } from "lucide-react"; import { HeaderDeleteControls } from "@/Components/monitors"; import { GeoContinents } from "@/Types/GeoCheck"; @@ -203,6 +203,8 @@ const CreateMonitorPage = () => { defaultValues: defaults, }); const { control, watch, handleSubmit, clearErrors } = form; + const { fields: escalationFields, append: appendEscalation, remove: removeEscalation } = + useFieldArray({ control, name: "escalationRules" }); useEffect(() => { form.reset(defaults); @@ -765,6 +767,87 @@ const CreateMonitorPage = () => { } /> + + {escalationFields.map((field, index) => { + const notificationOptions = (notifications ?? []).map((n) => ({ + ...n, + name: n.notificationName, + })); + return ( + + ( + f.onChange(Number(e.target.value))} + fieldLabel={t("pages.createMonitor.form.escalation.delayLabel")} + error={!!fieldState.error} + helperText={fieldState.error?.message ?? ""} + sx={{ width: 140 }} + /> + )} + /> + { + const selected = + notificationOptions.find((n) => n.id === f.value) ?? null; + return ( + o.name} + onChange={(_: unknown, v: (typeof notificationOptions)[0] | null) => + f.onChange(v?.id ?? "") + } + isOptionEqualToValue={(o, v) => o.id === v.id} + fieldLabel={t( + "pages.createMonitor.form.escalation.channelLabel" + )} + sx={{ minWidth: 200 }} + /> + ); + }} + /> + removeEscalation(index)} + aria-label="Remove escalation rule" + sx={{ mt: 3 }} + > + + + + ); + })} + + + } + /> + {(watchedType === "http" || watchedType === "grpc" || watchedType === "websocket") && ( diff --git a/client/src/Types/Monitor.ts b/client/src/Types/Monitor.ts index 053b517d1d..1b0340a6d1 100644 --- a/client/src/Types/Monitor.ts +++ b/client/src/Types/Monitor.ts @@ -38,6 +38,11 @@ export type MonitorStatus = (typeof MonitorStatuses)[number]; export type MonitorMatchMethod = "equal" | "include" | "regex" | ""; +export interface EscalationRule { + notificationId: string; + delayMinutes: number; +} + export interface Monitor { id: string; userId: string; @@ -60,6 +65,7 @@ export interface Monitor { interval: number; uptimePercentage?: number; notifications: string[]; + escalationRules?: EscalationRule[]; secret?: string; cpuAlertThreshold: number; cpuAlertCounter: number; diff --git a/client/src/Validation/monitor.ts b/client/src/Validation/monitor.ts index 9acffe6fed..6f196b49d3 100644 --- a/client/src/Validation/monitor.ts +++ b/client/src/Validation/monitor.ts @@ -4,6 +4,11 @@ import { GeoContinents } from "@/Types/GeoCheck"; // URL schema with custom error message const urlSchema = z.url({ message: "Please enter a valid URL" }); +const escalationRuleSchema = z.object({ + notificationId: z.string().min(1), + delayMinutes: z.number().int().min(1, "Delay must be at least 1 minute"), +}); + // Common base schema for all monitor types const baseSchema = z.object({ name: z @@ -27,6 +32,7 @@ const baseSchema = z.object({ .number() .min(300000, "Interval must be at least 5 minutes") .optional(), + escalationRules: z.array(escalationRuleSchema).optional(), }); // HTTP monitor schema diff --git a/client/src/locales/en.json b/client/src/locales/en.json index 92a21939f3..514c9289e0 100644 --- a/client/src/locales/en.json +++ b/client/src/locales/en.json @@ -543,6 +543,13 @@ "description": "Select the notification channels you want to use", "title": "Notifications" }, + "escalation": { + "title": "Escalation rules", + "description": "Send an additional alert if an incident is still active after a set delay.", + "addRule": "Add escalation rule", + "delayLabel": "Delay (minutes)", + "channelLabel": "Notify channel" + }, "type": { "description": "Select the type of check to perform", "optionDockerDescription": "Use Docker to monitor if a container is running.", diff --git a/server/src/db/models/Incident.ts b/server/src/db/models/Incident.ts index 82e2b5eb2b..5618edc073 100644 --- a/server/src/db/models/Incident.ts +++ b/server/src/db/models/Incident.ts @@ -72,6 +72,10 @@ const IncidentSchema = new Schema( type: String, default: null, }, + escalationsFired: { + type: [String], + default: [], + }, }, { timestamps: true } ); diff --git a/server/src/db/models/Monitor.ts b/server/src/db/models/Monitor.ts index 036aeadad6..6e76225e65 100644 --- a/server/src/db/models/Monitor.ts +++ b/server/src/db/models/Monitor.ts @@ -1,5 +1,5 @@ import { Schema, model, Types } from "mongoose"; -import type { Monitor, MonitorMatchMethod, CheckSnapshot } from "@/types/monitor.js"; +import type { Monitor, MonitorMatchMethod, CheckSnapshot, EscalationRule } from "@/types/monitor.js"; import { MonitorTypes, MonitorStatuses } from "@/types/monitor.js"; import type { CheckAudits, @@ -173,6 +173,14 @@ const snapshotAuditsSchema = new Schema( { _id: false } ); +const escalationRuleSchema = new Schema( + { + notificationId: { type: String, required: true }, + delayMinutes: { type: Number, required: true, min: 1 }, + }, + { _id: false } +); + const checkSnapshotSchema = new Schema( { id: { type: String, required: true }, @@ -351,6 +359,10 @@ const MonitorSchema = new Schema( type: Number, default: 300000, }, + escalationRules: { + type: [escalationRuleSchema], + default: [], + }, recentChecks: { type: [checkSnapshotSchema], default: [], diff --git a/server/src/repositories/incidents/IIncidentsRepository.ts b/server/src/repositories/incidents/IIncidentsRepository.ts index c4fcef2ae0..6f51051e0f 100644 --- a/server/src/repositories/incidents/IIncidentsRepository.ts +++ b/server/src/repositories/incidents/IIncidentsRepository.ts @@ -22,6 +22,7 @@ export interface IIncidentsRepository { // update updateById(incidentId: string, teamId: string, updateData: Partial): Promise; + addEscalationFired(incidentId: string, teamId: string, notificationId: string): Promise; // delete deleteByMonitorId(monitorId: string, teamId: string): Promise; deleteByMonitorIdsNotIn(monitorIds: string[]): Promise; diff --git a/server/src/repositories/incidents/MongoIncidentRepository.ts b/server/src/repositories/incidents/MongoIncidentRepository.ts index 096ba3d37b..fa287df368 100644 --- a/server/src/repositories/incidents/MongoIncidentRepository.ts +++ b/server/src/repositories/incidents/MongoIncidentRepository.ts @@ -60,6 +60,7 @@ class MongoIncidentRepository implements IIncidentsRepository { resolvedBy: doc.resolvedBy ? this.toStringId(doc.resolvedBy) : null, resolvedByEmail: doc.resolvedByEmail ?? null, comment: doc.comment ?? null, + escalationsFired: doc.escalationsFired ?? [], createdAt: this.toDateString(doc.createdAt), updatedAt: this.toDateString(doc.updatedAt), }; @@ -274,6 +275,13 @@ class MongoIncidentRepository implements IIncidentsRepository { }; }; + addEscalationFired = async (incidentId: string, teamId: string, notificationId: string): Promise => { + await IncidentModel.updateOne( + { _id: new mongoose.Types.ObjectId(incidentId), teamId: new mongoose.Types.ObjectId(teamId) }, + { $addToSet: { escalationsFired: notificationId } } + ); + }; + deleteByMonitorId = async (monitorId: string, teamId: string) => { const result = await IncidentModel.deleteMany({ monitorId: new mongoose.Types.ObjectId(monitorId), diff --git a/server/src/repositories/monitors/MongoMonitorsRepository.ts b/server/src/repositories/monitors/MongoMonitorsRepository.ts index b2d7594483..f52ffed355 100644 --- a/server/src/repositories/monitors/MongoMonitorsRepository.ts +++ b/server/src/repositories/monitors/MongoMonitorsRepository.ts @@ -374,6 +374,10 @@ class MongoMonitorsRepository implements IMonitorsRepository { interval: doc.interval, uptimePercentage: doc.uptimePercentage ?? undefined, notifications: notificationIds, + escalationRules: (doc.escalationRules ?? []).map((rule) => ({ + notificationId: rule.notificationId, + delayMinutes: rule.delayMinutes, + })), secret: doc.secret ?? undefined, cpuAlertThreshold: doc.cpuAlertThreshold, cpuAlertCounter: doc.cpuAlertCounter, @@ -433,6 +437,10 @@ class MongoMonitorsRepository implements IMonitorsRepository { interval: doc.interval, uptimePercentage: doc.uptimePercentage ?? undefined, notifications: notificationIds, + escalationRules: (doc.escalationRules ?? []).map((rule) => ({ + notificationId: rule.notificationId, + delayMinutes: rule.delayMinutes, + })), secret: doc.secret ?? undefined, cpuAlertThreshold: doc.cpuAlertThreshold, cpuAlertCounter: doc.cpuAlertCounter, diff --git a/server/src/service/infrastructure/SuperSimpleQueue/SuperSimpleQueueHelper.ts b/server/src/service/infrastructure/SuperSimpleQueue/SuperSimpleQueueHelper.ts index b6908127b2..27052b95ca 100644 --- a/server/src/service/infrastructure/SuperSimpleQueue/SuperSimpleQueueHelper.ts +++ b/server/src/service/infrastructure/SuperSimpleQueue/SuperSimpleQueueHelper.ts @@ -38,7 +38,7 @@ export interface MonitorActionDecision { shouldResolveIncident: boolean; shouldSendNotification: boolean; incidentReason: "status_down" | "threshold_breach" | null; - notificationReason: "status_change" | "threshold_breach" | null; + notificationReason: "status_change" | "threshold_breach" | "escalation" | null; thresholdBreaches?: { cpu?: boolean; memory?: boolean; @@ -177,6 +177,15 @@ export class SuperSimpleQueueHelper implements ISuperSimpleQueueHelper { stack: error instanceof Error ? error.stack : undefined, }); }); + + // Step 8. Check escalations (best effort, don't wait) + this.checkEscalations(statusChangeResult.monitor).catch((error: unknown) => { + this.logger.warn({ + message: `Error checking escalations for monitor ${statusChangeResult.monitor.id}: ${error instanceof Error ? error.message : "Unknown error"}`, + service: SERVICE_NAME, + method: "getMonitorJob", + }); + }); } catch (error: unknown) { this.logger.warn({ message: error instanceof Error ? error.message : "Unknown error", @@ -418,6 +427,40 @@ export class SuperSimpleQueueHelper implements ISuperSimpleQueueHelper { }; }; + private checkEscalations = async (monitor: Monitor): Promise => { + if (!monitor.escalationRules || monitor.escalationRules.length === 0) return; + + const incident = await this.incidentsRepository.findActiveByMonitorId(monitor.id, monitor.teamId); + if (!incident) return; + + const incidentStartMs = new Date(incident.startTime).getTime(); + const nowMs = Date.now(); + const alreadyFired = new Set(incident.escalationsFired ?? []); + + const dueRules = monitor.escalationRules.filter((rule) => { + if (alreadyFired.has(rule.notificationId)) return false; + return nowMs >= incidentStartMs + rule.delayMinutes * 60 * 1000; + }); + + for (const rule of dueRules) { + try { + await this.notificationsService.sendEscalationNotification(monitor, rule.notificationId); + await this.incidentsRepository.addEscalationFired(incident.id, monitor.teamId, rule.notificationId); + this.logger.info({ + message: `Escalation fired for monitor ${monitor.id}, notificationId ${rule.notificationId}`, + service: SERVICE_NAME, + method: "checkEscalations", + }); + } catch (error: unknown) { + this.logger.warn({ + message: `Escalation failed for notificationId ${rule.notificationId}: ${error instanceof Error ? error.message : "Unknown error"}`, + service: SERVICE_NAME, + method: "checkEscalations", + }); + } + } + }; + private evaluateMonitorAction(statusChangeResult: StatusChangeResult): MonitorActionDecision { const { monitor, statusChanged, prevStatus } = statusChangeResult; diff --git a/server/src/service/infrastructure/notificationMessageBuilder.ts b/server/src/service/infrastructure/notificationMessageBuilder.ts index 934163b2a9..b4733090a3 100644 --- a/server/src/service/infrastructure/notificationMessageBuilder.ts +++ b/server/src/service/infrastructure/notificationMessageBuilder.ts @@ -15,6 +15,7 @@ export interface INotificationMessageBuilder { decision: MonitorActionDecision, clientHost: string ): NotificationMessage; + buildEscalationMessage(monitor: Monitor, clientHost: string): NotificationMessage; extractThresholdBreaches(monitor: Monitor, monitorStatusResponse: MonitorStatusResponse): ThresholdBreach[]; } @@ -182,6 +183,31 @@ export class NotificationMessageBuilder implements INotificationMessageBuilder { }; } + public buildEscalationMessage(monitor: Monitor, clientHost: string): NotificationMessage { + return { + type: "monitor_down", + severity: "critical", + monitor: { + id: monitor.id, + name: monitor.name, + url: monitor.url, + type: monitor.type, + status: monitor.status, + }, + content: { + title: `Escalation: ${monitor.name} is still down`, + summary: `Monitor "${monitor.name}" remains down and has not recovered. An escalation alert has been triggered.`, + details: [`URL: ${monitor.url}`, `Status: ${monitor.status}`, `Type: ${monitor.type}`], + timestamp: new Date(), + }, + clientHost, + metadata: { + teamId: monitor.teamId, + notificationReason: "escalation", + }, + }; + } + public extractThresholdBreaches(monitor: Monitor, monitorStatusResponse: MonitorStatusResponse): ThresholdBreach[] { const breaches: ThresholdBreach[] = []; diff --git a/server/src/service/infrastructure/notificationsService.ts b/server/src/service/infrastructure/notificationsService.ts index c75477c88c..cc353601c2 100644 --- a/server/src/service/infrastructure/notificationsService.ts +++ b/server/src/service/infrastructure/notificationsService.ts @@ -14,6 +14,7 @@ export interface INotificationsService { updateById(id: string, teamId: string, updateData: Partial): Promise; deleteById: (id: string, teamId: string) => Promise; handleNotifications: (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse, decision: MonitorActionDecision) => Promise; + sendEscalationNotification: (monitor: Monitor, notificationId: string) => Promise; sendTestNotification: (notification: Partial) => Promise; testAllNotifications: (notificationIds: string[]) => Promise; @@ -141,6 +142,14 @@ export class NotificationsService implements INotificationsService { return await this.sendNotifications(monitor, monitorStatusResponse, decision); }; + sendEscalationNotification = async (monitor: Monitor, notificationId: string): Promise => { + const notification = await this.notificationsRepository.findById(notificationId, monitor.teamId); + const settings = this.settingsService.getSettings(); + const clientHost = settings.clientHost || "Host not defined"; + const message = this.notificationMessageBuilder.buildEscalationMessage(monitor, clientHost); + return await this.send(notification, monitor, {} as MonitorStatusResponse, {} as MonitorActionDecision, message); + }; + sendTestNotification = async (notification: Partial) => { switch (notification.type) { case "email": diff --git a/server/src/types/incident.ts b/server/src/types/incident.ts index 6b076ff835..483fda7d18 100644 --- a/server/src/types/incident.ts +++ b/server/src/types/incident.ts @@ -16,6 +16,7 @@ export interface Incident { resolvedBy?: string | null; resolvedByEmail?: string | null; comment?: string | null; + escalationsFired?: string[]; createdAt: string; updatedAt: string; } diff --git a/server/src/types/monitor.ts b/server/src/types/monitor.ts index f29ce75d78..b15b43c498 100644 --- a/server/src/types/monitor.ts +++ b/server/src/types/monitor.ts @@ -15,6 +15,11 @@ export type MonitorStatus = (typeof MonitorStatuses)[number]; export const MonitorMatchMethods = ["equal", "include", "regex"] as const; export type MonitorMatchMethod = (typeof MonitorMatchMethods)[number] | ""; +export interface EscalationRule { + notificationId: string; + delayMinutes: number; +} + export interface Monitor { id: string; userId: string; @@ -37,6 +42,7 @@ export interface Monitor { interval: number; uptimePercentage?: number; notifications: string[]; + escalationRules?: EscalationRule[]; secret?: string; cpuAlertThreshold: number; cpuAlertCounter: number; diff --git a/server/src/validation/monitorValidation.ts b/server/src/validation/monitorValidation.ts index df000ecef2..5a5f226d24 100644 --- a/server/src/validation/monitorValidation.ts +++ b/server/src/validation/monitorValidation.ts @@ -49,6 +49,11 @@ export const getCertificateParamValidation = z.object({ monitorId: z.string().min(1, "Monitor ID is required"), }); +const escalationRuleSchema = z.object({ + notificationId: z.string().min(1), + delayMinutes: z.number().int().min(1), +}); + export const createMonitorBodyValidation = z.object({ _id: z.string().optional(), name: z.string().min(1, "Name is required"), @@ -78,6 +83,7 @@ export const createMonitorBodyValidation = z.object({ geoCheckEnabled: z.boolean().optional(), geoCheckLocations: z.array(z.enum(GeoContinents)).optional(), geoCheckInterval: z.number().min(300000).optional(), + escalationRules: z.array(escalationRuleSchema).optional(), }); export const editMonitorBodyValidation = z.object({ @@ -107,6 +113,7 @@ export const editMonitorBodyValidation = z.object({ geoCheckEnabled: z.boolean().optional(), geoCheckLocations: z.array(z.enum(GeoContinents)).optional(), geoCheckInterval: z.number().min(300000).optional(), + escalationRules: z.array(escalationRuleSchema).optional(), }); export const pauseMonitorParamValidation = z.object({