diff --git a/client/src/Hooks/useMonitorForm.ts b/client/src/Hooks/useMonitorForm.ts index 963409fc8a..703582829c 100644 --- a/client/src/Hooks/useMonitorForm.ts +++ b/client/src/Hooks/useMonitorForm.ts @@ -12,6 +12,7 @@ const getBaseDefaults = (data?: Monitor | null) => ({ description: data?.description || "", interval: data?.interval || 60000, notifications: data?.notifications || [], + escalationNotifications: data?.escalationNotifications || [], statusWindowSize: data?.statusWindowSize || 5, statusWindowThreshold: data?.statusWindowThreshold || 60, geoCheckEnabled: data?.geoCheckEnabled ?? false, diff --git a/client/src/Pages/CreateMonitor/index.tsx b/client/src/Pages/CreateMonitor/index.tsx index 15b76eab36..ba35c4a478 100644 --- a/client/src/Pages/CreateMonitor/index.tsx +++ b/client/src/Pages/CreateMonitor/index.tsx @@ -765,6 +765,140 @@ const CreateMonitorPage = () => { } /> + { + const notificationOptions = (notifications ?? []).map((n) => ({ + ...n, + name: n.notificationName, + })); + + const selectedNotifications = notificationOptions.filter((n) => + (field.value ?? []).some((entry) => entry.notificationId === n.id) + ); + + const selectedEscalations = (field.value ?? []) + .map((entry) => ({ + entry, + notification: notificationOptions.find( + (option) => option.id === entry.notificationId + ), + })) + .filter((item) => Boolean(item.notification)); + + return ( + + option.name} + onChange={(_: unknown, newValue: typeof notificationOptions) => { + const existingDelayById = new Map( + (field.value ?? []).map((entry) => [entry.notificationId, entry.delayMinutes]) + ); + + field.onChange( + newValue.map((item) => ({ + notificationId: item.id, + delayMinutes: existingDelayById.get(item.id) ?? 15, + })) + ); + }} + isOptionEqualToValue={(option, value) => option.id === value.id} + /> + {selectedEscalations.length > 0 && ( + + {selectedEscalations.map(({ entry, notification }, index) => { + if (!notification) { + return null; + } + + return ( + + + + {t( + "pages.createMonitor.form.escalationNotifications.option.channel.label" + )} + + {notification.notificationName} + + + { + const nextValue = Math.max(1, Number(e.target.value) || 1); + field.onChange( + (field.value ?? []).map((item) => + item.notificationId === notification.id + ? { ...item, delayMinutes: nextValue } + : item + ) + ); + }} + sx={{ minWidth: { xs: "100%", md: 180 } }} + /> + { + field.onChange( + (field.value ?? []).filter( + (item) => item.notificationId !== notification.id + ) + ); + }} + aria-label="Remove escalation notification" + > + + + + {index < selectedEscalations.length - 1 && } + + ); + })} + + )} + {fieldState.error?.message && ( + + {fieldState.error.message} + + )} + + ); + }} + /> + } + /> + {(watchedType === "http" || watchedType === "grpc" || watchedType === "websocket") && ( diff --git a/client/src/Types/Monitor.ts b/client/src/Types/Monitor.ts index 053b517d1d..c80925752b 100644 --- a/client/src/Types/Monitor.ts +++ b/client/src/Types/Monitor.ts @@ -38,6 +38,11 @@ export type MonitorStatus = (typeof MonitorStatuses)[number]; export type MonitorMatchMethod = "equal" | "include" | "regex" | ""; +export interface MonitorEscalationNotification { + notificationId: string; + delayMinutes: number; +} + export interface Monitor { id: string; userId: string; @@ -60,6 +65,7 @@ export interface Monitor { interval: number; uptimePercentage?: number; notifications: string[]; + escalationNotifications?: MonitorEscalationNotification[]; secret?: string; cpuAlertThreshold: number; cpuAlertCounter: number; diff --git a/client/src/Validation/monitor.ts b/client/src/Validation/monitor.ts index 9acffe6fed..fade9d0929 100644 --- a/client/src/Validation/monitor.ts +++ b/client/src/Validation/monitor.ts @@ -13,6 +13,15 @@ const baseSchema = z.object({ description: z.string().optional(), interval: z.number().min(15000, "Interval must be at least 15 seconds"), notifications: z.array(z.string()), + escalationNotifications: z.array( + z.object({ + notificationId: z.string().min(1, "Escalation notification channel is required"), + delayMinutes: z + .number({ message: "Escalation delay is required" }) + .int("Escalation delay must be a whole number") + .min(1, "Escalation delay must be at least 1 minute"), + }) + ), statusWindowSize: z .number({ message: "Status window size is required" }) .min(1, "Status window size must be at least 1") diff --git a/client/src/locales/en.json b/client/src/locales/en.json index 92a21939f3..df1759c77e 100644 --- a/client/src/locales/en.json +++ b/client/src/locales/en.json @@ -543,6 +543,18 @@ "description": "Select the notification channels you want to use", "title": "Notifications" }, + "escalationNotifications": { + "title": "Escalation Notifications", + "description": "Choose channels for follow-up alerts if the monitor is still down after a delay.", + "option": { + "channel": { + "label": "Notification channel" + }, + "delay": { + "label": "Escalation delay (minutes)" + } + } + }, "type": { "description": "Select the type of check to perform", "optionDockerDescription": "Use Docker to monitor if a container is running.", diff --git a/server/src/config/services.ts b/server/src/config/services.ts index b31c8a5e91..d75f6dcab4 100644 --- a/server/src/config/services.ts +++ b/server/src/config/services.ts @@ -234,6 +234,7 @@ export const initializeServices = async ({ const notificationsService = new NotificationsService( notificationsRepository, monitorsRepository, + incidentsRepository, webhookProvider, emailProvider, slackProvider, diff --git a/server/src/db/models/Monitor.ts b/server/src/db/models/Monitor.ts index 036aeadad6..f99d302212 100644 --- a/server/src/db/models/Monitor.ts +++ b/server/src/db/models/Monitor.ts @@ -18,11 +18,12 @@ type CheckSnapshotDocument = Omit & { createdAt: Dat type MonitorDocumentBase = Omit< Monitor, - "id" | "userId" | "teamId" | "notifications" | "selectedDisks" | "statusWindow" | "recentChecks" | "createdAt" | "updatedAt" + "id" | "userId" | "teamId" | "notifications" | "escalationNotifications" | "selectedDisks" | "statusWindow" | "recentChecks" | "createdAt" | "updatedAt" > & { statusWindow: boolean[]; recentChecks: CheckSnapshotDocument[]; notifications: Types.ObjectId[]; + escalationNotifications: { notificationId: Types.ObjectId; delayMinutes: number }[]; selectedDisks: string[]; matchMethod?: MonitorMatchMethod; }; @@ -198,6 +199,22 @@ const checkSnapshotSchema = new Schema( { _id: false } ); +const escalationNotificationSchema = new Schema<{ notificationId: Types.ObjectId; delayMinutes: number }>( + { + notificationId: { + type: Schema.Types.ObjectId, + ref: "Notification", + required: true, + }, + delayMinutes: { + type: Number, + required: true, + min: 1, + }, + }, + { _id: false } +); + const MonitorSchema = new Schema( { userId: { @@ -284,6 +301,10 @@ const MonitorSchema = new Schema( ref: "Notification", }, ], + escalationNotifications: { + type: [escalationNotificationSchema], + default: [], + }, secret: { type: String, }, diff --git a/server/src/repositories/monitors/MongoMonitorsRepository.ts b/server/src/repositories/monitors/MongoMonitorsRepository.ts index b2d7594483..facd2817f4 100644 --- a/server/src/repositories/monitors/MongoMonitorsRepository.ts +++ b/server/src/repositories/monitors/MongoMonitorsRepository.ts @@ -17,7 +17,7 @@ class MongoMonitorsRepository implements IMonitorsRepository { if (!monitors.length) { return []; } - const payload = monitors.map((monitor) => ({ ...monitor, notifications: undefined })); + const payload = monitors.map((monitor) => ({ ...monitor, notifications: undefined, escalationNotifications: undefined })); try { const inserted = await MonitorModel.insertMany(payload, { ordered: false }); return this.mapDocuments(inserted); @@ -351,6 +351,10 @@ class MongoMonitorsRepository implements IMonitorsRepository { }; const notificationIds = (doc.notifications ?? []).map((notification) => toStringId(notification)); + const escalationNotifications = (doc.escalationNotifications ?? []).map((item) => ({ + notificationId: toStringId(item.notificationId), + delayMinutes: item.delayMinutes, + })); return { id: toStringId(doc._id), @@ -374,6 +378,7 @@ class MongoMonitorsRepository implements IMonitorsRepository { interval: doc.interval, uptimePercentage: doc.uptimePercentage ?? undefined, notifications: notificationIds, + escalationNotifications, secret: doc.secret ?? undefined, cpuAlertThreshold: doc.cpuAlertThreshold, cpuAlertCounter: doc.cpuAlertCounter, @@ -410,6 +415,10 @@ class MongoMonitorsRepository implements IMonitorsRepository { }; const notificationIds = (doc.notifications ?? []).map((notification: unknown) => toStringId(notification)); + const escalationNotifications = (doc.escalationNotifications ?? []).map((item) => ({ + notificationId: toStringId(item.notificationId), + delayMinutes: item.delayMinutes, + })); return { id: toStringId(doc._id), @@ -433,6 +442,7 @@ class MongoMonitorsRepository implements IMonitorsRepository { interval: doc.interval, uptimePercentage: doc.uptimePercentage ?? undefined, notifications: notificationIds, + escalationNotifications, secret: doc.secret ?? undefined, cpuAlertThreshold: doc.cpuAlertThreshold, cpuAlertCounter: doc.cpuAlertCounter, diff --git a/server/src/service/infrastructure/notificationMessageBuilder.ts b/server/src/service/infrastructure/notificationMessageBuilder.ts index 934163b2a9..7b28568191 100644 --- a/server/src/service/infrastructure/notificationMessageBuilder.ts +++ b/server/src/service/infrastructure/notificationMessageBuilder.ts @@ -15,6 +15,7 @@ export interface INotificationMessageBuilder { decision: MonitorActionDecision, clientHost: string ): NotificationMessage; + buildEscalationMessage(monitor: Monitor, downForMinutes: number, clientHost: string): NotificationMessage; extractThresholdBreaches(monitor: Monitor, monitorStatusResponse: MonitorStatusResponse): ThresholdBreach[]; } @@ -52,6 +53,38 @@ export class NotificationMessageBuilder implements INotificationMessageBuilder { }; } + buildEscalationMessage(monitor: Monitor, downForMinutes: number, clientHost: string): NotificationMessage { + const downForLabel = this.formatDownDuration(downForMinutes); + + return { + type: "monitor_down_escalation", + severity: "critical", + monitor: { + id: monitor.id, + name: monitor.name, + url: monitor.url, + type: monitor.type, + status: monitor.status, + }, + content: { + title: `Escalation: ${monitor.name} is still down`, + summary: `Monitor "${monitor.name}" is still down after ${downForLabel}.`, + details: [ + `URL: ${monitor.url}`, + `Status: Down`, + `Type: ${monitor.type}`, + `Down for: ${downForLabel}`, + ], + timestamp: new Date(), + }, + clientHost, + metadata: { + teamId: monitor.teamId, + notificationReason: "escalation", + }, + }; + } + private determineNotificationType(decision: MonitorActionDecision, monitor: Monitor): NotificationType { // Down status has highest priority (critical) if (monitor.status === "down") { @@ -80,6 +113,7 @@ export class NotificationMessageBuilder implements INotificationMessageBuilder { private determineSeverity(type: NotificationType): NotificationSeverity { switch (type) { case "monitor_down": + case "monitor_down_escalation": return "critical"; case "threshold_breach": return "warning"; @@ -97,6 +131,8 @@ export class NotificationMessageBuilder implements INotificationMessageBuilder { switch (type) { case "monitor_down": return this.buildMonitorDownContent(monitor, monitorStatusResponse); + case "monitor_down_escalation": + return this.buildMonitorDownContent(monitor, monitorStatusResponse); case "monitor_up": return this.buildMonitorUpContent(monitor); case "threshold_breach": @@ -182,6 +218,20 @@ export class NotificationMessageBuilder implements INotificationMessageBuilder { }; } + private formatDownDuration(downForMinutes: number): string { + if (downForMinutes < 60) { + return `${downForMinutes} minute${downForMinutes === 1 ? "" : "s"}`; + } + + const hours = Math.floor(downForMinutes / 60); + const minutes = downForMinutes % 60; + if (minutes === 0) { + return `${hours} hour${hours === 1 ? "" : "s"}`; + } + + return `${hours} hour${hours === 1 ? "" : "s"} ${minutes} minute${minutes === 1 ? "" : "s"}`; + } + public extractThresholdBreaches(monitor: Monitor, monitorStatusResponse: MonitorStatusResponse): ThresholdBreach[] { const breaches: ThresholdBreach[] = []; diff --git a/server/src/service/infrastructure/notificationProviders/email.ts b/server/src/service/infrastructure/notificationProviders/email.ts index b3686651cc..6ef9b9809a 100644 --- a/server/src/service/infrastructure/notificationProviders/email.ts +++ b/server/src/service/infrastructure/notificationProviders/email.ts @@ -81,6 +81,8 @@ export class EmailProvider implements INotificationProvider { switch (message.type) { case "monitor_down": return `Monitor ${message.monitor.name} is down`; + case "monitor_down_escalation": + return `Escalation: ${message.monitor.name} is still down`; case "monitor_up": return `Monitor ${message.monitor.name} is back up`; case "threshold_breach": diff --git a/server/src/service/infrastructure/notificationsService.ts b/server/src/service/infrastructure/notificationsService.ts index c75477c88c..d8e634cecc 100644 --- a/server/src/service/infrastructure/notificationsService.ts +++ b/server/src/service/infrastructure/notificationsService.ts @@ -1,6 +1,6 @@ import type { Monitor, MonitorStatusResponse, Notification } from "@/types/index.js"; import type { NotificationMessage } from "@/types/notificationMessage.js"; -import { IMonitorsRepository, INotificationsRepository } from "@/repositories/index.js"; +import { IIncidentsRepository, IMonitorsRepository, INotificationsRepository } from "@/repositories/index.js"; import { INotificationProvider } from "./notificationProviders/INotificationProvider.js"; import type { MonitorActionDecision } from "@/service/infrastructure/SuperSimpleQueue/SuperSimpleQueueHelper.js"; import type { ISettingsService } from "@/service/system/settingsService.js"; @@ -26,6 +26,7 @@ export class NotificationsService implements INotificationsService { private notificationsRepository: INotificationsRepository; private monitorsRepository: IMonitorsRepository; + private incidentsRepository: IIncidentsRepository; private webhookProvider: INotificationProvider; private emailProvider: INotificationProvider; private slackProvider: INotificationProvider; @@ -36,10 +37,12 @@ export class NotificationsService implements INotificationsService { private logger: ILogger; private settingsService: ISettingsService; private notificationMessageBuilder: INotificationMessageBuilder; + private escalationTimers: Map>; constructor( notificationsRepository: INotificationsRepository, monitorsRepository: IMonitorsRepository, + incidentsRepository: IIncidentsRepository, webhookProvider: INotificationProvider, emailProvider: INotificationProvider, slackProvider: INotificationProvider, @@ -53,6 +56,7 @@ export class NotificationsService implements INotificationsService { ) { this.notificationsRepository = notificationsRepository; this.monitorsRepository = monitorsRepository; + this.incidentsRepository = incidentsRepository; this.webhookProvider = webhookProvider; this.emailProvider = emailProvider; this.slackProvider = slackProvider; @@ -63,15 +67,10 @@ export class NotificationsService implements INotificationsService { this.settingsService = settingsService; this.logger = logger; this.notificationMessageBuilder = notificationMessageBuilder; + this.escalationTimers = new Map(); } - private send = async ( - notification: Notification, - monitor: Monitor, - monitorStatusResponse: MonitorStatusResponse, - decision: MonitorActionDecision, - notificationMessage: NotificationMessage | undefined - ): Promise => { + private send = async (notification: Notification, notificationMessage: NotificationMessage | undefined): Promise => { if (!notificationMessage) { this.logger.warn({ message: "Notification message not provided", @@ -107,16 +106,13 @@ export class NotificationsService implements INotificationsService { } }; - private sendNotifications = async (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse, decision: MonitorActionDecision) => { - const notificationIds = monitor.notifications ?? []; + private sendNotificationsByIds = async (notificationIds: string[], notificationMessage: NotificationMessage) => { const notifications = await this.notificationsRepository.findNotificationsByIds(notificationIds); + if (notifications.length === 0) { + return true; + } - // Build notification message once for all notifications - const settings = this.settingsService.getSettings(); - const clientHost = settings.clientHost || "Host not defined"; - const notificationMessage = this.notificationMessageBuilder.buildMessage(monitor, monitorStatusResponse, decision, clientHost); - - const tasks = notifications.map((notification) => this.send(notification, monitor, monitorStatusResponse, decision, notificationMessage)); + const tasks = notifications.map((notification) => this.send(notification, notificationMessage)); const outcomes = await Promise.all(tasks); const succeeded = outcomes.filter(Boolean).length; @@ -132,11 +128,94 @@ export class NotificationsService implements INotificationsService { return succeeded === notifications.length; }; + private sendNotifications = async (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse, decision: MonitorActionDecision) => { + const notificationIds = monitor.notifications ?? []; + const settings = this.settingsService.getSettings(); + const clientHost = settings.clientHost || "Host not defined"; + const notificationMessage = this.notificationMessageBuilder.buildMessage(monitor, monitorStatusResponse, decision, clientHost); + + return await this.sendNotificationsByIds(notificationIds, notificationMessage); + }; + + private escalationTimerKey = (monitorId: string, notificationId: string) => `${monitorId}:${notificationId}`; + + private clearEscalationTimersForMonitor = (monitorId: string) => { + for (const [key, timeoutId] of this.escalationTimers.entries()) { + if (!key.startsWith(`${monitorId}:`)) { + continue; + } + + clearTimeout(timeoutId); + this.escalationTimers.delete(key); + } + }; + + private scheduleEscalations = async (monitor: Monitor) => { + const escalationNotifications = monitor.escalationNotifications ?? []; + if (escalationNotifications.length === 0) { + return; + } + + for (const escalation of escalationNotifications) { + const delayMinutes = Number(escalation.delayMinutes); + if (!escalation.notificationId || !Number.isFinite(delayMinutes) || delayMinutes < 1) { + continue; + } + + const timerKey = this.escalationTimerKey(monitor.id, escalation.notificationId); + if (this.escalationTimers.has(timerKey)) { + continue; + } + + const timeoutId = setTimeout(async () => { + this.escalationTimers.delete(timerKey); + + try { + const latestMonitor = await this.monitorsRepository.findById(monitor.id, monitor.teamId); + if (latestMonitor.status !== "down") { + return; + } + + const activeIncident = await this.incidentsRepository.findActiveByMonitorId(latestMonitor.id, latestMonitor.teamId); + if (!activeIncident) { + return; + } + + const startedAt = new Date(activeIncident.startTime).getTime(); + const downForMinutes = Math.max(1, Math.floor((Date.now() - startedAt) / 60000)); + + const settings = this.settingsService.getSettings(); + const clientHost = settings.clientHost || "Host not defined"; + const escalationMessage = this.notificationMessageBuilder.buildEscalationMessage(latestMonitor, downForMinutes, clientHost); + + await this.sendNotificationsByIds([escalation.notificationId], escalationMessage); + } catch (error: unknown) { + this.logger.error({ + message: `Failed to send escalation notification for monitor ${monitor.id}: ${error instanceof Error ? error.message : "Unknown error"}`, + service: SERVICE_NAME, + method: "scheduleEscalations", + stack: error instanceof Error ? error.stack : undefined, + }); + } + }, delayMinutes * 60 * 1000); + + this.escalationTimers.set(timerKey, timeoutId); + } + }; + handleNotifications = async (monitor: Monitor, monitorStatusResponse: MonitorStatusResponse, decision: MonitorActionDecision) => { if (!decision.shouldSendNotification) { return false; } + if (decision.shouldResolveIncident && monitor.status === "up") { + this.clearEscalationTimersForMonitor(monitor.id); + } + + if (decision.shouldCreateIncident && monitor.status === "down") { + await this.scheduleEscalations(monitor); + } + // Send notifications based on decision return await this.sendNotifications(monitor, monitorStatusResponse, decision); }; diff --git a/server/src/types/monitor.ts b/server/src/types/monitor.ts index f29ce75d78..731ea47e63 100644 --- a/server/src/types/monitor.ts +++ b/server/src/types/monitor.ts @@ -15,6 +15,11 @@ export type MonitorStatus = (typeof MonitorStatuses)[number]; export const MonitorMatchMethods = ["equal", "include", "regex"] as const; export type MonitorMatchMethod = (typeof MonitorMatchMethods)[number] | ""; +export interface MonitorEscalationNotification { + notificationId: string; + delayMinutes: number; +} + export interface Monitor { id: string; userId: string; @@ -37,6 +42,7 @@ export interface Monitor { interval: number; uptimePercentage?: number; notifications: string[]; + escalationNotifications?: MonitorEscalationNotification[]; secret?: string; cpuAlertThreshold: number; cpuAlertCounter: number; diff --git a/server/src/types/notificationMessage.ts b/server/src/types/notificationMessage.ts index f06ff1bd9a..0990b29295 100644 --- a/server/src/types/notificationMessage.ts +++ b/server/src/types/notificationMessage.ts @@ -3,7 +3,7 @@ * Part of notification system unification effort */ -export type NotificationType = "monitor_down" | "monitor_up" | "threshold_breach" | "threshold_resolved" | "test"; +export type NotificationType = "monitor_down" | "monitor_up" | "monitor_down_escalation" | "threshold_breach" | "threshold_resolved" | "test"; export type NotificationSeverity = "critical" | "warning" | "info" | "success"; diff --git a/server/src/validation/monitorValidation.ts b/server/src/validation/monitorValidation.ts index df000ecef2..9c2cb82203 100644 --- a/server/src/validation/monitorValidation.ts +++ b/server/src/validation/monitorValidation.ts @@ -3,6 +3,11 @@ import { booleanCoercion } from "./shared.js"; import { GeoContinents } from "@/types/geoCheck.js"; import { MonitorMatchMethods, MonitorTypes } from "@/types/monitor.js"; +const escalationNotificationValidation = z.object({ + notificationId: z.string().min(1, "Escalation notification channel is required"), + delayMinutes: z.number().int().min(1, "Escalation delay must be at least 1 minute"), +}); + export const getMonitorByIdParamValidation = z.object({ monitorId: z.string().min(1, "Monitor ID is required"), }); @@ -67,6 +72,7 @@ export const createMonitorBodyValidation = z.object({ diskAlertThreshold: z.number().optional(), tempAlertThreshold: z.number().optional(), notifications: z.array(z.string()).optional(), + escalationNotifications: z.array(escalationNotificationValidation).optional(), secret: z.string().optional(), jsonPath: z.union([z.string(), z.literal("")]).optional(), expectedValue: z.union([z.string(), z.literal("")]).optional(), @@ -89,6 +95,7 @@ export const editMonitorBodyValidation = z.object({ description: z.union([z.string(), z.literal("")]).optional(), interval: z.number().optional(), notifications: z.array(z.string()).optional(), + escalationNotifications: z.array(escalationNotificationValidation).optional(), secret: z.string().optional(), ignoreTlsErrors: z.boolean().optional(), useAdvancedMatching: z.boolean().optional(), @@ -144,6 +151,7 @@ const importedMonitorSchema = z.object({ interval: z.number().default(60000), uptimePercentage: z.number().optional(), notifications: z.array(z.string()).default([]), + escalationNotifications: z.array(escalationNotificationValidation).default([]), secret: z.string().optional(), cpuAlertThreshold: z.number().default(100), cpuAlertCounter: z.number().default(5),