mirror of
https://github.com/dcarrillo/atalaya.git
synced 2026-04-18 02:24:05 +00:00
feat: maintenance mode for monitors
This commit is contained in:
45
README.md
45
README.md
@@ -16,6 +16,7 @@ Live [example](https://uptime.ifconfig.es/).
|
||||
- [Configuration](#configuration)
|
||||
- [Settings](#settings)
|
||||
- [Monitor Types](#monitor-types)
|
||||
- [Maintenance Windows](#maintenance-windows)
|
||||
- [Regional Monitoring](#regional-monitoring)
|
||||
- [Alerts](#alerts)
|
||||
- [Status Page](#status-page)
|
||||
@@ -151,6 +152,49 @@ Each monitor can override the global default\_\* settings:
|
||||
|
||||
### Monitor Types
|
||||
|
||||
### Maintenance Windows
|
||||
|
||||
You can configure scheduled maintenance windows for individual monitors, suspending alerts while downtime is still tracked for metrics.
|
||||
|
||||
**Syntax**
|
||||
Each monitor supports a `maintenance` array of one or more objects, each specifying a `start` and an `end` timestamp in ISO 8601 UTC format (with a trailing `Z`).
|
||||
|
||||
```yaml
|
||||
- name: 'web-api'
|
||||
type: http
|
||||
target: 'https://example.com/health'
|
||||
maintenance:
|
||||
- start: '2026-05-10T23:00:00Z'
|
||||
end: '2026-05-11T01:00:00Z'
|
||||
- start: '2026-06-01T02:00:00Z'
|
||||
end: '2026-06-01T03:30:00Z'
|
||||
alerts: ['default']
|
||||
```
|
||||
|
||||
**Behavior**
|
||||
|
||||
- Whenever the current time (UTC) falls inside a window (`start` inclusive, `end` exclusive), the monitor is shown as "maintenance":
|
||||
- Status page and API both clearly show "maintenance" (distinct from "up" or "down").
|
||||
- Downtime during maintenance _is still counted_ for metrics and reporting.
|
||||
- All alerts are suppressed—no notifications are sent for failures or recoveries during maintenance.
|
||||
- Malformed/invalid windows are logged with a warning and ignored.
|
||||
- Windows are strictly parsed as UTC: both `start` and `end` must be present, must be valid ISO 8601 timestamps ending in `Z`, and `end` must be later than `start`.
|
||||
- If a maintenance window is already in progress at startup, the status immediately reflects "maintenance".
|
||||
- Overlapping or adjacent windows are allowed; each window is evaluated independently, and the monitor is in maintenance whenever the current time falls inside at least one of them.
|
||||
|
||||
**Example Config**
|
||||
|
||||
```yaml
|
||||
monitors:
|
||||
- name: 'api-maintenance'
|
||||
type: http
|
||||
target: 'https://api.example.com/health'
|
||||
maintenance:
|
||||
- start: '2026-05-10T23:00:00Z'
|
||||
end: '2026-05-11T01:00:00Z'
|
||||
alerts: ['default']
|
||||
```
|
||||
|
||||
**HTTP**
|
||||
|
||||
```yaml
|
||||
@@ -367,6 +411,7 @@ npm run check:pages # pages (astro check + tsc)
|
||||
|
||||
- [ ] Add support for TLS checks (certificate validity, expiration). Apparently, the Workers API does not support certificate data access, even at the socket level. An external service may be required.
|
||||
- [ ] Refine the status page to look... well... less AI-generated.
|
||||
- [x] Per-monitor maintenance windows (docs and config example added)
|
||||
- [ ] Initial support for incident management (manual status overrides, incident timeline).
|
||||
- [x] Branded status page (simple custom banner).
|
||||
- [ ] Add support for notifications other than webhooks.
|
||||
|
||||
@@ -108,6 +108,37 @@ describe('getStatusApiData', () => {
|
||||
expect(result.title).toBe('Test Status Page');
|
||||
});
|
||||
|
||||
it('surfaces maintenance status and excludes from up/down counts', async () => {
|
||||
const now = Math.floor(Date.now() / 1000);
|
||||
const db = mockD1Database({
|
||||
states: [
|
||||
{ monitor_name: 'up-monitor', current_status: 'up', last_checked: now },
|
||||
{ monitor_name: 'maint', current_status: 'maintenance', last_checked: now },
|
||||
{ monitor_name: 'down-monitor', current_status: 'down', last_checked: now },
|
||||
],
|
||||
hourly: [],
|
||||
recent: [
|
||||
{
|
||||
monitor_name: 'maint',
|
||||
checked_at: now - 10,
|
||||
status: 'maintenance',
|
||||
response_time_ms: 0,
|
||||
},
|
||||
],
|
||||
});
|
||||
const result = await getStatusApiData(db, testConfig);
|
||||
const maint = result.monitors.find(m => m.name === 'maint');
|
||||
expect(maint).toBeDefined();
|
||||
expect(maint!.status).toBe('maintenance');
|
||||
expect(maint!.recentChecks[0]).toEqual({
|
||||
timestamp: now - 10,
|
||||
status: 'maintenance',
|
||||
responseTimeMs: 0,
|
||||
});
|
||||
// Only up and down counted in summary
|
||||
expect(result.summary).toEqual({ total: 3, operational: 1, down: 1 });
|
||||
});
|
||||
|
||||
it('does not count unknown status monitors as down', async () => {
|
||||
const now = Math.floor(Date.now() / 1000);
|
||||
const db = mockD1Database({
|
||||
|
||||
@@ -70,15 +70,17 @@ export async function getStatusApiData(
|
||||
const dailyHistory = computeDailyHistory(hourly);
|
||||
const uptimePercent = computeOverallUptime(hourly);
|
||||
|
||||
const status: 'up' | 'down' | 'unknown' =
|
||||
state.current_status === 'up' || state.current_status === 'down'
|
||||
const status: 'up' | 'down' | 'unknown' | 'maintenance' =
|
||||
state.current_status === 'maintenance'
|
||||
? 'maintenance'
|
||||
: state.current_status === 'up' || state.current_status === 'down'
|
||||
? state.current_status
|
||||
: 'unknown';
|
||||
|
||||
const rawChecks = checksByMonitor.get(state.monitor_name) ?? [];
|
||||
const apiRecentChecks: ApiRecentCheck[] = rawChecks.map(c => ({
|
||||
timestamp: c.checked_at,
|
||||
status: c.status === 'up' ? ('up' as const) : ('down' as const),
|
||||
status: c.status === 'maintenance' ? 'maintenance' : c.status === 'up' ? 'up' : 'down',
|
||||
responseTimeMs: c.response_time_ms ?? 0,
|
||||
}));
|
||||
|
||||
|
||||
@@ -68,6 +68,37 @@ function applyDefaults(raw: RawYamlConfig): Config {
|
||||
failureThreshold: m.failure_threshold ?? settings.defaultFailureThreshold,
|
||||
alerts: m.alerts ?? [],
|
||||
region: m.region && isValidRegion(m.region) ? m.region : undefined,
|
||||
maintenance: Array.isArray(m.maintenance)
|
||||
? m.maintenance.filter((w: any) => {
|
||||
if (
|
||||
!w ||
|
||||
typeof w !== 'object' ||
|
||||
typeof w.start !== 'string' ||
|
||||
typeof w.end !== 'string'
|
||||
)
|
||||
return false;
|
||||
const startMs = Date.parse(w.start);
|
||||
const endMs = Date.parse(w.end);
|
||||
if (
|
||||
isNaN(startMs) ||
|
||||
isNaN(endMs) ||
|
||||
!w.start.endsWith('Z') ||
|
||||
!w.end.endsWith('Z') ||
|
||||
endMs <= startMs
|
||||
) {
|
||||
console.warn(
|
||||
JSON.stringify({
|
||||
event: 'invalid_maintenance_window',
|
||||
start: w.start,
|
||||
end: w.end,
|
||||
monitor: m.name,
|
||||
})
|
||||
);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
})
|
||||
: undefined,
|
||||
};
|
||||
|
||||
const type = (m.type as 'http' | 'tcp' | 'dns') ?? 'http';
|
||||
|
||||
@@ -19,6 +19,11 @@ export type WebhookAlert = AlertBase & {
|
||||
export type Alert = WebhookAlert; // | EmailAlert | ...
|
||||
|
||||
interface MonitorBase {
|
||||
/**
|
||||
* List of maintenance windows. If now is >= start and < end,
|
||||
* monitor is treated as "maintenance". Times must be ISO8601 UTC (with 'Z').
|
||||
*/
|
||||
maintenance?: { start: string; end: string }[];
|
||||
name: string;
|
||||
target: string;
|
||||
timeoutMs: number;
|
||||
@@ -87,5 +92,6 @@ export type RawYamlConfig = {
|
||||
failure_threshold?: number;
|
||||
alerts?: string[];
|
||||
region?: string; // Cloudflare region code for regional checks
|
||||
maintenance?: { start: string; end: string }[];
|
||||
}>;
|
||||
};
|
||||
|
||||
2
src/processor/maintenance-import.ts
Normal file
2
src/processor/maintenance-import.ts
Normal file
@@ -0,0 +1,2 @@
|
||||
// Temporary import for next edit
|
||||
import { isInMaintenance } from '../utils/maintenance.js';
|
||||
@@ -7,6 +7,7 @@ import type {
|
||||
AlertCall,
|
||||
StateUpdate,
|
||||
} from './types.js';
|
||||
import { isInMaintenance } from '../utils/maintenance.js';
|
||||
|
||||
export function processResults(
|
||||
results: CheckResult[],
|
||||
@@ -35,6 +36,8 @@ export function processResults(
|
||||
if (!monitor) {
|
||||
continue;
|
||||
}
|
||||
// Maintenance check
|
||||
const inMaintenance = isInMaintenance(monitor.maintenance, new Date());
|
||||
|
||||
const state = stateMap.get(result.name) ?? {
|
||||
monitor_name: result.name,
|
||||
@@ -56,15 +59,23 @@ export function processResults(
|
||||
|
||||
const newState: StateUpdate = {
|
||||
monitorName: result.name,
|
||||
currentStatus: state.current_status,
|
||||
currentStatus: inMaintenance ? 'maintenance' : state.current_status,
|
||||
consecutiveFailures: state.consecutive_failures,
|
||||
lastStatusChange: state.last_status_change,
|
||||
lastChecked: now,
|
||||
};
|
||||
|
||||
// Only update downtime/failure/recovery/alerts logic if not in maintenance
|
||||
if (inMaintenance) {
|
||||
// Alert suppression: no alerts for down or recovery
|
||||
// But downtime is recorded (dbWrite above)
|
||||
// State persists in 'maintenance', reset nothing
|
||||
actions.stateUpdates.push(newState);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (result.status === 'down') {
|
||||
newState.consecutiveFailures = state.consecutive_failures + 1;
|
||||
|
||||
if (
|
||||
newState.consecutiveFailures >= monitor.failureThreshold &&
|
||||
state.current_status === 'up'
|
||||
@@ -86,10 +97,8 @@ export function processResults(
|
||||
} else {
|
||||
newState.consecutiveFailures = 0;
|
||||
newState.currentStatus = 'up';
|
||||
|
||||
if (state.current_status === 'down') {
|
||||
newState.lastStatusChange = now;
|
||||
|
||||
for (const alertName of monitor.alerts) {
|
||||
const alert: AlertCall = {
|
||||
alertName,
|
||||
@@ -104,7 +113,6 @@ export function processResults(
|
||||
newState.lastStatusChange = state.last_status_change;
|
||||
}
|
||||
}
|
||||
|
||||
actions.stateUpdates.push(newState);
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ export type CheckResult = {
|
||||
|
||||
export type MonitorState = {
|
||||
monitor_name: string;
|
||||
current_status: 'up' | 'down';
|
||||
current_status: 'up' | 'down' | 'maintenance';
|
||||
consecutive_failures: number;
|
||||
last_status_change: number;
|
||||
last_checked: number;
|
||||
@@ -33,7 +33,7 @@ export type AlertCall = {
|
||||
|
||||
export type StateUpdate = {
|
||||
monitorName: string;
|
||||
currentStatus: string;
|
||||
currentStatus: 'up' | 'down' | 'maintenance';
|
||||
consecutiveFailures: number;
|
||||
lastStatusChange: number;
|
||||
lastChecked: number;
|
||||
|
||||
16
src/types.ts
16
src/types.ts
@@ -108,7 +108,14 @@ export type StatusApiResponse = {
|
||||
|
||||
export type ApiMonitorStatus = {
|
||||
name: string;
|
||||
status: 'up' | 'down' | 'unknown';
|
||||
/**
|
||||
* Current status of the monitor.
|
||||
* 'up' - healthy
|
||||
* 'down' - failing
|
||||
* 'unknown' - initial/undefined
|
||||
* 'maintenance' - within a configured maintenance window (alerts suppressed, shown as maintenance in UI)
|
||||
*/
|
||||
status: 'up' | 'down' | 'unknown' | 'maintenance';
|
||||
lastChecked: number | undefined;
|
||||
uptimePercent: number;
|
||||
dailyHistory: ApiDayStatus[];
|
||||
@@ -122,6 +129,11 @@ export type ApiDayStatus = {
|
||||
|
||||
export type ApiRecentCheck = {
|
||||
timestamp: number;
|
||||
status: 'up' | 'down';
|
||||
/**
|
||||
* Status for a single check event.
|
||||
* Usually 'up' or 'down',
|
||||
* but 'maintenance' if check occurred during a maintenance window.
|
||||
*/
|
||||
status: 'up' | 'down' | 'maintenance';
|
||||
responseTimeMs: number;
|
||||
};
|
||||
|
||||
50
src/utils/maintenance.test.ts
Normal file
50
src/utils/maintenance.test.ts
Normal file
@@ -0,0 +1,50 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { isInMaintenance, MaintenanceWindow } from './maintenance';
|
||||
|
||||
/**
 * Test helper: builds a Date from a timestamp string so the assertions
 * below stay terse. The string is handed to the Date constructor unchanged.
 */
function utc(date: string) {
  const parsed = new Date(date);
  return parsed;
}
|
||||
|
||||
// Behavioural tests for isInMaintenance: empty input, boundary semantics
// (start inclusive, end exclusive), overlapping windows and malformed data.
describe('isInMaintenance', () => {
  it('returns false when maintenance undefined or empty', () => {
    expect(isInMaintenance(undefined, utc('2026-05-01T10:00:00Z'))).toBe(false);
    expect(isInMaintenance([], utc('2026-05-01T10:00:00Z'))).toBe(false);
  });

  it('includes and excludes at precise boundaries', () => {
    const mw: MaintenanceWindow[] = [
      { start: '2026-05-01T10:00:00Z', end: '2026-05-01T12:00:00Z' },
    ];
    expect(isInMaintenance(mw, utc('2026-05-01T09:59:59Z'))).toBe(false);
    expect(isInMaintenance(mw, utc('2026-05-01T10:00:00Z'))).toBe(true); // start boundary, inclusive
    expect(isInMaintenance(mw, utc('2026-05-01T11:59:59Z'))).toBe(true);
    expect(isInMaintenance(mw, utc('2026-05-01T12:00:00Z'))).toBe(false); // end boundary, exclusive
  });

  it('handles overlapping windows', () => {
    const mw: MaintenanceWindow[] = [
      { start: '2026-05-01T10:00:00Z', end: '2026-05-01T11:00:00Z' },
      { start: '2026-05-01T10:30:00Z', end: '2026-05-01T11:30:00Z' },
    ];
    expect(isInMaintenance(mw, utc('2026-05-01T10:45:00Z'))).toBe(true);
    expect(isInMaintenance(mw, utc('2026-05-01T11:15:00Z'))).toBe(true);
    expect(isInMaintenance(mw, utc('2026-05-01T11:30:00Z'))).toBe(false);
  });

  it('ignores malformed windows (should not reach here)', () => {
    // Guards against the config parser ever letting bad data through.
    // The literal is structurally a valid MaintenanceWindow (two strings),
    // so no type cast is needed to build it.
    const mw: MaintenanceWindow[] = [{ start: 'bad', end: 'also-bad' }];
    expect(isInMaintenance(mw, utc('2026-05-01T10:00:00Z'))).toBe(false);
  });

  it('returns true when any of several overlapping windows contains now', () => {
    const mw: MaintenanceWindow[] = [
      { start: '2026-05-01T08:00:00Z', end: '2026-05-01T11:00:00Z' },
      { start: '2026-05-01T10:00:00Z', end: '2026-05-01T12:00:00Z' },
    ];
    expect(isInMaintenance(mw, utc('2026-05-01T09:00:00Z'))).toBe(true); // first window only
    expect(isInMaintenance(mw, utc('2026-05-01T11:00:00Z'))).toBe(true); // first ended, second still open
    expect(isInMaintenance(mw, utc('2026-05-01T12:01:00Z'))).toBe(false); // after both
  });
});
|
||||
27
src/utils/maintenance.ts
Normal file
27
src/utils/maintenance.ts
Normal file
@@ -0,0 +1,27 @@
|
||||
// Utility to determine if a monitor is in maintenance based on maintenance windows and current time
|
||||
// All times must be strict ISO8601 with 'Z' (UTC). End is exclusive. Windows must be validated beforehand.
|
||||
|
||||
/** A single scheduled maintenance window for a monitor. */
export interface MaintenanceWindow {
  /** Window start (inclusive); ISO 8601 UTC timestamp ending in 'Z'. */
  start: string;
  /** Window end (exclusive); ISO 8601 UTC timestamp ending in 'Z'. */
  end: string;
}

/**
 * Parses a maintenance timestamp, accepting only strings that end in 'Z'.
 * Returns NaN for anything else so callers can treat the window as malformed.
 */
function parseUtcTimestamp(value: unknown): number {
  // Date.parse interprets a date-time without an offset as host-local time;
  // requiring the 'Z' suffix keeps the result independent of the host timezone.
  if (typeof value !== 'string' || !value.endsWith('Z')) return Number.NaN;
  return Date.parse(value);
}

/**
 * Returns true if `now` falls inside at least one valid maintenance window.
 *
 * `start` is inclusive and `end` is exclusive. Windows may overlap; each one
 * is checked independently. A window is skipped (never throws) when it is
 * missing, when either timestamp is not a string ending in 'Z', when either
 * timestamp cannot be parsed, or when `end` is not later than `start`.
 *
 * @param maintenance - Windows configured for the monitor; may be undefined or empty.
 * @param now - The instant to test.
 * @returns Whether `now` lies within any valid window. Always false when
 *          `now` is an invalid Date, because every comparison with NaN fails.
 */
export function isInMaintenance(maintenance: MaintenanceWindow[] | undefined, now: Date): boolean {
  if (!maintenance || maintenance.length === 0) return false;

  const nowMs = now.getTime();

  return maintenance.some(w => {
    const startMs = parseUtcTimestamp(w?.start);
    const endMs = parseUtcTimestamp(w?.end);
    // Skip malformed or reversed windows rather than failing the whole check.
    if (Number.isNaN(startMs) || Number.isNaN(endMs) || endMs <= startMs) return false;
    return nowMs >= startMs && nowMs < endMs;
  });
}
|
||||
@@ -1,5 +1,7 @@
|
||||
---
|
||||
import type { ApiMonitorStatus } from '@worker/types';
|
||||
// Allow maintenance as a valid runtime state:
|
||||
type MonitorStatus = ApiMonitorStatus['status'] | 'maintenance';
|
||||
import UptimeBars from './UptimeBars.astro';
|
||||
|
||||
interface Props {
|
||||
@@ -42,11 +44,17 @@ const chartData = JSON.stringify({
|
||||
<div
|
||||
class:list={['status-dot', `status-dot-${monitor.status}`]}
|
||||
role="status"
|
||||
aria-label={`Status: ${monitor.status === 'up' ? 'Operational' : monitor.status === 'down' ? 'Down' : 'Unknown'}`}
|
||||
title={`${monitor.status === 'up' ? 'Operational' : monitor.status === 'down' ? 'Down' : 'Unknown'}`}
|
||||
aria-label={`Status: ${(monitor.status as any) === 'up' ? 'Operational' : (monitor.status as any) === 'down' ? 'Down' : (monitor.status as any) === 'maintenance' ? 'Maintenance' : 'Unknown'}`}
|
||||
title={`${(monitor.status as any) === 'up' ? 'Operational' : (monitor.status as any) === 'down' ? 'Down' : (monitor.status as any) === 'maintenance' ? 'Maintenance' : 'Unknown'}`}
|
||||
|
||||
></div>
|
||||
<h3 class="monitor-name" id={`monitor-${monitor.name.replace(/\s+/g, '-').toLowerCase()}-title`} title={monitor.name}>{monitor.name}</h3>
|
||||
<span class:list={['monitor-uptime', `uptime-${monitor.status}`]}>{uptimeFormatted}%</span>
|
||||
<h3 class="monitor-name" id={`monitor-${monitor.name.replace(/\s+/g, '-').toLowerCase()}-title`} title={monitor.name}>
|
||||
{monitor.name}
|
||||
{(monitor.status as any) === 'maintenance' && (
|
||||
<span class="maintenance-badge" title="Scheduled Maintenance">Maintenance</span>
|
||||
)}
|
||||
</h3>
|
||||
<span class:list={['monitor-uptime', `uptime-${monitor.status}`]} aria-label={(monitor.status as any) === 'maintenance' ? 'Scheduled Maintenance - uptime value reflects unmonitored state' : undefined}>{uptimeFormatted}%</span>
|
||||
<span class="monitor-meta">{lastCheckedText}</span>
|
||||
</div>
|
||||
|
||||
@@ -166,6 +174,34 @@ const chartData = JSON.stringify({
|
||||
animation: pulse-glow 3s ease-in-out infinite;
|
||||
}
|
||||
|
||||
.status-dot-maintenance {
|
||||
background: var(--maintenance, #bfa21a);
|
||||
box-shadow: 0 0 16px #ffe066cc, 0 0 32px #bfa21ab0;
|
||||
border-color: #f8e16c;
|
||||
}
|
||||
.status-dot-maintenance::after {
|
||||
background: var(--maintenance, #f8e16c);
|
||||
animation: pulse-glow 1.5s ease-in-out infinite;
|
||||
opacity: 0.4;
|
||||
}
|
||||
|
||||
.uptime-maintenance {
|
||||
color: var(--maintenance, #bfa21a);
|
||||
}
|
||||
|
||||
.maintenance-badge {
|
||||
display: inline-block;
|
||||
margin-left: 0.5em;
|
||||
padding: 0.1em 0.5em;
|
||||
font-size: var(--text-2xs, 12px);
|
||||
background: var(--maintenance, #fff5bf);
|
||||
color: var(--maintenance, #bfa21a);
|
||||
border-radius: 4px;
|
||||
font-weight: 600;
|
||||
letter-spacing: 0.03em;
|
||||
vertical-align: middle;
|
||||
}
|
||||
|
||||
@keyframes pulse-glow {
|
||||
0%, 100% {
|
||||
opacity: 0.3;
|
||||
|
||||
@@ -32,7 +32,7 @@ try {
|
||||
}
|
||||
|
||||
// Sort: down monitors first, then maintenance, then unknown, then up
|
||||
const sortOrder = { down: 0, unknown: 1, up: 2 } as const;
|
||||
const sortOrder = { down: 0, maintenance: 1, unknown: 2, up: 3 } as const;
|
||||
const sortedMonitors = data ? [...data.monitors].sort(
|
||||
(a, b) => (sortOrder[a.status] ?? 1) - (sortOrder[b.status] ?? 1)
|
||||
) : [];
|
||||
|
||||
@@ -88,6 +88,13 @@ monitors:
|
||||
headers:
|
||||
Authorization: "Basic ${BASIC_AUTH}" # BASIC_AUTH must be defined as secret in Cloudflare
|
||||
alerts: ["default"]
|
||||
# Optional maintenance windows - ISO 8601 UTC format
|
||||
maintenance:
|
||||
- start: "2026-05-10T23:00:00Z" # Begin maintenance, UTC
|
||||
end: "2026-05-11T01:00:00Z" # End maintenance (exclusive), UTC
|
||||
- start: "2026-06-01T02:00:00Z"
|
||||
end: "2026-06-01T03:30:00Z"
|
||||
# Monitors in maintenance do NOT send alerts, but still accumulate downtime for reporting
|
||||
|
||||
# Regional monitoring examples
|
||||
# Run checks from specific Cloudflare regions
|
||||
|
||||
Reference in New Issue
Block a user