Moving all the monitoring into docker-compose
I might write this up in full as a useful instruction guide at some stage, but for now I'm documenting the eventual approach and the things that I tripped over along the way.
It's Prometheus, AlertManager, BlackboxExporter and Grafana, with nginx as a reverse proxy. Port 443 is only accessible over the LAN and wireguard networks. It's not exposed to the internet, so there's no authn in place at this point.
NodeExporter is running on all of the monitored nodes, and BlackboxExporter is used to ping the hosts to see if they're alive (should NodeExporter be down).
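The monitored nodes themselves sit outside this compose stack. For completeness, here's a minimal sketch of how NodeExporter can run on each of them, assuming docker is available there (a native package or systemd unit works just as well):
# Run node_exporter on a monitored node. Host networking and the read-only
# rootfs mount let it report real host metrics rather than the container's.
docker run -d --name node_exporter --restart always \
  --net host --pid host \
  -v /:/host:ro,rslave \
  prom/node-exporter:latest --path.rootfs=/host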
AlertManager is configured to push alerts via webhook. At the moment, this is going to an MS Power Automate flow.
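Once the stack is up, the whole path can be smoke-tested by pushing a synthetic alert into AlertManager's v2 API and watching for the webhook to arrive at the flow (the alert name here is made up; add -k if curl doesn't trust the cert nginx is serving):
# Fire a throwaway alert to exercise the webhook receiver
curl -X POST https://alertmanager.blip.zip/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{"labels": {"alertname": "TestAlert", "severity": "warning"}}]'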
I've looked at ntfy.sh and will probably spin up a test instance soon.
Trips
- I'd originally written much of this config on the fly, and I spent far too much time fixing problems I'd created ages ago.
- Trying to run Prometheus and the associated services as a user other than nobody was frustrating. I'd planned to keep the config hidden from other users on the local monitoring box by default, but because Prometheus runs as nobody, I've had to chmod 644 (-rw-r--r--) the config files instead.
# Create config directories
sudo mkdir -p /etc/prometheus
sudo mkdir -p /etc/blackbox
sudo mkdir -p /etc/alertmanager
# Create prometheus rules file
cat <<EOF | sudo tee /etc/prometheus/rules.yml > /dev/null
groups:
  - name: host-down
    rules:
      - alert: HostDown
        expr: up{job="node_exporter"} == 0 or probe_success{job="blackbox"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          title: Host is down
          description: The host cannot be contacted
  - name: system-alerts
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ \$labels.instance }}"
          description: "CPU usage is above 80% for the past 2 minutes on {{ \$labels.instance }}."
      - alert: HighDriveSpaceUsage
        expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High disk space usage detected on {{ \$labels.instance }}"
          description: "Disk usage is above 80% for the past 2 minutes on {{ \$labels.instance }} (mountpoint: {{ \$labels.mountpoint }})."
      - alert: HighCPUTemperature
        expr: node_hwmon_temp_celsius > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU temperature detected on {{ \$labels.instance }}"
          description: "CPU core temperature is above 50°C for the past 2 minutes on {{ \$labels.instance }}."
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected on {{ \$labels.instance }}"
          description: "Memory usage is above 80% for the past 2 minutes on {{ \$labels.instance }}."
EOF
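It's worth validating the rules before anything tries to load them. promtool ships inside the prom/prometheus image, so no local install is needed (a quick sketch, assuming the paths above):
# Validate the rules file with promtool from the prometheus image
docker run --rm -v /etc/prometheus:/etc/prometheus:ro --entrypoint promtool \
  prom/prometheus:latest check rules /etc/prometheus/rules.yml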
# Create prometheus config file
cat <<EOF | sudo tee /etc/prometheus/prometheus.yml > /dev/null
global:
  evaluation_interval: 1m

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

rule_files:
  - "rules.yml"

scrape_configs:
  - job_name: "node_exporter"
    static_configs:
      - targets:
          - prometheus:9090
          - nuc1.internal:9100
          - nuc2.internal:9100
          - nuc3.internal:9100
          - gateway.blip.zip:9100
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [icmp]
    static_configs:
      - targets:
          - nuc1.internal
          - nuc2.internal
          - nuc3.internal
          - gateway.blip.zip
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115
EOF
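The main config can be checked the same way; promtool also follows the rule_files reference:
# Validate prometheus.yml (and the rules it references)
docker run --rm -v /etc/prometheus:/etc/prometheus:ro --entrypoint promtool \
  prom/prometheus:latest check config /etc/prometheus/prometheus.yml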
# Create blackbox config file
cat <<EOF | sudo tee /etc/blackbox/blackbox.yml > /dev/null
modules:
  tcp_connect:
    prober: tcp
  icmp:
    prober: icmp
  icmp_ttl5:
    prober: icmp
    timeout: 5s
    icmp:
      ttl: 5
EOF
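Once the containers are running, an individual probe can be tested by hand. This sketch runs from inside the prometheus container (assuming the busybox wget in that image, which recent prom/prometheus releases include):
# Ask blackbox-exporter to ping one of the hosts and dump the probe metrics
docker exec prometheus wget -qO- \
  'http://blackbox-exporter:9115/probe?module=icmp&target=nuc1.internal'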
# Read in value for webhook URL
read -p "Enter webhook URL for alertmanager: " webhook_url_alertmanager
# Create alertmanager config file
cat <<EOF | sudo tee /etc/alertmanager/alertmanager.yml > /dev/null
global:
  resolve_timeout: 1m

route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 24h
  receiver: 'powerautomate'

receivers:
  - name: 'powerautomate'
    webhook_configs:
      - url: '$webhook_url_alertmanager'
EOF
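amtool can lint this file, in the same style as the promtool checks above:
# Validate the alertmanager config
docker run --rm -v /etc/alertmanager:/etc/alertmanager:ro --entrypoint amtool \
  prom/alertmanager:latest check-config /etc/alertmanager/alertmanager.yml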
sudo chmod 755 /etc/prometheus
sudo chmod 644 /etc/prometheus/*
sudo chmod 755 /etc/blackbox
sudo chmod 644 /etc/blackbox/*
sudo chmod 755 /etc/alertmanager
sudo chmod 644 /etc/alertmanager/*
# Create nginx config. We're assuming that certs already exist in /etc/nginx/certs.
cat <<EOF | sudo tee /etc/nginx/nginx.conf > /dev/null
events {
    worker_connections 1024;
}

http {
    server {
        listen 443 ssl;
        server_name prometheus.blip.zip;
        ssl_certificate_key /etc/nginx/certs/key.pem;
        ssl_certificate /etc/nginx/certs/cert.pem;
        ssl_protocols TLSv1.2 TLSv1.3;

        location / {
            proxy_pass http://prometheus:9090;
        }
    }

    server {
        listen 443 ssl;
        server_name alertmanager.blip.zip;
        ssl_certificate_key /etc/nginx/certs/key.pem;
        ssl_certificate /etc/nginx/certs/cert.pem;
        ssl_protocols TLSv1.2 TLSv1.3;

        location / {
            proxy_pass http://alertmanager:9093;
        }
    }

    server {
        listen 443 ssl;
        server_name grafana.blip.zip;
        ssl_certificate_key /etc/nginx/certs/key.pem;
        ssl_certificate /etc/nginx/certs/cert.pem;
        ssl_protocols TLSv1.2 TLSv1.3;

        location / {
            proxy_pass http://grafana:3000;
            proxy_set_header Host \$host;
            proxy_set_header X-Real-IP \$remote_addr;
            proxy_set_header X-Forwarded-For \$proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto \$scheme;
        }
    }
}
EOF
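nginx can syntax-check the config (including loading the certs) before it's in the request path:
# Test the nginx config using the same image the stack will run
docker run --rm -v /etc/nginx:/etc/nginx:ro nginx:latest nginx -t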
# Create docker-compose file for the whole monitoring stack
sudo mkdir -p /etc/docker-scripts/monitoring
cat <<EOF | sudo tee /etc/docker-scripts/monitoring/docker-compose.yml > /dev/null
---
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: always
    volumes:
      - prometheus_data:/prometheus
      - /etc/prometheus:/etc/prometheus:ro
    ports:
      - '9090'
    networks:
      - monitoring

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: always
    volumes:
      - grafana_data:/var/lib/grafana
    ports:
      - '3000'
    networks:
      - monitoring
    depends_on:
      - prometheus

  blackbox-exporter:
    image: prom/blackbox-exporter:latest
    container_name: blackbox-exporter
    restart: always
    volumes:
      - /etc/blackbox:/etc/blackbox:ro
    command:
      - '--config.file=/etc/blackbox/blackbox.yml'
    ports:
      - '9115'
    networks:
      - monitoring

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: always
    volumes:
      - alertmanager_data:/alertmanager
      - /etc/alertmanager:/etc/alertmanager:ro
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    ports:
      - '9093'
    networks:
      - monitoring

  nginx:
    image: nginx:latest
    container_name: nginx
    restart: always
    volumes:
      - /etc/nginx/:/etc/nginx/:ro
    ports:
      - '443:443'
    networks:
      - monitoring

volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:

networks:
  monitoring:
    driver: bridge
    ipam:
      driver: default
      config:
        - subnet: "10.60.21.0/24"
          gateway: "10.60.21.1"
EOF
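Finally, bring the whole stack up and check that everything is running:
# Start the monitoring stack
cd /etc/docker-scripts/monitoring
sudo docker compose up -d
sudo docker compose ps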