Moving all the monitoring into docker-compose

I might write this up in full as a useful instruction guide at some stage, but for now I'm documenting the eventual approach and the things that I tripped over along the way.

It's Prometheus, AlertManager, BlackboxExporter and Grafana, with nginx as a reverse proxy. Port 443 is only accessible over the LAN and wireguard networks. It's not exposed to the internet, and so there's no authn in place at this point.

NodeExporter is running on all of the monitored nodes, and BlackboxExporter is used to ping the hosts to see if they're alive (should NodeExporter be down).

AlertManager is configured to push alerts via webhook. At the moment, this is going to a MS Power Automate flow.

I've looked at ntfy.sh and will probably spin up a test instance soon.

Trips (the things I tripped over along the way)

# Create the config directories for each service in a single pass
sudo mkdir -p /etc/prometheus /etc/blackbox /etc/alertmanager

# Create prometheus rules file.
# NOTE: the heredoc delimiter is quoted ('EOF') so the shell does NOT expand
# the {{ $labels.instance }} template expressions. With an unquoted delimiter
# the undefined shell variable $labels expands to an empty string and every
# templated annotation is silently written as "{{ .instance }}".
cat <<'EOF' | sudo tee /etc/prometheus/rules.yml > /dev/null
groups:
  - name: host-down
    rules:
      # Fires when a host answers neither node_exporter scrapes nor ICMP
      # probes. Renamed from "Host down" to CamelCase for consistency with
      # the other alert names (older Prometheus releases also reject alert
      # names containing spaces).
      - alert: HostDown
        expr: up{job="node_exporter"} == 0 or probe_success{job="blackbox"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Host is down: {{ $labels.instance }}"
          description: "The host {{ $labels.instance }} cannot be contacted."
  - name: system-alerts
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }}"
          description: "CPU usage is above 80% for the past 2 minutes on {{ $labels.instance }}."

      - alert: HighDriveSpaceUsage
        expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High disk space usage detected on {{ $labels.instance }}"
          description: "Disk usage is above 80% for the past 2 minutes on {{ $labels.instance }} (mountpoint: {{ $labels.mountpoint }})."

      - alert: HighCPUTemperature
        expr: node_hwmon_temp_celsius > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU temperature detected on {{ $labels.instance }}"
          description: "CPU core temperature is above 50°C for the past 2 minutes on {{ $labels.instance }}."

      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected on {{ $labels.instance }}"
          description: "Memory usage is above 80% for the past 2 minutes on {{ $labels.instance }}."

EOF

# Create prometheus config file.
# Delimiter quoted out of caution — nothing in this file should be
# shell-expanded.
cat <<'EOF' | sudo tee /etc/prometheus/prometheus.yml > /dev/null
global:
  evaluation_interval: 1m
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093
rule_files:
  - "rules.yml"
scrape_configs:
  # node_exporter metrics from each monitored host. The first target is
  # Prometheus scraping its own metrics endpoint (hence port 9090, not 9100);
  # it therefore also gets covered by the up{job="node_exporter"} alert.
  - job_name: "node_exporter"
    static_configs:
      - targets:
        - prometheus:9090
        - nuc1.internal:9100
        - nuc2.internal:9100
        - nuc3.internal:9100
        - gateway.blip.zip:9100
  # ICMP liveness probes via blackbox-exporter. The relabeling rewrites each
  # target into a /probe?target=<host> request against blackbox-exporter:9115
  # while keeping the probed host as the "instance" label.
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [icmp]
    static_configs:
      - targets:
        - nuc1.internal
        - nuc2.internal
        - nuc3.internal
        - gateway.blip.zip
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115
EOF

# Create blackbox config file.
# Defines three probe modules; only "icmp" is referenced by prometheus.yml
# (params: module: [icmp]) — the other two are spares:
#   - tcp_connect: plain TCP connect check
#   - icmp:        basic ICMP echo probe (used by the 'blackbox' scrape job)
#   - icmp_ttl5:   ICMP probe with a 5s timeout and a TTL of 5
cat <<EOF | sudo tee /etc/blackbox/blackbox.yml > /dev/null
modules:
  tcp_connect:
    prober: tcp
  icmp:
    prober: icmp
  icmp_ttl5:
    prober: icmp
    timeout: 5s
    icmp:
      ttl: 5
EOF

# Read in value for webhook URL.
# -r keeps any backslashes in the pasted URL literal instead of treating
# them as escape sequences.
read -r -p "Enter webhook URL for alertmanager: " webhook_url_alertmanager

# Create alertmanager config file.
# NOTE: the heredoc delimiter is deliberately left UNquoted here so that
# $webhook_url_alertmanager IS expanded into the generated file.
cat <<EOF | sudo tee /etc/alertmanager/alertmanager.yml > /dev/null
global:
  resolve_timeout: 1m

route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 24h
  receiver: 'powerautomate'

receivers:
- name: 'powerautomate'
  webhook_configs:
  - url: '$webhook_url_alertmanager'
EOF


sudo chmod 755 /etc/prometheus
sudo chmod 644 /etc/prometheus/*

sudo chmod 755 /etc/blackbox
sudo chmod 644 /etc/blackbox/*

sudo chmod 755 /etc/alertmanager 
sudo chmod 644 /etc/alertmanager/*


# Create nginx config. We're assuming that certs already exist in /etc/nginx/certs.
# NOTE: the heredoc delimiter is quoted ('EOF') so nginx's own $variables can
# be written literally, instead of having to backslash-escape every one and
# risk a missed escape being shell-expanded to an empty string.
cat <<'EOF' | sudo tee /etc/nginx/nginx.conf > /dev/null
events {
	worker_connections 1024;
}
http {

	server {
		listen 443 ssl;
		server_name prometheus.blip.zip;
		ssl_certificate_key     /etc/nginx/certs/key.pem;
		ssl_certificate         /etc/nginx/certs/cert.pem;
		ssl_protocols           TLSv1.2 TLSv1.3;

		location / {
				proxy_pass http://prometheus:9090;
				proxy_set_header Host $host;
				proxy_set_header X-Real-IP $remote_addr;
				proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
				proxy_set_header X-Forwarded-Proto $scheme;
		}
	}

	server {
		listen 443 ssl;
		server_name alertmanager.blip.zip;
		ssl_certificate_key     /etc/nginx/certs/key.pem;
		ssl_certificate         /etc/nginx/certs/cert.pem;
		ssl_protocols           TLSv1.2 TLSv1.3;

		location / {
				proxy_pass http://alertmanager:9093;
				proxy_set_header Host $host;
				proxy_set_header X-Real-IP $remote_addr;
				proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
				proxy_set_header X-Forwarded-Proto $scheme;
		}
	}

	server {
		listen 443 ssl;
		server_name grafana.blip.zip;
		ssl_certificate_key     /etc/nginx/certs/key.pem;
		ssl_certificate         /etc/nginx/certs/cert.pem;
		ssl_protocols           TLSv1.2 TLSv1.3;

		location / {
				proxy_pass http://grafana:3000;
				proxy_set_header Host $host;
				proxy_set_header X-Real-IP $remote_addr;
				proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
				proxy_set_header X-Forwarded-Proto $scheme;
		}
	}
}
EOF

# Create docker-compose file for the whole monitoring stack.
# Internal services use "expose" rather than "ports": containers on the
# monitoring network can already reach each other by service name, whereas
# 'ports: - 9090' would ALSO publish the port on a random host port on every
# interface — bypassing the nginx-on-443-only design described above.
sudo mkdir -p /etc/docker-scripts/monitoring
cat <<'EOF' | sudo tee /etc/docker-scripts/monitoring/docker-compose.yml > /dev/null
---
# The version key is obsolete (ignored with a warning) under Compose V2;
# kept for compatibility with older docker-compose releases.
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: always
    volumes:
      - prometheus_data:/prometheus
      - /etc/prometheus:/etc/prometheus:ro
    expose:
      - '9090'
    networks:
      - monitoring

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: always
    volumes:
      - grafana_data:/var/lib/grafana
    expose:
      - '3000'
    networks:
      - monitoring
    depends_on:
      - prometheus

  blackbox-exporter:
    image: prom/blackbox-exporter:latest
    container_name: blackbox-exporter
    restart: always
    volumes:
      - /etc/blackbox:/etc/blackbox:ro
    command:
      - '--config.file=/etc/blackbox/blackbox.yml'
    expose:
      - '9115'
    networks:
      - monitoring

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: always
    volumes:
      - alertmanager_data:/alertmanager
      - /etc/alertmanager:/etc/alertmanager:ro
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    expose:
      - '9093'
    networks:
      - monitoring

  nginx:
    image: nginx:latest
    container_name: nginx
    restart: always
    volumes:
      - /etc/nginx/:/etc/nginx/:ro
    # Only port published on the host — everything else is reached via this
    # reverse proxy. depends_on makes the upstreams start first so nginx can
    # resolve the proxy_pass hostnames when it loads its config.
    ports:
      - '443:443'
    networks:
      - monitoring
    depends_on:
      - prometheus
      - alertmanager
      - grafana

volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:

networks:
  monitoring:
    driver: bridge
    ipam:
      driver: default
      config:
        - subnet: "10.60.21.0/24"
          gateway: "10.60.21.1"
EOF