在 Debian 上搭建 Docker 监控与告警
一 架构与组件
二 快速部署步骤
# Docker Compose stack for container and host monitoring:
#   prometheus    - scrapes metrics and evaluates alert rules
#   cadvisor      - per-container resource metrics
#   node-exporter - host-level metrics
#   alertmanager  - routes firing alerts to notification receivers
#   grafana       - dashboards
#
# Create ./prometheus.yml, ./rules.yml and ./alertmanager.yml BEFORE the first
# `docker compose up`: Docker creates an empty directory for any missing
# bind-mount source, and a directory cannot be loaded as a config file.
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:v2.45.0
    container_name: prometheus
    restart: always
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      # Alert rules file; matches the /etc/prometheus/rules.yml path that
      # `rule_files` in prometheus.yml points at.
      - ./rules.yml:/etc/prometheus/rules.yml:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    networks:
      - prom-net

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.0
    container_name: cadvisor
    restart: always
    ports:
      - "8080:8080"
    # Read-only host mounts cAdvisor inspects to discover containers.
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    networks:
      - prom-net

  node-exporter:
    image: prom/node-exporter:v1.6.1
    container_name: node-exporter
    restart: always
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Tell the exporter where the host root is mounted so filesystem
      # metrics describe the host rather than the container.
      - '--path.rootfs=/rootfs'
      # `mount-points-exclude` is the current name of the former
      # `ignored-mount-points` flag (renamed in node_exporter 1.3).
      # `$$` is Compose escaping for a literal `$` in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    user: root
    networks:
      - prom-net

  alertmanager:
    image: prom/alertmanager:v0.26.0
    container_name: alertmanager
    restart: always
    ports:
      - "9093:9093"
    volumes:
      # The image's default command reads this path, so no `command:`
      # override is needed.
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    networks:
      - prom-net

  grafana:
    image: grafana/grafana:10.2.2
    container_name: grafana
    restart: always
    ports:
      - "3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
    networks:
      - prom-net

# Named volumes keep the Prometheus TSDB and Grafana state across restarts.
volumes:
  prometheus-data:
  grafana-data:

# One bridge network so services reach each other by service name.
networks:
  prom-net:
    driver: bridge
# prometheus.yml - global timing plus the two scrape jobs.
# Targets are Compose service names, resolved on the shared Docker network.
global:
  # How often targets are scraped and how often rules are evaluated.
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # Per-container metrics.
  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - 'cadvisor:8080'

  # Host-level metrics.
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 'node-exporter:9100'
# Start the stack in the background. `&&` stops here if the directory is
# missing, so compose never runs against whatever the current directory is.
cd /opt/prometheus-docker && docker compose up -d
三 告警规则与通知
# rules.yml - Prometheus alerting rules for containers and scrape targets.
groups:
  - name: container.rules
    rules:
      # Fires when a named container averages more than 0.8 CPU cores over
      # one minute. The value is in cores (1.0 = one full core), so on
      # multi-core workloads the rendered percentage can exceed 100%.
      - alert: HighCPUUsage
        expr: sum(rate(container_cpu_usage_seconds_total{name!=""}[1m])) by (name) > 0.8
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "容器 {{ $labels.name }} CPU使用率过高"
          description: "当前使用率: {{ $value | humanizePercentage }}"

      # Fires when memory usage exceeds 90% of the container's limit.
      # The `> 0` filter drops containers without a memory limit: their
      # limit is reported as 0, and dividing by it yields +Inf, which would
      # otherwise make this alert fire for every unlimited container.
      - alert: MemoryNearLimit
        expr: >-
          container_memory_usage_bytes{name!=""}
          / (container_spec_memory_limit_bytes{name!=""} > 0)
          > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器 {{ $labels.name }} 内存接近上限"
          description: "已用/上限: {{ $value | humanizePercentage }}"

      # Fires when any scrape target has been unreachable for one minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 已下线"
# Additions to prometheus.yml that enable alerting.
# Rule files to load; the path is inside the Prometheus container.
rule_files:
  - /etc/prometheus/rules.yml

# Where firing alerts are sent. Without this section Prometheus evaluates
# the rules but never forwards anything to Alertmanager.
# Assumes Alertmanager is reachable as `alertmanager:9093` on the same
# Docker network - adjust if the container is named differently.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'
# alertmanager.yml - alert routing and notification receivers.
# All receivers live under a single `receivers:` key. Repeating the key is
# invalid YAML; most parsers keep only the last occurrence, which would
# leave the route's `email` receiver undefined.
global:
  # Time after which an alert is treated as resolved if not re-sent.
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  # Default receiver; change to 'slack', 'wechat' or 'dingtalk' to switch.
  receiver: 'email'

receivers:
  - name: 'email'
    email_configs:
      - to: 'ops@example.com'
        from: 'alert@example.com'
        smarthost: 'smtp.example.com:587'
        auth_username: 'user'
        # Placeholder; do not commit a real password to version control.
        auth_password: 'pass'
        require_tls: true

  - name: 'slack'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'
        channel: '#alerts'

  # NOTE(review): `webhook_configs` posts Alertmanager's own JSON payload.
  # The WeCom and DingTalk robot endpoints expect their own message format,
  # so an adapter/relay service in between is normally required - verify
  # before relying on these two receivers.
  - name: 'wechat'
    webhook_configs:
      - url: 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=YOUR_KEY'

  - name: 'dingtalk'
    webhook_configs:
      - url: 'https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN'
四 验证与运维要点
查看各组件日志:docker logs -f prometheus、docker logs -f alertmanager、docker logs -f cadvisor。
五 可选增强