CentOS Backlog 监控方法
一 概念与关键指标
二 命令行快速监控
三 脚本化巡检与阈值告警
#!/usr/bin/env bash
#
# TCP backlog inspection with threshold alerting.
#
# Estimates accept-queue (full-connection queue) pressure as the largest
# Recv-Q among LISTEN sockets divided by net.core.somaxconn, collects the
# ListenOverflows counter and the current number of SYN-RECV sockets, and
# mails an alert when the usage ratio reaches WARN_FULL or CRIT_FULL.
#
# Optional environment overrides: WARN_FULL, CRIT_FULL, ALERT_EMAIL.
set -o pipefail

# Thresholds as a fraction of somaxconn (tune per workload).
WARN_FULL=${WARN_FULL:-0.7}
CRIT_FULL=${CRIT_FULL:-0.9}
ALERT_EMAIL=${ALERT_EMAIL:-admin@example.com}

# Kernel limits; fall back to 128 when the sysctl file is unreadable or the
# value is not a plain non-negative integer.
SOMAXCONN=$(cat /proc/sys/net/core/somaxconn 2>/dev/null || echo 128)
[[ "$SOMAXCONN" =~ ^[0-9]+$ ]] || SOMAXCONN=128
# Half-open (SYN) queue limit. Not used in the threshold logic below; kept
# so it is at hand when comparing against syn_recv.
# shellcheck disable=SC2034
SYN_MAX=$(cat /proc/sys/net/ipv4/tcp_max_syn_backlog 2>/dev/null || echo 128)

#######################################
# Print the largest Recv-Q among LISTEN rows.
# Input:   `ss -tnl` style lines on stdin (State Recv-Q Send-Q Local Peer).
# Outputs: the maximum as an integer on stdout (0 when no LISTEN row exists).
#######################################
max_listen_recvq() {
  local state recvq _rest max=0
  while read -r state recvq _rest; do
    # The header line and malformed rows are skipped by this filter, so the
    # caller does not need `ss -H` (absent from older iproute2 releases).
    [[ "$state" == "LISTEN" && "$recvq" =~ ^[0-9]+$ ]] || continue
    if (( 10#$recvq > max )); then
      max=$(( 10#$recvq ))
    fi
  done
  printf '%s\n' "$max"
}

#######################################
# Count rows whose state column is SYN-RECV.
# Input:   `ss -tan` style lines on stdin.
# Outputs: the count on stdout.
#######################################
count_syn_recv() {
  local state _rest count=0
  while read -r state _rest; do
    if [[ "$state" == "SYN-RECV" ]]; then
      count=$(( count + 1 ))
    fi
  done
  printf '%s\n' "$count"
}

#######################################
# Print one TcpExt counter from a /proc/net/netstat formatted file.
# Arguments: $1 - counter name (e.g. ListenOverflows)
#            $2 - file to read (default: /proc/net/netstat)
# Outputs:   the counter value, or 0 when the file or counter is missing.
#######################################
read_tcpext_counter() {
  local name=$1 file=${2:-/proc/net/netstat}
  if [[ ! -r "$file" ]]; then
    printf '0\n'
    return 0
  fi
  # The file holds a "TcpExt:" line of names followed by a "TcpExt:" line of
  # values; locate the column on the first and read it from the second.
  LC_ALL=C awk -v name="$name" '
    $1 == "TcpExt:" {
      if (!seen_header) {
        for (i = 2; i <= NF; i++) if ($i == name) col = i
        seen_header = 1
        next
      }
      if (col) value = $col
      exit
    }
    END {
      if (value == "") value = 0
      print value
    }
  ' "$file"
}

#######################################
# Print queued/limit with two decimals, or 0 when limit is not positive.
# Arguments: $1 - queued connections, $2 - queue limit
#######################################
backlog_usage() {
  local queued=$1 limit=$2
  if (( limit > 0 )); then
    LC_ALL=C awk -v q="$queued" -v l="$limit" 'BEGIN { printf "%.2f", q / l }'
  else
    printf '0'
  fi
}

#######################################
# Floating-point "greater or equal" test.
# Arguments: $1 - value, $2 - threshold
# Returns:   0 when value >= threshold, non-zero otherwise.
#######################################
ratio_at_least() {
  # awk prints nothing here; the comparison result is carried by its exit
  # status, so this must be used as a command, not inside (( )).
  LC_ALL=C awk -v v="$1" -v t="$2" 'BEGIN { exit !(v + 0 >= t + 0) }'
}

main() {
  local max_recvq usage overflow syn_recv level="" msg=""

  # Largest accept-queue depth across listeners (approximates queue usage).
  max_recvq=$(ss -tnl 2>/dev/null | max_listen_recvq)
  [[ "$max_recvq" =~ ^[0-9]+$ ]] || max_recvq=0

  usage=$(backlog_usage "$max_recvq" "$SOMAXCONN")

  # Accept-queue overflow counter, read from the kernel directly because the
  # wording of `netstat -s` lines is not a stable interface.
  overflow=$(read_tcpext_counter ListenOverflows)
  [[ "$overflow" =~ ^[0-9]+$ ]] || overflow=0

  # Sockets currently in SYN-RECV: a direct view of half-open queue pressure.
  syn_recv=$(ss -tan 2>/dev/null | count_syn_recv)
  [[ "$syn_recv" =~ ^[0-9]+$ ]] || syn_recv=0

  # Threshold evaluation: critical takes precedence over warning.
  if ratio_at_least "$usage" "$CRIT_FULL"; then
    level="CRIT"
  elif ratio_at_least "$usage" "$WARN_FULL"; then
    level="WARN"
  fi

  if [[ -n "$level" ]]; then
    msg="${level} backlog usage=${usage} (max_recvq=${max_recvq}, somaxconn=${SOMAXCONN}) overflow=${overflow} syn_recv=${syn_recv}"
    printf '%s %s\n' "$(date '+%F %T')" "$msg" \
      | mail -s "Backlog Alert on $(hostname)" "$ALERT_EMAIL"
  fi
}

main "$@"
四 可视化与长期告警
# Prometheus alerting rules for TCP backlog monitoring.
# NOTE(review): node_backlog_usage and node_backlog_overflow are not standard
# node_exporter metrics; presumably they are published by a custom exporter
# or textfile collector — confirm the metric names against it.
groups:
  - name: backlog
    rules:
      # Fires when the backlog usage ratio stays above 0.8 for one minute.
      - alert: HighBacklogUsage
        expr: node_backlog_usage > 0.8
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High backlog usage on {{ $labels.instance }}"
          # Assumes the series carries max_recvq and somaxconn labels;
          # otherwise these placeholders render empty — TODO confirm.
          description: "Current usage {{ $value | humanizePercentage }}, max_recvq={{ $labels.max_recvq }}, somaxconn={{ $labels.somaxconn }}"
      # Fires as soon as the overflow counter grows within a 5-minute window
      # (no `for:` clause, so there is no pending period).
      - alert: BacklogOverflow
        expr: increase(node_backlog_overflow[5m]) > 0
        labels:
          severity: critical
        annotations:
          summary: "Backlog overflow detected on {{ $labels.instance }}"
五 排障与优化要点