Ubuntu监控HDFS集群状态
一 命令行快速巡检
二 Web UI可视化监控
三 第三方监控与可视化
四 自动化巡检与告警脚本示例
#!/usr/bin/env bash
#
# HDFS health check with plugin-style exit codes.
#
# Checks, in order (the first failing check ends the script):
#   1. a NameNode or DataNode process is listed by `jps` on this host
#   2. HDFS is not in safe mode
#   3. remaining DFS capacity is at least 10 GiB
#   4. no under-replicated or missing blocks are reported
#
# Environment:
#   HADOOP_HOME  Hadoop installation directory
#                (default: /usr/local/hadoop-3.3.6; adjust to the real path)
#
# Output: exactly one status line on stdout.
# Exit status: 0 = OK, 1 = WARNING, 2 = CRITICAL.
set -uo pipefail

HADOOP_HOME="${HADOOP_HOME:-/usr/local/hadoop-3.3.6}"
PATH="$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin"

# Minimum acceptable "DFS Remaining", in bytes (10 GiB).
readonly MIN_REMAINING_BYTES=10737418240

#######################################
# Print the first whitespace-separated token that follows "<label>:" on the
# first line of the cluster report containing it, with thousands separators
# (commas) removed. Prints nothing when the label is absent.
# Globals:   REPORT (read)
# Arguments: $1 - field label without the trailing colon
# Outputs:   the field value on stdout
#######################################
report_value() {
  local label=$1
  # Matching "<label>:" (colon included) keeps "DFS Used" from matching
  # "DFS Used%" and "Missing blocks" from matching "Missing blocks (with ...)".
  awk -v label="${label}:" '
    {
      pos = index($0, label)
      if (pos == 0) next
      n = split(substr($0, pos + length(label)), fields, " ")
      if (n > 0) {
        gsub(/,/, "", fields[1])
        print fields[1]
      }
      exit
    }
  ' <<<"$REPORT"
}

# 1) Process liveness.
# Capture the output first so that grep exiting early can never make the
# check fail through pipefail. -w matches whole words only, so a lone
# "SecondaryNameNode" does not count as a NameNode.
jps_out=$(jps)
if ! grep -qwE 'NameNode|DataNode' <<<"$jps_out"; then
  echo "CRITICAL: HDFS进程(NameNode/DataNode)未运行"
  exit 2
fi

# 2) Safe mode.
# stderr is discarded on purpose: only the "Safe mode is ON|OFF" line on
# stdout is of interest here.
# NOTE(review): with HA enabled, dfsadmin is expected to print one such line
# per NameNode followed by " in <address>" -- confirm for your Hadoop version.
safemode_out=$(hdfs dfsadmin -safemode get 2>/dev/null)
if [[ "$safemode_out" != *"Safe mode is "* ]]; then
  # The command failed or printed something unexpected: the state is unknown,
  # which must not be reported as "in safe mode".
  echo "CRITICAL: 无法获取HDFS安全模式状态(hdfs dfsadmin -safemode get)"
  exit 2
fi
if [[ "$safemode_out" == *"Safe mode is ON"* ]]; then
  # Any NameNode in safe mode is enough to raise the warning.
  SAFEMODE="ON"
else
  SAFEMODE=$(awk '/Safe mode is/ {print $4; exit}' <<<"$safemode_out")
fi
if [[ "$SAFEMODE" != "OFF" ]]; then
  echo "WARNING: HDFS处于安全模式: $SAFEMODE"
  exit 1
fi

# 3) Cluster report.
# The exit status is deliberately not checked; the parsed fields are
# validated below instead, so a report that still prints its summary is used.
REPORT=$(hdfs dfsadmin -report 2>/dev/null)

# The live-node line has the form "Live datanodes (N):", so the count sits
# inside the parentheses, not after the colon.
LIVE_DN=""
live_re='Live datanodes \(([0-9]+)\)'
if [[ "$REPORT" =~ $live_re ]]; then
  LIVE_DN=${BASH_REMATCH[1]}
fi
CAP_TOTAL=$(report_value "Configured Capacity")
CAP_USED=$(report_value "DFS Used")
CAP_REMAINING=$(report_value "DFS Remaining")
UNDER_REPLICATED=$(report_value "Under replicated blocks")
MISSING_BLOCKS=$(report_value "Missing blocks")

# Every value used in a threshold must be a plain non-negative integer;
# otherwise the report was empty or has an unexpected layout.
for value in "$CAP_REMAINING" "$UNDER_REPLICATED" "$MISSING_BLOCKS"; do
  if [[ ! "$value" =~ ^[0-9]+$ ]]; then
    echo "CRITICAL: 无法解析HDFS集群报告(hdfs dfsadmin -report)"
    exit 2
  fi
done

# Thresholds: alert when less than 10 GiB remains or any block is unhealthy.
# Byte counts fit in bash's 64-bit integers, so no external calculator is
# needed; the 10# prefix forces base 10 even if a value has leading zeros.
if (( 10#$CAP_REMAINING < MIN_REMAINING_BYTES )); then
  echo "CRITICAL: 可用容量不足10GB"
  exit 2
fi
if (( 10#$UNDER_REPLICATED != 0 || 10#$MISSING_BLOCKS != 0 )); then
  echo "CRITICAL: 存在不健康块(UnderReplicated=$UNDER_REPLICATED Missing=$MISSING_BLOCKS)"
  exit 2
fi

# Report OK only after every check has passed, so a failing run never starts
# with an "OK:" line.
echo "OK: LiveDataNodes=$LIVE_DN CapacityTotal=${CAP_TOTAL} Used=${CAP_USED} Remaining=${CAP_REMAINING} UnderReplicated=${UNDER_REPLICATED} Missing=${MISSING_BLOCKS}"
exit 0
五 运维要点与常见问题