监控目标与分层
快速落地方案 Prometheus + Grafana
pip install prometheus-client
from prometheus_client import start_http_server, Counter, Histogram, Gauge
import time, random
# Start the metrics HTTP endpoint on port 8000.
start_http_server(8000)

# Request latency histogram, labelled by method and endpoint.
REQUEST_DURATION = Histogram(
    name='app_request_duration_seconds',
    documentation='Request latency',
    labelnames=('method', 'endpoint'),
)

# Request counter, labelled by method, endpoint and status.
REQUEST_COUNT = Counter(
    name='app_requests_total',
    documentation='Total requests',
    labelnames=('method', 'endpoint', 'status'),
)

# Gauge holding the resident set size (RSS) of this process, in bytes.
MEM_RSS_GAUGE = Gauge(
    name='app_process_rss_bytes',
    documentation='Process RSS in bytes',
)
def handle_request(method, endpoint):
    """Simulate handling one request and record its metrics.

    Sleeps for a random 10-200 ms to stand in for business logic, observes
    the elapsed time in ``REQUEST_DURATION``, increments ``REQUEST_COUNT``
    with status ``"200"``, and finally makes a best-effort update of
    ``MEM_RSS_GAUGE`` with the current process RSS.

    Args:
        method: Value for the ``method`` metric label.
        endpoint: Value for the ``endpoint`` metric label.

    Returns:
        None.
    """
    # perf_counter() is monotonic, so the measured latency cannot go
    # negative or jump when the wall clock is adjusted (unlike time.time()).
    start = time.perf_counter()

    # Simulated business logic.
    time.sleep(random.uniform(0.01, 0.2))
    status = "200"

    # Record metrics.
    REQUEST_DURATION.labels(method, endpoint).observe(time.perf_counter() - start)
    REQUEST_COUNT.labels(method, endpoint, status).inc()

    # Update RSS (must be sampled inside the target process itself).
    try:
        import os
        import psutil
    except ImportError:
        # psutil is optional: without it the RSS gauge is simply not updated.
        return

    try:
        MEM_RSS_GAUGE.set(psutil.Process(os.getpid()).memory_info().rss)
    except (psutil.Error, OSError):
        # Memory sampling is best-effort and must never fail the request.
        pass
# Prometheus scrape configuration: poll the metrics endpoint that the
# Python application serves on localhost:8000.
scrape_configs:
  - job_name: 'python_app'
    # static_configs belongs to the job above, so it must be nested under it.
    static_configs:
      - targets: ['localhost:8000']
系统与应用内监控工具
top/htop、vmstat、free、iostat、ss 等系统工具可用于快速定位瓶颈。
应用内性能分析可使用 py-spy:py-spy record -o profile.svg --pid <PID>(采样并生成火焰图)或 py-spy top --pid <PID>(实时查看热点函数)。
日志与业务指标
import logging
# Record layout: timestamp, level, logger name, message.
_LOG_FORMAT = '%(asctime)s %(levelname)s %(name)s %(message)s'

# Route root-logger records at INFO and above to app.log.
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, filename='app.log')

# Example error record carrying a trace id via `extra`.
# NOTE(review): the format string above has no %(trace_id)s placeholder, so
# trace_id is attached to the LogRecord but not written to app.log.
logging.error("业务异常", extra={"trace_id": "abc-123"})
可使用 tail -f app.log | grep "ERROR" 实时筛选错误日志。
告警与可视化最佳实践