Prometheus 监控k8s告警rules
cpu-usage.rules
# Pages when an instance's CPU usage stays above 75 for 2 minutes.
# Usage is derived as 100 minus the per-instance average of the idle-mode
# irate (5m range) scaled to a percentage.
ALERT NodeCPUUsage
  IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter", mode="idle"}[5m])) * 100)) > 75
  FOR 2m
  LABELS { severity = "page" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: High CPU usage detected",
    DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"
  }
instance-availability.rules
# Pages when a target's `up` series has been 0 for 1 minute.
ALERT InstanceDown
  IF up == 0
  FOR 1m
  LABELS {
    severity = "page"
  }
  ANNOTATIONS {
    summary = "Instance {{ $labels.instance }} down",
    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.",
  }
low-disk-space.rules
# Pages when the filesystem mounted at /root-disk has been more than 75% used
# for 2 minutes. Used space is computed as size minus free, relative to size.
ALERT NodeLowRootDisk
  IF ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"}) / node_filesystem_size{mountpoint="/root-disk"} * 100) > 75
  FOR 2m
  LABELS { severity = "page" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: Low root disk space",
    DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})"
  }
# Pages when the filesystem mounted at /data-disk has been more than 75% used
# for 2 minutes. Used space is computed as size minus free, relative to size.
ALERT NodeLowDataDisk
  IF ((node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"}) / node_filesystem_size{mountpoint="/data-disk"} * 100) > 75
  FOR 2m
  LABELS { severity = "page" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: Low data disk space",
    DESCRIPTION = "{{$labels.instance}}: Data disk usage is above 75% (current value is: {{ $value }})"
  }
mem-usage.rules
# Pages when more than 75% of swap has been in use for 2 minutes.
# Used swap is SwapTotal minus SwapFree, expressed as a percentage of SwapTotal.
ALERT NodeSwapUsage
  IF (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) * 100) > 75
  FOR 2m
  LABELS {
    severity = "page"
  }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: Swap usage detected",
    # Fixed the duplicated word ("Swap usage usage") in the notification text.
    DESCRIPTION = "{{$labels.instance}}: Swap usage is above 75% (current value is: {{ $value }})"
  }
# Pages when memory usage has been above 75% for 2 minutes.
# "Used" is MemTotal minus MemFree, Buffers and Cached: buffer memory is
# excluded alongside the page cache so it is not counted as used, matching
# the formula of the YAML memory rule elsewhere in these notes.
ALERT NodeMemoryUsage
  IF (((node_memory_MemTotal - node_memory_MemFree - node_memory_Buffers - node_memory_Cached) / (node_memory_MemTotal) * 100)) > 75
  FOR 2m
  LABELS {
    severity = "page"
  }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: High memory usage detected",
    DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})"
  }
redis状态:
redis_cluster_state{addr="10.19.100.8:7000"}
redis连接数:
redis_connected_clients{addr="10.19.100.8:7000"}
redis命中率:
redis_keyspace_hits_total{addr="10.19.100.8:7000"}/(redis_keyspace_misses_total{addr="10.19.100.8:7000"}+redis_keyspace_hits_total{addr="10.19.100.8:7000"})
redis内存使用率:
redis_memory_used_bytes{addr="10.19.100.8:7000"}/redis_memory_max_bytes{addr="10.19.100.8:7000"}
mysql连接数:
mysql_global_status_connections{vip="10.19.124.36:3306"}
mysql状态:
MySQL_Up{vip="10.19.124.36:3306"}
mysql ops:
delta(mysql_global_status_innodb_row_ops_total{vip="10.19.185.107:3306"}[5m])
mysql查询缓存(query cache)命中率:
(mysql_global_status_qcache_hits-mysql_global_status_qcache_inserts )/mysql_global_status_qcache_hits * 100
Node内存使用率:
(1-node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)*100
根分区剩余空间:
node_filesystem_avail_bytes{device="rootfs"}
Node CPU数量:
machine_cpu_cores
容器CPU使用率:
(sum(irate(container_cpu_usage_seconds_total{container_name!="",pod_name!="",namespace="test-godeyes"}[1m])) by(cluster,namespace,container_name,pod_name))/(sum(container_spec_cpu_quota{namespace="test-godeyes",container_name!="",pod_name!=""}) by(cluster,namespace,container_name,pod_name) /100000)*100
容器内存使用率:
container_memory_rss{namespace="test-godeyes",container_name!="",pod_name!=""}/(container_spec_memory_limit_bytes{namespace="test-godeyes", container_name!="",pod_name!=""}) <=1
容器inode使用总数:
container_fs_inodes_total{namespace="test-godeyes",container_name!="",pod_name!=""}
NodeCPU使用率:
sum(irate(node_cpu_seconds_total{nodename="10.10.173.203",mode!="idle"}[1m]))*100
MySQLD Exporter:
查询吞吐量(速率)
sum(rate(mysql_global_status_commands_total{command=~"insert|update|delete"}[2m])) without (command)
连接情况
当前剩余的可用连接数
mysql_global_variables_max_connections - mysql_global_status_threads_connected
当前MySQL实例连接拒绝数
mysql_global_status_aborted_connects
缓冲池使用情况
MySQL实例的缓冲池利用率
(sum(mysql_global_status_buffer_pool_pages) by (instance) - sum(mysql_global_status_buffer_pool_pages{state="free"}) by (instance)) / sum(mysql_global_status_buffer_pool_pages) by (instance)
2分钟内磁盘读取请求次数的增长率的变化情况
rate(mysql_global_status_innodb_buffer_pool_reads[2m])
查询执行性能
Slow_queries的增长情况
rate(mysql_global_status_slow_queries[2m])
cAdvisor:
计算容器cpu的使用率:
sum(irate(container_cpu_usage_seconds_total{image!=""}[1m])) without (cpu)
容器内存使用量(单位:字节)
container_memory_usage_bytes{image!=""}
查询容器网络接收量速率(单位:字节/秒)
sum(rate(container_network_receive_bytes_total{image!=""}[1m])) without (interface)
查询容器网络传输量速率(单位:字节/秒):
sum(rate(container_network_transmit_bytes_total{image!=""}[1m])) without (interface)
查询容器文件系统读取速率(单位:字节/秒):
sum(rate(container_fs_reads_bytes_total{image!=""}[1m])) without (device)
查询容器文件系统写入速率(单位:字节/秒):
sum(rate(container_fs_writes_bytes_total{image!=""}[1m])) without (device)
# Node-level alerting rules in the Prometheus 2.x YAML rule format.
# NOTE(review): in a standalone rule file this list is expected to sit under a
# top-level `groups:` key — confirm against the file that embeds it.
- name: example
  rules:
    # node-exporter target has reported up == 0 for 1 minute.
    - alert: 实例丢失
      expr: up{job="node-exporter"} == 0
      for: 1m
      labels:
        severity: page
      annotations:
        summary: "服务器实例 {{ $labels.instance }} 丢失"
        # Fixed the homophone typo "已上" -> "以上" ("for more than").
        description: "{{ $labels.instance }} 上的任务 {{ $labels.job }} 已经停止了 1 分钟以上了"

    # Used space (100 minus the available percentage) above 95 on
    # ext2/ext3/ext4/xfs filesystems for 30 seconds.
    - alert: "磁盘容量小于 5%"
      expr: 100 - ((node_filesystem_avail_bytes{job="node-exporter",mountpoint=~".*",fstype=~"ext4|xfs|ext2|ext3"} * 100) / node_filesystem_size_bytes{job="node-exporter",mountpoint=~".*",fstype=~"ext4|xfs|ext2|ext3"}) > 95
      for: 30s
      annotations:
        summary: "服务器实例 {{ $labels.instance }} 磁盘不足 告警通知"
        description: "磁盘 {{ $labels.device }} 资源 {{ $value }} 已不足 5%, 请尽快排查,自动通知,请勿回复!"

    # Memory in use (total minus free, buffers and cached) above 80% of total
    # for 30 seconds.
    - alert: "内存容量小于 20%"
      expr: ((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / (node_memory_MemTotal_bytes)) * 100 > 80
      for: 30s
      labels:
        severity: warning
      annotations:
        summary: "服务器实例 {{ $labels.instance }} 内存不足 告警通知"
        description: "内存资源 {{ $value }} 已不足 20%, 请尽快排查,自动通知,请勿回复!"

    # node_load5 above 4 for 30 seconds.
    - alert: "CPU 平均负载大于 4 个"
      expr: node_load5 > 4
      for: 30s
      annotations:
        # Key was misspelled `sumary`; corrected so it matches the `summary`
        # annotation used by the other rules in this group.
        summary: "服务器实例 {{ $labels.instance }} CPU 负载 告警通知"
        description: "CPU 平均负载(5 分钟) {{ $value }} 已超过 4 个, 请尽快排查,自动通知,请勿回复!"

    # Read throughput on device sda above 30,000,000 bytes/s for 30 seconds.
    - alert: "磁盘读 I/O 超过 30MB/s"
      expr: irate(node_disk_read_bytes_total{device="sda"}[1m]) > 30000000
      for: 30s
      annotations:
        # Key was misspelled `sumary`.
        summary: "服务器实例 {{ $labels.instance }} I/O 读负载 告警通知"
        description: "I/O 每分钟读 {{ $value }} 已超过 30MB/s, 请尽快排查,自动通知,请勿回复!"

    # Write throughput on device sda above 30,000,000 bytes/s for 30 seconds.
    - alert: "磁盘写 I/O 超过 30MB/s"
      expr: irate(node_disk_written_bytes_total{device="sda"}[1m]) > 30000000
      for: 30s
      annotations:
        # Key was misspelled `sumary`.
        summary: "服务器实例 {{ $labels.instance }} I/O 写负载 告警通知"
        description: "I/O 每分钟写 {{ $value }} 已超过 30MB/s, 请尽快排查,自动通知,请勿回复!"

    # Transmit rate on any non-loopback interface above 10,000,000 bytes/s
    # (10MB/s) for 30 seconds. The previous expression divided by 1000 and
    # compared against 1000000, i.e. it only fired above 1 GB/s; the threshold
    # is now written in bytes/s like the disk I/O rules above.
    - alert: "网卡流出速率大于 10MB/s"
      expr: irate(node_network_transmit_bytes_total{device!~"lo"}[1m]) > 10000000
      for: 30s
      annotations:
        # Key was misspelled `sumary`.
        summary: "服务器实例 {{ $labels.instance }} 网卡流量负载 告警通知"
        description: "网卡 {{ $labels.device }} 流量 {{ $value }} 已经超过 10MB/s, 请尽快排查,自动通知,请勿回复!"

    # CPU usage (100 minus the averaged idle percentage per instance/job/env)
    # above 90 for 30 seconds.
    # NOTE(review): irate needs at least two samples inside the 30s range —
    # confirm the scrape interval is short enough for this window.
    - alert: "CPU 使用率大于 90%"
      expr: 100 - ((avg by (instance,job,env)(irate(node_cpu_seconds_total{mode="idle"}[30s]))) *100) > 90
      for: 30s
      annotations:
        # Key was misspelled `sumary`.
        summary: "服务器实例 {{ $labels.instance }} CPU 使用率 告警通知"
        description: "CPU 使用率 {{ $value }} 已超过 90%, 请尽快排查,自动通知,请勿回复!"
PVC卷监控
1.pvc可用空间小于3%
# Warns when a persistent volume's available bytes have been below 3% of its
# capacity for 2 minutes. The rule key is `alert:` (the Prometheus rule schema
# has no `name:` field on a rule), and the lost nesting is restored.
- alert: PVCCriticalCapacity
  expr: 100 * kubelet_volume_stats_available_bytes{job="kubernetes-nodes"} / kubelet_volume_stats_capacity_bytes{job="kubernetes-nodes"} < 3
  for: 2m
  labels:
    severity: warning
  annotations:
    command1: kubectl get pvc -n <namespace>
    command2: Check the Kubernetes Cluster PVC Metrics grafana dashboard.
    datacenter: eu-west-1
    environment: production
    summary: A persistent volume only has three percent capacity left.
# Warns when a persistent volume is more than 85% used and a linear
# extrapolation of its available bytes over the last 6h drops below zero within
# four days (4 * 24 * 3600 seconds), sustained for 2 minutes.
# Reconstructed from a single collapsed line; the rule key is `alert:` (the
# Prometheus rule schema has no `name:` field on a rule).
- alert: PVCFourDay
  expr: (kubelet_volume_stats_used_bytes{job="kubernetes-nodes"} / kubelet_volume_stats_capacity_bytes{job="kubernetes-nodes"}) > 0.85 and predict_linear(kubelet_volume_stats_available_bytes{job="kubernetes-nodes"}[6h], 4 * 24 * 3600) < 0
  for: 2m
  labels:
    severity: warning
  annotations:
    command1: kubectl get pvc -n <namespace>
    command2: Check the Kubernetes Cluster PVC Metrics grafana dashboard.
    datacenter: eu-west-1
    environment: production
    summary: A persistent volume is estimated to fill up in four days.
2.使用率百分比
kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100