Prometheus 监控k8s告警rules

作者: root007 分类: kubernetes 发布时间: 2019-03-06 13:47

cpu-usage.rules

# NodeCPUUsage (Prometheus 1.x rule syntax): pages when 100 minus the
# per-instance average idle-CPU irate (scaled to a percentage, 5m range)
# stays above 75 for 2 minutes.
ALERT NodeCPUUsage
  IF 100 - avg by (instance) (irate(node_cpu{name="node-exporter", mode="idle"}[5m])) * 100 > 75
  FOR 2m
  LABELS { severity = "page" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: High CPU usage detected",
    DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})",
  }

instance-availability.rules

# InstanceDown (Prometheus 1.x rule syntax): pages when a target's `up`
# series has been equal to 0 for 1 minute.
ALERT InstanceDown
  IF up == 0
  FOR 1m
  LABELS {
    severity = "page"
  }
  ANNOTATIONS {
    summary = "Instance {{ $labels.instance }} down",
    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
  }

low-disk-space.rules

# NodeLowRootDisk (Prometheus 1.x rule syntax): pages when the used share of
# the filesystem mounted at /root-disk, (size - free) / size as a percentage,
# stays above 75 for 2 minutes.
ALERT NodeLowRootDisk
  IF (node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"}) / node_filesystem_size{mountpoint="/root-disk"} * 100 > 75
  FOR 2m
  LABELS { severity = "page" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: Low root disk space",
    DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})",
  }

# NodeLowDataDisk (Prometheus 1.x rule syntax): pages when the used share of
# the filesystem mounted at /data-disk, (size - free) / size as a percentage,
# stays above 75 for 2 minutes.
ALERT NodeLowDataDisk
  IF (node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"}) / node_filesystem_size{mountpoint="/data-disk"} * 100 > 75
  FOR 2m
  LABELS { severity = "page" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: Low data disk space",
    DESCRIPTION = "{{$labels.instance}}: Data disk usage is above 75% (current value is: {{ $value }})",
  }

mem-usage.rules

# NodeSwapUsage (Prometheus 1.x rule syntax): pages when used swap,
# (SwapTotal - SwapFree) / SwapTotal as a percentage, stays above 75 for
# 2 minutes.
ALERT NodeSwapUsage
  IF (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) * 100) > 75
  FOR 2m
  LABELS {
    severity="page"
  }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: Swap usage detected",
    DESCRIPTION = "{{$labels.instance}}: Swap usage is above 75% (current value is: {{ $value }})"
  }

# NodeMemoryUsage (Prometheus 1.x rule syntax): pages when
# (MemTotal - MemFree - Cached) / MemTotal, as a percentage, stays above 75
# for 2 minutes.
# NOTE(review): buffers are not subtracted here, whereas the newer
# "内存容量小于 20%" rule in this file also subtracts Buffers — confirm which
# definition of "used" is intended.
ALERT NodeMemoryUsage
  IF (node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / node_memory_MemTotal * 100 > 75
  FOR 2m
  LABELS { severity = "page" }
  ANNOTATIONS {
    SUMMARY = "{{$labels.instance}}: High memory usage detected",
    DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})",
  }
redis状态:
 redis_cluster_state{addr="10.19.100.8:7000"}
 redis连接数:
 redis_connected_clients{addr="10.19.100.8:7000"}
 redis命中率:
 redis_keyspace_hits_total{addr="10.19.100.8:7000"}/(redis_keyspace_misses_total{addr="10.19.100.8:7000"}+redis_keyspace_hits_total{addr="10.19.100.8:7000"})
 redis内存使用率:
 redis_memory_used_bytes{addr="10.19.100.8:7000"}/redis_memory_max_bytes{addr="10.19.100.8:7000"}
 mysql连接数:
 mysql_global_status_connections{vip="10.19.124.36:3306"}
 mysql状态:
 MySQL_Up{vip="10.19.124.36:3306"}
 mysql ops:
 delta(mysql_global_status_innodb_row_ops_total{vip="10.19.185.107:3306"}[5m])
 mysql hit命中率:
 (mysql_global_status_qcache_hits-mysql_global_status_qcache_inserts )/mysql_global_status_qcache_hits * 100
Node内存使用率:
 (1-node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)*100
 根分区剩余空间:
 node_filesystem_avail_bytes{device="rootfs"}
 Node CPU数量:
 machine_cpu_cores
 容器CPU使用率:
 (sum(irate(container_cpu_usage_seconds_total{container_name!="",pod_name!="",namespace="test-godeyes"}[1m])) by(cluster,namespace,container_name,pod_name))/(sum(container_spec_cpu_quota{namespace="test-godeyes",container_name!="",pod_name!=""}) by(cluster,namespace,container_name,pod_name) /100000)*100
 容器内存使用率:
 container_memory_rss{namespace="test-godeyes",container_name!="",pod_name!=""}/(container_spec_memory_limit_bytes{namespace="test-godeyes", container_name!="",pod_name!=""}) <=1
 容器inode使用总数:
 container_fs_inodes_total{namespace="test-godeyes",container_name!="",pod_name!=""}
NodeCPU使用率:
 sum(irate(node_cpu_seconds_total{nodename="10.10.173.203",mode!="idle"}[1m]))*100
MySQLD Exporter:
 查询吞吐量(速率)
 sum(rate(mysql_global_status_commands_total{command=~"insert|update|delete"}[2m])) without (command)
 连接情况
 当前剩余的可用连接数
 mysql_global_variables_max_connections - mysql_global_status_threads_connected
 当前MySQL实例连接拒绝数
 mysql_global_status_aborted_connects
 缓冲池使用情况
 MySQL实例的缓冲池利用率
 (sum(mysql_global_status_buffer_pool_pages) by (instance) - sum(mysql_global_status_buffer_pool_pages{state="free"}) by (instance)) / sum(mysql_global_status_buffer_pool_pages) by (instance)
 2分钟内磁盘读取请求次数的增长率的变化情况
 rate(mysql_global_status_innodb_buffer_pool_reads[2m])
 查询执行性能
 Slow_queries的增长情况
 rate(mysql_global_status_slow_queries[2m])
cAdvisor:
 计算容器cpu的使用率:
 sum(irate(container_cpu_usage_seconds_total{image!=""}[1m])) without (cpu)
 容器内存使用量(单位:字节)
 container_memory_usage_bytes{image!=""}
 查询容器网络接收量速率(单位:字节/秒)
 sum(rate(container_network_receive_bytes_total{image!=""}[1m])) without (interface)
 查询容器网络传输量速率(单位:字节/秒):
 sum(rate(container_network_transmit_bytes_total{image!=""}[1m])) without (interface)
 查询容器文件系统读取速率(单位:字节/秒):
 sum(rate(container_fs_reads_bytes_total{image!=""}[1m])) without (device)
 查询容器文件系统写入速率(单位:字节/秒):
 sum(rate(container_fs_writes_bytes_total{image!=""}[1m])) without (device)
# Host-level alerting rules built on node-exporter metrics, written in the
# Prometheus 2.x rule-group syntax. In a rule file this list item belongs
# under the top-level `groups:` key.
- name: example
  rules:

  # A target of the node-exporter job has reported up == 0 for 1 minute.
  - alert: "实例丢失"
    expr: up{job="node-exporter"} == 0
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "服务器实例 {{ $labels.instance }} 丢失"
      description: "{{ $labels.instance }} 上的任务 {{ $labels.job }} 已经停止了 1 分钟以上了"

  # Used percentage (100 - available * 100 / size) of ext2/3/4 and xfs
  # filesystems is above 95, i.e. less than 5% is still available.
  - alert: "磁盘容量小于 5%"
    expr: 100 - ((node_filesystem_avail_bytes{job="node-exporter",mountpoint=~".*",fstype=~"ext4|xfs|ext2|ext3"} * 100) / node_filesystem_size_bytes{job="node-exporter",mountpoint=~".*",fstype=~"ext4|xfs|ext2|ext3"}) > 95
    for: 30s
    annotations:
      summary: "服务器实例 {{ $labels.instance }} 磁盘不足 告警通知"
      description: "磁盘 {{ $labels.device }} 资源 {{ $value }} 已不足 5%, 请尽快排查,自动通知,请勿回复!"

  # (MemTotal - MemFree - Buffers - Cached) / MemTotal is above 80%.
  - alert: "内存容量小于 20%"
    expr: ((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / (node_memory_MemTotal_bytes )) * 100 > 80
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "服务器实例 {{ $labels.instance }} 内存不足 告警通知"
      description: "内存资源 {{ $value }} 已不足 20%, 请尽快排查,自动通知,请勿回复!"

  # 5-minute load average is above the fixed threshold of 4.
  - alert: "CPU 平均负载大于 4 个"
    expr: node_load5 > 4
    for: 30s
    annotations:
      # Key was misspelled `sumary`, so templates reading `summary` saw nothing.
      summary: "服务器实例 {{ $labels.instance }} CPU 负载 告警通知"
      description: "CPU 平均负载(5 分钟) {{ $value }} 已超过 4 个, 请尽快排查,自动通知,请勿回复!"

  # Read throughput of device sda is above 30,000,000 bytes per second.
  - alert: "磁盘读 I/O 超过 30MB/s"
    expr: irate(node_disk_read_bytes_total{device="sda"}[1m]) > 30000000
    for: 30s
    annotations:
      summary: "服务器实例 {{ $labels.instance }} I/O 读负载 告警通知"
      description: "I/O 每分钟读 {{ $value }} 已超过 30MB/s, 请尽快排查,自动通知,请勿回复!"

  # Write throughput of device sda is above 30,000,000 bytes per second.
  - alert: "磁盘写 I/O 超过 30MB/s"
    expr: irate(node_disk_written_bytes_total{device="sda"}[1m]) > 30000000
    for: 30s
    annotations:
      summary: "服务器实例 {{ $labels.instance }} I/O 写负载 告警通知"
      description: "I/O 每分钟写 {{ $value }} 已超过 30MB/s, 请尽快排查,自动通知,请勿回复!"

  # Transmit rate of any non-loopback interface is above 10,000,000 bytes per
  # second. The previous form divided by 1000 and compared with 1000000, which
  # only fired above roughly 1 GB/s; the threshold now uses the same
  # bytes-per-second convention as the disk I/O rules.
  - alert: "网卡流出速率大于 10MB/s"
    expr: irate(node_network_transmit_bytes_total{device!~"lo"}[1m]) > 10000000
    for: 30s
    annotations:
      summary: "服务器实例 {{ $labels.instance }} 网卡流量负载 告警通知"
      description: "网卡 {{ $labels.device }} 流量 {{ $value }} 已经超过 10MB/s, 请尽快排查,自动通知,请勿回复!"

  # 100 minus the average idle-CPU irate (as a percentage) is above 90.
  # NOTE(review): irate needs at least two samples inside the 30s range —
  # confirm the scrape interval is short enough for this to return data.
  - alert: "CPU 使用率大于 90%"
    expr: 100 - ((avg by (instance,job,env)(irate(node_cpu_seconds_total{mode="idle"}[30s]))) *100) > 90
    for: 30s
    annotations:
      summary: "服务器实例 {{ $labels.instance }} CPU 使用率 告警通知"
      description: "CPU 使用率 {{ $value }} 已超过 90%, 请尽快排查,自动通知,请勿回复!"

PVC卷监控

1.pvc可用空间小于3%

# PVCCriticalCapacity: warns when a volume's available bytes have been below
# 3% of its capacity for 2 minutes. Written as an entry of a rule group's
# `rules:` list; the rule name uses the `alert:` key that Prometheus alerting
# rules require, and labels/annotations are nested under their parent keys.
- alert: PVCCriticalCapacity
  expr: 100 * kubelet_volume_stats_available_bytes{job="kubernetes-nodes"} / kubelet_volume_stats_capacity_bytes{job="kubernetes-nodes"} < 3
  for: 2m
  labels:
    severity: warning
  annotations:
    command1: kubectl get pvc -n <namespace>
    command2: Check the Kubernetes Cluster PVC Metrics grafana dashboard.
    datacenter: eu-west-1
    environment: production
    summary: A persistent volume only has three percent capacity left.

# PVCFourDay: warns when a volume is more than 85% used and a linear
# extrapolation of its available bytes over the last 6h drops below zero
# within four days (4 * 24 * 3600 seconds), sustained for 2 minutes.
# Written as an entry of a rule group's `rules:` list.
- alert: PVCFourDay
  # Folded scalar: the lines below are joined with spaces into one expression.
  expr: >-
    (kubelet_volume_stats_used_bytes{job="kubernetes-nodes"}
    / kubelet_volume_stats_capacity_bytes{job="kubernetes-nodes"}) > 0.85
    and
    predict_linear(kubelet_volume_stats_available_bytes{job="kubernetes-nodes"}[6h], 4 * 24 * 3600) < 0
  for: 2m
  labels:
    severity: warning
  annotations:
    command1: kubectl get pvc -n <namespace>
    command2: Check the Kubernetes Cluster PVC Metrics grafana dashboard.
    datacenter: eu-west-1
    environment: production
    summary: A persistent volume is estimated to fill up in four days.

2.使用率百分比

kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100

发表回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注