You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

107 lines
4.1 KiB

2 years ago
  # Prometheus alerting rules, organised into named rule groups.
  groups:
  - name: example  # rule group name
    rules:
    # Fires once `up == 0` has held continuously for one minute.
    - alert: InstanceDown  # alert name
      expr: up == 0  # PromQL expression that triggers the rule
      for: 1m  # the condition must hold for one minute before firing
      labels:  # extra labels attached to the alert: category and severity
        name: instance
        severity: Critical
      annotations:
        # Alert summary, taken from the `appname` label of the matching series.
        summary: " {{ $labels.appname }}"
        description: " 服务停止运行 "  # alert message
        # Current value of the expression. No "%" suffix: the expression
        # matches `up == 0`, so the value is 0, not a percentage.
        value: "{{ $value }}"
  # Host-level alerts built on node_* metrics. Every rule waits 1m before
  # firing and carries severity Warning plus a static `name` category label.
  - name: Host
    rules:
    # Memory in use, as a percentage of MemTotal after subtracting
    # free + buffers + cached, above 80.
    - alert: HostMemory Usage
      expr: >-
        (node_memory_MemTotal_bytes
        - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes))
        / node_memory_MemTotal_bytes * 100 > 80
      for: 1m
      labels:
        name: Memory
        severity: Warning
      annotations:
        summary: " {{ $labels.appname }} "
        description: "宿主机内存使用率超过80%."
        value: "{{ $value }}"

    # Non-idle CPU time averaged over cores and summed over modes, per
    # (instance, appname). The value is a 0-1 ratio, so 0.65 means 65%.
    - alert: HostCPU Usage
      expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.65
      for: 1m
      labels:
        name: CPU
        severity: Warning
      annotations:
        summary: " {{ $labels.appname }} "
        description: "宿主机CPU使用率超过65%."
        value: "{{ $value }}"

    # 5-minute load average above 4.
    - alert: HostLoad
      expr: node_load5 > 4
      for: 1m
      labels:
        name: Load
        severity: Warning
      annotations:
        summary: "{{ $labels.appname }} "
        description: " 主机负载5分钟超过4."
        value: "{{ $value }}"

    # Used share of each filesystem above 80%. The ratio is scaled by 100 so
    # that the "%" suffix in the `value` annotation is correct; previously a
    # 0-1 ratio was rendered as e.g. "0.85%". The firing condition is the
    # same as `1 - free/size > 0.8`.
    - alert: HostFilesystem Usage
      expr: (1 - node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 > 80
      for: 1m
      labels:
        name: Disk
        severity: Warning
      annotations:
        summary: " {{ $labels.appname }} "
        description: " 宿主机 [ {{ $labels.mountpoint }} ]分区使用超过80%."
        value: "{{ $value }}%"

    # Completed write operations per second above 10, for series whose job
    # label matches "Host".
    # NOTE(review): irate() needs at least two samples inside the 1m window,
    # so this presumably relies on a scrape interval of 30s or less - confirm.
    - alert: HostDiskio
      expr: irate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10
      for: 1m
      labels:
        name: Diskio
        severity: Warning
      annotations:
        summary: " {{ $labels.appname }} "
        description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均写入IO负载较高."
        value: "{{ $value }}iops"

    # Receive throughput above 3 on non-virtual interfaces. Bytes per second
    # divided by 1048576 is MiB/s, not Mbps, so the unit text is corrected.
    # The stray literal "3" that was fused to the rendered value is removed
    # ("{{ $value }}3Mbps" rendered 4.2 as "4.23Mbps").
    - alert: Network_receive
      expr: >-
        irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m])
        / 1048576 > 3
      for: 1m
      labels:
        name: Network_receive
        severity: Warning
      annotations:
        summary: " {{ $labels.appname }} "
        description: " 宿主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过3MiB/s."
        value: "{{ $value }}MiB/s"

    # Transmit counterpart of Network_receive; same unit and suffix fixes.
    - alert: Network_transmit
      expr: >-
        irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m])
        / 1048576 > 3
      for: 1m
      labels:
        name: Network_transmit
        severity: Warning
      annotations:
        summary: " {{ $labels.appname }} "
        description: " 宿主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过3MiB/s."
        value: "{{ $value }}MiB/s"
  # Container-level alerts built on container_* metrics. Every rule waits 1m
  # before firing and carries severity Warning plus a static `name` label.
  - name: Container
    rules:
    # CPU usage per (name, instance), in percent of one core, above 60.
    #
    # The static rule label `name: CPU` below overwrites the container's own
    # `name` label on the resulting alert. With only (name, instance) left
    # after aggregation, two containers on one instance would end up with
    # identical alert label sets and Prometheus would fail the evaluation
    # ("vector contains metrics with the same labelset after applying alert
    # labels"). label_replace copies the container name into a `container`
    # label so each alert stays unique; `name: CPU` is kept so existing
    # routing on that label is unaffected. `$labels.name` in the summary still
    # resolves to the container name, because templates see the query labels.
    - alert: ContainerCPU Usage
      expr: |-
        label_replace(
          sum by (name, instance) (rate(container_cpu_usage_seconds_total{image!=""}[5m])) * 100,
          "container", "$1", "name", "(.*)"
        ) > 60
      for: 1m
      labels:
        name: CPU
        severity: Warning
      annotations:
        summary: "{{ $labels.name }} "
        description: " 容器CPU使用超过60%."
        value: "{{ $value }}%"

    # Memory usage of named containers above 1024 MiB (1 GiB). The expression
    # divides bytes by 1048576, so the value is in MiB; the suffix was "G",
    # which rendered 1500 MiB as "1500G".
    - alert: ContainerMem Usage
      # Earlier variant, relative to the container's memory limit:
      # expr: (container_memory_usage_bytes - container_memory_cache) / container_spec_memory_limit_bytes * 100 > 10
      expr: container_memory_usage_bytes{name=~".+"} / 1048576 > 1024
      for: 1m
      labels:
        name: Memory
        severity: Warning
      annotations:
        summary: "{{ $labels.name }} "
        description: " 容器内存使用超过1GB."
        value: "{{ $value }}MiB"