alertmanager-0.24.0.linux-amd64.tar.gz
blackbox_exporter-0.22.0.linux-amd64.tar.gz
node_exporter-1.4.0.linux-amd64.tar.gz
prometheus-2.40.0-rc.0.linux-amd64.tar.gz
cat /data/prometheus/prometheus.yml #根据自己情况更改
# Prometheus server configuration (/data/prometheus/prometheus.yml).
# Values written as placeholders must be replaced for your environment.
global:
  scrape_interval: 15s  # Scrape targets every 15 seconds (default: 1 minute).
  evaluation_interval: 15s  # Evaluate rules every 15 seconds (default: 1 minute).
  # scrape_timeout is left at the global default (10s).

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Placeholder: replace with the Alertmanager address, port 9093.
            - "这里填写alertmanagers的ip:9093"
            # - alertmanager:9093

rule_files:
  # Alerting rule files (node.rules and blackbox_http.rules live in rules/).
  - "rules/*.rules"
  # - "second_rules.yml"

scrape_configs:
  # The job name is added as a label `job=<job_name>` to every time series
  # scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]

  # HTTP API probe through blackbox_exporter.
  # Placeholder job name: the `job` matcher in blackbox_http.rules must use
  # the same name.
  - job_name: "定义一个名称"
    metrics_path: /probe
    params:
      # Placeholder: a module name defined in blackbox.yml (e.g. http_api).
      module: ["blackbox_exporter里面的module名称"]
    static_configs:
      - targets:
          # Placeholder: URL of the API endpoint to probe.
          - "http://api的ip/api"
    relabel_configs:
      # Pass the configured target URL to the exporter as ?target=<url>.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed URL as the `instance` label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape to blackbox_exporter instead of the target.
      # Placeholder: replace with the blackbox_exporter address, port 9115.
      - target_label: __address__
        replacement: "这里填写blackbox_exporter的ip:9115"

  - job_name: "node"
    static_configs:
      - targets: ["localhost:9100"]
cat /data/prometheus/rules/node.rules #这是一个node的rules配置,基本可以直接使用(内存规则中写死的实例地址 10.152.120.25:9100 需按实际情况修改)
# Node exporter alerting rules (/data/prometheus/rules/node.rules).
# The two memory rules special-case the instance 10.152.120.25:9100; adjust
# or remove that address for your environment.
groups:
  - name: 主机状态-监控告警
    rules:
      # A scrape target has been unreachable for one minute.
      - alert: 主机状态
        expr: up == 0
        for: 1m
        labels:
          status: 非常严重
          severity: warning
        annotations:
          # summary: "服务器宕机"
          # The text now matches the rule (target down for 1m); it previously
          # described a 5-minute latency.
          description: "服务器宕机超过1分钟"

      # Average non-idle CPU percentage per instance above 80%.
      - alert: CPU使用情况
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 80
        for: 1m
        labels:
          status: 一般告警
          severity: warning
        annotations:
          # summary: "CPU使用率过高!"
          description: "CPU使用大于80%(目前使用:{{$value}}%)"

      # Memory usage above 80% on every instance except 10.152.120.25:9100,
      # which has its own threshold in the next rule.
      - alert: 内存使用
        expr: >-
          round(100- node_memory_MemAvailable_bytes{instance!="10.152.120.25:9100"}
          /node_memory_MemTotal_bytes{instance!="10.152.120.25:9100"}*100) > 80
        for: 1m
        labels:
          status: 一般告警
          severity: warning
        annotations:
          # summary: "内存使用率过高"
          description: "内存使用率{{ $value }}%"

      # Memory usage above 95% on 10.152.120.25:9100 only. An exact-match
      # selector is used: the former regex matcher had unescaped dots.
      - alert: 25-内存使用
        expr: >-
          round(100- node_memory_MemAvailable_bytes{instance="10.152.120.25:9100"}
          /node_memory_MemTotal_bytes{instance="10.152.120.25:9100"}*100) > 95
        for: 1m
        labels:
          status: 一般告警
          severity: warning
        annotations:
          # summary: "内存使用率过高"
          description: "内存使用率{{ $value }}%"

      # Average disk I/O busy percentage per instance above 60%.
      # The previous form `100 - busy% < 60` fired at busy% > 40, which
      # contradicted the 60% stated in the message.
      - alert: IO性能
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance) * 100 > 60
        for: 1m
        labels:
          status: 严重告警
          severity: warning
        annotations:
          # summary: "{{$labels.mountpoint}} 流入磁盘IO使用率过高!"
          # Only `instance` survives the aggregation, so it is used here.
          description: "{{ $labels.instance }} 流入磁盘IO大于60%(目前使用:{{ $value }}%)"

      # Inbound traffic per instance on physical interfaces.
      # Label regexes are fully anchored, so `virbr.*` is needed to exclude
      # virbr0 and friends (the old `virbr*` did not), and `lo` matches the
      # loopback device.
      # NOTE(review): the threshold works out to 10,240,000 bytes/s and `for`
      # is 1m, while the message says 100M for 2 minutes - confirm the
      # intended values.
      - alert: 网络
        expr: >-
          ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m]))
          by (instance)) / 100) > 102400
        for: 1m
        labels:
          status: 严重告警
          severity: warning
        annotations:
          # summary: "{{$labels.mountpoint}} 流入网络带宽过高!"
          # Only `instance` survives the aggregation, so it is used here.
          description: "{{ $labels.instance }}流入网络带宽持续2分钟高于100M. RX带宽使用率{{$value}}"

      # More than 1000 established TCP connections.
      - alert: TCP会话
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 1m
        labels:
          severity: warning
          status: 严重告警
        annotations:
          # summary: "{{$labels.mountpoint}} TCP_ESTABLISHED过高!"
          # The value is a connection count, not a percentage, and the alert
          # is reported per `instance`.
          description: "{{ $labels.instance }} TCP_ESTABLISHED大于1000(目前使用:{{ $value }})"

      # ext4/xfs filesystem usage above 90%.
      - alert: 磁盘容量
        expr: >-
          100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}
          /node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 90
        for: 1m
        labels:
          status: 严重告警
          severity: warning
        annotations:
          # summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
          description: "{{$labels.mountpoint }} 磁盘分区使用大于90%(目前使用:{{$value}}%)"
cat /data/prometheus/rules/blackbox_http.rules #这是自定义的api监控rules,需要根据自己情况更改
# Alerting rules for the blackbox HTTP API probe
# (/data/prometheus/rules/blackbox_http.rules). Adapt to your environment.
groups:
  - name: 接口状态  # Group name; must be unique within this file.
    rules:
      - alert: http-api  # Alert name; should be unique within the group.
        # Fires when the probe fails. Placeholder: the `job` value must equal
        # the job_name of the blackbox scrape job in prometheus.yml.
        expr: probe_success{job="这里要对应job里面的名称"} == 0
        # How long the expression must keep returning results before the
        # alert fires (i.e. how long probe_success must stay 0).
        for: 1s
        labels:  # Extra labels attached to the alert.
          status: 非常严重
          severity: warning
        annotations:
          description: "Job {{ $labels.job }} 中的接口 {{ $labels.instance }} 已经down掉."
          summary: '接口 {{ $labels.instance }} down ! ! !'
cat /data/alertmanager/alertmanager.yml #定义告警配置
# Alertmanager configuration (/data/alertmanager/alertmanager.yml).
global:
  # Time after which an alert is declared resolved if it has not been
  # updated (this is not a polling period).
  resolve_timeout: 5m

# Notification template files. This is a top-level key, not part of `global`.
templates:
  - '/data/alertmanager/wechat.tmpl'  # WeChat Work notification template

route:
  group_by: ['alertname']
  group_wait: 5s  # Delay before the first notification for a new group.
  group_interval: 1m  # Wait before notifying about new alerts in a group.
  repeat_interval: 5m  # Wait before re-sending a still-firing notification.
  receiver: 'wechat'

receivers:
  - name: 'wechat'
    wechat_configs:
      # NOTE(review): the credentials below are stored in plain text; keep
      # this file out of version control or load them from a secret store.
      - corp_id: 'ww8f28'  # WeChat Work corporation ID.
        to_party: '42'  # ID of the department that receives the alerts.
        # to_user: 'zhai'  # Unique ID of a single recipient, if preferred.
        agent_id: '100'  # AgentId of the application created in WeChat Work.
        api_secret: 'wiZIFkuo'  # Secret of that WeChat Work application.
        send_resolved: true

# Mute `warning` alerts while a `critical` alert with the same alertname,
# dev and instance labels is firing.
# NOTE(review): source_match/target_match are deprecated in recent
# Alertmanager releases in favour of source_matchers/target_matchers.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
cat /data/blackbox_exporter/blackbox.yml
# blackbox_exporter configuration (/data/blackbox_exporter/blackbox.yml).
modules:
  http_2xx:
    prober: http
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
        # The `+` is escaped so the pattern matches the literal "+OK"
        # greeting; unescaped it is a quantifier applied to `^`.
        - expect: '^\+OK'
      tls: true
      tls_config:
        insecure_skip_verify: false
  grpc:
    prober: grpc
    grpc:
      tls: true
      preferred_ip_protocol: "ip4"
  grpc_plain:
    prober: grpc
    grpc:
      tls: false
      service: "service1"
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^SSH-2.0-"
        - send: "SSH-2.0-blackbox-ssh-check"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
        - send: "NICK prober"
        - send: "USER prober prober prober :prober"
        - expect: "PING :([^ ]+)"
          send: "PONG ${1}"
        - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
  icmp_ttl5:
    prober: icmp
    timeout: 5s
    icmp:
      ttl: 5

  # Everything above is the stock example configuration (apart from the
  # escaped POP3 pattern) and normally needs no changes.

  # Custom module for the API probe. The module name must be the one passed
  # as `params.module` by the blackbox scrape job in prometheus.yml.
  http_api:
    prober: http
    timeout: 18s
    http:
      method: GET
      headers:
        # NOTE(review): the token is stored in plain text; keep this file
        # out of version control.
        token: "3579333KX4abK04i5"
        Content-Type: application/json
# Start commands. Config files are given as absolute paths so that each
# command works regardless of the current working directory.

# Prometheus (debug logging enabled to make troubleshooting easier)
/data/prometheus/prometheus --config.file=/data/prometheus/prometheus.yml --log.level=debug &

# Alertmanager
/data/alertmanager/alertmanager --config.file=/data/alertmanager/alertmanager.yml --log.level=debug &

# blackbox_exporter
/data/blackbox_exporter/blackbox_exporter --config.file=/data/blackbox_exporter/blackbox.yml &

# node_exporter
/data/node_exporter/node_exporter &




