# Download and install Alertmanager v0.22.2, then register it as a systemd service.
download_url="https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz"
#wget ${download_url}   # uncomment to actually download; kept commented as in the original walkthrough
tar -xvf alertmanager-0.22.2.linux-amd64.tar.gz
mv alertmanager-0.22.2.linux-amd64 /usr/local/
# Versioned directory behind a stable symlink so the unit file survives upgrades.
ln -sv /usr/local/alertmanager-0.22.2.linux-amd64 /usr/local/alertmanager
# FIX: original used `cat </usr/lib/...` (a read redirect, not a write) with no
# heredoc, so the unit file was never written. Quote 'EOF' so `\` and `$` inside
# the unit are taken literally.
cat > /usr/lib/systemd/system/alertmanager.service << 'EOF'
[Unit]
Description=alertmanager
Documentation=https://prometheus.io/
After=network-online.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/alertmanager/alertmanager \
  --config.file=/usr/local/alertmanager/alertmanager.yml \
  --storage.path=/usr/local/alertmanager/data/ \
  --data.retention=120h \
  --web.external-url=http://192.168.1.12:9093 \
  --web.listen-address=:9093
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF
# FIX: the original dropped the line continuation after --web.external-url, so
# --web.listen-address would have been executed as a separate shell command.
systemctl daemon-reload
systemctl start alertmanager
# Web UI: http://192.168.1.12:9093/#/alerts  (original had this as a bare line,
# which the shell would try to execute)
# Install and configure the mailx client on the alertmanager host (transcript).
[root@slave ~]# yum install -y mailx # install the mailx client
[root@slave ~]# mail -V # check the installed version
tail -f /var/spool/mail/root
[root@slave ~]# vim /etc/mail.rc
...
set from=951699@qq.com
set smtp=smtp.qq.com
set smtp-auth-user=951699@qq.com
set smtp-auth-password=dfsdsadtvjuia # SMTP authorization code (not the account password); see https://www.58pxe.com/7980.html for how to obtain it
set smtp-auth=login
set ssl-verify=ignore
echo 'test' | mail -s 'test mail' 951699@qq.com
配置alertmanager服务器
[root@prome ~]# vim +8 /usr/local/prometheus/prometheus.yml
...
# Point Prometheus at the Alertmanager instance.
# FIX: the original snippet had all keys flush-left, which is invalid YAML —
# alertmanagers/static_configs/targets must be nested as below.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '192.168.1.12:9093'
# Rule files to load (glob pattern).
rule_files:
  - "/etc/prometheus/rules/*.yml"
[root@prome ~]# mkdir -p /etc/prometheus/rules
[root@prome ~]# cat /etc/prometheus/rules/rules.yml
# Alerting rule: fire when a target's `up` metric is below 2.
# FIX: restored the YAML nesting (original was flush-left and invalid).
groups:
  - name: host down
    rules:
      - alert: node-down
        # `up` is only ever 0 or 1, so `< 2` always matches — deliberately broad
        # here so the alert fires quickly for testing (per the original note).
        expr: up{} < 2
        for: 15s
        labels:
          status: High
          team: ezdevops
        annotations:
          description: "Team: {{ $labels.team}} Instance: {{ $labels.instance }} is Down ! ! !"
          value: '{{ $value }}'
          # FIX: message said "15 minutes" but `for:` is 15 seconds.
          summary: "The host node was down 15 seconds ago"
# alertmanager.yml — SMTP delivery plus default routing.
# FIX: restored the YAML nesting (original was flush-left and invalid).
global:
  # QQ mail SMTP over port 465; NetEase/163 mail uses port 25 instead.
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: '951699@qq.com'
  smtp_auth_username: '951699@qq.com'
  smtp_auth_password: 'gkjfhfgjjjwylbgah'
  smtp_require_tls: false
  resolve_timeout: 300s  # alert is treated as resolved if not re-received within this window
route:
  # Timing
  group_wait: 20s       # wait after the first alert so same-group alerts are batched into one notification
  group_interval: 5m    # wait before notifying when new alerts join an already-notified group
  repeat_interval: 120s # wait before re-sending an unchanged, already-notified group
  # Grouping & routing
  group_by: [alertname]
  receiver: default-receiver
  #routes:
  #  - match:
  #      team: ezdevops
  #    group_by: ['instance']
  #    receiver: 'ops'
receivers:
  - name: 'default-receiver'
    email_configs:
      - to: '951699058@qq.com'
        send_resolved: true
[root@slave ~]# systemctl restart alertmanager.service
[root@prome ~]# systemctl restart prometheus.service
# Reference: timing knobs, with restored nesting (original was flush-left).
# prometheus.yml — scrape/evaluation cadence
global:
  scrape_interval: 15s     # how often targets are scraped
  evaluation_interval: 15s # how often alerting rules are evaluated
  scrape_timeout: 30s      # per-scrape timeout (default 10s)
# alertmanager.yml — notification cadence
route:
  group_wait: 10s      # wait before sending the first notification for a new group
  group_interval: 10s  # wait before notifying when new alerts join an already-notified group
  repeat_interval: 1m  # re-send interval while the group is unchanged (default 1h)
# NOTE(review): in alertmanager.yml `resolve_timeout` belongs under `global:`,
# not under `route:` — confirm before copying this snippet verbatim.
resolve_timeout: 5m  # alert is treated as resolved if not re-received within this window
# FIX: restored nesting, and converted the invalid double-colon lines
# (`inactive: evaluation_interval: 15s` etc.) into the comments they were
# meant to be.
groups:
  - name: host down
    rules:
      - alert: node-down
        expr: up{} != 1
        for: 15s  # how long the condition must hold before the alert fires
# Alert life-cycle states:
#   inactive — condition false; rules are re-evaluated every evaluation_interval (15s)
#   pending  — condition true, but the `for:` duration (15s) has not yet elapsed
#   firing   — `for:` duration elapsed; the alert is pushed to Alertmanager
是 Alertmanager 把同类型的警报进行分组,合并多条警报到一个通知中。可以把这些被触发的警报合并为一个警报进行通知,从而避免瞬间突发性的接受大量警报通知。
# Default: group alerts by job name and deliver to receiver `ops`;
# alerts whose instance matches 192.168.1.11:9100 are routed to `first` instead.
# FIX: restored nesting (original was flush-left and invalid).
route:
  group_wait: 20s
  group_interval: 60s
  repeat_interval: 120s
  # FIX: original had `group_by: ['first job']`, but its own comment says
  # grouping is by job name — `first job` is not a label, the label is `job`.
  group_by: ['job']
  receiver: ops
  routes:
    - match:
        instance: '192.168.1.11:9100'
      receiver: 'first'
    #- match_re:
    #    team: ops|dba
    #  group_by: [env]
    #  receiver: 'ops'
# FIX: the `first` receiver was missing its enclosing `receivers:` key, and the
# address had a duplicated domain plus a trailing typo (951699@qq.com@qq.comn).
receivers:
  - name: 'first'
    email_configs:
      - to: '951699@qq.com'
        send_resolved: true
当某条警报已经发送,停止重复发送由此警报引发的其他异常或故障的警报机制。例如网络交换机断开,则该交换机下的主机down机事件就无需上报。
# web cluster — extra labels so inhibit_rules can match on `cluster`.
# FIX: restored nesting; targets/labels must be indented under each list item.
# Host:port values are quoted to avoid YAML colon/sexagesimal surprises.
- targets:
    - '192.168.0.52:9100'
  labels:
    cluster: web cluster
    role: db
- targets:
    - '192.168.0.53:9100'
  labels:
    cluster: web cluster
    role: web
# append the rule groups below to /etc/prometheus/rules/rules.yml
# Two rule groups whose labels feed the inhibit rule: db-down carries the
# source label (status=down), web-down carries the target label (http=error).
# FIX: restored nesting (original was flush-left and invalid).
- name: db down
  rules:
    - alert: db-down
      expr: up{instance="192.168.1.11:9100"} != 1
      for: 15s
      labels:
        status: down  # matched by inhibit_rules source_match
- name: web down
  rules:
    - alert: web-down
      expr: up{instance="192.168.1.12:9100"} != 1
      for: 15s
      labels:
        http: error  # matched by inhibit_rules target_match
# append the inhibit rule below to /usr/local/alertmanager/alertmanager.yml
# Inhibition: when an alert with status=down is firing, suppress alerts with
# http=error — but only when both share the same `cluster` label value.
# FIX: restored nesting (original was flush-left and invalid).
inhibit_rules:
  - source_match:
      status: 'down'   # the suppressing alert (e.g. db-down)
    target_match:
      http: 'error'    # the suppressed alert (e.g. web-down)
    equal: ['cluster'] # suppression applies only within the same cluster
抑制结果:只会收到源报警(status=down)这一条通知,同一 cluster 下匹配 target 的报警(http=error)会被抑制,不再单独发送。
规则参考
https://www.cnblogs.com/zhaojiedi1992/p/zhaojiedi_liunx_65_prometheus_alertmanager_rule.html