prometheus 监控
1.上传tar包
[root@promethes prometheus]
[root@promethes opt]
total 131628
-rw-r--r--. 1 root root 29254678 Sep 6 17:36 alertmanager-0.25.0.linux-amd64.tar.gz
-rw-r--r--. 1 root root 10649117 Aug 22 10:58 blackbox_exporter-0.22.0.linux-amd64.tar.gz
-rw-r--r--. 1 root root 94876162 Aug 22 14:45 prometheus-2.46.0.linux-amd64.tar.gz
[root@promethes opt]
2.解压到对应文件夹
[root@promethes opt]
[root@promethes local ]
total 92660
drwxr-xr-x. 4 3434 3434 123 Sep 6 17:51 alertmanager
drwxr-xr-x. 3 root root 19 Aug 25 14:48 alertmanager_bak
drwxr-xr-x. 2 root root 6 May 16 2022 bin
lrwxrwxrwx. 1 root root 37 Aug 22 11:01 blackbox_exporter -> blackbox_exporter-0.22.0.linux-amd64/
drwxr-xr-x. 2 3434 3434 104 Sep 6 15:26 blackbox_exporter-0.22.0.linux-amd64
drwxr-xr-x. 2 root root 6 May 16 2022 etc
drwxr-xr-x. 2 root root 6 May 16 2022 games
drwxr-xr-x. 2 root root 6 May 16 2022 include
drwxr-xr-x. 2 root root 6 May 16 2022 lib
drwxr-xr-x. 3 root root 17 Aug 10 10:42 lib64
drwxr-xr-x. 2 root root 6 May 16 2022 libexec
drwxr-xr-x. 5 root root 145 Sep 7 11:10 prometheus
drwxr-xr-x. 5 root root 171 Sep 6 16:37 prometheus_bak
-rw-r--r--. 1 root root 94879997 Sep 6 16:33 prometheus.tar
drwxr-xr-x. 2 root root 6 May 16 2022 sbin
drwxr-xr-x. 5 root root 49 Aug 10 10:42 share
drwxr-xr-x. 2 root root 6 May 16 2022 src
[root@promethes local ]
3.配置开机自启动
[root@promethes system]
/usr/lib/systemd/system
[root@promethes system]
[Unit]
Description=https://prometheus.io
[Service]
Restart=on-failure
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml
[Install]
WantedBy=multi-user.target
[root@promethes system]
[Unit]
Description=AlertManager Server Service daemon
[Service]
Type=simple
ExecStart=/usr/local/alertmanager/alertmanager \
--config.file "/usr/local/alertmanager/alertmanager.yml"
[Install]
WantedBy=multi-user.target
[root@promethes system]
[root@promethes system]
[Unit]
Description=blackbox_exporter
After=network.target
[Service]
User=root
Type=simple
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
[root@promethes system]
4.配置Prometheus
[root@promethes prometheus]
/usr/local/prometheus
[root@promethes prometheus]
total 236276
drwxr-xr-x. 2 root root 38 Jul 25 21:06 console_libraries
drwxr-xr-x. 2 root root 173 Jul 25 21:06 consoles
drwxr-xr-x. 2 root root 26 Sep 6 16:56 etc.d
-rw-r--r--. 1 root root 11357 Jul 25 21:06 LICENSE
-rw-r--r--. 1 root root 3773 Jul 25 21:06 NOTICE
-rwxr-xr-x. 1 root root 123611355 Jul 25 20:34 prometheus
-rw-r--r--. 1 root root 2749 Sep 7 11:10 prometheus.yml
-rwxr-xr-x. 1 root root 118310964 Jul 25 20:36 promtool
[root@promethes prometheus]
global:
scrape_interval: 60s
evaluation_interval: 60s
external_labels:
monitor: '?jivest'
alerting:
alertmanagers:
- static_configs:
- targets:
- 172.30.3.22:9093
rule_files:
- "/usr/local/alertmanager/rules/*.yml"
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090' ]
- job_name: 'http_status'
metrics_path: /probe
params:
module: [http_2xx]
file_sd_configs:
- files:
- '/usr/local/prometheus/etc.d/job_web.yaml'
refresh_interval: 15s
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 172.30.3.22:9115
- job_name: node
static_configs:
- targets:
- 172.30.3.?:9099
- 172.30.3.17?:9099
- 172.30.3.19?:9099
[root@promethes prometheus]
5.blackbox_exporter 监控网站状态
[root@promethes blackbox_exporter]
total 20284
-rwxr-xr-x. 1 3434 3434 20745692 Aug 2 2022 blackbox_exporter
-rw-r--r--. 1 3434 3434 1503 Aug 22 16:24 blackbox.yml
-rw-r--r--. 1 root root 910 Aug 22 16:21 blackbox.yml.bak
-rw-r--r--. 1 3434 3434 11357 Aug 2 2022 LICENSE
-rw-r--r--. 1 3434 3434 94 Aug 2 2022 NOTICE
[root@promethes blackbox_exporter]
/usr/local/blackbox_exporter
[root@promethes blackbox_exporter]
modules:
http_2xx:
prober: http
timeout : 5s
http:
valid_status_codes: []
method: GET
fail_if_body_not_matches_regexp: []
tls_config:
insecure_skip_verify: true
http_post_2xx:
prober: http
http:
method: POST
tcp_connect:
prober: tcp
pop3s_banner:
prober: tcp
tcp:
query_response:
- expect: "^+OK"
tls: true
tls_config:
insecure_skip_verify: false
grpc:
prober: grpc
grpc:
tls: true
preferred_ip_protocol: "ip4"
grpc_plain:
prober: grpc
grpc:
tls: false
service: "service1"
ssh_banner:
prober: tcp
tcp:
query_response:
- expect: "^SSH-2.0-"
- send: "SSH-2.0-blackbox-ssh-check"
irc_banner:
prober: tcp
tcp:
query_response:
- send: "NICK prober"
- send: "USER prober prober prober :prober"
- expect: "PING :([^ ]+)"
send: "PONG ${1} "
- expect: "^:[^ ]+ 001"
icmp:
prober: icmp
icmp_ttl5:
prober: icmp
timeout : 5s
icmp:
ttl: 5
[root@promethes blackbox_exporter]
6.alertmanager监控规则路径
6.1目录结构
[root@promethes alertmanager]
total 61016
-rwxr-xr-x. 1 3434 3434 34546840 Dec 22 2022 alertmanager
-rw-r--r--. 1 3434 3434 2074 Sep 6 17:51 alertmanager.yml
-rwxr-xr-x. 1 3434 3434 27906085 Dec 22 2022 amtool
-rw-r--r--. 1 3434 3434 11357 Dec 22 2022 LICENSE
-rw-r--r--. 1 3434 3434 457 Dec 22 2022 NOTICE
drwxr-xr-x. 2 root root 103 Sep 7 15:41 rules
drwxr-xr-x. 2 root root 24 Sep 6 17:43 templates
[root@promethes alertmanager]
/usr/local/alertmanager
[root@promethes alertmanager]
global:
smtp_smarthost: 'smtp.exmail.qq.com:587'
smtp_from: '*@mails.*jivest.com'
smtp_hello: '*jivest.com'
smtp_auth_username: '*@mails.youjivest.com'
smtp_auth_password: '????????'
templates:
- '/usr/local/alertmanager/templates/*.tmpl'
route:
group_by: ['alertname' , 'cluster' , 'service' ]
group_wait: 30s
group_interval: 5m
repeat_interval: 3h
receiver: team-X
routes:
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname' , 'cluster' , 'service' ]
receivers:
- name: 'team-X'
email_configs:
- to: 'chenhu@youjivest.com'
[root@promethes alertmanager]
告警模板大全
[root@promethes prometheus]
[root@promethes rules]
total 4
-rw-r--r--. 1 root root 3535 Sep 6 17:42 rules.yml
[root@promethes rules]
groups :
- name: 系统盘空间
rules:
- alert: node_filesystem_avail_bytes
expr : node_filesystem_avail_bytes{mountpoint="/" } / node_filesystem_size_bytes{mountpoint="/" } < 0.1
for : 10m
annotations:
summary: 系统盘空间不足10%
- name: 数据盘空间
rules:
- alert: node_filesystem_avail_bytes
expr : node_filesystem_avail_bytes{mountpoint="/data" } / node_filesystem_size_bytes{mountpoint="/data" } < 0.1
for : 10m
annotations:
summary: 数据盘空间不足10%
- name: 可用内存
rules:
- alert: node_memory_MemAvailable_bytes
expr : node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes < 0.8
for : 10m
annotations:
summary: 可用内存不足, 使用swap空间超过20%
- name: 服务器状态
rules:
- alert: up
expr : up == 0
for : 10m
annotations:
summary: 服务器不可访问
- name: ipa服务状态
rules:
- alert: freeipa
expr : node_systemd_unit_state{name="ipa.service" , state="failed" } == 1
for : 10m
annotations:
summary: ipa服务故障
- name: radius认证服务状态
rules:
- alert: freeradius
expr : node_systemd_unit_state{name="radiusd.service" , state="failed" } == 1
for : 10m
annotations:
summary: VPN认证服务故障
- name: VPN服务状态
rules:
- alert: strongswan
expr : node_systemd_unit_state{name="strongswan-starter.service" , state="failed" } == 1
for : 10m
annotations:
summary: VPN服务故障
- name: certbot-renew服务状态
rules:
- alert: certbot
expr : node_systemd_unit_state{name="certbot-renew.service" , state="failed" } == 1
for : 10m
annotations:
summary: 证书自动更新服务故障
- name: dns服务状态
rules:
- alert: dnsmasq
expr : node_systemd_unit_state{name="dnsmasq.service" , state="failed" } == 1
for : 10m
annotations:
summary: DNS服务故障
- name: nginx服务状态
rules:
- alert: nginx
expr : node_systemd_unit_state{name="nginx.service" , state="failed" } == 1
for : 10m
annotations:
summary: Web服务故障
- name: squid服务状态
rules:
- alert: squid
expr : node_systemd_unit_state{name="squid.service" , state="failed" } == 1
for : 10m
annotations:
summary: Web正向代理服务故障
- name: gitlab服务状态
rules:
- alert: gitlab
expr : node_systemd_unit_state{name="gitlab-runsvdir.service" , state="failed" } == 1
for : 10m
annotations:
summary: Gitlab服务故障
- name: drive服务状态
rules:
- alert: nextcloud
expr : node_systemd_unit_state{name="php-fpm.service" , state="failed" } == 1
for : 10m
annotations:
summary: 网盘服务故障
- name: keycloak服务状态
rules:
- alert: keycloak
expr : node_systemd_unit_state{name="keycloak.service" , state="failed" } == 1
for : 10m
annotations:
summary: 单点登录服务故障
- name: jenkins服务状态
rules:
- alert: jenkins
expr : node_systemd_unit_state{name="jenkins.service" , state="failed" } == 1
for : 10m
annotations:
summary: 持续集成服务故障
- name: postgresql服务状态
rules:
- alert: postgresql
expr : node_systemd_unit_state{name="postgresql.service" , state="failed" } == 1
for : 10m
annotations:
summary: 数据库服务故障
- name: neo4j服务状态
rules:
- alert: neo4j
expr : node_systemd_unit_state{name="neo4j.service" , state="failed" } == 1
for : 10m
annotations:
summary: 图数据库服务故障
[root@promethes rules]
6.2监控ceph
[root@promethes rules]
groups :
- name: EmbeddedExporter
rules:
- alert: CephState
expr : 'ceph_health_status != 0'
for : 30m
labels:
severity: critical
annotations:
summary: Ceph State (instance {{ $labels.instance }})
description: "Ceph instance unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephMonitorClockSkew
expr : 'abs(ceph_monitor_clock_skew_seconds) > 0.2'
for : 2m
labels:
severity: warning
annotations:
summary: Ceph monitor clock skew (instance {{ $labels.instance }})
description: "Ceph monitor clock skew detected. Please check ntp and hardware clock settings\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephMonitorLowSpace
expr : 'ceph_monitor_avail_percent < 10'
for : 2m
labels:
severity: warning
annotations:
summary: Ceph monitor low space (instance {{ $labels.instance }})
description: "Ceph monitor storage is low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephOsdDown
expr : 'ceph_osd_up == 0'
for : 30m
labels:
severity: critical
annotations:
summary: Ceph OSD Down (instance {{ $labels.instance }})
description: "Ceph Object Storage Daemon Down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephHighOsdLatency
expr : 'ceph_osd_perf_apply_latency_seconds > 5'
for : 1m
labels:
severity: warning
annotations:
summary: Ceph high OSD latency (instance {{ $labels.instance }})
description: "Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephOsdLowSpace
expr : 'ceph_osd_utilization > 90'
for : 2m
labels:
severity: warning
annotations:
summary: Ceph OSD low space (instance {{ $labels.instance }})
description: "Ceph Object Storage Daemon is going out of space. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephOsdReweighted
expr : 'ceph_osd_weight < 1'
for : 2m
labels:
severity: warning
annotations:
summary: Ceph OSD reweighted (instance {{ $labels.instance }})
description: "Ceph Object Storage Daemon takes too much time to resize.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephPgDown
expr : 'ceph_pg_down > 0'
for : 30m
labels:
severity: critical
annotations:
summary: Ceph PG down (instance {{ $labels.instance }})
description: "Some Ceph placement groups are down. Please ensure that all the data are available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephPgIncomplete
expr : 'ceph_pg_incomplete > 0'
for : 30m
labels:
severity: critical
annotations:
summary: Ceph PG incomplete (instance {{ $labels.instance }})
description: "Some Ceph placement groups are incomplete. Please ensure that all the data are available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephPgInconsistent
expr : 'ceph_pg_inconsistent > 0'
for : 30m
labels:
severity: warning
annotations:
summary: Ceph PG inconsistent (instance {{ $labels.instance }})
description: "Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephPgActivationLong
expr : 'ceph_pg_activating > 0'
for : 2m
labels:
severity: warning
annotations:
summary: Ceph PG activation long (instance {{ $labels.instance }})
description: "Some Ceph placement groups are too long to activate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephPgBackfillFull
expr : 'ceph_pg_backfill_toofull > 0'
for : 2m
labels:
severity: warning
annotations:
summary: Ceph PG backfill full (instance {{ $labels.instance }})
description: "Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephPgUnavailable
expr : 'ceph_pg_total - ceph_pg_active > 0'
for : 30m
labels:
severity: critical
annotations:
summary: Ceph PG unavailable (instance {{ $labels.instance }})
description: "Some Ceph placement groups are unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
[root@promethes rules]
6.3监控jenkins
[root@promethes rules]
groups :
- name: MetricPlugin
rules:
- alert: JenkinsOffline
expr : 'jenkins_node_offline_value > 1'
for : 30m
labels:
severity: critical
annotations:
summary: Jenkins offline (instance {{ $labels.instance }})
description: "Jenkins offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsHealthcheck
expr : 'jenkins_health_check_score < 1'
for : 30m
labels:
severity: critical
annotations:
summary: Jenkins healthcheck (instance {{ $labels.instance }})
description: "Jenkins healthcheck score: {{$value }}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsOutdatedPlugins
expr : 'sum(jenkins_plugins_withUpdate) by (instance) > 3'
for : 1d
labels:
severity: warning
annotations:
summary: Jenkins outdated plugins (instance {{ $labels.instance }})
description: "{{ $value }} plugins need update\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsBuildsHealthScore
expr : 'default_jenkins_builds_health_score < 1'
for : 30m
labels:
severity: critical
annotations:
summary: Jenkins builds health score (instance {{ $labels.instance }})
description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsRunFailureTotal
expr : 'delta(jenkins_runs_failure_total[1h]) > 100'
for : 30m
labels:
severity: warning
annotations:
summary: Jenkins run failure total (instance {{ $labels.instance }})
description: "Job run failures: ({{$value }}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsBuildTestsFailing
expr : 'default_jenkins_builds_last_build_tests_failing > 0'
for : 30m
labels:
severity: warning
annotations:
summary: Jenkins build tests failing (instance {{ $labels.instance }})
description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsLastBuildFailed
expr : 'default_jenkins_builds_last_build_result_ordinal == 2'
for : 30m
labels:
severity: warning
annotations:
summary: Jenkins last build failed (instance {{ $labels.instance }})
description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
[root@promethes rules]
6.4监控postgresql
[root@promethes rules]
groups :
- name: PostgresExporter
rules:
- alert: PostgresqlDown
expr : 'pg_up == 0'
for : 0m
labels:
severity: critical
annotations:
summary: Postgresql down (instance {{ $labels.instance }})
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlRestarted
expr : 'time() - pg_postmaster_start_time_seconds < 60'
for : 0m
labels:
severity: critical
annotations:
summary: Postgresql restarted (instance {{ $labels.instance }})
description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlExporterError
expr : 'pg_exporter_last_scrape_error > 0'
for : 0m
labels:
severity: critical
annotations:
summary: Postgresql exporter error (instance {{ $labels.instance }})
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTableNotAutoVacuumed
expr : '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
for : 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTableNotAutoAnalyzed
expr : '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
for : 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyConnections
expr : 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
for : 2m
labels:
severity: warning
annotations:
summary: Postgresql too many connections (instance {{ $labels.instance }})
description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlNotEnoughConnections
expr : 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
for : 2m
labels:
severity: warning
annotations:
summary: Postgresql not enough connections (instance {{ $labels.instance }})
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlDeadLocks
expr : 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
for : 0m
labels:
severity: warning
annotations:
summary: Postgresql dead locks (instance {{ $labels.instance }})
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRollbackRate
expr : 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
for : 0m
labels:
severity: warning
annotations:
summary: Postgresql high rollback rate (instance {{ $labels.instance }})
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlCommitRateLow
expr : 'rate(pg_stat_database_xact_commit[1m]) < 10'
for : 2m
labels:
severity: critical
annotations:
summary: Postgresql commit rate low (instance {{ $labels.instance }})
description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlLowXidConsumption
expr : 'rate(pg_txid_current[1m]) < 5'
for : 2m
labels:
severity: warning
annotations:
summary: Postgresql low XID consumption (instance {{ $labels.instance }})
description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateStatementTimeout
expr : 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
for : 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateDeadlock
expr : 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
for : 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlUnusedReplicationSlot
expr : 'pg_replication_slots_active == 0'
for : 1m
labels:
severity: warning
annotations:
summary: Postgresql unused replication slot (instance {{ $labels.instance }})
description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyDeadTuples
expr : '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1'
for : 2m
labels:
severity: warning
annotations:
summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlConfigurationChanged
expr : '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
for : 0m
labels:
severity: info
annotations:
summary: Postgresql configuration changed (instance {{ $labels.instance }})
description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlSslCompressionActive
expr : 'sum(pg_stat_ssl_compression) > 0'
for : 0m
labels:
severity: critical
annotations:
summary: Postgresql SSL compression active (instance {{ $labels.instance }})
description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyLocksAcquired
expr : '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
for : 2m
labels:
severity: critical
annotations:
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlBloatIndexHigh(>80%)
expr : 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
for : 1h
labels:
severity: warning
annotations:
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlBloatTableHigh(>80%)
expr : 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
for : 1h
labels:
severity: warning
annotations:
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
[root@promethes rules]
6.5发送邮件模板
[root@promethes templates]
{{ define "test.html" }}
{{ range .Alerts }}
=========start==========
告警程序: {{ .Labels.job }}
告警级别: {{ .Labels.severity }} 级
告警类型: {{ .Labels.alertname }}
故障主机: {{ .Labels.instance }}
告警主题: {{ .Annotations.summary }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
=========end==========
{{ end }}
{{ end }}
[root@promethes templates]