• prometheus 监控实战篇


    prometheus 监控

    1.上传tar包

    [root@promethes prometheus]# cd /opt
    [root@promethes opt]# ll
    total 131628
    -rw-r--r--. 1 root root 29254678 Sep 6 17:36 alertmanager-0.25.0.linux-amd64.tar.gz
    -rw-r--r--. 1 root root 10649117 Aug 22 10:58 blackbox_exporter-0.22.0.linux-amd64.tar.gz
    -rw-r--r--. 1 root root 94876162 Aug 22 14:45 prometheus-2.46.0.linux-amd64.tar.gz
    [root@promethes opt]#

    2.解压到对应文件夹

    [root@promethes opt]# tar xf alertmanager-0.25.0.linux-amd64.tar.gz -C /usr/local/
    [root@promethes local]# ll
    total 92660
    drwxr-xr-x. 4 3434 3434 123 Sep 6 17:51 alertmanager
    drwxr-xr-x. 3 root root 19 Aug 25 14:48 alertmanager_bak
    drwxr-xr-x. 2 root root 6 May 16 2022 bin
    lrwxrwxrwx. 1 root root 37 Aug 22 11:01 blackbox_exporter -> blackbox_exporter-0.22.0.linux-amd64/
    drwxr-xr-x. 2 3434 3434 104 Sep 6 15:26 blackbox_exporter-0.22.0.linux-amd64
    drwxr-xr-x. 2 root root 6 May 16 2022 etc
    drwxr-xr-x. 2 root root 6 May 16 2022 games
    drwxr-xr-x. 2 root root 6 May 16 2022 include
    drwxr-xr-x. 2 root root 6 May 16 2022 lib
    drwxr-xr-x. 3 root root 17 Aug 10 10:42 lib64
    drwxr-xr-x. 2 root root 6 May 16 2022 libexec
    drwxr-xr-x. 5 root root 145 Sep 7 11:10 prometheus
    drwxr-xr-x. 5 root root 171 Sep 6 16:37 prometheus_bak
    -rw-r--r--. 1 root root 94879997 Sep 6 16:33 prometheus.tar
    drwxr-xr-x. 2 root root 6 May 16 2022 sbin
    drwxr-xr-x. 5 root root 49 Aug 10 10:42 share
    drwxr-xr-x. 2 root root 6 May 16 2022 src
    [root@promethes local]#

    3.配置开机自启动

    [root@promethes system]# pwd
    /usr/lib/systemd/system
    [root@promethes system]# cat prometheus.service
    [Unit]
    Description=https://prometheus.io
    [Service]
    Restart=on-failure
    ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml
    [Install]
    WantedBy=multi-user.target
    [root@promethes system]# cat alertmanager.service
    [Unit]
    Description=AlertManager Server Service daemon
    #Wants=network-online.target
    #After=network-online.target
    [Service]
    #User=root
    #Group=root
    Type=Simple
    #Restart=on-failure
    ExecStart=/usr/local/alertmanager/alertmanager \
    --config.file "/usr/local/alertmanager/alertmanager.yml" \
    # --storage.path="/usr/local/alertmanager/data" \
    # --data.retention=120h \
    # --alerts.gc-interval=30m \
    # --web.external-url "http://172.30.3.23:9093"
    # --web.listen-address=":9093"
    [Install]
    WantedBy=multi-user.target
    [root@promethes system]#
    [root@promethes system]# cat blackbox_exporter.service
    [Unit]
    Description=blackbox_exporter
    After=network.target
    [Service]
    User=root
    Type=simple
    ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
    Restart=on-failure
    [Install]
    WantedBy=multi-user.target
    [root@promethes system]#

    4.配置Prometheus

    [root@promethes prometheus]# pwd
    /usr/local/prometheus
    [root@promethes prometheus]# ll
    total 236276
    drwxr-xr-x. 2 root root 38 Jul 25 21:06 console_libraries
    drwxr-xr-x. 2 root root 173 Jul 25 21:06 consoles
    drwxr-xr-x. 2 root root 26 Sep 6 16:56 etc.d
    -rw-r--r--. 1 root root 11357 Jul 25 21:06 LICENSE
    -rw-r--r--. 1 root root 3773 Jul 25 21:06 NOTICE
    -rwxr-xr-x. 1 root root 123611355 Jul 25 20:34 prometheus
    -rw-r--r--. 1 root root 2749 Sep 7 11:10 prometheus.yml
    -rwxr-xr-x. 1 root root 118310964 Jul 25 20:36 promtool
    [root@promethes prometheus]# cat prometheus.yml
    # my global config
    global:
    scrape_interval: 60s # Set the scrape interval to every 60 seconds. Default is every 1 minute.
    evaluation_interval: 60s # Evaluate rules every 60 seconds. The default is every 1 minute.
    # scrape_timeout is set to the global default (10s).
    # Attach these labels to any time series or alerts when communicating with
    # external systems (federation, remote storage, Alertmanager).
    external_labels:
    monitor: '?jivest'
    # Alertmanager configuration
    alerting:
    alertmanagers:
    - static_configs:
    - targets:
    - 172.30.3.22:9093
    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    #监控规则路径
    rule_files:
    # - "first_rules.yml"
    # - "second_rules.yml"
    #- rules.yml
    - "/usr/local/alertmanager/rules/*.yml"
    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
    # The job name is added as a label `job=` to any timeseries scraped from this config.
    - job_name: 'prometheus'
    static_configs:
    - targets: ['localhost:9090']
    # 网站监控
    - job_name: 'http_status'
    metrics_path: /probe
    params:
    module: [http_2xx] # 在blackbox_exporter中定义的模块名
    file_sd_configs: # 因需要监控的地址很多,我们这里将所有地址独立出来
    - files:
    - '/usr/local/prometheus/etc.d/job_web.yaml' #监控网站地址
    refresh_interval: 15s
    relabel_configs:
    - source_labels: [__address__]
    target_label: __param_target
    - source_labels: [__param_target]
    target_label: instance
    - target_label: __address__
    replacement: 172.30.3.22:9115
    - job_name: node
    # grab stats about the local machine by default.
    static_configs:
    - targets:
    - 172.30.3.?:9099
    - 172.30.3.17?:9099
    - 172.30.3.19?:9099
    [root@promethes prometheus]#

    5.blackbox_exporter 监控网站状态

    [root@promethes blackbox_exporter]# ll
    total 20284
    -rwxr-xr-x. 1 3434 3434 20745692 Aug 2 2022 blackbox_exporter
    -rw-r--r--. 1 3434 3434 1503 Aug 22 16:24 blackbox.yml
    -rw-r--r--. 1 root root 910 Aug 22 16:21 blackbox.yml.bak
    -rw-r--r--. 1 3434 3434 11357 Aug 2 2022 LICENSE
    -rw-r--r--. 1 3434 3434 94 Aug 2 2022 NOTICE
    [root@promethes blackbox_exporter]# pwd
    /usr/local/blackbox_exporter
    [root@promethes blackbox_exporter]# cat blackbox.yml
    modules:
    http_2xx:
    prober: http
    timeout: 5s # 探针检测超时时间
    http:
    valid_status_codes: [] # 有效的状态码,默认为200,也可以自己定义,比如你的站点304也可能是正常的
    method: GET # http使用get请求
    fail_if_body_not_matches_regexp: [] # 对返回结果进行正则匹配,如果未匹配成功则认为失败
    tls_config:
    insecure_skip_verify: true # 不安全的https跳过确认,如某些证书不合法或者过期,如果你在浏览器访问,那浏览器会让你确认是否继续,这里也是类似的意思。
    http_post_2xx:
    prober: http
    http:
    method: POST
    tcp_connect:
    prober: tcp
    pop3s_banner:
    prober: tcp
    tcp:
    query_response:
    - expect: "^+OK"
    tls: true
    tls_config:
    insecure_skip_verify: false
    grpc:
    prober: grpc
    grpc:
    tls: true
    preferred_ip_protocol: "ip4"
    grpc_plain:
    prober: grpc
    grpc:
    tls: false
    service: "service1"
    ssh_banner:
    prober: tcp
    tcp:
    query_response:
    - expect: "^SSH-2.0-"
    - send: "SSH-2.0-blackbox-ssh-check"
    irc_banner:
    prober: tcp
    tcp:
    query_response:
    - send: "NICK prober"
    - send: "USER prober prober prober :prober"
    - expect: "PING :([^ ]+)"
    send: "PONG ${1}"
    - expect: "^:[^ ]+ 001"
    icmp:
    prober: icmp
    icmp_ttl5:
    prober: icmp
    timeout: 5s
    icmp:
    ttl: 5
    [root@promethes blackbox_exporter]#

    6.alertmanager监控规则路径

    6.1目录结构

    [root@promethes alertmanager]# ll
    total 61016
    -rwxr-xr-x. 1 3434 3434 34546840 Dec 22 2022 alertmanager
    -rw-r--r--. 1 3434 3434 2074 Sep 6 17:51 alertmanager.yml
    -rwxr-xr-x. 1 3434 3434 27906085 Dec 22 2022 amtool
    -rw-r--r--. 1 3434 3434 11357 Dec 22 2022 LICENSE
    -rw-r--r--. 1 3434 3434 457 Dec 22 2022 NOTICE
    drwxr-xr-x. 2 root root 103 Sep 7 15:41 rules
    drwxr-xr-x. 2 root root 24 Sep 6 17:43 templates
    [root@promethes alertmanager]# pwd
    /usr/local/alertmanager
    [root@promethes alertmanager]# cat alertmanager.yml
    # Sample configuration.
    # See https://prometheus.io/docs/alerting/configuration/ for documentation.
    global:
    # The smarthost and SMTP sender used for mail notifications.
    smtp_smarthost: 'smtp.exmail.qq.com:587'
    smtp_from: '*@mails.*jivest.com'
    smtp_hello: '*jivest.com'
    smtp_auth_username: '*@mails.youjivest.com'
    smtp_auth_password: '????????'
    # The directory from which notification templates are read.
    templates:
    - '/usr/local/alertmanager/templates/*.tmpl'
    # The root route on which each incoming alert enters.
    route:
    # The labels by which incoming alerts are grouped together. For example,
    # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
    # be batched into a single group.
    group_by: ['alertname', 'cluster', 'service']
    # When a new group of alerts is created by an incoming alert, wait at
    # least 'group_wait' to send the initial notification.
    # This way ensures that you get multiple alerts for the same group that start
    # firing shortly after another are batched together on the first
    # notification.
    group_wait: 30s
    # When the first notification was sent, wait 'group_interval' to send a batch
    # of new alerts that started firing for that group.
    group_interval: 5m
    # If an alert has successfully been sent, wait 'repeat_interval' to
    # resend them.
    repeat_interval: 3h
    # A default receiver
    receiver: team-X
    # All the above attributes are inherited by all child routes and can
    # overwritten on each.
    # The child route trees.
    routes:
    # Inhibition rules allow to mute a set of alerts given that another alert is
    # firing.
    # We use this to mute any warning-level notifications if the same alert is
    # already critical.
    inhibit_rules:
    - source_match:
    severity: 'critical'
    target_match:
    severity: 'warning'
    # Apply inhibition if the alertname is the same.
    equal: ['alertname', 'cluster', 'service']
    receivers:
    - name: 'team-X'
    email_configs:
    - to: 'chenhu@youjivest.com'
    # wechat_configs:
    # - agent_id: '1000002'
    # to_user: '@all'
    [root@promethes alertmanager]#

    告警模板大全

    [root@promethes prometheus]# cd /usr/local/alertmanager/rules
    [root@promethes rules]# ll
    total 4
    -rw-r--r--. 1 root root 3535 Sep 6 17:42 rules.yml
    [root@promethes rules]# cat rules.yml
    groups:
    - name: 系统盘空间
    rules:
    - alert: node_filesystem_avail_bytes
    expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} < 0.1
    for: 10m
    annotations:
    summary: 系统盘空间不足10%
    - name: 数据盘空间
    rules:
    - alert: node_filesystem_avail_bytes
    expr: node_filesystem_avail_bytes{mountpoint="/data"} / node_filesystem_size_bytes{mountpoint="/data"} < 0.1
    for: 10m
    annotations:
    summary: 数据盘空间不足10%
    - name: 可用内存
    rules:
    - alert: node_memory_MemAvailable_bytes
    expr: node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes < 0.8
    for: 10m
    annotations:
    summary: 可用内存不足, 使用swap空间超过20%
    - name: 服务器状态
    rules:
    - alert: up
    expr: up == 0
    for: 10m
    annotations:
    summary: 服务器不可访问
    - name: ipa服务状态
    rules:
    - alert: freeipa
    expr: node_systemd_unit_state{name="ipa.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: ipa服务故障
    - name: radius认证服务状态
    rules:
    - alert: freeradius
    expr: node_systemd_unit_state{name="radiusd.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: VPN认证服务故障
    - name: VPN服务状态
    rules:
    - alert: strongswan
    expr: node_systemd_unit_state{name="strongswan-starter.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: VPN服务故障
    - name: certbot-renew服务状态
    rules:
    - alert: certbot
    expr: node_systemd_unit_state{name="certbot-renew.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: 证书自动更新服务故障
    - name: dns服务状态
    rules:
    - alert: dnsmasq
    expr: node_systemd_unit_state{name="dnsmasq.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: DNS服务故障
    - name: nginx服务状态
    rules:
    - alert: nginx
    expr: node_systemd_unit_state{name="nginx.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: Web服务故障
    - name: squid服务状态
    rules:
    - alert: squid
    expr: node_systemd_unit_state{name="squid.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: Web正向代理服务故障
    - name: gitlab服务状态
    rules:
    - alert: gitlab
    expr: node_systemd_unit_state{name="gitlab-runsvdir.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: Gitlab服务故障
    - name: drive服务状态
    rules:
    - alert: nextcloud
    expr: node_systemd_unit_state{name="php-fpm.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: 网盘服务故障
    - name: keycloak服务状态
    rules:
    - alert: keycloak
    expr: node_systemd_unit_state{name="keycloak.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: 单点登录服务故障
    - name: jenkins服务状态
    rules:
    - alert: jenkins
    expr: node_systemd_unit_state{name="jenkins.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: 持续集成服务故障
    - name: postgresql服务状态
    rules:
    - alert: postgresql
    expr: node_systemd_unit_state{name="postgresql.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: 数据库服务故障
    - name: neo4j服务状态
    rules:
    - alert: neo4j
    expr: node_systemd_unit_state{name="neo4j.service", state="failed"} == 1
    for: 10m
    annotations:
    summary: 图数据库服务故障
    [root@promethes rules]#

    6.2监控ceph

    [root@promethes rules]# cat ceph-exporter.yml
    groups:
    - name: EmbeddedExporter
    rules:
    - alert: CephState
    expr: 'ceph_health_status != 0'
    for: 30m
    labels:
    severity: critical
    annotations:
    summary: Ceph State (instance {{ $labels.instance }})
    description: "Ceph instance unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephMonitorClockSkew
    expr: 'abs(ceph_monitor_clock_skew_seconds) > 0.2'
    for: 2m
    labels:
    severity: warning
    annotations:
    summary: Ceph monitor clock skew (instance {{ $labels.instance }})
    description: "Ceph monitor clock skew detected. Please check ntp and hardware clock settings\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephMonitorLowSpace
    expr: 'ceph_monitor_avail_percent < 10'
    for: 2m
    labels:
    severity: warning
    annotations:
    summary: Ceph monitor low space (instance {{ $labels.instance }})
    description: "Ceph monitor storage is low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephOsdDown
    expr: 'ceph_osd_up == 0'
    for: 30m
    labels:
    severity: critical
    annotations:
    summary: Ceph OSD Down (instance {{ $labels.instance }})
    description: "Ceph Object Storage Daemon Down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephHighOsdLatency
    expr: 'ceph_osd_perf_apply_latency_seconds > 5'
    for: 1m
    labels:
    severity: warning
    annotations:
    summary: Ceph high OSD latency (instance {{ $labels.instance }})
    description: "Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephOsdLowSpace
    expr: 'ceph_osd_utilization > 90'
    for: 2m
    labels:
    severity: warning
    annotations:
    summary: Ceph OSD low space (instance {{ $labels.instance }})
    description: "Ceph Object Storage Daemon is going out of space. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephOsdReweighted
    expr: 'ceph_osd_weight < 1'
    for: 2m
    labels:
    severity: warning
    annotations:
    summary: Ceph OSD reweighted (instance {{ $labels.instance }})
    description: "Ceph Object Storage Daemon takes too much time to resize.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephPgDown
    expr: 'ceph_pg_down > 0'
    for: 30m
    labels:
    severity: critical
    annotations:
    summary: Ceph PG down (instance {{ $labels.instance }})
    description: "Some Ceph placement groups are down. Please ensure that all the data are available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephPgIncomplete
    expr: 'ceph_pg_incomplete > 0'
    for: 30m
    labels:
    severity: critical
    annotations:
    summary: Ceph PG incomplete (instance {{ $labels.instance }})
    description: "Some Ceph placement groups are incomplete. Please ensure that all the data are available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephPgInconsistent
    expr: 'ceph_pg_inconsistent > 0'
    for: 30m
    labels:
    severity: warning
    annotations:
    summary: Ceph PG inconsistent (instance {{ $labels.instance }})
    description: "Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephPgActivationLong
    expr: 'ceph_pg_activating > 0'
    for: 2m
    labels:
    severity: warning
    annotations:
    summary: Ceph PG activation long (instance {{ $labels.instance }})
    description: "Some Ceph placement groups are too long to activate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephPgBackfillFull
    expr: 'ceph_pg_backfill_toofull > 0'
    for: 2m
    labels:
    severity: warning
    annotations:
    summary: Ceph PG backfill full (instance {{ $labels.instance }})
    description: "Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: CephPgUnavailable
    expr: 'ceph_pg_total - ceph_pg_active > 0'
    for: 30m
    labels:
    severity: critical
    annotations:
    summary: Ceph PG unavailable (instance {{ $labels.instance }})
    description: "Some Ceph placement groups are unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    [root@promethes rules]#

    6.3监控jenkins

    [root@promethes rules]# cat jenkins-plugin.yml
    groups:
    - name: MetricPlugin
    rules:
    - alert: JenkinsOffline
    expr: 'jenkins_node_offline_value > 1'
    for: 30m
    labels:
    severity: critical
    annotations:
    summary: Jenkins offline (instance {{ $labels.instance }})
    description: "Jenkins offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: JenkinsHealthcheck
    expr: 'jenkins_health_check_score < 1'
    for: 30m
    labels:
    severity: critical
    annotations:
    summary: Jenkins healthcheck (instance {{ $labels.instance }})
    description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: JenkinsOutdatedPlugins
    expr: 'sum(jenkins_plugins_withUpdate) by (instance) > 3'
    for: 1d
    labels:
    severity: warning
    annotations:
    summary: Jenkins outdated plugins (instance {{ $labels.instance }})
    description: "{{ $value }} plugins need update\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: JenkinsBuildsHealthScore
    expr: 'default_jenkins_builds_health_score < 1'
    for: 30m
    labels:
    severity: critical
    annotations:
    summary: Jenkins builds health score (instance {{ $labels.instance }})
    description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: JenkinsRunFailureTotal
    expr: 'delta(jenkins_runs_failure_total[1h]) > 100'
    for: 30m
    labels:
    severity: warning
    annotations:
    summary: Jenkins run failure total (instance {{ $labels.instance }})
    description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: JenkinsBuildTestsFailing
    expr: 'default_jenkins_builds_last_build_tests_failing > 0'
    for: 30m
    labels:
    severity: warning
    annotations:
    summary: Jenkins build tests failing (instance {{ $labels.instance }})
    description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: JenkinsLastBuildFailed
    expr: 'default_jenkins_builds_last_build_result_ordinal == 2'
    for: 30m
    labels:
    severity: warning
    annotations:
    summary: Jenkins last build failed (instance {{ $labels.instance }})
    description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    [root@promethes rules]#

    6.4监控postgresql

    [root@promethes rules]# cat postgres-exporter.yml
    groups:
    - name: PostgresExporter
    rules:
    - alert: PostgresqlDown
    expr: 'pg_up == 0'
    for: 0m
    labels:
    severity: critical
    annotations:
    summary: Postgresql down (instance {{ $labels.instance }})
    description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlRestarted
    expr: 'time() - pg_postmaster_start_time_seconds < 60'
    for: 0m
    labels:
    severity: critical
    annotations:
    summary: Postgresql restarted (instance {{ $labels.instance }})
    description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlExporterError
    expr: 'pg_exporter_last_scrape_error > 0'
    for: 0m
    labels:
    severity: critical
    annotations:
    summary: Postgresql exporter error (instance {{ $labels.instance }})
    description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlTableNotAutoVacuumed
    expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
    for: 0m
    labels:
    severity: warning
    annotations:
    summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
    description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlTableNotAutoAnalyzed
    expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
    for: 0m
    labels:
    severity: warning
    annotations:
    summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
    description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlTooManyConnections
    expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
    for: 2m
    labels:
    severity: warning
    annotations:
    summary: Postgresql too many connections (instance {{ $labels.instance }})
    description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlNotEnoughConnections
    expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
    for: 2m
    labels:
    severity: warning
    annotations:
    summary: Postgresql not enough connections (instance {{ $labels.instance }})
    description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlDeadLocks
    expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
    for: 0m
    labels:
    severity: warning
    annotations:
    summary: Postgresql dead locks (instance {{ $labels.instance }})
    description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlHighRollbackRate
    expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
    for: 0m
    labels:
    severity: warning
    annotations:
    summary: Postgresql high rollback rate (instance {{ $labels.instance }})
    description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlCommitRateLow
    expr: 'rate(pg_stat_database_xact_commit[1m]) < 10'
    for: 2m
    labels:
    severity: critical
    annotations:
    summary: Postgresql commit rate low (instance {{ $labels.instance }})
    description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlLowXidConsumption
    expr: 'rate(pg_txid_current[1m]) < 5'
    for: 2m
    labels:
    severity: warning
    annotations:
    summary: Postgresql low XID consumption (instance {{ $labels.instance }})
    description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlHighRateStatementTimeout
    expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
    for: 0m
    labels:
    severity: critical
    annotations:
    summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
    description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlHighRateDeadlock
    expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
    for: 0m
    labels:
    severity: critical
    annotations:
    summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
    description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlUnusedReplicationSlot
    expr: 'pg_replication_slots_active == 0'
    for: 1m
    labels:
    severity: warning
    annotations:
    summary: Postgresql unused replication slot (instance {{ $labels.instance }})
    description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlTooManyDeadTuples
    expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1'
    for: 2m
    labels:
    severity: warning
    annotations:
    summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
    description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlConfigurationChanged
    expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
    for: 0m
    labels:
    severity: info
    annotations:
    summary: Postgresql configuration changed (instance {{ $labels.instance }})
    description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlSslCompressionActive
    expr: 'sum(pg_stat_ssl_compression) > 0'
    for: 0m
    labels:
    severity: critical
    annotations:
    summary: Postgresql SSL compression active (instance {{ $labels.instance }})
    description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlTooManyLocksAcquired
    expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
    for: 2m
    labels:
    severity: critical
    annotations:
    summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
    description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlBloatIndexHigh(>80%)
    expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
    for: 1h
    labels:
    severity: warning
    annotations:
    summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
    description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: PostgresqlBloatTableHigh(>80%)
    expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
    for: 1h
    labels:
    severity: warning
    annotations:
    summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
    description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    [root@promethes rules]#

    6.5发送邮件模板

    [root@promethes templates]# cat email.tmpl
    {{ define "test.html" }}
    {{ range .Alerts }}
    =========start==========
    告警程序: {{ .Labels.job }}
    告警级别: {{ .Labels.severity }} 级
    告警类型: {{ .Labels.alertname }}
    故障主机: {{ .Labels.instance }}
    告警主题: {{ .Annotations.summary }}
    触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
    =========end==========
    {{ end }}
    {{ end }}
    [root@promethes templates]#
  • 相关阅读:
    【QT】Qt项目demo:数据在ui界面上显示,鼠标双击可弹窗显示具体信息
    Exoplayer简介
    嘉为蓝鲸受邀参加2022(第十届)国际智慧机场发展论坛
    Python之hello, world
    Sentinel的另外三种流控模式(附代码详细介绍)
    栈的基本操作
    HCIP---企业网的三层架构
    python——json
    八、【快速选择工具组】
    Leetcode85. 最大矩形
  • 原文地址:https://www.cnblogs.com/zttong/p/17685161.html