• clickhouse 业务日志告警


    一、需求
    对入库到clickhouse的业务日志进行告警,达到阈值后发送企业微信告警。

    方法一、
    fluent-bit --> clickhouse(http) <-- shell脚本,每隔一分钟获取分析结果 --> 把结果保存到/dev/shm/目录下 <-- node_exporter读取指标入库到prometheus <-- rules根据告警规则产生告警 --> alertmanager --> webhook --> 企业微信。
    方法二、
    fluent-bit --> clickhouse(http) <-- python脚本,每隔一分钟获取分析结果 --> pushgateway --> 指标入库到prometheus <-- rules根据告警规则产生告警 --> alertmanager --> webhook --> 企业微信。

    二、告警组件
    clickhouse
    prometheus
    alertmanager
    node_exporter+查询脚本或者(python脚本+pushgateway)
    webhook

    三、clickhouse搭建和建表
    业务日志库

    四、node_exporter
    启动参数添加 --collector.textfile.directory=/dev/shm/

    # systemd unit that runs node_exporter with the textfile collector
    # pointed at /dev/shm/.
    [Unit]
    Description=node_exporter Service
    Wants=network-online.target
    After=network.target network-online.target

    [Service]
    Type=simple
    WorkingDirectory=/data/node_exporter
    # Always restart the exporter, waiting 5 seconds between attempts.
    Restart=always
    RestartSec=5
    # --collector.textfile.directory makes node_exporter read the *.prom
    # files placed in /dev/shm/.
    ExecStart=/data/node_exporter/node_exporter \
    --web.config.file=/data/node_exporter/etc/config.yml \
    --web.listen-address=:19100 \
    --web.telemetry-path=/metrics \
    --collector.textfile.directory=/dev/shm/ \
    --collector.filesystem.mount-points-exclude="^/(sys|proc|dev|host|etc|var/lib/docker/.+|var/lib/kubelet/.+)($|/)" \
    --collector.systemd \
    --collector.systemd.unit-include="(docker|sshd|isg|sgadmin).service"

    [Install]
    WantedBy=multi-user.target
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23

    五、shell脚本
    使用crontab定时,一分钟执行一次

    #!/usr/bin/env bash
    #
    # Export per-site and per-API request success rates from ClickHouse as
    # Prometheus gauges for the node_exporter textfile collector.
    # Writes /dev/shm/site_rate.prom and /dev/shm/api_rate.prom.
    # Meant to be run once a minute (e.g. from crontab).

    set -e

    # ClickHouse host
    ch_host=xx.xx.xx.xx
    # ClickHouse port
    ch_port=9000
    # ClickHouse user
    ch_user=xxxx
    # ClickHouse password
    ch_password=xxxxxxxxxxxxxxxxxxxx
    # ClickHouse database
    ch_database=xxxxxxxxxxxxxx
    # ClickHouse table
    ch_table=xxxxxxxxxxxxx
    # Seconds by which the query window is shifted into the past
    query_delay=60
    # Directory the metric files are written to
    prom_dir=/dev/shm

    # Run one SQL statement with clickhouse-client inside the "ch" container
    # and print the result rows with carriage returns removed.
    #   $1 - SQL text
    run_query() {
      docker exec -i ch clickhouse-client \
        --user="${ch_user}" --password="${ch_password}" \
        --host "${ch_host}" --port "${ch_port}" -n -m -q "$1" | tr -d '\r'
    }

    # Query ClickHouse and publish the rows as one gauge metric file.
    #   $1 - metric name; also the base name of the .prom file
    #   $2 - name of the label that carries the first result column
    #   $3 - SQL returning four columns: key, success count, total, rate
    # The file is built as <metric>.prom.tmp and then renamed, so a
    # half-written file is never left under the final name.
    write_rate_metrics() {
      local metric="$1" label="$2" sql="$3"
      local tmp_file="${prom_dir}/${metric}.prom.tmp"
      local key suc total val

      # Truncate (not append) so leftovers from an aborted run are discarded,
      # and send header and samples to the same temp file.
      {
        printf '# HELP %s\n# TYPE %s gauge\n' "${metric}" "${metric}"
        # Rows are read as tab-separated columns (assumes clickhouse-client's
        # default batch output format - TODO confirm), so a path containing
        # spaces or glob characters stays in one field.
        run_query "${sql}" | while IFS=$'\t' read -r key suc total val; do
          # Skip rows that do not carry all four columns.
          [[ -n "${val}" ]] || continue
          # Escape double quotes so the label value stays well-formed.
          key=${key//\"/\\\"}
          printf '%s{%s="%s",suc="%s",total="%s"} %s\n' \
            "${metric}" "${label}" "${key}" "${suc}" "${total}" "${val}"
        done
      } > "${tmp_file}"

      mv -f "${tmp_file}" "${prom_dir}/${metric}.prom"
    }

    # Because ingestion lags, both queries cover the one-minute window that
    # ends ${query_delay} seconds before now.

    # Per-site success rate, aggregated on the first path segment.
    site_sql="SELECT splitByChar('/',req_path)[2] as paasid , round(sum(if((toInt64(res_statuscode) >= 200) AND (toInt64(res_statuscode) < 400), 1, 0))) as suc, count(1) as total , round(sum(if((toInt64(res_statuscode) >= 200) AND (toInt64(res_statuscode) < 400), 1, 0)) / count(1)*100, 5) AS val FROM ${ch_database}.${ch_table} PREWHERE (create_time >= toDateTime(now() - 60 - ${query_delay})) AND (create_time < toDateTime(now() - ${query_delay})) GROUP BY paasid HAVING total >= 5  ORDER BY val DESC"

    write_rate_metrics site_rate site_path "${site_sql}"

    #------------------------------------
    # Per-API success rate for request paths under /ebus/.
    api_sql="SELECT req_path , round(sum(if((toInt64(res_statuscode) >= 200) AND (toInt64(res_statuscode) < 400), 1, 0))) as suc, count(1) as total , round(sum(if((toInt64(res_statuscode) >= 200) AND (toInt64(res_statuscode) < 400), 1, 0)) / count(1)*100, 5) AS val FROM ${ch_database}.${ch_table} PREWHERE req_path like '/ebus/%' and  (create_time >= toDateTime(now() - 60 - ${query_delay})) AND (create_time < toDateTime(now() - ${query_delay})) GROUP BY req_path HAVING total >= 3 ORDER BY val DESC"

    write_rate_metrics api_rate api_path "${api_sql}"
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68

    #脚本生成结果1

    cat /dev/shm/site_rate.prom 
    # HELP site_rate
    # TYPE site_rate gauge
    site_rate{site_path="/metrics/",suc="49",total="49"} 100
    site_rate{site_path="/grafana/",suc="9",total="9"} 100
    site_rate{site_path="/dail_healthcheck/",suc="16",total="16"} 100
    site_rate{site_path="/abcyhzx5/",suc="64",total="64"} 100
    site_rate{site_path="/abcapm/",suc="30",total="32"} 93.75
    site_rate{site_path="/abc/",suc="333",total="370"} 90
    site_rate{site_path="/ebus/",suc="2",total="14"} 14.28571
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10

    六、prometheus告警规则

    # Success-rate alert rules; both fire immediately (for: 0m) when the
    # gauge is at or below 85.
    groups:
      - name: 接口成功率-监控告警
        rules:
          # API success rate, one series per (api_path, suc, total).
          - alert: 接口成功率低于85%
            expr: 'avg by (api_path,suc,total) (api_rate)  <= 85'
            for: 0m
            labels:
              severity: 一般
              alert: api
            annotations:
              description: |-
                接口成功率低于85%
                (suc:{{$labels.suc}} total:{{$labels.total}})
                成功率:{{printf "%.0f" $value}}%
          # Site success rate, one series per (site_path, suc, total).
          - alert: 站点成功率低于85%
            expr: 'avg by (site_path,suc,total) (site_rate)  <= 85'
            for: 0m
            labels:
              severity: 一般
              alert: api
            annotations:
              description: |-
                站点成功率低于85%
                (suc:{{$labels.suc}} total:{{$labels.total}})
                成功率:{{printf "%.0f" $value}}%
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19

    七、alertmanager

    # Alertmanager configuration: SMTP defaults, routing tree, receivers
    # and inhibition rules.
    global:
      resolve_timeout: 1m
      smtp_from: "xxxxxxxx@qq.com"
      smtp_smarthost: "smtp.qq.com:465"
      smtp_auth_username: "xxxxxx@qqq.com"
      smtp_auth_password: "XXXXXX"
      smtp_require_tls: false
      smtp_hello: "qq.com"

    templates:
      # E-mail template file (path inside the container).
      - "/etc/alertmanager/email.tmpl"

    route:
      receiver: "ding2wechat"
      # Group alerts by alertname.
      group_by:
        - alertname
      # Alerts of the same group arriving within this window are sent together.
      group_wait: 1m
      # Interval between notifications for a group.
      group_interval: 10m
      # An unchanged alert is re-sent only after 30m; in practice this is
      # roughly (10+30)m.
      repeat_interval: 30m
      routes:
        # match_re can be used instead for regex matching.
        - match:
            severity: 严重
          # On match, deliver to the receiver named ding2wechat below.
          receiver: "ding2wechat"
        - match:
            alert: api
          # On match, deliver to the receiver named api_ding2wechat below.
          receiver: "api_ding2wechat"
          repeat_interval: 24h
          group_interval: 1m

    receivers:
      # WeCom robot 2: goes through prometheus-webhook-dingtalk, then
      # ding2wechat.
      - name: "ding2wechat"
        webhook_configs:
          - url: "http://172.xxx.xxx.xxx:8060/dingtalk/ding2wechat/send"
            send_resolved: true

      - name: "api_ding2wechat"
        webhook_configs:
          # No "resolved" notification is needed for these alerts.
          - url: "http://172.xxx.xxx.xxx:8060/dingtalk/ding2wechat/send"
            send_resolved: false

      - name: "email"
        email_configs:
          - to: "xxxxxxxx@qq.com"
            html: '{{ template "email.jwolf.html" . }}'
            send_resolved: true

    # Inhibition: a firing critical alert suppresses warning alerts that
    # share the same alertname and instance.
    # NOTE(review): the alert rules shown in this article label severity as
    # 一般, and the route above matches 严重; this rule only applies to
    # alerts labelled critical/warning - confirm that is intended.
    inhibit_rules:
      - source_match:
          severity: "critical"
        target_match:
          severity: "warning"
        equal:
          - alertname
          - instance
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61

    在这里插入图片描述

  • 相关阅读:
    基于AI深度学习的安全帽检测算法,如何应用在实际场景中?
    模板与泛型编程值typelist实现
    数组扁平化的方法
    黄州科目三
    接口测试需要验证数据库么?
    非洲美食多样性而丰富多彩
    两个数据源的分页实现
    MySQL:库操作 | 表操作
    国内首家!阿里云 Elasticsearch 8.9 版本释放 AI 搜索新动能
    Java常见注解及其使用汇总
  • 原文地址:https://blog.csdn.net/u010533742/article/details/133030611