• Prometheus monitoring an external k8s cluster


    node-exporter

    vim node-exporter-ds.yaml

    apiVersion: apps/v1
    kind: DaemonSet
    metadata:
      name: prometheus-node-exporter
      namespace: prom
      labels:
        app: prometheus
        component: node-exporter
    spec:
      selector:
        matchLabels:
          app: prometheus
          component: node-exporter
      template:
        metadata:
          name: prometheus-node-exporter
          labels:
            app: prometheus
            component: node-exporter
        spec:
          tolerations:
          - key: node-role.kubernetes.io/etcd
            effect: NoExecute
            operator: "Exists"
          - key: node-role.kubernetes.io/controlplane
            effect: NoSchedule
            operator: "Exists"
          containers:
          - image: prom/node-exporter:v0.18.1
            name: prometheus-node-exporter
            ports:
            - name: prom-node-exp
              containerPort: 9100
              hostPort: 9100
          hostNetwork: true
          hostPID: true
    
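    To roll this out and confirm a pod lands on every node (the tolerations above allow scheduling onto controlplane and etcd nodes), something like the following should work, assuming the prom namespace already exists:

    kubectl apply -f node-exporter-ds.yaml
    # every node, including control-plane and etcd nodes, should run one pod
    kubectl -n prom get ds prometheus-node-exporter
    kubectl -n prom get pods -o wide -l component=node-exporter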

    vim node-exporter-svc.yaml

    apiVersion: v1
    kind: Service
    metadata:
      annotations:
        prometheus.io/scrape: 'true'
      name: prometheus-node-exporter
      namespace: prom
      labels:
        app: prometheus
        component: node-exporter
    spec:
      clusterIP: None
      ports:
        - name: prometheus-node-exporter
          port: 9100
          protocol: TCP
      selector:
        app: prometheus
        component: node-exporter
      type: ClusterIP
    
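    Because the pods use hostNetwork with hostPort 9100, the metrics are reachable on each node's own IP. A quick sanity check (10.10.95.11 is one of the master IPs scraped later in prometheus.yml):

    kubectl apply -f node-exporter-svc.yaml
    # any node IP works; node_* series confirm the exporter is up
    curl -s http://10.10.95.11:9100/metrics | head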

    kube-state-metrics

    vim kube-state-metrics-rbac.yaml

    ---
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      name: kube-state-metrics
      namespace: prom
    ---
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRole
    metadata:
      name: kube-state-metrics
    rules:
    - apiGroups: [""]
      resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
      verbs: ["list", "watch"]
    - apiGroups: ["apps"]
      resources: ["daemonsets", "deployments", "replicasets", "statefulsets"]
      verbs: ["list", "watch"]
    - apiGroups: ["batch"]
      resources: ["cronjobs", "jobs"]
      verbs: ["list", "watch"]
    - apiGroups: ["autoscaling"]
      resources: ["horizontalpodautoscalers"]
      verbs: ["list", "watch"]
    ---
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRoleBinding
    metadata:
      name: kube-state-metrics
    roleRef:
      apiGroup: rbac.authorization.k8s.io
      kind: ClusterRole
      name: kube-state-metrics
    subjects:
    - kind: ServiceAccount
      name: kube-state-metrics
      namespace: prom
    
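    Before deploying, you can verify the binding took effect by asking the API server what the service account is allowed to do; a minimal check:

    kubectl apply -f kube-state-metrics-rbac.yaml
    # should print "yes" once the ClusterRoleBinding is active
    kubectl auth can-i list pods --as=system:serviceaccount:prom:kube-state-metrics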

    vim kube-state-metrics-deploy.yaml

    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: kube-state-metrics
      namespace: prom
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: kube-state-metrics
      template:
        metadata:
          labels:
            app: kube-state-metrics
        spec:
          serviceAccountName: kube-state-metrics
          containers:
          - name: kube-state-metrics
            image: harbor.com.cn/library/kube-state-metrics:2.4.2
    #        image: gcr.io/google-containers/kube-state-metrics-amd64:v1.9.5
            ports:
            - containerPort: 8080
    

    vim kube-state-metrics-svc.yaml

    apiVersion: v1
    kind: Service
    metadata:
      annotations:
        prometheus.io/scrape: 'true'
      name: kube-state-metrics
      namespace: prom
      labels:
        app: kube-state-metrics
    spec:
      ports:
      - name: kube-state-metrics
        port: 8080
        protocol: TCP
      selector:
        app: kube-state-metrics
    
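    To confirm kube-state-metrics is serving data without exposing it outside the cluster, a temporary port-forward is enough; a sketch:

    kubectl apply -f kube-state-metrics-deploy.yaml -f kube-state-metrics-svc.yaml
    # kube_* series such as kube_pod_info should appear
    kubectl -n prom port-forward svc/kube-state-metrics 8080:8080 &
    curl -s http://127.0.0.1:8080/metrics | grep -m 5 '^kube_'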

    blackbox

    vim cm.yaml

    apiVersion: v1
    kind: ConfigMap
    metadata:
      labels:
        app: blackbox-exporter
      name: blackbox-exporter
      namespace: kube-system
    data:
      blackbox.yml: |-
        modules:
          http_2xx:
            prober: http
            timeout: 2s
            http:
              valid_http_versions: ["HTTP/1.1", "HTTP/2"]
              valid_status_codes: [200,301,302]
              method: GET
              preferred_ip_protocol: "ip4"
          tcp_connect:
            prober: tcp
            timeout: 2s
    

    vim dp.yaml

    kind: Deployment
    apiVersion: apps/v1
    metadata:
      name: blackbox-exporter
      namespace: kube-system
      labels:
        app: blackbox-exporter
      annotations:
        deployment.kubernetes.io/revision: "1"
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: blackbox-exporter
      template:
        metadata:
          labels:
            app: blackbox-exporter
        spec:
          volumes:
          - name: config
            configMap:
              name: blackbox-exporter
              defaultMode: 420
          containers:
          - name: blackbox-exporter
            image: harbor.com.cn/library/blackbox-exporter:v0.15.1
            imagePullPolicy: IfNotPresent
            args:
            - --config.file=/etc/blackbox_exporter/blackbox.yml
            - --log.level=info
            - --web.listen-address=:9115
            ports:
            - name: blackbox-port
              containerPort: 9115
              protocol: TCP
            resources:
              limits:
                cpu: 500m
                memory: 512Mi
              requests:
                cpu: 200m
                memory: 256Mi
            volumeMounts:
            - name: config
              mountPath: /etc/blackbox_exporter
            readinessProbe:
              tcpSocket:
                port: 9115
              initialDelaySeconds: 5
              timeoutSeconds: 5
              periodSeconds: 10
              successThreshold: 1
              failureThreshold: 3
    

    vim svc.yaml

    kind: Service
    apiVersion: v1
    metadata:
      name: blackbox-exporter
      namespace: kube-system
    spec:
      selector:
        app: blackbox-exporter
      ports:
      - port: 9115
        protocol: TCP
        targetPort: 9115
    

    vim ingress.yaml

    apiVersion: networking.k8s.io/v1
    kind: Ingress
    metadata:
      name: blackbox-exporter
      namespace: kube-system
      annotations:
        kubernetes.io/ingress.class: "nginx"
    spec:
      rules:
      - host: blackbox.com.cn
        http:
          paths:
          - path: /
            pathType: ImplementationSpecific
            backend:
              service:
                name: blackbox-exporter
                port:
                  number: 9115
    
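    Once blackbox.com.cn resolves to the ingress controller, the exporter can be exercised by hand; the probe target below is only an example:

    kubectl apply -f cm.yaml -f dp.yaml -f svc.yaml -f ingress.yaml
    # probe_success 1 means the http_2xx module considers the target healthy
    curl -s 'http://blackbox.com.cn/probe?module=http_2xx&target=http://www.example.com' | grep probe_success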

    cadvisor

    docker run \
      --volume=/:/rootfs:ro \
      --volume=/var/run:/var/run:rw \
      --volume=/sys:/sys:ro \
      --volume=/var/lib/docker:/var/lib/docker:ro \
      --volume=/dev/disk/:/dev/disk:ro \
      --publish=8080:8080 \
      --detach=true \
      --name=cadvisor \
      --privileged=true \
      google/cadvisor:latest
    
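    This container is started on every node that the 'container' job in prometheus.yml below will scrape. Once it is running, the container_* series can be checked locally:

    # cAdvisor answers on the published port 8080
    curl -s http://localhost:8080/metrics | grep -m 3 '^container_'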

    Note: in this lab, Prometheus itself is deployed from the binary release (outside the cluster).

    prometheus

    Obtain a k8s token and write it to the k8s-token.conf file; Prometheus uses it to connect to the external k8s cluster.

    # create a service account
    kubectl create serviceaccount dashboard-admin -n kube-system

    # bind it to the cluster-admin role
    kubectl create clusterrolebinding dashboard-cluster-admin --clusterrole=cluster-admin --serviceaccount=kube-system:dashboard-admin

    # view the token
    kubectl -n kube-system get secrets | grep dashboard-admin
    kubectl -n kube-system describe secrets dashboard-admin-token-bqnz5
    
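    To write the raw token into k8s-token.conf (the file referenced by bearer_token_file in prometheus.yml below), decode it from the secret; the bqnz5 suffix is cluster-specific, so substitute your own. Note that on Kubernetes 1.24+ these token secrets are no longer auto-created; kubectl create token dashboard-admin -n kube-system generates one instead.

    # decode the service-account token and save it for Prometheus
    kubectl -n kube-system get secret dashboard-admin-token-bqnz5 \
      -o jsonpath='{.data.token}' | base64 -d > /prometheus/prometheus/k8s-token.conf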

    vim prometheus.yml

    global:
      scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
      evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).
    
    # Alertmanager configuration
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ['192.168.51.159:9093']
          # - alertmanager:9093
    
    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      # - "first_rules.yml"
      # - "second_rules.yml"
      - "/prometheus/prometheus/rules/rules.yml"
    
    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: 'prometheus'
    
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
    
        static_configs:
        - targets: ['localhost:9090']
    
    
      - job_name: 'vcloud'
        metrics_path: /actuator/prometheus
        file_sd_configs:
        - files:
          - /prometheus/prometheus/file_config/vcloud/*.json
          refresh_interval: 10s
        relabel_configs:
          - source_labels: [appname]
            action: replace
            target_label: appname
    
    
      - job_name: 'rabbitmq'
        scrape_interval: 60s
        scrape_timeout: 60s
        static_configs:
        - targets: ['192.168.51.109:9090']
    
      - job_name: 'etcd'
        scheme: https
        tls_config:
          ca_file: /prometheus/prometheus/ca.crt
          cert_file: /prometheus/prometheus/server.crt
          key_file: /prometheus/prometheus/server.key
        static_configs:
        - targets:
          - '10.10.95.11:2379'
          - '10.10.95.12:2379'
          - '10.10.95.13:2379'
    
      - job_name: 'k8s-master'
        static_configs:
        - targets: ['10.10.95.11:9100', '10.10.95.12:9100', '10.10.95.13:9100']
    
      - job_name: 'k8s-node'
        static_configs:
        - targets: ['10.10.95.21:9100', '10.10.95.22:9100', '10.10.95.23:9100']
    
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
        - role: endpoints
          api_server: https://10.10.95.18:6443
          bearer_token_file: /prometheus/prometheus/k8s-token.conf
          tls_config:
            insecure_skip_verify: true
        bearer_token_file: /prometheus/prometheus/k8s-token.conf
        tls_config:
          insecure_skip_verify: true
        scheme: https
        relabel_configs:
        - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
          action: keep
          regex: default;kubernetes;https
    
      - job_name: 'kubernetes-kubelet'
        kubernetes_sd_configs:
        - role: node
          api_server: https://10.10.95.18:6443
          bearer_token_file: /prometheus/prometheus/k8s-token.conf
          tls_config:
            insecure_skip_verify: true
        bearer_token_file: /prometheus/prometheus/k8s-token.conf
        tls_config:
          insecure_skip_verify: true
        scheme: https
        relabel_configs:
        - action: labelmap
          regex: __meta_kubernetes_node_label_(.+)
        - source_labels: [__meta_kubernetes_node_name]
          regex: (.+)
          target_label: __address__
          replacement: ${1}:10250
    
    
      - job_name: kubernetes-nodes-cadvisor
        metrics_path: /metrics
        scheme: https
        kubernetes_sd_configs:
        - role: node
          api_server: https://10.10.95.18:6443
          bearer_token_file: /prometheus/prometheus/k8s-token.conf
          tls_config:
            insecure_skip_verify: true
        bearer_token_file: /prometheus/prometheus/k8s-token.conf
        tls_config:
          insecure_skip_verify: true
        relabel_configs:
        - action: labelmap
          regex: __meta_kubernetes_node_label_(.*)
        - action: replace
          regex: (.*)
          source_labels: ["__address__"]
          target_label: __address__
          replacement: 10.10.95.18:6443
        - action: replace
          source_labels: [__meta_kubernetes_node_name]
          target_label: __metrics_path__
          regex: (.*)
          replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    
      - job_name: 'container'
        static_configs:
        - targets: ['10.10.95.11:8080', '10.10.95.12:8080', '10.10.95.13:8080','10.10.95.21:8080', '10.10.95.22:8080', '10.10.95.23:8080']
          labels:
            group: container
    
      - job_name: 'blackbox_http_service_probe'
        metrics_path: /probe
        kubernetes_sd_configs:
        - role: service
          api_server: https://10.10.95.18:6443
          bearer_token_file: /prometheus/prometheus/k8s-token.conf
          tls_config:
            insecure_skip_verify: true
        bearer_token_file: /prometheus/prometheus/k8s-token.conf
        tls_config:
          insecure_skip_verify: true
        params:
          module: [http_2xx]
        relabel_configs:
        - source_labels: [__meta_kubernetes_service_annotation_blackbox_scheme]
          action: keep
          regex: http
        - source_labels: [__address__, __meta_kubernetes_service_annotation_blackbox_port,  __meta_kubernetes_service_annotation_blackbox_path]
          action: replace
          regex: ([^:]+)(?::\d+)?;(\d+);(.+)
          replacement: $1:$2$3
          target_label: __param_target
        - action: replace
          target_label: __address__
          replacement: blackbox.com.cn:80
        - source_labels: [__param_target]
          target_label: instance
        - action: labelmap
          regex: __meta_kubernetes_service_label_(.+)
        - source_labels: [__meta_kubernetes_namespace]
          action: replace
          target_label: kubernetes_namespace
        - source_labels: [__meta_kubernetes_service_name]
          action: replace
          target_label: kubernetes_service_name
    
      - job_name: 'blackbox_http_pod_probe'
        metrics_path: /probe
        kubernetes_sd_configs:
        - role: pod
          api_server: https://10.10.95.18:6443
          bearer_token_file: /prometheus/prometheus/k8s-token.conf
          tls_config:
            insecure_skip_verify: true
        bearer_token_file: /prometheus/prometheus/k8s-token.conf
        tls_config:
          insecure_skip_verify: true
        params:
          module: [http_2xx]
        relabel_configs:
        - source_labels: [__meta_kubernetes_pod_annotation_blackbox_scheme]
          action: keep
          regex: http
        - source_labels: [__address__, __meta_kubernetes_pod_annotation_blackbox_port,  __meta_kubernetes_pod_annotation_blackbox_path]
          action: replace
          regex: ([^:]+)(?::\d+)?;(\d+);(.+)
          replacement: $1:$2$3
          target_label: __param_target
        - action: replace
          target_label: __address__
          replacement: blackbox.com.cn:80
        - source_labels: [__param_target]
          target_label: instance
        - action: labelmap
          regex: __meta_kubernetes_pod_label_(.+)
        - source_labels: [__meta_kubernetes_namespace]
          action: replace
          target_label: kubernetes_namespace
        - source_labels: [__meta_kubernetes_pod_name]
          action: replace
          target_label: kubernetes_pod_name
      
      - job_name: 'blackbox_tcp_pod_probe'
        metrics_path: /probe
        kubernetes_sd_configs:
        - role: pod
          api_server: https://10.10.95.18:6443
          bearer_token_file: /prometheus/prometheus/k8s-token.conf
          tls_config:
            insecure_skip_verify: true
        bearer_token_file: /prometheus/prometheus/k8s-token.conf
        tls_config:
          insecure_skip_verify: true
        params:
          module: [tcp_connect]
        relabel_configs:
        - source_labels: [__meta_kubernetes_pod_annotation_blackbox_scheme]
          action: keep
          regex: tcp
        - source_labels: [__address__, __meta_kubernetes_pod_annotation_blackbox_port]
          action: replace
          regex: ([^:]+)(?::\d+)?;(\d+)
          replacement: $1:$2
          target_label: __param_target
        - action: replace
          target_label: __address__
          replacement: blackbox.com.cn:80
        - source_labels: [__param_target]
          target_label: instance
        - action: labelmap
          regex: __meta_kubernetes_pod_label_(.+)
        - source_labels: [__meta_kubernetes_namespace]
          action: replace
          target_label: kubernetes_namespace
        - source_labels: [__meta_kubernetes_pod_name]
          action: replace
          target_label: kubernetes_pod_name
    
    
      - job_name: 'k8s-pods'
        kubernetes_sd_configs:
        - role: pod  
          api_server: https://10.10.95.18:6443
          bearer_token_file: /prometheus/prometheus/k8s-token.conf
          tls_config:
            insecure_skip_verify: true
        bearer_token_file: /prometheus/prometheus/k8s-token.conf
        tls_config:
          insecure_skip_verify: true
        relabel_configs:
        - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
          action: keep
          regex: true
        - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
          action: replace
          target_label: __metrics_path__
          regex: (.+)
        - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
          action: replace
          regex: ([^:]+)(?::\d+)?;(\d+)
          replacement: $1:$2
          target_label: __address__
        - action: labelmap
          regex: __meta_kubernetes_pod_label_(.+)
        - source_labels: [__meta_kubernetes_namespace]
          action: replace
          target_label: kubernetes_namespace
        - source_labels: [__meta_kubernetes_pod_name]
          action: replace
          target_label: kubernetes_pod_name
    
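    Before (re)starting the binary, it is worth validating the file with promtool, which ships in the same tarball as prometheus; a sketch assuming the /prometheus/prometheus layout used in the paths above:

    cd /prometheus/prometheus
    ./promtool check config prometheus.yml
    # start (or restart) the server with the validated configuration
    ./prometheus --config.file=prometheus.yml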

    vim rules.yml

    groups:
    - name: test-rules
      rules:
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          severity: Disaster
        annotations:
          summary: "Instance {{ $labels.instance }} down."
    
      - alert: rabbitmqDown
        expr: rabbitmq_running == 0
        for: 2m
        labels:
          severity: Disaster
        annotations:
          summary: "Instance {{ $labels.node }} down."
    
    - name: http_status
      rules:
      - alert: BlackboxSlowPing
        expr: probe_icmp_duration_seconds > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Blackbox slow ping (instance {{ $labels.instance }})"
          description: "Blackbox ping took more than 2s (current value: {{ $value }})"
      
      - alert: BlackboxSlowRequests
        expr: probe_http_duration_seconds > 2 
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Blackbox slow requests (instance {{ $labels.instance }})"
          description: "Blackbox request took more than 2s (current value: {{ $value }})"
    
      - alert: ProbeFailed
        expr: probe_success == 0
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "Probe failed (instance {{ $labels.instance }})"
          description: "Probe failed (current value: {{ $value }})"
    
      - alert: StatusCode
        expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "Status Code (instance {{ $labels.instance }})"
          description: "HTTP status code is not 200-399 (current value: {{ $value }})"
    
    - name: node_alerts
      rules:
      - alert: NodeMemoryWarning
        expr: node_memory_MemAvailable_bytes/1024/1024 <= 2014
        for: 30s
        labels:
          severity: Disaster
        annotations:
          summary: "Node {{ $labels.instance }} available memory is below 2014 MB"
    
      - alert: NodeMemoryDisaster
        expr: ((node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes) * 100 >= 80
        for: 30s
        labels:
          severity: Critical
        annotations:
          summary: "Node {{ $labels.instance }} memory usage is above 80%"
    
      - alert: NodeCPUUsage
        expr: 100 * (1 - sum by (instance)(increase(node_cpu_seconds_total{mode="idle"}[5m])) / sum by (instance)(increase(node_cpu_seconds_total[5m]))) > 80
        for: 2m
        labels:
          team: node
        annotations:
          summary: "{{$labels.instance}}: High CPU usage detected"
          description: "{{$labels.instance}}: CPU usage is above 80% (current value is: {{ $value }})"
    
      - alert: NodeFilesystemUsage
        expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype=~"ext4|xfs"} * 100) / node_filesystem_size_bytes {mountpoint="/",fstype=~"ext4|xfs"}) > 80
        for: 2m
        labels:
          team: node
        annotations:
          summary: "{{$labels.instance}}: High Filesystem usage detected"
          description: "{{$labels.instance}}: Filesystem usage is above 80% (current value is: {{ $value }})"
    
      - alert: InstanceDown
        expr: up == 0
        for: 30s
        labels:
          severity: Disaster
        annotations:
          summary: "Instance {{ $labels.instance }} down."
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 30 seconds."
    
    
      - alert: ticket-test
        expr: sum(jvm_memory_used_bytes{application="vcloud-ticket-api",area="heap"}) / sum(jvm_memory_max_bytes{application="vcloud-ticket-api",area="heap"})*100 > 90
        for: 30s
        labels:
          severity: Disaster
        annotations:
          summary: "JVM Instance {{ $labels.instance }} memory usage > 90%"
          description: "{{ $labels.instance }} has been in status [heap usage > 90%] for more than 1 minutes. current usage ({{ $value }}%)"
    
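    The rule file can be validated the same way before reloading Prometheus:

    ./promtool check rules /prometheus/prometheus/rules/rules.yml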

    test.json

    [
      {
        "targets": [
          "192.168.51.201:8010"
        ],
        "labels": {
          "appname": "api01"
        }
      },
    
      {
        "targets": [
          "192.168.51.202:8010"
        ],
        "labels": {
          "appname": "api02"
        }
      }
    ]
    
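    file_sd targets are picked up without a restart (every refresh_interval, 10s for the vcloud job). Dropping the file into the watched directory and querying the HTTP API confirms registration, assuming Prometheus listens on localhost:9090:

    cp test.json /prometheus/prometheus/file_config/vcloud/
    # each active vcloud target contributes one match
    curl -s 'http://localhost:9090/api/v1/targets?state=active' | grep -o '"job":"vcloud"'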

    alertmanager

    vim alertmanager.yml

    global:
      resolve_timeout: 5m
      smtp_smarthost: 'mail.com.cn:587'
      smtp_from: 'admin@com.cn'
      smtp_auth_username: 'admin@com.cn'
      smtp_auth_password: '123456'
    #  smtp_require_tls: false
    templates:
      - '/prometheus/alertmanager/templates/*.tmpl'
    route:
      group_by: ['service', 'alertname', 'cluster']
      group_interval: 5m
      group_wait: 10s
      repeat_interval: 5m
      receiver: default-receiver
      routes:
      - match_re:
          severity: ^(Critical|Warning|Disaster)$
        receiver: 'email'
    receivers:
    - name: 'default-receiver'
      email_configs:
      - to: 'name1@com.cn,name2@com.cn'
    #    html: '{{ template "dingding.to.html" . }}'
        headers: { Subject: 'Prometheus alert email' }
        send_resolved: true
    - name: 'email'
      email_configs:
      - to: 'name1@com.cn,name2@com.cn'
    #    html: '{{ template "dingding.to.html" . }}'
        headers: { Subject: 'Prometheus alert email' }
        send_resolved: true
    
    ./alertmanager --config.file=alertmanager.yml --web.listen-address=:9093
    
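    amtool, shipped alongside alertmanager, can validate the configuration before starting:

    ./amtool check-config alertmanager.yml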

    Monitoring k8s Service resources

    ---
    apiVersion: v1   
    kind: Service
    metadata:
      annotations:
        blackbox_path: "/"
        blackbox_port: "80"
        blackbox_scheme: "http"
      name: $APPNAME
      namespace: $NAMESPACE
    spec:
      selector:
        appname: $APPNAME
      ports:
      - port: 80
        protocol: TCP
        targetPort: 80
    
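    A Service annotated like this is matched by the blackbox_http_service_probe job above: blackbox_scheme: "http" passes the keep rule, and blackbox_port plus blackbox_path are assembled into the probe target. $APPNAME and $NAMESPACE are placeholders; one way to fill them is envsubst (the values and the service.yaml file name below are hypothetical):

    # substitute the placeholders and apply; values are examples only
    export APPNAME=myapp NAMESPACE=default
    envsubst < service.yaml | kubectl apply -f -
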
  • Original article: https://blog.csdn.net/wuxingge/article/details/125469810