• postgresql 数据库巡检


    主机信息

    CPU

    mpstat | sed -n '3,$p' | awk -F' ' '{print $13}'
    echo 'CPU CORE' && cat /proc/cpuinfo|grep processor|wc -l
    
    • 1
    • 2
    • 正常:空闲cpu大于20%;
    • 异常处理:排查问题,杀掉cpu高进程,top 按c;

    检查内存

    free -m
    
    • 1
    • 正常:空闲内存大于30%;
    • 异常处理:排查问题,杀掉内存高进程,top 按c;

    检查磁盘空间

    df -lh 
    
    • 1
    • 正常:磁盘空间已用空间小于70%;
    • 异常处理:增加硬盘或者删除无用的数据;

    检查IO

    iostat -x | sed -n '6,$p' | awk -F' ' '{print $1,$13,$14}'
    
    • 1
    • 正常:磁盘空间已用空间小于70%;
    • 异常处理:增加硬盘或者删除无用的数据;

    检查端口

    netstat -tanp | grep 'LISTEN' | grep '5432'
    
    • 1
    • 正常:tcp4和tcp6正常监听;
    • 异常处理:排查数据库是否正常启动,排查数据库配置文件的端口参数是否为5432;

    检查postgres进程

    ps -ef | grep "checkpointer\|background writer\|walwriter\|autovacuum launcher\|archiver\|stats collector\|logical replication launcher\|logger" | grep -v grep
    
    • 1
    • 正常:进程都在;
    • 异常处理:重启数据库;

    数据库

    检查安装信息

    select 
         to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,to_char(pg_postmaster_start_time(),'yyyy-mm-dd hh24:mi:ss') "pg_start_time(启动时间)"
        ,now()-pg_postmaster_start_time() "pg_running_time(运行时长)"
        --,inet_server_addr() "server_ip(服务器ip)"
        --,inet_server_port() "server_port(服务器端口)"
        --,inet_client_addr() "client_ip(客户端ip)"
        --,inet_client_port() "client_port(客户端端口)"
        ,version() "server_version(数据库版本)"
        ,(case when pg_is_in_recovery()='f' then 'primary' else 'standby' end ) as  "primary_or_standby(主或备)"
    ;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 正常:数据库正常使用;
    • 异常处理:重装数据库;

    检查postgresql.conf文件

    select 
         to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,sourceline "sourceline(行号)"
        ,name "para(参数名)"
        ,setting "value(参数值)"
    from pg_file_settings
    order by "sourceline(行号)";
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 正常:各项参数设置适合;
    • 异常处理:编辑postgresql.conf文件,修改参数后重启数据库;
    vi $PGDATA/postgresql.conf
    pg_ctl restart -mf
    
    • 1
    • 2

    检查pg_hba.conf文件

    select 
         to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,line_number "line_number(行号)"
        ,type "type(连接类型)"
        ,database "database(数据库名)"
        ,user_name "user_name(用户名)"
        ,address "address(ip地址)"
        ,netmask "netmask(子网掩码)"
        ,auth_method "auth_method(认证方式)"
    from pg_hba_file_rules
    order by "line_number(行号)";
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 正常:非套接字连接都需要md5认证;
    • 异常处理:编辑pg_hba.conf文件,修改参数后重新加载数据库;
    vi $PGDATA/postgresql.conf
    pg_ctl reload
    
    • 1
    • 2

    检查数据库重要配置

    select
         to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,name 
        ,setting
    from
        pg_settings a
    where a.name in (
      'data_directory',
      'port',
      'client_encoding',
      'config_file',
      'hba_file',
      'ident_file',
      'archive_mode',
      'logging_collector',
      'log_directory',
      'log_filename',
      'log_truncate_on_rotation',
      'log_statement',
      'log_min_duration_statement',
      'max_connections', 
      'listen_addresses'
    )
    order by name;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 正常:各项配置都适合;
    • 异常处理:修改不合适的配置;

    检查主从WAl状态

    select 
        -- pid "pid(进程id)"
        --,usename "username(用户名)"
        --,application_name "application_name(应用名)"
        --,client_addr "client_addr(IP)"
        --,backend_start "backend_start(备份开始时间)"
         state "state(WAL发送状态编码)"
    --    ,case 
    --        when state = 'startup' then '正在启动'
    --        when state = 'catchup' then '追赶主库'
    --        when state = 'streaming' then '流传送'
    --        when state = 'backup' then '发送备份'
    --        when state = 'stopping' then '发送停止'
    --     end "statename(WAL状态)"
        ,sync_state "sync_state(同步状态编码)"
    --    ,case 
    --        when sync_state = 'async' then '异步'
    --        when sync_state = 'potential' then '后备失效变同步'
    --        when sync_state = 'sync' then '同步'
    --        when sync_state = 'quorum' then '候选'
    --     end "sync_statename(同步状态名称)"
        --,round(pg_wal_lsn_diff(pg_current_wal_lsn(),replay_lsn) /(1024 * 1024),2) as "slave_latency_mb(同步延迟_MB)"
    from pg_stat_replication;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    select 
        -- pid "pid(进程id)"
         status "status(WAl接收状态)"
        ,'async' "sync_state(同步状态编码)"
        --,last_msg_send_time "last_msg_send_time(接收到最后的消息发送时间)"
        --,last_msg_receipt_time "last_msg_receipt_time(接收到最后的消息接收时间)"
        --,sender_host "sender_host(主库IP)"
    from pg_stat_wal_receiver
    ;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9

    检查表空间

    select 
         to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,spcname AS "Name(名称)"
        ,pg_catalog.pg_get_userbyid(spcowner) AS "Owner(拥有者)"
        --,pg_catalog.pg_tablespace_location(oid) AS "Location(数据文件目录)"
        --,pg_catalog.array_to_string(spcacl, E'\n') AS "Access privileges(访问权限)"
        --,spcoptions AS "Options(参数)"
        ,pg_catalog.pg_size_pretty(pg_catalog.pg_tablespace_size(oid)) AS "Size(表空间大小)"
        --,pg_catalog.shobj_description(oid, 'pg_tablespace') AS "Description(备注)"
    from pg_catalog.pg_tablespace
    order by 1;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    检查连接数

    select
         to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,max_conn "max_conn(最大连接数)"
        ,now_conn "now_conn(当前连接数)"
        ,max_conn - now_conn "remain_conn(剩余连接数)"
    from (
        select
             setting::int8 as max_conn
            ,(select count(*) from pg_stat_activity ) as now_conn
        from pg_settings
        where name = 'max_connections' 
    ) a
    ;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 正常:连接数不超过总连接数的90%;
    • 异常处理:超级用户(postgres)杀连接;
    --杀掉所有空闲连接
    select pg_terminate_backend(pid) from pg_stat_activity WHERE state = 'idle';
    
    • 1
    • 2

    检查锁表

    select 
         to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,relname "relname(表名)"
        ,b.nspname "shemaname(模式名)"
        ,c.rolname "user(用户名)"
        ,d.locktype "locktype(被锁对象类型)"
        ,d.mode "mode(锁类型)"
        ,d.pid "pid(进程id)"
        ,e.query "query(锁表sql)"
        ,current_timestamp-state_change "lock_duration(锁表时长)"
    from pg_class a
    inner join pg_namespace b 
    on (a.relnamespace = b.oid)
    inner join pg_roles c 
    on (a.relowner = c.oid)
    inner join pg_locks d
    on (a.oid = d.relation)
    left join pg_stat_activity e 
    on (d.pid = e.pid)
    where d.mode = 'AccessExclusiveLock'
    order by "lock_duration(锁表时长)" desc;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 正常:无锁表;
    • 异常处理:取消该进程或杀掉该会话;
    --取消该进程
    select pg_cancel_backend(pid);
    --杀掉该会话
    select pg_terminate_backend(pid);
    
    • 1
    • 2
    • 3
    • 4

    检查空闲连接top5

    select 
         to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,a.datname "datname(数据库名)"
        ,a.pid "pid(进程id)"
        ,b.rolname "username(用户名)"
        --,a.application_name "app_name(应用名称)"
        ,a.client_addr "client_ip(客户端ip)"
        --,a.query_start "query_start(当前查询开始时间)"
        ,to_char(a.state_change,'yyyy-mm-dd hh24:mi:ss') "state_change(状态变化时间)"
        --,a.state "state(状态)"
        --,a.query "sql(执行的sql)"
        --,a.backend_type "backend_type(后端类型)"
    from pg_stat_activity a
    inner join pg_roles b 
    on (a.usesysid = b.oid)
    where a.state = 'idle'
        and state_change < current_timestamp - interval '30 min'
    order by current_timestamp-state_change desc
    limit 5
    ;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 正常:超半小时空闲的连接;
    • 异常处理:杀连接;
    select pg_terminate_backend(pid);
    
    • 1

    检查长事务top5

    select 
         to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,a.datname "datname(数据库名)"
        ,a.pid "pid(进程id)"
        ,b.rolname "username(用户名)"
        --,a.application_name "app_name(应用名称)"
        ,a.client_addr "client_ip(客户端ip)"
        --,a.xact_start "xact_start(当前事务开始时间)"
        --,a.query_start "query_start(当前查询开始时间)"
        ,to_char(a.state_change,'yyyy-mm-dd hh24:mi:ss') "state_change(状态变化时间)"
        --,a.state "state(状态)"
        --,a.query "sql(执行的sql)"
        --,a.backend_type "backend_type(后端类型)"
    from pg_stat_activity a
    inner join pg_roles b 
    on (a.usesysid = b.oid)
    where a.state in ('idle in transaction','idle in transaction (aborted)')
        and state_change < current_timestamp - interval '30 min'
    order by current_timestamp-state_change desc
    limit 5;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 正常:不存在长事务;
    • 异常处理:杀会话;
    select pg_terminate_backend(pid);
    
    • 1

    检查慢SQLtop5

    select 
         to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,a.datname "datname(数据库名)"
        ,a.pid "pid(进程id)"
        ,b.rolname "username(用户名)"
        --,a.application_name "app_name(应用名称)"
        ,a.client_addr "client_ip(客户端ip)"
        --,a.query_start "query_start(当前查询开始时间)"
        ,to_char(a.state_change,'yyyy-mm-dd hh24:mi:ss') "state_change(状态变化时间)"
        --,a.wait_event_type "wait_event_type(等待类型)"
        --,a.wait_event "wait_event(等待事件)"
        --,a.state "state(状态)"
        --,a.query "sql(执行的sql)"
        --,a.backend_type "backend_type(后端类型)"
    from pg_stat_activity a
    left join pg_roles b 
    on (a.usesysid = b.oid)
    where a.state = 'active'
        and state_change < current_timestamp - interval '1 hour'
        and a.datname is not null
    order by current_timestamp-state_change desc 
    limit 5;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 正常:不存在慢sql;
    • 异常处理:分析原因,有针对性地杀连接;
    select pg_terminate_backend(pid);
    
    • 1

    检查对象数

    --这里需要循环查每个库所有数据然后合并
    select to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,current_database()
        ,sum(obj_num) "obj_num(对象数)" 
    from (
        select count(1) obj_num from pg_class 
        union all 
        select count(1) from pg_proc 
    ) a
    ;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 正常:总对象数不超过5万;
    • 异常处理:删除无用的对象;

    检查表膨胀top5

    select
         to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间"
        ,current_database() current_database 
        ,relname as "table_name(表名)"
        ,schemaname as "schema_name(模式名)"
        ,pg_size_pretty(pg_relation_size('"'||schemaname|| '"."'||relname||'"')) as "table_size(表大小)"
        ,n_dead_tup as "n_dead_tup(无效记录数)"
        ,n_live_tup as "n_live_tup(有效记录数)"
        ,to_char(round(n_dead_tup*1.0/(n_live_tup+n_dead_tup)*100,2),'fm990.00') as "dead_rate(无效记录比例%)"
    from
        pg_stat_all_tables
    where n_live_tup+n_dead_tup <> 0
    ;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 正常:不存在表膨胀,因为有自动清理垃圾进程;
    • 异常处理:对膨胀表做vacuum analyze操作;

    检查索引膨胀

    select
      to_char(now(),'yyyy-mm-dd hh24:mi:ss') "巡检时间",
      current_database() AS db, schemaname, tablename,bs, reltuples::bigint AS tups, relpages::bigint AS pages, otta,
      ROUND(CASE WHEN otta=0 OR sml.relpages=0 OR sml.relpages=otta THEN 0.0 ELSE sml.relpages/otta::numeric END,1) AS tbloat,
      CASE WHEN relpages < otta THEN 0 ELSE relpages::bigint - otta END AS wastedpages,
      CASE WHEN relpages < otta THEN 0 ELSE bs*(sml.relpages-otta)::bigint END AS wastedbytes,
      CASE WHEN relpages < otta THEN $$0 bytes$$::text ELSE (bs*(relpages-otta))::bigint || $$ bytes$$ END AS wastedsize,
      iname, ituples::bigint AS itups, ipages::bigint AS ipages, iotta,
      ROUND(CASE WHEN iotta=0 OR ipages=0 OR ipages=iotta THEN 0.0 ELSE ipages/iotta::numeric END,1) AS ibloat,
      CASE WHEN ipages < iotta THEN 0 ELSE ipages::bigint - iotta END AS wastedipages,
      CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta) END AS wastedibytes,
      CASE WHEN ipages < iotta THEN $$0 bytes$$ ELSE (bs*(ipages-iotta))::bigint || $$ bytes$$ END AS wastedisize,
      CASE WHEN relpages < otta THEN
        CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta::bigint) END
        ELSE CASE WHEN ipages < iotta THEN bs*(relpages-otta::bigint)
          ELSE bs*(relpages-otta::bigint + ipages-iotta::bigint) END
      END AS totalwastedbytes
    FROM (
      SELECT
        nn.nspname AS schemaname,
        cc.relname AS tablename,
        COALESCE(cc.reltuples,0) AS reltuples,
        COALESCE(cc.relpages,0) AS relpages,
        COALESCE(bs,0) AS bs,
        COALESCE(CEIL((cc.reltuples*((datahdr+ma-
          (CASE WHEN datahdr%ma=0 THEN ma ELSE datahdr%ma END))+nullhdr2+4))/(bs-20::float)),0) AS otta,
        COALESCE(c2.relname,$$?$$) AS iname, COALESCE(c2.reltuples,0) AS ituples, COALESCE(c2.relpages,0) AS ipages,
        COALESCE(CEIL((c2.reltuples*(datahdr-12))/(bs-20::float)),0) AS iotta -- very rough approximation, assumes all cols
      FROM
         pg_class cc
      JOIN pg_namespace nn ON cc.relnamespace = nn.oid AND nn.nspname <> $$information_schema$$
      LEFT JOIN
      (
        SELECT
          ma,bs,foo.nspname,foo.relname,
          (datawidth+(hdr+ma-(case when hdr%ma=0 THEN ma ELSE hdr%ma END)))::numeric AS datahdr,
          (maxfracsum*(nullhdr+ma-(case when nullhdr%ma=0 THEN ma ELSE nullhdr%ma END))) AS nullhdr2
        FROM (
          SELECT
            ns.nspname, tbl.relname, hdr, ma, bs,
            SUM((1-coalesce(null_frac,0))*coalesce(avg_width, 2048)) AS datawidth,
            MAX(coalesce(null_frac,0)) AS maxfracsum,
            hdr+(
              SELECT 1+count(*)/8
              FROM pg_stats s2
              WHERE null_frac<>0 AND s2.schemaname = ns.nspname AND s2.tablename = tbl.relname
            ) AS nullhdr
          FROM pg_attribute att 
          JOIN pg_class tbl ON att.attrelid = tbl.oid
          JOIN pg_namespace ns ON ns.oid = tbl.relnamespace 
          LEFT JOIN pg_stats s ON s.schemaname=ns.nspname
          AND s.tablename = tbl.relname
          AND s.inherited=false
          AND s.attname=att.attname,
          (
            SELECT
              (SELECT current_setting($$block_size$$)::numeric) AS bs,
                CASE WHEN SUBSTRING(SPLIT_PART(v, $$ $$, 2) FROM $$#"[0-9]+.[0-9]+#"%$$ for $$#$$)
                  IN ($$8.0$$,$$8.1$$,$$8.2$$) THEN 27 ELSE 23 END AS hdr,
              CASE WHEN v ~ $$mingw32$$ OR v ~ $$64-bit$$ THEN 8 ELSE 4 END AS ma
            FROM (SELECT version() AS v) AS foo
          ) AS constants
          WHERE att.attnum > 0 AND tbl.relkind=$$r$$
          GROUP BY 1,2,3,4,5
        ) AS foo
      ) AS rs
      ON cc.relname = rs.relname AND nn.nspname = rs.nspname
      LEFT JOIN pg_index i ON indrelid = cc.oid
      LEFT JOIN pg_class c2 ON c2.oid = i.indexrelid
    ) AS sml 
    ;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 索引膨胀,依赖于统计信息,统计信息未更新,索引膨胀信息不准确。一般每年统一做一次重建索引即可。
    • 异常处理:重建索引;
    reindex index 索引名;
    
    • 1
  • 相关阅读:
    【Shell篇<Ⅰ>】——shell脚本、变量、数值运算
    【快应用】如何配置快应用图标,及几种常见的无效配置案例
    【RuoYi-Cloud项目研究】【ruoyi-gateway模块】Spring Gatewaye和Sentinel实现网关流控
    【纯css】实现无限滚动循环组件,无js
    电子器件系列57:肖特基二极管(BAS7005)
    【Java集合类】之 HashSet以及底层逻辑分析
    终于可以彻底告别手写正则表达式了
    C++模拟实现位图和布隆过滤器
    DAY12-深度学习100例-卷积神经网络(CNN)识别验证码
    mysql套用:查询近N个月的数据,没有数据的月份补0
  • 原文地址:https://blog.csdn.net/qq_33445829/article/details/126638945