嗨,各位小伙伴大家上午好呀,我是爱小可爱的IT白,忙里偷闲跟大家捞一捞呀;最近所做项目中,用到了达梦、Oracle、PostgreSQL等数据库,在每个项目进入正式指标开发之前,我们需要先对数据源的表做一个关于数据质量的统计(主要包括所有库、所有表、所有字段、空值率、空字段率、表注释、库注释、表数据量等信息的统计),这有利于我们后面的原型设计以及指标加工;在当时统计这些的时候,我需要全网查指令,太麻烦了,为了方便大家,我就将测试之后可成功使用的代码以博客的方式发布出来,欢迎各位小伙伴提出宝贵意见,如果有其他数据库的这类数据质量方面的统计,也欢迎各位小伙伴给我私信或者留言,我都给加到这篇文章里面,力争帮助更多的小伙伴提高工作效率,争取帮各位实现早下班自由;废话不多说了,下面我们进入正题,本文我主要对达梦、Oracle、PostgreSQL三种数据库为各位小伙伴进行关于数据库数据质量的讲解。
SELECT
aaa.relname, --表名
aaa.obj_description, --名称
aaa.attname, --字段
aaa.description, --字段备注
aaa.lie_type, --列类型
bbb.*
from
(select
c.relname, --表名
cast (
obj_description (relfilenode, 'pg_class') as varchar
) as obj_description, --名称
a.attname, -- 字段
d.description, -- 字段备注
concat_ws (
'',
t.typname,
SUBSTRING (
format_type (a.atttypid, a.atttypmod)
from
'\(.*\)'
)
) as lie_type --列类型
from
pg_class c,
pg_attribute a,
pg_type t,
pg_description d
where
a.attnum > 0
and a.attrelid = c.oid
and a.atttypid = t.oid
and d.objoid = a.attrelid
and d.objsubid = a.attnum
and c.relname in (
select
tablename
from
pg_tables
where
schemaname = 'public'
and position ('_2' in tablename) = 0
)
order by
c.relname,
a.attnum
) aaa
LEFT JOIN
(SELECT
c.oid,
obj_description (c.oid),
--c.relhasoids AS hasoids,
n.nspname AS schemaname,
c.relname AS tablename,
c.relkind,
pg_get_userbyid (c.relowner) AS tableowner,
t.spcname AS "tablespace",
c.relhasindex AS hasindexes,
c.relhasrules AS hasrules,
c.relhastriggers AS hastriggers,
ft.ftoptions,
fs.srvname,
c.relacl,
c.reltuples,
(
(
SELECT
count (*)
FROM
pg_inherits
WHERE
inhparent = c.oid
) > 0
) AS inhtable,
i2.nspname AS inhschemaname,
i2.relname AS inhtablename,
c.reloptions AS param,
c.relpersistence AS unlogged
FROM
pg_class c
LEFT JOIN pg_namespace n ON n.oid = c.relnamespace
LEFT JOIN pg_tablespace t ON t.oid = c.reltablespace
LEFT JOIN (
pg_inherits i
INNER JOIN pg_class c2 ON i.inhparent = c2.oid
LEFT JOIN pg_namespace n2 ON n2.oid = c2.relnamespace
) i2 ON i2.inhrelid = c.oid
LEFT JOIN pg_foreign_table ft ON ft.ftrelid = c.oid
LEFT JOIN pg_foreign_server fs ON ft.ftserver = fs.oid
WHERE
(
(c.relkind = 'r' :: "char")
OR (c.relkind = 'f' :: "char")
)
AND n.nspname = 'public') bbb
ON aaa.relname = bbb.tablename;
select
c.relname 表名,
cast (
obj_description (relfilenode, 'pg_class') as varchar
) 名称,
a.attname 字段,
d.description 字段备注,
concat_ws (
'',
t.typname,
SUBSTRING (
format_type (a.atttypid, a.atttypmod)
from
'\(.*\)'
)
) as 列类型
from
pg_class c,
pg_attribute a,
pg_type t,
pg_description d
where
a.attnum > 0
and a.attrelid = c.oid
and a.atttypid = t.oid
and d.objoid = a.attrelid
and d.objsubid = a.attnum
and c.relname in (
select
tablename
from
pg_tables
where
schemaname = 'public'
and position ('_2' in tablename) = 0
)
order by
c.relname,
a.attnum
--查询显示结果如下:
表名 名称 字段 字段备注 列类型
test20220825 测试表 test1 测试字段1 varchar(255)
select distinct lower(pg_class.relname),pg_attribute.attname as colname,
' SELECT count(1) from '|| lower(pg_class.relname)||' where '||pg_attribute.attname||'>''2021-04-01'' ;',
' delete from '|| lower(pg_class.relname)||' where '||pg_attribute.attname||'>''2021-04-01'' ;'
from pg_constraint
inner join pg_class on pg_constraint.conrelid = pg_class.oid
inner join pg_attribute on pg_attribute.attrelid = pg_class.oid
where pg_attribute.attname ~'createtime'
order by lower(pg_class.relname)
--查询显示结果如下:
lower colname ?column? ?column?(1)
test20220825 createtime select count(1) from test20220825 where createtime>'2021-04-01';
delete from test20220825 where createtime>'2021-04-01';
SELECT
c.oid,
obj_description (c.oid),
--c.relhasoids AS hasoids,
n.nspname AS schemaname,
c.relname AS tablename,
c.relkind,
pg_get_userbyid (c.relowner) AS tableowner,
t.spcname AS "tablespace",
c.relhasindex AS hasindexes,
c.relhasrules AS hasrules,
c.relhastriggers AS hastriggers,
ft.ftoptions,
fs.srvname,
c.relacl,
c.reltuples,
(
(
SELECT
count (*)
FROM
pg_inherits
WHERE
inhparent = c.oid
) > 0
) AS inhtable,
i2.nspname AS inhschemaname,
i2.relname AS inhtablename,
c.reloptions AS param,
c.relpersistence AS unlogged
FROM
pg_class c
LEFT JOIN pg_namespace n ON n.oid = c.relnamespace
LEFT JOIN pg_tablespace t ON t.oid = c.reltablespace
LEFT JOIN (
pg_inherits i
INNER JOIN pg_class c2 ON i.inhparent = c2.oid
LEFT JOIN pg_namespace n2 ON n2.oid = c2.relnamespace
) i2 ON i2.inhrelid = c.oid
LEFT JOIN pg_foreign_table ft ON ft.ftrelid = c.oid
LEFT JOIN pg_foreign_server fs ON ft.ftserver = fs.oid
WHERE
(
(c.relkind = 'r' :: "char")
OR (c.relkind = 'f' :: "char")
)
AND n.nspname = 'public';
--查询显示结果如下:
oid obj_description schemaname tablename relkind tableowner tablespace hasindexes hasrules hastriggers reltuples inhtable unlogged
149336 测试表 public test20220825 r postgre NULL t
f f 438 f p
select tablename
from pg_tables
where schemaname='public'
and position('_2' in tablename)=0;
select * from pg_tables;
--查询显示结果如下:
tablename
test20220825
SELECT
i.indrelid AS oid,
ci.relname AS index_name,
ct.relname AS table_name,
am.amname,
i.indexrelid,
i.indisunique,
i.indisclustered,
i.indisprimary,
i.indkey,
i.indclass,
obj_description (indexrelid),
i.indnatts,
pg_get_expr (indpred, indrelid, true) AS indconstraint,
pa.rolname AS owner,
ts.spcname,
ci.reloptions,
i.indoption,
i.indcollation
FROM
pg_index i
LEFT JOIN pg_class ct ON ct.oid = i.indrelid
LEFT JOIN pg_class ci ON ci.oid = i.indexrelid
LEFT JOIN pg_namespace tns ON tns.oid = ct.relnamespace
LEFT JOIN pg_namespace ins ON ins.oid = ci.relnamespace
LEFT JOIN pg_tablespace ts ON ci.reltablespace = ts.oid
LEFT JOIN pg_am am ON ci.relam = am.oid
LEFT JOIN pg_depend dep ON dep.classid = ci.tableoid
AND dep.objid = ci.oid
AND dep.refobjsubid = '0'
LEFT JOIN pg_constraint con ON con.tableoid = dep.refclassid
AND con.oid = dep.refobjid
LEFT JOIN pg_roles pa ON pa.oid = ci.relowner
WHERE
tns.nspname = 'public'
-- AND ct.relname = '查询的表名'
AND conname IS NULL
ORDER BY
ct.relname,
ins.nspname,
ci.relname;
--查询显示结果如下:
oid index_name table_name amname indexrelid indisunique indisclustered indisprimary indkey indclass obj_description indnatts owner indoption indcollation
149336 createtime test20220825 btree 12461181 f f
f 7 3126 测试表 1 postgre 0 100
SELECT
opc.oid,
opc.opcname,
nsp.nspname,
opc.opcdefault
FROM
pg_opclass opc,
pg_namespace nsp
WHERE
opc.opcnamespace = nsp.oid;
--查询显示结果如下:
oid opcname nspname opcdefault
10000 array_ops pg_catalog t
SELECT
col. table_name,
col. column_name,
col.character_maximum_length,
col.is_nullable,
col.numeric_precision,
col.numeric_scale,
col.datetime_precision,
col.ordinal_position,
b.atttypmod,
b.attndims,
col.data_type AS col_type,
et.typelem,
et.typlen,
et.typtype,
nbt.nspname AS elem_schema,
bt.typname AS elem_name,
b.atttypid,
col.udt_schema,
col.udt_name,
col.column_default AS col_default,
col.domain_catalog,
col.domain_schema,
col.domain_name,
b.attfdwoptions AS foreign_options,
col_description (c.oid, col.ordinal_position) AS comment,
b.attacl,
coll.collname
FROM
information_schema. columns AS col
LEFT JOIN pg_namespace ns ON ns.nspname = col.table_schema
LEFT JOIN pg_class c ON col. table_name = c.relname
AND c.relnamespace = ns.oid
LEFT JOIN pg_attrdef a ON c.oid = a.adrelid
AND col.ordinal_position = a.adnum
LEFT JOIN pg_attribute b ON b.attrelid = c.oid
AND b.attname = col. column_name
LEFT JOIN pg_type et ON et.oid = b.atttypid
LEFT JOIN pg_collation coll ON coll.oid = b.attcollation
LEFT JOIN pg_type bt ON et.typelem = bt.oid
LEFT JOIN pg_namespace nbt ON bt.typnamespace = nbt.oid
WHERE
col.table_schema = 'public'
-- AND col.table_name = '查询的表名'
ORDER BY
col.table_name,
col.ordinal_position;
--查询显示结果如下:
table_name column_name character_maximum_length is_nullable ordinal_position atttypmod attndims col_type typelem typlen typtype atttypid
udt_schema udt_name comment collname
test20220825 createtime 255 YES 1 259
0 character varying 0 -1 b 1043
pg_catalog varchar 创建时间
select
relname as tabname,
cast (
obj_description (relfilenode, 'pg_class') as varchar
) as comment
from
pg_class c
where
relname in (
select
tablename
from
pg_tables
where
schemaname = 'public'
and position ('_2' in tablename) = 0
);
--select * from pg_class;
--查询显示结果如下:
tabname comment
test20220825 测试表
select
relname as tabname,
cast (
obj_description (relfilenode, 'pg_class') as varchar
) as comment
from
pg_class c
where
relname = '查询的表名';
select
a.attnum,
a.attname,
concat_ws (
'',
t.typname,
SUBSTRING (
format_type (a.atttypid, a.atttypmod)
from
'\(.*\)'
)
) as type,
d.description
from
pg_class c,
pg_attribute a,
pg_type t,
pg_description d
where
c.relname = '查询的表名'
and a.attnum > 0
and a.attrelid = c.oid
and a.atttypid = t.oid
and d.objoid = a.attrelid
and d.objsubid = a.attnum;
SELECT
string_agg (DISTINCT t3.attname, ',') AS primaryKeyColumn,
t4.tablename AS tableName,
string_agg (
cast (
obj_description (relfilenode, 'pg_class') as varchar
),
''
) as comment
FROM
pg_constraint t1
INNER JOIN pg_class t2 ON t1.conrelid = t2.oid
INNER JOIN pg_attribute t3 ON t3.attrelid = t2.oid
AND array_position (t1.conkey, t3.attnum) is not null
INNER JOIN pg_tables t4 on t4.tablename = t2.relname
INNER JOIN pg_index t5 ON t5.indrelid = t2.oid
AND t3.attnum = ANY (t5.indkey)
LEFT JOIN pg_description t6 on t6.objoid = t3.attrelid
and t6.objsubid = t3.attnum
WHERE
t1.contype = 'p'
AND length (t3.attname) > 0
AND t2.oid = '查询的表名'::regclass
group by
t4.tablename
SELECT
col_description (a.attrelid, a.attnum) as comment,
format_type (a.atttypid, a.atttypmod) as type,
a.attname as name,
a.attnotnull as notnull
FROM
pg_class as c,
pg_attribute as a
where
c.relname = '查询的表名'
and a.attrelid = c.oid
and a.attnum > 0
order by
a.attname
--查询显示结果如下:
comment type name notnull
创建时间 character varying(255) createtime f
select
distinct 'select a.table_name,a.n,b.n,round((b.n*1.0/a.n)*100,2)||''%'' from (select count(*) as n,'''||OBJECT_NAME||
''' as table_name from dba_tables where owner = '''||OBJECT_NAME||
''') a,(select count(*) as n,'''||OBJECT_NAME||
''' as table_name from SYSTABLECOMMENTS where schname = '''||OBJECT_NAME||
''') b where a.table_name = b.table_name union all'
from ALL_OBJECTS where OBJECT_TYPE = 'SCH'
--然后执行上述SQL结果如下:
对应中台 模式名称 表个数 有注释信息的表个数 有注释信息的表比例
table_name n n round((b.n*1./a.n)*100,2)||'%'
测试中台 DB_TEST 23 23 100%
--可以展示所有表名
select table_name from dba_tables where owner = 'ODS_APP_MARKET_APP_STORE' order by table_name
--可以展示数据库下表总数,此模式下为229张表
select count(*) from dba_tables where owner = 'ODS_APP_MARKET_APP_STORE'
select tvname,comment$ from SYSTABLECOMMENTS where schname = 'ODS_APP_MARKET_APP_STORE' order by tvname
模式名称 表名称 是否有表名称注释 表名称注释 总字段个数 有注释信息的字段个数 有注释信息的字段比例 是否为空表 数据量
--①可查出模式名称、表名称、总字段个数、有注释信息的字段个数、有注释信息的字段比例
--先查出当前模式下的表和模式:例如查出DB_ODS_ABILITY_APIDB模式下
select owner,table_name,
'select a.owner,a.tvname,a.n,b.n,round((b.n*1.0/a.n)*100,2)||''%'' from (select count(*) as n,'''
||owner||
''' as owner,'''||table_name||''' as tvname from all_tab_columns where owner = '''||owner||
''' and table_name = '''||table_name||''') a,(select count(*) as n,'''||owner||
''' as owner,'''||table_name||''' as tvname from SYSCOLUMNCOMMENTS where schname = '''||owner||
''' and tvname = '''||table_name||''') b where a.owner = b.owner and a.tvname = b.tvname union all'
from dba_tables
where
owner = 'DB_TEST'
order by owner,table_name
--②可查出模式名称、表名称、表的数据量
--再查出可以查出模式名称和表名称以及表数据量的SQL语句
select 'select '''||owner||''' as owner,'''||table_name||''' as tvname,count(*) from "'||owner||'"."'||table_name||'" union all '
from dba_tables
where owner = 'DB_TEST'
order by owner,table_name
--③可查出表的中文注释:
select a.table_name,b.comment$
from (select table_name from dba_tables where owner = 'ODS_TEST' order by table_name) a
left join (select tvname,comment$ from SYSTABLECOMMENTS where schname = 'DB_TEST') b
on a.table_name = b.tvname
select owner,table_name,column_name,data_type,data_length,nullable
from all_tab_columns
where owner like 'DB_TEST%'
--示例结果如下:
owner table_name column_name data_type data_length nullable
DB_TEST ODS_TEST ACCOUNT_ID VARCHAR 32 N
方案一:推荐使用
analyze: analyze table table_name compute statistics;
1、生成analyze的sql。:推荐使用
SELECT 'analyze table '|| table_name || ' compute statistics; ' FROM user_tables;
查询结果实例如下:
analyze table LY_OA1 compute statistics;
analyze table LY_OA2 compute statistics;
analyze table LY_SYS compute statistics;
2、批量执行analyze的sql。
即将上面1的查询结果批量执行
3、查询统计结果:不推荐使用
SELECT t.table_name, t.NUM_ROWS FROM user_tables t --这个查询结果无法显示码表数据量,因此不建议用
如果要查询大于一百万数据量的表,指令如下:
SELECT t.table_name,t.num_rows FROM user_tables t WHERE t.num_rows > 1000000
4、查询统计结果:推荐使用
SELECT b.table_name,d.comments,b.num_rows,CASE WHEN b.num_rows = 0 THEN '是' ELSE '否' END AS aaa, c.clo
FROM user_tables b
LEFT JOIN (
SELECT
a.table_name,
COUNT( a.table_name ) clo
FROM
user_tab_columns a
GROUP BY
a.table_name
) c ON c.table_name = b.table_name
LEFT JOIN (
SELECT * FROM user_tab_comments WHERE TABLE_NAME NOT LIKE '%$%'
) d ON d.table_name = b.table_name;
SELECT 字段名,COUNT(*) FROM TABLE GROUP BY 字段名 HAVING COUNT(*) > 1;
SELECT name1,name2,COUNT(*) FROM TABLE GROUP BY name1,name2 HAVING COUNT(*) > 1;
创建存储过程-用来统计数据库表数据量
CREATE OR REPLACE PROCEDURE TABLES_ROWS_COUNT
AS
CREATE_SQL VARCHAR2(2000);
P_TABLE_NAME VARCHAR2(2000);
BEGIN
-- EXECUTE IMMEDIATE 'DROP TABLE CHECK_TABLE_COUNT';
EXECUTE IMMEDIATE 'CREATE TABLE T_ROWS_COUNT (TABLE_NAME varchar2(200) ,COUNT number)';
FOR RESOURCE_OBJ IN (SELECT T.TABLE_NAME
FROM USER_TABLES T
ORDER BY T.TABLE_NAME) LOOP
P_TABLE_NAME := RESOURCE_OBJ.TABLE_NAME;
CREATE_SQL := 'insert into T_ROWS_COUNT(TABLE_NAME,COUNT) SELECT ''' ||P_TABLE_NAME|| '''AS TABLE_NAME , COUNT(1) AS COUNT FROM ' || P_TABLE_NAME;
EXECUTE IMMEDIATE CREATE_SQL;
COMMIT;
END LOOP;
END TABLES_ROWS_COUNT;
2、执行存储过程
BEGIN
-- Call the procedure
TABLES_ROWS_COUNT;
END;
3、输出统计结果
SELECT * FROM T_ROWS_COUNT;
方法一:推荐使用
SELECT
a.TABLE_NAME,b.COMMENTS
FROM
user_tables a,user_tab_comments b
WHERE
a.TABLE_NAME=b.TABLE_NAME
ORDER BY
TABLE_NAME
方法二:查询当前用户下所有表名、表注释
SELECT table_name,comments FROM user_tab_comments; --table_name(表名)、comments(表注释)
方法一:推荐使用
SELECT
b.TABLE_NAME,b.COLUMN_NAME,a.COMMENTS,b.DATA_TYPE,b.DATA_LENGTH,b.DATA_PRECISION
FROM
USER_TAB_COLUMNS b,USER_COL_COMMENTS a
WHERE
b.TABLE_NAME = 'ACTION_URL' AND b.TABLE_NAME = a.TABLE_NAME AND b.COLUMN_NAME = a.COLUMN_NAME
--其中表USER_TAB_COLUMNS 包含"TABLE_NAME"(表名), "COLUMN_NAME"(字段名), "DATA_TYPE"(字段类型), "DATA_LENGTH"(字段长度), "DATA_PRECISION"(=精度)等信息,但不包含字段描述。
--表USER_COL_COMMENTS 包含"TABLE_NAME"(表名), "COLUMN_NAME"(字段名), "COMMENTS"(字段注释)信息。
方法二:查询当前用户下每个表对应的字段名称、注释
SELECT table_name,column_name,comments FROM user_col_comments;------table_name(表名)、column_name(字段名称)、comments(字段注释)
SELECT RS.表名称,
RS.表备注,
RS.字段名称,
RS.字段类型,
CASE WHEN 字段类型 NOT IN ( 'NUMBER' , 'DATE') THEN RS.数据长度/2
ELSE RS.数据长度 END AS 字段长度,
RS.整数位,
RS.DATA_SCALE,
RS.字段备注
FROM
(
SELECT T.TABLE_NAME AS 表名称,
C1.COMMENTS AS 表备注,
C2.COLUMN_NAME AS 字段名称,
C2.DATA_TYPE AS 字段类型,
-- C2.DATA_LENGTH AS 字段长度,
C2.数据长度,
C2.整数位,
C2.DATA_SCALE,
C3.COMMENTS AS 字段备注
FROM
-- 查询指定用户下的表名称
(SELECT TABLE_NAME FROM ALL_ALL_TABLES WHERE OWNER = 'LYASP') T
LEFT JOIN
-- 查询指定用户下的表注释
(SELECT TABLE_NAME ,
COMMENTS
FROM ALL_TAB_COMMENTS -- 表注释
WHERE OWNER = 'LYASP'
) C1 ON T.TABLE_NAME = C1.TABLE_NAME
LEFT JOIN
-- 查询指定用户下的表字段名称
(
SELECT TABLE_NAME ,
COLUMN_NAME,
DATA_TYPE,
DATA_LENGTH,
DATA_LENGTH AS 数据长度,
DATA_PRECISION AS 整数位 ,
DATA_SCALE
FROM ALL_TAB_COLUMNS WHERE OWNER = 'LYASP'
)C2 ON T.TABLE_NAME = C2.TABLE_NAME
LEFT JOIN
-- 查询指定用户下的字段注释
(
SELECT TABLE_NAME ,
COLUMN_NAME ,
COMMENTS -- 字段注释
FROM ALL_COL_COMMENTS
WHERE OWNER = 'LYASP'
) C3
ON T.TABLE_NAME = C3.TABLE_NAME
AND C2.COLUMN_NAME = C3.COLUMN_NAME
ORDER BY T.TABLE_NAME
) RS
--WHERE RS.表名称 = 'QL_TDSYQ'
至此,本文对达梦、Oracle、PostgreSQL三种数据库关于数据质量的讲解完毕,看千遍说千遍不如实际练一遍,所以大家还是多加练习,欢迎各位提出宝贵意见,一起交流学习,如果各位小伙伴有其他数据库的这类数据质量方面的统计,也欢迎各位随时给我私信或者留言,我都给加到这篇文章里面,力争帮助更多的小伙伴提高工作效率,争取帮各位实现早下班自由乃至财富自由。最后,如果各位小伙伴觉得文章还不错的话,动动发财小手给帮忙点个赞、收藏、评论,一键三连走起呀,下期见~~