概述
如何快速插入大量数据比如几千万上亿的带索引的数据表。
数据准备
准备一个拥有二十个索引的数据表。
- kingbase=# \d+ bigtab
- Table "kingbase.bigtab"
- Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
- --------+---------+-----------+----------+---------+----------+--------------+-------------
- id | integer | | | | plain | |
- c01 | integer | | | | plain | |
- c02 | integer | | | | plain | |
- c03 | integer | | | | plain | |
- c04 | integer | | | | plain | |
- c05 | integer | | | | plain | |
- c06 | integer | | | | plain | |
- c07 | integer | | | | plain | |
- c08 | integer | | | | plain | |
- c09 | integer | | | | plain | |
- c10 | integer | | | | plain | |
- c11 | integer | | | | plain | |
- c12 | integer | | | | plain | |
- c13 | integer | | | | plain | |
- c14 | integer | | | | plain | |
- c15 | integer | | | | plain | |
- c16 | integer | | | | plain | |
- c17 | integer | | | | plain | |
- c18 | integer | | | | plain | |
- c19 | integer | | | | plain | |
- c20 | integer | | | | plain | |
- c21 | integer | | | | plain | |
- c22 | integer | | | | plain | |
- c23 | integer | | | | plain | |
- c24 | integer | | | | plain | |
- c25 | integer | | | | plain | |
- c26 | integer | | | | plain | |
- c27 | integer | | | | plain | |
- c28 | integer | | | | plain | |
- c29 | integer | | | | plain | |
- t01 | text | | | | extended | |
- t02 | text | | | | extended | |
- t03 | text | | | | extended | |
- t04 | text | | | | extended | |
- t05 | text | | | | extended | |
- t06 | text | | | | extended | |
- t07 | text | | | | extended | |
- t08 | text | | | | extended | |
- t09 | text | | | | extended | |
- t10 | text | | | | extended | |
- t11 | text | | | | extended | |
- t12 | text | | | | extended | |
- t13 | text | | | | extended | |
- t14 | text | | | | extended | |
- t15 | text | | | | extended | |
- t16 | text | | | | extended | |
- t17 | text | | | | extended | |
- t18 | text | | | | extended | |
- t19 | text | | | | extended | |
- t20 | text | | | | extended | |
- Indexes:
- "bigtab_i01" btree (c01)
- "bigtab_i02" btree (c02)
- "bigtab_i03" btree (c03)
- "bigtab_i04" btree (c04)
- "bigtab_i05" btree (c05)
- "bigtab_i06" btree (c06)
- "bigtab_i07" btree (c07)
- "bigtab_i08" btree (c08)
- "bigtab_i09" btree (c09)
- "bigtab_i10" btree (c10)
- "bigtab_i11" btree (c11)
- "bigtab_i12" btree (c12)
- "bigtab_i13" btree (c13)
- "bigtab_i14" btree (c14)
- "bigtab_i15" btree (c15)
- "bigtab_i16" btree (c16)
- "bigtab_i17" btree (c17)
- "bigtab_i18" btree (c18)
- "bigtab_i19" btree (c19)
- "bigtab_i20" btree (c20)
- Access method: heap
-
- kingbase=#
方法一:直接插入海量数据,自动维护索引
- kingbase=#
- kingbase=# insert into bigtab
- kingbase-# select id
- kingbase-# , (random() * 100)::int + 1000 c01
- kingbase-# , (random() * 200)::int + 1000 c02
- kingbase-# , (random() * 300)::int + 10000 c03
- kingbase-# , (random() * 400)::int + 10000 c04
- kingbase-# , (random() * 500)::int + 10000 c05
- kingbase-# , (random() * 600)::int + 10000 c06
- kingbase-# , (random() * 700)::int + 10000 c07
- kingbase-# , (random() * 800)::int + 10000 c08
- kingbase-# , (random() * 900)::int + 10000 c09
- kingbase-# , (random() * 1000)::int + 10000 c10
- kingbase-# , (random() * 2000)::int + 10000 c11
- kingbase-# , (random() * 3000)::int + 10000 c12
- kingbase-# , (random() * 4000)::int + 10000 c13
- kingbase-# , (random() * 5000)::int + 10000 c14
- kingbase-# , (random() * 6000)::int + 10000 c15
- kingbase-# , (random() * 7000)::int + 10000 c16
- kingbase-# , (random() * 8000)::int + 10000 c17
- kingbase-# , (random() * 9000)::int + 10000 c18
- kingbase-# , (random() * 10000)::int + 10000 c19
- kingbase-# , (random() * 20000)::int + 10000 c20
- kingbase-# , (random() * 30000)::int + 10000 c21
- kingbase-# , (random() * 40000)::int + 10000 c22
- kingbase-# , (random() * 50000)::int + 10000 c23
- kingbase-# , (random() * 60000)::int + 10000 c24
- kingbase-# , (random() * 70000)::int + 10000 c25
- kingbase-# , (random() * 80000)::int + 10000 c26
- kingbase-# , (random() * 90000)::int + 10000 c27
- kingbase-# , (random() * 10000)::int + 10000 c28
- kingbase-# , (random() * 10000)::int + 10000 c29
- kingbase-# , md5(random()::text) t01
- kingbase-# , md5(random()::text) t02
- kingbase-# , md5(random()::text) t03
- kingbase-# , md5(random()::text) t04
- kingbase-# , md5(random()::text) t05
- kingbase-# , md5(random()::text) t06
- kingbase-# , md5(random()::text) t07
- kingbase-# , md5(random()::text) t08
- kingbase-# , md5(random()::text) t09
- kingbase-# , md5(random()::text) t10
- kingbase-# , md5(random()::text) t11
- kingbase-# , md5(random()::text) t12
- kingbase-# , md5(random()::text) t13
- kingbase-# , md5(random()::text) t14
- kingbase-# , md5(random()::text) t15
- kingbase-# , md5(random()::text) t16
- kingbase-# , md5(random()::text) t17
- kingbase-# , md5(random()::text) t18
- kingbase-# , md5(random()::text) t19
- kingbase-# , md5(random()::text) t20
- kingbase-# from generate_series(1, 2000000) id;
- INSERT 0 2000000
- Time: 299331.143 ms (04:59.331)
优点: 语句单一;自动维护索引;自动支持之后的索引。
缺点: 逐行维护索引,造成用时较长。
方法二:删除索引,插入海量数据,再创建索引
- kingbase=#
- kingbase=# do
- kingbase-# $$
- kingbase$# begin
- kingbase$# drop index bigtab_i01;
- kingbase$# drop index bigtab_i02;
- kingbase$# drop index bigtab_i03;
- kingbase$# drop index bigtab_i04;
- kingbase$# drop index bigtab_i05;
- kingbase$# drop index bigtab_i06;
- kingbase$# drop index bigtab_i07;
- kingbase$# drop index bigtab_i08;
- kingbase$# drop index bigtab_i09;
- kingbase$# drop index bigtab_i10;
- kingbase$# drop index bigtab_i11;
- kingbase$# drop index bigtab_i12;
- kingbase$# drop index bigtab_i13;
- kingbase$# drop index bigtab_i14;
- kingbase$# drop index bigtab_i15;
- kingbase$# drop index bigtab_i16;
- kingbase$# drop index bigtab_i17;
- kingbase$# drop index bigtab_i18;
- kingbase$# drop index bigtab_i19;
- kingbase$# drop index bigtab_i20;
- kingbase$#
- kingbase$# insert into bigtab
- kingbase$# select id
- kingbase$# , (random() * 100)::int + 1000 c01
- kingbase$# , (random() * 200)::int + 1000 c02
- kingbase$# , (random() * 300)::int + 10000 c03
- kingbase$# , (random() * 400)::int + 10000 c04
- kingbase$# , (random() * 500)::int + 10000 c05
- kingbase$# , (random() * 600)::int + 10000 c06
- kingbase$# , (random() * 700)::int + 10000 c07
- kingbase$# , (random() * 800)::int + 10000 c08
- kingbase$# , (random() * 900)::int + 10000 c09
- kingbase$# , (random() * 1000)::int + 10000 c10
- kingbase$# , (random() * 2000)::int + 10000 c11
- kingbase$# , (random() * 3000)::int + 10000 c12
- kingbase$# , (random() * 4000)::int + 10000 c13
- kingbase$# , (random() * 5000)::int + 10000 c14
- kingbase$# , (random() * 6000)::int + 10000 c15
- kingbase$# , (random() * 7000)::int + 10000 c16
- kingbase$# , (random() * 8000)::int + 10000 c17
- kingbase$# , (random() * 9000)::int + 10000 c18
- kingbase$# , (random() * 10000)::int + 10000 c19
- kingbase$# , (random() * 20000)::int + 10000 c20
- kingbase$# , (random() * 30000)::int + 10000 c21
- kingbase$# , (random() * 40000)::int + 10000 c22
- kingbase$# , (random() * 50000)::int + 10000 c23
- kingbase$# , (random() * 60000)::int + 10000 c24
- kingbase$# , (random() * 70000)::int + 10000 c25
- kingbase$# , (random() * 80000)::int + 10000 c26
- kingbase$# , (random() * 90000)::int + 10000 c27
- kingbase$# , (random() * 10000)::int + 10000 c28
- kingbase$# , (random() * 10000)::int + 10000 c29
- kingbase$# , md5(random()::text) t01
- kingbase$# , md5(random()::text) t02
- kingbase$# , md5(random()::text) t03
- kingbase$# , md5(random()::text) t04
- kingbase$# , md5(random()::text) t05
- kingbase$# , md5(random()::text) t06
- kingbase$# , md5(random()::text) t07
- kingbase$# , md5(random()::text) t08
- kingbase$# , md5(random()::text) t09
- kingbase$# , md5(random()::text) t10
- kingbase$# , md5(random()::text) t11
- kingbase$# , md5(random()::text) t12
- kingbase$# , md5(random()::text) t13
- kingbase$# , md5(random()::text) t14
- kingbase$# , md5(random()::text) t15
- kingbase$# , md5(random()::text) t16
- kingbase$# , md5(random()::text) t17
- kingbase$# , md5(random()::text) t18
- kingbase$# , md5(random()::text) t19
- kingbase$# , md5(random()::text) t20
- kingbase$# from generate_series(1, 2000000) id;
- kingbase$#
- kingbase$# create index bigtab_i01 on bigtab (c01);
- kingbase$# create index bigtab_i02 on bigtab (c02);
- kingbase$# create index bigtab_i03 on bigtab (c03);
- kingbase$# create index bigtab_i04 on bigtab (c04);
- kingbase$# create index bigtab_i05 on bigtab (c05);
- kingbase$# create index bigtab_i06 on bigtab (c06);
- kingbase$# create index bigtab_i07 on bigtab (c07);
- kingbase$# create index bigtab_i08 on bigtab (c08);
- kingbase$# create index bigtab_i09 on bigtab (c09);
- kingbase$# create index bigtab_i10 on bigtab (c10);
- kingbase$# create index bigtab_i11 on bigtab (c11);
- kingbase$# create index bigtab_i12 on bigtab (c12);
- kingbase$# create index bigtab_i13 on bigtab (c13);
- kingbase$# create index bigtab_i14 on bigtab (c14);
- kingbase$# create index bigtab_i15 on bigtab (c15);
- kingbase$# create index bigtab_i16 on bigtab (c16);
- kingbase$# create index bigtab_i17 on bigtab (c17);
- kingbase$# create index bigtab_i18 on bigtab (c18);
- kingbase$# create index bigtab_i19 on bigtab (c19);
- kingbase$# create index bigtab_i20 on bigtab (c20);
- kingbase$#
- kingbase$# end;
- kingbase$# $$;
- ANONYMOUS BLOCK
- Time: 83069.170 ms (01:23.069)
优点: 批量维护索引,用时最短。
缺点: 语句复杂且固化;手动维护删建索引语句;不支持之后的索引。
方法三:禁止索引更改,插入海量数据,重建表的全部索引
- kingbase=# do
- kingbase-# $$
- kingbase$# begin
- kingbase$#
- kingbase$# update pg_index
- kingbase$# set indislive= false
- kingbase$# where indrelid = 'bigtab'::regclass;
- kingbase$#
- kingbase$# insert into bigtab
- kingbase$# select id
- kingbase$# , (random() * 100)::int + 1000 c01
- kingbase$# , (random() * 200)::int + 1000 c02
- kingbase$# , (random() * 300)::int + 10000 c03
- kingbase$# , (random() * 400)::int + 10000 c04
- kingbase$# , (random() * 500)::int + 10000 c05
- kingbase$# , (random() * 600)::int + 10000 c06
- kingbase$# , (random() * 700)::int + 10000 c07
- kingbase$# , (random() * 800)::int + 10000 c08
- kingbase$# , (random() * 900)::int + 10000 c09
- kingbase$# , (random() * 1000)::int + 10000 c10
- kingbase$# , (random() * 2000)::int + 10000 c11
- kingbase$# , (random() * 3000)::int + 10000 c12
- kingbase$# , (random() * 4000)::int + 10000 c13
- kingbase$# , (random() * 5000)::int + 10000 c14
- kingbase$# , (random() * 6000)::int + 10000 c15
- kingbase$# , (random() * 7000)::int + 10000 c16
- kingbase$# , (random() * 8000)::int + 10000 c17
- kingbase$# , (random() * 9000)::int + 10000 c18
- kingbase$# , (random() * 10000)::int + 10000 c19
- kingbase$# , (random() * 20000)::int + 10000 c20
- kingbase$# , (random() * 30000)::int + 10000 c21
- kingbase$# , (random() * 40000)::int + 10000 c22
- kingbase$# , (random() * 50000)::int + 10000 c23
- kingbase$# , (random() * 60000)::int + 10000 c24
- kingbase$# , (random() * 70000)::int + 10000 c25
- kingbase$# , (random() * 80000)::int + 10000 c26
- kingbase$# , (random() * 90000)::int + 10000 c27
- kingbase$# , (random() * 10000)::int + 10000 c28
- kingbase$# , (random() * 10000)::int + 10000 c29
- kingbase$# , md5(random()::text) t01
- kingbase$# , md5(random()::text) t02
- kingbase$# , md5(random()::text) t03
- kingbase$# , md5(random()::text) t04
- kingbase$# , md5(random()::text) t05
- kingbase$# , md5(random()::text) t06
- kingbase$# , md5(random()::text) t07
- kingbase$# , md5(random()::text) t08
- kingbase$# , md5(random()::text) t09
- kingbase$# , md5(random()::text) t10
- kingbase$# , md5(random()::text) t11
- kingbase$# , md5(random()::text) t12
- kingbase$# , md5(random()::text) t13
- kingbase$# , md5(random()::text) t14
- kingbase$# , md5(random()::text) t15
- kingbase$# , md5(random()::text) t16
- kingbase$# , md5(random()::text) t17
- kingbase$# , md5(random()::text) t18
- kingbase$# , md5(random()::text) t19
- kingbase$# , md5(random()::text) t20
- kingbase$# from generate_series(1, 2000000) id;
- kingbase$#
- kingbase$# update pg_index
- kingbase$# set indislive= true
- kingbase$# where indrelid = 'bigtab'::regclass;
- kingbase$#
- kingbase$# analyse bigtab;
- kingbase$# reindex table bigtab;
- kingbase$#
- kingbase$# end;
- kingbase$# $$;
- ANONYMOUS BLOCK
- Time: 87110.126 ms (01:27.110)
优点: 批量维护索引,用时短;语句固定模式;自动维护索引;支持之后的索引。
缺点: 多个SQL语句,不易嵌入语句块。
最后的话
reindex table 的执行依赖统计信息,所以需要执行 analyse table ,才能成功重建表的全部可更新的索引。
reindex index 不受上述因素的影响,可以强制重建不更新的索引,并自动修改 indislive= true。
如果在REINDEX期间出现异常,那么所有需要rebuild的索引的状态都是invalid,意味着这些索引仍然占用空间,定义仍在但不能使用。
避免REINDEX期间出现异常,可以在索引更新操作时,跳过唯一索引和外键依赖索引等。
