Python大数据之linux学习总结——day06_hive02

hive内外表操作

建表语法

create [external] table [if not exits] 
			表名(字段名 字段类型，字段名 字段类型,...)
			[partitioned by (分区字段名 分区字段类型)]	#分区表固定格式
			[clustered by (分桶字段名） into 桶个数 buckets]	#分桶表固定格式	注意：可以排序[sorted by (排序字段名 asc|desc)
			[row format delimited fields terminated by '字段分隔符']	#自定义字段分隔符固定格式
			[stored as textfile]	默认即可
            ;	#注意：最后一定加分号

注意：关键字顺序是从上到下从左到右，否则报错

default默认库存储路径： /uesr/hive/warehouse
自定义库在HDFS的默认存储路径： /user/hive/warehouse/库名.db
自定义表在HDFS的默认存储路径:	/user/hive/warehouse/库名.db
业务数据文件在HDFS的默认存储路径:	/user/hive/warehouse/库名.db/表名/数据文件
1
2
3
4
5
6
7
8
9
10
11
12
13
14

数据类型

基础数据类型: 整数:int
			小数:float double
			字符串:string varchar(长度)
			日期:date timestamp
			
复杂数据类型:集合:array
			映射:map
			结构体:struct
			联合体：union
			
1
2
3
4
5
6
7
8
9
10

表分类

Hive中可以创建的表有好几种类型，分别是:
内部表:又叫管理表或者托管表
	   分区表(分区分桶表)
外部表:又叫非管理表或者非托管表
	   分区表(分区分桶表)
	   分桶表

内部表和外部表的区别？
内部表：未被external关键字修饰即是内部表，即普通表。内部表又称管理表，还可以叫托管表。
		删除表:直接删除元数据(metedata)和存储数据本身
外部表:被external关键字修饰的即是外部表，即关联表。还可以叫非管理部和非托管表。
		删除外部表:仅仅是删除元数据(metedata),不会删除存储数据本身。

查看格式化信息：desc formatted 表名
	--内部表类型:MANAGED_TABLE
	--外部表类型:EXTERNAL_TABLE

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

-- 内部表(又叫管理表或者托管表)
create table stu1(
    id int,
    name string
);
-- 外部表(又叫非管理表,非托管表)
create external table stu2(
    id int,
    name string
);
-- 查看表结构
desc stu1;
desc stu2;
-- 查看表格式化信息
desc formatted stu1; -- 内部表类型: managed_table
desc formatted stu2; -- 外部表类型: external_table

-- 演示内外部表的重点区别
-- 删除内部表(管理表/托管表),会删除表相关的所有数据
insert into stu1 values(1,'张三');
drop table stu1;
-- 删除外部表,只删除了元数据,hdfs中业务数据保留
insert into stu2 values(1,'张三');
drop table stu2;
-- 再次建表后,可以使用location重新关联原来hdfs保留的业务数据
create external table stu22(
    id int,
    name string
)location '/user/hive/warehouse/hive1.db/stu2';
-- 验证数据
select * from stu22 limit 10;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

内部表基本操作[练习]

知识点：

创建内部表:create table [if not exists] 内部表名(字段名 字段类型 ，字段名 字段类型 ，...)
[row format delimited filelds terminated by '字段分隔符'];

复制内部表:方式1:like方式复制表结构
		  方式2:as 方式复制表结构和数据

删除内部表:drop table 内部表名;
		  注意:删除内部表效果是mysql中相关元数据被删除，同时存储在hdfs中的业务数据本身也被删除

查看表格式化信息:desc formatted 表名;		--内部表类型:MANAGED_TABLE

注意:还可以使用truncate清空内部表
		   格式:truncate table 内部表名;
1
2
3
4
5
6
7
8
9
10
11
12
13

示例:

-- 操作表的前提:先有库并使用它
create database hive2;
use hive2;
-- 一.内部表的创建和删除
-- 1.演示创建内部表
-- 建表方式1
create table inner_stu1(
    id int,
    name string
);
-- 插入数据
insert into inner_stu1 values(1,'张三');

-- 建表方式2: 复制表结构
create table inner_stu2 like inner_stu1;
-- 插入数据
insert into inner_stu2 values(1,'张三');

-- 建表方式3: 复制表结构和数据
create table inner_stu3 as
select * from inner_stu1;


-- 2.演示查看内部表结构详细信息
-- 内部表类型: MANAGED_TABLE
desc formatted inner_stu1;
desc formatted inner_stu2;
desc formatted inner_stu3;


-- 3.演示内部表的删除
-- 删除内部表
drop table inner_stu3;-- 元数据和业务数据均被删除
-- 清空内部数据
truncate table inner_stu2;
-- 注意: delete和update不能使用
delete from inner_stu1;-- 报错
update inner_stu1 set name = '李四'; -- 报错
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

外部表基本操作[练习]

知识点:

创建外部表:create external table [if not exists] 外部表名(字段名 字段类型 , 字段名 字段类型 , ... )
		  [row format delimited fields terminated by '字段分隔符'];

复制表:方式1:like方式复制表结构
		  注意:as方式不可以使用

删除外部表:drop table 外部表名；
		  注意:删除外部表效果是mysql中元数据被删除，但是存储在hdfs中的业务数据本身被保留

查看表格式化信息：desc formatted 表名; 	--外部表类型：
				EXTERNAL_TABLE
		  注意:外部表不能使用truncate清空数据本身
1
2
3
4
5
6
7
8
9
10
11
12

示例:

-- 二.外部表的创建和删除
-- 1.外部的表创建
-- 建表方式1
create external table outer_stu1(
    id int,
    name string
);
-- 插入数据
insert into outer_stu1 values(1,'张三');

-- 建表方式2
create external table outer_stu2 like outer_stu1;
-- 插入数据
insert into outer_stu2 values(1,'张三');

-- 注意: 外部表不能使用create ... as 方式复制表
create external table outer_stu3 as
    select * from outer_stu1; -- 报错

-- 2.演示查看外部表结构详细信息
-- 外部表类型: EXTERNAL_TABLE
desc formatted outer_stu1;
desc formatted outer_stu2;


-- 3.演示外部表的删除
-- 删除表
drop table outer_stu2;
-- 注意: 外部表不能使用truncate关键字清空数据
truncate table outer_stu1; -- 报错
-- 注意: delete和update不能使用
delete from outer_stu1; -- 报错
update outer_stu1 set name = '李四'; -- 报错
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

查看/修改表

查看所有表:show tables;
查看建表语句:show create table 表名;
查看表信息:desc 表名;
查看表结构信息:desc 表名;
查看表格式化信息:desc formatted 表名;
			   注意:formatted能够展示详细信息

修改表名:alter table 旧表名 rename to 新表名
字段的添加:alter table 表名 add column (字段名,字段类型);
字段的替换:alter table 表名 replace columns (字段名 字段类型,...);
字段名和字段类型同时修改:alter table 表名 change 旧字段名 新字段名 新字段类型;
		注意:字符串类型不能直接改数值类型

修改表路径:alter table 表名 set location 'hdfs中存储路径';
		  注意:建议使用默认路径
location:建表的时候不写有默认路径/uesr/hive/warehouse/库名.db/表名，当然建表的时候也可以直接指定路径

修改表属性:alter table 表名 set tblproperties ('属性名‘='属性值');
		  注意:经常用于内外部表切换

内外部表类型切换:外部表属性:'EXTERNAL'='TRUE'
				内部表属性:'EXTERNAL'='FALSE'
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

示例:

默认分隔符

知识点:

创建表的时候，如果不指定分隔符，以后表只能识别默认的分隔符，
键盘不好打印打印，展示形式一般为:\0001,SOH,^A,口
1
2

示例:

在这里插入代码片
1

快速映射表[练习]

知识点:

创建表的时候指定分隔符: create [external] table 表名(字段名 字段类型)
row format delimited fields terminated by 符号;

加载数据: load data [local] inpath '结构化数据文件' into table 表名;
1
2
3
4

示例:

-- 创建表
create table products(
    id int,
    name string,
    price double,
    cid string
)row format delimited
fields terminated by ',';
-- 加载数据
-- 注意: 如果从hdfs中加载文件,本质就是移动文件到对应表路径下
load data inpath '/source/products.txt' into table products;
-- 验证数据
select * from products limit 1;
1
2
3
4
5
6
7
8
9
10
11
12
13

数据导入和导出

文件数据加载导入

1.直接上传文件

window页面上传

需求: 已知emp1.txt文件在windows/mac系统,要求使用hdfs保存此文件
并且使用hivesql建表关联数据

-- 1.先在hive上根据数据建表,然后在window/mac上传文件到hdfs表路径中
create table emp1(
    id int,
    name string,
    sal double,
    dept string
)row format delimited
fields terminated by ',';

-- windows使用hdfs页面上传文件
-- node1:9870访问页面把emp1.txt上传到/user/hive/warehouse/hive02.db/emp1路径下
-- 查询数据
select * from emp1;

1
2
3
4
5
6
7
8
9
10
11
12
13
14

linux本地put上传

需求: 已知emp2.txt文件在linux系统,要求使用hdfs保存此文件
并且使用hivesql建表关联数据

-- 2.先在hive上根据数据建表,然后在linux上传文件到hdfs表路径中
create table emp2(
    id int,
    name string,
    sal double,
    dept string
)row format delimited
fields terminated by ',';

-- linux使用hdfs命令上传文件
-- [root@node1 ~]# hdfs dfs -put emp2.txt /user/hive/warehouse/hive02.db/emp2

-- 查看数据
select * from emp2;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

2.load加载文件:

从hdfs路径把文件移动到表对应存储路径中: load data inpath 'HDFS文件路径' [overwrite] into table 表名;

从linux本地把文件上传到表对应存储路径中: load data local inpath 'Linux文件路径' [overwrite] into table 表名;
1
2
3

load移动HDFS文件:

-- 数据导入
-- 需求1: load加载hdfs中文件到表路径中
-- 1.根据斌哥资料中search_log.txt数据创建表
create table search_log(
    dt string,
    uid string,
    name string,
    url string
)row format delimited fields terminated by '\t';
-- 2.把windows中search_log.txt文件上传hdfs其他路径,例如:/src中
-- 3.使用load把hdfs的/src中的文件移动到search_log对应hdfs表存储路径中
load data inpath '/src/search_log.txt' into table search_log;
-- 4.查询数据
select * from search_log;
1
2
3
4
5
6
7
8
9
10
11
12
13
14

load上传Linux文件

-- 需求2: 直接把linux中最新的search_log.txt文件上传到search表对应hdfs路径中
-- 先把资料中search_log.txt文件传到linux中,例如:/root
-- load命令上传文件
load data local inpath '/root/search_log.txt' overwrite into table search_log;

-- 查看最终数据
select * from search_log;
1
2
3
4
5
6
7

3.insert插入数据

从其他表查询数据'追加'插入到当前表中: insert into [table] 表名 select 语句;

从其他表查询数据'覆盖'插入到当前表中: insert overwrite table 表名 select 语句;
1
2
3

insert追加数据

-- 需求1:创建一个search_log_copy表,然后从search_log查询数据插入到新表中
create table search_log_copy(
    dt string,
    uid string,
    word string,
    url string
)row format delimited
fields terminated by '\t';

-- 从search_log表中查所有数据,直接插入到search_log_copy表
insert into table search_log_copy select * from search_log;
-- 查看数据
select * from search_log_copy;

1
2
3
4
5
6
7
8
9
10
11
12
13
14

insert覆盖数据

-- 需求2: 假设search_log表中数据改变了,要求把最新的数据更新到search_log_copy表中
insert overwrite table search_log_copy select * from search_log;
-- 查看数据
select * from search_log_copy;
1
2
3
4

文件数据导出

1.直接下载文件

web页面下载

需求: 已知search_log.txt文件在HFDS的/user/hive/warehouse/hive02.db/search_log路径下,
要下载到window系统

在这里插入图片描述

get命令下载文件

需求: 已知search_log.txt文件在HFDS的/user/hive/warehouse/hive02.db/search_log路径下,要下载到linux系统

[root@node1 binzi]# hdfs dfs -get /user/hive/warehouse/hive02.db/search_log/search_log.txt /binzi
1

2.insert导出数据

查询数据导出到hdfs其他路径: insert overwrite directory 'hfds存储该数据路径' select语句;

查询数据导出到linux本地中: insert overwrite local directory 'linux存储该数据路径' select语句;

注意:  overwrite默认是覆盖重写,所以在指定存储该数据路径的时候尽量指定一个空的目录

注意: 导出数据的时候不指定分隔符采用默认分隔符SOH,0001,?...

导出数据指定分隔符添加: row format delimited fields terminated by '分隔符'
1
2
3
4
5
6
7
8
9

insert导出到hdfs

-- 演示insert overwrite导出数据到文件
-- 语法:  insert overwrite [local] directory 文件存储路径 [指定分隔符] select语句;
-- 导出数据到hfds
-- 注意: 如果是根目录/,会自动创建-ext-10000目录存储生成的000000_0文件
-- 但是其他目录,会自动清空所有内容,再生成一个000000_0文件,所以注意导出目录尽量是一个新的空目录
-- 默认分隔符
insert overwrite  directory '/source' select * from search_log1;
-- 指定分隔符
insert overwrite  directory '/output'
    row format delimited fields terminated by ','
select * from search_log1;
1
2
3
4
5
6
7
8
9
10
11

insert导出linux

-- 2.2导出到linux
-- [root@node1 ~]# mkdir /output
-- 导出到linux的/output目录下,自动生成000000_0文件存储查询结果
-- 默认分隔符
insert overwrite local directory '/output' select * from search_log1;
-- 指定分隔符
insert overwrite local directory '/output'
    row format delimited fields terminated by ','
select * from search_log1;
1
2
3
4
5
6
7
8
9

3.hive_shell命令

hive命令执行sql语句:  hive -e "sql语句" > 存储该结果数据的文件路径

hive命令执行sql脚本:  hive -f sql脚本文件 > 存储该结果数据的文件路径
1
2
3

hql语句导出

# 以下命令都是在linux的shell命令行执行
# 3.1使用hive -e sql语句方式导出数据
[root@node1 ~]# hive -e 'select * from hive02.search_log;' > /home/hs1.txt
[root@node1 ~]# cat hs1.txt
1
2
3
4

hql脚本导出

# 3.2使用hive -f 脚本文件方式导出数据
[root@node1 ~]# echo 'select * from hive02.search_log;' > /home/export.sql
[root@node1 ~]# hive -f export.sql > /home/hs2.txt
[root@node1 ~]# cat hs2.txt
1
2
3
4

分区表[预习]

特点: 需要产生分区目录, 查询的时候使用分区字段筛选数据,避免全表扫描从而提升查询效率

注意: 如果是分区表,在查询数据的时候没有使用分区字段去筛选数据,效率不变

一级分区

知识点:

创建分区表: create [external] table [if not exists] 表名(字段名 字段类型 , 字段名 字段类型 , ... )partitioned by (分区字段名 分区字段类型) ; 

自动生成分区目录并插入数据: load data [local] inpath '文件路径' into table 分区表名 partition (分区字段名='值');
1
2
3

示例:

-- 1.演示一级分区
/*创建分区表: create [external] table [if not exists] 表名(
字段名 字段类型 , 字段名 字段类型 , ...
)partitioned by (分区字段名 分区字段类型) ; */
-- 建表
create table score(
    name string,
    subject string,
    grade int
)partitioned by (dt string)
row format delimited
fields terminated by '\t';

-- 加载数据(提前把score.txt上传到hdfs根目录下)
-- 注意: 数据文件需要放到分区目录中,才能被识别
load data inpath '/score.txt' into table score partition (dt='2022');
load data inpath '/score.txt' into table score partition (dt='2023');
load data inpath '/score.txt' into table score partition (dt='2024');
-- 查询数据
select * from score;

-- 需求: 查询2023年的数据
-- 注意: 如果查询数据的时候使用了分区字段,效率提升
select * from score where dt='2023'; --底层直接找所有2023目录下数据

-- 需求: 查询周杰轮的所有信息
-- 注意: 如果查询数据的时候没有使用分区字段,效率不变
select * from score where name='周杰轮'; -- 全表扫描
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

相关阅读:
2022.7.26 模拟赛
 《前端》html总结
 [汇编实操]DOSBox工具: unable to open input file: 文件名.asm问题解决
 请求方式 ajax 原理
 prometheus/grafana监控数据收集与展示——k8s从入门到高并发系列教程（九）
bug是测不完的，根本测不完
 数据结构与算法基础-学习-35-各排序算法效率总结
 面试高频问题----2
用whl文件安装Anaconda中的GDAL
233062C++&QTday5
原文地址：https://blog.csdn.net/weixin_46010015/article/details/134374571