create database if not exists myhive;
use myhive;
desc database myhive;
/user/hive/warehouse
内# 使用location关键字,指定数据库在HDFS的存储路径
create database myhive2 location '/myhive2 ';
drop database myhive;
drop database myhive2cascade;
CREATE [EXTERNAL] TABLE[IF NOT EXISTS] tabLe_name
[(col_namedata_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [CoMMENTcol_comment], ...)]
[CLUSTERED BY (col_name,col_name,...)
[SORTED BY (col_name[ASC|DESC],...)] INTO num_buckets BUCKETS]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION hdfs_path]
内部表(Managed Table) | 外部表(External Table) | |
---|---|---|
创建语法 | CREATE TABLE table_name … | CREATE EXTERNAL TABLE table_name … LOCATION … |
存储位置 | 由Hive管理,存储在Hive默认文件系统中 | 可在任何位置,通过LOCATION关键字指定 |
元数据和数据 | Hive管理和控制元数据和数据 | 仅Hive管理元数据,不控制实际数据 |
删除表时的行为 | 删除表会同时删除元数据和存储的数据 | 仅删除表的元数据,不删除实际数据 |
适用场景 | 数据集完全由Hive管理和控制的情况 | 与其他系统共享数据,引入已有数据的场景 |
与其他工具的共享性 | 不适合与其他工具共享数据 | 可以随意临时连接到外部数据上 |
CREATE TABLE table_name ...
create database if not exists myhive;
use myhive;
create table if not exists stu(id int, name string);
insert into stu values ( 1,"zhangsan"),(2, "wangwu");
select *from stu;
hadoop fs -ls /user/hive/warehouse/myhive.db/stu
hadoop fs -cat /user/hive/warehouse/myhive.db/stu/*
CREATE TABLE table_name as
-- 示例
create table stu3 as select * from stu2;
CREATE TABLE table_namelike
-- 示例
create table stu4 like stu2;
DESC FORMATTED Stu2;
create table if not exists stu2(id int ,name string) row format delimited fields terminated by '\t' ;
row format delimited fields terminated by '\t' ;
表示使用\t
分隔drop table table_name;
CREATE EXTERNAL TABLE table_name ... LOCATION ...
EXTERNAL
关键字修饰,从概念是被认为并非Hive拥有的表,只是临时关联数据去使用。\t
分隔:1 hello
2 world
3 hadoop
LOCATION
目录
hadoop fs -ls /tmp
,确认不存在/tmp/test_ext1
目录create external table test_ext1(id int,name string) row format delimited fields terminated by '\t' location ' /tmp/test_ext1';
select * from test_ext1
hadoop fs -put test_external.txt /tmp/test_ext1. select * from test_ext1
hadoop fs -mkdir /tmp/test_ext2
hadoop fs -put test_external.txt /tmp/test_ext2/
create external table test_ext2(id int,name string) row format delimited fieldsterminated by '\t' location '/tmp/test_ext2';
select * from test_ext2;
DROP TABLE table_name;
查看表类型: desc formatted table_name;
Hive可以很简单的通过SQL语句转换内外部表。
内部表转外部表
alter table table_name set tblproperties('EXTERNAL'='TRUE');
外部表转内部表
alter table table_name set tblproperties('EXTERNAL'='FALSE');
要注意:(‘EXTERNAL’=‘FALSE’)或(‘EXTERNAL’=‘TRUE’)为固定写法,区分大小写! ! !
load data local inpath '/home/hadoop/search_log.txt' into table myhive.test_load;
load data inpath '/tmp/search_log.txt' overwrite into table myhive.test_load;
INSERT [OVERWRITE | INTO] TABLE tablename1 [PARTITION (partcol=vall, partcol2=val2 ...) [TF NOTEXISTS]] select_statement1 FROM from_statement;
insert overwrite [local] directory 'path' select_statement1 FROM from_statement;
将hive表中的数据导出到其他任意目录,例如linux本地磁盘,例如hdfs,例如mysq|等等
将查询的结果导出到本地-使用默认列分隔符
insert overwrite local directory '/home/hadoop/export1' select * from test_load ;
insert overwrite local directory '/home/hadoop/export2' row format delimited fields terminated by '\t' select * from test_load;
insert overwrite directory '/tmp/export' row format delimited fields terminated by '\t' select * from test_load;
bin/hive -e "select * from myhive.test_load;" > /home/hadoop/export3/export4.txt
bin/hive -f export.sql > /home/hadoop/export4/export4.txt
CREATE TABLE table_name (
column1 data_type,
column2 data_type,
...
)
PARTITIONED BY (partition_column1 data_type, partition_column2 data_type, ...);
create tables core(
sid string,
cid string,
sscore int
)
partitioned by(month string)
row format delimited fields terminated by't';
create table score2 (sid string,c_id string,sscore int)
partitioned by(year string,month string,day string)
row format delimited fields terminated by'\t';
load data local inpath '/export/server/hivedatas/score.txt' into table score
partition (month='202006');
load data local inpath '/export/server/hivedatas/score.txt' into table score2
partition(year='2020',month='06',day='01');
INSERT INTO TABLE table_name PARTITION (partition_column1 = value1, partition_column2 = value2, ...)
VALUES (value1, value2, ...);
INSERT INTO TABLE sales PARTITION (year = 2023, month = 9)
VALUES (1, 'Product A', '2023-09-08', 100.0);
show partitions score;
alter table score add partition(month='202005')
alter table score add partition(month='202004') partition(month='202003');
ALTER TABLE table_name PARTITION (partition_column1 = value1, partition_column2 = value2, ...)
SET LOCATION '/new/partition/location';
alter table table_name partition(month='2002005') rename to partition(month='201105')
alter table table_name drop partition(month='202006');
set hive.enforce.bucketing=true;
create table course (
c_id string,
c_name string,
t_id string
) clustered by(c_id) into 3 buckets
row format delimited fields terminated by '\t';
create table course_common (
c_id string,
c_name string,
t_id string
) rowformat delimited fields terminated by 't';
load data local inpath '/export/server/hivedatas/course.txt' into table course_common;
insert overwrite
给桶表中加载数据insert overwrite table course select * from course_common cluster by(cid);
如果没有分桶设置,插入(加载)数据只是简单的将数据放入到:
表的数据存储文件夹中(没有分区)
表指定分区的文件夹中(带有分区)
一旦有了分桶设置,比如分桶数量为3,那么,表内文件或分区内数据文件的数量就限定为3当数据插入的时候,需要一分为3,进入三个桶文件内。
问题:如何将数据分成三份,划分的规则是什么?
数据的三份划分基于分桶列的值进行hash取模来决定。由于load data不会触发MapReduce,也就是没有计算过程(无法执行Hash算法),只是简单的移动数据而已,所以无法用于分桶表数据插入。
alter table old_table_namerename to new_table_name;
ALTER TABLE table_name SET TBLPROPERTIES table_properties;
table_properties:(property_name=property_value,property_name=property_value,...)
ALTER TABLE table_name SETTBLPROPERTIES("EXTERNAL"="TRUE")
修改内外部表属性ALTER TABLE table_name SETTBLPROPERTIES('comment'=new_comment
修改表注释alter table table_name add columns(v1 int,v2 string);
alter table table_name change v1 v1new int;
drop table table_name;
-- 只能清空内部表
truncate table table_name;
Hive支持的数据类型很多,除了基本的: int、string、varchar、timestamp等还有一些复杂的数据类型:array(数组类型)、map(映射类型)、struct(结构类型)
data_for_array_type.txt文件内容如下
zhangsan beijing,shanghai,tianjin,hangzhou
wangwu changchun,chengdu,wuhan,beijin
建表语句
create table test_array(
name string,
work_locations array<string>)
row format delimited fields terminated by '\t'
COLLECTION ITEMS TERMINATED BY ',';
row format delimited fields terminated by '\t′
表示列分隔符是\t
.COLLECTION ITEMS TERMINATED BY ',’
表示集合(array)元素的分隔符是逗号导入数据
load data local inpath '/home/hadoop/data_for_array_type.txt' overwrite into table itheima.test_array;
常用array类型查询:
-- 查询所有数据
select * from test_array;
-- 查询loction数组中第一个元素
select name, work_locations[0] location from test_array;
-- 查询location数组中元素的个数
select name, size(work_locations) location from test_array;
-- 查询location数组中包含tianjin的信息
select * from test_array where array_contains(work_locations,'tianjin');
1,林杰均,father:林大明#mother:小甜甜#brother:小甜,28
2,周杰伦,father:马小云#mother:黄大奕#brother:小天,22
3,王葱,father:王林#mother:如花#sister:潇潇,29
4,马大云,father:周街轮#mother:美美,26
create table test_map(
id int,
name string,
members map<string,string>,
age int)
row format delimited fields terminated by ','
COLLECTION ITEMS TERMINATED BY '#'
MAP KEYS TERMINATED BY ':';
MAP KEYS TERMINATED BY
表示key-value之间用:
分隔load data local inpath '/home/hadoop/data_for_map_type.txt' overwrite into table test_map;
--查询全部
select * from test_map;
--查询father、mother这两个map的key
select id, name, members["father"] father, members["mother"] mother, age from test_map;
--查询全部map的key,使用map_keys函数,结果是array类型
select id, name, map_keys(members) as relation from test_map;
--查询全部map的value,使用mapvalues函数,结果是array类型
select id, name, map_values(members) as relation from test_map;
--查询map类型的KV对数量
select id,name,size(members) num from test_map;
--查询map的key中有brother的数据
select * from test_map where array_contains(map_keys(members), 'brother');
struct类型是一个复合类型,可以在一个列中存入多个子列,每个子列允许设置类型和名称。
有如下数据文件,说明:字段之间#分割,struct之间冒号分割
1#周杰轮:11
2#林均杰:16
3#刘德滑:21
4#张学油:26
5#蔡依临:23
create table test_struct(
id string,
info struct<name:string,age:int>
)
row format delimited fields terminated by '#'
COLLECTION ITEMS TERMINATED BY ':';
load data local inpath '/home/hadoop/data_for_struct_type.txt' into table test_struct;
-- 查询全部
select * from test_struct;
-- 直接使用列名。子列名即可从struct中取出子列查询
select id, info.name from test_struct;