目录





脚本中编写如下内容
- #!/bin/bash
-
- # 定义变量方便修改
- APP=gmall
-
- # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
- if [ -n "$1" ] ;then
- do_date=$1
- else
- do_date=`date -d "-1 day" +%F`
- fi
-
- echo ================== 日志日期为 $do_date ==================
- sql="
- load data inpath '/origin_data/$APP/log/topic_log/$do_date' into table ${APP}.ods_log partition(dt='$do_date');
- "
-
- hive -e "$sql"
-
- hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /warehouse/$APP/ods/ods_log/dt=$do_date



DIM层储存的基本都为维度表,且基本都是进行全量同步操作
- DROP TABLE IF EXISTS dim_sku_info;
- CREATE EXTERNAL TABLE dim_sku_info (
- `id` STRING COMMENT '商品id',
- `price` DECIMAL(16,2) COMMENT '商品价格',
- `sku_name` STRING COMMENT '商品名称',
- `sku_desc` STRING COMMENT '商品描述',
- `weight` DECIMAL(16,2) COMMENT '重量',
- `is_sale` BOOLEAN COMMENT '是否在售',
- `spu_id` STRING COMMENT 'spu编号',
- `spu_name` STRING COMMENT 'spu名称',
- `category3_id` STRING COMMENT '三级分类id',
- `category3_name` STRING COMMENT '三级分类名称',
- `category2_id` STRING COMMENT '二级分类id',
- `category2_name` STRING COMMENT '二级分类名称',
- `category1_id` STRING COMMENT '一级分类id',
- `category1_name` STRING COMMENT '一级分类名称',
- `tm_id` STRING COMMENT '品牌id',
- `tm_name` STRING COMMENT '品牌名称',
- `sku_attr_values` ARRAY<STRUCT<attr_id:STRING,value_id:STRING,attr_name:STRING,value_name:STRING>> COMMENT '平台属性',
- `sku_sale_attr_values` ARRAY<STRUCT<sale_attr_id:STRING,sale_attr_value_id:STRING,sale_attr_name:STRING,sale_attr_value_name:STRING>> COMMENT '销售属性',
- `create_time` STRING COMMENT '创建时间'
- ) COMMENT '商品维度表'
- PARTITIONED BY (`dt` STRING)
- STORED AS PARQUET
- LOCATION '/warehouse/gmall/dim/dim_sku_info/'
- TBLPROPERTIES ("parquet.compression"="lzo");
这里的平台属性表由于每个商品的平台属性不一样,不会因为某个商品的特殊属性去增加一个字段。因此这里将商品的平台属性设为一个结构体,把所有的关于该商品的平台属性放到结构体中并最后由一个数组保存。



首日装载:
- with
- sku as
- (
- select
- id,
- price,
- sku_name,
- sku_desc,
- weight,
- is_sale,
- spu_id,
- category3_id,
- tm_id,
- create_time
- from ods_sku_info
- where dt='2020-06-14'
- ),
- spu as
- (
- select
- id,
- spu_name
- from ods_spu_info
- where dt='2020-06-14'
- ),
- c3 as
- (
- select
- id,
- name,
- category2_id
- from ods_base_category3
- where dt='2020-06-14'
- ),
- c2 as
- (
- select
- id,
- name,
- category1_id
- from ods_base_category2
- where dt='2020-06-14'
- ),
- c1 as
- (
- select
- id,
- name
- from ods_base_category1
- where dt='2020-06-14'
- ),
- tm as
- (
- select
- id,
- tm_name
- from ods_base_trademark
- where dt='2020-06-14'
- ),
- attr as
- (
- select
- sku_id,
- collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
- from ods_sku_attr_value
- where dt='2020-06-14'
- group by sku_id
- ),
- sale_attr as
- (
- select
- sku_id,
- collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
- from ods_sku_sale_attr_value
- where dt='2020-06-14'
- group by sku_id
- )
- insert overwrite table dim_sku_info partition(dt='2020-06-14')
- select
- sku.id,
- sku.price,
- sku.sku_name,
- sku.sku_desc,
- sku.weight,
- sku.is_sale,
- sku.spu_id,
- spu.spu_name,
- sku.category3_id,
- c3.name,
- c3.category2_id,
- c2.name,
- c2.category1_id,
- c1.name,
- sku.tm_id,
- tm.tm_name,
- attr.attrs,
- sale_attr.sale_attrs,
- sku.create_time
- from sku
- left join spu on sku.spu_id=spu.id
- left join c3 on sku.category3_id=c3.id
- left join c2 on c3.category2_id=c2.id
- left join c1 on c2.category1_id=c1.id
- left join tm on sku.tm_id=tm.id
- left join attr on sku.id=attr.sku_id
- left join sale_attr on sku.id=sale_attr.sku_id;
每日装载:
- with
- sku as
- (
- select
- id,
- price,
- sku_name,
- sku_desc,
- weight,
- is_sale,
- spu_id,
- category3_id,
- tm_id,
- create_time
- from ods_sku_info
- where dt='2020-06-15'
- ),
- spu as
- (
- select
- id,
- spu_name
- from ods_spu_info
- where dt='2020-06-15'
- ),
- c3 as
- (
- select
- id,
- name,
- category2_id
- from ods_base_category3
- where dt='2020-06-15'
- ),
- c2 as
- (
- select
- id,
- name,
- category1_id
- from ods_base_category2
- where dt='2020-06-15'
- ),
- c1 as
- (
- select
- id,
- name
- from ods_base_category1
- where dt='2020-06-15'
- ),
- tm as
- (
- select
- id,
- tm_name
- from ods_base_trademark
- where dt='2020-06-15'
- ),
- attr as
- (
- select
- sku_id,
- collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
- from ods_sku_attr_value
- where dt='2020-06-15'
- group by sku_id
- ),
- sale_attr as
- (
- select
- sku_id,
- collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
- from ods_sku_sale_attr_value
- where dt='2020-06-15'
- group by sku_id
- )
- insert overwrite table dim_sku_info partition(dt='2020-06-15')
- select
- sku.id,
- sku.price,
- sku.sku_name,
- sku.sku_desc,
- sku.weight,
- sku.is_sale,
- sku.spu_id,
- spu.spu_name,
- sku.category3_id,
- c3.name,
- c3.category2_id,
- c2.name,
- c2.category1_id,
- c1.name,
- sku.tm_id,
- tm.tm_name,
- attr.attrs,
- sale_attr.sale_attrs,
- sku.create_time
- from sku
- left join spu on sku.spu_id=spu.id
- left join c3 on sku.category3_id=c3.id
- left join c2 on c3.category2_id=c2.id
- left join c1 on c2.category1_id=c1.id
- left join tm on sku.tm_id=tm.id
- left join attr on sku.id=attr.sku_id
- left join sale_attr on sku.id=sale_attr.sku_id;
- DROP TABLE IF EXISTS dim_coupon_info;
- CREATE EXTERNAL TABLE dim_coupon_info(
- `id` STRING COMMENT '购物券编号',
- `coupon_name` STRING COMMENT '购物券名称',
- `coupon_type` STRING COMMENT '购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券',
- `condition_amount` DECIMAL(16,2) COMMENT '满额数',
- `condition_num` BIGINT COMMENT '满件数',
- `activity_id` STRING COMMENT '活动编号',
- `benefit_amount` DECIMAL(16,2) COMMENT '减金额',
- `benefit_discount` DECIMAL(16,2) COMMENT '折扣',
- `create_time` STRING COMMENT '创建时间',
- `range_type` STRING COMMENT '范围类型 1、商品 2、品类 3、品牌',
- `limit_num` BIGINT COMMENT '最多领取次数',
- `taken_count` BIGINT COMMENT '已领取次数',
- `start_time` STRING COMMENT '可以领取的开始日期',
- `end_time` STRING COMMENT '可以领取的结束日期',
- `operate_time` STRING COMMENT '修改时间',
- `expire_time` STRING COMMENT '过期时间'
- ) COMMENT '优惠券维度表'
- PARTITIONED BY (`dt` STRING)
- STORED AS PARQUET
- LOCATION '/warehouse/gmall/dim/dim_coupon_info/'
- TBLPROPERTIES ("parquet.compression"="lzo");


首日装载:
这里不需要和别的表进行join操作,因此只需要select进行操作即可,这里看上去没有进行其他操作,但是数据的存储格式从ODS层的lzo压缩文件变为了DIM层的列式存储加压缩的格式。
- insert overwrite table dim_coupon_info partition(dt='2020-06-14')
- select
- id,
- coupon_name,
- coupon_type,
- condition_amount,
- condition_num,
- activity_id,
- benefit_amount,
- benefit_discount,
- create_time,
- range_type,
- limit_num,
- taken_count,
- start_time,
- end_time,
- operate_time,
- expire_time
- from ods_coupon_info
- where dt='2020-06-14';
每日装载:
- insert overwrite table dim_coupon_info partition(dt='2020-06-15')
- select
- id,
- coupon_name,
- coupon_type,
- condition_amount,
- condition_num,
- activity_id,
- benefit_amount,
- benefit_discount,
- create_time,
- range_type,
- limit_num,
- taken_count,
- start_time,
- end_time,
- operate_time,
- expire_time
- from ods_coupon_info
- where dt='2020-06-15';
- DROP TABLE IF EXISTS dim_activity_rule_info;
- CREATE EXTERNAL TABLE dim_activity_rule_info(
- `activity_rule_id` STRING COMMENT '活动规则ID',
- `activity_id` STRING COMMENT '活动ID',
- `activity_name` STRING COMMENT '活动名称',
- `activity_type` STRING COMMENT '活动类型',
- `start_time` STRING COMMENT '开始时间',
- `end_time` STRING COMMENT '结束时间',
- `create_time` STRING COMMENT '创建时间',
- `condition_amount` DECIMAL(16,2) COMMENT '满减金额',
- `condition_num` BIGINT COMMENT '满减件数',
- `benefit_amount` DECIMAL(16,2) COMMENT '优惠金额',
- `benefit_discount` DECIMAL(16,2) COMMENT '优惠折扣',
- `benefit_level` STRING COMMENT '优惠级别'
- ) COMMENT '活动信息表'
- PARTITIONED BY (`dt` STRING)
- STORED AS PARQUET
- LOCATION '/warehouse/gmall/dim/dim_activity_rule_info/'
- TBLPROPERTIES ("parquet.compression"="lzo");


首日装载:
- insert overwrite table dim_activity_rule_info partition(dt='2020-06-14')
- select
- ar.id,
- ar.activity_id,
- ai.activity_name,
- ar.activity_type,
- ai.start_time,
- ai.end_time,
- ai.create_time,
- ar.condition_amount,
- ar.condition_num,
- ar.benefit_amount,
- ar.benefit_discount,
- ar.benefit_level
- from
- (
- select
- id,
- activity_id,
- activity_type,
- condition_amount,
- condition_num,
- benefit_amount,
- benefit_discount,
- benefit_level
- from ods_activity_rule
- where dt='2020-06-14'
- )ar
- left join
- (
- select
- id,
- activity_name,
- start_time,
- end_time,
- create_time
- from ods_activity_info
- where dt='2020-06-14'
- )ai
- on ar.activity_id=ai.id;
每日装载:
- insert overwrite table dim_activity_rule_info partition(dt='2020-06-15')
- select
- ar.id,
- ar.activity_id,
- ai.activity_name,
- ar.activity_type,
- ai.start_time,
- ai.end_time,
- ai.create_time,
- ar.condition_amount,
- ar.condition_num,
- ar.benefit_amount,
- ar.benefit_discount,
- ar.benefit_level
- from
- (
- select
- id,
- activity_id,
- activity_type,
- condition_amount,
- condition_num,
- benefit_amount,
- benefit_discount,
- benefit_level
- from ods_activity_rule
- where dt='2020-06-15'
- )ar
- left join
- (
- select
- id,
- activity_name,
- start_time,
- end_time,
- create_time
- from ods_activity_info
- where dt='2020-06-15'
- )ai
- on ar.activity_id=ai.id;
地区维度表一般不会改变,因此不需要每天都进行同步,因此就不需要进行分区
- DROP TABLE IF EXISTS dim_base_province;
- CREATE EXTERNAL TABLE dim_base_province (
- `id` STRING COMMENT 'id',
- `province_name` STRING COMMENT '省市名称',
- `area_code` STRING COMMENT '地区编码',
- `iso_code` STRING COMMENT 'ISO-3166编码,供可视化使用',
- `iso_3166_2` STRING COMMENT 'IOS-3166-2编码,供可视化使用',
- `region_id` STRING COMMENT '地区id',
- `region_name` STRING COMMENT '地区名称'
- ) COMMENT '地区维度表'
- STORED AS PARQUET
- LOCATION '/warehouse/gmall/dim/dim_base_province/'
- TBLPROPERTIES ("parquet.compression"="lzo");
地区维度表数据相对稳定,变化概率较低,故无需每日装载。

- insert overwrite table dim_base_province
- select
- bp.id,
- bp.name,
- bp.area_code,
- bp.iso_code,
- bp.iso_3166_2,
- bp.region_id,
- br.region_name
- from ods_base_province bp
- join ods_base_region br on bp.region_id = br.id;
- DROP TABLE IF EXISTS dim_date_info;
- CREATE EXTERNAL TABLE dim_date_info(
- `date_id` STRING COMMENT '日',
- `week_id` STRING COMMENT '周ID',
- `week_day` STRING COMMENT '周几',
- `day` STRING COMMENT '每月的第几天',
- `month` STRING COMMENT '第几月',
- `quarter` STRING COMMENT '第几季度',
- `year` STRING COMMENT '年',
- `is_workday` STRING COMMENT '是否是工作日',
- `holiday_id` STRING COMMENT '节假日'
- ) COMMENT '时间维度表'
- STORED AS PARQUET
- LOCATION '/warehouse/gmall/dim/dim_date_info/'
- TBLPROPERTIES ("parquet.compression"="lzo");
通常情况下,时间维度表的数据并不是来自于业务系统,而是手动写入,并且由于时间维度表数据的可预见性,无须每日导入,一般可一次性导入一年的数据。

日期数据文件是文本文件,而表数据是lzo压缩和parquet存储,因此直接load进表中是无法识别的。这里建一个临时表,不使用parquet存储和lzo压缩,将日期的文本文件先load该表中,然后再将该临时表中选择数据,去insert到时间维度表




建表语句:
- DROP TABLE IF EXISTS dim_user_info;
- CREATE EXTERNAL TABLE dim_user_info(
- `id` STRING COMMENT '用户id',
- `login_name` STRING COMMENT '用户名称',
- `nick_name` STRING COMMENT '用户昵称',
- `name` STRING COMMENT '用户姓名',
- `phone_num` STRING COMMENT '手机号码',
- `email` STRING COMMENT '邮箱',
- `user_level` STRING COMMENT '用户等级',
- `birthday` STRING COMMENT '生日',
- `gender` STRING COMMENT '性别',
- `create_time` STRING COMMENT '创建时间',
- `operate_time` STRING COMMENT '操作时间',
- `start_date` STRING COMMENT '开始日期',
- `end_date` STRING COMMENT '结束日期'
- ) COMMENT '用户表'
- PARTITIONED BY (`dt` STRING)
- STORED AS PARQUET
- LOCATION '/warehouse/gmall/dim/dim_user_info/'
- TBLPROPERTIES ("parquet.compression"="lzo");
分区规划:

数据装载:
首日装载:
拉链表首日装载,需要进行初始化操作,具体工作为将截止到初始化当日的全部历史用户导入一次性导入到拉链表中。目前的ods_user_info表的第一个分区,即2020-06-14分区中就是全部的历史用户,故将该分区数据进行一定处理后导入拉链表的9999-99-99分区即可。
- insert overwrite table dim_user_info partition(dt='9999-99-99')
- select
- id,
- login_name,
- nick_name,
- md5(name),
- md5(phone_num),
- md5(email),
- user_level,
- birthday,
- gender,
- create_time,
- operate_time,
- '2020-06-14',
- '9999-99-99'
- from ods_user_info
- where dt='2020-06-14';
每日装载sql编写:

- with
- tmp as
- (
- select
- old.id old_id,
- old.login_name old_login_name,
- old.nick_name old_nick_name,
- old.name old_name,
- old.phone_num old_phone_num,
- old.email old_email,
- old.user_level old_user_level,
- old.birthday old_birthday,
- old.gender old_gender,
- old.create_time old_create_time,
- old.operate_time old_operate_time,
- old.start_date old_start_date,
- old.end_date old_end_date,
- new.id new_id,
- new.login_name new_login_name,
- new.nick_name new_nick_name,
- new.name new_name,
- new.phone_num new_phone_num,
- new.email new_email,
- new.user_level new_user_level,
- new.birthday new_birthday,
- new.gender new_gender,
- new.create_time new_create_time,
- new.operate_time new_operate_time,
- new.start_date new_start_date,
- new.end_date new_end_date
- from
- (
- select
- id,
- login_name,
- nick_name,
- name,
- phone_num,
- email,
- user_level,
- birthday,
- gender,
- create_time,
- operate_time,
- start_date,
- end_date
- from dim_user_info
- where dt='9999-99-99'
- )old
- full outer join
- (
- select
- id,
- login_name,
- nick_name,
- md5(name) name,
- md5(phone_num) phone_num,
- md5(email) email,
- user_level,
- birthday,
- gender,
- create_time,
- operate_time,
- '2020-06-15' start_date,
- '9999-99-99' end_date
- from ods_user_info
- where dt='2020-06-15'
- )new
- on old.id=new.id
- )
- insert overwrite table dim_user_info partition(dt)
- select
- nvl(new_id,old_id),
- nvl(new_login_name,old_login_name),
- nvl(new_nick_name,old_nick_name),
- nvl(new_name,old_name),
- nvl(new_phone_num,old_phone_num),
- nvl(new_email,old_email),
- nvl(new_user_level,old_user_level),
- nvl(new_birthday,old_birthday),
- nvl(new_gender,old_gender),
- nvl(new_create_time,old_create_time),
- nvl(new_operate_time,old_operate_time),
- nvl(new_start_date,old_start_date),
- nvl(new_end_date,old_end_date),
- nvl(new_end_date,old_end_date) dt
- from tmp
- union all
- select
- old_id,
- old_login_name,
- old_nick_name,
- old_name,
- old_phone_num,
- old_email,
- old_user_level,
- old_birthday,
- old_gender,
- old_create_time,
- old_operate_time,
- old_start_date,
- cast(date_add('2020-06-15',-1) as string),
- cast(date_add('2020-06-15',-1) as string) dt
- from tmp
- where new_id is not null and old_id is not null;
- #!/bin/bash
-
- APP=gmall
-
- if [ -n "$2" ] ;then
- do_date=$2
- else
- echo "请传入日期参数"
- exit
- fi
-
- dim_user_info="
- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
- insert overwrite table ${APP}.dim_user_info partition(dt='9999-99-99')
- select
- id,
- login_name,
- nick_name,
- md5(name),
- md5(phone_num),
- md5(email),
- user_level,
- birthday,
- gender,
- create_time,
- operate_time,
- '$do_date',
- '9999-99-99'
- from ${APP}.ods_user_info
- where dt='$do_date';
- "
-
- dim_sku_info="
- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
- with
- sku as
- (
- select
- id,
- price,
- sku_name,
- sku_desc,
- weight,
- is_sale,
- spu_id,
- category3_id,
- tm_id,
- create_time
- from ${APP}.ods_sku_info
- where dt='$do_date'
- ),
- spu as
- (
- select
- id,
- spu_name
- from ${APP}.ods_spu_info
- where dt='$do_date'
- ),
- c3 as
- (
- select
- id,
- name,
- category2_id
- from ${APP}.ods_base_category3
- where dt='$do_date'
- ),
- c2 as
- (
- select
- id,
- name,
- category1_id
- from ${APP}.ods_base_category2
- where dt='$do_date'
- ),
- c1 as
- (
- select
- id,
- name
- from ${APP}.ods_base_category1
- where dt='$do_date'
- ),
- tm as
- (
- select
- id,
- tm_name
- from ${APP}.ods_base_trademark
- where dt='$do_date'
- ),
- attr as
- (
- select
- sku_id,
- collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
- from ${APP}.ods_sku_attr_value
- where dt='$do_date'
- group by sku_id
- ),
- sale_attr as
- (
- select
- sku_id,
- collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
- from ${APP}.ods_sku_sale_attr_value
- where dt='$do_date'
- group by sku_id
- )
- insert overwrite table ${APP}.dim_sku_info partition(dt='$do_date')
- select
- sku.id,
- sku.price,
- sku.sku_name,
- sku.sku_desc,
- sku.weight,
- sku.is_sale,
- sku.spu_id,
- spu.spu_name,
- sku.category3_id,
- c3.name,
- c3.category2_id,
- c2.name,
- c2.category1_id,
- c1.name,
- sku.tm_id,
- tm.tm_name,
- attr.attrs,
- sale_attr.sale_attrs,
- sku.create_time
- from sku
- left join spu on sku.spu_id=spu.id
- left join c3 on sku.category3_id=c3.id
- left join c2 on c3.category2_id=c2.id
- left join c1 on c2.category1_id=c1.id
- left join tm on sku.tm_id=tm.id
- left join attr on sku.id=attr.sku_id
- left join sale_attr on sku.id=sale_attr.sku_id;
- "
-
- dim_base_province="
- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
- insert overwrite table ${APP}.dim_base_province
- select
- bp.id,
- bp.name,
- bp.area_code,
- bp.iso_code,
- bp.iso_3166_2,
- bp.region_id,
- br.region_name
- from ${APP}.ods_base_province bp
- join ${APP}.ods_base_region br on bp.region_id = br.id;
- "
-
- dim_coupon_info="
- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
- insert overwrite table ${APP}.dim_coupon_info partition(dt='$do_date')
- select
- id,
- coupon_name,
- coupon_type,
- condition_amount,
- condition_num,
- activity_id,
- benefit_amount,
- benefit_discount,
- create_time,
- range_type,
- limit_num,
- taken_count,
- start_time,
- end_time,
- operate_time,
- expire_time
- from ${APP}.ods_coupon_info
- where dt='$do_date';
- "
-
- dim_activity_rule_info="
- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
- insert overwrite table ${APP}.dim_activity_rule_info partition(dt='$do_date')
- select
- ar.id,
- ar.activity_id,
- ai.activity_name,
- ar.activity_type,
- ai.start_time,
- ai.end_time,
- ai.create_time,
- ar.condition_amount,
- ar.condition_num,
- ar.benefit_amount,
- ar.benefit_discount,
- ar.benefit_level
- from
- (
- select
- id,
- activity_id,
- activity_type,
- condition_amount,
- condition_num,
- benefit_amount,
- benefit_discount,
- benefit_level
- from ${APP}.ods_activity_rule
- where dt='$do_date'
- )ar
- left join
- (
- select
- id,
- activity_name,
- start_time,
- end_time,
- create_time
- from ${APP}.ods_activity_info
- where dt='$do_date'
- )ai
- on ar.activity_id=ai.id;
- "
-
- case $1 in
- "dim_user_info"){
- hive -e "$dim_user_info"
- };;
- "dim_sku_info"){
- hive -e "$dim_sku_info"
- };;
- "dim_base_province"){
- hive -e "$dim_base_province"
- };;
- "dim_coupon_info"){
- hive -e "$dim_coupon_info"
- };;
- "dim_activity_rule_info"){
- hive -e "$dim_activity_rule_info"
- };;
- "all"){
- hive -e "$dim_user_info$dim_sku_info$dim_coupon_info$dim_activity_rule_info$dim_base_province"
- };;
- esac
注意:该脚本不包含时间维度表的装载,时间维度表需要手动装载
- #!/bin/bash
-
- APP=gmall
-
- # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
- if [ -n "$2" ] ;then
- do_date=$2
- else
- do_date=`date -d "-1 day" +%F`
- fi
-
- dim_user_info="
- set hive.exec.dynamic.partition.mode=nonstrict;
- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
- with
- tmp as
- (
- select
- old.id old_id,
- old.login_name old_login_name,
- old.nick_name old_nick_name,
- old.name old_name,
- old.phone_num old_phone_num,
- old.email old_email,
- old.user_level old_user_level,
- old.birthday old_birthday,
- old.gender old_gender,
- old.create_time old_create_time,
- old.operate_time old_operate_time,
- old.start_date old_start_date,
- old.end_date old_end_date,
- new.id new_id,
- new.login_name new_login_name,
- new.nick_name new_nick_name,
- new.name new_name,
- new.phone_num new_phone_num,
- new.email new_email,
- new.user_level new_user_level,
- new.birthday new_birthday,
- new.gender new_gender,
- new.create_time new_create_time,
- new.operate_time new_operate_time,
- new.start_date new_start_date,
- new.end_date new_end_date
- from
- (
- select
- id,
- login_name,
- nick_name,
- name,
- phone_num,
- email,
- user_level,
- birthday,
- gender,
- create_time,
- operate_time,
- start_date,
- end_date
- from ${APP}.dim_user_info
- where dt='9999-99-99'
- and start_date<'$do_date'
- )old
- full outer join
- (
- select
- id,
- login_name,
- nick_name,
- md5(name) name,
- md5(phone_num) phone_num,
- md5(email) email,
- user_level,
- birthday,
- gender,
- create_time,
- operate_time,
- '$do_date' start_date,
- '9999-99-99' end_date
- from ${APP}.ods_user_info
- where dt='$do_date'
- )new
- on old.id=new.id
- )
- insert overwrite table ${APP}.dim_user_info partition(dt)
- select
- nvl(new_id,old_id),
- nvl(new_login_name,old_login_name),
- nvl(new_nick_name,old_nick_name),
- nvl(new_name,old_name),
- nvl(new_phone_num,old_phone_num),
- nvl(new_email,old_email),
- nvl(new_user_level,old_user_level),
- nvl(new_birthday,old_birthday),
- nvl(new_gender,old_gender),
- nvl(new_create_time,old_create_time),
- nvl(new_operate_time,old_operate_time),
- nvl(new_start_date,old_start_date),
- nvl(new_end_date,old_end_date),
- nvl(new_end_date,old_end_date) dt
- from tmp
- union all
- select
- old_id,
- old_login_name,
- old_nick_name,
- old_name,
- old_phone_num,
- old_email,
- old_user_level,
- old_birthday,
- old_gender,
- old_create_time,
- old_operate_time,
- old_start_date,
- cast(date_add('$do_date',-1) as string),
- cast(date_add('$do_date',-1) as string) dt
- from tmp
- where new_id is not null and old_id is not null;
- "
-
- dim_sku_info="
- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
- with
- sku as
- (
- select
- id,
- price,
- sku_name,
- sku_desc,
- weight,
- is_sale,
- spu_id,
- category3_id,
- tm_id,
- create_time
- from ${APP}.ods_sku_info
- where dt='$do_date'
- ),
- spu as
- (
- select
- id,
- spu_name
- from ${APP}.ods_spu_info
- where dt='$do_date'
- ),
- c3 as
- (
- select
- id,
- name,
- category2_id
- from ${APP}.ods_base_category3
- where dt='$do_date'
- ),
- c2 as
- (
- select
- id,
- name,
- category1_id
- from ${APP}.ods_base_category2
- where dt='$do_date'
- ),
- c1 as
- (
- select
- id,
- name
- from ${APP}.ods_base_category1
- where dt='$do_date'
- ),
- tm as
- (
- select
- id,
- tm_name
- from ${APP}.ods_base_trademark
- where dt='$do_date'
- ),
- attr as
- (
- select
- sku_id,
- collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
- from ${APP}.ods_sku_attr_value
- where dt='$do_date'
- group by sku_id
- ),
- sale_attr as
- (
- select
- sku_id,
- collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
- from ${APP}.ods_sku_sale_attr_value
- where dt='$do_date'
- group by sku_id
- )
- insert overwrite table ${APP}.dim_sku_info partition(dt='$do_date')
- select
- sku.id,
- sku.price,
- sku.sku_name,
- sku.sku_desc,
- sku.weight,
- sku.is_sale,
- sku.spu_id,
- spu.spu_name,
- sku.category3_id,
- c3.name,
- c3.category2_id,
- c2.name,
- c2.category1_id,
- c1.name,
- sku.tm_id,
- tm.tm_name,
- attr.attrs,
- sale_attr.sale_attrs,
- sku.create_time
- from sku
- left join spu on sku.spu_id=spu.id
- left join c3 on sku.category3_id=c3.id
- left join c2 on c3.category2_id=c2.id
- left join c1 on c2.category1_id=c1.id
- left join tm on sku.tm_id=tm.id
- left join attr on sku.id=attr.sku_id
- left join sale_attr on sku.id=sale_attr.sku_id;
- "
-
- dim_base_province="
- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
- insert overwrite table ${APP}.dim_base_province
- select
- bp.id,
- bp.name,
- bp.area_code,
- bp.iso_code,
- bp.iso_3166_2,
- bp.region_id,
- bp.name
- from ${APP}.ods_base_province bp
- join ${APP}.ods_base_region br on bp.region_id = br.id;
- "
-
- dim_coupon_info="
- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
- insert overwrite table ${APP}.dim_coupon_info partition(dt='$do_date')
- select
- id,
- coupon_name,
- coupon_type,
- condition_amount,
- condition_num,
- activity_id,
- benefit_amount,
- benefit_discount,
- create_time,
- range_type,
- limit_num,
- taken_count,
- start_time,
- end_time,
- operate_time,
- expire_time
- from ${APP}.ods_coupon_info
- where dt='$do_date';
- "
-
- dim_activity_rule_info="
- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
- insert overwrite table ${APP}.dim_activity_rule_info partition(dt='$do_date')
- select
- ar.id,
- ar.activity_id,
- ai.activity_name,
- ar.activity_type,
- ai.start_time,
- ai.end_time,
- ai.create_time,
- ar.condition_amount,
- ar.condition_num,
- ar.benefit_amount,
- ar.benefit_discount,
- ar.benefit_level
- from
- (
- select
- id,
- activity_id,
- activity_type,
- condition_amount,
- condition_num,
- benefit_amount,
- benefit_discount,
- benefit_level
- from ${APP}.ods_activity_rule
- where dt='$do_date'
- )ar
- left join
- (
- select
- id,
- activity_name,
- start_time,
- end_time,
- create_time
- from ${APP}.ods_activity_info
- where dt='$do_date'
- )ai
- on ar.activity_id=ai.id;
- "
-
- case $1 in
- "dim_user_info"){
- hive -e "$dim_user_info"
- };;
- "dim_sku_info"){
- hive -e "$dim_sku_info"
- };;
- "dim_base_province"){
- hive -e "$dim_base_province"
- };;
- "dim_coupon_info"){
- hive -e "$dim_coupon_info"
- };;
- "dim_activity_rule_info"){
- hive -e "$dim_activity_rule_info"
- };;
- "all"){
- hive -e "$dim_user_info$dim_sku_info$dim_coupon_info$dim_activity_rule_info"
- };;
- esac