• Data Warehouse Study Notes (4): Building the Warehouse (ODS and DIM Layers)


    Contents

    I. Building the Warehouse: the ODS Layer

    1. ODS Layer (User Behavior Data)

    1.1 Creating the Log Table ods_log

    1.2 Single Quotes vs. Double Quotes in the Shell

    1.3 ODS Layer Log Table Data Load Script

    2. ODS Layer (Business Data)

    II. Building the Warehouse: the DIM Layer

    1. Product Dimension Table (Full Sync)

    1.1 DDL

    1.2 Partition Planning

    1.3 Data Loading

    1.4 Hive Index File Reading Issue

    1.5 First-Day and Daily Loads

    2. Coupon Dimension Table (Full Sync)

    2.1 DDL

    2.2 Partition Planning

    2.3 Data Loading

    2.4 First-Day and Daily Loads

    3. Activity Dimension Table (Full Sync)

    3.1 DDL

    3.2 Partition Planning

    3.3 Data Loading

    3.4 First-Day and Daily Loads

    4. Region Dimension Table (Special)

    4.1 DDL

    4.2 Data Loading

    5. Date Dimension Table (Special)

    5.1 DDL

    5.2 Data Loading

    6. Zipper Table

    6.1 Zipper Table Overview

    6.2 Building the Zipper Table

    7. DIM Layer First-Day Data Load Script

    8. DIM Layer Daily Data Load Script


    I. Building the Warehouse: the ODS Layer

    1. ODS Layer (User Behavior Data)

    1.1 Creating the Log Table ods_log
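    The DDL for ods_log is not reproduced in this note. As a hedged sketch, a definition consistent with the load script below (the table name, dt partition, and warehouse path come from that script; storing each event as a single line of LZO-compressed text is my assumption) could look like:

    DROP TABLE IF EXISTS ods_log;
    CREATE EXTERNAL TABLE ods_log (`line` STRING COMMENT 'raw log line')
    PARTITIONED BY (`dt` STRING)
    STORED AS
        INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
        OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION '/warehouse/gmall/ods/ods_log';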

    1.2 Single Quotes vs. Double Quotes in the Shell
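    The original subsection is a screenshot; as a quick illustration of the rules it covers (single quotes suppress all expansion, double quotes allow $variable expansion, and backticks substitute a command's output), consider this small bash sketch:

    #!/bin/bash
    do_date=2020-06-14
    echo '$do_date'      # single quotes: prints the literal text $do_date
    echo "$do_date"      # double quotes: expands to 2020-06-14
    echo "'$do_date'"    # outer double quotes still expand: '2020-06-14'
    echo '"$do_date"'    # outer single quotes do not: "$do_date"
    echo `date`          # backticks execute the command and substitute its output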

    1.3 ODS Layer Log Table Data Load Script

    Write the following into the script:

    #!/bin/bash
    # define a variable so the database name is easy to change
    APP=gmall
    # if a date argument is given, use it; otherwise default to the day before today
    if [ -n "$1" ]; then
        do_date=$1
    else
        do_date=`date -d "-1 day" +%F`
    fi

    echo ================== log date: $do_date ==================
    sql="
    load data inpath '/origin_data/$APP/log/topic_log/$do_date' into table ${APP}.ods_log partition(dt='$do_date');
    "
    hive -e "$sql"
    hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /warehouse/$APP/ods/ods_log/dt=$do_date
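    Assuming the script is saved as hdfs_to_ods_log.sh (the file name is my assumption, not stated above) and given execute permission, it can be run for a specific day or left to default to yesterday:

    chmod +x hdfs_to_ods_log.sh
    ./hdfs_to_ods_log.sh 2020-06-14    # load the 2020-06-14 partition
    ./hdfs_to_ods_log.sh               # no argument: loads yesterday's data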

    2. ODS Layer (Business Data)

    II. Building the Warehouse: the DIM Layer

    The DIM layer holds almost exclusively dimension tables, and nearly all of them are loaded by full synchronization.

    1. Product Dimension Table (Full Sync)

    1.1 DDL

    DROP TABLE IF EXISTS dim_sku_info;
    CREATE EXTERNAL TABLE dim_sku_info (
        `id` STRING COMMENT 'product id',
        `price` DECIMAL(16,2) COMMENT 'price',
        `sku_name` STRING COMMENT 'product name',
        `sku_desc` STRING COMMENT 'product description',
        `weight` DECIMAL(16,2) COMMENT 'weight',
        `is_sale` BOOLEAN COMMENT 'on sale or not',
        `spu_id` STRING COMMENT 'spu id',
        `spu_name` STRING COMMENT 'spu name',
        `category3_id` STRING COMMENT 'level-3 category id',
        `category3_name` STRING COMMENT 'level-3 category name',
        `category2_id` STRING COMMENT 'level-2 category id',
        `category2_name` STRING COMMENT 'level-2 category name',
        `category1_id` STRING COMMENT 'level-1 category id',
        `category1_name` STRING COMMENT 'level-1 category name',
        `tm_id` STRING COMMENT 'brand id',
        `tm_name` STRING COMMENT 'brand name',
        `sku_attr_values` ARRAY<STRUCT<attr_id:STRING,value_id:STRING,attr_name:STRING,value_name:STRING>> COMMENT 'platform attributes',
        `sku_sale_attr_values` ARRAY<STRUCT<sale_attr_id:STRING,sale_attr_value_id:STRING,sale_attr_name:STRING,sale_attr_value_name:STRING>> COMMENT 'sale attributes',
        `create_time` STRING COMMENT 'create time'
    ) COMMENT 'product dimension table'
    PARTITIONED BY (`dt` STRING)
    STORED AS PARQUET
    LOCATION '/warehouse/gmall/dim/dim_sku_info/'
    TBLPROPERTIES ("parquet.compression"="lzo");

    Because every product carries a different set of platform attributes, we do not add a dedicated column for each product-specific attribute. Instead, each platform attribute is modeled as a struct, and all platform attributes of a product are collected into a single array of structs.
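    As an illustration (this query is my own sketch, not from the original post), downstream consumers can flatten the array-of-structs column with LATERAL VIEW explode and then read individual struct fields by name:

    select
        id,
        sku_name,
        attr.attr_name,    -- field access on the exploded struct
        attr.value_name
    from dim_sku_info
    lateral view explode(sku_attr_values) tmp as attr
    where dt = '2020-06-14';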

    1.2 Partition Planning

    1.3 Data Loading

    1.4 Hive Index File Reading Issue
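    The details of this subsection are not reproduced here. The symptom, as I understand it, is that Hive's default input format can mistake the .index files produced by DistributedLzoIndexer for data files and return extra garbage rows; the load scripts in sections 7 and 8 below guard against this by switching the input format:

    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;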

    1.5 First-Day and Daily Loads

    First-day load:

    with
    sku as
    (
        select
            id,
            price,
            sku_name,
            sku_desc,
            weight,
            is_sale,
            spu_id,
            category3_id,
            tm_id,
            create_time
        from ods_sku_info
        where dt='2020-06-14'
    ),
    spu as
    (
        select
            id,
            spu_name
        from ods_spu_info
        where dt='2020-06-14'
    ),
    c3 as
    (
        select
            id,
            name,
            category2_id
        from ods_base_category3
        where dt='2020-06-14'
    ),
    c2 as
    (
        select
            id,
            name,
            category1_id
        from ods_base_category2
        where dt='2020-06-14'
    ),
    c1 as
    (
        select
            id,
            name
        from ods_base_category1
        where dt='2020-06-14'
    ),
    tm as
    (
        select
            id,
            tm_name
        from ods_base_trademark
        where dt='2020-06-14'
    ),
    attr as
    (
        select
            sku_id,
            collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
        from ods_sku_attr_value
        where dt='2020-06-14'
        group by sku_id
    ),
    sale_attr as
    (
        select
            sku_id,
            collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
        from ods_sku_sale_attr_value
        where dt='2020-06-14'
        group by sku_id
    )
    insert overwrite table dim_sku_info partition(dt='2020-06-14')
    select
        sku.id,
        sku.price,
        sku.sku_name,
        sku.sku_desc,
        sku.weight,
        sku.is_sale,
        sku.spu_id,
        spu.spu_name,
        sku.category3_id,
        c3.name,
        c3.category2_id,
        c2.name,
        c2.category1_id,
        c1.name,
        sku.tm_id,
        tm.tm_name,
        attr.attrs,
        sale_attr.sale_attrs,
        sku.create_time
    from sku
    left join spu on sku.spu_id=spu.id
    left join c3 on sku.category3_id=c3.id
    left join c2 on c3.category2_id=c2.id
    left join c1 on c2.category1_id=c1.id
    left join tm on sku.tm_id=tm.id
    left join attr on sku.id=attr.sku_id
    left join sale_attr on sku.id=sale_attr.sku_id;

    Daily load:

    with
    sku as
    (
        select
            id,
            price,
            sku_name,
            sku_desc,
            weight,
            is_sale,
            spu_id,
            category3_id,
            tm_id,
            create_time
        from ods_sku_info
        where dt='2020-06-15'
    ),
    spu as
    (
        select
            id,
            spu_name
        from ods_spu_info
        where dt='2020-06-15'
    ),
    c3 as
    (
        select
            id,
            name,
            category2_id
        from ods_base_category3
        where dt='2020-06-15'
    ),
    c2 as
    (
        select
            id,
            name,
            category1_id
        from ods_base_category2
        where dt='2020-06-15'
    ),
    c1 as
    (
        select
            id,
            name
        from ods_base_category1
        where dt='2020-06-15'
    ),
    tm as
    (
        select
            id,
            tm_name
        from ods_base_trademark
        where dt='2020-06-15'
    ),
    attr as
    (
        select
            sku_id,
            collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
        from ods_sku_attr_value
        where dt='2020-06-15'
        group by sku_id
    ),
    sale_attr as
    (
        select
            sku_id,
            collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
        from ods_sku_sale_attr_value
        where dt='2020-06-15'
        group by sku_id
    )
    insert overwrite table dim_sku_info partition(dt='2020-06-15')
    select
        sku.id,
        sku.price,
        sku.sku_name,
        sku.sku_desc,
        sku.weight,
        sku.is_sale,
        sku.spu_id,
        spu.spu_name,
        sku.category3_id,
        c3.name,
        c3.category2_id,
        c2.name,
        c2.category1_id,
        c1.name,
        sku.tm_id,
        tm.tm_name,
        attr.attrs,
        sale_attr.sale_attrs,
        sku.create_time
    from sku
    left join spu on sku.spu_id=spu.id
    left join c3 on sku.category3_id=c3.id
    left join c2 on c3.category2_id=c2.id
    left join c1 on c2.category1_id=c1.id
    left join tm on sku.tm_id=tm.id
    left join attr on sku.id=attr.sku_id
    left join sale_attr on sku.id=sale_attr.sku_id;

    2. Coupon Dimension Table (Full Sync)

    2.1 DDL

    DROP TABLE IF EXISTS dim_coupon_info;
    CREATE EXTERNAL TABLE dim_coupon_info(
        `id` STRING COMMENT 'coupon id',
        `coupon_name` STRING COMMENT 'coupon name',
        `coupon_type` STRING COMMENT 'coupon type: 1 cash voucher, 2 discount voucher, 3 spend-amount voucher, 4 spend-quantity discount voucher',
        `condition_amount` DECIMAL(16,2) COMMENT 'threshold amount',
        `condition_num` BIGINT COMMENT 'threshold quantity',
        `activity_id` STRING COMMENT 'activity id',
        `benefit_amount` DECIMAL(16,2) COMMENT 'amount off',
        `benefit_discount` DECIMAL(16,2) COMMENT 'discount rate',
        `create_time` STRING COMMENT 'create time',
        `range_type` STRING COMMENT 'scope type: 1 product, 2 category, 3 brand',
        `limit_num` BIGINT COMMENT 'max claims',
        `taken_count` BIGINT COMMENT 'claims so far',
        `start_time` STRING COMMENT 'claim window start date',
        `end_time` STRING COMMENT 'claim window end date',
        `operate_time` STRING COMMENT 'modified time',
        `expire_time` STRING COMMENT 'expiry time'
    ) COMMENT 'coupon dimension table'
    PARTITIONED BY (`dt` STRING)
    STORED AS PARQUET
    LOCATION '/warehouse/gmall/dim/dim_coupon_info/'
    TBLPROPERTIES ("parquet.compression"="lzo");

    2.2 Partition Planning

    2.3 Data Loading

    2.4 First-Day and Daily Loads

    First-day load:

    No join with any other table is needed here, so a plain select is enough. It may look like nothing happens, but the storage format still changes: the data goes from LZO-compressed files in the ODS layer to columnar, compressed storage in the DIM layer.

    insert overwrite table dim_coupon_info partition(dt='2020-06-14')
    select
        id,
        coupon_name,
        coupon_type,
        condition_amount,
        condition_num,
        activity_id,
        benefit_amount,
        benefit_discount,
        create_time,
        range_type,
        limit_num,
        taken_count,
        start_time,
        end_time,
        operate_time,
        expire_time
    from ods_coupon_info
    where dt='2020-06-14';

    Daily load:

    insert overwrite table dim_coupon_info partition(dt='2020-06-15')
    select
        id,
        coupon_name,
        coupon_type,
        condition_amount,
        condition_num,
        activity_id,
        benefit_amount,
        benefit_discount,
        create_time,
        range_type,
        limit_num,
        taken_count,
        start_time,
        end_time,
        operate_time,
        expire_time
    from ods_coupon_info
    where dt='2020-06-15';

    3. Activity Dimension Table (Full Sync)

    3.1 DDL

    DROP TABLE IF EXISTS dim_activity_rule_info;
    CREATE EXTERNAL TABLE dim_activity_rule_info(
        `activity_rule_id` STRING COMMENT 'activity rule id',
        `activity_id` STRING COMMENT 'activity id',
        `activity_name` STRING COMMENT 'activity name',
        `activity_type` STRING COMMENT 'activity type',
        `start_time` STRING COMMENT 'start time',
        `end_time` STRING COMMENT 'end time',
        `create_time` STRING COMMENT 'create time',
        `condition_amount` DECIMAL(16,2) COMMENT 'threshold amount',
        `condition_num` BIGINT COMMENT 'threshold quantity',
        `benefit_amount` DECIMAL(16,2) COMMENT 'benefit amount',
        `benefit_discount` DECIMAL(16,2) COMMENT 'benefit discount',
        `benefit_level` STRING COMMENT 'benefit level'
    ) COMMENT 'activity info table'
    PARTITIONED BY (`dt` STRING)
    STORED AS PARQUET
    LOCATION '/warehouse/gmall/dim/dim_activity_rule_info/'
    TBLPROPERTIES ("parquet.compression"="lzo");

    3.2 Partition Planning

    3.3 Data Loading

    3.4 First-Day and Daily Loads

    First-day load:

    insert overwrite table dim_activity_rule_info partition(dt='2020-06-14')
    select
        ar.id,
        ar.activity_id,
        ai.activity_name,
        ar.activity_type,
        ai.start_time,
        ai.end_time,
        ai.create_time,
        ar.condition_amount,
        ar.condition_num,
        ar.benefit_amount,
        ar.benefit_discount,
        ar.benefit_level
    from
    (
        select
            id,
            activity_id,
            activity_type,
            condition_amount,
            condition_num,
            benefit_amount,
            benefit_discount,
            benefit_level
        from ods_activity_rule
        where dt='2020-06-14'
    ) ar
    left join
    (
        select
            id,
            activity_name,
            start_time,
            end_time,
            create_time
        from ods_activity_info
        where dt='2020-06-14'
    ) ai
    on ar.activity_id=ai.id;

    Daily load:

    insert overwrite table dim_activity_rule_info partition(dt='2020-06-15')
    select
        ar.id,
        ar.activity_id,
        ai.activity_name,
        ar.activity_type,
        ai.start_time,
        ai.end_time,
        ai.create_time,
        ar.condition_amount,
        ar.condition_num,
        ar.benefit_amount,
        ar.benefit_discount,
        ar.benefit_level
    from
    (
        select
            id,
            activity_id,
            activity_type,
            condition_amount,
            condition_num,
            benefit_amount,
            benefit_discount,
            benefit_level
        from ods_activity_rule
        where dt='2020-06-15'
    ) ar
    left join
    (
        select
            id,
            activity_name,
            start_time,
            end_time,
            create_time
        from ods_activity_info
        where dt='2020-06-15'
    ) ai
    on ar.activity_id=ai.id;

    4. Region Dimension Table (Special)

    The region dimension rarely changes, so there is no need to synchronize it every day, and the table is therefore not partitioned.

    4.1 DDL

    DROP TABLE IF EXISTS dim_base_province;
    CREATE EXTERNAL TABLE dim_base_province (
        `id` STRING COMMENT 'id',
        `province_name` STRING COMMENT 'province name',
        `area_code` STRING COMMENT 'area code',
        `iso_code` STRING COMMENT 'ISO-3166 code, for visualization',
        `iso_3166_2` STRING COMMENT 'ISO-3166-2 code, for visualization',
        `region_id` STRING COMMENT 'region id',
        `region_name` STRING COMMENT 'region name'
    ) COMMENT 'region dimension table'
    STORED AS PARQUET
    LOCATION '/warehouse/gmall/dim/dim_base_province/'
    TBLPROPERTIES ("parquet.compression"="lzo");

    4.2 Data Loading

    Region dimension data is fairly stable and unlikely to change, so no daily load is needed.

    insert overwrite table dim_base_province
    select
        bp.id,
        bp.name,
        bp.area_code,
        bp.iso_code,
        bp.iso_3166_2,
        bp.region_id,
        br.region_name
    from ods_base_province bp
    join ods_base_region br on bp.region_id = br.id;

    5. Date Dimension Table (Special)

    5.1 DDL

    DROP TABLE IF EXISTS dim_date_info;
    CREATE EXTERNAL TABLE dim_date_info(
        `date_id` STRING COMMENT 'date',
        `week_id` STRING COMMENT 'week id',
        `week_day` STRING COMMENT 'day of week',
        `day` STRING COMMENT 'day of month',
        `month` STRING COMMENT 'month of year',
        `quarter` STRING COMMENT 'quarter',
        `year` STRING COMMENT 'year',
        `is_workday` STRING COMMENT 'workday flag',
        `holiday_id` STRING COMMENT 'holiday id'
    ) COMMENT 'date dimension table'
    STORED AS PARQUET
    LOCATION '/warehouse/gmall/dim/dim_date_info/'
    TBLPROPERTIES ("parquet.compression"="lzo");

    5.2 Data Loading

    The date dimension data does not usually come from the business system; it is written by hand. And because the data is entirely predictable, there is no need for a daily import; a whole year can typically be loaded in one go.

    The date data file is plain text, while the table stores Parquet with LZO compression, so loading the file directly into the table would produce unreadable data. Instead, create a temporary table without Parquet storage or LZO compression, load the text file into that table first, and then insert-select from the temporary table into the date dimension table.
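    A hedged sketch of that two-step load follows; the temporary table name, the tab delimiter, and the local file path /opt/module/db_log/date_info.txt are my assumptions, while the column list simply mirrors dim_date_info above:

    DROP TABLE IF EXISTS tmp_dim_date_info;
    CREATE EXTERNAL TABLE tmp_dim_date_info (
        `date_id` STRING,
        `week_id` STRING,
        `week_day` STRING,
        `day` STRING,
        `month` STRING,
        `quarter` STRING,
        `year` STRING,
        `is_workday` STRING,
        `holiday_id` STRING
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/tmp/tmp_dim_date_info/';

    -- step 1: load the plain-text date file into the temporary table
    load data local inpath '/opt/module/db_log/date_info.txt' into table tmp_dim_date_info;

    -- step 2: rewrite the rows into the Parquet-backed dimension table
    insert overwrite table dim_date_info
    select * from tmp_dim_date_info;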

    6. Zipper Table

    6.1 Zipper Table Overview

    A zipper table records the full history of slowly changing data. Each row carries a start_date and an end_date that bound the period during which that version of the record was valid; the current version of every record lives in the 9999-99-99 partition, and when a record changes, the superseded version is closed off by setting its end_date and moving it into the partition for that date.

    6.2 Building the Zipper Table

    DDL:

    DROP TABLE IF EXISTS dim_user_info;
    CREATE EXTERNAL TABLE dim_user_info(
        `id` STRING COMMENT 'user id',
        `login_name` STRING COMMENT 'login name',
        `nick_name` STRING COMMENT 'nickname',
        `name` STRING COMMENT 'real name',
        `phone_num` STRING COMMENT 'phone number',
        `email` STRING COMMENT 'email',
        `user_level` STRING COMMENT 'user level',
        `birthday` STRING COMMENT 'birthday',
        `gender` STRING COMMENT 'gender',
        `create_time` STRING COMMENT 'create time',
        `operate_time` STRING COMMENT 'operate time',
        `start_date` STRING COMMENT 'start date',
        `end_date` STRING COMMENT 'end date'
    ) COMMENT 'user table'
    PARTITIONED BY (`dt` STRING)
    STORED AS PARQUET
    LOCATION '/warehouse/gmall/dim/dim_user_info/'
    TBLPROPERTIES ("parquet.compression"="lzo");

    Partition planning:

    Data loading:

    First-day load:

    The first-day load of the zipper table is an initialization: all historical users up to the initialization day are imported into the zipper table in a single pass. The first partition of ods_user_info, the 2020-06-14 partition, already contains every historical user, so that partition only needs some light processing before being loaded into the 9999-99-99 partition of the zipper table.

    insert overwrite table dim_user_info partition(dt='9999-99-99')
    select
        id,
        login_name,
        nick_name,
        md5(name),        -- sensitive fields are masked with md5
        md5(phone_num),
        md5(email),
        user_level,
        birthday,
        gender,
        create_time,
        operate_time,
        '2020-06-14',     -- start_date
        '9999-99-99'      -- end_date: this version is still current
    from ods_user_info
    where dt='2020-06-14';

    Daily load SQL:

    with
    tmp as
    (
        select
            old.id old_id,
            old.login_name old_login_name,
            old.nick_name old_nick_name,
            old.name old_name,
            old.phone_num old_phone_num,
            old.email old_email,
            old.user_level old_user_level,
            old.birthday old_birthday,
            old.gender old_gender,
            old.create_time old_create_time,
            old.operate_time old_operate_time,
            old.start_date old_start_date,
            old.end_date old_end_date,
            new.id new_id,
            new.login_name new_login_name,
            new.nick_name new_nick_name,
            new.name new_name,
            new.phone_num new_phone_num,
            new.email new_email,
            new.user_level new_user_level,
            new.birthday new_birthday,
            new.gender new_gender,
            new.create_time new_create_time,
            new.operate_time new_operate_time,
            new.start_date new_start_date,
            new.end_date new_end_date
        from
        (
            -- current versions already in the zipper table
            select
                id,
                login_name,
                nick_name,
                name,
                phone_num,
                email,
                user_level,
                birthday,
                gender,
                create_time,
                operate_time,
                start_date,
                end_date
            from dim_user_info
            where dt='9999-99-99'
        ) old
        full outer join
        (
            -- today's new and changed users from ODS
            select
                id,
                login_name,
                nick_name,
                md5(name) name,
                md5(phone_num) phone_num,
                md5(email) email,
                user_level,
                birthday,
                gender,
                create_time,
                operate_time,
                '2020-06-15' start_date,
                '9999-99-99' end_date
            from ods_user_info
            where dt='2020-06-15'
        ) new
        on old.id=new.id
    )
    insert overwrite table dim_user_info partition(dt)
    -- latest version of every user: prefer today's record, fall back to the existing one
    select
        nvl(new_id,old_id),
        nvl(new_login_name,old_login_name),
        nvl(new_nick_name,old_nick_name),
        nvl(new_name,old_name),
        nvl(new_phone_num,old_phone_num),
        nvl(new_email,old_email),
        nvl(new_user_level,old_user_level),
        nvl(new_birthday,old_birthday),
        nvl(new_gender,old_gender),
        nvl(new_create_time,old_create_time),
        nvl(new_operate_time,old_operate_time),
        nvl(new_start_date,old_start_date),
        nvl(new_end_date,old_end_date),
        nvl(new_end_date,old_end_date) dt
    from tmp
    union all
    -- users changed today: close out the old version into yesterday's partition
    select
        old_id,
        old_login_name,
        old_nick_name,
        old_name,
        old_phone_num,
        old_email,
        old_user_level,
        old_birthday,
        old_gender,
        old_create_time,
        old_operate_time,
        old_start_date,
        cast(date_add('2020-06-15',-1) as string),
        cast(date_add('2020-06-15',-1) as string) dt
    from tmp
    where new_id is not null and old_id is not null;
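    To make the full outer join plus union all concrete, here is a small worked example (the data is invented, and it assumes that each daily ods_user_info partition holds only new and changed users). Before the 2020-06-15 load, the 9999-99-99 partition holds:

    id=1  user_level=1  start_date=2020-06-14  end_date=9999-99-99
    id=2  user_level=1  start_date=2020-06-14  end_date=9999-99-99

    Suppose user 1 upgrades to level 2 on 2020-06-15 and user 3 registers that day. After the load:

    dt=9999-99-99:  id=1  level=2  start=2020-06-15  end=9999-99-99   (changed: new current version)
                    id=2  level=1  start=2020-06-14  end=9999-99-99   (unchanged: carried over)
                    id=3  level=1  start=2020-06-15  end=9999-99-99   (brand-new user)
    dt=2020-06-14:  id=1  level=1  start=2020-06-14  end=2020-06-14   (old version closed out)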

    7. DIM Layer First-Day Data Load Script

    #!/bin/bash

    APP=gmall

    # the first-day load requires an explicit date argument
    if [ -n "$2" ]; then
        do_date=$2
    else
        echo "Please pass a date argument"
        exit
    fi

    dim_user_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_user_info partition(dt='9999-99-99')
    select
        id,
        login_name,
        nick_name,
        md5(name),
        md5(phone_num),
        md5(email),
        user_level,
        birthday,
        gender,
        create_time,
        operate_time,
        '$do_date',
        '9999-99-99'
    from ${APP}.ods_user_info
    where dt='$do_date';
    "

    dim_sku_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    with
    sku as
    (
        select
            id,
            price,
            sku_name,
            sku_desc,
            weight,
            is_sale,
            spu_id,
            category3_id,
            tm_id,
            create_time
        from ${APP}.ods_sku_info
        where dt='$do_date'
    ),
    spu as
    (
        select
            id,
            spu_name
        from ${APP}.ods_spu_info
        where dt='$do_date'
    ),
    c3 as
    (
        select
            id,
            name,
            category2_id
        from ${APP}.ods_base_category3
        where dt='$do_date'
    ),
    c2 as
    (
        select
            id,
            name,
            category1_id
        from ${APP}.ods_base_category2
        where dt='$do_date'
    ),
    c1 as
    (
        select
            id,
            name
        from ${APP}.ods_base_category1
        where dt='$do_date'
    ),
    tm as
    (
        select
            id,
            tm_name
        from ${APP}.ods_base_trademark
        where dt='$do_date'
    ),
    attr as
    (
        select
            sku_id,
            collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
        from ${APP}.ods_sku_attr_value
        where dt='$do_date'
        group by sku_id
    ),
    sale_attr as
    (
        select
            sku_id,
            collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
        from ${APP}.ods_sku_sale_attr_value
        where dt='$do_date'
        group by sku_id
    )
    insert overwrite table ${APP}.dim_sku_info partition(dt='$do_date')
    select
        sku.id,
        sku.price,
        sku.sku_name,
        sku.sku_desc,
        sku.weight,
        sku.is_sale,
        sku.spu_id,
        spu.spu_name,
        sku.category3_id,
        c3.name,
        c3.category2_id,
        c2.name,
        c2.category1_id,
        c1.name,
        sku.tm_id,
        tm.tm_name,
        attr.attrs,
        sale_attr.sale_attrs,
        sku.create_time
    from sku
    left join spu on sku.spu_id=spu.id
    left join c3 on sku.category3_id=c3.id
    left join c2 on c3.category2_id=c2.id
    left join c1 on c2.category1_id=c1.id
    left join tm on sku.tm_id=tm.id
    left join attr on sku.id=attr.sku_id
    left join sale_attr on sku.id=sale_attr.sku_id;
    "

    dim_base_province="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_base_province
    select
        bp.id,
        bp.name,
        bp.area_code,
        bp.iso_code,
        bp.iso_3166_2,
        bp.region_id,
        br.region_name
    from ${APP}.ods_base_province bp
    join ${APP}.ods_base_region br on bp.region_id = br.id;
    "

    dim_coupon_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_coupon_info partition(dt='$do_date')
    select
        id,
        coupon_name,
        coupon_type,
        condition_amount,
        condition_num,
        activity_id,
        benefit_amount,
        benefit_discount,
        create_time,
        range_type,
        limit_num,
        taken_count,
        start_time,
        end_time,
        operate_time,
        expire_time
    from ${APP}.ods_coupon_info
    where dt='$do_date';
    "

    dim_activity_rule_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_activity_rule_info partition(dt='$do_date')
    select
        ar.id,
        ar.activity_id,
        ai.activity_name,
        ar.activity_type,
        ai.start_time,
        ai.end_time,
        ai.create_time,
        ar.condition_amount,
        ar.condition_num,
        ar.benefit_amount,
        ar.benefit_discount,
        ar.benefit_level
    from
    (
        select
            id,
            activity_id,
            activity_type,
            condition_amount,
            condition_num,
            benefit_amount,
            benefit_discount,
            benefit_level
        from ${APP}.ods_activity_rule
        where dt='$do_date'
    ) ar
    left join
    (
        select
            id,
            activity_name,
            start_time,
            end_time,
            create_time
        from ${APP}.ods_activity_info
        where dt='$do_date'
    ) ai
    on ar.activity_id=ai.id;
    "

    case $1 in
    "dim_user_info"){
        hive -e "$dim_user_info"
    };;
    "dim_sku_info"){
        hive -e "$dim_sku_info"
    };;
    "dim_base_province"){
        hive -e "$dim_base_province"
    };;
    "dim_coupon_info"){
        hive -e "$dim_coupon_info"
    };;
    "dim_activity_rule_info"){
        hive -e "$dim_activity_rule_info"
    };;
    "all"){
        hive -e "$dim_user_info$dim_sku_info$dim_coupon_info$dim_activity_rule_info$dim_base_province"
    };;
    esac

    Note: this script does not load the date dimension table, which must be loaded manually.
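    Assuming the script is saved as ods_to_dim_db_init.sh (the name is my own; the post does not give one), the first argument picks a table (or all) and the second is the mandatory first-day date:

    chmod +x ods_to_dim_db_init.sh
    ./ods_to_dim_db_init.sh all 2020-06-14            # initialize every DIM table
    ./ods_to_dim_db_init.sh dim_sku_info 2020-06-14   # or just one table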

    8. DIM Layer Daily Data Load Script

    #!/bin/bash

    APP=gmall

    # if a date argument is given, use it; otherwise default to the day before today
    if [ -n "$2" ]; then
        do_date=$2
    else
        do_date=`date -d "-1 day" +%F`
    fi

    dim_user_info="
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    with
    tmp as
    (
        select
            old.id old_id,
            old.login_name old_login_name,
            old.nick_name old_nick_name,
            old.name old_name,
            old.phone_num old_phone_num,
            old.email old_email,
            old.user_level old_user_level,
            old.birthday old_birthday,
            old.gender old_gender,
            old.create_time old_create_time,
            old.operate_time old_operate_time,
            old.start_date old_start_date,
            old.end_date old_end_date,
            new.id new_id,
            new.login_name new_login_name,
            new.nick_name new_nick_name,
            new.name new_name,
            new.phone_num new_phone_num,
            new.email new_email,
            new.user_level new_user_level,
            new.birthday new_birthday,
            new.gender new_gender,
            new.create_time new_create_time,
            new.operate_time new_operate_time,
            new.start_date new_start_date,
            new.end_date new_end_date
        from
        (
            select
                id,
                login_name,
                nick_name,
                name,
                phone_num,
                email,
                user_level,
                birthday,
                gender,
                create_time,
                operate_time,
                start_date,
                end_date
            from ${APP}.dim_user_info
            where dt='9999-99-99'
            and start_date<'$do_date'
        ) old
        full outer join
        (
            select
                id,
                login_name,
                nick_name,
                md5(name) name,
                md5(phone_num) phone_num,
                md5(email) email,
                user_level,
                birthday,
                gender,
                create_time,
                operate_time,
                '$do_date' start_date,
                '9999-99-99' end_date
            from ${APP}.ods_user_info
            where dt='$do_date'
        ) new
        on old.id=new.id
    )
    insert overwrite table ${APP}.dim_user_info partition(dt)
    select
        nvl(new_id,old_id),
        nvl(new_login_name,old_login_name),
        nvl(new_nick_name,old_nick_name),
        nvl(new_name,old_name),
        nvl(new_phone_num,old_phone_num),
        nvl(new_email,old_email),
        nvl(new_user_level,old_user_level),
        nvl(new_birthday,old_birthday),
        nvl(new_gender,old_gender),
        nvl(new_create_time,old_create_time),
        nvl(new_operate_time,old_operate_time),
        nvl(new_start_date,old_start_date),
        nvl(new_end_date,old_end_date),
        nvl(new_end_date,old_end_date) dt
    from tmp
    union all
    select
        old_id,
        old_login_name,
        old_nick_name,
        old_name,
        old_phone_num,
        old_email,
        old_user_level,
        old_birthday,
        old_gender,
        old_create_time,
        old_operate_time,
        old_start_date,
        cast(date_add('$do_date',-1) as string),
        cast(date_add('$do_date',-1) as string) dt
    from tmp
    where new_id is not null and old_id is not null;
    "

    dim_sku_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    with
    sku as
    (
        select
            id,
            price,
            sku_name,
            sku_desc,
            weight,
            is_sale,
            spu_id,
            category3_id,
            tm_id,
            create_time
        from ${APP}.ods_sku_info
        where dt='$do_date'
    ),
    spu as
    (
        select
            id,
            spu_name
        from ${APP}.ods_spu_info
        where dt='$do_date'
    ),
    c3 as
    (
        select
            id,
            name,
            category2_id
        from ${APP}.ods_base_category3
        where dt='$do_date'
    ),
    c2 as
    (
        select
            id,
            name,
            category1_id
        from ${APP}.ods_base_category2
        where dt='$do_date'
    ),
    c1 as
    (
        select
            id,
            name
        from ${APP}.ods_base_category1
        where dt='$do_date'
    ),
    tm as
    (
        select
            id,
            tm_name
        from ${APP}.ods_base_trademark
        where dt='$do_date'
    ),
    attr as
    (
        select
            sku_id,
            collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
        from ${APP}.ods_sku_attr_value
        where dt='$do_date'
        group by sku_id
    ),
    sale_attr as
    (
        select
            sku_id,
            collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
        from ${APP}.ods_sku_sale_attr_value
        where dt='$do_date'
        group by sku_id
    )
    insert overwrite table ${APP}.dim_sku_info partition(dt='$do_date')
    select
        sku.id,
        sku.price,
        sku.sku_name,
        sku.sku_desc,
        sku.weight,
        sku.is_sale,
        sku.spu_id,
        spu.spu_name,
        sku.category3_id,
        c3.name,
        c3.category2_id,
        c2.name,
        c2.category1_id,
        c1.name,
        sku.tm_id,
        tm.tm_name,
        attr.attrs,
        sale_attr.sale_attrs,
        sku.create_time
    from sku
    left join spu on sku.spu_id=spu.id
    left join c3 on sku.category3_id=c3.id
    left join c2 on c3.category2_id=c2.id
    left join c1 on c2.category1_id=c1.id
    left join tm on sku.tm_id=tm.id
    left join attr on sku.id=attr.sku_id
    left join sale_attr on sku.id=sale_attr.sku_id;
    "

    dim_base_province="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_base_province
    select
        bp.id,
        bp.name,
        bp.area_code,
        bp.iso_code,
        bp.iso_3166_2,
        bp.region_id,
        br.region_name
    from ${APP}.ods_base_province bp
    join ${APP}.ods_base_region br on bp.region_id = br.id;
    "

    dim_coupon_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_coupon_info partition(dt='$do_date')
    select
        id,
        coupon_name,
        coupon_type,
        condition_amount,
        condition_num,
        activity_id,
        benefit_amount,
        benefit_discount,
        create_time,
        range_type,
        limit_num,
        taken_count,
        start_time,
        end_time,
        operate_time,
        expire_time
    from ${APP}.ods_coupon_info
    where dt='$do_date';
    "

    dim_activity_rule_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_activity_rule_info partition(dt='$do_date')
    select
        ar.id,
        ar.activity_id,
        ai.activity_name,
        ar.activity_type,
        ai.start_time,
        ai.end_time,
        ai.create_time,
        ar.condition_amount,
        ar.condition_num,
        ar.benefit_amount,
        ar.benefit_discount,
        ar.benefit_level
    from
    (
        select
            id,
            activity_id,
            activity_type,
            condition_amount,
            condition_num,
            benefit_amount,
            benefit_discount,
            benefit_level
        from ${APP}.ods_activity_rule
        where dt='$do_date'
    ) ar
    left join
    (
        select
            id,
            activity_name,
            start_time,
            end_time,
            create_time
        from ${APP}.ods_activity_info
        where dt='$do_date'
    ) ai
    on ar.activity_id=ai.id;
    "

    case $1 in
    "dim_user_info"){
        hive -e "$dim_user_info"
    };;
    "dim_sku_info"){
        hive -e "$dim_sku_info"
    };;
    "dim_base_province"){
        hive -e "$dim_base_province"
    };;
    "dim_coupon_info"){
        hive -e "$dim_coupon_info"
    };;
    "dim_activity_rule_info"){
        hive -e "$dim_activity_rule_info"
    };;
    "all"){
        hive -e "$dim_user_info$dim_sku_info$dim_coupon_info$dim_activity_rule_info"
    };;
    esac
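    Assuming the daily script is saved as ods_to_dim_db.sh (again, an illustrative name), the date argument is optional here, which makes the script suitable for a daily scheduler:

    ./ods_to_dim_db.sh all 2020-06-15   # backfill a specific day
    ./ods_to_dim_db.sh all              # no date: defaults to yesterday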

• Original post: https://blog.csdn.net/qq_64557330/article/details/126613939