• 数仓4.0 (Data Warehouse 4.0)


    User dimension table (zipper table)

    1) What is a zipper table?

    A zipper table records the lifecycle of each piece of information: as soon as a record's lifecycle ends, a new record is started and the current date is written into its effective start date.

    If the information is still valid today, its effective end date is filled with a sentinel maximum value, e.g. 9999-99-99.
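
    For example, with made-up illustrative values, a user whose user_level changes on 2020-06-15 ends up as two rows in the zipper table:

    id      user_level    start_date    end_date
    101     1             2020-06-10    2020-06-14
    101     2             2020-06-15    9999-99-99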

    2) Why build a zipper table?

    A zipper table suits dimensions whose data does change, but not frequently (i.e. slowly changing dimensions).

    For example, user information changes, but only a small fraction of users change on any given day. Once the data reaches a certain scale, keeping a full snapshot every day is very inefficient, e.g. 100 million users × 365 days means one complete copy of the user data per day.

    3) How to use a zipper table

    Filtering with effective start date <= some date AND effective end date >= some date yields a full snapshot of the data as of that date.
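
    For instance, an illustrative query against the gmall.dim_user_info zipper table built below returns the full user snapshot as of 2020-06-14:

    select *
    from gmall.dim_user_info
    where start_date <= '2020-06-14'
      and end_date   >= '2020-06-14';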

    4) How a zipper table is built

     

    My understanding is that each day's incoming data is compared against the existing records to find the small number of users that changed, and the result is then merged back into the table, as sketched below.
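
    A minimal sketch of that daily merge, reduced to only id and login_name and with the load date hard-coded to 2020-06-15 (the daily load script below does the same thing for every column, parameterizes the date, and writes the result back with dynamic partitioning):

    with
    old as (  -- records that are still open going into today
    select id, login_name, start_date, end_date
    from gmall.dim_user_info
    where dt = '9999-99-99'
    ),
    new as (  -- today's full extract from ODS
    select id, login_name, '2020-06-15' start_date, '9999-99-99' end_date
    from gmall.ods_user_info
    where dt = '2020-06-15'
    )
    -- the latest version of every user stays open (end_date = 9999-99-99)
    select
    nvl(new.id, old.id) id,
    nvl(new.login_name, old.login_name) login_name,
    nvl(new.start_date, old.start_date) start_date,
    nvl(new.end_date, old.end_date) end_date
    from old full outer join new on old.id = new.id
    union all
    -- previous versions of users that re-appeared today are closed out as of yesterday
    select
    old.id, old.login_name, old.start_date,
    cast(date_add('2020-06-15', -1) as string) end_date
    from old join new on old.id = new.id;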

    DIM layer first-day data load script

    1) Create the script ods_to_dim_db_init.sh under /home/atguigu/bin and fill it with the following content

    #!/bin/bash
    # DIM layer first-day load; usage: ods_to_dim_db_init.sh <dim table name | all> <date>
    APP=gmall
    # the first-day load requires an explicit date argument
    if [ -n "$2" ] ;then
    do_date=$2
    else
    echo "Please pass in a date parameter"
    exit
    fi
    # dim_user_info: load all historical users into the open 9999-99-99 partition;
    # sensitive fields (name/phone_num/email) are hashed with md5
    dim_user_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_user_info partition(dt='9999-99-99')
    select
    id,
    login_name,
    nick_name,
    md5(name),
    md5(phone_num),
    md5(email),
    user_level,
    birthday,
    gender,
    create_time,
    operate_time,
    '$do_date',
    '9999-99-99'
    from ${APP}.ods_user_info
    where dt='$do_date';
    "
    dim_sku_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    with
    sku as
    (
    select
    id,
    price,
    sku_name,
    sku_desc,
    weight,
    is_sale,
    spu_id,
    category3_id,
    tm_id,
    create_time
    from ${APP}.ods_sku_info
    where dt='$do_date'
    ),
    spu as
    (
    select
    id,
    spu_name
    from ${APP}.ods_spu_info
    where dt='$do_date'
    ),
    c3 as
    (
    select
    id,
    name,
    category2_id
    from ${APP}.ods_base_category3
    where dt='$do_date'
    ),
    c2 as
    (
    select
    id,
    name,
    category1_id
    from ${APP}.ods_base_category2
    where dt='$do_date'
    ),
    c1 as
    (
    select
    id,
    name
    from ${APP}.ods_base_category1
    where dt='$do_date'
    ),
    tm as
    (
    select
    id,
    tm_name
    from ${APP}.ods_base_trademark
    where dt='$do_date'
    ),
    attr as
    (
    select
    sku_id,
    collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
    from ${APP}.ods_sku_attr_value
    where dt='$do_date'
    group by sku_id
    ),
    sale_attr as
    (
    select
    sku_id,
    collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
    from ${APP}.ods_sku_sale_attr_value
    where dt='$do_date'
    group by sku_id
    )
    insert overwrite table ${APP}.dim_sku_info partition(dt='$do_date')
    select
    sku.id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.is_sale,
    sku.spu_id,
    spu.spu_name,
    sku.category3_id,
    c3.name,
    c3.category2_id,
    c2.name,
    c2.category1_id,
    c1.name,
    sku.tm_id,
    tm.tm_name,
    attr.attrs,
    sale_attr.sale_attrs,
    sku.create_time
    from sku
    left join spu on sku.spu_id=spu.id
    left join c3 on sku.category3_id=c3.id
    left join c2 on c3.category2_id=c2.id
    left join c1 on c2.category1_id=c1.id
    left join tm on sku.tm_id=tm.id
    left join attr on sku.id=attr.sku_id
    left join sale_attr on sku.id=sale_attr.sku_id;
    "
    dim_base_province="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_base_province
    select
    bp.id,
    bp.name,
    bp.area_code,
    bp.iso_code,
    bp.iso_3166_2,
    bp.region_id,
    br.region_name
    from ${APP}.ods_base_province bp
    join ${APP}.ods_base_region br on bp.region_id = br.id;
    "
    dim_coupon_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_coupon_info partition(dt='$do_date')
    select
    id,
    coupon_name,
    coupon_type,
    condition_amount,
    condition_num,
    activity_id,
    benefit_amount,
    benefit_discount,
    create_time,
    range_type,
    limit_num,
    taken_count,
    start_time,
    end_time,
    operate_time,
    expire_time
    from ${APP}.ods_coupon_info
    where dt='$do_date';
    "
    dim_activity_rule_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_activity_rule_info partition(dt='$do_date')
    select
    ar.id,
    ar.activity_id,
    ai.activity_name,
    ar.activity_type,
    ai.start_time,
    ai.end_time,
    ai.create_time,
    ar.condition_amount,
    ar.condition_num,
    ar.benefit_amount,
    ar.benefit_discount,
    ar.benefit_level
    from
    (
    select
    id,
    activity_id,
    activity_type,
    condition_amount,
    condition_num,
    benefit_amount,
    benefit_discount,
    benefit_level
    from ${APP}.ods_activity_rule
    where dt='$do_date'
    )ar
    left join
    (
    select
    id,
    activity_name,
    start_time,
    end_time,
    create_time
    from ${APP}.ods_activity_info
    where dt='$do_date'
    )ai
    on ar.activity_id=ai.id;
    "
    case $1 in
    "dim_user_info"){
    hive -e "$dim_user_info"
    };;
    "dim_sku_info"){
    hive -e "$dim_sku_info"
    };;
    "dim_base_province"){
    hive -e "$dim_base_province"
    };;
    "dim_coupon_info"){
    hive -e "$dim_coupon_info"
    };;
    "dim_activity_rule_info"){
    hive -e "$dim_activity_rule_info"
    };;
    "all"){
    hive -e "$dim_user_info$dim_sku_info$dim_coupon_info$dim_activity_rule_info$dim_base_province"
    };;
    esac

    2) Add execute permission and run

    [atguigu@hadoop102 bin]$ chmod +x ods_to_dim_db_init.sh
    [atguigu@hadoop102 bin]$ ods_to_dim_db_init.sh all 2020-06-14
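
    As an optional sanity check (not part of the course scripts), the open partition should now contain one row per historical user:

    [atguigu@hadoop102 bin]$ hive -e "select count(*) from gmall.dim_user_info where dt='9999-99-99';"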

    DIM layer daily data load script

    1) Create the script ods_to_dim_db.sh under /home/atguigu/bin

    #!/bin/bash
    # DIM layer daily load; usage: ods_to_dim_db.sh <dim table name | all> [date]
    APP=gmall
    # If a date argument is given, use it; otherwise default to the day before today
    if [ -n "$2" ] ;then
    do_date=$2
    else
    do_date=`date -d "-1 day" +%F`
    fi
    # dim_user_info zipper merge: full-outer-join the currently open (dt='9999-99-99') records with
    # today's ODS data; the latest version of each user stays in the 9999-99-99 partition, while
    # superseded versions are closed out into the (do_date - 1) partition
    dim_user_info="
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    with
    tmp as
    (
    select
    old.id old_id,
    old.login_name old_login_name,
    old.nick_name old_nick_name,
    old.name old_name,
    old.phone_num old_phone_num,
    old.email old_email,
    old.user_level old_user_level,
    old.birthday old_birthday,
    old.gender old_gender,
    old.create_time old_create_time,
    old.operate_time old_operate_time,
    old.start_date old_start_date,
    old.end_date old_end_date,
    new.id new_id,
    new.login_name new_login_name,
    new.nick_name new_nick_name,
    new.name new_name,
    new.phone_num new_phone_num,
    new.email new_email,
    new.user_level new_user_level,
    new.birthday new_birthday,
    new.gender new_gender,
    new.create_time new_create_time,
    new.operate_time new_operate_time,
    new.start_date new_start_date,
    new.end_date new_end_date
    from
    (
    select
    id,
    login_name,
    nick_name,
    name,
    phone_num,
    email,
    user_level,
    birthday,
    gender,
    create_time,
    operate_time,
    start_date,
    end_date
    from ${APP}.dim_user_info
    where dt='9999-99-99'
    and start_date<'$do_date'
    )old
    full outer join
    (
    select
    id,
    login_name,
    nick_name,
    md5(name) name,
    md5(phone_num) phone_num,
    md5(email) email,
    user_level,
    birthday,
    gender,
    create_time,
    operate_time,
    '$do_date' start_date,
    '9999-99-99' end_date
    from ${APP}.ods_user_info
    where dt='$do_date'
    )new
    on old.id=new.id
    )
    insert overwrite table ${APP}.dim_user_info partition(dt)
    select
    nvl(new_id,old_id),
    nvl(new_login_name,old_login_name),
    nvl(new_nick_name,old_nick_name),
    nvl(new_name,old_name),
    nvl(new_phone_num,old_phone_num),
    nvl(new_email,old_email),
    nvl(new_user_level,old_user_level),
    nvl(new_birthday,old_birthday),
    nvl(new_gender,old_gender),
    nvl(new_create_time,old_create_time),
    nvl(new_operate_time,old_operate_time),
    nvl(new_start_date,old_start_date),
    nvl(new_end_date,old_end_date),
    nvl(new_end_date,old_end_date) dt
    from tmp
    union all
    select
    old_id,
    old_login_name,
    old_nick_name,
    old_name,
    old_phone_num,
    old_email,
    old_user_level,
    old_birthday,
    old_gender,
    old_create_time,
    old_operate_time,
    old_start_date,
    cast(date_add('$do_date',-1) as string),
    cast(date_add('$do_date',-1) as string) dt
    from tmp
    where new_id is not null and old_id is not null;
    "
    dim_sku_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    with
    sku as
    (
    select
    id,
    price,
    sku_name,
    sku_desc,
    weight,
    is_sale,
    spu_id,
    category3_id,
    tm_id,
    create_time
    from ${APP}.ods_sku_info
    where dt='$do_date'
    ),
    spu as
    (
    select
    id,
    spu_name
    from ${APP}.ods_spu_info
    where dt='$do_date'
    ),
    c3 as
    (
    select
    id,
    name,
    category2_id
    from ${APP}.ods_base_category3
    where dt='$do_date'
    ),
    c2 as
    (
    select
    id,
    name,
    category1_id
    from ${APP}.ods_base_category2
    where dt='$do_date'
    ),
    c1 as
    (
    select
    id,
    name
    from ${APP}.ods_base_category1
    where dt='$do_date'
    ),
    tm as
    (
    select
    id,
    tm_name
    from ${APP}.ods_base_trademark
    where dt='$do_date'
    ),
    attr as
    (
    select
    sku_id,
    collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
    from ${APP}.ods_sku_attr_value
    where dt='$do_date'
    group by sku_id
    ),
    sale_attr as
    (
    select
    sku_id,
    collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
    from ${APP}.ods_sku_sale_attr_value
    where dt='$do_date'
    group by sku_id
    )
    insert overwrite table ${APP}.dim_sku_info partition(dt='$do_date')
    select
    sku.id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.is_sale,
    sku.spu_id,
    spu.spu_name,
    sku.category3_id,
    c3.name,
    c3.category2_id,
    c2.name,
    c2.category1_id,
    c1.name,
    sku.tm_id,
    tm.tm_name,
    attr.attrs,
    sale_attr.sale_attrs,
    sku.create_time
    from sku
    left join spu on sku.spu_id=spu.id
    left join c3 on sku.category3_id=c3.id
    left join c2 on c3.category2_id=c2.id
    left join c1 on c2.category1_id=c1.id
    left join tm on sku.tm_id=tm.id
    left join attr on sku.id=attr.sku_id
    left join sale_attr on sku.id=sale_attr.sku_id;
    "
    dim_base_province="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_base_province
    select
    bp.id,
    bp.name,
    bp.area_code,
    bp.iso_code,
    bp.iso_3166_2,
    bp.region_id,
    br.region_name
    from ${APP}.ods_base_province bp
    join ${APP}.ods_base_region br on bp.region_id = br.id;
    "
    dim_coupon_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_coupon_info partition(dt='$do_date')
    select
    id,
    coupon_name,
    coupon_type,
    condition_amount,
    condition_num,
    activity_id,
    benefit_amount,
    benefit_discount,
    create_time,
    range_type,
    limit_num,
    taken_count,
    start_time,
    end_time,
    operate_time,
    expire_time
    from ${APP}.ods_coupon_info
    where dt='$do_date';
    "
    dim_activity_rule_info="
    set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
    insert overwrite table ${APP}.dim_activity_rule_info partition(dt='$do_date')
    select
    ar.id,
    ar.activity_id,
    ai.activity_name,
    ar.activity_type,
    ai.start_time,
    ai.end_time,
    ai.create_time,
    ar.condition_amount,
    ar.condition_num,
    ar.benefit_amount,
    ar.benefit_discount,
    ar.benefit_level
    from
    (
    select
    id,
    activity_id,
    activity_type,
    condition_amount,
    condition_num,
    benefit_amount,
    benefit_discount,
    benefit_level
    from ${APP}.ods_activity_rule
    where dt='$do_date'
    )ar
    left join
    (
    select
    id,
    activity_name,
    start_time,
    end_time,
    create_time
    from ${APP}.ods_activity_info
    where dt='$do_date'
    )ai
    on ar.activity_id=ai.id;
    "
    case $1 in
    "dim_user_info"){
    hive -e "$dim_user_info"
    };;
    "dim_sku_info"){
    hive -e "$dim_sku_info"
    };;
    "dim_base_province"){
    hive -e "$dim_base_province"
    };;
    "dim_coupon_info"){
    hive -e "$dim_coupon_info"
    };;
    "dim_activity_rule_info"){
    hive -e "$dim_activity_rule_info"
    };;
    "all"){
    hive -e "$dim_user_info$dim_sku_info$dim_coupon_info$dim_activity_rule_info"
    };;
    esac

    2) Add execute permission and run the script

    [atguigu@hadoop102 bin]$ chmod +x ods_to_dim_db.sh
    [atguigu@hadoop102 bin]$ ods_to_dim_db.sh all 2020-06-14
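
    As an optional check after the daily run (again, not part of the course scripts), the open 9999-99-99 partition should still hold exactly one row per user, while closed-out versions accumulate in dated partitions:

    [atguigu@hadoop102 bin]$ hive -e "select dt, count(*) from gmall.dim_user_info group by dt;"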

  • Original article: https://blog.csdn.net/weixin_63816398/article/details/126455333