数据安全管理:本 shell 脚本对 Hive 源表的敏感字段进行 MD5 加密,然后写入新表。
通过 read -p 交互式输入要脱敏的 Hive 表和分区,示例:test_db.table_name 20240331
生成更新 Hive 分区表的 HQL:insert overwrite table xxx
备注:仅供参考,可对本 shell 代码做进一步修改调整,以满足自己的需求。
#!/bin/bash
# 2024.04.19
# Interactively ask for the source Hive table ("<db>.<table>") and the
# partition value to mask. -r keeps backslashes in the input literal.
read -r -p "请按示例格式输入要脱敏的hive表、分区,示例: test_db.table_name 20240331 ### " table etl_date

# Split "<db>.<table>" on '.' into dbname / tbl without spawning
# echo|awk subshells; extra dotted segments are discarded like awk's $1/$2.
IFS=. read -r dbname tbl _ <<<"$table"
-
target_db=import_db

# Sensitive columns that must be masked; maintain this list by hand.
sensitive_fields=('name' 'phone' 'iden' 'tel')

#######################################
# Build the md5-masked select list.
# Reads one column name per line on stdin; sensitive columns become
# "md5(col) as col", the rest pass through unchanged, joined by commas.
# Outputs: the select list on stdout.
#######################################
build_md5_fields() {
  local col s masked out=""
  while IFS= read -r col; do
    [ -n "$col" ] || continue
    masked=0
    for s in "${sensitive_fields[@]}"; do
      if [ "$col" = "$s" ]; then
        masked=1
        break
      fi
    done
    if [ "$masked" -eq 1 ]; then
      out="$out,md5($col) as $col"
    else
      out="$out,$col"
    fi
  done
  # Drop the leading comma left by the first append.
  printf '%s' "${out#,}"
}

#######################################
# Count the partition columns in a `desc` dump file ($1): everything from
# the first '#' marker line onward that is not itself a '#' line.
# Outputs: the count (0 for an unpartitioned table).
#######################################
count_partition_fields() {
  sed -n '/#/,$p' "$1" | grep -c '^[^#]'
}

# First make sure the target table does not already exist
# (test the command directly instead of inspecting $? afterwards).
if hdfs dfs -test -e "/user/hive/warehouse/${target_db}.db/$tbl"; then
  echo "${target_db}.$tbl 目标库已经存在...,请确认表结构是否与源表一致"
else

  # 1. Create the target table with the same layout as the source.
  echo "1、 create table ${target_db}.$tbl like $table;"
  beeline -e "create table ${target_db}.$tbl like $table;"

  # 2. Dump every column name of the source table for later use.
  beeline --showHeader=false --outputformat=tsv2 -e "desc $table" | awk '{print $1}' >desc_field.out
  echo "desc $table .... beeline ok"

  # A partitioned table's desc dump apparently carries a NULL marker line
  # before the partition section (NOTE(review): confirm against your
  # beeline output) — strip from the first NULL line onwards. Presence
  # test (grep -q) instead of the old brittle "exactly one NULL line".
  if grep -q NULL desc_field.out; then
    sed '/NULL/,$d' desc_field.out >desc_field_final.out
  else
    cp desc_field.out desc_field_final.out
  fi

  # Match every table column against the sensitive list and build the
  # select list (replaces the old nested loop with flag/sed bookkeeping).
  md5_fields=$(build_md5_fields <desc_field_final.out)
  echo "$md5_fields"

  # Partition values to refresh — a single value from the prompt.
  # To refresh several partitions, set in_values to a quoted
  # comma-separated list, e.g. "'20240131','20240228','20240331'".
  in_values="'$etl_date'"
  echo "in_values=$in_values"

  # 3. Count the partition columns (fixes the old "var= `cmd`" bug: the
  # space after '=' assigned nothing and executed the output instead).
  partition_field_num=$(count_partition_fields desc_field.out)
  echo "partition_field_num=$partition_field_num"

  sql="insert overwrite table $target_db.$tbl "
  echo "初始:$sql"

  # 4. Build the INSERT OVERWRITE statement; its shape depends on how
  # many partition columns the table has.
  if [ "$partition_field_num" -eq 2 ]; then
    partition1=$(sed -n '/#/,$p' desc_field.out | grep '^[^#]' | awk 'NR==1{print $1}')
    partition2=$(sed -n '/#/,$p' desc_field.out | grep '^[^#]' | awk 'NR==2{print $1}')
    sql=$sql"partition($partition1,$partition2) select $md5_fields from $table where $partition1 in ($in_values);"
    echo "拼接2个分区后:$sql"

  elif [ "$partition_field_num" -eq 1 ]; then
    partition1=$(sed -n '/#/,$p' desc_field.out | grep '^[^#]' | awk 'NR==1{print $1}')
    sql=$sql"partition($partition1) select $md5_fields from $table where $partition1 in ($in_values);"
    echo "拼接1个分区后:$sql"

  else
    sql=$sql"select $md5_fields from $table;"
    echo "无分区:$sql"
  fi

  # 5. Emit update_partitions.sql: fixed session settings, then the
  # generated statement.
  cat <<'EOF' >update_partitions.sql
set hive.execution.engine=tez;
set tez.queue.name=root.default;
set hive.tez.java.opts=-Xmx6144m;
set hive.tez.container.size=8192;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partitions.pernode=1000;
EOF
  echo "$sql" >>update_partitions.sql

fi