一行拆多行结合使用了
lateral view
和explode
。
多行拼一行结合使用了concat_ws
与collect_set
,某些hive版本支持group_concat
可以代替前者。
举例说明:
有两张表sen_tbl(敏感字段定义表)
、msg_tbl(消息表)
。其中消息表的消息是一个json字符串,最多有两层json嵌套。而json消息中的某些value是含有敏感信息的,key-value是否敏感的定义存放在sen_tbl
中。
现在要做的是将json消息中的敏感value加密,然后分成不含敏感的json,和只含敏感的json。所以需要分为两步:
- 拆分json,并加密敏感value
- 分别合并敏感和非敏感键值对,合并成2个json,放在不同的列
1. 建表,插入数据:
create table sen_tbl (
col_nam string, --字段(key)名称
is_sen string --是否敏感:1敏感,2不敏感
);
insert into sen_tbl values('CRDACCPTNMELCT','1'); --敏感
insert into sen_tbl values('TXNMERCHNO','1'); --敏感
insert into sen_tbl values('ACCTNBR','0');
insert into sen_tbl values('CRSERVICECODE','0');
insert into sen_tbl values('modelFileId','0');
insert into sen_tbl values('trs_ist','0');
drop table if exists msg_tbl;
create table msg_tbl (
msg_id string, --消息id
json_msg string --消息主体
);
insert into msg_tbl values(
'1',
'{
"sceneParams": {
"ACCTNBR": "0100020100001001",
"CRDACCPTNMELCT": "如家和美酒店管理有限公司如家快捷天坛店",
"CRSERVICECODE": "340",
"TXNMERCHNO": "303605280000102"
},
"modelFileId": 2,
"modelFileThreshold": {
"trs_ist": 0.4
}
}'
);
insert into msg_tbl values(
'2',
'{
"sceneParams": {
"ACCTNBR": "0100020100001222",
"CRDACCPTNMELCT": "招商银行",
"CRSERVICECODE": "340",
"TXNMERCHNO": "303605285555555"
}
}'
);
2. 将json消息的第一层拆分成多行数据:
create table explode_msg as
select msg_id, --消息id
'1' as json_lv, --key在json中的层级
tbl1.key,
tbl1.value
from msg_tbl
lateral view explode(default.json_to_map(json_msg)) tbl1 as key,value
;
SQL执行结果数据在简书中没有对齐,在其他编辑器入notepad++中是可以对齐的
+---------------------+----------------------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
| explode_msg.msg_id | explode_msg.json_lv | explode_msg.key | explode_msg.value |
+---------------------+----------------------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
| 1 | 1 | modelFileId | 2 |
| 1 | 1 | sceneParams | {"CRDACCPTNMELCT":"如家和美酒店管理有限公司如家快捷天坛店","CRSERVICECODE":"340","ACCTNBR":"0100020100001001","TXNMERCHNO":"303605280000102"} |
| 1 | 1 | modelFileThreshold | {"trs_ist":0.4} |
| 2 | 1 | sceneParams | {"CRDACCPTNMELCT":"招商银行","CRSERVICECODE":"340","ACCTNBR":"0100020100001222","TXNMERCHNO":"303605285555555"} |
+---------------------+----------------------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
3. 将json消息第二层拆分成多行数据:
drop table if exists explode_msg_2;
create table explode_msg_2 as
select msg_id,
'2' as json_lv,
key,
tbl1.key2,
tbl1.value2
from explode_msg
lateral view explode(default.json_to_map(value)) tbl1 as key2,value2
;
+-----------------------+------------------------+---------------------+---------------------+-----------------------------------------+
| explode_msg_2.msg_id | explode_msg_2.json_lv | explode_msg_2.key | explode_msg_2.key2 | explode_msg_2.value2 |
+-----------------------+------------------------+---------------------+---------------------+-----------------------------------------+
| 1 | 2 | sceneParams | CRDACCPTNMELCT | 如家和美酒店管理有限公司如家快捷天坛店 |
| 1 | 2 | sceneParams | CRSERVICECODE | 340 |
| 1 | 2 | sceneParams | ACCTNBR | 0100020100001001 |
| 1 | 2 | sceneParams | TXNMERCHNO | 303605280000102 |
| 1 | 2 | modelFileThreshold | trs_ist | 0.4 |
| 2 | 2 | sceneParams | CRDACCPTNMELCT | 招商银行 |
| 2 | 2 | sceneParams | CRSERVICECODE | 340 |
| 2 | 2 | sceneParams | ACCTNBR | 0100020100001222 |
| 2 | 2 | sceneParams | TXNMERCHNO | 303605285555555 |
+-----------------------+------------------------+---------------------+---------------------+-----------------------------------------+
4. 合并以上两步的数据,并对敏感value加密:
drop table if exists explode_msg_fnl;
create table explode_msg_fnl as
select msg_id,
json_lv,
key as key1,
key2,
value2 as value,
case when b.is_sen = '1' then md5(a.value2) else a.value2 end as value_sen
from explode_msg_2 a
join sen_tbl b
on a.key2 = b.col_nam
union all
select msg_id,
json_lv,
key as key1,
null,
value,
case when b.is_sen = '1' then md5(a.value) else a.value end as value_sen
from explode_msg a
join sen_tbl b
on a.key = b.col_nam
where value not like '{"%'
;
+-------------------------+--------------------------+-----------------------+-----------------------+-----------------------------------------+-----------------------------------+
| explode_msg_fnl.msg_id | explode_msg_fnl.json_lv | explode_msg_fnl.key1 | explode_msg_fnl.key2 | explode_msg_fnl.value | explode_msg_fnl.value_sen |
+-------------------------+--------------------------+-----------------------+-----------------------+-----------------------------------------+-----------------------------------+
| 1 | 1 | modelFileId | NULL | 2 | 2 |
| 1 | 2 | sceneParams | ACCTNBR | 0100020100001001 | 0100020100001001 |
| 1 | 2 | modelFileThreshold | trs_ist | 0.4 | 0.4 |
| 1 | 2 | sceneParams | CRSERVICECODE | 340 | 340 |
| 1 | 2 | sceneParams | TXNMERCHNO | 303605280000102 | ecb0166601a8264180164810a2df4ee9 |
| 1 | 2 | sceneParams | CRDACCPTNMELCT | 如家和美酒店管理有限公司如家快捷天坛店 | d531cfc939890cfbb3127f59bc30060a |
| 2 | 2 | sceneParams | ACCTNBR | 0100020100001222 | 0100020100001222 |
| 2 | 2 | sceneParams | CRSERVICECODE | 340 | 340 |
| 2 | 2 | sceneParams | TXNMERCHNO | 303605285555555 | 0ab5cdd213a8313d69b3d8e1b5e1eadb |
| 2 | 2 | sceneParams | CRDACCPTNMELCT | 招商银行 | e0f88f4dbec781d1ab8402e53f0e25c3 |
+-------------------------+--------------------------+-----------------------+-----------------------+-----------------------------------------+-----------------------------------+
5. 重新合并json的第二层的key、value,多行合并为一行:
drop table if exists result_msg;
create table result_msg as
select msg_id, key1, is_sen, concat('"',value,'"') as key1_val
from explode_msg_fnl
where json_lv = 1
union all
select msg_id,
key1,
is_sen,
concat('{', concat_ws(',', collect_set(map_val)), '}') as key1_val
from (select msg_id,
key1,
concat('"',key2,'":"',value,'"') as map_val,
is_sen
from explode_msg_fnl
where json_lv = 2
) t
group by msg_id, key1, is_sen
;
+--------------------+---------------------+--------------------+--------------------------------------------------------------------------------------------------------+
| result_msg.msg_id | result_msg.key1 | result_msg.is_sen | result_msg.key1_val |
+--------------------+---------------------+--------------------+--------------------------------------------------------------------------------------------------------+
| 1 | modelFileThreshold | 0 | {"trs_ist":"0.4"} |
| 1 | sceneParams | 0 | {"CRSERVICECODE":"340","ACCTNBR":"0100020100001001"} |
| 1 | sceneParams | 1 | {"CRDACCPTNMELCT":"d531cfc939890cfbb3127f59bc30060a","TXNMERCHNO":"ecb0166601a8264180164810a2df4ee9"} |
| 2 | sceneParams | 0 | {"CRSERVICECODE":"340","ACCTNBR":"0100020100001222"} |
| 2 | sceneParams | 1 | {"CRDACCPTNMELCT":"e0f88f4dbec781d1ab8402e53f0e25c3","TXNMERCHNO":"0ab5cdd213a8313d69b3d8e1b5e1eadb"} |
| 1 | modelFileId | 0 | "2" |
+--------------------+---------------------+--------------------+--------------------------------------------------------------------------------------------------------+
6. 合并json第一层的key、value:
drop table if exists result_msg_2;
create table result_msg_2 as
select msg_id,
is_sen,
concat('{', concat_ws(',', collect_set(json_val)), '}') as json_val
from (select msg_id,
is_sen,
concat('"',key1,'": ',key1_val) as json_val
from result_msg
) t
group by msg_id, is_sen
;
+----------------------+----------------------+-----------------------------------------------------------------------------------------------------------------------------------+
| result_msg_2.msg_id | result_msg_2.is_sen | result_msg_2.json_val |
+----------------------+----------------------+-----------------------------------------------------------------------------------------------------------------------------------+
| 1 | 0 | {"modelFileThreshold": {"trs_ist":"0.4"},"sceneParams": {"CRSERVICECODE":"340","ACCTNBR":"0100020100001001"},"modelFileId": "2"} |
| 1 | 1 | {"sceneParams": {"CRDACCPTNMELCT":"d531cfc939890cfbb3127f59bc30060a","TXNMERCHNO":"ecb0166601a8264180164810a2df4ee9"}} |
| 2 | 0 | {"sceneParams": {"CRSERVICECODE":"340","ACCTNBR":"0100020100001222"}} |
| 2 | 1 | {"sceneParams": {"CRDACCPTNMELCT":"e0f88f4dbec781d1ab8402e53f0e25c3","TXNMERCHNO":"0ab5cdd213a8313d69b3d8e1b5e1eadb"}} |
+----------------------+----------------------+-----------------------------------------------------------------------------------------------------------------------------------+
7. 行转列,一列为不包含敏感的json,另一列为包含敏感的json:
create table result_msg_fnl as
select msg_id,
max(case when is_sen = 0 then json_val end) as json_val,
max(case when is_sen = 1 then json_val end) as json_val_sen
from result_msg_2
group by msg_id
;
+------------------------+-----------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------+
| result_msg_fnl.msg_id | result_msg_fnl.json_val | result_msg_fnl.json_val_sen |
+------------------------+-----------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------+
| 1 | {"modelFileThreshold": {"trs_ist":"0.4"},"sceneParams": {"CRSERVICECODE":"340","ACCTNBR":"0100020100001001"},"modelFileId": "2"} | {"sceneParams": {"CRDACCPTNMELCT":"d531cfc939890cfbb3127f59bc30060a","TXNMERCHNO":"ecb0166601a8264180164810a2df4ee9"}} |
| 2 | {"sceneParams": {"CRSERVICECODE":"340","ACCTNBR":"0100020100001222"}} | {"sceneParams": {"CRDACCPTNMELCT":"e0f88f4dbec781d1ab8402e53f0e25c3","TXNMERCHNO":"0ab5cdd213a8313d69b3d8e1b5e1eadb"}} |
+------------------------+-----------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------+