1.设备主题
1.1 活跃设备数(日、周、月)
需求定义:
日活:当日活跃的设备数
周活:当周活跃的设备数
月活:当月活跃的设备数
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_uv_count;
CREATE external TABLE ads_uv_count (
`dt` string COMMENT '统计日期',
`day_count` BIGINT COMMENT '当日用户数量',
`wk_count` BIGINT COMMENT '当周用户数量',
`mn_count` BIGINT COMMENT '当月用户数量',
`is_weekend` string COMMENT 'Y,N 是否是周末,用于得到本周最终结果',
`is_monthend` string COMMENT 'Y,N 是否是月末,用于得到本月最终结果'
) COMMENT '活跃设备数' ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_uv_count/';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_uv_count SELECT
'2020-03-10' dt,
daycount.ct,
wkcount.ct,
mncount.ct,
IF (
date_add(
next_day ('2020-03-10', 'MO') ,- 1
) = '2020-03-10',
'Y',
'N'
),
IF (
last_day('2020-03-10') = '2020-03-10',
'Y',
'N'
)
FROM
(
SELECT
'2020-03-10' dt,
count(*) ct
FROM
dwt_uv_topic
WHERE
login_date_last = '2020-03-10'
) daycount
JOIN (
SELECT
'2020-03-10' dt,
count(*) ct
FROM
dwt_uv_topic
WHERE
login_date_last >= date_add(
next_day ('2020-03-10', 'MO') ,- 7
)
AND login_date_last <= date_add(
next_day ('2020-03-10', 'MO') ,- 1
)
) wkcount ON daycount.dt = wkcount.dt
JOIN (
SELECT
'2020-03-10' dt,
count(*) ct
FROM
dwt_uv_topic
WHERE
date_format(login_date_last, 'yyyy-MM') = date_format('2020-03-10', 'yyyy-MM')
) mncount ON daycount.dt = mncount.dt;
3)查询导入结果
hive (gmall)> select * from ads_uv_count;
1.2 每日新增设备
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_new_mid_count;
CREATE external TABLE ads_new_mid_count (
`create_date` string COMMENT '创建时间',
`new_mid_count` BIGINT COMMENT '新增设备数量'
) COMMENT '每日新增设备信息数量'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_new_mid_count/';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_new_mid_count SELECT
login_date_first,
count(*)
FROM
dwt_uv_topic
WHERE
login_date_first = '2020-03-10'
GROUP BY
login_date_first;
3)查询导入数据
hive (gmall)> select * from ads_new_mid_count;
1.3 沉默用户数
需求定义:
沉默用户:只在安装当天启动过,且启动时间是在 7 天前
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_silent_count;
CREATE external TABLE ads_silent_count (
`dt` string COMMENT '统计日期',
`silent_count` BIGINT COMMENT '沉默设备数'
) ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_silent_count';
2)导入 2020-03-20 数据
hive (gmall) > INSERT INTO TABLE ads_silent_count SELECT
'2020-03-15',
count(*)
FROM
dwt_uv_topic
WHERE
login_date_first = login_date_last
AND login_date_last <= date_add('2020-03-15' ,- 7);
3)查询导入数据
hive (gmall)> select * from ads_silent_count;
1.4 本周回流用户数
需求定义:
本周回流用户:上周未活跃,本周活跃的设备,且不是本周新增设备
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_back_count;
CREATE external TABLE ads_back_count (
`dt` string COMMENT '统计日期',
`wk_dt` string COMMENT '统计日期所在周',
`wastage_count` BIGINT COMMENT '回流设备数'
) ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_back_count';
2)导入数据:
hive (gmall) > INSERT INTO TABLE ads_back_count SELECT
'2020-03-15',
count(*)
FROM
(
SELECT
mid_id
FROM
dwt_uv_topic
WHERE
login_date_last >= date_add(
next_day ('2020-03-15', 'MO') ,- 7
)
AND login_date_last <= date_add(
next_day ('2020-03-15', 'MO') ,- 1
)
AND login_date_first < date_add(
next_day ('2020-03-15', 'MO') ,- 7
)
) current_wk
LEFT JOIN (
SELECT
mid_id
FROM
dws_uv_detail_daycount
WHERE
dt >= date_add(
next_day ('2020-03-15', 'MO') ,- 7 * 2
)
AND dt <= date_add(
next_day ('2020-03-15', 'MO') ,- 7 - 1
)
GROUP BY
mid_id
) last_wk ON current_wk.mid_id = last_wk.mid_id
WHERE
last_wk.mid_id IS NULL;
3)查询结果
hive (gmall)> select * from ads_back_count;
1.5 流失用户数
需求定义:
流失用户:最近 7 天未活跃的设备
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_wastage_count;
CREATE external TABLE ads_wastage_count (
`dt` string COMMENT '统计日期',
`wastage_count` BIGINT COMMENT '流失设备数'
) ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_wastage_count';
2)导入 2020-03-20 数据
hive (gmall) > INSERT INTO TABLE ads_wastage_count SELECT
'2020-03-20',
count(*)
FROM
(
SELECT
mid_id
FROM
dwt_uv_topic
WHERE
login_date_last <= date_add('2020-03-20' ,- 7)
GROUP BY
mid_id
) t1;
3)查询结果
hive (gmall)> select * from ads_wastage_count;
1.6 留存率
需求定义
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_user_retention_day_rate;
CREATE external TABLE ads_user_retention_day_rate (
`stat_date` string COMMENT '统计日期',
`create_date` string COMMENT '设备新增日期',
`retention_day` INT COMMENT '截止当前日期留存天数',
`retention_count` BIGINT COMMENT '留存数量',
`new_mid_count` BIGINT COMMENT '设备新增数量',
`retention_ratio` DECIMAL (10, 2) COMMENT '留存率'
) COMMENT '每日用户留存情况'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_user_retention_day_rate/';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_user_retention_day_rate SELECT
'2020-03-10' ,-- 统计日期 date_add('2020-03-10' ,- 1) ,-- 新增日期 1 ,-- 留存天数 sum(
IF (
login_date_first = date_add('2020-03-10' ,- 1)
AND login_date_last = '2020-03-10',
1,
0
)
) ,-- 2020 - 03 - 09 的 1 日留存数 sum(
IF (
login_date_first = date_add('2020-03-10' ,- 1),
1,
0
)
) ,-- 2020 - 03 - 09 新增 sum(
IF (
login_date_first = date_add('2020-03-10' ,- 1)
AND login_date_last = '2020-03-10',
1,
0
)
) / sum(
IF (
login_date_first = date_add('2020-03-10' ,- 1),
1,
0
)
) * 100
FROM
dwt_uv_topicunion ALL SELECT
'2020-03-10' ,-- 统计日期 date_add('2020-03-10' ,- 2) ,-- 新增日期 2 ,-- 留存天数 sum(
IF (
login_date_first = date_add('2020-03-10' ,- 2)
AND login_date_last = '2020-03-10',
1,
0
)
) ,-- 2020 - 03 - 08 的 2 日留存数 sum(
IF (
login_date_first = date_add('2020-03-10' ,- 2),
1,
0
)
) ,-- 2020 - 03 - 08 新增 sum(
IF (
login_date_first = date_add('2020-03-10' ,- 2)
AND login_date_last = '2020-03-10',
1,
0
)
) / sum(
IF (
login_date_first = date_add('2020-03-10' ,- 2),
1,
0
)
) * 100
FROM
dwt_uv_topic
UNION ALL
SELECT
'2020-03-10' ,-- 统计日期 date_add('2020-03-10' ,- 3) ,-- 新增日期 3 ,-- 留存天数 sum(
IF (
login_date_first = date_add('2020-03-10' ,- 3)
AND login_date_last = '2020-03-10',
1,
0
)
) ,-- 2020 - 03 - 07 的 3 日留存数 sum(
IF (
login_date_first = date_add('2020-03-10' ,- 3),
1,
0
)
) ,-- 2020 - 03 - 07 新增 sum(
IF (
login_date_first = date_add('2020-03-10' ,- 3)
AND login_date_last = '2020-03-10',
1,
0
)
) / sum(
IF (
login_date_first = date_add('2020-03-10' ,- 3),
1,
0
)
) * 100
FROM
dwt_uv_topic;
3)查询导入数据
hive (gmall)>select * from ads_user_retention_day_rate;
1.7 最近连续三周活跃用户数
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_continuity_wk_count;
CREATE external TABLE ads_continuity_wk_count (
`dt` string COMMENT '统计日期,一般用结束周周日日期,如果每天计算一次,可用当天日 期',
`wk_dt` string COMMENT '持续时间',
`continuity_count` BIGINT COMMENT '活跃次数'
) ROW format delimited FIELDS TERMINATED BY '\t' location '/warehouse/gmall/ads/ads_continuity_wk_count';
2)导入 2020-03-20 所在周的数据
hive (gmall) > INSERT INTO TABLE ads_continuity_wk_count SELECT
'2020-03-15',
concat(
date_add(
next_day ('2020-03-15', 'MO') ,- 7 * 3
),
'_',
date_add(
next_day (' 2020-03-15', 'MO') ,- 1
)
),
count(*)
FROM
(
SELECT
mid_id
FROM
(
selectmid_id
FROM
dws_uv_detail_daycount
WHERE
dt >= date_add(
next_day ('2020-03-10', 'monday') ,- 7
)
AND dt <= date_add(
next_day ('2020-03-10', 'monday') ,- 1
)
GROUP BY
mid_id
UNION ALL
SELECT
mid_id
FROM
dws_uv_detail_daycount
WHERE
dt >= date_add(
next_day ('2020-03-10', 'monday') ,- 7 * 2
)
AND dt <= date_add(
next_day ('2020-03-10', 'monday') ,- 7 - 1
)
GROUP BY
mid_id
UNION ALL
SELECT
mid_id
FROM
dws_uv_detail_daycount
WHERE
dt >= date_add(
next_day ('2020-03-10', 'monday') ,- 7 * 3
)
AND dt <= date_add(
next_day ('2020-03-10', 'monday') ,- 7 * 2 - 1
)
GROUP BY
mid_id
) t1
GROUP BY
mid_id
HAVING
count(*) = 3
) t2
3)查询
hive (gmall)> select * from ads_continuity_wk_count;
1.8 最近七天内连续三天活跃用户数
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_continuity_uv_count;
CREATE external TABLE ads_continuity_uv_count (
`dt` string COMMENT '统计日期',
`wk_dt` string COMMENT '最近 7 天日期',
`continuity_count` BIGINT
) COMMENT '连续活跃设备数'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_continuity_uv_count';
2)写出导入数据的 SQL 语句
2)写出导入数据的 SQL 语句 hive (gmall) > INSERT INTO TABLE ads_continuity_uv_count SELECT
'2020-03-12',
concat(
date_add('2020-03-12' ,- 6),
'_',
'2020-03-12'
),
count(*)
FROM
(
SELECT
mid_id
FROM
(
SELECT
mid_id
FROM
(
SELECT
mid_id,
date_sub(dt, rank) date_dif
FROM
(
SELECT
mid_id,
dt,
rank () over (PARTITION BY mid_id ORDER BY dt) rank
FROM
dws_uv_detail_daycount
WHERE
dt >= date_add('2020-03-12' ,- 6)
AND dt <= '2020-03-12'
) t1
) t2
GROUP BY
mid_id,
date_dif
HAVING
count(*) >= 3
) t3
GROUP BY
mid_id
) t4;
3)查询
hive (gmall)> select * from ads_continuity_uv_count;
2.会员主题
2.1 会员主题信息
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_user_topic;
CREATE external TABLE ads_user_topic (
`dt` string COMMENT '统计日期',
`day_users` string COMMENT '活跃会员数',
`day_new_users` string COMMENT '新增会员数',
`day_new_payment_users` string COMMENT '新增消费会员数',
`payment_users` string COMMENT '总付费会员数',
`users` string COMMENT '总会员数',
`day_users2users` DECIMAL (10, 2) COMMENT '会员活跃率',
`payment_users2users` DECIMAL (10, 2) COMMENT '会员付费率',
`day_new_users2users` DECIMAL (10, 2) COMMENT '会员新鲜度'
) COMMENT '会员主题信息表'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_user_topic';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_user_topic SELECT
'2020-03-10',
sum(
IF (
login_date_last = '2020-03-10',
1,
0
)
),
sum(
IF (
login_date_first = '2020-03-10',
1,
0
)
),
sum(
IF (
payment_date_first = '2020-03-10',
1,
0
)
),
sum(IF(payment_count > 0, 1, 0)),
count(*),
sum(
IF (
login_date_last = '2020-03-10',
1,
0
)
) / count(*),
sum(IF(payment_count > 0, 1, 0)) / count(*),
sum(
IF (
login_date_first = '2020-03-10',
1,
0
)
) / sum(
IF (
login_date_last = '2020-03-10',
1,
0
)
)
FROM
dwt_user_topic
3)查询导入结果
hive (gmall)> select * from ads_user_topic;
4)在/home/atguigu/bin 目录下编写脚本 ads_user_topic.sh
[atguigu@hadoop102 bin]$ vim ads_user_topic.sh
添加如下内容:
#!/bin/bash
APP=gmall
hive=/opt/module/hive/bin/hive
# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;
then
do
_date=$1
else
do
_date=`date -d "-1 day" +%F`
fisql="
WITH tmp_day_users AS (
SELECT
'$do_date' dt,
count(*) day_users from$ { APP }.dwt_user_topic wherelogin_date_last = '$do_date'
),
tmp_day_new_users AS (
SELECT
'$do_date' dt,
count(*) day_new_users from$ { APP }.dwt_user_topic wherelogin_date_last = '$do_date'
AND login_date_first = '$do_date'
),
tmp_day_new_payment_users AS (
SELECT
'$do_date' dt,
count(*) day_new_payment_users from$ { APP }.dwt_user_topic wherepayment_date_first = '$do_date'
),
tmp_payment_users AS (
SELECT
'$do_date' dt,
count(*) payment_users from$ { APP }.dwt_user_topic wherepayment_date_first IS NOT NULL
),
tmp_users AS (
SELECT
'$do_date' dt,
count(*) users from$ { APP }.dwt_user_topic tmp_users
) INSERT INTO TABLE $ { APP }.ads_user_topic SELECT
'$do_date' dt,
day_users,
day_new_users,
day_new_payment_users,
payment_users,
users,
day_users / users,
payment_users / users,
day_new_users / users fromtmp_day_users
JOIN tmp_day_new_users ON tmp_day_users.dt = tmp_day_new_users.dt
JOIN tmp_day_new_payment_users ON tmp_day_users.dt = tmp_day_new_payment_users.dt
JOIN tmp_payment_users ON tmp_day_users.dt = tmp_payment_users.dt
JOIN tmp_users ontmp_day_users.dt = tmp_users.dt;
"
$hive -e "$sql"
5)增加脚本执行权限
[atguigu@hadoop102 bin]$ chmod 777 ads_user_topic.sh
6)执行脚本导入数据
[atguigu@hadoop102 bin]$ ads_user_topic.sh 2020-03-11
7)查看导入数据
hive (gmall)> select * from ads_user_topic;
2.2 漏斗分析
统计“浏览->购物车->下单->支付”的转化率
思路:统计各个行为的人数,然后计算比值。
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_user_action_convert_day;
CREATE external TABLE ads_user_action_convert_day (
`dt` string COMMENT '统计日期',
`total_visitor_m_count` BIGINT COMMENT '总访问人数',
`cart_u_count` BIGINT COMMENT '加入购物车的人数',
`visitor2cart_convert_ratio` DECIMAL (10, 2) COMMENT '访问到加入购物车转化率',
`order_u_count` BIGINT COMMENT '下单人数',
`cart2order_convert_ratio` DECIMAL (10, 2) COMMENT '加入购物车到下单转化率',
`payment_u_count` BIGINT COMMENT '支付人数',
`order2payment_convert_ratio` DECIMAL (10, 2) COMMENT '下单到支付的转化率'
) COMMENT '用户行为漏斗分析'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_user_action_convert_day/';
2)数据装载
hive (gmall) > INSERT INTO TABLE ads_user_action_convert_day SELECT
'2020-03-10',
uv.day_count,
ua.cart_count,
cast(
ua.cart_count / uv.day_count AS DECIMAL (10, 2)
) visitor2cart_convert_ratio,
ua.order_count,
cast(
ua.order_count / ua.cart_count AS DECIMAL (10, 2)
) visitor2order_convert_ratio,
ua.payment_count,
cast(
ua.payment_count / ua.order_count AS DECIMAL (10, 2)
) order2payment_convert_ratio
FROM
(
SELECT
dt,
sum(IF(cart_count > 0, 1, 0)) cart_count,
sum(IF(order_count > 0, 1, 0)) order_count,
sum(IF(payment_count > 0, 1, 0)) payment_count
FROM
dws_user_action_daycount
WHERE
dt = '2020-03-10'
GROUP BY
dt
) ua
JOIN ads_uv_count uv ON uv.dt = ua.dt;
3)查询导入结果
hive (gmall)> select * from ads_user_action_convert_day;
3.商品主题
3.1 商品个数信息
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_product_info;
CREATE external TABLE ads_product_info (
`dt` string COMMENT '统计日期',
`sku_num` string COMMENT 'sku 个数',
`spu_num` string COMMENT 'spu 个数'
) COMMENT '商品个数信息'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_product_info';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_product_infoselect '2020-03-10' dt,
sku_num,
spu_num
FROM
(
SELECT
'2020-03-10' dt,
count(*) sku_num fromdwt_sku_topic
) tmp_sku_num
JOIN (
SELECT
'2020-03-10' dt,
count(*) spu_num
FROM
(
SELECT
spu_id fromdwt_sku_topic
GROUP BY
spu_id
) tmp_spu_id
) tmp_spu_num ON tmp_sku_num.dt = tmp_spu_num.dt;
3)查询导入结果
hive (gmall)> select * from ads_product_info;
3.2 商品销量排名
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_product_sale_topN;
CREATE external TABLE ads_product_sale_topN (
`dt` string COMMENT '统计日期',
`sku_id` string COMMENT '商品 ID',
`payment_amount` BIGINT COMMENT '销量'
) COMMENT '商品个数信息'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_product_sale_topN';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_product_sale_topN SELECT
'2020-03-10' dt,
sku_id,
payment_amount fromdws_sku_action_daycount wheredt = '2020-03-10'
ORDER BY
payment_amount DESC
LIMIT 10;
3)查询导入结果
hive (gmall)> select * from ads_product_sale_topN;
3.3 商品收藏排名
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_product_favor_topN;
CREATE external TABLE ads_product_favor_topN (
`dt` string COMMENT '统计日期',
`sku_id` string COMMENT '商品 ID',
`favor_count` BIGINT COMMENT '收藏量'
) COMMENT '商品收藏 TopN'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_product_favor_topN';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_product_favor_topN SELECT
'2020-03-10' dt,
sku_id,
favor_count from
dws_sku_action_daycount wheredt = '2020-03-10'
ORDER BY
favor_count DESC
LIMIT 10;
3)查询导入结果
hive (gmall)> select * from ads_product_favor_topN;
3.4 商品加入购物车排名
1)建表语句
DROP TABLE
IF EXISTS ads_product_cart_topN;
CREATE external TABLE ads_product_cart_topN (
`dt` string COMMENT '统计日期',
`sku_id` string COMMENT '商品 ID',
`cart_num` BIGINT COMMENT '加入购物车数量'
) COMMENT '商品加入购物车 TopN'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_product_cart_topN';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_product_cart_topN SELECT
'2020-03-10' dt,
sku_id,
cart_num
FROM
dws_sku_action_daycount wheredt = '2020-03-10'
ORDER BY
cart_num DESC
LIMIT 10;
3)查询导入结果
hive (gmall)> select * from ads_product_cart_topN;
3.5 商品退款率排名(最近 30 天)
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_product_refund_topN;
CREATE external TABLE ads_product_refund_topN (
`dt` string COMMENT '统计日期',
`sku_id` string COMMENT '商品 ID',
`refund_ratio` DECIMAL (10, 2) COMMENT '退款率'
) COMMENT '商品退款率 TopN'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_product_refund_topN';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_product_refund_topN SELECT
'2020-03-10',
sku_id,
refund_last_30d_count / payment_last_30d_count * 100 refund_ratio
FROM
dwt_sku_topic
ORDER BY
refund_ratio DESC
LIMIT 10;
3)查询导入结果
hive (gmall)> select * from ads_product_refund_topN;
3.6 商品差评率
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_appraise_bad_topN;
CREATE external TABLE ads_appraise_bad_topN (
`dt` string COMMENT '统计日期',
`sku_id` string COMMENT '商品 ID',
`appraise_bad_ratio` DECIMAL (10, 2) COMMENT '差评率'
) COMMENT '商品差评率 TopN'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_appraise_bad_topN';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_appraise_bad_topN SELECT
'2020-03-10' dt,
sku_id,
appraise_bad_count / (
appraise_good_count + appraise_mid_count + appraise_bad_coun t + appraise_default_count
) appraise_bad_ratio fromdws_sku_action_daycount
WHERE
dt = '2020-03-10'
ORDER BY
appraise_bad_ratio DESC
LIMIT 10;
3)查询导入结果
hive (gmall)> select * from ads_appraise_bad_topN;
4.营销主题(用户+商品+购买行为)
4.1 下单数目统计
需求分析:统计每日下单数,下单金额及下单用户数。
hive (gmall) > DROP TABLE
IF EXISTS ads_order_daycount;
CREATE external TABLE ads_order_daycount (
dt string COMMENT '统计日期',
order_count BIGINT COMMENT '单日下单笔数',
order_amount BIGINT COMMENT '单日下单金额',
order_users BIGINT COMMENT '单日下单用户数'
) COMMENT '每日订单总计表'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_order_daycount';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_order_daycount SELECT
'2020-03-10',
sum(order_count),
sum(order_amount),
sum(IF(order_count > 0, 1, 0))
FROM
dws_user_action_daycount
WHERE
dt = '2020-03-10';
3)查询导入结果
hive (gmall)> select * from ads_order_daycount;
4.2 支付信息统计
每日支付金额、支付人数、支付商品数、支付笔数以及下单到支付的平均时长(取自 DWD)
1)建表语句
hive (gmall) > DROP TABLE
IF EXISTS ads_payment_daycount;
CREATE external TABLE ads_payment_daycount (
dt string COMMENT '统计日期',
order_count BIGINT COMMENT '单日支付笔数',
order_amount BIGINT COMMENT '单日支付金额',
payment_user_count BIGINT COMMENT '单日支付人数',
payment_sku_count BIGINT COMMENT '单日支付商品数',
payment_avg_time DOUBLE COMMENT '下单到支付的平均时长,取分钟数'
) COMMENT '每日订单总计表'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_payment_daycount';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_payment_daycount SELECT
tmp_payment.dt,
tmp_payment.payment_count,
tmp_payment.payment_amount,
tmp_payment.payment_user_count,
tmp_skucount.payment_sku_count,
tmp_time.payment_avg_time
FROM
(
SELECT
'2020-03-15' dt,
sum(payment_count) payment_count,
sum(payment_amount) payment_amount,
sum(IF(payment_count > 0, 1, 0)) payment_user_count
FROM
dws_user_action_daycount
WHERE
dt = '2020-03-15'
) tmp_payment
JOIN (
SELECT
'2020-03-15' dt,
sum(IF(payment_count > 0, 1, 0)) payment_sku_count
FROM
dws_sku_action_daycount
WHERE
dt = '2020-03-15'
) tmp_skucount ON tmp_payment.dt = tmp_skucount.dt
JOIN (
SELECT
'2020-03-15' dt,
sum(
unix_timestamp(payment_time) - unix_timestamp(create_time)
) / count(*) / 60 payment_avg_time
FROM
dwd_fact_order_info
WHERE
dt = '2020-03-15'
AND payment_time IS NOT NULL
) tmp_time ON tmp_payment.dt = tmp_time.dt
3)查询导入结果
hive (gmall)> select * from ads_payment_daycount;
4.3 复购率
1)建表语句
hive (gmall) > DROP TABLE ads_sale_tm_category1_stat_mn;
CREATE external TABLE ads_sale_tm_category1_stat_mn (
tm_id string COMMENT '品牌 id',
category1_id string COMMENT '1 级品类 id ',
category1_name string COMMENT '1 级品类名称 ',
buycount BIGINT COMMENT '购买人数',
buy_twice_last BIGINT COMMENT '两次以上购买人数',
buy_twice_last_ratio DECIMAL (10, 2) COMMENT '单次复购率',
buy_3times_last BIGINT COMMENT '三次以上购买人数',
buy_3times_last_ratio DECIMAL (10, 2) COMMENT '多次复购率',
stat_mn string COMMENT '统计月份',
stat_date string COMMENT '统计日期'
) COMMENT '复购率统计'
ROW format delimited FIELDS TERMINATED BY '\t'
location '/warehouse/gmall/ads/ads_sale_tm_category1_stat_mn/';
2)导入数据
hive (gmall) > INSERT INTO TABLE ads_sale_tm_category1_stat_mn selectmn.sku_tm_id,
mn.sku_category1_id,
mn.sku_category1_name,
sum(IF(mn.order_count >= 1, 1, 0)) buycount,
sum(IF(mn.order_count >= 2, 1, 0)) buyTwiceLast,
sum(IF(mn.order_count >= 2, 1, 0)) / sum(IF(mn.order_count >= 1, 1, 0)) buyTwiceLastRatio,
sum(IF(mn.order_count >= 3, 1, 0)) buy3timeLast,
sum(IF(mn.order_count >= 3, 1, 0)) / sum(IF(mn.order_count >= 1, 1, 0)) buy3timeLastRatio,
date_format('2019-02-10', 'yyyy-MM') stat_mn,
'2019-02-10' stat_date
FROM
(
SELECT
user_id,
sd.sku_tm_id,
sd.sku_category1_id,
sd.sku_category1_name,
sum(order_count) order_count
FROM
dws_sale_detail_daycount sd
WHERE
date_format(dt, 'yyyy-MM') = date_format('2019-02-10', 'yyyy-MM')
GROUP BY
user_id,
sd.sku_tm_id,
sd.sku_category1_id,
sd.sku_category1_name
) mn
GROUP BY
mn.sku_tm_id,
mn.sku_category1_id,
mn.sku_category1_name;
3)查询导入结果
hive (gmall)> select * from ads_sale_tm_category1_stat_mn;
5.ADS 层导入脚本
1)在/home/atguigu/bin 目录下创建脚本 dwt_to_ads.sh
[atguigu@hadoop102 bin]$ vim dwt_to_ads.sh
在脚本中填写如下内容
#!/bin/bash
hive=/opt/module/hive/bin/hive
# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;
then
do
_date=$1
else
do_date=`date -d "-1 day" +%F`
fisql="use gmall;
INSERT INTO TABLE ads_uv_count SELECT
'$do_date',
sum(
IF (
login_date_last = '$do_date',
1,
0
)
),
sum(
IF (
login_date_last >= date_add(
next_day ('$do_date', 'monday') ,- 7
)
AND login_date_last <= date_add(
next_day ('$do_date', 'monday') ,- 1
),
1,
0
)
),
sum(
IF (
date_format(login_date_last, 'yyyy-MM') = date_format('$do_date', 'yyyy-M M'),
1,
0
)
),
IF (
'$do_date' = date_add(
next_day ('$do_date', 'monday') ,- 1
),
'Y',
'N'
),
IF (
'$do_date' = last_day('$do_date'),
'Y',
'N'
)
FROM
dwt_uv_topic;
INSERT INTO TABLE ads_new_mid_count SELECT
'$do_date',
count(*)
FROM
dwt_uv_topic
WHERE
login_date_first = '$do_date';
INSERT INTO TABLE ads_silent_count SELECT
'$do_date',
count(*)
FROM
dwt_uv_topic
WHERE
login_date_first = login_date_last
AND login_date_last <= date_add('$do_date' ,- 7);
INSERT INTO TABLE ads_back_count SELECT
'$do_date',
concat(
date_add(
next_day ('2020-03-10', 'MO') ,- 7
),
'_',
date_add(
next_day ('2020- 03-10', 'MO') ,- 1
)
),
count(*)
FROM
(
SELECT
mid_id
FROM
dwt_uv_topic
WHERE
login_date_last >= date_add(
next_day ('$do_date', 'MO') ,- 7
)
AND login_date_last <= date_add(
next_day ('$do_date', 'MO') ,- 1
)
AND login_date_first < date_add(
next_day ('$do_date', 'MO') ,- 7
)
) current_wk
LEFT JOIN (
SELECT
mid_id
FROM
dws_uv_detail_daycount
WHERE
dt >= date_add(
next_day ('$do_date', 'MO') ,- 7 * 2
)
AND dt <= date_add(
next_day ('$do_date', 'MO') ,- 7 - 1
)
GROUP BY
mid_id
) last_wk ON current_wk.mid_id = last_wk.mid_id
WHERE
last_wk.mid_id IS NULL;
INSERT INTO TABLE ads_wastage_count SELECT
'$do_date',
count(*)
FROM
dwt_uv_topic
WHERE
login_date_last <= date_add('$do_date' ,- 7);
INSERT INTO TABLE ads_user_retention_day_rate SELECT
'$do_date',
date_add('$do_date' ,- 3),
3,
sum(
IF (
login_date_first = date_add('$do_date' ,- 3)
AND login_date_last = '$do_date',
1,
0
)
),
sum(
IF (
login_date_first = date_add('$do_date' ,- 3),
1,
0
)
),
sum(
IF (
login_date_first = date_add('$do_date' ,- 3)
AND login_date_last = '$do_date',
1,
0
)
) / sum(
IF (
login_date_first = date_add('$do_date', - 3),
1,
0
)
) * 100
FROM
dwt_uv_topic
UNION ALL
SELECT
'$do_date',
date_add('$do_date' ,- 2),
2,
sum(
IF (
login_date_first = date_add('$do_date' ,- 2)
AND login_date_last = '$do_date',
1,
0
)
),
sum(
IF (
login_date_first = date_add('$do_date' ,- 2),
1,
0
)
),
sum(
IF (
login_date_first = date_add('$do_date' ,- 2)
AND login_date_last = '$do_date',
1,
0
)
) / sum(
IF (
login_date_first = date_add('$do_date', - 2),
1,
0
)
) * 100
FROM
dwt_uv_topic
UNION ALL
SELECT
'$do_date',
date_add('$do_date' ,- 1),
1,
sum(
IF (
login_date_first = date_add('$do_date' ,- 1)
AND login_date_last = '$do_date',
1,
0
)
),
sum(
IF (
login_date_first = date_add('$do_date' ,- 1),
1,
0
)
),
sum(
IF (
login_date_first = date_add('$do_date' ,- 1)
AND login_date_last = '$do_date',
1,
0
)
) / sum(
IF (
login_date_first = date_add('$do_date', - 1),
1,
0
)
) * 100
FROM
dwt_uv_topic;
INSERT INTO TABLE ads_continuity_wk_count SELECT
'$do_date',
concat(
date_add(
next_day ('$do_date', 'MO') ,- 7 * 3
),
'_',
date_add(
next_day ('$do_d ate', 'MO') ,- 1
)
),
count(*)
FROM
(
SELECT
mid_id
FROM
(
SELECT
mid_id
FROM
dws_uv_detail_daycount
WHERE
dt >= date_add(
next_day ('$do_date', 'monday') ,- 7
)
AND dt <= date_add(
next_day ('$do_date', 'monday') ,- 1
)
GROUP BY
mid_id
UNION ALL
SELECT
mid_id
FROM
dws_uv_detail_daycount
WHERE
dt >= date_add(
next_day ('$do_date', 'monday') ,- 7 * 2
)
AND dt <= date_add(
next_day ('$do_date', 'monday') ,- 7 - 1
)
GROUP BY
mid_id
UNION ALL
SELECT
mid_id
FROM
dws_uv_detail_daycount
WHERE
dt >= date_add(
next_day ('$do_date', 'monday') ,- 7 * 3
)
AND dt <= date_add(
next_day ('$do_date', 'monday') ,- 7 * 2 - 1
)
GROUP BY
mid_id
) t1
GROUP BY
mid_id
HAVING
count(*) = 3
) t2;
INSERT INTO TABLE ads_continuity_uv_count SELECT
'$do_date',
concat(
date_add('$do_date' ,- 6),
'_',
'$do_date'
),
count(*)
FROM
(
SELECT
mid_id
FROM
(
SELECT
mid_id
FROM
(
SELECT
mid_id,
date_sub(dt, rank) date_dif
FROM
(
SELECT
mid_id,
dt,
rank () over (PARTITION BY mid_id ORDER BY dt) rank
FROM
dws_uv_detail_daycount
WHERE
dt >= date_add('$do_date' ,- 6)
AND dt <= '$do_date'
) t1
) t2
GROUP BY
mid_id,
date_dif
HAVING
count(*) >= 3
) t3
GROUP BY
mid_id
) t4;
INSERT INTO TABLE ads_user_topic SELECT
'$do_date',
sum(
IF (
login_date_last = '$do_date',
1,
0
)
),
sum(
IF (
login_date_first = '$do_date',
1,
0
)
),
sum(
IF (
payment_date_first = '$do_date',
1,
0
)
),
sum(IF(payment_count > 0, 1, 0)),
count(*),
sum(
IF (
login_date_last = '$do_date',
1,
0
)
) / count(*),
sum(IF(payment_count > 0, 1, 0)) / count(*),
sum(
IF (
login_date_first = '$do_date',
1,
0
)
) / sum(
IF (
login_date_last = '$do_date',
1,
0
)
)
FROM
dwt_user_topic;
INSERT INTO TABLE ads_user_action_convert_day SELECT
'$do_date',
uv.day_count,
ua.cart_count,
ua.cart_count / uv.day_count * 100 visitor2cart_convert_ratio,
ua.order_count,
ua.order_count / ua.cart_count * 100 visitor2order_convert_ratio,
ua.payment_count,
ua.payment_count / ua.order_count * 100 order2payment_convert_ratio
FROM
(
SELECT
'$do_date' dt,
sum(IF(cart_count > 0, 1, 0)) cart_count,
sum(IF(order_count > 0, 1, 0)) order_count,
sum(IF(payment_count > 0, 1, 0)) payment_count
FROM
dws_user_action_daycount
WHERE
dt = '$do_date'
) ua
JOIN ads_uv_count uv ON uv.dt = ua.dt;
INSERT INTO TABLE ads_product_info SELECT
'$do_date' dt,
sku_num,
spu_num
FROM
(
SELECT
'$do_date' dt,
count(*) sku_num fromdwt_sku_topic
) tmp_sku_num
JOIN (
SELECT
'$do_date' dt,
count(*) spu_num
FROM
(
SELECT
spu_id fromdwt_sku_topic
GROUP BY
spu_id
) tmp_spu_id
) tmp_spu_num ON tmp_sku_num.dt = tmp_spu_num.dt;
INSERT INTO TABLE ads_product_sale_topN SELECT
'$do_date',
sku_id,
payment_amount
FROM
dws_sku_action_daycount
WHERE
dt = '$do_date'
ORDER BY
payment_amount DESC
LIMIT 10;
INSERT INTO TABLE ads_product_favor_topN SELECT
'$do_date',
sku_id,
favor_count
FROM
dws_sku_action_daycount
WHERE
dt = '$do_date'
ORDER BY
favor_count
LIMIT 10;
INSERT INTO TABLE ads_product_cart_topN SELECT
'$do_date' dt,
sku_id,
cart_num
FROM
dws_sku_action_daycount
WHERE
dt = '$do_date'
ORDER BY
cart_num
LIMIT 10;
INSERT INTO TABLE ads_product_refund_topN SELECT
'$do_date',
sku_id,
refund_last_30d_count / payment_last_30d_count * 100 refund_ratio
FROM
dwt_sku_topic
ORDER BY
refund_ratio DESC
LIMIT 10;
INSERT INTO TABLE ads_appraise_bad_topN SELECT
'$do_date' dt,
sku_id,
appraise_bad_count / (
appraise_bad_count + appraise_good_count + appraise_mid_coun t + appraise_default_count
) * 100 appraise_bad_ratio
FROM
dws_sku_action_daycount
WHERE
dt = '$do_date'
ORDER BY
appraise_bad_ratio DESC
LIMIT 10;
INSERT INTO TABLE ads_order_daycount SELECT
'$do_date',
sum(order_count),
sum(order_amount),
sum(IF(order_count > 0, 1, 0))
FROM
dws_user_action_daycount
WHERE
dt = '$do_date';
INSERT INTO TABLE ads_payment_daycount SELECT
tmp_payment.dt,
tmp_payment.payment_count,
tmp_payment.payment_amount,
tmp_payment.payment_user_count,
tmp_skucount.payment_sku_count,
tmp_time.payment_avg_time
FROM
(
SELECT
'$do_date' dt,
sum(payment_count) payment_count,
sum(payment_amount) payment_amount,
sum(IF(payment_count > 0, 1, 0)) payment_user_count
FROM
dws_user_action_daycount
WHERE
dt = '$do_date'
) tmp_payment
JOIN (
SELECT
'$do_date' dt,
sum(IF(payment_count > 0, 1, 0)) payment_sku_count
FROM
dws_sku_action_daycount
WHERE
dt = '$do_date'
) tmp_skucount ON tmp_payment.dt = tmp_skucount.dt
JOIN (
SELECT
'$do_date' dt,
sum(
unix_timestamp(payment_time) - unix_timestamp(create_time)
) / count(*) / 60 payment_avg_time
FROM
dwd_fact_order_info
WHERE
dt = '$do_date'
AND payment_time IS NOT NULL
) tmp_time ON tmp_payment.dt = tmp_time.dt;
INSERT INTO TABLE ads_sale_tm_category1_stat_mn SELECT
mn.sku_tm_id,
mn.sku_category1_id,
mn.sku_category1_name,
sum(IF(mn.order_count >= 1, 1, 0)) buycount,
sum(IF(mn.order_count >= 2, 1, 0)) buyTwiceLast,
sum(IF(mn.order_count >= 2, 1, 0)) / sum(IF(mn.order_count >= 1, 1, 0)) buyTwiceLastRatio,
sum(IF(mn.order_count >= 3, 1, 0)) buy3timeLast,
sum(IF(mn.order_count >= 3, 1, 0)) / sum(IF(mn.order_count >= 1, 1, 0)) buy3timeLastRatio,
date_format('$do_date', 'yyyy-MM') stat_mn,
'$do_date' stat_date
FROM
(
SELECT
user_id,
sd.sku_tm_id,
sd.sku_category1_id,
sd.sku_category1_name,
sum(order_count) order_count
FROM
dws_sale_detail_daycount sd
WHERE
date_format(dt, 'yyyy-MM') = date_format('$do_date', 'yyyy-MM')
GROUP BY
user_id,
sd.sku_tm_id,
sd.sku_category1_id,
sd.sku_category1_name
) mn
GROUP BY
mn.sku_tm_id,
mn.sku_category1_id,
mn.sku_category1_name;
"
$hive -e "$sql"