背景:这是尚硅谷SparkSQL练习题,本文用HiveSQL进行了实现。
数据集:用户点击表,商品表,城市表
题目:
① 求每个地区点击量前三的商品;
② 在①的基础上,求出每个地区点击量前三的商品后,求出每个商品中的点击量前三的城市分别占本商品总点击量的百分比。
建表,导入数据
没啥说的,建表语句直接抄过来
use atguigu;
CREATE TABLE `user_visit_action`(
`date` string,
`user_id` bigint,
`session_id` string,
`page_id` bigint,
`action_time` string,
`search_keyword` string,
`click_category_id` bigint,
`click_product_id` bigint,
`order_category_ids` string,
`order_product_ids` string,
`pay_category_ids` string,
`pay_product_ids` string,
`city_id` bigint)
row format delimited fields terminated by '\t';
load data local inpath 'datas/user_visit_action.txt'
into table atguigu.user_visit_action;
CREATE TABLE `product_info`(
`product_id` bigint,
`product_name` string,
`extend_info` string)
row format delimited fields terminated by '\t';
load data local inpath 'datas/product_info.txt' into table atguigu.product_info;
CREATE TABLE `city_info`(
`city_id` bigint,
`city_name` string,
`area` string)
row format delimited fields terminated by '\t';
load data local inpath 'datas/city_info.txt' into table atguigu.city_info;
第一问:求每个地区点击量前三的商品;
分析:按 area,product_name 两个字段分组,求出点击量click_ct2,保留每个地区点击量前三的商品。
select
area,
product_name,
click_ct2
from (
select
area,
product_name,
click_ct2,
row_number() over( partition by area order by click_ct2 desc ) as rn2
from (
select
area,
product_name,
count(*) as click_ct2
from (
select
a.*,
p.product_name,
c.area,
c.city_name
from user_visit_action a
join product_info p on a.click_product_id = p.product_id
join city_info c on a.city_id = c.city_id
where a.click_product_id > -1
) t1
group by area, product_name
) t2
) t3
where rn2 <= 3
第二问: 在①的基础上,求出每个地区点击量前三的商品后,求每个商品中的点击量前三的城市分别占本商品总点击量的百分比。
分析:
第一问求出了每个地区、每个商品的点击量,这个点击量叫click_ct2,并取了每个地区前三名的商品。
在第二问中,可以先求出每个地区、每个商品、每个城市的点击量,这个点击量叫click_ct3,取每个地区、每个商品点击量的前三名城市。
用click_ct3/click_ct2就是每个地区、每个商品、每个城市点击率,这个点击率叫click_rate3。
click_ct3所在的临时表叫tmp1,click_ct2所在的临时表叫tmp2。
tmp1有三个维度,粒度更细,数据条数会更多,tmp2有两个维度,粒度粗,数据条数少。
无论是tmp1 join tmp2,还是tmp2 join tmp1,两种方式都可以,tmp1中不符合条件的数据会被筛掉。
之后,用concat()将每行的城市名和点击率拼接在一起,
再按地区、商品、总点击量进行分组,用collect_set()收集每组拼接的结果,
将收集的结果拼接成字符串,再转换成map。
--维度:area,city_name,product_name
--度量:点击次数
--限定:前三
with tmp1 as(
select
area,
product_name,
city_name,
click_ct3
from(
select
area,
city_name,
product_name,
click_ct3,
row_number()over(partition by area,product_name order by click_ct3) rn1
from(
select
area,
city_name,
product_name,
count(*) click_ct3
from(
select
a.*,
p.product_name,
c.area,
c.city_name
from user_visit_action a
join product_info p on a.click_product_id = p.product_id
join city_info c on a.city_id = c.city_id
where a.click_product_id > -1
)t1
group by area,city_name,product_name
)t2
)t3
where rn1<=3
order by area,product_name,city_name,click_ct3 desc
),
--维度:area,product_name
--度量:点击次数
--限定:前三
tmp2 as(
select
area,
product_name,
click_ct2
from (
select
area,
product_name,
click_ct2,
row_number() over( partition by area order by click_ct2 desc ) as rn2
from (
select
area,
product_name,
count(*) as click_ct2
from (
select
a.*,
p.product_name,
c.area,
c.city_name
from user_visit_action a
join product_info p on a.click_product_id = p.product_id
join city_info c on a.city_id = c.city_id
where a.click_product_id > -1
) t1
group by area, product_name
) t2
) t3
where rn2 <= 3
)
select
area,
product_name,
click_ct2,
-- 按地区、商品、总点击量进行分组,用collect_set()收集每组拼接的结果,
-- 将收集后的结果转换成map
str_to_map(concat_ws(',',collect_set(city_rate)),',',':') city_rate3
from(
select
area,
product_name,
click_ct2,
click_rate,
-- 将每行的城市名和点击率拼接在一起
concat(city_name,':',click_rate,'%') city_rate
from(
select
tmp1.area,
tmp1.product_name,
tmp1.city_name,
tmp2.click_ct2,
round(tmp1.click_ct3*100/tmp2.click_ct2,2) click_rate
from tmp2
join tmp1 on tmp2.area=tmp1.area and tmp2.product_name=tmp1.product_name
)t1
order by area,click_ct2 desc,click_rate desc
)t2
group by area,product_name,click_ct2
order by area,click_ct2 desc