我们先来看看这两个函数的语法:
LEAD(col,n,default) OVER()
LAG(col,n,default) OVER()
页面停留时长:用户进入A页面,又进入B页面。进入A页面的时间为date1,进入B页面的时间为date2,那么在A页面的停留时间为date2-date1。实际生产场景中,可能会考虑不超过30min等等因素。
CREATE TABLE log_info
(
uid string,
dateline string,
page string
);
-- 模拟数据
INSERT INTO log_info VALUES ('1001', '2021-08-10 10:18:31', 'url3'),
('1001', '2021-08-10 10:01:24', 'url1'),
('1001', '2021-08-10 10:05:22', 'url2'),
('1002', '2021-08-10 10:10:31', 'url1'),
('1002', '2021-08-10 10:15:31', 'url2'),
('1002', '2021-08-10 10:18:31', 'url3');
SELECT *,
ROW_NUMBER() OVER(PARTITION BY uid ORDER BY dateline) rank
FROM log_info
SELECT t1.uid,
t1.dateline start_time,
t2.dateline end_time,
t1.page
FROM
(SELECT *,
ROW_NUMBER() OVER(PARTITION BY uid ORDER BY dateline) rank
FROM log_info ) t1
LEFT JOIN
(SELECT *,
ROW_NUMBER() OVER(PARTITION BY uid ORDER BY dateline) rank
FROM log_info ) t2
ON t1.uid = t2.uid AND t1.rank+1=t2.rank;
用户进入一个页面和跳到另一个页面的时间,通过这两个时间的差值就是我们要计算的用户页面停留时长
SELECT uid,
dateline,
lead(dateline,1) over(distribute by uid sort by dateline asc) as dateline_end,
page
FROM log_info;