# -- hive内置解析url的函数
parse_url_tuple(url, 'HOST', 'PATH', 'QUERY', 'QUERY:<key>')
# -- 通常用于把后面的表挂接在左边的表之上 返回成为一个新表
a LATERAL VIEW b
LATERAL VIEW
-- Build t_ods_tmp_referurl: every column of ods_weblog_origin plus the host,
-- path, query string and "id" query parameter parsed out of http_referer.
-- Double quotes are stripped from http_referer before it is parsed.
CREATE TABLE t_ods_tmp_referurl AS
SELECT
    a.*,
    b.*
FROM ods_weblog_origin a
LATERAL VIEW parse_url_tuple(
    regexp_replace(http_referer, "\"", ""),
    'HOST', 'PATH', 'QUERY', 'QUERY:id'
) b AS host, path, query, query_id;
-- Page views per hour for partition 20130918. Only the count is returned;
-- the grouping column (hour) is deliberately not part of the output.
-- The trailing semicolon keeps this statement from running into the next one.
SELECT
    COUNT(*) AS pvs
FROM ods_weblog_detail t
WHERE datestr = '20130918'
GROUP BY t.hour;
-- Page views per hour for partition 20130918, returned together with the hour.
-- The trailing semicolon terminates the statement so it can run on its own.
SELECT
    t.hour,
    COUNT(*) AS pvs
FROM ods_weblog_detail t
WHERE datestr = '20130918'
GROUP BY t.hour;
# -- 在有group by的语句中,出现在select后面的字段要么是分组的字段要么是被聚合函数包围的字段。
解决:
-- Page views per (day, hour) for partition 20130918.
-- Both non-aggregated SELECT columns are listed in GROUP BY.
SELECT
    t.day,
    t.hour,
    COUNT(*) AS pvs
FROM ods_weblog_detail t
WHERE datestr = '20130918'
GROUP BY
    t.day,
    t.hour;
1.宽表生成
-- Wide-table step 1: copy ods_weblog_origin and append the referer parts
-- (host, path, query, value of the "id" query parameter) produced by
-- parse_url_tuple. Double quotes are removed from http_referer first.
CREATE TABLE t_ods_tmp_referurl AS
SELECT
    a.*,
    b.*
FROM ods_weblog_origin a
LATERAL VIEW parse_url_tuple(
    regexp_replace(http_referer, "\"", ""),
    'HOST', 'PATH', 'QUERY', 'QUERY:id'
) b AS host, path, query, query_id;
-- Wide-table step 2: copy t_ods_tmp_referurl and append date/time parts cut
-- out of time_local by position (substring positions are 1-based).
-- NOTE(review): the offsets assume time_local looks like 'yyyy-MM-dd HH:mm:ss'
-- (month at 6-7, day at 9-10, time starting at 12) — confirm against the data.
CREATE TABLE t_ods_tmp_detail AS
SELECT
    b.*,
    substring(time_local, 1, 10) AS daystr,  -- start 1 is what Hive uses for start 0
    substring(time_local, 12)    AS tmstr,   -- everything after the separator
    substring(time_local, 6, 2)  AS month,
    substring(time_local, 9, 2)  AS day,
    -- Was substring(time_local, 11, 3), which also captured the separator at
    -- position 11; the two hour digits start at position 12.
    substring(time_local, 12, 2) AS hour
FROM t_ods_tmp_referurl b;
-- Detail (wide) table for web-log records, partitioned by datestr.
CREATE TABLE ods_weblog_detail (
    valid            STRING,  -- validity flag
    remote_addr      STRING,  -- source IP
    remote_user      STRING,  -- user identifier
    time_local       STRING,  -- full access timestamp
    daystr           STRING,  -- access date
    timestr          STRING,  -- access time
    month            STRING,  -- access month
    day              STRING,  -- access day
    hour             STRING,  -- access hour
    request          STRING,  -- requested URL
    status           STRING,  -- response status code
    body_bytes_sent  STRING,  -- number of bytes sent
    http_referer     STRING,  -- referer URL
    ref_host         STRING,  -- referer host
    ref_path         STRING,  -- referer path
    ref_query        STRING,  -- referer query string
    ref_query_id     STRING,  -- value of the referer query parameter
    http_user_agent  STRING   -- client user agent
)
PARTITIONED BY (datestr STRING);
-- Load partition 20130918 of ods_weblog_detail from ods_weblog_origin.
-- Date/time parts are cut out of time_local by 1-based position, and the
-- referer parts come from parse_url_tuple on http_referer with its double
-- quotes removed. Values are matched to the target columns by position.
-- NOTE(review): the offsets assume time_local looks like 'yyyy-MM-dd HH:mm:ss' — confirm.
INSERT INTO TABLE ods_weblog_detail PARTITION (datestr = '20130918')
SELECT
    a.valid,
    a.remote_addr,
    a.remote_user,
    a.time_local,
    substring(a.time_local, 1, 10) AS daystr,   -- start 1 is what Hive uses for start 0
    substring(a.time_local, 12)    AS timestr,
    substring(a.time_local, 6, 2)  AS month,
    substring(a.time_local, 9, 2)  AS day,
    substring(a.time_local, 12, 2) AS hour,
    a.request,
    a.status,
    a.body_bytes_sent,
    a.http_referer,
    r.ref_host,
    r.ref_path,
    r.ref_query,
    r.ref_query_id,
    a.http_user_agent
FROM ods_weblog_origin a
LATERAL VIEW parse_url_tuple(
    regexp_replace(a.http_referer, "\"", ""),
    -- 'QUERY:id' extracts the value of the "id" query parameter. The previous
    -- 'QUERY_ID' is not a part name parse_url_tuple knows, so it yielded NULL.
    'HOST', 'PATH', 'QUERY', 'QUERY:id'
) r AS ref_host, ref_path, ref_query, ref_query_id;
2.DML分析
-- Row count per (month, day, hour) within partition 20130918.
SELECT
    t.month,
    t.day,
    t.hour,
    COUNT(*)
FROM ods_weblog_detail t
WHERE t.datestr = '20130918'
GROUP BY
    t.month,
    t.day,
    t.hour;
-- Row count per (month, day) within partition 20130918.
SELECT
    t.month,
    t.day,
    COUNT(*)
FROM ods_weblog_detail t
WHERE t.datestr = '20130918'
GROUP BY
    t.month,
    t.day;
-- Daily page views obtained by summing the hourly counts:
-- the inner query counts rows per (month, day, hour) for partition 20130918,
-- the outer query rolls those counts up to (month, day).
SELECT
    a.month,
    a.day,
    SUM(a.pvs)
FROM (
    SELECT
        t.month  AS month,
        t.day    AS day,
        t.hour   AS hour,
        COUNT(*) AS pvs
    FROM ods_weblog_detail t
    WHERE t.datestr = '20130918'
    GROUP BY
        t.month,
        t.day,
        t.hour
) a
GROUP BY
    a.month,
    a.day;
-- Row count per (day, hour, referer URL, referer host) in partition 20130918,
-- ignoring rows that have no referer host.
SELECT
    t.day,
    t.hour,
    t.http_referer,
    t.ref_host,
    COUNT(*)
FROM ods_weblog_detail t
WHERE t.datestr = '20130918'
  -- Row-level filter on a grouping column: applied in WHERE (before grouping)
  -- instead of HAVING. The surviving groups are the same.
  AND t.ref_host IS NOT NULL
GROUP BY
    t.day,
    t.hour,
    t.http_referer,
    t.ref_host;
-- Page views per (month, day, hour, referer host) in partition 20130918,
-- ignoring rows without a referer host; sorted by hour, then by page views
-- from highest to lowest.
SELECT
    t.month,
    t.day,
    t.hour,
    t.ref_host,
    COUNT(*) AS pvs
FROM ods_weblog_detail t
WHERE t.datestr = '20130918'
  -- Row-level filter on a grouping column: applied in WHERE (before grouping)
  -- instead of HAVING. The surviving groups are the same.
  AND t.ref_host IS NOT NULL
GROUP BY
    t.month,
    t.day,
    t.hour,
    t.ref_host
ORDER BY
    t.hour ASC,
    pvs DESC;
-- Top 3 referer hosts by page views for every (month, day, hour) of
-- partition 20130918.
--   c: page views per (month, day, hour, referer host), NULL hosts excluded
--   a: c plus rmp, the rank of each host inside its hour (1 = most views)
-- The count is computed one level below the window function so that the
-- window's ORDER BY refers to a real column rather than to an alias defined
-- in the same SELECT list.
SELECT
    a.month,
    a.day,
    a.hour,
    a.host,
    a.pvs,
    a.rmp
FROM (
    SELECT
        c.month,
        c.day,
        c.hour,
        c.host,
        c.pvs,
        ROW_NUMBER() OVER (
            PARTITION BY c.month, c.day, c.hour
            ORDER BY c.pvs DESC
        ) AS rmp
    FROM (
        SELECT
            t.month    AS month,
            t.day      AS day,
            t.hour     AS hour,
            t.ref_host AS host,
            COUNT(*)   AS pvs
        FROM ods_weblog_detail t
        WHERE t.datestr = '20130918'
          AND t.ref_host IS NOT NULL  -- row filter belongs in WHERE, not HAVING
        GROUP BY
            t.month,
            t.day,
            t.hour,
            t.ref_host
    ) c
) a
WHERE a.rmp < 4
-- Sorting is done on the final result; an ORDER BY inside the subquery does
-- not determine the order of the outer query's rows.
ORDER BY
    a.hour ASC,
    a.pvs DESC;
-- Average page views per distinct remote_addr in partition 20130918:
-- total rows divided by the number of distinct addresses.
SELECT
    COUNT(*) / COUNT(DISTINCT remote_addr)
FROM ods_weblog_detail
WHERE datestr = '20130918';
-- Average page views per visitor, computed in two steps:
-- the inner query counts rows per remote_addr for partition 20130918,
-- the outer query divides the total of those counts by the number of addresses.
SELECT
    SUM(a.pvs) / COUNT(a.ip)
FROM (
    SELECT
        t.remote_addr AS ip,
        COUNT(*)      AS pvs
    FROM ods_weblog_detail t
    WHERE t.datestr = '20130918'
    GROUP BY t.remote_addr
) a;
-- The 10 most frequently requested URLs in partition 20130918.
SELECT
    t.request,
    COUNT(*) AS counts
FROM ods_weblog_detail t
WHERE datestr = '20130918'
GROUP BY t.request
ORDER BY counts DESC
LIMIT 10;
-- New visitors: addresses seen in partition 20130918 that have no matching
-- row in history (anti-join via LEFT JOIN ... IS NULL).
-- NOTE(review): history is not defined in this file; it is presumably the
-- table of previously seen visitor IPs — confirm.
SELECT
    today.ip
FROM (
    SELECT DISTINCT
        t.remote_addr AS ip
    FROM ods_weblog_detail t
    -- Restrict "today" to the day's partition, as every other query on this
    -- table does; without it the subquery scans all partitions.
    WHERE t.datestr = '20130918'
) today
LEFT JOIN history
    ON today.ip = history.ip
WHERE history.ip IS NULL;
-- Addresses with more than one counted session in partition 20130918 of
-- ods_click_stream_visit, together with that session count.
SELECT
    remote_addr,
    COUNT(session) AS cs
FROM ods_click_stream_visit
WHERE datestr = '20130918'
GROUP BY remote_addr
HAVING cs > 1;
-- Average number of sessions per distinct remote_addr in partition 20130918.
SELECT
    COUNT(session) / COUNT(DISTINCT remote_addr)
FROM ods_click_stream_visit
WHERE datestr = '20130918';
-- Every ordered pair of rows from dw_oute_numbs (self cartesian product):
-- rn supplies the first step/count of the pair, rr the second.
SELECT
    rn.step  AS rnstep,
    rn.numbs AS rnnumbs,
    rr.step  AS rrstep,
    rr.numbs AS rrnumbs
FROM dw_oute_numbs rn
CROSS JOIN dw_oute_numbs rr;
-- Absolute conversion: each step's count divided by the count of 'step1'.
-- The subquery pairs every row of dw_oute_numbs with every other row; the
-- outer filter keeps only the pairs whose first element is 'step1'.
SELECT
    a.rrstep,
    a.rrnumbs / a.rnnumbs
FROM (
    SELECT
        rn.step  AS rnstep,
        rn.numbs AS rnnumbs,
        rr.step  AS rrstep,
        rr.numbs AS rrnumbs
    FROM dw_oute_numbs rn
    CROSS JOIN dw_oute_numbs rr
) a
WHERE a.rnstep = 'step1';
-- Relative conversion: each step's count divided by the count of the step
-- immediately before it (step N is paired with step N-1).
-- NOTE(review): assumes step values have the form 'step<N>' (as in 'step1'),
-- so the number starts at character 5 — confirm against dw_oute_numbs.
SELECT
    tmp.rrstep                AS step,
    tmp.rrnumbs / tmp.rnnumbs AS leakage_rate
FROM (
    SELECT
        rn.step  AS rnstep,
        rn.numbs AS rnnumbs,
        rr.step  AS rrstep,
        rr.numbs AS rrnumbs
    FROM dw_oute_numbs rn
    INNER JOIN dw_oute_numbs rr
        -- Pair each step with its successor directly in the join condition
        -- instead of building the full cartesian product and filtering it.
        -- substr(step, 5) reads the whole step number; the former
        -- substr(step, 5, 1) read a single digit and would treat 'step10'
        -- as step 1.
        ON CAST(substr(rn.step, 5) AS INT) = CAST(substr(rr.step, 5) AS INT) - 1
) tmp;
大数据视频推荐:
CSDN
人工智能算法竞赛实战
AIops智能运维机器学习算法实战
ELK7 stack开发运维实战
PySpark机器学习从入门到精通
AIOps智能运维实战
腾讯课堂
大数据语音推荐:
ELK7 stack开发运维
企业级大数据技术应用
大数据机器学习案例之推荐系统
自然语言处理
大数据基础
人工智能:深度学习入门到精通