hive-学习搜狗日志分析 - 码农知识堂

hive-学习搜狗日志分析

–搜狗日志分析—
–创建数据仓库
create external table if not exists sg_table(
ts string,
uid string,
keyword string,
ranks int,
orders int,
url string,
years int,
months int,
days int,
hours int)
comment ‘This is the sogou search data of extend’
row format delimited
fields terminated by ‘\t’
stored as textfile
location ‘/data/sg’;

show tables;
SELECT * from default.sg_table limit 3;

–创建动态分区表
create external table if not exists sg_partition_table(
ts string,
uid string,
keyword string,
ranks int,
orders int,
url string
)
comment ‘This is the sogou search data by partition’
partitioned by (years int,months int,days int,hours int)
row format delimited
fields terminated by ‘\t’
stored as textfile;

–空串处理
alter table sg_partition_table set serdeproperties(‘serialization.null.format’=‘’);
desc formatted sg_partition_table ;
–开启动态分区设置开启非严格模式
set hive.exec.dynamic.partition.mode=nonstrict;

–插入数据
insert overwrite table sg_partition_table partition(years,months,days,hours)
select * from sg_table;
–数据预览
SELECT * from sg_partition_table limit 10;
–查看分区
show partitions sg_partition_table ;
–分区查询
SELECT * FROM default.sg_partition_table t1 WHERE t1.years = ‘2011’ AND t1.months = ‘12’;

–数据分析
–记录数分析
–统计记录数 5000000
SELECT
COUNT(*)
from sg_partition_table ;

–不重复记录根据ts uid keyword url 4999636
SELECT count(t1.uid) uniq_data
from (select DISTINCT t.ts, t.uid, t.keyword, t.url
from sg_partition_table t) t1

–独立用户数 1352664
select
count(t1.uid) uniq_user
from (select DISTINCT uid from sg_partition_table) t1;

–独立url数 2466932
select
count(t1.url) uniq_url
from (select DISTINCT url from sg_partition_table) t1;

–关键词分析
–关键词的平均长度 7.5618454 注意去除空格
SELECT avg(length(replace(keyword,’ ‘,’‘))) avg_kw from sg_partition_table;
–replace(str,’ ‘,’‘) 将字符所有空格去除
–select size(split(trim(’ a sddd d ‘),’ ‘)),replace(’ a sddd d ‘,’ ‘,’‘),length(replace(’ a sddd d ‘,’ ‘,’'))

–最长关键词 1005 最短关键词1
SELECT max(length(replace(keyword,’ ‘,’‘))) max_kw
, min(length(replace(keyword,’ ‘,’'))) min_kw
from sg_partition_table;

SELECT * from sg_partition_table where length(replace(keyword,’ ‘,’‘))=1005;
SELECT * from sg_partition_table where length(replace(keyword,’ ‘,’'))=1;

– 长度为1的关键词记录统计排序。。。
SELECT
replace(keyword,’ ‘,’')
, count() kw_ranks
from sg_partition_table
where length(replace(keyword,’ ‘,’‘))=1
GROUP by replace(keyword,’ ‘,’')
order by count() desc;

– 关键词频率排序前十关键词
SELECT
t.keyword
, count() kw_rank
from sg_partition_table t
group by t.keyword
order by count() desc
limit 10;

– 关键词频率排序后十关键词
SELECT
t.keyword
, count() kw_rank
from sg_partition_table t
group by t.keyword
order by count() asc
limit 10;

– 用户分析
–用户搜索次数统计前十
SELECT
t.uid
,count() user_kw_rank
from sg_partition_table t
group by t.uid
order by count() desc
limit 10;

–搜索次数最多的用户都搜索了什么经常搜索什么内容
SELECT
t.keyword
,count() ss_count
from sg_partition_table t
where t.uid=‘02a8557754445a9b1b22a37b40d6db38’
group by t.keyword
order by count() desc
limit 10

–搜索次数人数人数情况
SELECT
sum(if(t1.user_kw_rank=1,1,0)) one_nums --只搜索一次 549148
,sum(if(t1.user_kw_rank=2,1,0)) tow_nums --只搜索两次 257163
,sum(if(t1.user_kw_rank>3,1,0)) over_three --搜索三次以上 396791
from (
SELECT
t.uid
,count(t.keyword) user_kw_rank
from sg_partition_table t
group by t.uid)t1;

–搜索次数人数人数比列情况
SELECT
round(sum(if(t1.user_kw_rank=1,1,0))/count(t1.uid),2)*100 || ‘%’ one_nums_rts --只搜索一次
,round(sum(if(t1.user_kw_rank=2,1,0))/count(t1.uid),2)*100 || ‘%’ tow_nums_rts --只搜索两次
,round(sum(if(t1.user_kw_rank>3,1,0))/count(t1.uid),2)*100 || ‘%’ over_three_rts --搜索三次以上
from (
SELECT
t.uid
,count(t.keyword) user_kw_rank
from sg_partition_table t
group by t.uid)t1;

–SELECT round(0.222,2)*100 || ‘%’;

–用户平均查询次数所有用户搜索次数 / 用户数 3.6964094557111005
SELECT
sum(t1.user_ss_counts) / count(t1.uid) avg_ss_counts
from (
SELECT
t.uid
,count(*) user_ss_counts --每个用户的搜索次数
from sg_partition_table t
group by t.uid)t1 ;

–用户行为分析
–点击次数与rank之间的关系分析
SELECT
count(*)
from sg_partition_table t
where t.ranks <= 10;
–总数为500万，比例为4999869/5000000，可看出，绝大部分会点击前10条搜索结果。

–直接输入URL作为查询词的比例
SELECT
count(*)
from sg_partition_table t
where t.keyword like ‘%www%’;

– 查询直接输入URL搜索并且搜索的URL位于点击的URL中记录统计
SELECT
sum(if(instr(t.url,t.keyword)>0,1,0)) – 27561 可看出大部分搜索URL，并不能得到自己想要的结果。
from sg_partition_table t
where t.keyword like ‘%www%’

–SELECT instr(‘asdfg’,‘df’) --查看指定字符在字符串中是否存在如果存在返回首次出现的位置不存在则返回0
相关阅读:
SLAM从入门到精通（amcl定位使用）
计算机系统基础实验——数据的机器级表示（条件表达式 x?y:z）
http基础教程(超详细)
nginx_0.7.65_00截断_nginx解析漏洞
 [21天学习挑战赛——内核笔记]（八）——Linux 设备管理机制
 Multer 实现文件上传功能
 jar包或exe程序设置为windows服务
 [论文笔记]Root Mean Square Layer Normalization
linux下doc转docx
逻辑回归(Logistic Regression)
原文地址：https://blog.csdn.net/LLMUZI123456789/article/details/128085771