• hive-学习搜狗日志分析


    –搜狗日志分析—
    –创建数据仓库
    create external table if not exists sg_table(
    ts string,
    uid string,
    keyword string,
    ranks int,
    orders int,
    url string,
    years int,
    months int,
    days int,
    hours int)
    comment ‘This is the sogou search data of extend’
    row format delimited
    fields terminated by ‘\t’
    stored as textfile
    location ‘/data/sg’;

    show tables;
    SELECT * from default.sg_table limit 3;

    –创建动态分区表
    create external table if not exists sg_partition_table(
    ts string,
    uid string,
    keyword string,
    ranks int,
    orders int,
    url string
    )
    comment ‘This is the sogou search data by partition’
    partitioned by (years int,months int,days int,hours int)
    row format delimited
    fields terminated by ‘\t’
    stored as textfile;

    –空串处理
    alter table sg_partition_table set serdeproperties(‘serialization.null.format’=‘’);
    desc formatted sg_partition_table ;
    –开启动态分区设置 开启非严格模式
    set hive.exec.dynamic.partition.mode=nonstrict;

    –插入数据
    insert overwrite table sg_partition_table partition(years,months,days,hours)
    select * from sg_table;
    –数据预览
    SELECT * from sg_partition_table limit 10;
    –查看分区
    show partitions sg_partition_table ;
    –分区查询
    SELECT * FROM default.sg_partition_table t1 WHERE t1.years = ‘2011’ AND t1.months = ‘12’;

    –数据分析
    –记录数分析
    –统计记录数 5000000
    SELECT
    COUNT(*)
    from sg_partition_table ;

    –不重复记录 根据ts uid keyword url 4999636
    SELECT count(t1.uid) uniq_data
    from (select DISTINCT t.ts, t.uid, t.keyword, t.url
    from sg_partition_table t) t1

    –独立用户数 1352664
    select
    count(t1.uid) uniq_user
    from (select DISTINCT uid from sg_partition_table) t1;

    –独立url数 2466932
    select
    count(t1.url) uniq_url
    from (select DISTINCT url from sg_partition_table) t1;

    –关键词分析
    –关键词的平均长度 7.5618454 注意去除空格
    SELECT avg(length(replace(keyword,’ ‘,’‘))) avg_kw from sg_partition_table;
    –replace(str,’ ‘,’‘) 将字符所有空格去除
    –select size(split(trim(’ a sddd d ‘),’ ‘)),replace(’ a sddd d ‘,’ ‘,’‘),length(replace(’ a sddd d ‘,’ ‘,’'))

    –最长关键词 1005 最短关键词1
    SELECT max(length(replace(keyword,’ ‘,’‘))) max_kw
    , min(length(replace(keyword,’ ‘,’'))) min_kw
    from sg_partition_table;

    SELECT * from sg_partition_table where length(replace(keyword,’ ‘,’‘))=1005;
    SELECT * from sg_partition_table where length(replace(keyword,’ ‘,’'))=1;

    – 长度为1的关键词记录统计排序 。。。
    SELECT
    replace(keyword,’ ‘,’')
    , count() kw_ranks
    from sg_partition_table
    where length(replace(keyword,’ ‘,’‘))=1
    GROUP by replace(keyword,’ ‘,’')
    order by count(
    ) desc;

    – 关键词频率排序 前十关键词
    SELECT
    t.keyword
    , count() kw_rank
    from sg_partition_table t
    group by t.keyword
    order by count(
    ) desc
    limit 10;

    – 关键词频率排序 后十关键词
    SELECT
    t.keyword
    , count() kw_rank
    from sg_partition_table t
    group by t.keyword
    order by count(
    ) asc
    limit 10;

    – 用户分析
    –用户搜索次数统计 前十
    SELECT
    t.uid
    ,count() user_kw_rank
    from sg_partition_table t
    group by t.uid
    order by count(
    ) desc
    limit 10;

    –搜索次数最多的用户都搜索了什么 经常搜索什么内容
    SELECT
    t.keyword
    ,count() ss_count
    from sg_partition_table t
    where t.uid=‘02a8557754445a9b1b22a37b40d6db38’
    group by t.keyword
    order by count(
    ) desc
    limit 10

    –搜索次数人数人数情况
    SELECT
    sum(if(t1.user_kw_rank=1,1,0)) one_nums --只搜索一次 549148
    ,sum(if(t1.user_kw_rank=2,1,0)) tow_nums --只搜索两次 257163
    ,sum(if(t1.user_kw_rank>3,1,0)) over_three --搜索三次以上 396791
    from (
    SELECT
    t.uid
    ,count(t.keyword) user_kw_rank
    from sg_partition_table t
    group by t.uid)t1;

    –搜索次数人数人数比列情况
    SELECT
    round(sum(if(t1.user_kw_rank=1,1,0))/count(t1.uid),2)*100 || ‘%’ one_nums_rts --只搜索一次
    ,round(sum(if(t1.user_kw_rank=2,1,0))/count(t1.uid),2)*100 || ‘%’ tow_nums_rts --只搜索两次
    ,round(sum(if(t1.user_kw_rank>3,1,0))/count(t1.uid),2)*100 || ‘%’ over_three_rts --搜索三次以上
    from (
    SELECT
    t.uid
    ,count(t.keyword) user_kw_rank
    from sg_partition_table t
    group by t.uid)t1;

    –SELECT round(0.222,2)*100 || ‘%’;

    –用户平均查询次数 所有用户搜索次数 / 用户数 3.6964094557111005
    SELECT
    sum(t1.user_ss_counts) / count(t1.uid) avg_ss_counts
    from (
    SELECT
    t.uid
    ,count(*) user_ss_counts --每个用户的搜索次数
    from sg_partition_table t
    group by t.uid)t1 ;

    –用户行为分析
    –点击次数与rank之间的关系分析
    SELECT
    count(*)
    from sg_partition_table t
    where t.ranks <= 10;
    –总数为500万,比例为4999869/5000000,可看出,绝大部分会点击前10条搜索结果。

    –直接输入URL作为查询词的比例
    SELECT
    count(*)
    from sg_partition_table t
    where t.keyword like ‘%www%’;

    – 查询直接输入URL搜索并且搜索的URL位于点击的URL中记录统计
    SELECT
    sum(if(instr(t.url,t.keyword)>0,1,0)) – 27561 可看出大部分搜索URL,并不能得到自己想要的结果。
    from sg_partition_table t
    where t.keyword like ‘%www%’

    –SELECT instr(‘asdfg’,‘df’) --查看指定字符在字符串中是否存在 如果存在返回首次出现的位置 不存在则返回0

  • 相关阅读:
    SLAM从入门到精通(amcl定位使用)
    计算机系统基础实验——数据的机器级表示(条件表达式 x?y:z)
    http基础教程(超详细)
    nginx_0.7.65_00截断_nginx解析漏洞
    [21天学习挑战赛——内核笔记](八)——Linux 设备管理机制
    Multer 实现文件上传功能
    jar包或exe程序设置为windows服务
    [论文笔记]Root Mean Square Layer Normalization
    linux下doc转docx
    逻辑回归(Logistic Regression)
  • 原文地址:https://blog.csdn.net/LLMUZI123456789/article/details/128085771