• hive-学习搜狗日志分析


    –搜狗日志分析—
    –创建数据仓库
    create external table if not exists sg_table(
    ts string,
    uid string,
    keyword string,
    ranks int,
    orders int,
    url string,
    years int,
    months int,
    days int,
    hours int)
    comment ‘This is the sogou search data of extend’
    row format delimited
    fields terminated by ‘\t’
    stored as textfile
    location ‘/data/sg’;

    show tables;
    SELECT * from default.sg_table limit 3;

    –创建动态分区表
    create external table if not exists sg_partition_table(
    ts string,
    uid string,
    keyword string,
    ranks int,
    orders int,
    url string
    )
    comment ‘This is the sogou search data by partition’
    partitioned by (years int,months int,days int,hours int)
    row format delimited
    fields terminated by ‘\t’
    stored as textfile;

    –空串处理
    alter table sg_partition_table set serdeproperties(‘serialization.null.format’=‘’);
    desc formatted sg_partition_table ;
    –开启动态分区设置 开启非严格模式
    set hive.exec.dynamic.partition.mode=nonstrict;

    –插入数据
    insert overwrite table sg_partition_table partition(years,months,days,hours)
    select * from sg_table;
    –数据预览
    SELECT * from sg_partition_table limit 10;
    –查看分区
    show partitions sg_partition_table ;
    –分区查询
    SELECT * FROM default.sg_partition_table t1 WHERE t1.years = ‘2011’ AND t1.months = ‘12’;

    –数据分析
    –记录数分析
    –统计记录数 5000000
    SELECT
    COUNT(*)
    from sg_partition_table ;

    –不重复记录 根据ts uid keyword url 4999636
    SELECT count(t1.uid) uniq_data
    from (select DISTINCT t.ts, t.uid, t.keyword, t.url
    from sg_partition_table t) t1

    –独立用户数 1352664
    select
    count(t1.uid) uniq_user
    from (select DISTINCT uid from sg_partition_table) t1;

    –独立url数 2466932
    select
    count(t1.url) uniq_url
    from (select DISTINCT url from sg_partition_table) t1;

    –关键词分析
    –关键词的平均长度 7.5618454 注意去除空格
    SELECT avg(length(replace(keyword,’ ‘,’‘))) avg_kw from sg_partition_table;
    –replace(str,’ ‘,’‘) 将字符所有空格去除
    –select size(split(trim(’ a sddd d ‘),’ ‘)),replace(’ a sddd d ‘,’ ‘,’‘),length(replace(’ a sddd d ‘,’ ‘,’'))

    –最长关键词 1005 最短关键词1
    SELECT max(length(replace(keyword,’ ‘,’‘))) max_kw
    , min(length(replace(keyword,’ ‘,’'))) min_kw
    from sg_partition_table;

    SELECT * from sg_partition_table where length(replace(keyword,’ ‘,’‘))=1005;
    SELECT * from sg_partition_table where length(replace(keyword,’ ‘,’'))=1;

    – 长度为1的关键词记录统计排序 。。。
    SELECT
    replace(keyword,’ ‘,’')
    , count() kw_ranks
    from sg_partition_table
    where length(replace(keyword,’ ‘,’‘))=1
    GROUP by replace(keyword,’ ‘,’')
    order by count(
    ) desc;

    – 关键词频率排序 前十关键词
    SELECT
    t.keyword
    , count() kw_rank
    from sg_partition_table t
    group by t.keyword
    order by count(
    ) desc
    limit 10;

    – 关键词频率排序 后十关键词
    SELECT
    t.keyword
    , count() kw_rank
    from sg_partition_table t
    group by t.keyword
    order by count(
    ) asc
    limit 10;

    – 用户分析
    –用户搜索次数统计 前十
    SELECT
    t.uid
    ,count() user_kw_rank
    from sg_partition_table t
    group by t.uid
    order by count(
    ) desc
    limit 10;

    –搜索次数最多的用户都搜索了什么 经常搜索什么内容
    SELECT
    t.keyword
    ,count() ss_count
    from sg_partition_table t
    where t.uid=‘02a8557754445a9b1b22a37b40d6db38’
    group by t.keyword
    order by count(
    ) desc
    limit 10

    –搜索次数人数人数情况
    SELECT
    sum(if(t1.user_kw_rank=1,1,0)) one_nums --只搜索一次 549148
    ,sum(if(t1.user_kw_rank=2,1,0)) tow_nums --只搜索两次 257163
    ,sum(if(t1.user_kw_rank>3,1,0)) over_three --搜索三次以上 396791
    from (
    SELECT
    t.uid
    ,count(t.keyword) user_kw_rank
    from sg_partition_table t
    group by t.uid)t1;

    –搜索次数人数人数比列情况
    SELECT
    round(sum(if(t1.user_kw_rank=1,1,0))/count(t1.uid),2)*100 || ‘%’ one_nums_rts --只搜索一次
    ,round(sum(if(t1.user_kw_rank=2,1,0))/count(t1.uid),2)*100 || ‘%’ tow_nums_rts --只搜索两次
    ,round(sum(if(t1.user_kw_rank>3,1,0))/count(t1.uid),2)*100 || ‘%’ over_three_rts --搜索三次以上
    from (
    SELECT
    t.uid
    ,count(t.keyword) user_kw_rank
    from sg_partition_table t
    group by t.uid)t1;

    –SELECT round(0.222,2)*100 || ‘%’;

    –用户平均查询次数 所有用户搜索次数 / 用户数 3.6964094557111005
    SELECT
    sum(t1.user_ss_counts) / count(t1.uid) avg_ss_counts
    from (
    SELECT
    t.uid
    ,count(*) user_ss_counts --每个用户的搜索次数
    from sg_partition_table t
    group by t.uid)t1 ;

    –用户行为分析
    –点击次数与rank之间的关系分析
    SELECT
    count(*)
    from sg_partition_table t
    where t.ranks <= 10;
    –总数为500万,比例为4999869/5000000,可看出,绝大部分会点击前10条搜索结果。

    –直接输入URL作为查询词的比例
    SELECT
    count(*)
    from sg_partition_table t
    where t.keyword like ‘%www%’;

    – 查询直接输入URL搜索并且搜索的URL位于点击的URL中记录统计
    SELECT
    sum(if(instr(t.url,t.keyword)>0,1,0)) – 27561 可看出大部分搜索URL,并不能得到自己想要的结果。
    from sg_partition_table t
    where t.keyword like ‘%www%’

    –SELECT instr(‘asdfg’,‘df’) --查看指定字符在字符串中是否存在 如果存在返回首次出现的位置 不存在则返回0

  • 相关阅读:
    java解析Cron时间表达式为中文描述
    2022最新版-李宏毅机器学习深度学习课程-P25 Spacial Transformer Layer
    复习Day05:链表part01:203.移除链表元素、707.设计链表、206.反转链表、234. 回文链表
    Linux网络编程学习笔记(TCP)
    【Head First 设计模式】-- 策略模式
    Tableau1——条形图和直方图
    【2023年11月第四版教材】第15章《风险管理》(第三部分)
    厂商征集 | 2023年中国RPA市场洞察研究报告正式启动
    spring入门aop和ioc
    php组装数据批量插入,比单条循环插入数据快很多
  • 原文地址:https://blog.csdn.net/LLMUZI123456789/article/details/128085771