• hive-学习微博日志分析


    -- Weibo log analysis
    SHOW DATABASES;

    -- Create the warehouse database first, then switch to it.
    -- (Switching before creation fails when the database does not exist yet.)
    CREATE DATABASE IF NOT EXISTS weibo_db;
    USE weibo_db;

    -- External table over the raw log files under /data/wb.
    -- Each row exposes the raw text in a single STRING column
    -- (presumably one JSON record per line -- confirm against the data).
    CREATE EXTERNAL TABLE IF NOT EXISTS wb_table (
        json STRING
    )
    LOCATION '/data/wb';

    -- Preview a few raw rows.
    SELECT json
    FROM wb_table
    LIMIT 10;

    -- Data analysis
    -- Total number of weibo posts (result recorded by the author: 1451868).
    SELECT COUNT(*) AS total_posts
    FROM wb_table;

    -- Number of distinct users (result recorded by the author: 78540).
    --
    -- SUBSTRING(json, 2, LENGTH(json) - 2) drops the first and last character
    -- of each raw row before the JSON path lookup (presumably wrapping
    -- brackets around the record -- confirm against the data).
    --
    -- Equivalent single-step form, kept for reference:
    --   SELECT COUNT(DISTINCT GET_JSON_OBJECT(js.body, '$.userId'))
    --   FROM (SELECT SUBSTRING(t.json, 2, LENGTH(t.json) - 2) AS body FROM wb_table t) js;
    --
    -- The form below de-duplicates in a subquery first and then counts the
    -- surviving non-NULL ids.
    SELECT
        COUNT(u.user_id) AS distinct_users
    FROM (
        SELECT DISTINCT
            GET_JSON_OBJECT(js.body, '$.userId') AS user_id
        FROM (
            SELECT SUBSTRING(t.json, 2, LENGTH(t.json) - 2) AS body
            FROM wb_table t
        ) js
    ) u;

    -- Repost dimension
    -- Total number of reposts across all of a user's posts; top 10 users.
    --
    -- GET_JSON_OBJECT returns STRING, so reportCount is cast to BIGINT to make
    -- the SUM an exact integer total instead of relying on implicit coercion.
    SELECT
        p.user_id,
        SUM(p.report_count) AS total_reports
    FROM (
        SELECT
            GET_JSON_OBJECT(js.body, '$.userId') AS user_id,
            CAST(GET_JSON_OBJECT(js.body, '$.reportCount') AS BIGINT) AS report_count
        FROM (
            -- Drop the first and last character of the raw row before parsing.
            SELECT SUBSTRING(t.json, 2, LENGTH(t.json) - 2) AS body
            FROM wb_table t
        ) js
    ) p
    GROUP BY p.user_id
    ORDER BY total_reports DESC
    LIMIT 10;

    -- The 10 most-reposted posts, reported with the posting user's id.
    --
    -- GET_JSON_OBJECT returns STRING; ordering on the raw string would be
    -- lexicographic ('9' > '10000'), so the count is cast to BIGINT first.
    SELECT
        p.user_id,
        p.report_count
    FROM (
        SELECT
            GET_JSON_OBJECT(js.body, '$.userId') AS user_id,
            CAST(GET_JSON_OBJECT(js.body, '$.reportCount') AS BIGINT) AS report_count
        FROM (
            -- Drop the first and last character of the raw row before parsing.
            SELECT SUBSTRING(t.json, 2, LENGTH(t.json) - 2) AS body
            FROM wb_table t
        ) js
    ) p
    ORDER BY p.report_count DESC
    LIMIT 10;

    -- The 10 most-liked posts, reported with the posting user's id.
    --
    -- GET_JSON_OBJECT returns STRING; ordering on the raw string would be
    -- lexicographic ('9' > '10000'), so the count is cast to BIGINT first.
    SELECT
        p.user_id,
        p.praise_count
    FROM (
        SELECT
            GET_JSON_OBJECT(js.body, '$.userId') AS user_id,
            CAST(GET_JSON_OBJECT(js.body, '$.praiseCount') AS BIGINT) AS praise_count
        FROM (
            -- Drop the first and last character of the raw row before parsing.
            SELECT SUBSTRING(t.json, 2, LENGTH(t.json) - 2) AS body
            FROM wb_table t
        ) js
    ) p
    ORDER BY p.praise_count DESC
    LIMIT 10;

    -- Number of posts published by each user; 10 most active users.
    -- The user id is selected alongside the count so each row can be
    -- attributed to a user.
    SELECT
        p.user_id,
        COUNT(p.user_id) AS post_count
    FROM (
        SELECT
            -- Drop the first and last character of the raw row before parsing.
            GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS user_id
        FROM wb_table t
    ) p
    GROUP BY p.user_id
    ORDER BY post_count DESC
    LIMIT 10;

    -- Number of posts that carry pictures: the pic_list value must contain
    -- the substring 'http'.
    SELECT
        COUNT(*) AS posts_with_pictures
    FROM wb_table t
    -- Drop the first and last character of the raw row before parsing.
    WHERE GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.pic_list') LIKE '%http%';

    -- Number of distinct users who posted from an iPhone.
    --
    -- COUNT(DISTINCT user_id) counts users; COUNT(*) would count posts.
    -- Hive LIKE is case-sensitive, so the source is lower-cased before matching
    -- (NOTE(review): source values are presumably spelled like "iPhone..." --
    -- confirm against the data).
    SELECT
        COUNT(DISTINCT p.user_id) AS iphone_users
    FROM (
        SELECT
            GET_JSON_OBJECT(js.body, '$.userId') AS user_id,
            GET_JSON_OBJECT(js.body, '$.source') AS source
        FROM (
            -- Drop the first and last character of the raw row before parsing.
            SELECT SUBSTRING(t.json, 2, LENGTH(t.json) - 2) AS body
            FROM wb_table t
        ) js
    ) p
    WHERE LOWER(p.source) LIKE '%iphone%';

    -- User id and data source of posts with fewer than 1000 comments.
    --
    -- The source column is selected because the heading asks for it; the
    -- comment count is kept in the output as well. commentCount is cast to
    -- BIGINT so the comparison is explicitly numeric.
    SELECT
        p.user_id,
        p.source,
        p.comment_count
    FROM (
        SELECT
            GET_JSON_OBJECT(js.body, '$.userId') AS user_id,
            GET_JSON_OBJECT(js.body, '$.source') AS source,
            CAST(GET_JSON_OBJECT(js.body, '$.commentCount') AS BIGINT) AS comment_count
        FROM (
            -- Drop the first and last character of the raw row before parsing.
            SELECT SUBSTRING(t.json, 2, LENGTH(t.json) - 2) AS body
            FROM wb_table t
        ) js
    ) p
    WHERE p.comment_count < 1000;

    -- Data ETL
    -- Goal: export query results to MySQL.

    -- Enable map-side partial aggregation for GROUP BY queries.
    SET hive.map.aggr = true;
    -- Disabled tuning knob, kept for reference (property name spelled correctly):
    -- SET hive.groupby.mapaggr.checkinterval = 100000;
    -- Ask Hive to balance skewed GROUP BY keys across reducers.
    SET hive.groupby.skewindata = true;

    -- Staging table: store the query result here first, then export from the
    -- table's storage path. Per the author's note the data lands under
    -- /usr/local/hive_dw/weibo_db.db/wb_user_nums by default (this depends on
    -- the configured warehouse directory).
    -- Comma-delimited text so the files can be consumed by an export tool.
    CREATE TABLE IF NOT EXISTS wb_user_nums (
        uid  STRING,
        nums INT
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

    -- Fill the staging table with one row per user: (user id, number of posts).
    -- COUNT(*) yields BIGINT while the nums column is INT, so the narrowing is
    -- made explicit.
    INSERT OVERWRITE TABLE wb_user_nums
    SELECT
        p.user_id,
        CAST(COUNT(*) AS INT) AS nums
    FROM (
        SELECT
            -- Drop the first and last character of the raw row before parsing.
            GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS user_id
        FROM wb_table t
    ) p
    GROUP BY p.user_id;

  • 相关阅读:
    CCC联盟——UWB MAC(一)
    01 OSI七层网络排查 troubleshooting 思路及对应工具
    「MySQL高级篇」MySQL锁机制 && 事务 -- 临键锁与幻读
    总结四:数据库(MySQL)面经
    基于JAVA动漫网站和特效处理系统(Springboot框架+AI人工智能) 开题报告
    python进阶(26)collections标准库
    Java把Base64编码格式的图片下载到本地指定文件夹下
    品尝葡萄酒要注意的重点事项有哪些?
    LeetCode热题100——二分查找
    基于Java毕业设计虚拟银行业务培训游戏源码+系统+mysql+lw文档+部署软件
  • 原文地址:https://blog.csdn.net/LLMUZI123456789/article/details/128085796