• hive 常用函数


    1.分位数

      percentile_approx(DOUBLE col, p [, B])     Returns an approximate pth percentile of a numeric column (including floating point types) in the group

      含义: 在col列中返回p%的分位数

      select 
      percentile_approx( arr_id , 0.5 )
      from 
      (
        select
          arr_id
        from
          (
            select
              array(1, 2, 3, 4, 5, 6, 7, 8, 9, 1000) as arr
          ) a lateral view explode(arr) tt as arr_id
      ) a 

    2.url解析

    解码url: URL转码,encodeURI,encodeURIComponent — 在线工具
    解析json: 在线JSON校验格式化工具(Be JSON)

    上报:
    :path: /log/web?0000171612422098683https%3A%2F%2Fwww.bilibili.com%2Fbangumi%2Flist%2Fsl17662%3Ffrom_spmid%3Dpgc.cinema-tab.0.0%26intentFrom%3D15%26native.theme%3D1%26navhide%3D1%26share_medium%3Dandroid%26share_plat%3Dandroid%26share_source%3DCOPY%26share_tag%3Ds_i%26timestamp%3D1612343552%26unique_k%3DHKz3gy|666.49.selfDef.click_unfollow||1612422098000|0|0|980x1743|2|{%22event%22:%22click_unfollow%22,%22value%22:{%22sl_id%22:%2217662%22,%22season_id%22:35582,%22season_type%22:3,%22mid%22:36865977},%22bsource%22:%22share_source_copy_link%22}|{}|https%3A%2F%2Fm.bilibili.com%2F|FF7C614F-2539-A3E3-1AE8-E4265F23111B03042infoc|zh-CN|null
    
    
    msg:
    {%22event%22:%22click_season%22,%22value%22:{%22sl_id%22:%2218665%22,%22season_id%22:32429,%22season_type%22:2,%22mid%22:179753067},%22bsource%22:%22%22}
    
    解码url:  https://www.sojson.com/encodeurl.html
    解析json: https://www.bejson.com/
    解析msg 字段:
    select
    *
    ,get_json_object(msg, '$.event') as event1
    ,get_json_object(msg, '$.value') as value1
    ,get_json_object(get_json_object(msg, '$.value'), '$.sl_id')  as sub_value_sl
    ,get_json_object(get_json_object(msg, '$.value'), '$.season_id')  as sub_value_season
    ,get_json_object(get_json_object(msg, '$.value'), '$.season_type')  as sub_value_season_type
    ,get_json_object(get_json_object(msg, '$.value'), '$.mid')  as sub_value_mid
    from
    (
    select '{"event":"click_season","value":{"sl_id":"18665","season_id":32429,"season_type":2,"mid":179753067},"bsource":""}'  as msg  union all
    select '{"event":"unload","value":{"enter":1612292714266,"leave":1612292745642},"bsource":""}'  as msg  union all
    select '{"event":"show_hover","value":{"sl_id":"20233","season_id":33987,"season_type":2,"mid":172278354},"bsource":""}'  as msg  union all
    select '{"event":"click_season","value":{"sl_id":"17834","season_id":34565,"season_type":2,"mid":271658710},"bsource":"search_baidu"}'  as msg  union all
    select '{"event":"show_hover","value":{"sl_id":"17834","season_id":28274,"season_type":2,"mid":281264988},"bsource":""}'  as msg  
    )  a
      
    limit 2000
    
    
    select
    *,
    reflect("java.net.URLDecoder", "decode", msg, "UTF-8")  as url
    ,get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.event') as event_type
    ,get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value') as event_value
    ,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.sl_id')  as sub_value_sl
    ,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.season_id')  as sub_value_season
    ,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.season_type')  as sub_value_season_type
    ,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.mid')  as sub_value_mid
    from
      b_dwd.dwd_flow_web_click_l_d
    WHERE
      log_date between '<%= log_date  %>'  and '<%= log_date %>' 
      and   substr(spm_id,1,6)= '666.49'
      
      limit 2000

    3.正则 

     滑动验证页面

    select
    *,
    a.from_spmid rlike concat('^','dynamic') ,
    a.from_spmid rlike concat('^','dt+') ,
    a.from_spmid rlike '^dt'
    from
    (
    select 'ott-platform.ott-dynamic.0.0' as from_spmid union ALL
    select 'out_open_deeplink_gdtlh-and-yf-31yaz-17776718' as from_spmid union ALL
    select 'out_open_deeplink_gdtlh-and-yf-31yaz-18253662' as from_spmid union ALL
    select 'ott-platform.ott-dynamic.0.0' as from_spmid union ALL
    select 'dynamic.video-tab.0.0' as from_spmid union ALL
    select 'default-value' as from_spmid
    ) a

    3.hsql 间隔日期补数

    方法1:窗口函数

    with  tmp_table as (
      select dt ,city ,  val 
      from 
      (
        select  '20220101' as dt ,'上海' as city ,  100  val union all 
        select  '20220102' as dt ,'上海' as city ,  null val union all 
        select  '20220104' as dt ,'上海' as city ,  null val union all 
        select  '20220106' as dt ,'上海' as city ,  200  val 
      ) a 
    ) 
    
    ,tmp_a as (
    select dt ,city ,  val ,
    case when val  is null then 0 else 1 end val1  -- 将空值设置为0,非空值设置为1
    from tmp_table
    ),
    tmp_b as (
    select dt ,city ,  val ,val1,
    sum(val1) over(partition by '1' order by dt) as cnt_val1 -- 区分 空和费控分空分组 
    from tmp_a
    )
    select dt ,city ,  val ,val1,cnt_val1,
    max(val) over(partition by cnt_val1 order by dt) as fill_val  -- 使用开窗函数按照分组求最大值
    from tmp_b
    
    
    


    方法2:udf 

    4. 用户大会员在期的天数
    with tmp_dt as (
      select id,sdt,edt
      from 
      (
      select 1 id, '2022-02-02' as sdt, '2022-02-07' as edt union all 
      select 1 id, '2022-02-07' as sdt, '2022-02-18' as edt union all
      select 1 id, '2022-02-10' as sdt, '2022-02-23' as edt 
      ) a 
    )
    select 
    id,base_line,max(dt) dt 
    ,datediff(max(dt),base_line)+1 as days
    from 
    (
    
    select id ,dt,max(base_f) over (PARTITION BY id ORDER BY dt) base_line
    from 
    (
      select id ,dt,flag,sum(flag) over(partition by id order by concat(dt,flag) desc) as sm 
      ,case when sum(flag) over(partition by id order by concat(dt,flag) desc)=0 then dt else 0 end as base_f
      from 
      (
      select id,sdt as dt,1 as flag
      from tmp_dt
      union all 
      select id,edt,-1 as flag
      from tmp_dt
      ) a 
    ) b
    ) c
    group by id ,base_line

    5.取每个mid同时播放的设备数分布。

    已经整理好buvid、mid、开始播放时间、结束播放时间。主要背景是想看下现在“共享账号”的用户量级有多少

    https://berserker.bilibili.co/?URL=http://berserker.bilibili.co/#/adhoc?sqlId=100670985

    with 
      ogv_play as (
        select mid,buvid,season_id,epid,log_date,
          arch_play_timestamp as stime,
          arch_play_timestamp+total_duration as etime
        from bili_ogv.dwd_flow_ogv_play_app_timed_i_d
        where log_date between '20230226' and '20230226'  and season_id >0 and total_duration>0  
    
        union 
        select mid,buvid,season_id,epid,log_date,
          arch_play_timestamp as stime,
          arch_play_timestamp+played_duration as etime
        from bili_ogv.dwd_flow_ogv_play_web_timed_i_d
        where log_date between '20230226' and '20230226' and season_id >0 and played_duration>0  
      ),
      vip_user as (
            select mid,log_date
            from bili_ogv.dim_user_full_d 
            where log_date between '20230226' and '20230226'  and is_vip_user = 1  
      ),
    
      vip_play_pay_ep as (
        select a.mid,a.buvid,a.epid,a.stime,a.etime ,a.log_date
        from ogv_play a 
        join vip_user b 
        on a.mid = b.mid  and a.log_date = b.log_date
        join (
          select epid,log_date
          from bili_ogv.dim_ep_av_full_d 
          where log_date between '20230226' and '20230226' and ep_status not in (2,14)
        ) c 
        on a.epid = c.epid and a.log_date = c.log_date
      )
    
    
    -- mid同时播放的设备数分布
    select 
     i.buvid_cnt
     ,count(*)  as mid_cnt   
    from
    (
    select 
    -- mid 的设备数去重 buvid
       j.mid
      ,count(distinct j.buvid ) as buvid_cnt  
    from
    (
    
    select  
    -- 删除重复进入播放记录, 保留同时进入播放的记录
        h.log_date
       ,h.mid
       ,h.buvid
       ,h.epid
       ,h.in_out_time
       ,h.label 
       ,h.num -- 每个mid 同时在线计数
       ,h.in_out_time_rank  -- 每个mid 播放开始时间排序 
       ,h.num1  -- 用户进入次数计数
       ,h.diff 
       ,h.max_num 
       ,h.max_inc_rank -- 升序的最大值
       ,h.max_inc_rank_for_row -- 升序的最大值存放在每一行
       ,h.acc_by_mid_num1  --  每个mid重复进入播放计数
    from 
    (
    
    select
        g.log_date
       ,g.mid
       ,g.buvid
       ,g.epid
       ,g.in_out_time
       ,g.label 
       ,g.num -- 每个mid 同时在线计数
       ,g.in_out_time_rank  -- 每个mid 播放开始时间排序 
       ,g.num1  -- 用户进入次数计数
       ,g.diff 
       ,g.max_num 
       ,g.max_inc_rank -- 升序的最大值
       ,g.max_inc_rank_for_row -- 升序的最大值存放在每一行
       ,sum(1)  over(partition by mid,num1 )  as acc_by_mid_num1  --  每个mid重复进入播放计数
    from
    (
    
    select
    -- 得到同时在线num 的升序部分 
        f.log_date
       ,f.mid
       ,f.buvid
       ,f.epid
       ,f.in_out_time
       ,f.label 
       ,f.num -- 每个mid 同时在线计数
       ,f.in_out_time_rank  -- 每个mid 播放开始时间排序 
       ,f.num1  -- 用户进入次数计数
       ,f.diff 
       ,f.max_num 
       ,f.max_inc_rank -- 升序的最大值
       ,f.max_inc_rank_for_row -- 升序的最大值存放在每一行
    from
    (
    
    select 
        e.log_date
       ,e.mid
       ,e.buvid
       ,e.epid
       ,e.in_out_time
       ,e.label 
       ,e.num -- 每个mid 同时在线计数
       ,e.in_out_time_rank
       ,e.num1  -- 用户进入次数计数
       ,e.diff 
       ,e.max_num 
       ,e.max_inc_rank -- 升序的最大值
       ,max(e.max_inc_rank) over(partition by e.mid)  max_inc_rank_for_row -- 升序的最大值存放在每一行
    from 
    (
    
    select 
        d.log_date
       ,d.mid
       ,d.buvid
       ,d.epid
       ,d.in_out_time
       ,d.label 
       ,d.num  -- 每个mid 同时在线计数
       ,d.in_out_time_rank
       ,d.num1  -- 用户进入次数计数
       ,d.diff 
       ,d.max_num 
       ,if(d.max_num=d.num,d.in_out_time_rank,0)  as max_inc_rank -- 升序的最大值
    from
    (
    
    select 
        c.log_date
       ,c.mid
       ,c.buvid
       ,c.epid
       ,c.in_out_time
       ,c.label 
       ,c.num   -- 每个mid 同时在线计数
       ,c.in_out_time_rank
       ,c.num1 -- 用户进入次数计数
       ,c.diff 
       ,first_value(c.num) over(partition by c.mid order by c.num desc ) as max_num 
    from 
    (
    
     select
           b.log_date
          ,b.mid
          ,b.buvid
          ,b.epid
          ,b.in_out_time
          ,b.label 
          ,b.num  -- 每个mid 同时在线计数
          ,b.in_out_time_rank
          ,sum(b.label1)  over(partition by b.mid  order by b.in_out_time )  as num1  -- 用户进入次数计数
          ,b.in_out_time_rank-b.num as  diff 
     from
     (
       select 
           a.log_date
          ,a.mid
          ,a.buvid
          ,a.epid
          ,a.in_out_time
          ,a.label
           
          ,sum(a.label)  over(partition by a.mid  order by a.in_out_time )  as num -- 每个mid 同时在线计数
          ,row_number() over(partition by  a.mid  order by a.in_out_time ) as in_out_time_rank
          ,if(a.label=-1 ,0,a.label) as label1  -- 退出分组
        from 
        (
           -- 开始时间
           select 
             log_date
             ,mid
             ,buvid
             ,epid
             ,stime  as   in_out_time
             ,1      as   label
           from vip_play_pay_ep
           -- where mid = 57706 -- 1645967203
           union all 
           -- 结束时间
           select 
             log_date
             ,mid
             ,buvid
             ,epid
             ,etime   as   in_out_time
             ,-1      as   label
           from vip_play_pay_ep
          -- where mid = 57706 -- 1645967203
        ) a
       ) b 
      
      ) c 
      
      ) d 
      
      ) e 
      
      ) f 
      where  f.in_out_time_rank between 1 and  f.max_inc_rank_for_row
    ) g 
      
    ) h 
    where h.acc_by_mid_num1 =1   --  每个mid重复进入播放计数
    
    ) j 
    group by j.mid
    
    ) i 
    group by i.buvid_cnt
    

    思路:

    1.每个mid 同时在线计数

    2.同时播放数提取升序部分

    3.删除重复进入播放部分

    5. 提取一个字符串中重复的item_id 后缀数字

    select 
    regexp_extract(sub_str,'item_id\\":\\"([0-9]+)(\\")',1)
    FROM 
    (
      select '{"item_infos":[{"item_id":"701969"},{"item_id":"701965"}]},{"item_infos":[{"item_id":"701964"},{"item_id":"701963"}]}' as  string_test
    ) a 
    lateral view explode(split(string_test,',')) t as sub_str 

  • 相关阅读:
    Java中的抽象类和接口(Abstract Class and Interface)的区别
    sqli blind injection盲注,以马冬梅为例
    模拟vue动态路由
    理解编码器M法/T法M\T法转速测量原理
    大语言模型LangChain + ChatGLM3-6B的组合集成:工具调用+提示词解读
    vue3 配置生产和开发 非vite
    基于cifar数据集合成含开集、闭集噪声的数据集
    产品新闻稿撰写流程是怎样的,纯干货
    计算机网络学习
    【深入浅出 Yarn 架构与实现】6-2 NodeManager 状态机管理
  • 原文地址:https://blog.csdn.net/liuxiaoer1/article/details/133467626