• hadoop完全分布+hive数据分析


    hadoop搭建基础:

    Hadoop:

    有道云笔记:http://note.youdao.com/noteshare?id=6afacf1ae14aef32ea32b1c361771eea&sub=832AC6E58A8F4E72B12E4C6A020A5520

    hive+mysql搭建:

    有道云笔记:http://note.youdao.com/noteshare?id=332ed266d16a8b50dca48d1c6e0e4624&sub=3FE5A57DCC394D73B2187610CF03AA91

    flume搭建:

    有道云笔记:http://note.youdao.com/noteshare?id=d6f8928a8e813980054b3a785f98de10&sub=6A004440061B4675AB75EE534BF03B03

    flume基础应用:

    有道云笔记:http://note.youdao.com/noteshare?id=7918e87ddc8a1f8d806be13804463a15&sub=13E6C1DCD231448AB0AF6981426F7C67

    spark安装:

    有道云笔记:http://note.youdao.com/noteshare?id=ef3a676af0e2f8b4be6dc3ddb2757353&sub=5D2CFF72B3964AE392B1212EF96C0871

    一,爬取数据:

    1,scrapy爬取智联招聘和中华英才网

    有道云链接:http://note.youdao.com/noteshare?id=5edad3cccfc222f91bda2b5499df8806&sub=029090D747894803ABE1D6AD1E496AAD

    二、flume传输数据:

    flume配置文件

    # Name the components on this agent
    a1.sources = r1
    a1.sinks = k1
    a1.channels = c1

    # Describe/configure the source: tail the collector file under /home/data.
    # BUG FIX: in a properties file "#" only starts a comment at the beginning
    # of a line. The original inline "#..." text was part of the value, so tail
    # received extra arguments and the HDFS path contained " #...". Comments
    # now live on their own lines.
    a1.sources.r1.type = exec
    a1.sources.r1.command = tail -F /home/data/job.txt
    a1.sources.r1.channels = c1

    # Describe the sink: collect the events into HDFS, bucketed by time
    a1.sinks.k1.type = hdfs
    a1.sinks.k1.channel = c1
    a1.sinks.k1.hdfs.path = hdfs://hadoop01:9000/flume/tailout/%y-%m-%d/%H-%M/
    a1.sinks.k1.hdfs.filePrefix = log-
    a1.sinks.k1.hdfs.round = true
    a1.sinks.k1.hdfs.roundValue = 10
    a1.sinks.k1.hdfs.roundUnit = minute
    a1.sinks.k1.hdfs.rollInterval = 3
    a1.sinks.k1.hdfs.rollSize = 20
    a1.sinks.k1.hdfs.rollCount = 5
    a1.sinks.k1.hdfs.batchSize = 1
    a1.sinks.k1.hdfs.useLocalTimeStamp = true
    # Output file type: the default is SequenceFile; DataStream writes plain text
    a1.sinks.k1.hdfs.fileType = DataStream

    # Use a channel which buffers events in memory
    a1.channels.c1.type = memory
    a1.channels.c1.capacity = 1000
    a1.channels.c1.transactionCapacity = 100
    

    启动flume;

    # Start the Flume agent: -c config dir, -f agent definition file,
    # -n agent name (must match the "a1" prefix in shixun.conf),
    # and log at INFO level to the console for debugging.
    flume-ng agent -c conf -f shixun.conf -n a1 -Dflume.root.logger=INFO,console
    

    将数据追加写入 /home/data/job.txt(flume 配置中监控的文件),flume 就会将数据传到 hdfs 上

    三、hive数据处理

    有道云笔记:http://note.youdao.com/noteshare?id=f4207943f87a4f8946afd38bef6396b8&sub=0E74436F444B4742B3C1F8998D28A884

    1,编写hql语句,创建数据库,创建表,处理数据

    fenxi.hql

    -- fenxi.hql: clean the scraped job postings, derive salary statistics
    -- (overall and for 1-3 years' experience) and per-city job counts.
    create database fenxi;
    use fenxi;

    -- Raw staging table. '\t' spells out the tab delimiter explicitly; the
    -- original used a literal tab character, which is easily lost or turned
    -- into spaces when the script is copied around.
    create table shuju(id string, dates string, jobtitle string, pay string, unit string, exp string, edu string, site string)
    row format delimited fields terminated by '\t';
    load data local inpath '/home/sofware/hive-2.3.6/data/fenxi.txt' into table shuju;

    -- Strip the fixed-width "key":"value" prefixes from each field; the
    -- group by also deduplicates identical rows.
    create table shuju2 as
    select substr(jobtitle, 13) as jobtitle,
           substr(pay, 8)       as pay,
           substr(site, 9, 2)   as site,
           substr(exp, 15)      as exp,
           substr(dates, 10, 5) as dates
    from shuju
    group by jobtitle, pay, site, exp, dates;

    -- Drop the remaining double quotes.
    create table basis as
    select regexp_replace(jobtitle, '"', '') as jobtitle,
           regexp_replace(pay, '"', '')      as pay,
           regexp_replace(site, '"', '')     as site,
           regexp_replace(exp, '"', '')      as exp
    from shuju2;

    -- Overall salary: normalise every pay range to yuan/month.
    -- 千/月 -> *1000, 万/月 -> *10000, 万/年 -> *10000/12.
    create table shuju1y  as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
    create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
    create table shuju1n  as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
    create table shuju2y as
    select jobtitle,
           split(pay, '-')[0] * 1000 as min,
           split(pay, '-')[1] * 1000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 1000 as avg
    from shuju1y;
    insert into shuju2y
    select jobtitle,
           split(pay, '-')[0] * 10000 as min,
           split(pay, '-')[1] * 10000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 as avg
    from shuju1yw;
    insert into shuju2y
    select jobtitle,
           split(pay, '-')[0] * 10000 / 12 as min,
           split(pay, '-')[1] * 10000 / 12 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 / 12 as avg
    from shuju1n;
    create table gzs as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2y;
    drop table shuju1y;
    drop table shuju1yw;
    drop table shuju1n;

    -- Salary for 1-3 years of experience only: keep matching rows in basis,
    -- then repeat the same normalisation into shuju2s.
    insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验';
    create table shuju1y  as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
    create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
    create table shuju1n  as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
    create table shuju2s as
    select jobtitle,
           split(pay, '-')[0] * 1000 as min,
           split(pay, '-')[1] * 1000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 1000 as avg
    from shuju1y;
    insert into shuju2s
    select jobtitle,
           split(pay, '-')[0] * 10000 as min,
           split(pay, '-')[1] * 10000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 as avg
    from shuju1yw;
    insert into shuju2s
    select jobtitle,
           split(pay, '-')[0] * 10000 / 12 as min,
           split(pay, '-')[1] * 10000 / 12 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 / 12 as avg
    from shuju1n;
    -- BUG FIX: this previously aggregated shuju2y (ALL experience levels),
    -- making wages identical to gzs; the 1-3-year stats live in shuju2s.
    create table wages as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2s;
    drop table shuju1y;
    drop table shuju1yw;
    drop table shuju1n;

    -- Job count per major city. The counts alias gives the column a stable
    -- name (it was an anonymous _c1 before) for the Sqoop export to MySQL.
    create table kgwei as select site, count(site) as counts from basis where site regexp '上海|成都|北京|深圳|广州' group by site;

    -- Clean up intermediate tables.
    drop table shuju;
    drop table shuju2;
    drop table shuju2s;
    drop table shuju2y;
    

    kaifa.hql

    -- kaifa.hql: same pipeline as fenxi.hql but for the developer-job data
    -- collected by Flume into HDFS (comma separated fields).
    create database kaifa;
    use kaifa;

    -- Raw staging table; data is loaded from HDFS (no LOCAL keyword).
    create table shuju(id string, dates string, jobtitle string, pay string, unit string, exp string, edu string, site string)
    row format delimited fields terminated by ',';
    load data inpath '/lmq/trains/kaifa' into table shuju;

    -- Strip the fixed-width "key":"value" prefixes; group by deduplicates.
    create table shuju2 as
    select substr(jobtitle, 13) as jobtitle,
           substr(pay, 8)       as pay,
           substr(site, 9, 2)   as site,
           substr(exp, 15)      as exp,
           substr(dates, 10, 5) as dates
    from shuju
    group by jobtitle, pay, site, exp, dates;

    -- Drop the remaining double quotes.
    create table basis as
    select regexp_replace(jobtitle, '"', '') as jobtitle,
           regexp_replace(pay, '"', '')      as pay,
           regexp_replace(site, '"', '')     as site,
           regexp_replace(exp, '"', '')      as exp
    from shuju2;

    -- Overall salary: normalise every pay range to yuan/month.
    -- 千/月 -> *1000, 万/月 -> *10000, 万/年 -> *10000/12.
    create table shuju1y  as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
    create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
    create table shuju1n  as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
    create table shuju2y as
    select jobtitle,
           split(pay, '-')[0] * 1000 as min,
           split(pay, '-')[1] * 1000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 1000 as avg
    from shuju1y;
    insert into shuju2y
    select jobtitle,
           split(pay, '-')[0] * 10000 as min,
           split(pay, '-')[1] * 10000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 as avg
    from shuju1yw;
    insert into shuju2y
    select jobtitle,
           split(pay, '-')[0] * 10000 / 12 as min,
           split(pay, '-')[1] * 10000 / 12 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 / 12 as avg
    from shuju1n;
    create table gzs as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2y;
    drop table shuju1y;
    drop table shuju1yw;
    drop table shuju1n;

    -- Salary for 1-3 years of experience only, normalised into shuju2s.
    insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验';
    create table shuju1y  as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
    create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
    create table shuju1n  as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
    create table shuju2s as
    select jobtitle,
           split(pay, '-')[0] * 1000 as min,
           split(pay, '-')[1] * 1000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 1000 as avg
    from shuju1y;
    insert into shuju2s
    select jobtitle,
           split(pay, '-')[0] * 10000 as min,
           split(pay, '-')[1] * 10000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 as avg
    from shuju1yw;
    insert into shuju2s
    select jobtitle,
           split(pay, '-')[0] * 10000 / 12 as min,
           split(pay, '-')[1] * 10000 / 12 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 / 12 as avg
    from shuju1n;
    -- BUG FIX: this previously aggregated shuju2y (ALL experience levels),
    -- making wages identical to gzs; the 1-3-year stats live in shuju2s.
    create table wages as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2s;
    drop table shuju1y;
    drop table shuju1yw;
    drop table shuju1n;

    -- Job count per major city; counts alias names the column for Sqoop.
    create table kgwei as select site, count(site) as counts from basis where site regexp '上海|成都|北京|深圳|广州' group by site;

    -- Clean up intermediate tables.
    drop table shuju;
    drop table shuju2;
    drop table shuju2s;
    drop table shuju2y;
    

    cji.hql

    -- cji.hql: same pipeline as fenxi.hql but for the operations-job data
    -- collected by Flume into HDFS (comma separated fields).
    create database cji;
    use cji;

    -- Raw staging table; data is loaded from HDFS (no LOCAL keyword).
    create table shuju(id string, dates string, jobtitle string, pay string, unit string, exp string, edu string, site string)
    row format delimited fields terminated by ',';
    load data inpath '/lmq/trains/cji' into table shuju;

    -- Strip the fixed-width "key":"value" prefixes; group by deduplicates.
    create table shuju2 as
    select substr(jobtitle, 13) as jobtitle,
           substr(pay, 8)       as pay,
           substr(site, 9, 2)   as site,
           substr(exp, 15)      as exp,
           substr(dates, 10, 5) as dates
    from shuju
    group by jobtitle, pay, site, exp, dates;

    -- Drop the remaining double quotes.
    create table basis as
    select regexp_replace(jobtitle, '"', '') as jobtitle,
           regexp_replace(pay, '"', '')      as pay,
           regexp_replace(site, '"', '')     as site,
           regexp_replace(exp, '"', '')      as exp
    from shuju2;

    -- Overall salary: normalise every pay range to yuan/month.
    -- 千/月 -> *1000, 万/月 -> *10000, 万/年 -> *10000/12.
    create table shuju1y  as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
    create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
    create table shuju1n  as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
    create table shuju2y as
    select jobtitle,
           split(pay, '-')[0] * 1000 as min,
           split(pay, '-')[1] * 1000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 1000 as avg
    from shuju1y;
    insert into shuju2y
    select jobtitle,
           split(pay, '-')[0] * 10000 as min,
           split(pay, '-')[1] * 10000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 as avg
    from shuju1yw;
    insert into shuju2y
    select jobtitle,
           split(pay, '-')[0] * 10000 / 12 as min,
           split(pay, '-')[1] * 10000 / 12 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 / 12 as avg
    from shuju1n;
    create table gzs as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2y;
    drop table shuju1y;
    drop table shuju1yw;
    drop table shuju1n;

    -- Salary for 1-3 years of experience only, normalised into shuju2s.
    insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验';
    create table shuju1y  as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
    create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
    create table shuju1n  as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
    create table shuju2s as
    select jobtitle,
           split(pay, '-')[0] * 1000 as min,
           split(pay, '-')[1] * 1000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 1000 as avg
    from shuju1y;
    insert into shuju2s
    select jobtitle,
           split(pay, '-')[0] * 10000 as min,
           split(pay, '-')[1] * 10000 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 as avg
    from shuju1yw;
    insert into shuju2s
    select jobtitle,
           split(pay, '-')[0] * 10000 / 12 as min,
           split(pay, '-')[1] * 10000 / 12 as max,
           (split(pay, '-')[0] + split(pay, '-')[1]) / 2 * 10000 / 12 as avg
    from shuju1n;
    -- BUG FIX: this previously aggregated shuju2y (ALL experience levels),
    -- making wages identical to gzs; the 1-3-year stats live in shuju2s.
    create table wages as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2s;
    drop table shuju1y;
    drop table shuju1yw;
    drop table shuju1n;

    -- Job count per major city; counts alias names the column for Sqoop.
    create table kgwei as select site, count(site) as counts from basis where site regexp '上海|成都|北京|深圳|广州' group by site;

    -- Clean up intermediate tables.
    drop table shuju;
    drop table shuju2;
    drop table shuju2s;
    drop table shuju2y;
    

    qushi.hql

    -- qushi.hql: extract deduplicated (job title, date) pairs that feed the
    -- posting-trend chart.
    create database qushi;
    use qushi;

    -- Raw staging table for the scraped postings (comma separated fields).
    create table shuju(
        id string,
        dates string,
        jobtitle string,
        pay string,
        unit string,
        exp string,
        edu string,
        site string
    )
    row format delimited fields terminated by ',';

    load data local inpath '/home/sofware/hive-2.3.6/data/shuju.txt' into table shuju;

    -- Strip the fixed-width "key":"value" prefixes; distinct plays the role
    -- of the original aggregate-free group by (deduplication only).
    create table shuju2 as
    select distinct
        substr(jobtitle, 13) as jobtitle,
        substr(dates, 10, 5) as dates
    from shuju;

    -- Drop the remaining double quotes and dedupe once more.
    create table qushi as
    select distinct
        regexp_replace(jobtitle, '"', '') as jobtitle,
        regexp_replace(dates, '"', '') as dates
    from shuju2;
    

    1.2、启动hive,并执行hql命令

    hive> source /home/sofware/hive-2.3.6/data/fenxi.hql;
    hive> source /home/sofware/hive-2.3.6/data/cji.hql;
    hive> source /home/sofware/hive-2.3.6/data/kaifa.hql;
    hive> source /home/sofware/hive-2.3.6/data/qushi.hql;
    

    执行后得到表

    将处理后的数据使用sqoop传到mysql上

    2,在MySQL上创建表

    工资表

     -- MySQL target for the overall salary stats (Hive table fenxi.gzs).
     -- NOTE(review): min/max/avg hold numbers; DECIMAL would be a better type
     -- than VARCHAR(200) -- confirm nothing downstream relies on string form.
     create table gz(min varchar(200),max varchar(200),avg varchar(200));
    

    1-3年工资

     -- MySQL target for the 1-3 years' experience salary stats (Hive "wages").
     -- NOTE(review): min/max/avg hold numbers; DECIMAL would be a better type
     -- than VARCHAR(200) -- confirm nothing downstream relies on string form.
     create table wages(min varchar(200),max varchar(200),avg varchar(200));
    

    每个城市岗位数

    -- MySQL target for per-city job counts (Hive table kgwei).
    -- NOTE(review): counts is numeric; an INT column would be a better fit
    -- than VARCHAR(200) -- confirm before changing the Sqoop mapping.
    create table gwei(city varchar(200),counts varchar(200));
    

    趋势

    -- MySQL target for the trend chart: (job title, posting date) pairs
    -- exported from the Hive table qushi.qushi.
    create table qushi(title varchar(200),dates varchar(200));
    

    3,sqoop将hive的数据导入到mysql

    # Export the Hive result tables to MySQL with Sqoop.
    # BUG FIX: Hive's default field delimiter is \001 (Ctrl-A); the original
    # '' was almost certainly that invisible control character lost in
    # copy/paste -- an empty delimiter cannot split the export records.
    # NOTE(review): the wages --export-dir points at shixun.db, but this
    # pipeline creates wages inside fenxi.db/kaifa.db/cji.db -- confirm the
    # intended source database.
    # NOTE(review): a plaintext --password leaks via shell history and ps;
    # prefer -P (interactive prompt) or --password-file in real deployments.
    sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@' --table wages --export-dir '/user/hive/warehouse/shixun.db/wages' --input-fields-terminated-by '\001'
    sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@' --table gz --export-dir '/user/hive/warehouse/fenxi.db/gzs' --input-fields-terminated-by '\001'
    sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@' --table gwei --export-dir '/user/hive/warehouse/fenxi.db/kgwei' --input-fields-terminated-by '\001'
    sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@' --table qushi --export-dir '/user/hive/warehouse/qushi.db/qushi' --input-fields-terminated-by '\001'
    

    四、可视化图

    1,工资

    2,岗位数

    3,经验是1-3年的工资

    4,趋势图

  • 相关阅读:
    Linux virtio-net driver
    【C++ STL】哈希 Hash(闭散列、开散列介绍及其实现)
    15.二分法
    部署机器学习模型必须考虑的5个要素;用Python快速写web应用的框架;神经网络可视化;3D医学图像分割工具包;前沿论文 | ShowMeAI资讯日报
    51单片机驱动TCS3200颜色识别传感器
    逐步解决Could not find artifact com:ojdbc8:jar:12
    vue中ElementUi的el-table表格绑定行点击事件
    8.Docker MySQL 主从复制
    SpringBoot集成Redis业务功能 02、定时任务+Redis删除特定前缀key的优雅实现
    四十二、《大数据项目实战之用户行为分析》多框架整合实时分析用户行为日志数据流
  • 原文地址:https://blog.csdn.net/m0_54866636/article/details/126369188