1,scrapy爬取智联招聘和中华英才网
flume配置文件
#name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/data/job.txt #监控data目录下
a1.sources.r1.channels = c1
# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path =hdfs://hadoop01:9000/flume/tailout/%y-%m-%d/%H-%M/ #把数据收集到hdfs目录下
a1.sinks.k1.hdfs.filePrefix = log-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 20
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
a1.sinks.k1.hdfs.useLocalTimeStamp = true
#生成的文件类型,默认是Sequencefile,可用DataStream,则为普通文本
a1.sinks.k1.hdfs.fileType = DataStream
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
启动flume;
flume-ng agent -c conf -f shixun.conf -n a1 -Dflume.root.logger=INFO,console
将文件放到/home/tailout ,就会将数据传到hdfs上
fenxi.hql
create database fenxi;
use fenxi;
create table shuju(id string,dates string, jobtitle string, pay string,unit string , exp string ,edu string ,site string) row format delimited fields terminated by ' ';
load data local inpath '/home/sofware/hive-2.3.6/data/fenxi.txt' into table shuju;
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(pay,8) as pay ,substr(site,9,2) as site ,substr(exp,15) as exp,substr(dates,10,5) as dates from shuju group by jobtitle,pay,site,exp,dates;
create table basis as select regexp_replace(jobtitle, '"', '') as jobtitle, regexp_replace(pay, '"', '')as pay,regexp_replace(site, '"', '')as site ,regexp_replace(exp, '"', '')as exp from shuju2;
create table shuju1y as select jobtitle as jobtitle,regexp_replace(pay, '千/月', '')as pay from basis where pay like '%千/月' ;
create table shuju1yw as select jobtitle as jobtitle,regexp_replace(pay, '万/月', '')as pay from basis where pay like '%万/月' ;
create table shuju1n as select jobtitle as jobtitle,regexp_replace(pay, '万/年', '')as pay from basis where pay like '%万/年';
create table shuju2y as select jobtitle as jobtitle,split(pay,'-')[0]*1000 as min ,split(pay,'-')[1]*1000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2y select jobtitle as jobtitle,split(pay,'-')[0]*10000 as min ,split(pay,'-')[1]*10000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2y select jobtitle as jobtitle,split(pay,'-')[0]*10000/12 as min ,split(pay,'-')[1]*10000/12 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
create table gzs as select min(min) as min,max(max) as max,avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;
insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验' ;
create table shuju1y as select jobtitle as jobtitle,regexp_replace(pay, '千/月', '')as pay from basis where pay like '%千/月' ;
create table shuju1yw as select jobtitle as jobtitle,regexp_replace(pay, '万/月', '')as pay from basis where pay like '%万/月' ;
create table shuju1n as select jobtitle as jobtitle,regexp_replace(pay, '万/年', '')as pay from basis where pay like '%万/年';
create table shuju2s as select jobtitle as jobtitle,split(pay,'-')[0]*1000 as min ,split(pay,'-')[1]*1000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2s select jobtitle as jobtitle,split(pay,'-')[0]*10000 as min ,split(pay,'-')[1]*10000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2s select jobtitle as jobtitle,split(pay,'-')[0]*10000/12 as min ,split(pay,'-')[1]*10000/12 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
create table wages as select min(min) as min,max(max) as max,avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;
create table kgwei as select site, count(site) from basis where site regexp '上海|成都|北京|深圳|广州' group by site;
drop table shuju;
drop table shuju2;
drop table shuju2s;
drop table shuju2y;
kaifa.hql
create database kaifa;
use kaifa;
create table shuju(id string,dates string, jobtitle string, pay string,unit string , exp string ,edu string ,site string) row format delimited fields terminated by ',';
load data inpath '/lmq/trains/kaifa' into table shuju;
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(pay,8) as pay ,substr(site,9,2) as site ,substr(exp,15) as exp,substr(dates,10,5) as dates from shuju group by jobtitle,pay,site,exp,dates;
create table basis as select regexp_replace(jobtitle, '"', '') as jobtitle, regexp_replace(pay, '"', '')as pay,regexp_replace(site, '"', '')as site ,regexp_replace(exp, '"', '')as exp from shuju2;
create table shuju1y as select jobtitle as jobtitle,regexp_replace(pay, '千/月', '')as pay from basis where pay like '%千/月' ;
create table shuju1yw as select jobtitle as jobtitle,regexp_replace(pay, '万/月', '')as pay from basis where pay like '%万/月' ;
create table shuju1n as select jobtitle as jobtitle,regexp_replace(pay, '万/年', '')as pay from basis where pay like '%万/年';
create table shuju2y as select jobtitle as jobtitle,split(pay,'-')[0]*1000 as min ,split(pay,'-')[1]*1000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2y select jobtitle as jobtitle,split(pay,'-')[0]*10000 as min ,split(pay,'-')[1]*10000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2y select jobtitle as jobtitle,split(pay,'-')[0]*10000/12 as min ,split(pay,'-')[1]*10000/12 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
create table gzs as select min(min) as min,max(max) as max,avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;
insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验' ;
create table shuju1y as select jobtitle as jobtitle,regexp_replace(pay, '千/月', '')as pay from basis where pay like '%千/月' ;
create table shuju1yw as select jobtitle as jobtitle,regexp_replace(pay, '万/月', '')as pay from basis where pay like '%万/月' ;
create table shuju1n as select jobtitle as jobtitle,regexp_replace(pay, '万/年', '')as pay from basis where pay like '%万/年';
create table shuju2s as select jobtitle as jobtitle,split(pay,'-')[0]*1000 as min ,split(pay,'-')[1]*1000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2s select jobtitle as jobtitle,split(pay,'-')[0]*10000 as min ,split(pay,'-')[1]*10000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2s select jobtitle as jobtitle,split(pay,'-')[0]*10000/12 as min ,split(pay,'-')[1]*10000/12 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
create table wages as select min(min) as min,max(max) as max,avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;
create table kgwei as select site, count(site) from basis where site regexp '上海|成都|北京|深圳|广州' group by site;
drop table shuju;
drop table shuju2;
drop table shuju2s;
drop table shuju2y;
cji.hql
create database cji;
use cji;
create table shuju(id string,dates string, jobtitle string, pay string,unit string , exp string ,edu string ,site string) row format delimited fields terminated by ',';
load data inpath '/lmq/trains/cji' into table shuju;
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(pay,8) as pay ,substr(site,9,2) as site ,substr(exp,15) as exp,substr(dates,10,5) as dates from shuju group by jobtitle,pay,site,exp,dates;
create table basis as select regexp_replace(jobtitle, '"', '') as jobtitle, regexp_replace(pay, '"', '')as pay,regexp_replace(site, '"', '')as site ,regexp_replace(exp, '"', '')as exp from shuju2;
create table shuju1y as select jobtitle as jobtitle,regexp_replace(pay, '千/月', '')as pay from basis where pay like '%千/月' ;
create table shuju1yw as select jobtitle as jobtitle,regexp_replace(pay, '万/月', '')as pay from basis where pay like '%万/月' ;
create table shuju1n as select jobtitle as jobtitle,regexp_replace(pay, '万/年', '')as pay from basis where pay like '%万/年';
create table shuju2y as select jobtitle as jobtitle,split(pay,'-')[0]*1000 as min ,split(pay,'-')[1]*1000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2y select jobtitle as jobtitle,split(pay,'-')[0]*10000 as min ,split(pay,'-')[1]*10000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2y select jobtitle as jobtitle,split(pay,'-')[0]*10000/12 as min ,split(pay,'-')[1]*10000/12 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
create table gzs as select min(min) as min,max(max) as max,avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;
insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验' ;
create table shuju1y as select jobtitle as jobtitle,regexp_replace(pay, '千/月', '')as pay from basis where pay like '%千/月' ;
create table shuju1yw as select jobtitle as jobtitle,regexp_replace(pay, '万/月', '')as pay from basis where pay like '%万/月' ;
create table shuju1n as select jobtitle as jobtitle,regexp_replace(pay, '万/年', '')as pay from basis where pay like '%万/年';
create table shuju2s as select jobtitle as jobtitle,split(pay,'-')[0]*1000 as min ,split(pay,'-')[1]*1000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2s select jobtitle as jobtitle,split(pay,'-')[0]*10000 as min ,split(pay,'-')[1]*10000 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2s select jobtitle as jobtitle,split(pay,'-')[0]*10000/12 as min ,split(pay,'-')[1]*10000/12 as max ,(split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
create table wages as select min(min) as min,max(max) as max,avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;
create table kgwei as select site, count(site) from basis where site regexp '上海|成都|北京|深圳|广州' group by site;
drop table shuju;
drop table shuju2;
drop table shuju2s;
drop table shuju2y;
qushi.hql
create database qushi;
use qushi;
create table shuju(id string,dates string, jobtitle string, pay string,unit string , exp string ,edu string ,site string) row format delimited fields terminated by ',';
load data local inpath '/home/sofware/hive-2.3.6/data/shuju.txt' into table shuju;
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(dates,10,5) as dates from shuju group by jobtitle,dates;
create table qushi as select regexp_replace(jobtitle, '"', '') as jobtitle,regexp_replace(dates, '"', '') as dates from shuju2 group by jobtitle,dates;
hive> source /home/sofware/hive-2.3.6/data/fenxi.hql;
hive> source /home/sofware/hive-2.3.6/data/cji.hql;
hive> source /home/sofware/hive-2.3.6/data/kaifa.hql;
hive> source /home/sofware/hive-2.3.6/data/qushi.hql;
执行后得到表
工资表
create table gz(min varchar(200),max varchar(200),avg varchar(200));
1-3年工资
create table wages(min varchar(200),max varchar(200),avg varchar(200));
每个城市岗位数
create table gwei(city varchar(200),counts varchar(200));
趋势
create table qushi(title varchar(200),dates varchar(200));
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@' --table wages --export-dir '/user/hive/warehouse/shixun.db/wages' --input-fields-terminated-by ''
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@' --table gz --export-dir '/user/hive/warehouse/fenxi.db/gzs' --input-fields-terminated-by ''
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@' --table gwei --export-dir '/user/hive/warehouse/fenxi.db/kgwei' --input-fields-terminated-by ''
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@' --table qushi --export-dir '/user/hive/warehouse/qushi.db/qushi' --input-fields-terminated-by ''