【Hadoop】第一个MapReduce程序 - WordCount - Windows本地运行

文章目录

1 客户端环境准备
2 创建Maven工程
3 编写程序
4 输入输出
- 输入
- 输出

1 客户端环境准备

2 创建Maven工程

添加依赖

<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>3.1.3</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.30</version>
    </dependency>
</dependencies>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

在resources目录下创建 log4j.properties

log4j.rootLogger=INFO, stdout  
log4j.appender.stdout=org.apache.log4j.ConsoleAppender  
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout  
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n  
log4j.appender.logfile=org.apache.log4j.FileAppender  
log4j.appender.logfile.File=target/spring.log  
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout  
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
1
2
3
4
5
6
7
8

创建包，创建类

在包下创建

WordCountMapper类
WordCountReducer类
WordCountDriver类

目录结构如下：
在这里插入图片描述

3 编写程序

几点注意：

在编写程序的时候需要注意，有很多同名的类，导包的时候要小心
运行程序时，确保输出路径是不存在的，若输出路径已存在，会报错

WordCountMapper

package maoreduce.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper of the WordCount job: breaks each input line into words and emits a
 * {@code (word, 1)} pair for every word found.
 *
 * <p>The input key is not used; the input value is treated as one line of text.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Output key instance, refilled with each word before it is written. */
    private final Text k = new Text();

    /** Output value: every occurrence of a word counts as one. */
    private final IntWritable v = new IntWritable(1);

    /**
     * Emits {@code (word, 1)} for every whitespace-separated word in {@code value}.
     *
     * @param key     input key supplied by the framework (ignored)
     * @param value   the line of text to tokenize
     * @param context sink for the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        String line = value.toString();

        // Split on runs of whitespace (spaces, tabs, ...) rather than on a single
        // space, so consecutive separators do not produce empty tokens in the middle.
        String[] words = line.split("\\s+");

        for (String word : words) {
            // A blank line or leading whitespace still yields one empty token;
            // it is not a word and must not be counted.
            if (word.isEmpty()) {
                continue;
            }

            k.set(word);
            context.write(k, v);
        }
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

WordCountReducer

package maoreduce.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer of the WordCount job: adds up all counts received for one word and
 * writes a single {@code (word, total)} pair.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /** Output value instance, refilled with the total before it is written. */
    private final IntWritable total = new IntWritable();

    /**
     * Sums the values grouped under {@code key} and emits the result.
     *
     * @param key     the word being reduced
     * @param values  the counts emitted for that word
     * @param context sink for the {@code (word, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // The accumulator starts from zero on every call.
        int count = 0;
        for (IntWritable partial : values) {
            count = count + partial.get();
        }

        total.set(count);
        context.write(key, total);
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

WordCountDriver

package maoreduce.wordcount;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


import java.io.IOException;

/**
 * Entry point of the WordCount job: configures the job, wires in the mapper
 * and reducer, sets the input/output locations and submits it.
 */
public class WordCountDriver {

    /** Input location used when no first command-line argument is given. */
    private static final String DEFAULT_INPUT_PATH = "D:\\input\\";

    /** Output location used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT_PATH = "D:\\output\\";

    /**
     * Configures and runs the job, then exits with status 0 on success and 1 on failure.
     *
     * @param args optional: {@code args[0]} is the input path and {@code args[1]} the
     *             output path; each falls back to its default when absent
     * @throws IOException            if the job cannot be created or submitted
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        // Command-line arguments override the built-in defaults; running without
        // arguments behaves as before.
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

        // Load the configuration and create the job object
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // Locate the job jar through this driver class
        job.setJarByClass(WordCountDriver.class);

        // Register the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // Key/value types emitted by the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output locations.
        // NOTE: the output path must not exist yet, otherwise the job reports an error.
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Submit the job and wait for it to finish, printing progress
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

4 输入输出

输入

在D盘的input目录下
在这里插入图片描述

输出

结果输出到代码中指定的路径下，会自动创建output文件夹
在这里插入图片描述

相关阅读:
MySQL 清空表截断表
 搭建VUE前端项目流程——Node.js 、Yarn、npm、Vue、Vite、Webpack
边缘计算结合深度学习实现云边端协同应用概述
 关于计数以及Index返回订单号升级版可以控制年月日累计（不重复）（sqlite）
【无标题】
Go协程揭秘：轻量、并发与性能的完美结合
 ONLYOFFICE8.1版本桌面编辑器测评
 Gson反序列化原理
 入侵检测系统
 在 Python 中构建高度可扩展的数据流管道
原文地址：https://blog.csdn.net/guliguliguliguli/article/details/126577323