In summary, tuning starts from a few common angles: merge small files into larger files to reduce the number of Map tasks; compress the final output data or the intermediate output of the Map tasks; and modify the properties in the conf directory under the Hadoop installation path to raise the number of Map and Reduce tasks that can run concurrently, thereby improving performance.
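As a rough sketch of the last two points, the following properties can be set in conf/mapred-site.xml on each TaskTracker node. The values shown here are illustrative and should be tuned to the node's CPU and memory; the property names are from the Hadoop 1.x line and are renamed in later releases. TaskTracker daemons read the slot counts at startup, so they must be restarted for the change to take effect.

<!-- mapred-site.xml: illustrative values, tune per node -->
<property>
  <name>mapred.tasktracker.map.tasks.maximum</name>
  <value>4</value>   <!-- Map task slots per TaskTracker (default: 2) -->
</property>
<property>
  <name>mapred.tasktracker.reduce.tasks.maximum</name>
  <value>4</value>   <!-- Reduce task slots per TaskTracker (default: 2) -->
</property>
<property>
  <name>mapred.compress.map.output</name>
  <value>true</value>   <!-- compress the intermediate Map output -->
</property>

The complete WordCount program, written against the new (org.apache.hadoop.mapreduce) API, follows: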
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.util.*;

public class WordCount extends Configured implements Tool {

    // Map function: tokenize each input line and emit a <word, 1> pair per token
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken()); // reuse the Text object to avoid reallocation
                context.write(word, one);
            }
        }
    }

    // Reduce function: sum the per-word counts emitted by the Map tasks
    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {