Hadoop的HDFS和MapReduce都是针对大数据文件来设计的,在小文件的处理上不但效率低下,而且十分消耗内存资源
针对HDFS而言,每一个小文件的元数据在NameNode中大约占用150字节的内存空间。大量小文件会迅速耗尽NameNode的内存,而集群实际存储的数据量却很小,得不偿失。
针对MapReduce而言,每一个小文件都是一个Block,都会产生一个InputSplit,最终每一个小文件都会产生一个map任务,这样会导致同时启动太多的Map任务,Map任务的启动是非常消耗性能的,但是启动了以后执行了很短时间就停止了,因为小文件的数据量太小了,这样就会造成任务执行消耗的时间还没有启动任务消耗的时间多,这样也会影响MapReduce执行的效率。
针对这个问题,解决办法通常是选择一个容器,将这些小文件组织起来统一存储,HDFS提供了两种类型的容器,分别是SequenceFile 和 MapFile
相对SequenceFile而言,MapFile的检索效率更高,缺点是需要额外消耗一部分内存来存储索引(index)数据。
- package hadoop.mr;
-
- import org.apache.commons.io.FileUtils;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.FileUtil;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.SequenceFile;
- import org.apache.hadoop.io.Text;
-
- import java.io.File;
- import java.io.IOException;
-
- public class smallFileSeq {
-
- public static void main(String[] args) throws Exception{
-
- write("D:\\smallFile","/seqFile");
-
- }
-
-
- private static void write(String inputDir, String outputFile) throws Exception {
-
- Configuration conf = new Configuration();
- conf.set("fs.defaultFS", "hdfs://192.168.221.131:9000");
- try {
- FileSystem fileSystem = FileSystem.get(conf);
- fileSystem.delete(new Path(outputFile),true);
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- //构造opts数组,有三个元素
- /*
- 第一个是输出路径
- 第二个是key类型
- 第三个是value类型
- */
- SequenceFile.Writer.Option[] opts = new SequenceFile.Writer.Option[]{
- SequenceFile.Writer.file(new Path(outputFile)),
-
- SequenceFile.Writer.keyClass(Text.class),
- SequenceFile.Writer.valueClass(Text.class)};
- //创建一个writer实例
- SequenceFile.Writer writer = SequenceFile.createWriter(conf, opts);
-
- File inputDirPath = new File(inputDir);
- if(inputDirPath.isDirectory()){
- File[] files = inputDirPath.listFiles();
- for(File file:files){
- String content= FileUtils.readFileToString(file,"UTF-8");
- Text key = new Text(file.getName());
- Text value = new Text(content);
- writer.append(key,value);
- }
- writer.close();
- }
- }
- }
运行结果:
- public static void main(String[] args) throws Exception{
- read("/seqFile");
- }
-
- private static void write(String inputDir, String outputFile) throws Exception {
-
- Configuration conf = new Configuration();
- conf.set("fs.defaultFS", "hdfs://192.168.221.131:9000");
- try {
- FileSystem fileSystem = FileSystem.get(conf);
- fileSystem.delete(new Path(outputFile),true);
- } catch (IOException e) {
- e.printStackTrace();
- }
-
-
- private static void read(String inputFile) throws Exception{
-
- Configuration conf = new Configuration();
- conf.set("fs.defaultFS", "hdfs://192.168.221.131:9000");
-
- SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(inputFile)));
- Text key = new Text();
- Text value = new Text();
- while (reader.next(key,value)){
- System.out.print("文件名"+key.toString()+",");
- System.out.print("文件内容"+value.toString()+",");
- System.out.println("====");
- }
- reader.close();
- }
运行结果:
- public static void main(String[] args) throws Exception{
- write("D:\\smallFile","/mapFile");
- }
-
- private static void write(String inputDir, String outputFile) throws Exception {
-
- Configuration conf = new Configuration();
- conf.set("fs.defaultFS", "hdfs://192.168.221.131:9000");
- try {
- FileSystem fileSystem = FileSystem.get(conf);
- fileSystem.delete(new Path(outputFile),true);
- } catch (IOException e) {
- e.printStackTrace();
- }
- SequenceFile.Writer.Option[] opts = new SequenceFile.Writer.Option[]{
- //SequenceFile.Writer.file(new Path(outputFile)),
- MapFile.Writer.keyClass(Text.class),
- MapFile.Writer.valueClass(Text.class)};
-
- MapFile.Writer writer = new MapFile.Writer(conf, new Path(outputFile),opts);
- File inputDirPath = new File(inputDir);
- if(inputDirPath.isDirectory()){
- File[] files = inputDirPath.listFiles();
- for(File file:files){
- String content= FileUtils.readFileToString(file,"UTF-8");
- Text key = new Text(file.getName());
- Text value = new Text(content);
- writer.append(key,value);
- }
- writer.close();
- }
- }

-
- public class smallFileMapSeq {
-
- public static void main(String[] args) throws Exception{
- read("/mapFile");
- }
-
- private static void read(String inputFile) throws Exception{
-
- Configuration conf = new Configuration();
- conf.set("fs.defaultFS", "hdfs://192.168.221.131:9000");
-
- MapFile.Reader reader = new MapFile.Reader(new Path(inputFile),conf);
- Text key = new Text();
- Text value = new Text();
- while (reader.next(key,value)){
- System.out.print("文件名"+key.toString()+",");
- System.out.print("文件内容"+value.toString()+",");
- System.out.println("====");
- }
- reader.close();
- }
- }