• Quickly set up a Spark cluster with Docker Compose


    When learning big data on your own, setting up the environment is always the most painful part. Because these systems are distributed, hobbyists usually have to simulate a distributed environment on a single machine. The traditional approach is to use virtual machines, but creating several VMs is resource-hungry, slow, and tedious. I therefore recommend using Docker to build a big-data environment.

    Prerequisites: Docker and Docker Compose are already installed.
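
    A quick sanity check that both tools are available before going further:

    docker --version
    docker-compose --version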

    1. Create the Docker Compose files

    In any folder, create the following two files:

    1. docker-compose.yml, which describes the containers we want to create. I use Hadoop 3.2.1 and Spark 3.2.1 here; the published ports include the NameNode web UI (9870), the HDFS RPC port (9000), the Spark master web UI (8080), and the Spark master port (7077), so the Spark master URL is spark://master:7077:
    version: "3.3"
    services:
      namenode:
        image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
        container_name: namenode
        ports:
          - 9870:9870
          - 9000:9000
        volumes:
          - ./hadoop/dfs/name:/hadoop/dfs/name
          - ./input:/input
        environment:
          - CLUSTER_NAME=test
        env_file:
          - ./hadoop.env
    
      datanode:
        image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
        container_name: datanode
        depends_on:
          - namenode
        volumes:
          - ./hadoop/dfs/data:/hadoop/dfs/data
        environment:
          SERVICE_PRECONDITION: "namenode:9870"
        env_file:
          - ./hadoop.env
      
      resourcemanager:
        image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
        container_name: resourcemanager
        environment:
          SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864"
        env_file:
          - ./hadoop.env
    
      nodemanager1:
        image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
        container_name: nodemanager
        environment:
          SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088"
        env_file:
          - ./hadoop.env
      
      historyserver:
        image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8
        container_name: historyserver
        environment:
          SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088"
        volumes:
          - ./hadoop/yarn/timeline:/hadoop/yarn/timeline
        env_file:
          - ./hadoop.env
        
      master:
        image: bitnami/spark:3.2.1
        container_name: master
        user: root
        environment:
          - SPARK_MODE=master
          - SPARK_RPC_AUTHENTICATION_ENABLED=no
          - SPARK_RPC_ENCRYPTION_ENABLED=no
          - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
          - SPARK_SSL_ENABLED=no
        ports:
          - '8080:8080'
          - '7077:7077'
        volumes:
          - ./python:/python
    
      worker1:
        image: bitnami/spark:3.2.1
        container_name: worker1
        user: root
        environment:
          - SPARK_MODE=worker
          - SPARK_MASTER_URL=spark://master:7077
          - SPARK_WORKER_MEMORY=1G
          - SPARK_WORKER_CORES=1
          - SPARK_RPC_AUTHENTICATION_ENABLED=no
          - SPARK_RPC_ENCRYPTION_ENABLED=no
          - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
          - SPARK_SSL_ENABLED=no
      worker2:
        image: bitnami/spark:3.2.1
        container_name: worker2
        user: root
        environment:
          - SPARK_MODE=worker
          - SPARK_MASTER_URL=spark://master:7077
          - SPARK_WORKER_MEMORY=1G
          - SPARK_WORKER_CORES=1
          - SPARK_RPC_AUTHENTICATION_ENABLED=no
          - SPARK_RPC_ENCRYPTION_ENABLED=no
          - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
          - SPARK_SSL_ENABLED=no
    
    2. hadoop.env, which the services above reference through env_file to supply their Hadoop configuration (see the note on the variable-naming convention after the listing):
    CORE_CONF_fs_defaultFS=hdfs://namenode:9000
    CORE_CONF_hadoop_http_staticuser_user=root
    CORE_CONF_hadoop_proxyuser_hue_hosts=*
    CORE_CONF_hadoop_proxyuser_hue_groups=*
    CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec
    
    HDFS_CONF_dfs_webhdfs_enabled=true
    HDFS_CONF_dfs_permissions_enabled=false
    HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
    
    YARN_CONF_yarn_log___aggregation___enable=true
    YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
    YARN_CONF_yarn_resourcemanager_recovery_enabled=true
    YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
    YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
    YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192
    YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4
    YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
    YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
    YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
    YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
    YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
    YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
    YARN_CONF_yarn_timeline___service_enabled=true
    YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
    YARN_CONF_yarn_timeline___service_hostname=historyserver
    YARN_CONF_mapreduce_map_output_compress=true
    YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec
    YARN_CONF_yarn_nodemanager_resource_memory___mb=16384
    YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8
    YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5
    YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
    YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle
    
    MAPRED_CONF_mapreduce_framework_name=yarn
    MAPRED_CONF_mapred_child_java_opts=-Xmx4096m
    MAPRED_CONF_mapreduce_map_memory_mb=4096
    MAPRED_CONF_mapreduce_reduce_memory_mb=8192
    MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m
    MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m
    MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
    MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
    MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
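
    A note on the variable names above: in the bde2020 images, an entrypoint script converts these environment variables into Hadoop's XML configuration files. The rules sketched below follow those images' convention, so treat the details as an assumption if you swap in different images:

    # The prefix selects the target file:
    #   CORE_CONF_*  -> core-site.xml       HDFS_CONF_*   -> hdfs-site.xml
    #   YARN_CONF_*  -> yarn-site.xml       MAPRED_CONF_* -> mapred-site.xml
    # In the property name, "___" becomes "-", "__" becomes "_", and "_" becomes ".":
    #   CORE_CONF_fs_defaultFS                    -> fs.defaultFS
    #   YARN_CONF_yarn_log___aggregation___enable -> yarn.log-aggregation-enable
    # Once the cluster is up (section 2), you can inspect a generated file
    # (the images set HADOOP_CONF_DIR=/etc/hadoop):
    docker exec namenode cat /etc/hadoop/core-site.xml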
    
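    With both files in place, you can let Compose validate them before starting anything; this prints the fully resolved configuration, or an error if either file is malformed:

    docker-compose config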

    2. Start the containers

    Run the following in that folder:

    docker-compose up -d
    

    Wait for Docker to pull the images. Once the containers are up, you can check their status:

    (screenshot: the list of running containers)
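
    The same check from the command line; every service defined in docker-compose.yml should be listed with an "Up" state:

    docker-compose ps

    Since the compose file also mounts ./input into the namenode container, this is a good moment to load test data into HDFS. A minimal sketch, where input.txt is a hypothetical file you placed in ./input on the host:

    docker exec namenode hdfs dfs -mkdir -p /input
    docker exec namenode hdfs dfs -put /input/input.txt /input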

    3. Run a Spark job

    Enter the Spark master container (docker-compose.yml sets container_name: master, so we can address it by name instead of by container id):

    sudo docker exec -it master /bin/bash
    

    Run the bundled Spark example program:

    ./bin/spark-submit --master spark://master:7077  --executor-memory 1G --executor-cores 1 --class org.apache.spark.examples.SparkPi ./examples/jars/spark-examples_2.12-3.2.1.jar 1000
    
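    The job should finish quickly, and the driver prints an estimate such as "Pi is roughly 3.14...". Because docker-compose.yml mounts ./python into the master container at /python, you can submit your own PySpark scripts the same way; a minimal sketch, where pi.py is a hypothetical script placed in ./python on the host:

    ./bin/spark-submit --master spark://master:7077 /python/pi.py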

    Once the job has started, open localhost:8080 in a browser to watch it run (this is really port 8080 of the master container, mapped onto the host machine).
    (screenshot: the Spark master web UI showing the running application)

    Closing thought: virtualization really is a wonderful thing. I still regret not going to Zhejiang University to work on virtualization.

  • Original article: https://blog.csdn.net/cobracanary/article/details/126246695