• Setting up a Spark on YARN environment


    1) Build
    ## download spark-3.2.1.tgz (source release) from http://archive.apache.org/dist/spark/spark-3.2.1/
    ## unpack to ~/work/spark-3.2.1-src
    $ cd ~/work/spark-3.2.1-src
    $ export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g"
    $ dev/make-distribution.sh --name without-hadoop \
        --pip --tgz -Phive -Phive-thriftserver -Phadoop-provided -Pyarn
    $ tar xvf spark-3.2.1-bin-without-hadoop.tgz -C ..
    $ cd ..
    $ mv spark-3.2.1-bin-without-hadoop spark-3.2.1
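
    Optionally, confirm that the -Phadoop-provided build really left the Hadoop client jars out of the distribution (this should print 0; if it does not, re-check the profiles passed to make-distribution.sh):
    $ ls spark-3.2.1/jars | grep -c "^hadoop-"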

    ## configure
    $ cd spark-3.2.1
    $ diff -u conf/spark-env.sh.template conf/spark-env.sh

    --- conf/spark-env.sh.template 2022-06-24 09:16:18.000000000 +0800
    +++ conf/spark-env.sh 2022-06-24 17:52:47.000000000 +0800
    @@ -71,3 +71,7 @@
     # You might get better performance to enable these options if using native BLAS (see SPARK-21305).
     # - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL
     # - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS
    +
    +JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home
    +SPARK_LOCAL_IP=localhost
    +SPARK_DIST_CLASSPATH=`hadoop classpath`
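
    Since the build is "without-hadoop", SPARK_DIST_CLASSPATH is what lets Spark find the Hadoop classes at runtime; `hadoop classpath` simply prints the local Hadoop installation's classpath. Two quick sanity checks (output depends on your Hadoop install):
    $ hadoop classpath | tr ':' '\n' | head -3
    $ bin/spark-submit --version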

    $ diff -u conf/log4j.properties.template conf/log4j.properties

    --- conf/log4j.properties.template 2022-06-24 09:16:18.000000000 +0800
    +++ conf/log4j.properties 2022-06-24 16:28:28.000000000 +0800
    @@ -16,7 +16,7 @@
     #
     # Set everything to be logged to the console
    -log4j.rootCategory=INFO, console
    +log4j.rootCategory=WARN, console
     log4j.appender.console=org.apache.log4j.ConsoleAppender
     log4j.appender.console.target=System.err
     log4j.appender.console.layout=org.apache.log4j.PatternLayout

    ## test (SPARK_DIST_CLASSPATH is supplied by conf/spark-env.sh)
    $ bin/spark-submit \
        --class org.apache.spark.examples.SparkPi \
        examples/jars/spark-examples_2.12-3.2.1.jar 10

    22/06/24 17:53:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    Pi is roughly 3.1387311387311385
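
    Note that without --master, spark-submit defaults to local mode, so this run never touches YARN; it only proves that the Spark build and the Hadoop classpath wiring work. The next section moves the same job onto the cluster with --master yarn.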

    2) Spark on YARN
    ## configure yarn
    $ cd ~/work/hadoop
    $ diff -u etc/hadoop/yarn-site.xml.orig etc/hadoop/yarn-site.xml

    --- etc/hadoop/yarn-site.xml 2022-05-17 09:20:54.000000000 +0800
    +++ /Users/sun_xo/work/hadoop/etc/hadoop/yarn-site.xml 2022-06-23 10:13:52.000000000 +0800
    @@ -29,4 +29,17 @@
         <name>yarn.log-aggregation.retain-seconds</name>
         <value>604800</value>
       </property>
    +  <property>
    +    <name>yarn.log.server.url</name>
    +    <value>http://localhost:19888/jobhistory/logs</value>
    +  </property>
    +  <!-- disable the yarn memory checks -->
    +  <property>
    +    <name>yarn.nodemanager.pmem-check-enabled</name>
    +    <value>false</value>
    +  </property>
    +  <property>
    +    <name>yarn.nodemanager.vmem-check-enabled</name>
    +    <value>false</value>
    +  </property>
     </configuration>
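
    The last two properties stop the NodeManager from killing containers that exceed their physical/virtual memory allowance, which small-memory single-machine setups easily trip over. A gentler alternative keeps the check but loosens the virtual-to-physical ratio; a sketch using the standard YARN property (the value 4 here is an illustrative choice; the default is 2.1):

      <property>
        <name>yarn.nodemanager.vmem-pmem-ratio</name>
        <value>4</value>
      </property>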

    ## configure spark
    $ cd ~/work/spark-3.2.1/conf
    $ diff -u spark-env.sh.template spark-env.sh

    --- spark-env.sh.template 2022-06-24 09:16:18.000000000 +0800
    +++ spark-env.sh 2022-06-24 18:49:42.000000000 +0800
    @@ -71,3 +71,10 @@
     # You might get better performance to enable these options if using native BLAS (see SPARK-21305).
     # - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL
     # - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS
    +
    +JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home
    +SPARK_LOCAL_IP=localhost
    +SPARK_DIST_CLASSPATH=`hadoop classpath`
    +HADOOP_CONF_DIR=~/work/hadoop/etc/hadoop
    +YARN_CONF_DIR=$HADOOP_CONF_DIR
    +SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=hdfs://localhost:9000/user/spark/logs/ -Dspark.history.fs.cleaner.enabled=true"
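
    Note that the spark.history.fs.logDirectory passed via SPARK_HISTORY_OPTS must match spark.eventLog.dir in spark-defaults.conf below; the HistoryServer reads exactly the event logs that running applications write there.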

    $ diff -u spark-defaults.conf.template spark-defaults.conf

    --- spark-defaults.conf.template 2022-06-24 09:16:18.000000000 +0800
    +++ spark-defaults.conf 2022-06-24 16:19:02.000000000 +0800
    @@ -25,3 +25,8 @@
     # spark.serializer                 org.apache.spark.serializer.KryoSerializer
     # spark.driver.memory              5g
     # spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
    +
    +spark.eventLog.enabled             true
    +spark.eventLog.dir                 hdfs://localhost:9000/user/spark/logs
    +spark.yarn.historyServer.address   localhost:18080
    +spark.yarn.jars                    hdfs://localhost:9000/user/spark/jars/*

    ## create dirs and upload spark jars to HDFS
    $ hdfs dfs -mkdir -p /user/spark
    $ hdfs dfs -put jars /user/spark
    $ hdfs dfs -mkdir -p /user/spark/logs
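
    With spark.yarn.jars pointing at this HDFS directory, spark-submit reuses the uploaded jars instead of packaging and uploading the local $SPARK_HOME/jars on every submission (without it, Spark logs a warning about falling back to uploading libraries under SPARK_HOME). A quick check that the upload landed (the jar list will match your build):
    $ hdfs dfs -ls /user/spark/jars | head -3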

    ## restart yarn with JobHistoryServer and spark HistoryServer
    $ start-yarn.sh
    $ mr-jobhistory-daemon.sh start historyserver
    $ cd ~/work/spark-3.2.1
    $ sbin/start-history-server.sh
    $ jps

    5696 SecondaryNameNode
    5955 JobHistoryServer
    5509 NameNode
    5813 ResourceManager
    5899 NodeManager
    6683 HistoryServer
    5597 DataNode
    6702 Jps
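
    With all daemons up, the three web UIs configured above should respond; a minimal reachability check (expect 200, or a redirect code such as 302):
    $ curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8088/     # YARN ResourceManager
    $ curl -s -o /dev/null -w "%{http_code}\n" http://localhost:19888/    # MapReduce JobHistoryServer
    $ curl -s -o /dev/null -w "%{http_code}\n" http://localhost:18080/    # Spark HistoryServer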

    ## test
    $ cat test.sh

    #!/bin/sh
    run() {
        bin/spark-submit \
            --master yarn \
            --deploy-mode cluster \
            --driver-memory 512m \
            --executor-memory 512m \
            --num-executors 1 \
            --class org.apache.spark.examples.SparkPi \
            examples/jars/spark-examples_2.12-3.2.1.jar 10
    }

    ## main ##
    run
    ## the ResourceManager log contains lines ending in "APPID=application_...";
    ## take the last one and strip everything up to "APPID="
    appid=`grep "APPID" $HADOOP_HOME/logs/yarn*.log | tail -1 | awk '{print $NF}'`
    appid=${appid#*APPID=}
    echo $appid
    $HADOOP_HOME/bin/yarn logs -applicationId $appid

    $ sh test.sh

    22/06/25 09:42:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    application_1656115668743_0003
    22/06/25 09:43:05 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    22/06/25 09:43:06 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
    Container: container_1656115668743_0003_01_000001 on 192.168.124.7_52592
    ==========================================================================
    LogType:stderr
    Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
    LogLength:379
    Log Contents:
    22/06/25 09:42:54 WARN Utils: Your hostname, sun-xo.local resolves to a loopback address: 127.0.0.1; using 192.168.124.7 instead (on interface en0)
    22/06/25 09:42:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
    22/06/25 09:42:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    End of LogType:stderr
    LogType:stdout
    Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
    LogLength:33
    Log Contents:
    Pi is roughly 3.1423911423911424
    End of LogType:stdout
    Container: container_1656115668743_0003_01_000002 on 192.168.124.7_52592
    ==========================================================================
    LogType:stderr
    Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
    LogLength:379
    Log Contents:
    22/06/25 09:43:00 WARN Utils: Your hostname, sun-xo.local resolves to a loopback address: 127.0.0.1; using 192.168.124.7 instead (on interface en0)
    22/06/25 09:43:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
    22/06/25 09:43:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    End of LogType:stderr
    LogType:stdout
    Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
    LogLength:0
    Log Contents:
    End of LogType:stdout

    So the actual output of the program is "Pi is roughly 3.1423911423911424";
    you can also see the same result via http://localhost:8088/cluster -> appid -> logs.
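
    If you prefer not to scrape the ResourceManager log for the application id, a hedged alternative is to ask YARN directly; this sketch assumes the example registers under its default application name "Spark Pi":
    $ appid=`$HADOOP_HOME/bin/yarn application -list -appStates FINISHED 2>/dev/null \
        | grep "Spark Pi" | tail -1 | awk '{print $1}'`
    $ $HADOOP_HOME/bin/yarn logs -applicationId $appid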

    Reference: Overview - Spark 3.2.1 Documentation (https://spark.apache.org/docs/3.2.1/)

• Original article: https://blog.csdn.net/sun_xo/article/details/125439797