1) Build
## download spark-3.2.1.tgz from http://archive.apache.org/dist/
## unpack to ~/work/spark-3.2.1-src
$ cd ~/work/spark-3.2.1-src
$ export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g"
$ dev/make-distribution.sh --name without-hadoop \
--pip --tgz -Phive -Phive-thriftserver -Phadoop-provided -Pyarn
$ tar xvf spark-3.2.1-bin-without-hadoop.tgz -C ..
$ cd ..
$ mv spark-3.2.1-bin-without-hadoop spark-3.2.1
## configure
$ cd spark-3.2.1
$ diff -u conf/spark-env.sh.template conf/spark-env.sh
- --- conf/spark-env.sh.template 2022-06-24 09:16:18.000000000 +0800
- +++ conf/spark-env.sh 2022-06-24 17:52:47.000000000 +0800
- @@ -71,3 +71,7 @@
- # You might get better performance to enable these options if using native BLAS (see SPARK-21305).
- # - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL
- # - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS
- +
- +JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home
- +SPARK_LOCAL_IP=localhost
- +SPARK_DIST_CLASSPATH=`hadoop classpath`
$ diff -u conf/log4j.properties.template conf/log4j.properties
- --- conf/log4j.properties.template 2022-06-24 09:16:18.000000000 +0800
- +++ conf/log4j.properties 2022-06-24 16:28:28.000000000 +0800
- @@ -16,7 +16,7 @@
- #
-
- # Set everything to be logged to the console
- -log4j.rootCategory=INFO, console
- +log4j.rootCategory=WARN, console
- log4j.appender.console=org.apache.log4j.ConsoleAppender
- log4j.appender.console.target=System.err
- log4j.appender.console.layout=org.apache.log4j.PatternLayout
## test
$ SPARK_DIST_CLASSPATH=$(hadoop classpath) bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
examples/jars/spark-examples_2.12-3.2.1.jar 10
- 22/06/24 17:53:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
- Pi is roughly 3.1387311387311385
2) Spark on yarn
## configure yarn
$ cd ~/work/hadoop
$ diff -u etc/hadoop/yarn-site.xml.orig etc/hadoop/yarn-site.xml
- --- etc/hadoop/yarn-site.xml 2022-05-17 09:20:54.000000000 +0800
- +++ /Users/sun_xo/work/hadoop/etc/hadoop/yarn-site.xml 2022-06-23 10:13:52.000000000 +0800
- @@ -29,4 +29,17 @@
- <name>yarn.log-aggregation.retain-seconds</name>
- <value>604800</value>
- </property>
- + <property>
- + <name>yarn.log.server.url</name>
- + <value>http://localhost:19888/jobhistory/logs</value>
- + </property>
- + <!-- close yarn memory check -->
- + <property>
- + <name>yarn.nodemanager.pmem-check-enabled</name>
- + <value>false</value>
- + </property>
- + <property>
- + <name>yarn.nodemanager.vmem-check-enabled</name>
- + <value>false</value>
- + </property>
- </configuration>
## configure spark
$ diff -u spark-env.sh.template spark-env.sh
- --- spark-env.sh.template 2022-06-24 09:16:18.000000000 +0800
- +++ spark-env.sh 2022-06-24 18:49:42.000000000 +0800
- @@ -71,3 +71,10 @@
- # You might get better performance to enable these options if using native BLAS (see SPARK-21305).
- # - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL
- # - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS
- +
- +JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home
- +SPARK_LOCAL_IP=localhost
- +SPARK_DIST_CLASSPATH=`hadoop classpath`
- +HADOOP_CONF_DIR=~/work/hadoop/etc/hadoop
- +YARN_CONF_DIR=$HADOOP_CONF_DIR
- +SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=hdfs://localhost:9000/user/spark/logs/ -Dspark.history.fs.cleaner.enabled=true"
$ diff -u spark-defaults.conf.template spark-defaults.conf
- --- spark-defaults.conf.template 2022-06-24 09:16:18.000000000 +0800
- +++ spark-defaults.conf 2022-06-24 16:19:02.000000000 +0800
- @@ -25,3 +25,8 @@
- # spark.serializer org.apache.spark.serializer.KryoSerializer
- # spark.driver.memory 5g
- # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
- +
- +spark.eventLog.enabled true
- +spark.eventLog.dir hdfs://localhost:9000/user/spark/logs
- +spark.yarn.historyServer.address localhost:18080
- +spark.yarn.jars hdfs://localhost:9000/user/spark/jars/*
## create dirs and upload spark jars to HDFS
$ hdfs dfs -mkdir -p /user/spark
$ hdfs dfs -put jars /user/spark
$ hdfs dfs -mkdir -p /user/spark/logs
## restart yarn with JobHistoryServer and spark HistoryServer
$ start-yarn.sh
$ mr-jobhistory-daemon.sh start historyserver
$ sbin/start-history-server.sh
$ jps
- 5696 SecondaryNameNode
- 5955 JobHistoryServer
- 5509 NameNode
- 5813 ResourceManager
- 5899 NodeManager
- 6683 HistoryServer
- 5597 DataNode
- 6702 Jps
## test
$ cat test.sh
- #!/bin/sh
- # Submit the SparkPi example to YARN in cluster mode, then fetch the
- # aggregated container logs for the application that was just submitted.
-
- run() {
-     bin/spark-submit \
-         --master yarn \
-         --deploy-mode cluster \
-         --driver-memory 512m \
-         --executor-memory 512m \
-         --num-executors 1 \
-         --class org.apache.spark.examples.SparkPi \
-         examples/jars/spark-examples_2.12-3.2.1.jar 10
- }
-
- ## main ##
- run
- # Last field of the newest "APPID" line in the RM log, e.g. "APPID=application_...".
- # NB: original had a typo — awk 'pirnt $NF' — which must be '{print $NF}'.
- appid=$(grep "APPID" "$HADOOP_HOME"/logs/yarn*.log | tail -1 | awk '{print $NF}')
- appid=${appid#*APPID=}              # strip the "APPID=" prefix, keep the id
- echo "$appid"
- "$HADOOP_HOME"/bin/yarn logs -applicationId "$appid"
$ ./test.sh
- 22/06/25 09:42:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
- application_1656115668743_0003
- 22/06/25 09:43:05 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
- 22/06/25 09:43:06 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
-
-
- Container: container_1656115668743_0003_01_000001 on 192.168.124.7_52592
- ==========================================================================
- LogType:stderr
- Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
- LogLength:379
- Log Contents:
- 22/06/25 09:42:54 WARN Utils: Your hostname, sun-xo.local resolves to a loopback address: 127.0.0.1; using 192.168.124.7 instead (on interface en0)
- 22/06/25 09:42:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
- 22/06/25 09:42:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
- End of LogType:stderr
-
- LogType:stdout
- Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
- LogLength:33
- Log Contents:
- Pi is roughly 3.1423911423911424
- End of LogType:stdout
-
-
-
- Container: container_1656115668743_0003_01_000002 on 192.168.124.7_52592
- ==========================================================================
- LogType:stderr
- Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
- LogLength:379
- Log Contents:
- 22/06/25 09:43:00 WARN Utils: Your hostname, sun-xo.local resolves to a loopback address: 127.0.0.1; using 192.168.124.7 instead (on interface en0)
- 22/06/25 09:43:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
- 22/06/25 09:43:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
- End of LogType:stderr
-
- LogType:stdout
- Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
- LogLength:0
- Log Contents:
- End of LogType:stdout
Actually the output of program is "Pi is roughly 3.1423911423911424"
or you can see same result from http://localhost:8088/cluster -> appid -> logs
Reference: "Overview" — Spark 3.2.1 Documentation (https://spark.apache.org/docs/3.2.1/)