Upload the installation package to the /opt/software directory and extract it
[bigdata@node106 software]$ tar -zxvf spark-3.3.1-bin-hadoop3.tgz -C /opt/services/
Rename the extracted directory
[bigdata@node106 services]$ mv spark-3.3.1-bin-hadoop3 spark-3.3.1
Configure the environment variables
[bigdata@node106 ~]$ sudo vim /etc/profile.d/bigdata_env.sh
- export SPARK_HOME=/opt/services/spark-3.3.1
- export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$ZK_HOME/bin:$KAFKA_HOME/bin:$SEA_HOME/bin:$HIVE_HOME/bin:$SPARK_HOME/bin
Distribute the environment variable file
[bigdata@node106 bin]$ sudo ./xsync /etc/profile.d/bigdata_env.sh
Reload the environment variables; run this on all 5 machines
[bigdata@node106 ~]$ source /etc/profile
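Optionally, verify that the new variables are visible and that spark-submit now resolves from the PATH:
[bigdata@node106 ~]$ echo $SPARK_HOME
[bigdata@node106 ~]$ spark-submit --version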
Configure spark-env.sh
- HADOOP_CONF_DIR=/opt/services/hadoop-3.3.5/etc/hadoop
- YARN_CONF_DIR=/opt/services/hadoop-3.3.5/etc/hadoop
- export SPARK_DIST_CLASSPATH=$(hadoop classpath)
- export SPARK_HISTORY_OPTS="
- -Dspark.history.ui.port=18080
- -Dspark.history.fs.logDirectory=hdfs://mycluster:8020/spark-history
- -Dspark.history.retainedApplications=30"
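SPARK_DIST_CLASSPATH is filled from the output of hadoop classpath, so it is worth confirming that the command resolves correctly on this node before moving on:
[bigdata@node106 conf]$ hadoop classpath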
Configure spark-defaults.conf
- spark.master yarn
- spark.eventLog.enabled true
- spark.eventLog.dir hdfs://mycluster:8020/spark-history
- spark.serializer org.apache.spark.serializer.KryoSerializer
- spark.yarn.archive hdfs://mycluster:8020/spark-archive/spark-archive.zip
- spark.sql.warehouse.dir hdfs://mycluster:8020/user/hudi/warehouse
- spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
- spark.sql.catalog.spark_catalog org.apache.spark.sql.hudi.catalog.HoodieCatalog
- spark.kryo.registrator org.apache.spark.HoodieSparkKryoRegistrar
- spark.hadoop.yarn.timeline-service.enabled false
- spark.executor.cores 4
- spark.executor.memory 3g
- spark.executor.memoryOverhead 1g
- spark.driver.memory 2g
- spark.driver.memoryOverhead 1g
- # Enable dynamic allocation
- spark.dynamicAllocation.enabled true
- # Enable the external Spark shuffle service
- spark.shuffle.service.enabled true
- # Initial number of executors
- spark.dynamicAllocation.initialExecutors 2
- # Minimum number of executors
- spark.dynamicAllocation.minExecutors 2
- # Maximum number of executors
- spark.dynamicAllocation.maxExecutors 4
- # Executor idle timeout; an executor that stays idle longer than this is released
- spark.dynamicAllocation.executorIdleTimeout 60s
- # Backlog wait time; if tasks have been pending longer than this, additional executors are requested
- spark.dynamicAllocation.schedulerBacklogTimeout 1s
- spark.yarn.queue hive
- spark.yarn.historyServer.address node106:18080
- spark.history.ui.port 18080
- spark.history.fs.logDirectory hdfs://mycluster:8020/spark-history
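These entries are cluster-wide defaults; once the remaining setup below is complete, any of them can still be overridden per job with --conf at submit time. A purely illustrative example (the resource values are made up for demonstration):
[bigdata@node106 conf]$ spark-submit --conf spark.executor.memory=4g \
--conf spark.dynamicAllocation.maxExecutors=6 \
--class org.apache.spark.examples.SparkPi \
$SPARK_HOME/examples/jars/spark-examples_2.12-3.3.1.jar 100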
Create the event log directory on HDFS
[bigdata@node106 conf]$ hdfs dfs -mkdir /spark-history
Create the directory for the runtime dependency archive
[bigdata@node106 conf]$ hdfs dfs -mkdir /spark-archive
Copy the MySQL driver and the Hudi bundle jar into Spark's jars directory
- [bigdata@node106 software]$ cp mysql-connector-java-8.0.18.jar /opt/services/spark-3.3.1/jars/
- [bigdata@node106 software]$ cp hudi-spark3.3-bundle_2.12-0.14.1.jar /opt/services/spark-3.3.1/jars/
Zip the jars and upload the archive to HDFS
[bigdata@node106 jars]$ zip spark-archive.zip ./*
[bigdata@node106 jars]$ hdfs dfs -put ./spark-archive.zip /spark-archive
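Optionally confirm that the archive landed where spark.yarn.archive expects it:
[bigdata@node106 jars]$ hdfs dfs -ls /spark-archive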
Copy spark-3.3.1-yarn-shuffle.jar into Hadoop's YARN lib directory
[bigdata@node106 conf]$ cp $SPARK_HOME/yarn/spark-3.3.1-yarn-shuffle.jar /opt/services/hadoop-3.3.5/share/hadoop/yarn/lib/
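Copying the jar only makes the class available; for spark.shuffle.service.enabled to take effect, the NodeManagers also have to load it as an auxiliary service. Assuming yarn-site.xml has not already been set up for this, the relevant entries look like the following (apply them on every NodeManager and restart YARN afterwards):
- <property>
-   <name>yarn.nodemanager.aux-services</name>
-   <value>mapreduce_shuffle,spark_shuffle</value>
- </property>
- <property>
-   <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
-   <value>org.apache.spark.network.yarn.YarnShuffleService</value>
- </property>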
Set up the logging configuration
[bigdata@node106 conf]$ cp log4j2.properties.template log4j2.properties
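The copied template works as-is; if the driver console output is too noisy, the root logger level in log4j2.properties can optionally be lowered:
- rootLogger.level = warn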
Upload hive-site.xml to the conf directory and configure the Hudi warehouse directory and the Spark Thrift Server (HiveServer2)
- <property>
-   <name>hive.metastore.warehouse.dir</name>
-   <value>/user/hudi/warehouse</value>
- </property>
- <property>
-   <name>hive.server2.thrift.port</name>
-   <value>10001</value>
- </property>
- <property>
-   <name>hive.server2.thrift.bind.host</name>
-   <value>node106</value>
- </property>
Write the spark.sh script
[bigdata@node106 bin]$ vim spark.sh
- #!/bin/bash
-
- if [ $# -lt 1 ]
- then
-     echo "No args given... [start|stop]"
-     exit
- fi
- case $1 in
- "start")
-     echo ==================== starting history server ====================
-     ssh node106 "$SPARK_HOME/sbin/start-history-server.sh"
-     echo ==================== starting thrift server (server2) ====================
-     ssh node106 "$SPARK_HOME/sbin/start-thriftserver.sh --master yarn"
- ;;
- "stop")
-     echo ==================== stopping history server ====================
-     ssh node106 "$SPARK_HOME/sbin/stop-history-server.sh"
-     echo ==================== stopping thrift server (server2) ====================
-     ssh node106 "$SPARK_HOME/sbin/stop-thriftserver.sh"
- ;;
- *)
-     echo "Invalid argument... [start|stop]"
- ;;
- esac
Make the script executable
[bigdata@node106 bin]$ chmod +x spark.sh
Distribute it to the other machines
[bigdata@node106 bin]$ xsync spark.sh
Copy the Spark installation to the other machines
- [bigdata@node107 bin]$ scp -r bigdata@node106:/opt/services/spark-3.3.1/ /opt/services/spark-3.3.1/
- [bigdata@node108 bin]$ scp -r bigdata@node106:/opt/services/spark-3.3.1/ /opt/services/spark-3.3.1/
Start Spark
[bigdata@node106 bin]$ spark.sh start
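To confirm everything came up, check the local daemons (the history server shows up in jps as HistoryServer, the Thrift Server as SparkSubmit) and try a JDBC connection; the URL below assumes the host node106 and port 10001 configured in hive-site.xml above, and the history UI should be reachable at http://node106:18080:
[bigdata@node106 bin]$ jps | grep -E 'HistoryServer|SparkSubmit'
[bigdata@node106 bin]$ beeline -u jdbc:hive2://node106:10001 -n bigdata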