Data Lakehouse (Part 5): Installing Spark


    Upload the installation package to the /opt/software directory and extract it

    [bigdata@node106 software]$ tar -zxvf spark-3.3.1-bin-hadoop3.tgz -C /opt/services/ 

    Rename the extracted directory

    [bigdata@node106 services]$ mv spark-3.3.1-bin-hadoop3 spark-3.3.1 

    Configure environment variables

    [bigdata@node106 ~]$ sudo vim /etc/profile.d/bigdata_env.sh
    export SPARK_HOME=/opt/services/spark-3.3.1
    export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$ZK_HOME/bin:$KAFKA_HOME/bin:$SEA_HOME/bin:$HIVE_HOME/bin:$SPARK_HOME/bin

    Distribute the environment variable file

    [bigdata@node106 bin]$ sudo ./xsync /etc/profile.d/bigdata_env.sh

    Reload the environment variables (run on all 5 machines)

    [bigdata@node106 ~]$ source /etc/profile
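
    A quick sanity check that the variables took effect; the first command should print /opt/services/spark-3.3.1 and the second the Spark 3.3.1 version banner:

    [bigdata@node106 ~]$ echo $SPARK_HOME
    [bigdata@node106 ~]$ spark-submit --version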

    Configure spark-env.sh

    HADOOP_CONF_DIR=/opt/services/hadoop-3.3.5/etc/hadoop
    YARN_CONF_DIR=/opt/services/hadoop-3.3.5/etc/hadoop
    export SPARK_DIST_CLASSPATH=$(hadoop classpath)
    export SPARK_HISTORY_OPTS="
    -Dspark.history.ui.port=18080
    -Dspark.history.fs.logDirectory=hdfs://mycluster:8020/spark-history
    -Dspark.history.retainedApplications=30"
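
    SPARK_DIST_CLASSPATH points Spark at the cluster's own Hadoop jars instead of bundled copies; you can preview what $(hadoop classpath) expands to before relying on it:

    [bigdata@node106 conf]$ hadoop classpath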

    Configure spark-defaults.conf

    spark.master yarn
    spark.eventLog.enabled true
    spark.eventLog.dir hdfs://mycluster:8020/spark-history
    spark.serializer org.apache.spark.serializer.KryoSerializer
    spark.yarn.archive hdfs://mycluster:8020/spark-archive/spark-archive.zip
    spark.sql.warehouse.dir hdfs://mycluster:8020/user/hudi/warehouse
    spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
    spark.sql.catalog.spark_catalog org.apache.spark.sql.hudi.catalog.HoodieCatalog
    spark.kryo.registrator org.apache.spark.HoodieSparkKryoRegistrar
    spark.hadoop.yarn.timeline-service.enabled false
    spark.executor.cores 4
    spark.executor.memory 3g
    spark.executor.memoryOverhead 1g
    spark.driver.memory 2g
    spark.driver.memoryOverhead 1g
    # Enable dynamic allocation
    spark.dynamicAllocation.enabled true
    # Enable the external Spark shuffle service (required by dynamic allocation)
    spark.shuffle.service.enabled true
    # Initial number of executors
    spark.dynamicAllocation.initialExecutors 2
    # Minimum number of executors
    spark.dynamicAllocation.minExecutors 2
    # Maximum number of executors
    spark.dynamicAllocation.maxExecutors 4
    # Idle timeout: an executor idle longer than this is released
    spark.dynamicAllocation.executorIdleTimeout 60s
    # Backlog timeout: if tasks queue longer than this, new executors are requested
    spark.dynamicAllocation.schedulerBacklogTimeout 1s
    spark.yarn.queue hive
    spark.yarn.historyServer.address node106:18080
    spark.history.ui.port 18080
    spark.history.fs.logDirectory hdfs://mycluster:8020/spark-history
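
    Note that the spark.dynamicAllocation.* settings only take effect once the external shuffle service is registered with YARN (see the spark-3.3.1-yarn-shuffle.jar step below). After the remaining steps are complete, one way to confirm a property was picked up (a sketch; any key from the file above works):

    [bigdata@node106 conf]$ spark-sql -e "SET spark.dynamicAllocation.enabled;"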

    Create the event log directory

    [bigdata@node106 conf]$ hdfs dfs -mkdir /spark-history

    Create the directory for runtime dependencies

    [bigdata@node106 conf]$ hdfs dfs -mkdir /spark-archive

    Copy in the MySQL driver and the Hudi bundle jar

    [bigdata@node106 software]$ cp mysql-connector-java-8.0.18.jar /opt/services/spark-3.3.1/jars/
    [bigdata@node106 software]$ cp hudi-spark3.3-bundle_2.12-0.14.1.jar /opt/services/spark-3.3.1/jars/
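
    A quick check that both jars landed:

    [bigdata@node106 software]$ ls /opt/services/spark-3.3.1/jars | grep -E 'hudi|mysql'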

    Zip the jars and upload the archive to HDFS

    [bigdata@node106 jars]$ zip spark-archive.zip ./* 
    [bigdata@node106 jars]$ hdfs dfs -put ./spark-archive.zip /spark-archive
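
    spark.yarn.archive in spark-defaults.conf points at exactly this path, so verify the upload:

    [bigdata@node106 jars]$ hdfs dfs -ls /spark-archive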

    Copy spark-3.3.1-yarn-shuffle.jar into YARN's lib directory

    [bigdata@node106 conf]$ cp $SPARK_HOME/yarn/spark-3.3.1-yarn-shuffle.jar  /opt/services/hadoop-3.3.5/share/hadoop/yarn/lib/
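
    Copying the jar only stages the class; the shuffle service that spark.shuffle.service.enabled relies on must also be registered in yarn-site.xml on every NodeManager, followed by a NodeManager restart. A minimal sketch using the standard Spark-on-YARN property names — the existing mapreduce_shuffle entry is an assumption, keep whatever aux-services your cluster already lists:

    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle,spark_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
        <value>org.apache.spark.network.yarn.YarnShuffleService</value>
    </property>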

    Configure logging

    [bigdata@node106 conf]$ cp log4j2.properties.template log4j2.properties
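
    The template defaults the root logger to info, which is noisy on an interactive console; an optional one-line tweak in log4j2.properties (adjust to taste):

    rootLogger.level = warn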

    Upload hive-site.xml to the conf directory, configuring the Hudi warehouse directory and Spark's Thrift Server (HiveServer2) settings

    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hudi/warehouse</value>
    </property>
    <property>
        <name>hive.server2.thrift.port</name>
        <value>10001</value>
    </property>
    <property>
        <name>hive.server2.thrift.bind.host</name>
        <value>node106</value>
    </property>

    Write the spark.sh script

    [bigdata@node106 bin]$ vim spark.sh
    #!/bin/bash
    if [ $# -lt 1 ]; then
        echo "No Args Input...[start,stop]"
        exit 1
    fi
    case $1 in
    "start")
        echo "==================== starting history server ===================="
        ssh node106 "$SPARK_HOME/sbin/start-history-server.sh"
        echo "==================== starting thrift server ===================="
        ssh node106 "$SPARK_HOME/sbin/start-thriftserver.sh --master yarn"
    ;;
    "stop")
        echo "==================== stopping history server ===================="
        ssh node106 "$SPARK_HOME/sbin/stop-history-server.sh"
        echo "==================== stopping thrift server ===================="
        ssh node106 "$SPARK_HOME/sbin/stop-thriftserver.sh"
    ;;
    *)
        echo "Input Args Error...[start,stop]"
    ;;
    esac

    Make the script executable

    [bigdata@node106 bin]$ chmod +x spark.sh

    Distribute it to the other machines

    [bigdata@node106 bin]$ xsync  spark.sh 

    Copy the Spark directory to the other machines

    [bigdata@node107 bin]$ scp -r bigdata@node106:/opt/services/spark-3.3.1/ /opt/services/spark-3.3.1/
    [bigdata@node108 bin]$ scp -r bigdata@node106:/opt/services/spark-3.3.1/ /opt/services/spark-3.3.1/

    Start Spark

    [bigdata@node106 bin]$ spark.sh start 
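
    To verify, the history server UI should be reachable at http://node106:18080, and the Thrift Server accepts JDBC connections on the host and port set in hive-site.xml above. A beeline smoke test (the bigdata user and the hudi_smoke table name are placeholders; the create statement also exercises the Hudi extension configured in spark-defaults.conf):

    [bigdata@node106 bin]$ beeline -u jdbc:hive2://node106:10001 -n bigdata
    0: jdbc:hive2://node106:10001> show databases;
    0: jdbc:hive2://node106:10001> create table hudi_smoke (id int, name string, ts bigint) using hudi tblproperties (primaryKey = 'id', preCombineField = 'ts');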
