/* SimpleApp.java */
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
/**
* 计算文件中分别包含a和b的行数
*/
public class SimpleApp {
public static void main(String[] args) {
/*
local 本地单线程
local[K] 本地多线程(指定K个内核)
local[*] 本地多线程(指定所有可用内核)
spark://HOST:PORT 连接到指定的 Spark standalone cluster master,需要指定端口。
mesos://HOST:PORT 连接到指定的 Mesos 集群,需要指定端口。
yarn-client客户端模式 连接到 YARN 集群。需要配置 HADOOP_CONF_DIR。
yarn-cluster集群模式 连接到 YARN 集群。需要配置 HADOOP_CONF_DIR。
*/
//本地运行要设置spark.master为local或在VM options中输入“-Dspark.master=local”
System.setProperty("spark.master", "local");
String logFile = "D:/git/sparkDemo/pom.xml"; // Should be some file on your system
SparkSession spark = SparkSession.builder().appName("Simple Application").getOrCreate();
Dataset<String> logData = spark.read().textFile(logFile).cache();
long numAs = logData.filter((FilterFunction<String>) o -> o.toString().contains("a")).count();
long numBs = logData.filter((FilterFunction<String>) o -> o.toString().contains("b")).count();
System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
spark.stop();
}
}
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>edu.berkeleygroupId>
<artifactId>simple-projectartifactId>
<modelVersion>4.0.0modelVersion>
<name>Simple Projectname>
<packaging>jarpackaging>
<version>1.0version>
<properties>
<maven.compiler.source>8maven.compiler.source>
<maven.compiler.target>8maven.compiler.target>
properties>
<dependencies>
<dependency>
<groupId>org.apache.sparkgroupId>
<artifactId>spark-sql_2.12artifactId>
<version>3.1.2version>
dependency>
dependencies>
project>
https://spark.apache.org/docs/3.3.0/quick-start.html