1. Required dependencies
```xml
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.4.3</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.4.3</version>
</dependency>
<dependency>
    <groupId>ru.yandex.clickhouse</groupId>
    <artifactId>clickhouse-jdbc</artifactId>
    <version>0.1.53</version>
</dependency>
```
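The JDBC write in the example below appends into an existing table, so the target table `test.bigtest` has to be created in ClickHouse first. A minimal sketch using plain JDBC to create it (the `Log` engine and all-`String` columns are assumptions that mirror the schema used in section 2; adjust to your setup):

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateClickHouseTable {
    public static void main(String[] args) throws Exception {
        // Same URL and credentials as the Spark job in section 2
        String url = "jdbc:clickhouse://127.0.0.1:8123/test";
        try (Connection conn = DriverManager.getConnection(url, "default", "");
             Statement stmt = conn.createStatement()) {
            // Column names mirror the DataFrame schema built below;
            // the Log engine is an assumption (use MergeTree etc. as needed)
            stmt.execute("CREATE TABLE IF NOT EXISTS bigtest ("
                    + "id String, name String, age String, "
                    + "adress String, city String, habit String"
                    + ") ENGINE = Log");
        }
    }
}
```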
2. A concrete Spark example
Note: the test class must implement Serializable, because the anonymous `Function` below captures a reference to its enclosing instance, and Spark serializes that closure when shipping tasks to executors. The `SparkConf`, `JavaSparkContext`, and `SQLContext` fields are marked `transient` so they are excluded from that serialization.

```java
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SparkClickHouseTest implements Serializable {

    private static final long serialVersionUID = 1L;
    private static final Logger log = LoggerFactory.getLogger(SparkClickHouseTest.class);

    transient SparkConf conf;          // configuration object
    transient JavaSparkContext jsc;    // Spark context
    transient SQLContext sqlContext;

    @Test
    public void test() {
        conf = new SparkConf().setMaster("local[1]").setAppName("sparkRDDs");
        jsc = new JavaSparkContext(conf);
        sqlContext = new SQLContext(jsc);

        // Data to be written
        List<Map<String, String>> listdata = new ArrayList<>();
        String name = "测试";
        String age = "22";
        String adress = "广东深圳";
        String city = "陕西西安";
        String habit = "孤独百年";
        Map<String, String> map = new HashMap<>();
        map.put("id", new Random().nextInt(1000) + "");
        map.put("name", name);
        map.put("age", age);
        map.put("adress", adress);
        map.put("city", city);
        map.put("habit", habit);
        listdata.add(map);

        JavaRDD<Map<String, String>> personData = jsc.parallelize(listdata);

        /**
         * Step 1: on top of the raw RDD, build an RDD of Row.
         * A Row can be thought of as one row of a table.
         */
        JavaRDD<Row> personsRDD = personData.map(new Function<Map<String, String>, Row>() {
            @Override
            public Row call(Map<String, String> line) throws Exception {
                // Field order and count must match the schema below
                return RowFactory.create(line.get("id"), line.get("name"), line.get("age"),
                        line.get("adress"), line.get("city"), line.get("habit"));
            }
        });

        /**
         * Step 2: dynamically build the DataFrame metadata.
         */
        List<StructField> structFields = new ArrayList<>();
        structFields.add(DataTypes.createStructField("id", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("adress", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("city", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("habit", DataTypes.StringType, true));
        // StructType describes the final DataFrame schema
        StructType structType = DataTypes.createStructType(structFields);

        /**
         * Step 3: create the DataFrame from the schema and the RDD<Row>.
         */
        Dataset<Row> personsDF = sqlContext.createDataFrame(personsRDD, structType);

        /**
         * Step 4: write the data to the target table.
         */
        String url = "jdbc:clickhouse://127.0.0.1:8123/test";
        Properties connectionProperties = new Properties();
        connectionProperties.put("user", "default");
        connectionProperties.put("password", "");
        log.info("=========== insert starting ============");

        personsDF.write().mode("append").jdbc(url, "bigtest", connectionProperties);
        jsc.close();
    }
}
```
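To verify the insert, the same connection settings can be used to read the table back into a DataFrame. A minimal sketch reusing `url`, `connectionProperties`, and `sqlContext` from the test above (it would need to run before `jsc.close()` is called):

```java
// Read test.bigtest back through JDBC and print the rows that were written
Dataset<Row> readBack = sqlContext.read().jdbc(url, "bigtest", connectionProperties);
readBack.show();
```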
Note: this runs Spark in local mode; cluster mode requires different usage. The code above works as-is and already sidesteps many common beginner pitfalls. This is the Spark SQL implementation; a Spark Streaming version will be covered in a later post.
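For cluster mode, the main difference is that the master URL must not be hard-coded; it is supplied by `spark-submit`. A hedged sketch of that setup (the class name is illustrative):

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class ClusterModeSketch {
    public static void main(String[] args) {
        // No setMaster() here: in cluster mode the master comes from
        // spark-submit (e.g. --master yarn), not from the code
        SparkConf conf = new SparkConf().setAppName("sparkRDDs");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // ... build the RDD/DataFrame and write via JDBC as in section 2 ...
        jsc.close();
    }
}
```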