目录
withColumn():是Apache Spark中用于DataFrame操作的函数之一,它的作用是在DataFrame中添加或替换列,或者对现有列进行转换操作和更新等等
1. 添加新列(用withColumn为Dataframe)
2. 改变现有列
3. 将现有列派生出新列
4. 更改数据类型(可以在改变该列的同时进行类型转换)
5. 重命名列名(需要使用DataFrame的withColumnRenamed)
6. 删除一个列 (使用drop)
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{col, lit, rand, round}

object text {
  def main(args: Array[String]): Unit = {
    //新建spark
    val spark = new SparkConf().setMaster("local[*]").setAppName("text")
    val sc = SparkSession.builder().config(spark).getOrCreate()
    //准备源数据
    val tuples = Seq(("小白", 19, "江西"), ("小红", 20, "安徽"), ("小兰", 21, "河北"))
    val frame = sc.createDataFrame(tuples).toDF("name","age","address")
    frame.show()

输出结果为:
+------+------+---------+
|name |age |address|
+------+-------+--------+
|小白 | 19 | 江西|
|小红 | 20 | 安徽|
|小兰 | 21 | 河北|
+-------+-------+-------+
1.添加新列
//语法 withColumn(colName : String, col : Column) : DataFrame

例子:
//1. 用withColumn为dataframe 添加新列
val seq = Seq("小新", 22, "北京")
val frame1 : DataFrame = frame.withColumn("new", round(rand()*100, 1))
frame1.show()

//打印输出结果为:
+------+-----+-------+---------+
|name|age|address| new|
+------+------+-------+-------+
|小白 | 19 | 江西|27.7 |
|小红 | 20 | 安徽|98.2 |
|小兰 | 21 | 河北|51.0 |
+------+------+-------+-------+
2. 改变现有列
//2. 改变现有列
val frame2: DataFrame = frame.withColumn("age", col("age") - 5)
frame2.show()

// 打印输出结果为:
+------+------+-------+
|name|age|address|
+-------+------+------+
|小白 | 14| 江西|
|小红 | 15| 安徽|
|小兰 | 16| 河北|
+------+------+-------+
3.将现有列派生出新列
//3.将现有列派生出新列
val frame3 : DataFrame = frame.withColumn("newCol", col("age")*10)
frame3.show()

输出结果为:
+------+--------+--------+--------+
|name|age|address|newCol|
+-------+-------+--------+--------+
|小白 | 19 | 江西| 190|
|小红 | 20 | 安徽| 200|
|小兰 | 21 | 河北| 210|
+--------+------+-------+-------+
4.更改数据类型(可以在改变该列的同时进行类型转换)
//4.更改数据类型(可以在改变该列的同时进行类型转换)
val frame4 : DataFrame = frame.withColumn("age", col("age").cast("float"))
frame4.show

输出结果为:
+-------+-------+-------+
|name | age | address|
+-------+-------+-------+
|小白 |19.0 | 江西|
|小红 |20.0 | 安徽|
|小兰 |21.0 | 河北|
+-------+-------+-------+
5.重命名列名(需要使用DataFrame的withColumnRenamed)
// 5.重命名列名(需要使用DataFrame的withColumnRenamed)
val frame5: DataFrame = frame.withColumnRenamed("address", "省份")
frame5.show()

输出结果为:
+------+------+------+
|name|age|省份|
+------+------+----+
|小白 | 19 |江西|
|小红 | 20 |安徽|
|小兰 | 21 |河北|
+------+-----+------+
6. 删除一个列 (使用drop)
// 6. 删除一个列 (使用drop)
val frame6: DataFrame = frame.drop("age")
frame6.show

输出结果为:
|name|address|
+-------+-------+
|小白 | 江西|
|小红 | 安徽|
|小兰 | 河北|
+-------+-------+
-
-
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{col, lit, rand, round}

/**
 * Demo of DataFrame column operations: withColumn (add / replace / derive /
 * cast), withColumnRenamed, and drop.
 */
object text {
  def main(args: Array[String]): Unit = {
    // Build a local-mode SparkSession. `conf` is the SparkConf, `spark`
    // the session (the original swapped these names, which was confusing).
    val conf = new SparkConf().setMaster("local[*]").setAppName("text")
    val spark = SparkSession.builder().config(conf).getOrCreate()

    // Source data: (name, age, address) rows.
    val people = Seq(("小白", 19, "江西"), ("小红", 20, "安徽"), ("小兰", 21, "河北"))
    val frame: DataFrame = spark.createDataFrame(people).toDF("name", "age", "address")
    frame.show()

    // 1. Add a new column: rand() is uniform in [0, 1), scaled to [0, 100)
    //    and rounded to one decimal place.
    val frame1: DataFrame = frame.withColumn("new", round(rand() * 100, 1))
    frame1.show()

    // 2. Replace an existing column: withColumn with an existing name
    //    overwrites that column.
    val frame2: DataFrame = frame.withColumn("age", col("age") - 5)
    frame2.show()

    // 3. Derive a brand-new column from an existing one.
    val frame3: DataFrame = frame.withColumn("newCol", col("age") * 10)
    frame3.show()

    // 4. Change a column's data type (Int -> Float) while replacing it.
    val frame4: DataFrame = frame.withColumn("age", col("age").cast("float"))
    frame4.show()

    // 5. Rename a column; withColumn cannot rename, withColumnRenamed can.
    val frame5: DataFrame = frame.withColumnRenamed("address", "省份")
    frame5.show()

    // 6. Drop a column by name.
    val frame6: DataFrame = frame.drop("age")
    frame6.show()

    // Release the session's resources before exiting.
    spark.stop()
  }
}