import org.apache.spark.sql.functions.udf
udf接收一个函数(func)作为参数,返回一个UserDefinedFunction。
UserDefinedFunction接收列(Column)作为参数,并返回Column。
func可以接收普通类型参数,并返回普通类型结果。
udf会自动把Column入参转化成对应func的入参,并将func返回的结果转化成Column类型。
注意:由于udf的限制,func最多接收10个参数;超过10个时,可以用struct把多列打包成一列,让func以Row作为入参(见下文"整行输入")。
import org.apache.spark.sql.functions._
// One or more parameters: wrap an ordinary Scala function with udf to obtain a
// UserDefinedFunction that can be applied to Columns.
// NOTE: this is a template -- "..." and "函数体" (the function body) are placeholders
// to be replaced with the real parameter list and logic.
val UDF0= udf{
(c1:String,c2:Int,...,)=> 函数体
}
// Apply the UDF to existing columns and store the result in a new column named "new".
df.withColumn("new",UDF0(col("old1"),col("old2")...))
// Whole-row input: pack the columns into a single struct column so that the UDF
// receives them as one Row instead of as separate arguments.
// Row here is org.apache.spark.sql.Row and must be imported.
// NOTE: this is a template -- the text after "=>" is a placeholder for the body that
// processes the Row; it should preferably return an ordinary (non-Row) type.
val UDF1= udf{
(row: Row) => 处理Row类型的函数体,最好返回常规类型
}
val columns = df.columns
// struct(...) combines every column of df into one struct column, which is handed to UDF1 as a Row.
df.withColumn("new", UDF1(struct(columns.map(col): _*)))
// Using the UDFs from SQL: expose the DataFrame as a temp view and register each
// UserDefinedFunction under the name it will be called by in SQL text.
df.createOrReplaceTempView("tempview")
spark.udf.register("UDF0", UDF0)
spark.udf.register("UDF1", UDF1)
// UDF0 takes its arguments as separate columns (`...` stands for any further columns).
spark.sql("select *, UDF0(`old1`, `old2`, `...`) as new from tempview").show(100)
// The Row-based UDF1 instead takes a single struct built from the columns.
spark.sql("select *, UDF1(struct(`old1`, `old2`, `...`)) as new from tempview").show(100)
struct函数说明:
def struct(colName: String, colNames: String*): Column
// Creates a new struct column that composes multiple input columns.
def struct(cols: Column*): Column
// Creates a new struct column. If the input column is a column in a DataFrame, or a derived column expression that is named (i.e. aliased), its name would be retained as the StructField's name, otherwise, the newly generated StructField's name would be auto generated as col with a suffix index + 1, i.e. col1, col2, col3, ...