我现在在 SparkR 中被一个相当基础的问题卡住了,却找不到简洁的解决方案:我需要基于 SparkDataFrame 中的 N 个已有列创建 N 个新的计算列。
# Toy data: five integer columns V1..V5 plus a numeric column X in [0, 5).
df <- data.frame(V1 = base::sample(1:10,5),
V2 = base::sample(1:10,5),
V3 = base::sample(1:10,5),
V4 = base::sample(1:10,5),
V5 = base::sample(1:10,5),
X = runif(n = 5, min = 0, max = 5))
# Convert the local data.frame into a SparkDataFrame.
sdf <- createDataFrame(df)
# One withColumn() call per ratio column Vi_X = X / Vi.
# This repetition is exactly what the question wants to replace with a
# loop / apply-style construct over the column names.
sdf <- withColumn(sdf, "V1_X", column("X") / column("V1"))
sdf <- withColumn(sdf, "V2_X", column("X") / column("V2"))
sdf <- withColumn(sdf, "V3_X", column("X") / column("V3"))
sdf <- withColumn(sdf, "V4_X", column("X") / column("V4"))
sdf <- withColumn(sdf, "V5_X", column("X") / column("V5"))
基本上,我想把一个函数应用到由列名组成的向量/列表上。这在 R 中很容易。在 SparkR 中,我可以用 `lapply` 应用一个函数,但那样会修改原始列。我一定是漏掉了什么……!?
谢谢!
也许你可以考虑以下方法
library(SparkR)

# Toy data: five integer columns V1..V5 plus a numeric column X in [0, 5).
df <- data.frame(
  V1 = base::sample(1:10, 5),
  V2 = base::sample(1:10, 5),
  V3 = base::sample(1:10, 5),
  V4 = base::sample(1:10, 5),
  V5 = base::sample(1:10, 5),
  X = runif(n = 5, min = 0, max = 5)
)

sdf <- createDataFrame(df)

# Every column except X is a denominator for one new ratio column.
# setdiff() is robust to X not being the last column, unlike indexing
# with 1:(length(colnames) - 1), and yields an empty vector (so the loop
# body never runs) if X is the only column.
ratio_cols <- setdiff(colnames(sdf), "X")

for (col_name in ratio_cols) {
  # paste0(col_name, "_X") matches the names the question asks for
  # (V1_X, ..., V5_X); paste(col_name, "X") would produce "V1 X" with
  # a space instead of an underscore.
  new_col_name <- paste0(col_name, "_X")
  sdf <- withColumn(sdf, new_col_name, column("X") / column(col_name))
}

print(as.data.frame(sdf))
更好的做法是用单条 `SparkR::select` 命令完成此类任务,而不是循环执行多个 `withColumn` 语句。借助列表推导式,同样的逻辑也可以在 Python 中实现。请参阅下面代码的修改版本:
# Toy data: five integer columns V1..V5 plus a numeric column X in [0, 5).
df <- data.frame(
  V1 = base::sample(1:10, 5),
  V2 = base::sample(1:10, 5),
  V3 = base::sample(1:10, 5),
  V4 = base::sample(1:10, 5),
  V5 = base::sample(1:10, 5),
  X = runif(n = 5, min = 0, max = 5)
)

sdf <- SparkR::createDataFrame(df)

# Instead of one withColumn() statement per new column, build all the
# derived columns at once and hand them to a single select().

# Denominator columns: every column except X itself.
loop_on_cols <- setdiff(SparkR::colnames(sdf), "X")

# One aliased Column expression "<name>_X" = X / <name> per denominator.
derived_cols <- lapply(
  loop_on_cols,
  function(col_name) {
    SparkR::alias(
      SparkR::column("X") / SparkR::column(col_name),
      paste0(col_name, "_X")
    )
  }
)

# Keep the original columns and append the derived ones in one pass.
sdf2 <- SparkR::select(sdf, c(SparkR::colnames(sdf), derived_cols))

SparkR::head(sdf2)
一种优雅的方式是将 SparkR 与 dplyr 结合起来,以链式调用的形式创建多个列。
library(dplyr)
library(SparkR)
# Toy data: five integer columns V1..V5 plus a numeric column X in [0, 5).
df <- data.frame(
  V1 = base::sample(1:10, 5),
  V2 = base::sample(1:10, 5),
  V3 = base::sample(1:10, 5),
  V4 = base::sample(1:10, 5),
  V5 = base::sample(1:10, 5),
  X = runif(n = 5, min = 0, max = 5)
)

sdf <- createDataFrame(df)

# Small helper so each pipe step reads declaratively: it appends one
# ratio column named "<v>_X" holding X / <v>.
divide_x_by <- function(data, v) {
  withColumn(data, paste0(v, "_X"), column("X") / column(v))
}

# dplyr's %>% chains the five column creations into one pipeline.
newsdf <- sdf %>%
  divide_x_by("V1") %>%
  divide_x_by("V2") %>%
  divide_x_by("V3") %>%
  divide_x_by("V4") %>%
  divide_x_by("V5")