我在工作中使用 Apache Sedona 来处理地图数据。在阅读了 Sedona 关于 KNN 的文档后,我很想尝试一下 KNN 连接。但是,我甚至连文档中的示例代码都无法运行。
以下是我的主要代码。
// Build the underlying Spark session through Sedona's builder.
val config = SedonaContext
  .builder()
  .master("local[*]") // Delete this if run in cluster mode
  .appName("readTestScala") // Change this to a proper name
  .getOrCreate()

// Obtain the Sedona session used to run the spatial SQL below.
val sedona = SedonaContext.create(config)

// Two one-row DataFrames, each holding a single point at the origin; both are cached.
val df1 = sedona.sql("SELECT ST_Point(0.0, 0.0) as geom1").cache()
val df2 = sedona.sql("SELECT ST_Point(0.0, 0.0) as geom2").cache()

// Print both inputs before attempting the join.
Seq(df1, df2).foreach(_.show())

// Join df1 with df2 on the ST_KNN predicate (third argument 1, i.e. k = 1).
val knnCondition = expr("ST_KNN(geom1, geom2, 1)")
val df = df1.join(df2, knnCondition)

// Showing the result triggers execution of the KNN join plan.
df.show()
但是运行失败,抛出了 NoClassDefFoundError,下面是错误堆栈。
Exception in thread "sbt-bg-threads-1" java.lang.NoClassDefFoundError: org/apache/commons/lang/NullArgumentException
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryBase.toSpatialRDD(TraitJoinQueryBase.scala:47)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryBase.toSpatialRDD$(TraitJoinQueryBase.scala:46)
at org.apache.spark.sql.sedona_sql.strategy.join.BroadcastQuerySideKNNJoinExec.toSpatialRDD(BroadcastQuerySideKNNJoinExec.scala:35)
at org.apache.spark.sql.sedona_sql.strategy.join.BroadcastQuerySideKNNJoinExec.leftToSpatialRDD(BroadcastQuerySideKNNJoinExec.scala:86)
at org.apache.spark.sql.sedona_sql.strategy.join.BroadcastQuerySideKNNJoinExec.toSpatialRddPair(BroadcastQuerySideKNNJoinExec.scala:69)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitKNNJoinQueryExec.executeKNNJoin(TraitKNNJoinQueryExec.scala:92)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitKNNJoinQueryExec.doExecute(TraitKNNJoinQueryExec.scala:57)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitKNNJoinQueryExec.doExecute$(TraitKNNJoinQueryExec.scala:55)
at org.apache.spark.sql.sedona_sql.strategy.join.BroadcastQuerySideKNNJoinExec.doExecute(BroadcastQuerySideKNNJoinExec.scala:35)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:527)
at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:455)
at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:454)
at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:498)
at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:364)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:445)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:390)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:418)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:390)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4333)
at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3316)
at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4323)
at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4321)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4321)
at org.apache.spark.sql.Dataset.head(Dataset.scala:3316)
at org.apache.spark.sql.Dataset.take(Dataset.scala:3539)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
at org.apache.spark.sql.Dataset.show(Dataset.scala:838)
at org.apache.spark.sql.Dataset.show(Dataset.scala:797)
at com.example.poc.App$.example(main.scala:81)
at com.example.poc.App$.main(main.scala:29)
有人知道可能是什么问题吗?
这可能是一个依赖问题。你用的是哪个 Sedona jar 包?你的运行环境是什么?