spark.ndb.secret_access_key ****
spark.ndb.rowgroups_per_subsplit 1
spark.ndb.access_key_id ****
spark.sql.extensions ndb.NDBSparkSessionExtension
spark.ndb.dynamic_filter_compaction_threshold 100
spark.sql.catalog.ndb spark.sql.catalog.ndb.VastCatalog
spark.ndb.retry_sleep_duration 1
spark.ndb.endpoint http://172.19.197.1
spark.ndb.retry_max_count 3
spark.ndb.parallel_import true
spark.master local[*, 4]
spark.ndb.num_of_sub_splits 8
spark.databricks.cluster.profile singleNode
spark.ndb.num_of_splits 64
spark.ndb.data_endpoints http://127.0.0.1
spark.ndb.query_data_rows_per_split 4000000
spark.ndb.dynamic_filtering_wait_timeout 2
The log file log4j-active.log contains the following:
...
25/02/06 10:41:43 INFO DatabricksILoop$: Successfully initialized SparkContext
25/02/06 10:41:43 WARN SparkSession: Cannot use ndb.NDBSparkSessionExtension to configure session extensions.
java.lang.ClassNotFoundException: ndb.NDBSparkSessionExtension
at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
at java.lang.ClassLoader.loadClass(ClassLoader.java:419)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352)
at java.lang.ClassLoader.loadClass(ClassLoader.java:352)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.util.SparkClassUtils.classForName(SparkClassUtils.scala:44)
at org.apache.spark.util.SparkClassUtils.classForName$(SparkClassUtils.scala:39)
at org.apache.spark.util.Utils$.classForName(Utils.scala:111)
at org.apache.spark.sql.SparkSession$.$anonfun$applyExtensions$2(SparkSession.scala:1666)
at org.apache.spark.sql.SparkSession$.$anonfun$applyExtensions$2$adapted(SparkSession.scala:1664)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.IterableLike.foreach(IterableLike.scala:74)
at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$applyExtensions(SparkSession.scala:1664)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:1394)
at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:38)
at com.databricks.backend.daemon.driver.DatabricksILoop$.initializeSharedDriverContext(DatabricksILoop.scala:489)
at com.databricks.backend.daemon.driver.DatabricksILoop$.getOrCreateSharedDriverContext(DatabricksILoop.scala:300)
at com.databricks.backend.daemon.driver.DriverCorral.driverContext(DriverCorral.scala:378)
at com.databricks.backend.daemon.driver.DriverCorral.<init>(DriverCorral.scala:218)
at com.databricks.backend.daemon.driver.DriverDaemon.<init>(DriverDaemon.scala:75)
at com.databricks.backend.daemon.driver.DriverDaemon$.create(DriverDaemon.scala:616)
at com.databricks.backend.daemon.driver.DriverDaemon$.initialize(DriverDaemon.scala:764)
at com.databricks.backend.daemon.driver.DriverDaemon$.wrappedMain(DriverDaemon.scala:729)
at com.databricks.DatabricksMain.$anonfun$main$4(DatabricksMain.scala:230)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.DatabricksMain.$anonfun$withStartupProfilingData$1(DatabricksMain.scala:655)
at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:528)
at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:633)
at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:656)
at com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:48)
at com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:276)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:272)
at com.databricks.logging.AttributionContextTracing.withAttributionContext(AttributionContextTracing.scala:46)
at com.databricks.logging.AttributionContextTracing.withAttributionContext$(AttributionContextTracing.scala:43)
at com.databricks.DatabricksMain.withAttributionContext(DatabricksMain.scala:110)
at com.databricks.logging.AttributionContextTracing.withAttributionTags(AttributionContextTracing.scala:95)
at com.databricks.logging.AttributionContextTracing.withAttributionTags$(AttributionContextTracing.scala:76)
at com.databricks.DatabricksMain.withAttributionTags(DatabricksMain.scala:110)
at com.databricks.logging.UsageLogging.recordOperationWithResultTags(UsageLogging.scala:628)
at com.databricks.logging.UsageLogging.recordOperationWithResultTags$(UsageLogging.scala:537)
at com.databricks.DatabricksMain.recordOperationWithResultTags(DatabricksMain.scala:110)
at com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:529)
at com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:495)
at com.databricks.DatabricksMain.recordOperation(DatabricksMain.scala:110)
at com.databricks.DatabricksMain.withStartupProfilingData(DatabricksMain.scala:654)
at com.databricks.DatabricksMain.$anonfun$main$3(DatabricksMain.scala:229)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.context.integrity.IntegrityCheckContext$ThreadLocalStorage$.withValue(IntegrityCheckContext.scala:73)
at com.databricks.DatabricksMain.main(DatabricksMain.scala:229)
at com.databricks.backend.daemon.driver.DriverDaemon.main(DriverDaemon.scala)
25/02/06 10:41:44 INFO SharedState: Scheduler stats enabled.
...
25/02/06 10:42:07 INFO DriverCorral: [Thread 170] AttachLibraries - candidate libraries: List(dbfs:/Volumes/databricks_catalog/default/vastjars/vast-uber.jar)
25/02/06 10:42:07 INFO DriverCorral: [Thread 170] AttachLibraries - new libraries to install (including resolved dependencies): List(dbfs:/Volumes/databricks_catalog/default/vastjars/vast-uber.jar)
25/02/06 10:42:07 INFO SharedDriverContext: [Thread 170] attachLibrariesToSpark dbfs:/Volumes/databricks_catalog/default/vastjars/vast-uber.jar
25/02/06 10:42:07 INFO SharedDriverContext: Attaching lib: dbfs:/Volumes/databricks_catalog/default/vastjars/vast-uber.jar to Spark
25/02/06 10:42:07 INFO LibraryDownloadManager: Downloading a library that was not in the cache: dbfs:/Volumes/databricks_catalog/default/vastjars/vast-uber.jar
25/02/06 10:42:07 INFO LibraryDownloadManager: Attempt 1: wait until library dbfs:/Volumes/databricks_catalog/default/vastjars/vast-uber.jar is downloaded
25/02/06 10:42:07 INFO LibraryDownloadManager: Preparing to download library file from UC Volume path: dbfs:/Volumes/databricks_catalog/default/vastjars/vast-uber.jar
I expected the library to load. The Databricks Runtime version and jar configuration are:
15.4 LTS (includes Apache Spark 3.5.0, Scala 2.12)
spark.jars dbfs:/Volumes/databricks_catalog/default/vastjars/vast-uber.jar
but instead I get this error:
25/02/06 11:09:20 INFO DatabricksMountsStore: Mount store initialization: Attempting to get the list of mounts from metadata manager of DBFS
25/02/06 11:09:20 INFO log: Logging initialized @22567ms to shaded.v9_4.org.eclipse.jetty.util.log.Slf4jLog
25/02/06 11:09:21 INFO DatabricksMountsStore: Mount store initialization: Received a list of 9 mounts accessible from metadata manager of DBFS
25/02/06 11:09:21 INFO DatabricksMountsStore: Updated mounts cache. Changes: List((+,DbfsMountPoint(s3a://databricks-datasets-virginia/, /databricks-datasets)), (+,DbfsMountPoint(uc-volumes:/Volumes, /Volumes)), (+,DbfsMountPoint(unsupported-access-mechanism-for-path--use-mlflow-client:/, /databricks/mlflow-tracking)), (+,DbfsMountPoint(abfss://dbstorageidhu7n6albopg.dfs.core.windows.net/1138144753953386, /databricks-results)), (+,DbfsMountPoint(unsupported-access-mechanism-for-path--use-mlflow-client:/, /databricks/mlflow-registry)), (+,DbfsMountPoint(dbfs-reserved-path:/uc-volumes-reserved, /Volume)), (+,DbfsMountPoint(dbfs-reserved-path:/uc-volumes-reserved, /volumes)), (+,DbfsMountPoint(abfss://dbstorageidhu7n6albopg.dfs.core.windows.net/1138144753953386, /)), (+,DbfsMountPoint(dbfs-reserved-path:/uc-volumes-reserved, /volume)))
25/02/06 11:09:22 WARN FileSystem: Failed to initialize filesystem dbfs:/Volumes/databricks_catalog/default/vastjars/vast-uber.jar: com.databricks.backend.daemon.data.common.InvalidMountException: Error while using path /Volumes/databricks_catalog/default/vastjars for creating file system within mount at '/Volumes/databricks_catalog/default/vastjars'.
25/02/06 11:09:22 ERROR SparkContext: Failed to add dbfs:/Volumes/databricks_catalog/default/vastjars/vast-uber.jar to Spark environment
com.databricks.backend.daemon.data.common.InvalidMountException: Error while using path /Volumes/databricks_catalog/default/vastjars for creating file system within mount at '/Volumes/databricks_catalog/default/vastjars'.
at com.databricks.backend.daemon.data.common.InvalidMountException$.apply(DataMessages.scala:765)
at com.databricks.backend.daemon.data.filesystem.MountEntryResolver.createFileSystem(MountEntryResolver.scala:154)
If I add the following to the configuration instead:
spark.jars /Volumes/databricks_catalog/default/vastjars/vast-uber.jar
(dropping the dbfs: prefix), the cluster does not start at all.
To paraphrase your question: "You want the library loaded before the configuration is applied?" - That is correct, but this applies to cluster-level libraries, not user application libraries. In other words, you need a cluster init script (https://docs.databricks.com/aws/en/init-scripts/). Put the loading of the required extension into that script, for example like this (init.sh):
#!/bin/bash
# Minimal logging helpers (stand-ins assumed here; replace with your own if you already have them)
logInfo()  { echo "[INFO]  $*"; }
logDebug() { echo "[DEBUG] $*"; }

DEFAULT_BASE_PATH=""
BASE_PATH=$1
DB_HOME=${BASE_PATH}/databricks
SPARK_HOME=${BASE_PATH}/databricks/spark
SPARK_CONF_DIR=${BASE_PATH}/databricks/spark/conf
SPARK_JARS=${BASE_PATH}/mnt/driver-daemon/jars
setUpBasePath() {
  if [[ ${DEBUG_MODE:-0} -ne 0 ]]; then
    logInfo "Init script is going to be run in local debug mode ..."
    logDebug "Check if BASE_PATH is provided for debug mode."
    if [[ -z ${BASE_PATH} ]]; then
      logDebug "BASE_PATH is unset for debug mode. Please provide it."
      exit 1
    else
      logInfo "Arg BASE_PATH is provided: $BASE_PATH"
    fi
  else
    logInfo "Init script is going to be run ..."
    BASE_PATH=$DEFAULT_BASE_PATH
  fi
}
setUpBasePath
# Init databricks utils
curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh
STAGE_DIR=$(mktemp -d)
#databricks workspace export-dir /Shared/Init/15.4_LTS $STAGE_DIR --overwrite
${HOME}/bin/databricks workspace export-dir /Shared/Init/15.4_LTS ${STAGE_DIR} --overwrite --debug
ls -R ${STAGE_DIR}
logInfo "Copying listener jars..."
cp -f "${STAGE_DIR}/libs/spark-monitoring_15.4.0.jar" ${SPARK_JARS} || { echo "Error copying file"; exit 1; }
curl -fsSL https://repo1.maven.org/maven2/org/apache/logging/log4j/log4j-layout-template-json/2.22.1/log4j-layout-template-json-2.22.1.jar \
  -o ${SPARK_JARS}/log4j-layout-template-json-2.22.1.jar || { echo "Error fetching file"; exit 1; }
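The analogous step for the VAST library is to stage vast-uber.jar into ${SPARK_JARS} the same way, so that ndb.NDBSparkSessionExtension is already on the driver classpath when spark.sql.extensions is applied. A minimal sketch, assuming the uber jar has been exported into the same staging directory (the libs/ location is illustrative; point it at wherever your jar actually lives):

logInfo "Copying VAST jars..."
# Assumption: vast-uber.jar was exported alongside the listener jars above.
cp -f "${STAGE_DIR}/libs/vast-uber.jar" ${SPARK_JARS} || { echo "Error copying vast-uber.jar"; exit 1; }

Upload init.sh to the workspace and register it on the cluster under Advanced options > Init Scripts. After the cluster restarts, you can check from a notebook %sh cell that the jar is in place (for example, ls /mnt/driver-daemon/jars | grep -i vast); the ClassNotFoundException during session initialization should then go away.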