GLOBathy 是一个包含地球上几乎所有湖泊测深数据的数据集(可通过此链接获取 15.58GB 的 .zip 文件)。它包含 140 万余条记录,分布在多个子目录中;另有一个关联的属性表,记录每个文件所属的国家(链接)。
从子目录中提取数据有点麻烦,因为文件名相对于其所属的国家/地区并没有"直观"的排列顺序。此外,我发现 GIS 程序从多个目录中提取数据的速度非常慢,并且通常需要某种程度的循环处理。
我自问自答这个问题,目的有二:a)为可能遇到同样问题的其他人提供解决方案;b)看看是否有人知道更高效的方法。
下面是一种按国家/地区从 GLOBathy 数据中提取文件的方法。我已对代码加了注释;如果有需要澄清之处或发现问题,请在评论中指出:
library(dplyr)
# Load the GLOBathy_basic_parameters(ALL_LAKES).csv file ----
# This is the attribute table shipped with GLOBathy; it links each Hylak_id
# to its country, which is needed below to locate the raster files.
# Fail fast with a clear message if the file is not where we expect it,
# rather than letting read.csv() raise an opaque connection error.
globathy_csv <- "C:/GLOBathy_basic_parameters/GLOBathy_basic_parameters(ALL_LAKES).csv"
if (!file.exists(globathy_csv)) {
  stop("GLOBathy parameter file not found: ", globathy_csv, call. = FALSE)
}
GLOBathy_all <- read.csv(globathy_csv, stringsAsFactors = FALSE)
# Selection parameters ----
# Country whose lakes should be extracted; must match the `Country` column
# of the GLOBathy attribute table exactly (case-sensitive).
country_sub <- "New Zealand"
# Folder the matching rasters will be copied into (must already exist).
output_folder <- "C:/New Zealand"
# Extract records from GLOBathy_all that match country_sub and build the
# on-disk path of each lake's bathymetry raster.
# The extracted GLOBathy archive groups rasters in two nested folder levels
# named by Hylak_id ranges — e.g. "100K_200K/100001_101000" — presumably;
# the arithmetic below reconstructs those names from Hylak_id. Confirm the
# layout against your extracted copy of the archive before relying on it.
GLOBathy_sub <- GLOBathy_all %>%
filter(Country == country_sub) %>% # Keep only lakes in the chosen country
# temp1: upper bound of the top-level folder name. The final folder is
# named with the total record count (e.g. "1400K_<nrow>") rather than a
# round "K" value, hence the nrow() special case for ids above 1400000.
# NOTE(review): ifelse() mixes numeric and character branches here; the
# result is coerced to character when pasted, which appears intentional.
mutate(temp1 = ifelse(Hylak_id > 1400000, # Get upper value of sub folder
nrow(GLOBathy_all),
paste0(ceiling(Hylak_id / 100000) * 100, "K")),
# temp2: Hylak_id rounded DOWN to the nearest 100000. Two steps are needed
# because an id exactly on a folder boundary (e.g. 200000) would otherwise
# round to the start of the NEXT folder instead of its own.
temp2 = ifelse(Hylak_id <= 100000, # Ids in the first bucket map to lower bound 1
1,
floor(Hylak_id / 100000) * 100000),
# temp3: lower bound of the top-level folder name, as "<n>K_" (or "1_"
# for the first bucket).
temp3 = ifelse(temp2 == 1, # First folder starts at 1, not a "K" value
paste0(1, "_"),
ifelse(temp2 > 100000 & temp2 == Hylak_id, # Id sits exactly on a boundary: step back one bucket
paste0((temp2 - 100000) / 1000, "K_"),
paste0(temp2 / 1000, "K_"))),
# temp4/temp5: upper and lower bounds of the second-level (1000-wide) folder.
temp4 = ceiling(Hylak_id / 1000) * 1000, # Get upper value of sub-sub folder
temp5 = paste0(temp4 - 999, "_"), # Get lower value of sub-sub folder
# Assemble the full folder and file paths. The raster root is hard-coded;
# edit "C:/Bathymetry_Rasters/" to match where the archive was extracted.
folders = paste0("C:/Bathymetry_Rasters/", # Create folder path
temp3, temp1, "/", temp5, temp4),
files = paste0(folders, "/", Hylak_id, "_bathymetry.tif")) %>% # Create file path
select(-contains("temp")) # Drop the temp* helper columns
# Copy files to new directory ----
# Create the destination folder if it does not already exist, so the script
# works without manual setup.
if (!dir.exists(output_folder)) {
  dir.create(output_folder, recursive = TRUE)
}
# file.copy() returns FALSE silently for every file that fails to copy
# (e.g. a mis-constructed path or a missing raster); surface those failures
# instead of discarding the result.
copied <- file.copy(GLOBathy_sub$files, output_folder)
if (any(!copied)) {
  warning(sum(!copied), " file(s) could not be copied:\n",
          paste(GLOBathy_sub$files[!copied], collapse = "\n"),
          call. = FALSE)
}