R 中的相交形状文件

问题描述 投票:0回答:1

我从互联网上获得了这 2 个形状文件:

library(sf)
library(dplyr)
library(fs)

url_1 <- "https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lfsa000b21a_e.zip"

temp_dir <- tempdir()

zip_file <- file.path(temp_dir, "shapefile.zip")

download.file(url_1, destfile = zip_file)

exdir_ <- tempfile(pattern = "shp_1")

unzip(zip_file, exdir = exdir_, junkpaths = TRUE)

fs::dir_tree(exdir_)

file_1 <- st_read(exdir_)

fsa <- st_transform(file_1, "+proj=longlat +datum=WGS84")
unlink(temp_dir, recursive = TRUE)

url_2 <- "https://www12.statcan.gc.ca/census-recensement/alternative_alternatif.cfm?l=eng&dispext=zip&teng=lada000b21a_e.zip&k=%20%20%20151162&loc=//www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lada000b21a_e.zip"


download.file(url_2, destfile = "lada000b21a_e.zip")

unzip("lada000b21a_e.zip")

ada <- st_read("lada000b21a_e.shp")

对于相同的 PRUID,我想创建一个矩阵,其中包含 shapefile 1 中的每个多边形与 shapefile 2 中的每个多边形相交的百分比。

library(sf)
library(dplyr)


calculate_intersection_percentages <- function(ada, fsa, pruid) {
  ada_filtered <- ada %>% filter(PRUID == pruid)
  fsa_filtered <- fsa %>% filter(PRUID == pruid)
  
  ada_filtered <- st_transform(ada_filtered, st_crs(fsa_filtered))
  
  results <- list()
  
  total_fsas <- nrow(fsa_filtered)
  completed_fsas <- 0
  
  for (ada_id in ada_filtered$ADAUID) {
    ada_single <- ada_filtered %>% filter(ADAUID == ada_id)
    ada_area <- st_area(ada_single)
    
    for (fsa_id in fsa_filtered$CFSAUID) {
      fsa_single <- fsa_filtered %>% filter(CFSAUID == fsa_id)
      
      intersection <- st_intersection(ada_single, fsa_single)
      
      if (length(intersection) > 0) {
        intersection_area <- st_area(intersection)
        percentage <- as.numeric(intersection_area / ada_area * 100)
        
        print(sprintf("ADA: %s, FSA: %s, Intersection = %.2f%%", ada_id, fsa_id, percentage))
        
        results[[ada_id]][[fsa_id]] <- percentage
      }
      
      completed_fsas <- completed_fsas + 1
      progress_percentage <- (completed_fsas / (total_fsas * nrow(ada_filtered))) * 100
      print(sprintf("Progress: %.2f%% of FSA calculations completed", progress_percentage))
    }
  }
  
  result_matrix <- do.call(rbind, lapply(results, function(x) {
    unlist(x, use.names = TRUE)
  }))
  
  result_matrix[is.na(result_matrix)] <- 0
  
  result_matrix <- result_matrix / rowSums(result_matrix) * 100
  
  return(result_matrix)
}

我像这样启动该功能:

result <- calculate_intersection_percentages(ada, fsa, pruid = "10")

代码运行,但我得到了一些非常精确的数字,这让我认为交集没有正确发生:

20.00000000 20.00000000 20.00000000 20.00000000 20.00000000

此外,纽芬兰有 35 个 FSA (https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_A#:~:text=and%20some%20libraries.-,Newfoundland%20and%20Labrador,35%20FSAs% 20in%20this%20list.),但我只看到 5:

> head(result)
              A0A      A1B      A1C      A1E      A1S
10010001 15.50990 26.73515 15.50990 26.73515 15.50990
10010002 23.85278 14.22083 23.85278 14.22083 23.85278
10010003 13.88354 29.17468 13.88354 29.17468 13.88354
10010004 20.84652 18.73022 20.84652 18.73022 20.84652
10010005 26.08074 10.87889 26.08074 10.87889 26.08074
10010006 20.00000 20.00000 20.00000 20.00000 20.00000

如何正确地将这两个形状文件相交?

r r-sf
1个回答
0
投票

只是一些更有趣的问题开始,可能还有更多。

  • 在性能方面,如果没有这些嵌套循环,您可以做得更好,增加结果列表实际上并不重要,但据我了解,这种成对操作有效地禁用了您可以从空间索引获得的所有增益。虽然(对我来说)正确测试和测量并不是那么简单,但不要认为我们可以简单地禁用索引。

  • length(intersection) > 0
    可能不是你想象的那样;当将
    st_intersection()
    sf
    对象一起使用时,它会返回另一个
    sf
    对象,因此
    length()
    永远不会为 0,即使对于非相交几何图形也不会,因为它不是相交数而是列数;有趣的是,这实际上会将您推向正确的方向,因为您的结果列表会获得 0 区域交叉点的
    NULL
    值,这是您在准备
    rbind
    的输入时肯定需要的东西;当
    print()
    没有输出时,不是因为你的
    if
    子句,而是因为
    percentage
    NULL
    。当使用较小的预过滤子集(
    ada[1:5,]
    fsa[1:5,]
    ,都带有
    PRUID == "10"
    )进行测试时,最终的
    results
    看起来像这样:

lobstr::tree(results)
<list>
├─10010001: <list>
│ ├─A0A: 36.6844068668399
│ ├─A0B: 63.3155931332246
│ ├─A0C<dbl [0]>: 
│ ├─A0E<dbl [0]>: 
│ └─A0G<dbl [0]>: 
├─10010002: <list>
│ ├─A0A: 62.6409032955523
│ ├─A0B<dbl [0]>: 
│ ├─A0C<dbl [0]>: 
│ ├─A0E<dbl [0]>: 
│ └─A0G<dbl [0]>: 
├─10010003: <list>
│ ├─A0A: 32.2477921284301
│ ├─A0B: 0
│ ├─A0C<dbl [0]>: 
│ ├─A0E<dbl [0]>: 
│ └─A0G<dbl [0]>: 
├─10010004: <list>
│ ├─A0A<dbl [0]>: 
│ ├─A0B<dbl [0]>: 
│ ├─A0C<dbl [0]>: 
│ ├─A0E<dbl [0]>: 
│ └─A0G<dbl [0]>: 
└─10010005: <list>
  ├─A0A: 0
  ├─A0B<dbl [0]>: 
  ├─A0C<dbl [0]>: 
  ├─A0E<dbl [0]>: 
  └─A0G<dbl [0]>: 
  • 使用
    lapply(results, function(x) {unlist(x, use.names = TRUE)})
    时,所有
    NULL
    值都会被删除,从而产生一个参差不齐的列表,其中
    rbind()
    的对齐被破坏;因此,矩阵列的数量由第一次外循环迭代中非
    NULL
    区域的数量定义,并且仅包含
    NULL
    值的行也将被删除;虽然我们的小 5 行
    sf
    对象应该得到 5x5 矩阵,但我们最终得到 4x2:
lapply(results, unlist, use.names = TRUE) |> lobstr::tree(show_attributes = TRUE)
<list>
├─10010001<dbl [2]>: 36.6844068668399, 63.3155931332246
│ └┄attr(,"names")<chr [2]>: "A0A", "A0B"
├─10010002: 62.6409032955523
│ └┄attr(,"names"): "A0A"
├─10010003<dbl [2]>: 32.2477921284301, 0
│ └┄attr(,"names")<chr [2]>: "A0A", "A0B"
├─10010004<dbl [0]>: 
├─10010005: 0
┊ └┄attr(,"names"): "A0A"
└┄attr(,"names")<chr [5]>: "10010001", "10010002", "10010003", "10010004", "10010005"

lapply(results, unlist, use.names = TRUE) |> do.call(what = rbind)
              A0A      A0B
10010001 36.68441 63.31559
10010002 62.64090 62.64090
10010003 32.24779  0.00000
10010005  0.00000  0.00000

为了继续寻找可能的解决方案,似乎从

st_intersection()
旋转更宽的框架应该非常接近您所追求的。

获取数据并加载一小部分用于测试:

library(sf)
#> Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(dplyr)
library(tidyr)
library(ggplot2)

# curl::curl_download(
#   "https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lfsa000b21a_e.zip", 
#   "lfsa000b21a_e.zip",
#   quiet = FALSE)
# 
# curl::curl_download(
#   "https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lada000b21a_e.zip", 
#   "lada000b21a_e.zip",
#   quiet = FALSE)

# check layers
rbind(
  st_layers("/vsizip/lfsa000b21a_e.zip/lfsa000b21a_e/"),
  st_layers("/vsizip/lada000b21a_e.zip")
)
#> Driver: ESRI Shapefile 
#> Driver: ESRI Shapefile 
#> Available layers:
#>      layer_name geometry_type features fields                          crs_name
#> 1 lfsa000b21a_e       Polygon     1643      5 NAD83 / Statistics Canada Lambert
#> 2 lada000b21a_e       Polygon     5433      4 NAD83 / Statistics Canada Lambert

# read relevant subsets for testing
fsa <- read_sf(
  "/vsizip/lfsa000b21a_e.zip/lfsa000b21a_e/", 
  query = "SELECT CFSAUID, PRUID FROM lfsa000b21a_e WHERE PRUID = '10'",
  quiet = TRUE)
nrow(fsa)
#> [1] 35

ada <- read_sf(
  "/vsizip/lada000b21a_e.zip", 
  query = "SELECT ADAUID, PRUID FROM lada000b21a_e WHERE PRUID = '10'",
  quiet = TRUE)
nrow(ada)
#> [1] 86

交叉点面积比例,随意调整缩放比例:

isect_area_ratio <- function(ada, fsa, pruid) {
  fsa_filtered <- 
    fsa |> 
    filter(PRUID == pruid) |> 
    select(CFSAUID)

  # add `area_ada` so it would be included in st_intersection() result
  ada_filtered <- 
    ada |> 
    filter(PRUID == pruid) |> 
    select(ADAUID) |>   
    st_transform(st_crs(fsa_filtered)) |> 
    mutate(area_ada = st_area(geometry)) 

  # intersection also includes geometries without area (points, linestrings, ..), 
  # leave those out;
  # once we have ratios, we can drop geometries, pivot from long to wide and
  # convert to matrix
  area_ratio_m <- 
    st_intersection(ada_filtered, fsa_filtered) |> 
    filter(st_geometry_type(geometry) %in% c("POLYGON", "MULTIPOLYGON", "GEOMETRYCOLLECTION")) |> 
    mutate(area_ratio = (st_area(geometry) / area_ada) |> as.numeric()) |> 
    st_drop_geometry() |> 
    select(ADAUID, CFSAUID, area_ratio) |> 
    pivot_wider(names_from = CFSAUID, values_from = area_ratio, values_fill = 0) |> 
    tibble::column_to_rownames("ADAUID") |> 
    as.matrix()
  
  # make sure matrix row and column order aligns with row order of input dataframes
  area_ratio_m[ada_filtered$ADAUID, fsa_filtered$CFSAUID ]
}

测试:

tictoc::tic("isect_area_ratio")
m <- isect_area_ratio(ada, fsa, pruid = "10")
tictoc::toc()
#> isect_area_ratio: 9.14 sec elapsed

# expect 86x35 matrix
str(m)
#>  num [1:86, 1:35] 0.367 0.626 0.322 0 0 ...
#>  - attr(*, "dimnames")=List of 2
#>   ..$ : chr [1:86] "10010001" "10010002" "10010003" "10010004" ...
#>   ..$ : chr [1:35] "A0A" "A0B" "A0C" "A0E" ...

# upper left corner
m[1:10, 1:10]
#>                A0A       A0B A0C A0E A0G A0H A0J A0K A0L A0M
#> 10010001 0.3668441 0.6331559   0   0   0   0   0   0   0   0
#> 10010002 0.6264090 0.0000000   0   0   0   0   0   0   0   0
#> 10010003 0.3224779 0.0000000   0   0   0   0   0   0   0   0
#> 10010004 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010005 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010006 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010007 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010008 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010009 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010010 0.0000000 0.0000000   0   0   0   0   0   0   0   0

# check if rows add up to ~1
rowSums(m) |> summary() 
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>       1       1       1       1       1       1

仔细观察一小部分交叉点,注意我们如何最终得到几何图形集合和多边形,每个相交几何图形对都有一个。单个相交多边形的数量可能要高得多。

ada |> 
  filter(ADAUID == "10010032") |> 
  mutate(area_ada = st_area(geometry)) |> 
  st_intersection(fsa) |> 
  filter(st_geometry_type(geometry) %in% c("POLYGON", "MULTIPOLYGON", "GEOMETRYCOLLECTION")) |> 
  mutate(area_ratio = (st_area(geometry) / area_ada) |> as.numeric()) |> 
  print() |> 
  ggplot() +
  geom_sf(data = ada[ada$ADAUID == "10010032","geometry"]) +
  geom_sf(aes(fill = CFSAUID), legend = FALSE) +
  geom_sf_label(aes(label = scales::label_percent()(area_ratio))) +
  facet_grid(rows = vars(ADAUID), cols = vars(CFSAUID)) +
  guides(fill = "none")
#> Warning: attribute variables are assumed to be spatially constant throughout
#> all geometries
#> Simple feature collection with 2 features and 6 fields
#> Geometry type: GEOMETRY
#> Dimension:     XY
#> Bounding box:  xmin: 8952734 ymin: 2027053 xmax: 9015751 ymax: 2119991
#> Projected CRS: NAD83 / Statistics Canada Lambert
#> # A tibble: 2 × 7
#>   ADAUID   PRUID   area_ada CFSAUID PRUID.1                  geometry area_ratio
#> * <chr>    <chr>      [m^2] <chr>   <chr>              <GEOMETRY [m]>      <dbl>
#> 1 10010032 10        3.49e9 A0A     10      MULTIPOLYGON (((8991045 …      0.890
#> 2 10010032 10        3.49e9 A0B     10      GEOMETRYCOLLECTION (POLY…      0.110

创建于 2024-09-23,使用 reprex v2.1.1

© www.soinside.com 2019 - 2024. All rights reserved.