如何让函数运行得更快?

问题描述 投票:0回答:1

我写了函数

find_one
,它的功能是找出重复出现的变量的值,但是当数据框中的行数大于10000时,它运行缓慢。我想修改这个函数以减少运行时间。有什么办法可以达到这样的效果吗? 我是R的初学者,这个功能是AI提供的。我很难找出它的问题所在。 非常感谢您的帮助和指正!

find_one <- function(x, y, z, df){
  library(data.table)
  start_time <- Sys.time() 
  
  data <- data.table(
    id = df[[x]],
    name = df[[y]],
    n = df[[z]]
  )
  
  # Step 1: Create bidirectional relationships with "n"
  relationships <- unique(rbind(
    data[, .(key = id, value = name, n = n)], # id -> name
    data[, .(key = name, value = id, n = n)] # name -> id
  ))
  
  # Step 2: Build clusters of relationships
  graph <- relationships[, .(key, value, n)]
  clusters <- list()
  while (nrow(graph) > 0) {
    # Initialize a cluster
    cluster <- graph[1]
    graph <- graph[-1]
    
    # Expand the cluster
    while (TRUE) {
      new_links <- graph[key %in% cluster$value | value %in% cluster$key]
      if (nrow(new_links) == 0) break
      
      cluster <- rbind(cluster, new_links)
      graph <- graph[!(key %in% cluster$key & value %in% cluster$value)]
    }
    clusters <- append(clusters, list(cluster))
  }
  
  # Step 3: Extract valid records (optimized)
  valid_records <- lapply(clusters, function(cluster) {
    # Identify rows corresponding to `id` and `name` in one go
    is_id <- cluster$key %in% data$id
    is_name <- cluster$value %in% data$name
    
    ids <- unique(cluster$key[is_id])
    names <- unique(cluster$value[is_name])
    ns <- unique(cluster$n)  # Row indices
    
    list(id = ids, name = names, n = ns)
  })
  
  # Step 4: Convert to the desired structure
  result <- tibble(
    id = lapply(valid_records, function(x) as.character(x$id)),
    name = lapply(valid_records, function(x) as.character(x$name)),
    n = lapply(valid_records, function(x) as.integer(x$n))
  )
  
  end_time <- Sys.time() 
  (end_time - start_time) %>% print() 
  
  return(result)
}
data = data.frame(
  v1 = c("a", "a", "a", "b", "c", "c", "d", "e"),
  v2 = c("123", "123", "124", "124", "125", "126", "127", "128"),
  v3 = 1:8
)

desired_result <- find_one(x = "v1", y = "v2", z = "v3", df = data)

desired_result  <- structure(list(id = list(c("a", "b"), "c", "d", "e"), name = list( c("123", "124"), c("125", "126"), "127", "128"), n = list( 1:4, 5:6, 7L, 8L)), class = c("tbl_df", "tbl", "data.frame" ), row.names = c(NA, -4L))
r data.table
1个回答
0
投票

您可以使用

igraph
解决此问题:

a <- igraph::components(igraph::graph_from_data_frame(data))$membership
b <- aggregate(data, list(a[as.character(data[[1]])]), \(x)unique(x))[-1]
names(b) <- c('id', 'name', 'n')

    id     name          n
1 a, b 123, 124 1, 2, 3, 4
2    c 125, 126       5, 6
3    d      127          7
4    e      128          8

请注意,

aggregate
非常慢。我宁愿只保留数据与分组因素而不是嵌套它。


您可以检查等效性:

all.equal(desired_result, structure(b, class = class(desired_result)))
[1] TRUE
© www.soinside.com 2019 - 2024. All rights reserved.