在 R 中编写 SQL 函数

问题描述 投票:0回答:1

我同时使用 R 和 SQL。我在 DB2 服务器中有这个表 - 我在 R 中本地制作了一个更简单的版本:

library(sqldf)

mytable <- data.frame(
    name = c("Alice", "Alice", "Alice",      # Alice appears in all years
             "Bob", "Bob",                    # Bob appears in 2009-2010
             "Charlie", "Charlie",            # Charlie appears in 2010-2011
             "David", "David",                # David appears in 2009,2011
             "Eve"),                          # Eve appears in just 2009
    year = c(2009, 2010, 2011,               # Years for Alice
             2009, 2010,                      # Years for Bob
             2010, 2011,                      # Years for Charlie
             2009, 2011,                      # Years for David
             2009)                           # Year for Eve
)

我正在尝试找出以下内容:

有多少个名字是唯一的:(2009), (2010), (2011), (2009, 2010), (2009, 2011), (2010, 2011), (2009, 2010, 2011)?

我首先列出了最小-最大年份组合的列表:

year_range_query <- sqldf("
    SELECT MIN(year) as min_year, MAX(year) as max_year 
    FROM mytable")

min_year <- year_range_query$min_year
max_year <- year_range_query$max_year
years <- min_year:max_year
results <- data.frame()

现在,我正在尝试编写一个与 SQL 服务器和本地 R 环境进行通信的 R 函数:

for(r in 1:length(years)) {
    combinations <- combn(years, r)
    
    for(i in 1:ncol(combinations)) {
        years_in_combo <- combinations[,i]
        
        query <- sprintf("
            SELECT '%s' as years, COUNT(*) as matching_names
            FROM (
                SELECT name
                FROM mytable
                WHERE year IN (%s)
                GROUP BY name
                HAVING COUNT(DISTINCT year) = %d
            )
        ",
        paste(years_in_combo, collapse="-"),
        paste(years_in_combo, collapse=","),
        length(years_in_combo)
        )
        
        result <- sqldf(query)
        results <- rbind(results, result)
    }
}

代码运行了,但是当查看结果时,我可以看到存在重复项(例如 2009-2010 出现多次):

          years matching_names
1       2009-2010              2
2       2009-2011              2
3       2010-2011              2
4  2009-2010-2011              1
5       2009-2010              2
6       2009-2011              2
7       2010-2011              2
8  2009-2010-2011              1
9            2009              4
10           2010              3
11           2011              3
12      2009-2010              2
13      2009-2011              2
14      2010-2011              2
15 2009-2010-2011              1
16      2009-2010              2
17      2009-2011              2
18      2010-2011              2
19 2009-2010-2011              1
20      2009-2010              2
21      2009-2011              2
22      2010-2011              2
23 2009-2010-2011              1
24           2009              4
25           2010              3
26           2011              3
27      2009-2010              2
28      2009-2011              2
29      2010-2011              2
30 2009-2010-2011              1

有人可以帮助我理解我做错了什么吗?对于最终答案,如果我对计数列求和,它应该与原始表的计数(*)相匹配。请注意,我想在没有 GROUP_CONCAT 的情况下执行此操作,因为稍后我可能需要在不支持 GROUP_CONCAT 的不同服务器上使用此代码。


想法 1:我想避免这种情况,但我想我可以预先设计所有查询,然后启动它们:

    # manually determine the years
    years <- c(2009, 2010, 2011)
    
    generate_year_indicators <- function(years) {
        indicators <- sapply(years, function(y) {
            sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
        })
        paste(indicators, collapse = ",\n        ")
    }
    
    generate_combination_condition <- function(combo_years, all_years) {
        conditions <- sapply(all_years, function(y) {
            if(y %in% combo_years) {
                sprintf("has_%d = 1", y)
            } else {
                sprintf("has_%d = 0", y)
            }
        })
        paste(conditions, collapse = " AND ")
    }
    
    all_combinations <- list()
    for(r in 1:length(years)) {
        combos <- combn(years, r)
        for(i in 1:ncol(combos)) {
            all_combinations[[length(all_combinations) + 1]] <- combos[,i]
        }
    }
    
    base_query <- sprintf("
        SELECT 
            name,
            %s
        FROM mytable
        GROUP BY name
    ", generate_year_indicators(years))
    
    year_indicators <- sqldf(base_query)
    
    results <- data.frame(year_combination = character(0), count = numeric(0))
    
    for(combo in all_combinations) {
        combo_name <- paste(combo, collapse="_")
        count_query <- sprintf("
            SELECT '%s' as year_combination,
                   SUM(CASE WHEN %s THEN 1 ELSE 0 END) as count
            FROM year_indicators
        ", combo_name, generate_combination_condition(combo, years))
        combo_result <- sqldf(count_query)
        results <- rbind(results, combo_result)
    }
    
    results <- results[order(nchar(results$year_combination), 
                             results$year_combination), ]
    rownames(results) <- NULL
    
    print(results)

  year_combination count
1             2009     1
2             2010     0
3             2011     0
4        2009_2010     1
5        2009_2011     1
6        2010_2011     1
7   2009_2010_2011     1

想法 2:也许我可以让 R 生成完整 SQL 查询的文本,然后手动将其复制/粘贴到 SQL IDE 中?

    generate_full_sql_query <- function(years) {
        # First, let's create the year indicator columns
        year_indicators <- sapply(years, function(y) {
            sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
        })
        year_indicators_text <- paste(year_indicators, collapse = ",\n    ")
        
        # Generate all possible year combinations
        all_combinations <- list()
        for(r in 1:length(years)) {
            combos <- combn(years, r)
            for(i in 1:ncol(combos)) {
                all_combinations[[length(all_combinations) + 1]] <- combos[,i]
            }
        }
        
        # Create the WITH clause for the base query
        base_cte <- sprintf("WITH year_indicators AS (\n  SELECT \n    name,\n    %s\n  FROM mytable\n  GROUP BY name\n)", 
                            year_indicators_text)
        
        # Create UNION ALL queries for each combination
        combination_queries <- sapply(all_combinations, function(combo) {
            combo_name <- paste(combo, collapse="_")
            conditions <- sapply(years, function(y) {
                if(y %in% combo) {
                    sprintf("has_%d = 1", y)
                } else {
                    sprintf("has_%d = 0", y)
                }
            })
            condition_text <- paste(conditions, collapse = " AND ")
            
            sprintf("  SELECT\n    '%s' as year_combination,\n    COUNT(*) as count\n  FROM year_indicators\n  WHERE %s",
                    combo_name, condition_text)
        })
        
        # Combine everything into the final query
        final_query <- paste(
            base_cte,
            "SELECT * FROM (",
            paste(combination_queries, collapse = "\n  UNION ALL\n"),
            ") results",
            "ORDER BY LENGTH(year_combination), year_combination;",
            sep = "\n"
        )
        
        return(final_query)
    }
    
    years <- c(2009, 2010, 2011)
    cat(generate_full_sql_query(years))


# output

WITH year_indicators AS (
  SELECT 
    name,
    MAX(CASE WHEN year = 2009 THEN 1 ELSE 0 END) as has_2009,
    MAX(CASE WHEN year = 2010 THEN 1 ELSE 0 END) as has_2010,
    MAX(CASE WHEN year = 2011 THEN 1 ELSE 0 END) as has_2011
  FROM mytable
  GROUP BY name
)
SELECT * FROM (
  SELECT
    '2009' as year_combination,
    COUNT(*) as count
  FROM year_indicators
  WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 0
  UNION ALL
  SELECT
    '2010' as year_combination,
    COUNT(*) as count
  FROM year_indicators
  WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 0
  UNION ALL
  SELECT
    '2011' as year_combination,
    COUNT(*) as count
  FROM year_indicators
  WHERE has_2009 = 0 AND has_2010 = 0 AND has_2011 = 1
  UNION ALL
  SELECT
    '2009_2010' as year_combination,
    COUNT(*) as count
  FROM year_indicators
  WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 0
  UNION ALL
  SELECT
    '2009_2011' as year_combination,
    COUNT(*) as count
  FROM year_indicators
  WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 1
  UNION ALL
  SELECT
    '2010_2011' as year_combination,
    COUNT(*) as count
  FROM year_indicators
  WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 1
  UNION ALL
  SELECT
    '2009_2010_2011' as year_combination,
    COUNT(*) as count
  FROM year_indicators
  WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 1
) results
ORDER BY LENGTH(year_combination), year_combination;
sql r db2
1个回答
0
投票

只需将嵌套的

rbind()
移动到内循环即可。更好的是,不要在循环内运行
rbind
以避免二次复制。请参阅帕特里克·伯恩 (Patrick Burn) 的 R Inferno - Circle 2: Growing Objects (PDF):

由于

combn
支持函数参数,因此直接使用
simplify=FALSE
:

合并您的查询运行
years <- 2009:2011

query_run <- \(y) {        
  query <- sprintf(
    "
      SELECT '%s' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (%s)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = %d
      )
    ",
    paste(y, collapse="-"),
    paste(y, collapse=","),
    length(y)
  )
       
  cat(query)
  result <- sqldf(query)
  results <- rbind(results, result)
}

combinations <- lapply(
  seq_along(years), \(r) combn(years, r, query_run, simplify = FALSE)
)

cat
输出

      SELECT '2009' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2009)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 1
      )
    
      SELECT '2010' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2010)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 1
      )
    
      SELECT '2011' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2011)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 1
      )
    
      SELECT '2009-2010' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2009,2010)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 2
      )
    
      SELECT '2009-2011' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2009,2011)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 2
      )
    
      SELECT '2010-2011' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2010,2011)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 2
      )
    
      SELECT '2009-2010-2011' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2009,2010,2011)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 3
      )
© www.soinside.com 2019 - 2024. All rights reserved.