我同时使用 R 和 SQL。我在 DB2 服务器中有这个表 - 我在 R 中本地制作了一个更简单的版本:
library(sqldf)
mytable <- data.frame(
name = c("Alice", "Alice", "Alice", # Alice appears in all years
"Bob", "Bob", # Bob appears in 2009-2010
"Charlie", "Charlie", # Charlie appears in 2010-2011
"David", "David", # David appears in 2009,2011
"Eve"), # Eve appears in just 2009
year = c(2009, 2010, 2011, # Years for Alice
2009, 2010, # Years for Bob
2010, 2011, # Years for Charlie
2009, 2011, # Years for David
2009) # Year for Eve
)
我正在尝试找出以下内容:
有多少个名字是唯一的:(2009), (2010), (2011), (2009, 2010), (2009, 2011), (2010, 2011), (2009, 2010, 2011)?
我首先列出了最小-最大年份组合的列表:
year_range_query <- sqldf("
SELECT MIN(year) as min_year, MAX(year) as max_year
FROM mytable")
min_year <- year_range_query$min_year
max_year <- year_range_query$max_year
years <- min_year:max_year
results <- data.frame()
现在,我正在尝试编写一个与 SQL 服务器和本地 R 环境进行通信的 R 函数:
for(r in 1:length(years)) {
combinations <- combn(years, r)
for(i in 1:ncol(combinations)) {
years_in_combo <- combinations[,i]
query <- sprintf("
SELECT '%s' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (%s)
GROUP BY name
HAVING COUNT(DISTINCT year) = %d
)
",
paste(years_in_combo, collapse="-"),
paste(years_in_combo, collapse=","),
length(years_in_combo)
)
result <- sqldf(query)
results <- rbind(results, result)
}
}
代码运行了,但是当查看结果时,我可以看到存在重复项(例如 2009-2010 出现多次):
years matching_names
1 2009-2010 2
2 2009-2011 2
3 2010-2011 2
4 2009-2010-2011 1
5 2009-2010 2
6 2009-2011 2
7 2010-2011 2
8 2009-2010-2011 1
9 2009 4
10 2010 3
11 2011 3
12 2009-2010 2
13 2009-2011 2
14 2010-2011 2
15 2009-2010-2011 1
16 2009-2010 2
17 2009-2011 2
18 2010-2011 2
19 2009-2010-2011 1
20 2009-2010 2
21 2009-2011 2
22 2010-2011 2
23 2009-2010-2011 1
24 2009 4
25 2010 3
26 2011 3
27 2009-2010 2
28 2009-2011 2
29 2010-2011 2
30 2009-2010-2011 1
有人可以帮助我理解我做错了什么吗?对于最终答案,如果我对计数列求和,它应该与原始表的计数(*)相匹配。请注意,我想在没有 GROUP_CONCAT 的情况下执行此操作,因为稍后我可能需要在不支持 GROUP_CONCAT 的不同服务器上使用此代码。
想法 1:我想避免这种情况,但我想我可以预先设计所有查询,然后启动它们:
# manually determine the years
years <- c(2009, 2010, 2011)
generate_year_indicators <- function(years) {
indicators <- sapply(years, function(y) {
sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
})
paste(indicators, collapse = ",\n ")
}
generate_combination_condition <- function(combo_years, all_years) {
conditions <- sapply(all_years, function(y) {
if(y %in% combo_years) {
sprintf("has_%d = 1", y)
} else {
sprintf("has_%d = 0", y)
}
})
paste(conditions, collapse = " AND ")
}
all_combinations <- list()
for(r in 1:length(years)) {
combos <- combn(years, r)
for(i in 1:ncol(combos)) {
all_combinations[[length(all_combinations) + 1]] <- combos[,i]
}
}
base_query <- sprintf("
SELECT
name,
%s
FROM mytable
GROUP BY name
", generate_year_indicators(years))
year_indicators <- sqldf(base_query)
results <- data.frame(year_combination = character(0), count = numeric(0))
for(combo in all_combinations) {
combo_name <- paste(combo, collapse="_")
count_query <- sprintf("
SELECT '%s' as year_combination,
SUM(CASE WHEN %s THEN 1 ELSE 0 END) as count
FROM year_indicators
", combo_name, generate_combination_condition(combo, years))
combo_result <- sqldf(count_query)
results <- rbind(results, combo_result)
}
results <- results[order(nchar(results$year_combination),
results$year_combination), ]
rownames(results) <- NULL
print(results)
year_combination count
1 2009 1
2 2010 0
3 2011 0
4 2009_2010 1
5 2009_2011 1
6 2010_2011 1
7 2009_2010_2011 1
想法 2:也许我可以让 R 生成完整 SQL 查询的文本,然后手动将其复制/粘贴到 SQL IDE 中?
generate_full_sql_query <- function(years) {
# First, let's create the year indicator columns
year_indicators <- sapply(years, function(y) {
sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
})
year_indicators_text <- paste(year_indicators, collapse = ",\n ")
# Generate all possible year combinations
all_combinations <- list()
for(r in 1:length(years)) {
combos <- combn(years, r)
for(i in 1:ncol(combos)) {
all_combinations[[length(all_combinations) + 1]] <- combos[,i]
}
}
# Create the WITH clause for the base query
base_cte <- sprintf("WITH year_indicators AS (\n SELECT \n name,\n %s\n FROM mytable\n GROUP BY name\n)",
year_indicators_text)
# Create UNION ALL queries for each combination
combination_queries <- sapply(all_combinations, function(combo) {
combo_name <- paste(combo, collapse="_")
conditions <- sapply(years, function(y) {
if(y %in% combo) {
sprintf("has_%d = 1", y)
} else {
sprintf("has_%d = 0", y)
}
})
condition_text <- paste(conditions, collapse = " AND ")
sprintf(" SELECT\n '%s' as year_combination,\n COUNT(*) as count\n FROM year_indicators\n WHERE %s",
combo_name, condition_text)
})
# Combine everything into the final query
final_query <- paste(
base_cte,
"SELECT * FROM (",
paste(combination_queries, collapse = "\n UNION ALL\n"),
") results",
"ORDER BY LENGTH(year_combination), year_combination;",
sep = "\n"
)
return(final_query)
}
years <- c(2009, 2010, 2011)
cat(generate_full_sql_query(years))
# output
WITH year_indicators AS (
SELECT
name,
MAX(CASE WHEN year = 2009 THEN 1 ELSE 0 END) as has_2009,
MAX(CASE WHEN year = 2010 THEN 1 ELSE 0 END) as has_2010,
MAX(CASE WHEN year = 2011 THEN 1 ELSE 0 END) as has_2011
FROM mytable
GROUP BY name
)
SELECT * FROM (
SELECT
'2009' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 0
UNION ALL
SELECT
'2010' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 0
UNION ALL
SELECT
'2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 0 AND has_2010 = 0 AND has_2011 = 1
UNION ALL
SELECT
'2009_2010' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 0
UNION ALL
SELECT
'2009_2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 1
UNION ALL
SELECT
'2010_2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 1
UNION ALL
SELECT
'2009_2010_2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 1
) results
ORDER BY LENGTH(year_combination), year_combination;
只需将嵌套的
rbind()
移动到内循环即可。更好的是,不要在循环内运行 rbind
以避免二次复制。请参阅帕特里克·伯恩 (Patrick Burn) 的 R Inferno - Circle 2: Growing Objects (PDF):
由于
combn
支持函数参数,因此直接使用 simplify=FALSE
: 合并您的查询运行
years <- 2009:2011
query_run <- \(y) {
query <- sprintf(
"
SELECT '%s' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (%s)
GROUP BY name
HAVING COUNT(DISTINCT year) = %d
)
",
paste(y, collapse="-"),
paste(y, collapse=","),
length(y)
)
cat(query)
result <- sqldf(query)
results <- rbind(results, result)
}
combinations <- lapply(
seq_along(years), \(r) combn(years, r, query_run, simplify = FALSE)
)
cat
输出
SELECT '2009' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2009)
GROUP BY name
HAVING COUNT(DISTINCT year) = 1
)
SELECT '2010' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2010)
GROUP BY name
HAVING COUNT(DISTINCT year) = 1
)
SELECT '2011' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2011)
GROUP BY name
HAVING COUNT(DISTINCT year) = 1
)
SELECT '2009-2010' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2009,2010)
GROUP BY name
HAVING COUNT(DISTINCT year) = 2
)
SELECT '2009-2011' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2009,2011)
GROUP BY name
HAVING COUNT(DISTINCT year) = 2
)
SELECT '2010-2011' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2010,2011)
GROUP BY name
HAVING COUNT(DISTINCT year) = 2
)
SELECT '2009-2010-2011' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2009,2010,2011)
GROUP BY name
HAVING COUNT(DISTINCT year) = 3
)