我有以下 R 代码,借用了 here,它生成了一个可重现的
tibble
表:
# Install/load packages only if needed
# ************************************
if (!require("pacman")) install.packages("pacman")
pacman::p_load(dplyr, expss, ggplot2, grid, purrr, rlang, tibble)
# Data Generation
# ***************
# Set the seed for reproducibility
set.seed(123)
# Generate random data
n <- 490
PTSD <- sample(c(1, 2, NA), n, replace = TRUE) #class(PTSD) = "numeric"
ANX <- sample(c(1, 2, NA), n, replace = TRUE) #class(ANX) = "numeric"
DEP <- sample(c(1, 2, NA), n, replace = TRUE) #class(DEP) = "numeric"
# Create the data frame
df <- data.frame(PTSD, ANX, DEP) #class(df) = "data.frame"
# Label the values: 1 = Low, 2 = High
expss::val_lab(df$PTSD) = expss::num_lab("1 Low\n2 High")
expss::val_lab(df$ANX) = expss::num_lab("1 Low\n2 High")
expss::val_lab(df$DEP) = expss::num_lab("1 Low\n2 High")
# Create a list of tables for each variable to count 1s, 2s, and NAs
count_results <- list(
PTSD = table(df$PTSD, useNA = "ifany"),
ANX = table(df$ANX, useNA = "ifany"),
DEP = table(df$DEP, useNA = "ifany")
)
# Frequency count and data summary
# ********************************
# Combine the count tables into a single table
count_table <- do.call(rbind, count_results)
# Initialize empty vectors to store results
variable_names <- character()
sample_sizes <- numeric()
# Loop through the test results and extract relevant information
for (variable_name in names(count_results)) {
sample_sizes <- c(sample_sizes, sum(count_results[[variable_name]]))
variable_names <- c(variable_names, variable_name)
}
# Create summary data frame
summary_df <- data.frame(
Variable = variable_names,
N = sample_sizes
)
# Combine the count table and chi-squared summary table by columns
final_result <- cbind(count_table, summary_df)
# Remove Variable column in the middle of the table
final_result <- subset(final_result, select = -c(Variable))
# Combination of CMDs (CMD ≥ 1)
# *****************************
cmd <- c("PTSD","ANX","DEP")
combs <- map(seq_along(cmd),\(n)combn(cmd,n,simplify = FALSE)) |> purrr::flatten()
filts <- rlang::parse_exprs(map_chr(combs,\(x)paste0(x ,'== 2',collapse=' & ')))
filtsnames <- rlang::parse_exprs(map_chr(combs,\(x)paste0(x ,collapse=' + ')))
names(filts) <- filtsnames
output <- purrr::map_int(filts,\(x){
df %>%
mutate(id = row_number())%>%
filter(!!(x))%>%
summarise(
n = n())
} |> pull(n)
)
tibble::enframe(output)
tibble
表的输出应该显示N = 490
中有多少人患有以下常见精神障碍(CMD),即仅PTSD、仅ANX、仅DEP、PTSD和ANX、PTSD和DEP ,ANX 和 DEP,以及所有 3 个 CMD:
# A tibble: 7 × 2
name value
<chr> <int>
1 PTSD 167
2 ANX 156
3 DEP 156
4 PTSD + ANX 56
5 PTSD + DEP 52
6 ANX + DEP 51
7 PTSD + ANX + DEP 23
我想以图形方式可视化该表,因此我考虑生成维恩图。我期望在图中看到的内容如下。
期望清单:
但是,虽然所有代码(下面的示例)都没有生成任何技术错误(即 R 代码错误),但我尝试的所有软件包(
VennDiagram
和 ggVennDiagram
)都没有显示出预期的结果(参见 Expectation list
)。
下面是用于生成 4 个不同维恩图的 4 个代码,其中没有一个给出
Expectation list
中概述的结果:
使用包
VennDiagram
版本1
pacman::p_load(VennDiagram)
# Move to new plotting page
grid::grid.newpage()
# Calculate percentages
total_samples <- nrow(df)
percentages <- output / total_samples * 100
venn.plot <- VennDiagram::draw.triple.venn(
area1 = output["PTSD"],
area2 = output["ANX"],
area3 = output["DEP"],
n12 = output["PTSD + ANX"],
n23 = output["ANX + DEP"],
n13 = output["PTSD + DEP"],
n123 = output["PTSD + ANX + DEP"],
category = c("PTSD", "ANX", "DEP"),
fill = c("red", "green", "blue"),
lty = "blank",
cex = rep(1.5,7),
cat.cex = rep(1.5,3),
cat.pos = c(-20,-40,-60),
cat.dist = c(0.05,0.05,0.05),
ind = TRUE,
euler.d =TRUE,
)
grid.draw(venn.plot)
使用套件
VennDiagram
2
pacman::p_load(VennDiagram)
# Move to new plotting page
grid::grid.newpage()
# Use pre-calculated values from 'output'
VennDiagram::draw.triple.venn(
area1 = output["PTSD"],
area2 = output["ANX"],
area3 = output["DEP"],
n12 = output["PTSD + ANX"] + output["PTSD + ANX + DEP"], # Adjust for overlaps
n23 = output["ANX + DEP"] + output["PTSD + ANX + DEP"], # Adjust for overlaps
n13 = output["PTSD + DEP"] + output["PTSD + ANX + DEP"], # Adjust for overlaps
n123 = output["PTSD + ANX + DEP"],
category = c("PTSD", "ANX", "DEP"),
col = "Red", fill = c("Green", "Yellow", "Blue"),
cex = 1.5, cat.cex = 1.5, cat.pos = c(-20, 20, 180)
)
使用套件
VennDiagram
3
pacman::p_load(VennDiagram)
# Calculate exclusive counts for Venn diagram
ptsd_only <- output["PTSD"] - output["PTSD + ANX"] - output["PTSD + DEP"] + output["PTSD + ANX + DEP"]
anx_only <- output["ANX"] - output["PTSD + ANX"] - output["ANX + DEP"] + output["PTSD + ANX + DEP"]
dep_only <- output["DEP"] - output["PTSD + DEP"] - output["ANX + DEP"] + output["PTSD + ANX + DEP"]
ptsd_anx <- output["PTSD + ANX"] - output["PTSD + ANX + DEP"]
ptsd_dep <- output["PTSD + DEP"] - output["PTSD + ANX + DEP"]
anx_dep <- output["ANX + DEP"] - output["PTSD + ANX + DEP"]
ptsd_anx_dep <- output["PTSD + ANX + DEP"]
# Move to new plotting page
grid::grid.newpage()
# Create Venn diagram with 3 sets using adjusted values
VennDiagram::draw.triple.venn(
area1 = ptsd_only,
area2 = anx_only,
area3 = dep_only,
n12 = ptsd_anx,
n23 = anx_dep,
n13 = ptsd_dep,
n123 = ptsd_anx_dep,
category = c("PTSD", "ANX", "DEP"),
col = "Red", fill = c("Green", "Yellow", "Blue"),
cex = 1.5, cat.cex = 1.5, cat.pos = c(-20, 20, 180)
)
使用套件
ggVennDiagram
pacman::p_load(ggVennDiagram)
# Prepare data for Venn diagram
venn_data <- list(
PTSD = which(df$PTSD == 2),
ANX = which(df$ANX == 2),
DEP = which(df$DEP == 2)
)
# Create Venn diagram with ggVennDiagram
ggVennDiagram(venn_data) +
ggplot2::scale_fill_gradient(low = "white", high = "darkgrey") +
theme_void()
我的问题:除了手工绘制图表之外,有没有一种方法可以用 R 生成维恩图,它反映的结果与
tibble
表中的结果相同?
(或者我错过了关于维恩图生成的要点?)
这是绘制维恩图的简单方法,表示这些疾病何时出现
High
。
library(dplyr)
library(ggplot2)
library(ComplexUpset)
set.seed(123)
# Generate random data
n <- 490
PTSD <- sample(c(1, 2, NA), n, replace = TRUE) #class(PTSD) = "numeric"
ANX <- sample(c(1, 2, NA), n, replace = TRUE) #class(ANX) = "numeric"
DEP <- sample(c(1, 2, NA), n, replace = TRUE) #class(DEP) = "numeric"
# Create the data frame
disorders <- c('PTSD', 'ANX', 'DEP')
df <- data.frame(PTSD, ANX, DEP)
# Create boolean indicators where "High" == TRUE
df[disorders] = df[disorders] == 2
# Either drop rows with NA
# df = na.omit(df)
# or impute missing values
df[is.na(df)] <- FALSE
glimpse(df)
#> Rows: 490
#> Columns: 3
#> $ PTSD <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE,…
#> $ ANX <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
#> $ DEP <lgl> FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE,…
ggplot() +
theme_void() +
coord_fixed() +
geom_venn_circle(df, sets = disorders, size = 1) +
geom_venn_label_set(df, sets = disorders, aes(label = region), outwards_adjust = 2) +
geom_venn_label_region(df, sets = disorders, aes(label = size))
此处的交叉点大小与您的期望列表不符,因为您错误定义了它们
Intersection of all 3 = 23
正确
Intersection of ANX and DEP = 51
Intersection of PTSD and DEP = 52
Intersection of PTSD and ANX = 56
仅当您包括所有疾病均高的 23 种情况时
DEP only = 156
ANX only = 156
PTSD only = 167
这些都不匹配仅无序度较高的行数。将维恩图中的结果与计数表进行比较。
df |>
count(PTSD, ANX, DEP)
#> PTSD ANX DEP n
#> 1 FALSE FALSE FALSE 147
#> 2 FALSE FALSE TRUE 76
#> 3 FALSE TRUE FALSE 72
#> 4 FALSE TRUE TRUE 28
#> 5 TRUE FALSE FALSE 82
#> 6 TRUE FALSE TRUE 29
#> 7 TRUE TRUE FALSE 33
#> 8 TRUE TRUE TRUE 23