我正在使用 R 中的数据集,我想创建一个函数,根据四分位数将值分为三个类别。低于第一四分位数 (Q1) 的值应标记为“低表达”,高于第三四分位数 (Q3) 的值应标记为“高表达”,其他所有值应标记为“NA”。
我编写了以下函数,但在函数内部,过滤步骤似乎忽略了具有低值(低于 Q1)和 NA 值的行。有人可以帮我找出问题所在吗?
# Keep only the rows of `df` whose `var_1` value lies in the outer quartiles
# and label each kept row as low or high expression.
#
# Args:
#   df:    A data frame with at least seven columns that also contains the
#          column named by `var_1`.
#   var_1: Name of the numeric column to classify, as a single character
#          string (e.g. "gene_name").
#
# Returns:
#   Columns 1-7 of `df` plus `var_1` and a character column `expression`.
#   Rows with Q1 <= value < Q3, and rows whose value is NA, are dropped.
fun_1 <- function(df, var_1) {
  stopifnot(
    is.data.frame(df),
    is.character(var_1),
    length(var_1) == 1L,
    var_1 %in% names(df)
  )

  # separating quartiles
  quant_25 <- quantile(df[[var_1]], 0.25, names = FALSE, na.rm = TRUE)
  quant_75 <- quantile(df[[var_1]], 0.75, names = FALSE, na.rm = TRUE)

  # selecting columns of importance
  tmp_1 <- dplyr::select(df, 1:7, dplyr::all_of(var_1))

  # creating a new column classifying quartiles.
  # `var_1` is a string, so the column has to be looked up with
  # `.data[[var_1]]`; `{{ var_1 }}` would inject the string itself and compare
  # the column *name* with the quartiles as text.
  tmp_1 <- dplyr::mutate(
    tmp_1,
    expression = dplyr::case_when(
      # strict `<` so the condition agrees with the "<" shown in the label
      .data[[var_1]] < quant_25 ~ paste(
        "Low expression ( < ", quant_25, ")"
      ),
      .data[[var_1]] >= quant_75 ~ paste(
        "High expression ( >= ", quant_75, ")"
      ),
      TRUE ~ NA_character_
    )
  )

  # drop everything that was not classified as low or high
  dplyr::filter(tmp_1, !is.na(.data$expression))
}
# Example call: the column to classify is passed as a character string.
fun_1(dataset_name, "gene_name")
运行代码后,我得到以下输出:函数的输出:
但是,我希望该函数只删除介于第一四分位数 (Q1) 和第三四分位数 (Q3) 之间的值,同时保留低表达值(低于 Q1)。
编辑:
# Build a 20-row example data frame of random numeric columns.
# NOTE: `colnames` is only used for its length; the columns keep the default
# names X1..X9 assigned by `data.frame(matrix(...))`.
colnames <- c(
  "sample_id", "cell_type", "treatment", "replicate", "time_point",
  "read_count", "normalized_count", "fpkm", "gene_name"
)
dataset_name <- data.frame(matrix(nrow = 20, ncol = length(colnames)))
for (i in seq_along(colnames)) {
  # Plain `if`/`else` instead of `ifelse()`: `ifelse()` returns a result shaped
  # like its test, so with a scalar test it kept only the first random draw and
  # that single value was recycled down the whole column.
  lower_bound <- if (i == length(colnames)) 0.1 else 1
  dataset_name[, i] <- round(
    runif(20, min = lower_bound, max = 100),
    digits = 3
  )
}
print(dataset_name)
# Label the rows of `df` that fall in the outer quartiles of column `var_1`
# and discard the rest.
#
# Args:
#   df:    A data frame with at least seven columns that also contains the
#          column named by `var_1`.
#   var_1: Name of the numeric column to classify, as a single character
#          string (e.g. "X9").
#
# Returns:
#   Columns 1-7 of `df` plus `var_1` and a character column `expression`
#   ("Low expression ( < Q1 )" / "High expression ( >= Q3 )"). Rows with
#   Q1 <= value < Q3, and rows whose value is NA, are dropped.
#
# dplyr is called through `dplyr::` so the function does not attach a package
# as a side effect of being called.
fun_2 <- function(df, var_1) {
  stopifnot(
    is.data.frame(df),
    is.character(var_1),
    length(var_1) == 1L,
    var_1 %in% names(df)
  )

  quants <- quantile(df[[var_1]], c(0.25, 0.75), names = FALSE, na.rm = TRUE)

  kept_cols <- dplyr::select(df, 1:7, dplyr::all_of(var_1))

  # `var_1` holds a column name as a string, so index the data with
  # `.data[[var_1]]`; `{{ var_1 }}` would compare the string itself with the
  # quartiles instead of the column values.
  labelled <- dplyr::mutate(
    kept_cols,
    expression = dplyr::case_when(
      .data[[var_1]] < quants[1] ~
        sprintf("Low expression ( < %.02f )", quants[1]),
      .data[[var_1]] >= quants[2] ~
        sprintf("High expression ( >= %.02f )", quants[2]),
      .default = NA_character_
    )
  )

  # keep only the rows that received a label
  dplyr::filter(labelled, !is.na(.data$expression))
}
# Example call on column "X9"; the commented rows that follow show the printed result.
fun_2(dataset_name, "X9")
# X1 X2 X3 X4 X5 X6 X7 X9 expression
# 1 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 2 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 3 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 4 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 5 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 6 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 7 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 8 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 9 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 10 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 11 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 12 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 13 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 14 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 15 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 16 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 17 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 18 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 19 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
# 20 75.782 58.059 8.712 45.323 76.409 83.199 84.955 84.366 High expression ( >= 84.37 )
请注意,`filter` 没有执行任何操作,因为(在此数据集的任何列中)没有任何数据位于四分位数之间:样本数据的每一列都只包含一个重复的值,所以 Q1 与 Q3 相等,所有行都满足 `>= Q3`。
您的样本数据:
# Sample data: 20 rows, nine numeric columns (X1..X9), each column holding a
# single value repeated on every row.
dataset_name <- data.frame(
  X1 = rep(75.782, 20L),
  X2 = rep(58.059, 20L),
  X3 = rep(8.712, 20L),
  X4 = rep(45.323, 20L),
  X5 = rep(76.409, 20L),
  X6 = rep(83.199, 20L),
  X7 = rep(84.955, 20L),
  X8 = rep(86.255, 20L),
  X9 = rep(84.366, 20L)
)