我是编程和 R 语言的新手。我的数据集样本如下:
library(tidyr)
library(dplyr)
# Sample data: for each gene, rows whose `column` is a bare number ("1", "2")
# hold a COG code, while the "<number>_description" rows hold the text that
# describes that code.
subsest_df <- data.frame(
  Condition = rep("Oxidative", 8),
  gene_name = rep(c("Atu0472", "Atu0477", "Atu0479"), times = c(2, 2, 4)),
  column = c(
    "1", "1_description",
    "1", "1_description",
    "1", "2", "1_description", "2_description"
  ),
  COG = c(
    "M", "Cell wall/membrane/envelope biogenesis",
    NA, NA,
    "E", "T", "Amino acid metabolism and transport", " Signal transduction"
  ),
  COG_description = c(
    NA, "Cell wall/membrane/envelope biogenesis",
    NA, NA,
    NA, NA, "Amino acid metabolism and transport", "Signal transduction"
  )
)
我想将 COG_description 列中的非 NA 值移动到与“column”列中相应的“1”或“2”位于同一行,而不是“1_description”或“2_description”行,但保持“Condition”和“gene_name”的相同匹配值。我想要的输出应该是这样的:
# Desired result: one row per numbered entry of each gene, with the text from
# the matching "<number>_description" row moved into COG_description.
ideal_df <- data.frame(
  Condition = c("Oxidative", "Oxidative", "Oxidative", "Oxidative"),
  gene_name = c("Atu0472", "Atu0477", "Atu0479", "Atu0479"),
  column = c("1", "1", "1", "2"),
  # Atu0477 has no COG code in the sample data, so this is a real missing
  # value (NA), not the string "NA"; is.na() and comparisons against a
  # correctly transformed result would otherwise fail on this row.
  COG = c("M", NA, "E", "T"),
  COG_description = c(
    "Cell wall/membrane/envelope biogenesis",
    NA,
    "Amino acid metabolism and transport",
    "Signal transduction"
  )
)
在我们友好的机器人 ChatGPT 的帮助下,我们为此写出了一个循环结构,但它最终失败了,输出为空;而我的水平还不足以找出失败发生的位置或原因,也不知道该如何更好地提问。
# Build `transformed_df`: one row per numbered entry ("1", "2", ...) of each
# Condition/gene_name pair, with COG_description taken from the matching
# "<number>_description" row.
#
# Problems in the previous row-by-row loop that this version fixes:
#   * it read `subset_df`, but the data frame defined above is `subsest_df`;
#   * it attached each description to *all* numbered rows of the gene, so a
#     gene with both "1" and "2" produced duplicated, mismatched rows;
#   * numbered rows without any description row were dropped;
#   * it grew the result with rbind() inside a `1:nrow()` loop.

# Split the data into numbered rows and description rows.
is_numbered <- grepl("^\\d+$", subsest_df$column)
is_description <- grepl("_description$", subsest_df$column)
numbered_rows <- subsest_df[is_numbered, , drop = FALSE]
description_rows <- subsest_df[is_description, , drop = FALSE]

# Key every row by Condition, gene_name and the bare number, so that
# "1_description" lines up with the "1" row of the same gene and condition.
numbered_key <- paste(
  numbered_rows$Condition,
  numbered_rows$gene_name,
  numbered_rows$column,
  sep = "\t"
)
description_key <- paste(
  description_rows$Condition,
  description_rows$gene_name,
  sub("_description$", "", description_rows$column),
  sep = "\t"
)

# match() yields NA where a numbered row has no description row; indexing
# with NA then leaves COG_description as NA for that row.
transformed_df <- numbered_rows
transformed_df$COG_description <-
  description_rows$COG_description[match(numbered_key, description_key)]
rownames(transformed_df) <- NULL

# View the transformed dataframe
print(transformed_df)
提前感谢您的帮助!
这是使用 dplyr 的解决方案:先将数据拆分为描述行和非描述行,再把描述行中的描述连接(join)回对应的非描述行。
library(dplyr)
# Same sample data as in the question: numbered rows ("1", "2") carry the COG
# code, "<number>_description" rows carry the corresponding description text.
subsest_df <- data.frame(
  Condition = rep("Oxidative", times = 8),
  gene_name = c(rep("Atu0472", 2), rep("Atu0477", 2), rep("Atu0479", 4)),
  column = c(
    "1", "1_description",
    "1", "1_description",
    "1", "2", "1_description", "2_description"
  ),
  COG = c(
    "M", "Cell wall/membrane/envelope biogenesis",
    NA, NA,
    "E", "T", "Amino acid metabolism and transport", " Signal transduction"
  ),
  COG_description = c(
    NA, "Cell wall/membrane/envelope biogenesis",
    NA, NA,
    NA, NA, "Amino acid metabolism and transport", "Signal transduction"
  )
)
# Create a lookup using the description rows. The "_description" suffix is
# stripped so that "1_description" carries the same `column` value as the "1"
# row it describes. Condition is kept in the lookup so that a gene measured
# under several conditions cannot pick up another condition's description.
description_lookup <- subsest_df %>%
  filter(grepl("_description$", column)) %>%
  mutate(column = sub("_description$", "", column)) %>%
  select(Condition, gene_name, column, COG_description)
# Join it to the non-description rows of the original data. The original
# COG_description column is dropped first so the joined one replaces it;
# left_join() keeps numbered rows that have no description (they get NA).
want <- subsest_df %>%
  filter(!grepl("_description$", column)) %>%
  select(-COG_description) %>%
  left_join(description_lookup, by = c("Condition", "gene_name", "column"))