我有如下数据框(data frame):
# Sample data with columns ID, Date1 (date-time string), T1 ("A"/"B"),
# Date2 (date string) and Val1 (integer).
# Every Date1 value is wrapped in a matching pair of single quotes so that
# read.table() keeps the space between date and time inside one field.
# NOTE(review): row A-18 has a seconds value of 84, which is not a valid clock
# time; it is kept exactly as posted -- confirm the intended value.
my_data <- read.table(text =
"ID Date1 T1 Date2 Val1
A-1 '2018-01-10 15:05:24' A 2018-01-15 10
A-2 '2018-01-05 14:15:22' B 2018-01-14 12
A-3 '2018-01-04 13:20:21' A 2018-01-13 15
A-4 '2018-01-01 18:35:45' B 2018-01-12 22
A-5 '2017-12-28 19:45:10' A 2018-01-11 18
A-6 '2017-12-10 08:03:29' A 2018-01-10 21
A-7 '2017-12-06 20:55:55' A 2018-01-09 28
A-8 '2018-01-10 10:02:12' A 2018-01-15 10
A-9 '2018-01-05 17:15:14' B 2018-01-14 12
A-10 '2018-01-04 18:35:58' A 2018-01-13 15
A-11 '2018-01-01 21:09:25' B 2018-01-12 22
A-12 '2017-12-28 02:12:22' A 2018-01-11 18
A-13 '2017-12-10 03:45:44' A 2018-01-10 21
A-14 '2017-12-06 07:15:25' A 2018-01-09 28
A-18 '2017-10-07 08:02:84' B 2017-11-05 20
A-21 '2017-10-01 06:04:04' A 2017-10-20 15
A-51 '2017-09-20 08:07:06' A 2017-09-28 10
A-35 '2017-09-14 08:02:45' A 2017-09-25 20
A-30 '2017-08-10 15:03:08' A 2017-08-30 25",
header = TRUE, stringsAsFactors = FALSE)
我正在使用下面提到的代码来获得如下所述的输出:
# Grid of filler rows: one timestamp per month step between the earliest and
# the latest parsed Date1, crossed with both T1 categories.
date_range <- expand.grid(
  Date1 = seq(
    from = min(ymd_hms(my_data$Date1)),
    to = max(ymd_hms(my_data$Date1)),
    by = "1 month"
  ),
  T1 = c("A", "B"),
  stringsAsFactors = FALSE
)
# Build one per-month summary for T1 == "A" and one for T1 == "B" with the
# same pipeline, then merge the two summaries side by side on Month.
table_2 <- merge(
# ---- summary for T1 == "A" ----
my_data %>%
mutate(Date2 = ymd(Date2),
Date1 = ymd_hms(Date1)) %>%
full_join(date_range, by = c("Date1", "T1")) %>% # add the date_range filler rows (their Date2/Val1 are NA)
arrange(Date1) %>% # sort chronologically
mutate(Month = paste(month(Date1, label = TRUE), year(Date1), sep = "-"),
row_number = row_number(), # position in chronological order, used later to order the months
Val1 = coalesce(Val1, 0L)) %>% # replace NA with 0 in Val1 (this includes the filler rows)
filter(T1 == "A") %>%
group_by(Month) %>%
summarise("# of A" = sum(!is.na(Date2)), # counts only rows that have a Date2
"sum of A" = sum(Val1, na.rm = TRUE),
"Mean of A" = mean(Val1, na.rm = TRUE), # mean over every row of the group, zero-filled rows included
"Avg Time of A" = round(mean(difftime(Date2, Date1),
na.rm = TRUE), # mean of Date2 - Date1, ignoring NA
2),
row_number = min(row_number)) %>% # smallest row number of the month
arrange(row_number) %>% # sort by row number (to sort months)
# Month-over-month growth: (current - previous) / previous; a missing previous
# value is treated as 0 in the numerator because of na.rm = TRUE.
# NOTE(review): the "# of A" growth is not multiplied by 100 while the
# "sum of A" growth is -- confirm which scale is intended.
mutate("MOM Growth # of A" = round(apply(cbind(`# of A`, lag(- `# of A`)),
1, sum, na.rm = TRUE) / lag(`# of A`), 2),
"MOM Growth sum of A" = round(apply(cbind(`sum of A`, lag(- `sum of A`)),
1, sum, na.rm = TRUE) / lag(`sum of A`) * 100, 2)) %>%
mutate("MOM Growth # of A" = if_else(is.infinite(`MOM Growth # of A`), 100, `MOM Growth # of A`), # replace Inf (division by a previous value of 0) with 100
"MOM Growth sum of A" = if_else(is.infinite(`MOM Growth sum of A`), 100, `MOM Growth sum of A`)) %>%
select(Month, `# of A`, `MOM Growth # of A`,
`sum of A`, `MOM Growth sum of A`,
`Mean of A`, `Avg Time of A`),
# ---- summary for T1 == "B" (same steps as for "A") ----
my_data %>%
mutate(Date2 = ymd(Date2),
Date1 = ymd_hms(Date1)) %>%
full_join(date_range, by = c("Date1", "T1")) %>%
arrange(Date1) %>%
mutate(Month = paste(month(Date1, label = TRUE), year(Date1), sep = "-"),
row_number = row_number(),
Val1 = coalesce(Val1, 0L)) %>%
filter(T1 == "B") %>%
group_by(Month) %>%
summarise("# of B" = sum(!is.na(Date2)),
"sum of B" = sum(Val1, na.rm = TRUE),
"Mean of B" = mean(Val1, na.rm = TRUE), # mean over every row of the group, zero-filled rows included
"Avg Time of B" = round(mean(difftime(Date2, Date1),
na.rm = TRUE),
2),
row_number = min(row_number)) %>%
arrange(row_number) %>%
# NOTE(review): as for "A", only the "sum of B" growth is multiplied by 100.
mutate("MOM Growth # of B" = round(apply(cbind(`# of B`, lag(- `# of B`)),
1, sum, na.rm = TRUE) / lag(`# of B`), 2),
"MOM Growth sum of B" = round(apply(cbind(`sum of B`, lag(- `sum of B`)),
1, sum, na.rm = TRUE) / lag(`sum of B`) * 100, 2)) %>%
mutate("MOM Growth # of B" = if_else(is.infinite(`MOM Growth # of B`), 100, `MOM Growth # of B`),
"MOM Growth sum of B" = if_else(is.infinite(`MOM Growth sum of B`), 100, `MOM Growth sum of B`)) %>%
select(Month, `# of B`, `MOM Growth # of B`,
`sum of B`, `MOM Growth sum of B`,
`Mean of B`, `Avg Time of B`),
by = "Month",
all = TRUE,
sort = FALSE) # do not re-sort on the key column; intended to keep the month order
在计算 `Mean of A` 和 `Mean of B` 时,对于完整的月份,它给出了错误的平均值。(例如,如果 `# of A` 是 10,`sum of A` 是 100,那么平均值应该是 10,但它给出的是 9.09。我不知道原因,但对于所有完整的月份,它在计算平均值时似乎自动给 `# of A` 加了 1。)
您定义的 `# of A` 并不是 `mean` 函数实际使用的分母。因为当 `Val1` 中的缺失值被替换为 0 之后,`mean` 会除以该组中所有行的数量(包括那些原本为 NA、后来被填充为 0 的行)。我相信如果你移除 `Val1 = coalesce(Val1, 0L)`,你将得到你想要的结果。
试试这个例子:
# The trailing 0 is a real observation, so it counts in the denominator:
# 30 / 6 = 5
x <- c(3, 4, 6, 8, 9, 0)
mean(x, na.rm = TRUE)
然后这个:
# The NA is dropped by na.rm = TRUE, so only five values are averaged:
# 30 / 5 = 6
x <- c(3, 4, 6, 8, 9, NA)
mean(x, na.rm = TRUE)
编辑:
如果你坚持用零替换NA,你也可以做这样的事情来计算平均值:
# Same per-month summary for T1 == "A" as before, but the mean is derived from
# the two summary columns (sum / count) instead of calling mean() on the
# zero-filled Val1 column.
my_data %>%
mutate(Date2 = ymd(Date2),
Date1 = ymd_hms(Date1)) %>%
full_join(date_range, by = c("Date1", "T1")) %>% # join date ranges to table
arrange(Date1) %>% # sort by date
mutate(Month = paste(month(Date1, label = TRUE), year(Date1), sep = "-"),
row_number = row_number(), # create row_numbers to keep up order
Val1 = coalesce(Val1, 0L)) %>% # replace NA with 0 in Val1
filter(T1 == "A") %>%
group_by(Month) %>%
summarise("# of A" = sum(!is.na(Date2)), # counts only rows that have a Date2
"sum of A" = sum(Val1, na.rm = TRUE),
"Avg Time of A" = round(mean(difftime(Date2, Date1),
na.rm = TRUE), # compute avg time
2),
row_number = min(row_number)) %>%
# Divide the sum by the count of rows that have a Date2, so zero-filled rows
# no longer inflate the denominator.
# NOTE(review): a month with "# of A" == 0 yields 0 / 0 = NaN here -- confirm
# that is acceptable.
mutate("Mean of A" = get('sum of A') / get('# of A') )
注意:我还建议使用不带空格的列名(例如 `sum_of_A`),以便于访问。由于这里的列名包含空格,我在 `mutate` 中使用了 `get('sum of A')` 来访问该列;也可以像问题中的代码那样用反引号引用列名。