简化循环遍历列的多个rowSums

Question

我目前正在尝试使用前一个的总和来创建DF多列。想象一下，我得到了这样的DF：

df=    
   sep-2016  oct-2016    nov-2016  dec-2016   jan-2017
1  70        153            NA        28        19
2  57         68            73       118        16
3  29         NA            19        32        36
4 177         36             3        54        53

我想在末尾添加若干列，每一列是截至所报告月份（含该月）的逐行累计和：例如，对于10月，得到sep与oct之和；对于11月，得到sep、oct与nov之和。最终结果类似这样：

 df=    
     sep-2016  oct-2016    nov-2016  dec-2016   jan-2017 status-Oct2016 status-Nov 2016
    1  70        153            NA        28        19      223       223
    2  57         68            73       118        16      125       198
    3  29         NA            19        32        36       29        48
    4 177         36             3        54        53      213       216

我想知道有没有一种高效的方法，而不必手写很多个rowSums()；如果还能在每次迭代中自动生成对应月份的列名，那就更好了！

谢谢！

Answer 1

我们可以使用lapply循环遍历列以应用rowSums。

# Build one cumulative column per month, starting at the second column:
# the k-th call returns the row-wise total of columns 1..k.
dat2 <- as.data.frame(lapply(2:ncol(dat), function(i) {
  # List-style indexing (dat[cols]) always yields a data frame, so rowSums()
  # never receives a bare vector. na.rm = TRUE makes NA cells count as 0.
  rowSums(dat[seq_len(i)], na.rm = TRUE)
}))

# Take the labels from the name vector itself. names(dat[, -1]) fails when
# `dat` has exactly two columns: dat[, -1] then drops to an unnamed vector
# and the label collapses to "status-".
names(dat2) <- paste0("status-", names(dat)[-1])

# Append the running totals to the right of the original columns.
dat3 <- cbind(dat, dat2)

dat3
#   sep-2016 oct-2016 nov-2016 dec-2016 jan-2017 status-oct-2016 status-nov-2016 status-dec-2016 status-jan-2017
# 1       70      153       NA       28       19             223             223             251             270
# 2       57       68       73      118       16             125             198             316             332
# 3       29       NA       19       32       36              29              48              80             116
# 4      177       36        3       54       53             213             216             270             323

数据

# check.names = FALSE keeps the quoted headers ("sep-2016", ...) verbatim
# instead of converting them to syntactic names, so no renaming step is
# needed afterwards. The header line has one field fewer than the data rows,
# so read.table() uses the first field of every row as the row name.
dat <- read.table(
  header = TRUE,
  check.names = FALSE,
  stringsAsFactors = FALSE,
  text = "   'sep-2016'  'oct-2016'    'nov-2016'  'dec-2016'   'jan-2017'
1  70        153            NA        28        19
                  2  57         68            73       118        16
                  3  29         NA            19        32        36
                  4 177         36             3        54        53"
)

Answer 2

老实说，我不知道为什么你会想要这种格式的数据，但这是一个完成它的tidyverse方法。它涉及将数据转换为整齐的格式，然后再将其传播回宽格式。需要注意的关键是，在整齐的格式中，month是单列中的变量而不是分布在多个列中，您只需使用group_by(rowid)和cumsum来计算所需的所有值。最后几行是构建status-列名称并将数据传播回宽格式。

library(tidyverse)
# Parse the whitespace-separated sample table into a tibble.
# NOTE(review): read_table2() is reported as deprecated in readr >= 2.0 in
# favour of read_table() -- confirm against the installed readr version.
df <- read_table2(
  "sep-2016  oct-2016    nov-2016  dec-2016   jan-2017
  70        153            NA        28        19
  57         68            73       118        16
  29         NA            19        32        36
 177         36             3        54        53"
)

# Pipeline: wide -> long -> running total per original row -> wide again.
df %>%
  # Add a `rowid` column so each value can be traced back to its source row.
  rowid_to_column() %>%
  # Stack the month columns: one row per (rowid, month) with the cell in `value`.
  gather("month", "value", -rowid) %>%
  # Put the rows of one rowid next to each other (month order within a rowid
  # is the order in which gather() stacked the columns).
  arrange(rowid) %>%
  group_by(rowid) %>%
  mutate(
    # Missing months count as 0. This overwrites `value` itself, so the NA
    # cells of the original month columns also show up as 0 in the result.
    value = replace_na(value, 0),
    # Running total within each rowid group.
    status = cumsum(value)
    ) %>%
  # Stack the raw value and its running total into a single `number` column;
  # `vartype` records which of the two each row holds.
  gather("vartype", "number", value, status) %>%
  # Target column name: the month itself for raw values, "status-<month>" for
  # the running totals.
  mutate(colname = ifelse(vartype == "value", month, str_c("status-", month))) %>%
  select(rowid, number, colname) %>%
  # Back to wide format: one column per `colname`, one row per rowid.
  spread(colname, number)
#> # A tibble: 4 x 11
#> # Groups:   rowid [4]
#>   rowid `dec-2016` `jan-2017` `nov-2016` `oct-2016` `sep-2016`
#>   <int>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
#> 1     1       28.0       19.0       0         153         70.0
#> 2     2      118         16.0      73.0        68.0       57.0
#> 3     3       32.0       36.0      19.0         0         29.0
#> 4     4       54.0       53.0       3.00       36.0      177  
#> # ... with 5 more variables: `status-dec-2016` <dbl>,
#> #   `status-jan-2017` <dbl>, `status-nov-2016` <dbl>,
#> #   `status-oct-2016` <dbl>, `status-sep-2016` <dbl>

由reprex package创建于2018-02-16（v0.2.0）。

Answer 3

干净的方法是以长格式转换数据。

library(tibble)
library(tidyr)
library(dplyr)

# The same five monthly columns, declared column by column instead of row by
# row. All values are doubles; NA marks a missing month.
your_data <- tibble(
  sep_2016 = c(70, 57, 29, 177),
  oct_2016 = c(153, 68, NA, 36),
  nov_2016 = c(NA, 73, 19, 3),
  dec_2016 = c(28, 118, 32, 54),
  jan_2017 = c(19, 16, 36, 53)
)

您可以使用tidyr包中的gather更改data.frame的格式。

# Tag every row with its position first, so each observation can still be
# traced back to its original row once the table has been stacked.
your_data_long <- rowid_to_column(your_data)

# Stack the month columns: one row per (rowid, month_year) pair, with the
# cell contents in `the_value`.
your_data_long <- gather(
  your_data_long,
  key = month_year, value = the_value, -rowid
)

head(your_data_long)
#> # A tibble: 6 x 3
#>   rowid month_year the_value
#>   <int>      <chr>     <dbl>
#> 1     1   sep_2016        70
#> 2     2   sep_2016        57
#> 3     3   sep_2016        29
#> 4     4   sep_2016       177
#> 5     1   oct_2016       153
#> 6     2   oct_2016        68

一旦您的data.frame采用长格式，您就可以使用cumsum以及dplyr的函数mutate和group_by计算累积和。

# Running total per original row. cumsum() propagates NA, so a single missing
# month would turn every later total of that row into NA; missing months are
# therefore counted as 0 inside the sum. `the_value` itself is left untouched.
result <- your_data_long %>%
  group_by(rowid) %>%
  mutate(cumulative_value = cumsum(replace_na(the_value, 0)))

result
#> # A tibble: 20 x 4
#> # Groups:   rowid [4]
#>    rowid month_year the_value cumulative_value
#>    <int>      <chr>     <dbl>            <dbl>
#>  1     1   sep_2016        70               70
#>  2     2   sep_2016        57               57
#>  3     3   sep_2016        29               29
#>  4     4   sep_2016       177              177
#>  5     1   oct_2016       153              223
#>  6     2   oct_2016        68              125
#>  7     3   oct_2016        NA               29
#>  8     4   oct_2016        36              213
#>  9     1   nov_2016        NA              223
#> 10     2   nov_2016        73              198
#> 11     3   nov_2016        19               48
#> 12     4   nov_2016         3              216
#> 13     1   dec_2016        28              251
#> 14     2   dec_2016       118              316
#> 15     3   dec_2016        32               80
#> 16     4   dec_2016        54              270
#> 17     1   jan_2017        19              270
#> 18     2   jan_2017        16              332
#> 19     3   jan_2017        36              116
#> 20     4   jan_2017        53              323

如果要恢复成最初的宽格式，可以使用spread进行转换。

Answer 4

我的首选解决方案是：

# Needs the matrixStats package; it is called through its namespace below, so
# attaching it with library() is optional.

# Plain matrix copy of the data with missing cells counted as 0.
DF <- as.matrix(df)
DF <- replace(DF, is.na(DF), 0)

# Cumulative sum along each row, with columns labelled after their source
# columns; any row names on the result are kept as they are.
RES <- matrixStats::rowCumsums(DF)
dimnames(RES) <- list(rownames(RES), paste0("status-", colnames(DF)))

# Original columns followed by the running totals
# (this is what cbind.data.frame() does internally).
data.frame(df, RES, check.names = FALSE)

这与您正在寻找的rowSums最接近。

Answer 5

一种选择可能是使用来自tidyverse的gather和spread函数。

注意：第一个月份也会添加状态列；状态列没有按时间顺序排列，但值是正确的。

方法是：

# Data
# read.table()'s default check.names = TRUE turns "sep-2016" into "sep.2016",
# which is why the dates below are parsed with "." as the separator.
df <- read.table(text = "sep-2016  oct-2016    nov-2016  dec-2016   jan-2017
70        153            NA        28        19
57         68            73       118        16
29         NA            19        32        36
177         36             3        54        53", header = TRUE, stringsAsFactors = FALSE)


library(tidyverse)

# Add a row number `sl` to use as the join key later on
df <- df %>% mutate(sl = row_number())

# Calculate the cumulative sum after gathering and arranging by date
mod_df <- df %>%
  gather(key, value, -sl) %>%
  # Turn "sep.2016" into a real Date (first of the month) so that arrange()
  # sorts chronologically rather than alphabetically.
  # NOTE(review): %b matches month abbreviations of the current locale --
  # this presumably needs an English locale; confirm.
  mutate(key = as.Date(paste("01", key, sep = "."), format = "%d.%b.%Y")) %>%
  arrange(sl, key) %>%
  group_by(sl) %>%
  # Missing months count as 0 in the running total.
  mutate(status = cumsum(ifelse(is.na(value), 0L, value))) %>%
  select(-value) %>%
  # format() is the documented way to render a Date with a format string;
  # it replaces as.character(<Date>, format = ...).
  mutate(key = paste("status", format(key, format = "%b.%Y"))) %>%
  # One "status <Mon>.<Year>" column per month; spread() orders the new
  # columns alphabetically, not chronologically.
  spread(key, status)

# Finally join cumulative calculated sum columns with original df and then
# remove sl column
inner_join(df, mod_df, by = "sl") %>% select(-sl)

#  sep.2016 oct.2016 nov.2016 dec.2016 jan.2017 status Dec.2016 status Jan.2017 status Nov.2016 status Oct.2016 status Sep.2016
#1       70      153       NA       28       19             251             270             223             223              70
#2       57       68       73      118       16             316             332             198             125              57
#3       29       NA       19       32       36              80             116              48              29              29
#4      177       36        3       54       53             270             323             216             213             177

Answer 6

另一个基本解决方案，我们构建一个累积行和的矩阵：

# Row-wise running totals with NA counted as 0. apply() over rows returns one
# column per input row, hence the transpose back to the original orientation.
status <- as.data.frame(t(apply(dat, 1, function(row) {
  cumsum(replace(row, is.na(row), 0))
})))

# One "status-<month>" label per source column.
names(status) <- paste0("status-", names(dat))

status
#   status-sep-2016 status-oct-2016 status-nov-2016 status-dec-2016 status-jan-2017
# 1              70             223             223             251             270
# 2              57             125             198             316             332
# 3              29              29              48              80             116
# 4             177             213             216             270             323

然后根据需要将其绑定到原始数据：

# Skip the first status column (it merely repeats the first month) and append
# the rest to the original data.
cbind(dat, status[, -1, drop = FALSE])

简化循环遍历列的多个rowSums

问题描述投票：2回答：6

6个回答

最新问题

简化循环遍历列的多个rowSums

问题描述 投票：2回答：6

6个回答

最新问题

问题描述投票：2回答：6