根据特定条件将新列中不同年份的变量差异归因于新列

问题描述 投票:1回答:3

我有一个名为df的数据框,如下所示:

country     year    taxrevenue  VAT  CIT
Angola      2006    1000        750  250
Albania     1998    1000        750  250
Albania     2002    1500        1125 375
Albania     2005    1200         900 300
Argentina   1984    1900        1425 475
Argentina   1991    1400        1050 350
Argentina   1995    1600        1200 400

我想要做的是为每个(税)列创建一个额外的列,该列保存一年中的税与下一税的差额,条件是它是同一个国家/地区,如下所示:

country     year    dtaxrevenue dvat    dcit
Angola      2006      NA          NA      NA
Albania     1998      NA          NA      NA
Albania     2002     500         375     125
Albania     2005    -300        -225     -75
Argentina   1984      NA         525     175
Argentina   1991    -500        -375    -125
Argentina   1995     200         150      50

我记得以下几点:

#for each column in df
for(i in 1:ncol(TRcomplete)){
#add a new column
TRcomplete <- cbind(TRcomplete[,i])
# When condition holds..
if(TRcomplete$year[,i]- TRcomplete$year[,i+1] < 0) & TRcomplete$country[,i] == TRcomplete$country[,i+1]{
# try to subtract two values and put them in a new column
try(TRcomplete[,i+1] <- TRcomplete[,i]- TRcomplete[,i+1]
} else {
TRcomplete[,i+1]<-NA
}

我很困惑如何引用每一列。任何人都可以提供帮助吗?

r for-loop try-catch rstudio
3个回答
1
投票

mutate_at是候选人

library(dplyr)

df %>%
  arrange(country, year) %>%  #sort data
  group_by(country) %>%
  mutate_at(vars(taxrevenue:CIT), funs(d = . - lag(.)))

这使

  country    year taxrevenue   VAT   CIT taxrevenue_d VAT_d CIT_d
1 Albania    1998       1000   750   250           NA    NA    NA
2 Albania    2002       1500  1125   375          500   375   125
3 Albania    2005       1200   900   300         -300  -225  - 75
4 Angola     2006       1000   750   250           NA    NA    NA
5 Argentina  1984       1900  1425   475           NA    NA    NA
6 Argentina  1991       1400  1050   350         -500  -375  -125
7 Argentina  1995       1600  1200   400          200   150    50

样本数据:

df <- structure(list(country = c("Angola", "Albania", "Albania", "Albania", 
"Argentina", "Argentina", "Argentina"), year = c(2006L, 1998L, 
2002L, 2005L, 1984L, 1991L, 1995L), taxrevenue = c(1000L, 1000L, 
1500L, 1200L, 1900L, 1400L, 1600L), VAT = c(750L, 750L, 1125L, 
900L, 1425L, 1050L, 1200L), CIT = c(250L, 250L, 375L, 300L, 475L, 
350L, 400L)), .Names = c("country", "year", "taxrevenue", "VAT", 
"CIT"), class = "data.frame", row.names = c(NA, -7L))

1
投票

使用dplyr我们可以尝试

library(dplyr)
df %>% group_by(country) %>% 
       mutate(Tax = taxrevenue-lag(taxrevenue))


  # A tibble: 7 x 6
  # Groups:   country [3]
  country    year taxrevenue   VAT   CIT   Tax
  <chr>     <int>      <int> <int> <int> <int>
1 Angola     2006       1000   750   250    NA
2 Albania    1998       1000   750   250    NA
3 Albania    2002       1500  1125   375   500
4 Albania    2005       1200   900   300  -300
5 Argentina  1984       1900  1425   475    NA
6 Argentina  1991       1400  1050   350  -500
7 Argentina  1995       1600  1200   400   200

Update

df %>%
 group_by(country) %>%
 mutate_if(is.numeric, funs(d = . - lag(.))) %>% select(-year_d) %>% 
 rename_at(vars(ends_with("_d")), funs(paste("d", gsub("_d", "", .), sep = "") )) %>% 
 head(n=5)

    # A tibble: 5 x 8
    # Groups:   country [3]
    country    year taxrevenue   VAT   CIT dtaxrevenue  dVAT  dCIT
    <fct>     <int>      <int> <int> <int>       <int> <int> <int>
      1 Angola     2006       1000   750   250          NA    NA    NA
    2 Albania    1998       1000   750   250          NA    NA    NA
    3 Albania    2002       1500  1125   375         500   375   125
    4 Albania    2005       1200   900   300        -300  -225   -75
    5 Argentina  1984       1900  1425   475          NA    NA    NA

1
投票

一个带data.table的版本。

# make data
library(data.table)
dt <- data.table(
    country = c("Angola", "Albania", "Albania", "Albania", "Argentina", "Argentina", "Argentina"),
    year    = c(2006, 1998, 2002, 2005, 1984, 1991, 1995),
    tax     = c(1000, 1000, 1500, 1200, 1900, 1400, 1600),
    vat     = c(750,   750, 1125,  900, 1425, 1050, 1200),
    cit     = c(250,   250,  375,  300,  475,  350,  400)
)

# get lagged taxes
dt <- dt[order(country, year)]
dt[ , lag_tax := shift(tax), by=country]
dt[ , lag_vat := shift(vat), by=country]
dt[ , lag_cit := shift(cit), by=country]

# calculate tax differences
dt[ , dtax := tax - lag_tax]
dt[ , dvat := vat - lag_vat]
dt[ , dcit := cit - lag_cit]

如果您需要为许多列执行此操作,请使用lapply.SD

# get lagged cols
dt[ , paste0(names(dt)[3:5], "_lag") := lapply(.SD, shift), by=country, .SDcols=3:5]

# or go straight to differences
dt[ , paste0(names(dt)[3:5], "_diff") := lapply(.SD, function(x) x - shift(x)), by=country, .SDcols=3:5]
© www.soinside.com 2019 - 2024. All rights reserved.