我有一个名为df的数据框,如下所示:
country year taxrevenue VAT CIT
Angola 2006 1000 750 250
Albania 1998 1000 750 250
Albania 2002 1500 1125 375
Albania 2005 1200 900 300
Argentina 1984 1900 1425 475
Argentina 1991 1400 1050 350
Argentina 1995 1600 1200 400
我想要做的是为每个(税)列创建一个额外的列,该列保存一年中的税与下一税的差额,条件是它是同一个国家/地区,如下所示:
country year dtaxrevenue dvat dcit
Angola 2006 NA NA NA
Albania 1998 NA NA NA
Albania 2002 500 375 125
Albania 2005 -300 -225 -75
Argentina 1984 NA 525 175
Argentina 1991 -500 -375 -125
Argentina 1995 200 150 50
我记得以下几点:
#for each column in df
for(i in 1:ncol(TRcomplete)){
#add a new column
TRcomplete <- cbind(TRcomplete[,i])
# When condition holds..
if(TRcomplete$year[,i]- TRcomplete$year[,i+1] < 0) & TRcomplete$country[,i] == TRcomplete$country[,i+1]{
# try to subtract two values and put them in a new column
try(TRcomplete[,i+1] <- TRcomplete[,i]- TRcomplete[,i+1]
} else {
TRcomplete[,i+1]<-NA
}
我很困惑如何引用每一列。任何人都可以提供帮助吗?
mutate_at
是候选人
library(dplyr)
df %>%
arrange(country, year) %>% #sort data
group_by(country) %>%
mutate_at(vars(taxrevenue:CIT), funs(d = . - lag(.)))
这使
country year taxrevenue VAT CIT taxrevenue_d VAT_d CIT_d
1 Albania 1998 1000 750 250 NA NA NA
2 Albania 2002 1500 1125 375 500 375 125
3 Albania 2005 1200 900 300 -300 -225 - 75
4 Angola 2006 1000 750 250 NA NA NA
5 Argentina 1984 1900 1425 475 NA NA NA
6 Argentina 1991 1400 1050 350 -500 -375 -125
7 Argentina 1995 1600 1200 400 200 150 50
样本数据:
df <- structure(list(country = c("Angola", "Albania", "Albania", "Albania",
"Argentina", "Argentina", "Argentina"), year = c(2006L, 1998L,
2002L, 2005L, 1984L, 1991L, 1995L), taxrevenue = c(1000L, 1000L,
1500L, 1200L, 1900L, 1400L, 1600L), VAT = c(750L, 750L, 1125L,
900L, 1425L, 1050L, 1200L), CIT = c(250L, 250L, 375L, 300L, 475L,
350L, 400L)), .Names = c("country", "year", "taxrevenue", "VAT",
"CIT"), class = "data.frame", row.names = c(NA, -7L))
使用dplyr
我们可以尝试
library(dplyr)
df %>% group_by(country) %>%
mutate(Tax = taxrevenue-lag(taxrevenue))
# A tibble: 7 x 6
# Groups: country [3]
country year taxrevenue VAT CIT Tax
<chr> <int> <int> <int> <int> <int>
1 Angola 2006 1000 750 250 NA
2 Albania 1998 1000 750 250 NA
3 Albania 2002 1500 1125 375 500
4 Albania 2005 1200 900 300 -300
5 Argentina 1984 1900 1425 475 NA
6 Argentina 1991 1400 1050 350 -500
7 Argentina 1995 1600 1200 400 200
df %>%
group_by(country) %>%
mutate_if(is.numeric, funs(d = . - lag(.))) %>% select(-year_d) %>%
rename_at(vars(ends_with("_d")), funs(paste("d", gsub("_d", "", .), sep = "") )) %>%
head(n=5)
# A tibble: 5 x 8
# Groups: country [3]
country year taxrevenue VAT CIT dtaxrevenue dVAT dCIT
<fct> <int> <int> <int> <int> <int> <int> <int>
1 Angola 2006 1000 750 250 NA NA NA
2 Albania 1998 1000 750 250 NA NA NA
3 Albania 2002 1500 1125 375 500 375 125
4 Albania 2005 1200 900 300 -300 -225 -75
5 Argentina 1984 1900 1425 475 NA NA NA
一个带data.table的版本。
# make data
library(data.table)
dt <- data.table(
country = c("Angola", "Albania", "Albania", "Albania", "Argentina", "Argentina", "Argentina"),
year = c(2006, 1998, 2002, 2005, 1984, 1991, 1995),
tax = c(1000, 1000, 1500, 1200, 1900, 1400, 1600),
vat = c(750, 750, 1125, 900, 1425, 1050, 1200),
cit = c(250, 250, 375, 300, 475, 350, 400)
)
# get lagged taxes
dt <- dt[order(country, year)]
dt[ , lag_tax := shift(tax), by=country]
dt[ , lag_vat := shift(vat), by=country]
dt[ , lag_cit := shift(cit), by=country]
# calculate tax differences
dt[ , dtax := tax - lag_tax]
dt[ , dvat := vat - lag_vat]
dt[ , dcit := cit - lag_cit]
如果您需要为许多列执行此操作,请使用lapply
和.SD
:
# get lagged cols
dt[ , paste0(names(dt)[3:5], "_lag") := lapply(.SD, shift), by=country, .SDcols=3:5]
# or go straight to differences
dt[ , paste0(names(dt)[3:5], "_diff") := lapply(.SD, function(x) x - shift(x)), by=country, .SDcols=3:5]