分割图中的ggplot位置为“堆栈”时,带有条形尺寸和总计的交易

问题描述 投票:0回答:1

与geom_bar或geom_col一样使用“堆栈”样式(而不是“躲避”样式时,总计会受到损害使用对数刻度。当其中一个值明显比其他值更频繁时,我设法以一种简单的方式(分隔图)表示正确的总计,请参阅解决方法(而非日志)。但是,对于其他情况和对数刻度,总的问题仍然存在。我想要一个通用的自动化解决方案。

编辑:读取ggplot scale_y_log10() issue后,我发现使用日志没有任何意义。因此,此问题的答案应该是如何推广拆分方法=解决方法-不仅针对一个频繁的小组-。

mydf2<-data.frame(date=c(rep("2020-02-01",25),rep("2020-02-01",25),rep("2020-02-02",35),rep("2020-02-02",40) ),
                  value= c(rep(LETTERS[1],39),rep(LETTERS[1:3],4),rep(LETTERS[1],39),rep(LETTERS[2],35) ) , stringsAsFactors = FALSE)

dateValueCount<-setDT(mydf2)[, .N, by=.(date, value)]
dateValueCount
#          date value  N
# 1: 2020-02-01     A 43
# 2: 2020-02-01     B  4
# 3: 2020-02-01     C  3
# 4: 2020-02-02     C  1
# 5: 2020-02-02     A 39
# 6: 2020-02-02     B 35

library(scales)
prevalent1<-ggplot(mydf2, aes(date, fill = value)) + 
  geom_bar() + scale_y_continuous(breaks= breaks_pretty())

prevalent1log<-ggplot(mydf2, aes(date, fill = value)) + 
  geom_bar() +  scale_y_continuous(trans='log2', breaks = log_breaks(7), 
                                   labels= label_number_auto()
  )
# total Problem, real totals are 50 and 75
{
  require(grid)
  grid.newpage()
  pushViewport(viewport(layout = grid.layout(1, 2)))
  pushViewport(viewport(layout.pos.col = 1, layout.pos.row = 1))
  print(prevalent1,newpage=F) 
  popViewport()
  pushViewport(viewport(layout.pos.col = 2, layout.pos.row = 1))
  print( prevalent1log, newpage = F )
}

enter image description here

解决方法(仅适用于一个普遍值)。

答案应解决第二个日期,所有可能的情况都超过阈值

mydf3<-mydf2[which(mydf2$date=="2020-02-01")]

dateValueCount3<-dateValueCount[which(dateValueCount$date=="2020-02-01"),]
# get the most frequent per group

mydf3Max<-dateValueCount3[, .SD[  N== max(N) ]  , by=date]  
mydf3Max

#          date value  N
# 1: 2020-02-01     A 43


# totals per group
dateCount<-mydf3[, .N, by=.(date)]
dateCount
#          date  N
# 1: 2020-02-01 50

# transfer column to previous table
mydf3Max$totalDay <- dateCount$N[match(mydf3Max$date, dateCount$date)]

threshold <- 10 # splitting threshold

# remove groups with total lower than threshold
mydf3Max<-mydf3Max[which(mydf3Max$totalDay>threshold),]

# the final height of A will be dependent on the values of B and C
mydf3Max$diff<-mydf3Max$totalDay-mydf3Max$N

# shrinkFactor for the upper part of the plot which begins in threshold
shrinkFactor<-.05

# part of our frequent value (A) count must not be shrinked
mydf3Max$notshrink <- threshold - mydf3Max$diff

# part of A data (> threshold) must be shrinked
mydf3Max$NToShrink<-mydf3Max$N-mydf3Max$notshrink

mydf3Max$NToShrinkShrinked<-mydf3Max$NToShrink*shrinkFactor

# now sum the not-shrinked part with the shrinked part to obtain the transformed height
mydf3Max$NToShrinkShrinkedPlusBase<-mydf3Max$NToShrinkShrinked+mydf3Max$notshrink

# transformation function  - works for "dodge" position
# https://stackoverflow.com/questions/44694496/y-break-with-scale-change-in-r
trans <- function(x){pmin(x,threshold) + shrinkFactor*pmax(x-threshold,0)}
# dateValueCount3$transN <- trans(dateValueCount3$N)

setDF(dateValueCount3)
setDF(mydf3Max)

# pass transformed column to original d.f.
dateValueCount3$N2 <- mydf3Max$NToShrinkShrinkedPlusBase[match(interaction( dateValueCount3[c("value","date")]) ,
                                                              interaction( mydf3Max[c("value","date") ] )  )]

# substitute real N with transformed values
dateValueCount3[which(!is.na(dateValueCount3$N2)),]$N <- dateValueCount3[which(!is.na(dateValueCount3$N2)),]$N2

yticks <- c(0, 2,4,6,8,10,20,30,40,50)

ggplot(data=dateValueCount3, aes(date, N, group=value, fill=value)) + #group=longName
  geom_col(position="stack") +
  geom_rect(aes(xmin=0, xmax=3, ymin=threshold, ymax=threshold+.1), fill="white") +
  scale_y_continuous(breaks = trans(yticks), labels= yticks)

enter image description here

r ggplot2 bar-chart
1个回答
0
投票
mydf2<-data.frame(date=c(rep("2020-02-01",25),rep("2020-02-01",25),rep("2020-02-02",35),rep("2020-02-02",40) ), value= c(rep(LETTERS[1],27),rep(LETTERS[1:3],8),rep(LETTERS[1],35),rep(LETTERS[2],39) ) , stringsAsFactors = FALSE) { summaryDT<-setDT(mydf2)[, .N, by=.(date, value)] # summaryDT <- summaryDT[order(summaryDT$N, decreasing = TRUE),] # for highest bars at top # summaryDT$NFac<-factor(summaryDT$N, levels = unique(summaryDT$N) ) #for highest bars at top # sort categories in the inverse order of labels summaryDT$value<-factor(summaryDT$value, levels=unique(summaryDT$value) ) summaryDT<- summaryDT[order(summaryDT$date,-summaryDT$value)] # accum. per date # summaryDT<-summaryDT[order(date, N), .SD,by=.(date)] # for highest bars at top summaryDT[, acc_sum := cumsum(N ) , by= date] threshold<-20 # problematic days, over thres. dVLtoTransfo <- summaryDT[which(summaryDT$acc_sum>threshold),] # accum. down per day - thres dVLtoTransfo$toShrink <- dVLtoTransfo$acc_sum-threshold # correct portion to shrink dVLtoTransfo$toShrink <- ifelse(dVLtoTransfo$toShrink>dVLtoTransfo$N,dVLtoTransfo$N,dVLtoTransfo$toShrink) # not to shrink portion dVLtoTransfo$notToShrink<- dVLtoTransfo$N-dVLtoTransfo$toShrink # shrinkFactor for the upper part of the plot which begins in threshold shrinkFactor<-.04 dVLtoTransfo$NToShrinkShrinked<-dVLtoTransfo$toShrink*shrinkFactor # now sum the not-shrinked part with the shrinked part to obtain the transformed height dVLtoTransfo$NToShrinkShrinkedPlusBase<-dVLtoTransfo$NToShrinkShrinked+dVLtoTransfo$notToShrink # transformation function - works for "dodge" position # https://stackoverflow.com/questions/44694496/y-break-with-scale-change-in-r trans <- function(x){pmin(x,threshold) + shrinkFactor*pmax(x-threshold,0)} # summaryDT$transN <- trans(summaryDT$N) setDF(summaryDT) setDF(dVLtoTransfo) # class(mydfAll) # pass transformed column to original d.f. summaryDT$N2 <- dVLtoTransfo$NToShrinkShrinkedPlusBase[match(interaction( summaryDT[c("value","date")]) , interaction( dVLtoTransfo[c("value","date") ] ) )] # substitute real N with transformed values summaryDT$NOld<-summaryDT$N summaryDT[which(!is.na(summaryDT$N2)),]$N <- summaryDT[which(!is.na(summaryDT$N2)),]$N2 yticks <- c(0,4,8,12,16,20,40,60,80) } ggplot(data=summaryDT, aes(date, N, group=value, fill=value)) + # order by label order # ggplot(data=summaryDT, aes(date, N, group=NFac, fill=value)) + # order by highest frequency geom_col(position="stack") + geom_rect(aes(xmin=0, xmax=3, ymin=threshold, ymax=threshold+.1), fill="white") + scale_y_continuous(breaks = trans(yticks), labels= yticks)

enter image description here

© www.soinside.com 2019 - 2024. All rights reserved.