与geom_bar或geom_col一样使用“堆栈”样式(而不是“躲避”样式时,总计会受到损害使用对数刻度。当其中一个值明显比其他值更频繁时,我设法以一种简单的方式(分隔图)表示正确的总计,请参阅解决方法(而非日志)。但是,对于其他情况和对数刻度,总的问题仍然存在。我想要一个通用的自动化解决方案。
编辑:读取ggplot scale_y_log10() issue后,我发现使用日志没有任何意义。因此,此问题的答案应该是如何推广拆分方法=解决方法-不仅针对一个频繁的小组-。
mydf2<-data.frame(date=c(rep("2020-02-01",25),rep("2020-02-01",25),rep("2020-02-02",35),rep("2020-02-02",40) ),
value= c(rep(LETTERS[1],39),rep(LETTERS[1:3],4),rep(LETTERS[1],39),rep(LETTERS[2],35) ) , stringsAsFactors = FALSE)
dateValueCount<-setDT(mydf2)[, .N, by=.(date, value)]
dateValueCount
# date value N
# 1: 2020-02-01 A 43
# 2: 2020-02-01 B 4
# 3: 2020-02-01 C 3
# 4: 2020-02-02 C 1
# 5: 2020-02-02 A 39
# 6: 2020-02-02 B 35
library(scales)
prevalent1<-ggplot(mydf2, aes(date, fill = value)) +
geom_bar() + scale_y_continuous(breaks= breaks_pretty())
prevalent1log<-ggplot(mydf2, aes(date, fill = value)) +
geom_bar() + scale_y_continuous(trans='log2', breaks = log_breaks(7),
labels= label_number_auto()
)
# total Problem, real totals are 50 and 75
{
require(grid)
grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 2)))
pushViewport(viewport(layout.pos.col = 1, layout.pos.row = 1))
print(prevalent1,newpage=F)
popViewport()
pushViewport(viewport(layout.pos.col = 2, layout.pos.row = 1))
print( prevalent1log, newpage = F )
}
答案应解决第二个日期,所有可能的情况都超过阈值
mydf3<-mydf2[which(mydf2$date=="2020-02-01")]
dateValueCount3<-dateValueCount[which(dateValueCount$date=="2020-02-01"),]
# get the most frequent per group
mydf3Max<-dateValueCount3[, .SD[ N== max(N) ] , by=date]
mydf3Max
# date value N
# 1: 2020-02-01 A 43
# totals per group
dateCount<-mydf3[, .N, by=.(date)]
dateCount
# date N
# 1: 2020-02-01 50
# transfer column to previous table
mydf3Max$totalDay <- dateCount$N[match(mydf3Max$date, dateCount$date)]
threshold <- 10 # splitting threshold
# remove groups with total lower than threshold
mydf3Max<-mydf3Max[which(mydf3Max$totalDay>threshold),]
# the final height of A will be dependent on the values of B and C
mydf3Max$diff<-mydf3Max$totalDay-mydf3Max$N
# shrinkFactor for the upper part of the plot which begins in threshold
shrinkFactor<-.05
# part of our frequent value (A) count must not be shrinked
mydf3Max$notshrink <- threshold - mydf3Max$diff
# part of A data (> threshold) must be shrinked
mydf3Max$NToShrink<-mydf3Max$N-mydf3Max$notshrink
mydf3Max$NToShrinkShrinked<-mydf3Max$NToShrink*shrinkFactor
# now sum the not-shrinked part with the shrinked part to obtain the transformed height
mydf3Max$NToShrinkShrinkedPlusBase<-mydf3Max$NToShrinkShrinked+mydf3Max$notshrink
# transformation function - works for "dodge" position
# https://stackoverflow.com/questions/44694496/y-break-with-scale-change-in-r
trans <- function(x){pmin(x,threshold) + shrinkFactor*pmax(x-threshold,0)}
# dateValueCount3$transN <- trans(dateValueCount3$N)
setDF(dateValueCount3)
setDF(mydf3Max)
# pass transformed column to original d.f.
dateValueCount3$N2 <- mydf3Max$NToShrinkShrinkedPlusBase[match(interaction( dateValueCount3[c("value","date")]) ,
interaction( mydf3Max[c("value","date") ] ) )]
# substitute real N with transformed values
dateValueCount3[which(!is.na(dateValueCount3$N2)),]$N <- dateValueCount3[which(!is.na(dateValueCount3$N2)),]$N2
yticks <- c(0, 2,4,6,8,10,20,30,40,50)
ggplot(data=dateValueCount3, aes(date, N, group=value, fill=value)) + #group=longName
geom_col(position="stack") +
geom_rect(aes(xmin=0, xmax=3, ymin=threshold, ymax=threshold+.1), fill="white") +
scale_y_continuous(breaks = trans(yticks), labels= yticks)
mydf2<-data.frame(date=c(rep("2020-02-01",25),rep("2020-02-01",25),rep("2020-02-02",35),rep("2020-02-02",40) ),
value= c(rep(LETTERS[1],27),rep(LETTERS[1:3],8),rep(LETTERS[1],35),rep(LETTERS[2],39) ) , stringsAsFactors = FALSE)
{
summaryDT<-setDT(mydf2)[, .N, by=.(date, value)]
# summaryDT <- summaryDT[order(summaryDT$N, decreasing = TRUE),] # for highest bars at top
# summaryDT$NFac<-factor(summaryDT$N, levels = unique(summaryDT$N) ) #for highest bars at top
# sort categories in the inverse order of labels
summaryDT$value<-factor(summaryDT$value, levels=unique(summaryDT$value) )
summaryDT<- summaryDT[order(summaryDT$date,-summaryDT$value)]
# accum. per date
# summaryDT<-summaryDT[order(date, N), .SD,by=.(date)] # for highest bars at top
summaryDT[, acc_sum := cumsum(N ) , by= date]
threshold<-20
# problematic days, over thres.
dVLtoTransfo <- summaryDT[which(summaryDT$acc_sum>threshold),]
# accum. down per day - thres
dVLtoTransfo$toShrink <- dVLtoTransfo$acc_sum-threshold
# correct portion to shrink
dVLtoTransfo$toShrink <- ifelse(dVLtoTransfo$toShrink>dVLtoTransfo$N,dVLtoTransfo$N,dVLtoTransfo$toShrink)
# not to shrink portion
dVLtoTransfo$notToShrink<- dVLtoTransfo$N-dVLtoTransfo$toShrink
# shrinkFactor for the upper part of the plot which begins in threshold
shrinkFactor<-.04
dVLtoTransfo$NToShrinkShrinked<-dVLtoTransfo$toShrink*shrinkFactor
# now sum the not-shrinked part with the shrinked part to obtain the transformed height
dVLtoTransfo$NToShrinkShrinkedPlusBase<-dVLtoTransfo$NToShrinkShrinked+dVLtoTransfo$notToShrink
# transformation function - works for "dodge" position
# https://stackoverflow.com/questions/44694496/y-break-with-scale-change-in-r
trans <- function(x){pmin(x,threshold) + shrinkFactor*pmax(x-threshold,0)}
# summaryDT$transN <- trans(summaryDT$N)
setDF(summaryDT)
setDF(dVLtoTransfo)
# class(mydfAll)
# pass transformed column to original d.f.
summaryDT$N2 <- dVLtoTransfo$NToShrinkShrinkedPlusBase[match(interaction( summaryDT[c("value","date")]) ,
interaction( dVLtoTransfo[c("value","date") ] ) )]
# substitute real N with transformed values
summaryDT$NOld<-summaryDT$N
summaryDT[which(!is.na(summaryDT$N2)),]$N <- summaryDT[which(!is.na(summaryDT$N2)),]$N2
yticks <- c(0,4,8,12,16,20,40,60,80)
}
ggplot(data=summaryDT, aes(date, N, group=value, fill=value)) + # order by label order
# ggplot(data=summaryDT, aes(date, N, group=NFac, fill=value)) + # order by highest frequency
geom_col(position="stack") +
geom_rect(aes(xmin=0, xmax=3, ymin=threshold, ymax=threshold+.1), fill="white") +
scale_y_continuous(breaks = trans(yticks), labels= yticks)