我想将 2024 年队列中的学生(n=105,女性=61,男性=44)分配到 2025 年的组中。我希望实现的分配(按重要性顺序):
鉴于严格遵守标准 1-3 可能无法找到解决方案,因此在组大小和随机性方面存在一定的灵活性。
2024年组的规模和性别平衡相对不平衡:
table(df[, c("Gender", "g2024")])
# g2024
# Gender 1 2 3 4 5 6 7 8
# Female 7 7 7 7 9 7 9 8
# Male 4 4 8 6 5 7 4 6
table(df[, "g2024"])
# g2024
# 1 2 3 4 5 6 7 8
# 11 11 15 13 14 14 13 14
使用预定值,这是我想要实现的目标的基本示例:
# Create random vector with group ids for females per group, assign to g2025
set.seed(42)
grp_fem <- as.character(rep(1:8, sample(c(rep(7, 3), rep(8, 5)), 8)))
df$g2025 <- unlist(lapply(1:nrow(df), function(i) {
if (df$Gender[i] == "Female") {
x <- sample(grp_fem, 1)
grp_fem <<- grp_fem[-match(x, grp_fem)]
return(x)
} else {
return(NA)
}
}))
# Get males per group, change one group length so y sums to male count
x <- as.integer(table(df$g2025))
y <- 13 - x
z <- sample(which(y == 5), 1)
y[z] <- 6
# Create vector with group ids for males per group, assign to g2025
grp_mal <- as.character(rep(1:8, rep(y)))
df$g2025 <- unlist(lapply(1:nrow(df), function(i) {
if (df$Gender[i] == "Male") {
x <- sample(grp_mal, 1)
grp_mal <<- grp_mal[-match(x, grp_mal)]
return(x)
} else {
return(df$g2025[i])
}
}))
# Gender distribution per group
table(df[, c("Gender", "g2025")])
# g2025
# Gender 1 2 3 4 5 6 7 8
# Female 7 8 8 8 7 8 7 8
# Male 6 5 5 5 6 6 6 5
# Number of students carried from g2024 to g2025 groups
table(df[, c("g2024", "g2025")])
# g2025
# g2024 1 2 3 4 5 6 7 8
# 1 0 2 1 2 1 2 3 0
# 2 1 3 2 2 0 0 1 2
# 3 2 2 1 2 3 2 1 2
# 4 1 1 2 1 2 2 2 2
# 5 2 1 4 0 2 3 1 1
# 6 5 3 1 2 0 0 2 1
# 7 1 1 0 1 2 2 1 5
# 8 1 0 2 3 3 3 2 0
预定的小组规模和性别分布值显然并不理想,并且它没有考虑到给定的 2024 年小组中有多少学生最终属于同一 2025 年小组。
我尝试使用一个矩阵,其组间分配基本上是随机分布的:
m <- structure(c(1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2,
1, 2, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2,
1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2,
1, 2, 2, 1, 2), dim = c(8L, 8L))
rowSums(m)
# [1] 11 11 15 13 14 14 13 14
colSums(m)
# [1] 13 13 13 13 13 13 13 14
但这也需要预定义的
colSums()
值和一些手动操作。我怀疑lpSolve
包可能能够处理这样的事情,但我无法理解如何使用它。
数据:
df <- structure(list(Student = c("1", "2", "3", "4", "5", "6", "7",
"8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18",
"19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
"30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40",
"41", "42", "43", "44", "45", "46", "47", "48", "49", "50", "51",
"52", "53", "54", "55", "56", "57", "58", "59", "60", "61", "62",
"63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73",
"74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84",
"85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95",
"96", "97", "98", "99", "100", "101", "102", "103", "104", "105"
), Gender = c("Female", "Male", "Male", "Male", "Female", "Female",
"Male", "Female", "Female", "Female", "Male", "Male", "Male",
"Female", "Male", "Female", "Male", "Female", "Female", "Female",
"Male", "Male", "Female", "Female", "Female", "Male", "Male",
"Male", "Female", "Female", "Male", "Female", "Female", "Female",
"Male", "Female", "Female", "Female", "Female", "Male", "Male",
"Female", "Female", "Female", "Female", "Male", "Female", "Male",
"Female", "Female", "Female", "Female", "Male", "Female", "Male",
"Female", "Male", "Male", "Male", "Female", "Female", "Female",
"Female", "Female", "Female", "Male", "Female", "Female", "Male",
"Male", "Female", "Female", "Male", "Female", "Male", "Female",
"Female", "Male", "Female", "Female", "Female", "Male", "Female",
"Male", "Female", "Female", "Male", "Female", "Male", "Male",
"Male", "Male", "Male", "Female", "Male", "Female", "Female",
"Male", "Female", "Male", "Female", "Female", "Male", "Male",
"Female"), g2024 = c("4", "3", "3", "8", "2", "8", "4", "5",
"8", "7", "2", "4", "4", "6", "6", "5", "1", "3", "7", "2", "6",
"8", "2", "8", "1", "5", "8", "3", "3", "1", "5", "5", "1", "3",
"8", "6", "1", "7", "5", "5", "1", "7", "4", "7", "5", "4", "8",
"6", "3", "1", "7", "8", "7", "7", "2", "4", "8", "3", "7", "1",
"6", "3", "6", "8", "2", "3", "3", "5", "7", "2", "2", "2", "8",
"6", "1", "1", "4", "1", "6", "8", "2", "6", "5", "2", "5", "3",
"3", "7", "3", "4", "3", "4", "7", "4", "6", "8", "4", "6", "6",
"6", "5", "4", "5", "5", "7"), g2025 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA)), row.names = c(NA, -105L), class = c("tbl_df",
"tbl", "data.frame"))
我认为
lpSolve
在这里不起作用,因为它需要线性函数,而我认为这必须是非线性的。 来自生产和物流的数据科学的优化包表没有显示任何可以处理非线性、离散优化问题的R包,说实话,那里链接的更全面的列表让我感到害怕。
但是,这让我觉得这是模拟退火的完美应用,因此我实现了维基百科页面上给出的简单模拟退火算法。
anneal <- function(
# initial state
s0,
# energy function
E,
# neighborhood function
neighbor,
# cooling schedule
temperature = function(r) Tmax * r ^ Tpwr,
# transition probability functions
P = function(Eo, En, t) if (En < Eo) 1 else exp(-(En-Eo)/t),
# no. of iterations
kmax = 1000,
# initial temp. and shape of curve; not used if temperature is specified
Tmax = 1, Tpwr = 1
) {
# line-for-line pseudocode from Wikipedia, translated to R
s <- s0
for(k in 1:kmax) {
t <- temperature(1-(k-1)/kmax)
sn <- neighbor(s)
if (P(E(s), E(sn), t) >= runif(1))
s <- sn
}
return(s)
}
比弄清楚如何使用新包更容易(IMO),而且无论如何我们仍然需要定义优化函数的下一部分:
library(dplyr)
# Energy function (i.e. cost function i.e. optimization function)
cost <- function(s) {
df2 <- df
df2$g2025 <- s
# "1. no more than two students from a 2024 group can carry over to the same
# 2025 group"
# Cost: 0 if 2 or fewer students from same group, 1 if 3, increases by square
# beyond 3
cost1 <- sum(
pmax(0, summarize(df2, nsame = max(table(g2024)), .by = g2025)$nsame-2)^2
)
# "2. 2025 groups must be as gender-balanced as possible, so something like 7
# or 8 females and 5 or 6 males per group"
# Cost: Square of difference between group gender proportion and overall
# gender proportion
cost2 <- sum((
summarize(df2, pfem = mean(Gender == "Female"), .by = g2025)$pfem -
mean(df2$Gender == "Female")
)^2)
# return linear combination of cost 1 and cost 2
# weights can be tuned to improve results
return(1 * cost1 + 10 * cost2)
}
# neighborhood function
# swaps to students at random
# preserves equal group sizes
neighbor.swap <- function(s) {
swap <- sample(1:length(s), 2)
s[`[<-`(1:length(s), swap, rev(swap))]
}
# Initial state, satisfies requirement #4 of randomness
s0 <- rep_len(unique(df$g2024), length.out = nrow(df)) |>
sample(nrow(df))
然后我们可以运行模拟退火(通过试错找到参数):
# run simulated annealing
set.seed(8675309)
df$g2025 <- anneal(
s0 = s0,
E = cost,
neighbor = neighbor.swap,
kmax = 2500,
Tmax = 2,
Tpwr = 2
)
看看它是如何做到的:
data.frame(
g2025 = unique(df$g2024) %>% sort,
`Number from same 2024 group` =
summarize(df, nsame = max(table(g2024)), .by = g2025) %>%
arrange(g2025) %>%
.$nsame,
`Number of females` =
summarize(df, nfem = sum(Gender == "Female"), .by = g2025) %>%
arrange(g2025) %>%
.$nfem,
`Number of males` =
summarize(df, nfem = sum(Gender == "Male"), .by = g2025) %>%
arrange(g2025) %>%
.$nfem
)
#> g2025 Group.size No.from.same.group No.of.females No.of.males
#> 1 1 13 2 7 6
#> 2 2 13 2 8 5
#> 3 3 13 2 7 6
#> 4 4 14 2 8 6
#> 5 5 13 2 8 5
#> 6 6 13 2 8 5
#> 7 7 13 2 7 6
#> 8 8 13 2 8 5
从该表中我们可以看到满足标准1-3。从技术上讲,此代码为 #3 提供了比 #1 或 #2 更高的优先级,但由于所有人都满意,因此这些数据并不重要,并且如果确实重要的话,更改它也很容易。至于标准#4,分组是随机的,因为初始猜测是随机的,并且邻域行走是随机的(尽管有引导)。
作为脚注,这是
anneal
的一个版本,带有打印诊断信息的选项:
anneal <- function(
s0,
E,
neighbor,
temperature = function(r) Tmax * r ^ Tpwr,
P = function(Eo, En, t) if (En < Eo) 1 else exp(-(En-Eo)/t),
kmax = 1000,
Tmax = 1,
Tpwr = 1,
verbose = TRUE
) {
s <- s0
if (verbose) Es <- numeric(kmax)
for(k in 1:kmax) {
t <- temperature(1-(k-1)/kmax)
sn <- neighbor(s)
if (verbose) Es[k] <- E(s)
if (verbose) message(
"Iteration: ", k, "\n",
"Temperature: ", t, "\n",
"Energy: ", E(s), "\n"
)
if (P(E(s), E(sn), t) >= runif(1)) s <- sn
}
if (verbose) plot(
x = 1:kmax,
y = Es,
type = "l",
xlab = "Iteration",
ylab = "Energy",
main = "Convergence plot"
)
return(s)
}