我正在创建一个用户定义的奈夫贝叶斯函数,在这里我显示准确率、误分类和召回。我还没有对召回进行编码。非常感谢帮助 以下是我的Naive Bayes函数的代码。
naive_bayes <- function(training.dataset, test.dataset){
##read data from file to a data frame
training.table <- trainSparse
test.table <- testSparse
##retrieve class and features from training data
training.class <- training.table[, 200]
training.features <- training.table[,-200]
remove(training.table)
##funciton for calculating priors
calculate.priors <- function(class.vector){
priors <- c()
for (class in unique(class.vector)){
priors <- rbind(priors, c(class, length(class.vector[class.vector==class])/length(class.vector)))
colnames(priors) <- c("classification", "probability")
}
return (priors)
}
priors <- calculate.priors(training.class)
##Learn the features by calculating likelihood
likelihood.list <- list()
#calculate CPD by feature
for (i in 1:dim(training.features)[2]){
feature.values <- training.features[, i]
unique.feature.values <- unique(feature.values)
likelihood.matrix <- matrix(rep(NA), nrow=dim(priors)[1], ncol=length(unique.feature.values))
colnames(likelihood.matrix) <- unique.feature.values
rownames(likelihood.matrix) <- priors[, "classification"]
for (item in unique.feature.values){
likelihood.item <- vector()
for (class in priors[, "classification"]){
feature.value.inclass <- feature.values[training.class==class]
likelihood.value <- length(feature.value.inclass[feature.value.inclass==item])/length(feature.value.inclass)
likelihood.item <- c(likelihood.item, likelihood.value)
}
likelihood.matrix[, item] <- likelihood.item
}
likelihood.list[[i]] <- likelihood.matrix
}
##Predict class for the test dataset
#retrieve the features and target class of the testing dataset
test.features <- test.table[, -200]
test.target.class <- test.table[, 200]
test.predict.class <- rep(NA, length(test.target.class))
remove(test.table)
#calculate posteriors for each test data record
for (i in 1:dim(test.features)[1]){
record <- test.features[i, ]
posterior <- vector()
#calculate posteriors for each possible class of that record
for (class in priors[, "classification"]){
#initialize posterior as the prior value of that class
posterior.value <- as.numeric(priors[priors[, "classification"]==class, 2])
likelihood.v <- c()
for (item in 1:length(record)){
likelihood.value <- likelihood.list[[item]][class, as.character(record[1, item])]
likelihood.v <- c(likelihood.v, likelihood.value)
posterior.value <- as.numeric(posterior.value) * as.numeric(likelihood.value)
}
posterior <- rbind(posterior, c(class, posterior.value))
}
predict.class <- posterior[posterior[,2]==max(as.numeric(posterior[,2])),1]
test.predict.class[i] <- predict.class
}
accuracy <- length(test.predict.class[test.predict.class==test.target.class])/length(test.target.class)
missclassification <- 1 - accuracy
print(paste("Accuracy:", accuracy))
print(paste("\nMissclassification:", missclassification))
}
我不知道为什么我收到了以下错误。
Error in `[<-`(`*tmp*`, , item, value = likelihood.item) :
subscript out of bounds
下面是我的训练数据集的输出:
> dput(head(trainSparse,5))
structure(list(also = c(0, 0, 0, 0, 0), anoth = c(1, 0, 1, 1,
0), anyon = c(0, 0, 0, 0, 0), anyth = c(1, 0, 0, 0, 0), articl = c(0,
0, 0, 0, 0), avail = c(0, 0, 0, 0, 0), back = c(0, 0, 0, 0, 0
), base = c(0, 0, 0, 1, 0), believ = c(2, 0, 0, 0, 0), best = c(0,
0, 0, 0, 0), better = c(0, 0, 0, 0, 0), book = c(0, 0, 0, 0,
0), can = c(4, 0, 0, 0, 0), case = c(1, 0, 0, 0, 0), consid = c(0,
0, 0, 0, 0), day = c(0, 0, 0, 0, 0), differ = c(0, 0, 0, 0, 0
), distribut = c(0, 0, 0, 0, 0), drive = c(0, 0, 0, 0, 0), etc = c(0,
0, 0, 0, 0), even = c(3, 0, 0, 0, 0), exampl = c(0, 0, 0, 2,
0), exist = c(0, 0, 0, 0, 0), file = c(0, 0, 0, 0, 0), first = c(1,
0, 0, 0, 0), follow = c(2, 0, 0, 0, 0), found = c(0, 0, 0, 0,
0), get = c(0, 0, 0, 0, 0), god = c(3, 0, 0, 0, 0), great = c(0,
0, 0, 0, 0), group = c(0, 0, 0, 0, 0), help = c(0, 0, 0, 0, 0
), high = c(0, 0, 0, 0, 0), howev = c(0, 0, 0, 1, 0), idea = c(1,
0, 0, 0, 0), includ = c(0, 0, 0, 0, 0), inform = c(0, 0, 0, 0,
0), john = c(0, 0, 0, 0, 0), keyword = c(0, 0, 0, 0, 0), kind = c(0,
0, 0, 0, 0), know = c(1, 0, 0, 0, 0), like = c(0, 0, 0, 0, 0),
line = c(1, 1, 1, 3, 1), list = c(0, 0, 0, 0, 0), live = c(0,
0, 0, 0, 0), look = c(0, 0, 0, 0, 0), make = c(2, 0, 0, 0,
0), mani = c(0, 0, 0, 0, 0), may = c(0, 0, 0, 0, 0), must = c(0,
0, 0, 1, 0), nation = c(0, 0, 0, 0, 0), new = c(0, 0, 0,
0, 0), number = c(0, 0, 0, 0, 0), old = c(0, 0, 0, 0, 0),
one = c(1, 0, 0, 0, 0), opinion = c(0, 0, 0, 0, 0), organ = c(1,
1, 1, 1, 1), origin = c(0, 0, 0, 0, 0), peopl = c(1, 0, 2,
0, 0), person = c(0, 0, 0, 0, 0), place = c(2, 0, 0, 0, 0
), possibl = c(0, 0, 0, 0, 0), post = c(0, 0, 0, 0, 0), probabl = c(0,
0, 0, 0, 0), quit = c(0, 0, 0, 0, 0), rather = c(0, 0, 0,
0, 0), read = c(0, 0, 0, 0, 0), reason = c(1, 0, 0, 1, 0),
right = c(0, 0, 0, 0, 0), say = c(0, 0, 0, 0, 0), second = c(0,
0, 0, 0, 0), see = c(0, 0, 0, 1, 0), seem = c(0, 0, 0, 0,
0), send = c(0, 0, 0, 0, 0), set = c(0, 0, 0, 0, 0), state = c(0,
0, 0, 1, 0), subject = c(1, 1, 1, 1, 1), sure = c(0, 0, 0,
0, 0), system = c(0, 0, 0, 6, 0), take = c(2, 0, 0, 0, 0),
technolog = c(0, 1, 0, 1, 1), think = c(0, 0, 0, 1, 0), thought = c(0,
0, 0, 0, 0), time = c(1, 0, 0, 0, 0), tri = c(0, 0, 0, 0,
0), true = c(0, 0, 1, 0, 0), univers = c(1, 0, 1, 0, 0),
usa = c(0, 0, 0, 0, 0), use = c(1, 0, 2, 0, 0), version = c(0,
0, 0, 0, 0), way = c(0, 0, 0, 0, 0), well = c(1, 0, 0, 3,
0), will = c(4, 0, 0, 0, 0), without = c(0, 0, 0, 0, 0),
word = c(0, 0, 0, 0, 0), work = c(0, 0, 0, 0, 0), world = c(0,
0, 0, 0, 0), write = c(0, 0, 0, 0, 0), wrote = c(0, 0, 0,
0, 0), abl = c(0, 0, 0, 0, 0), actual = c(0, 0, 0, 0, 0),
allow = c(1, 0, 0, 0, 0), alway = c(0, 0, 0, 0, 0), answer = c(0,
0, 0, 0, 0), around = c(0, 0, 0, 0, 0), ask = c(0, 0, 0,
0, 0), call = c(0, 0, 0, 0, 0), cant = c(1, 0, 0, 1, 0),
care = c(0, 0, 0, 0, 0), caus = c(1, 0, 1, 0, 0), certain = c(0,
0, 0, 1, 0), chang = c(0, 0, 1, 0, 1), claim = c(1, 0, 0,
0, 0), control = c(0, 0, 0, 0, 0), cours = c(1, 0, 0, 0,
0), doesnt = c(0, 0, 0, 0, 0), done = c(0, 0, 1, 0, 0), dont = c(1,
0, 0, 0, 0), either = c(0, 0, 0, 0, 0), els = c(0, 0, 0,
0, 0), end = c(0, 0, 0, 1, 0), enough = c(0, 0, 0, 0, 0),
ever = c(0, 0, 0, 0, 0), everi = c(0, 0, 0, 1, 0), fact = c(1,
0, 0, 0, 0), far = c(0, 0, 0, 0, 0), find = c(0, 0, 0, 0,
0), game = c(0, 0, 0, 0, 0), general = c(0, 0, 0, 0, 0),
give = c(2, 0, 0, 0, 0), good = c(3, 0, 0, 1, 0), govern = c(0,
0, 0, 0, 0), happen = c(1, 0, 0, 0, 0), hard = c(0, 0, 0,
0, 0), hope = c(0, 0, 0, 0, 0), institut = c(0, 1, 0, 1,
1), isnt = c(0, 0, 0, 0, 0), just = c(0, 1, 0, 0, 0), keep = c(0,
0, 0, 0, 0), law = c(0, 0, 0, 0, 0), least = c(0, 0, 0, 0,
0), let = c(1, 0, 0, 0, 0), littl = c(0, 0, 0, 0, 0), long = c(2,
0, 0, 0, 0), made = c(0, 0, 1, 0, 0), mayb = c(0, 0, 0, 0,
0), mean = c(0, 0, 0, 0, 0), messag = c(0, 0, 0, 0, 0), might = c(0,
0, 0, 0, 0), much = c(0, 0, 0, 0, 0), need = c(0, 0, 0, 0,
0), note = c(0, 0, 0, 0, 0), noth = c(1, 0, 0, 0, 0), now = c(0,
0, 0, 0, 1), order = c(0, 0, 0, 0, 0), other = c(0, 0, 2,
0, 0), part = c(0, 0, 0, 0, 0), play = c(0, 0, 0, 0, 0),
pleas = c(0, 0, 0, 0, 0), point = c(0, 0, 0, 0, 0), power = c(1,
0, 0, 0, 0), problem = c(0, 0, 0, 0, 0), public = c(0, 0,
0, 0, 0), put = c(0, 0, 0, 0, 0), question = c(0, 0, 0, 0,
0), real = c(0, 0, 0, 0, 0), realli = c(0, 0, 0, 0, 0), requir = c(0,
1, 0, 0, 0), respons = c(0, 0, 0, 0, 0), run = c(0, 0, 0,
0, 0), said = c(1, 0, 0, 0, 0), scienc = c(0, 0, 0, 0, 0),
seen = c(0, 0, 0, 0, 0), sever = c(0, 0, 0, 0, 0), show = c(1,
0, 0, 0, 0), sinc = c(0, 1, 0, 0, 0), someon = c(0, 0, 0,
0, 0), someth = c(1, 0, 0, 0, 0), start = c(0, 0, 0, 0, 0
), still = c(0, 0, 0, 0, 0), support = c(0, 0, 0, 0, 0),
talk = c(0, 0, 0, 0, 1), tell = c(0, 0, 0, 0, 0), thank = c(0,
0, 0, 0, 0), that = c(0, 0, 0, 0, 0), thing = c(0, 0, 0,
0, 0), though = c(0, 0, 0, 0, 0), two = c(0, 0, 0, 0, 0),
want = c(1, 0, 0, 0, 0), year = c(0, 0, 0, 0, 1), yes = c(0,
0, 0, 1, 0), bad = c(0, 0, 2, 0, 0), xnewsread = c(0, 0,
0, 0, 0), research = c(0, 0, 0, 0, 0), interest = c(1, 0,
0, 0, 0), lot = c(1, 0, 0, 0, 0), didnt = c(0, 1, 0, 0, 0
), nntppostinghost = c(0, 1, 0, 1, 1), name = c(0, 0, 2,
0, 0), ive = c(0, 0, 0, 0, 0), never = c(0, 0, 0, 0, 0),
inc = c(0, 0, 0, 0, 0), comput = c(0, 0, 0, 0, 0), replyto = c(0,
0, 0, 0, 0), email = c(0, 0, 0, 0, 0), program = c(0, 0,
0, 0, 0), servic = c(0, 0, 0, 0, 0), bit = c(0, 0, 0, 0,
0), dept = c(0, 0, 0, 0, 0), come = c(0, 0, 0, 0, 0), access = c(0,
0, 0, 0, 0), got = c(0, 0, 0, 0, 0), yet = c(0, 0, 0, 0,
0), suggest = c(0, 0, 0, 0, 0), engin = c(0, 0, 0, 0, 0),
last = c(0, 0, 0, 0, 0), your = c(0, 0, 0, 0, 0), next. = c(0,
0, 0, 0, 0), david = c(0, 0, 0, 0, 0), internet = c(0, 0,
0, 0, 0), depart = c(0, 0, 0, 0, 0), softwar = c(0, 0, 0,
0, 0), center = c(0, 0, 0, 0, 0), window = c(0, 0, 0, 0,
0), Negative = structure(c(2L, 1L, 2L, 1L, 2L), .Label = c("FALSE",
"TRUE"), class = "factor")), row.names = c("6", "7", "8",
"9", "11"), class = "data.frame")
测试数据遵循相同的布局!
好的。这是一个调试任务,但我认为我发现了错误.你的定义是
unique.feature.values <- unique(feature.values)
likelihood.matrix <- matrix(rep(NA),
nrow=dim(priors)[1],
ncol=length(unique.feature.values))
让
> unique.feature.values
[1] 4 0
和
> length(unique.feature.values)
[1] 2
在这种情况下,你的矩阵 likelihood.matrix
有
> dim(likelihood.matrix)
[1] 1 2
我们忽略了 dim(priors)[1]
部,并认为它是 1
. 你的第二个for-loop定义为
for (item in unique.feature.values){
[ some code ]
likelihood.matrix[, item] <- likelihood.item
}
而这正是它的破绽所在。item
是一个元素的 unique.feature.values
. 因此,如上文所假设的第一要素 unique.feature.values
等于 4
. 但是... likelihood.matrix
有尺寸 1x2
因此
likelihood.matrix[,item] <- likelihood.item
导致
likelihood.matrix[,4] <- likelihood.item
这是出界的。要么你必须定义你的 likelihood.matrix
由 nrow = max(unique.feature.values)
或者你必须将第二个for-loop改为
for (item in 1:length(unique.feature.values)) { }