我正在使用 quanteda 4.1.0,在用字典合并同义词和复数形式时遇到了一些意外的行为:字典中条目的顺序会影响特征的频率计数。
在下面的示例中,"banana" 及其复数形式共出现 3 次,而 "apple" 及其复数形式共出现 2 次。但只有当字典中 "apple" 排在 "banana" 之前时,我才能得到正确的频率计数。难道字典中条目的字母顺序会影响 dfm_lookup() 的行为吗?
# Reproducible example: the order of dictionary entries changes the counts
# returned by dfm_lookup(exclusive = FALSE).
library(quanteda)
library(quanteda.textstats)
library(dplyr) # supplies %>% and the data-frame filter() used below

# Document-feature matrix of the two example texts. It is never overwritten,
# so every lookup below starts from the same untouched input.
dfmat <- dfm(tokens(c(
  "I like apples, but I don't like apple pie. Bananas are OK",
  "I like bananas, but I don't like banana fritter."
)))
textstat_frequency(dfmat) %>% filter(grepl("apple|banana", feature))
# feature frequency rank docfreq group
# 7 bananas 2 3 2 all
# 8 apples 1 8 1 all
# 9 apple 1 8 1 all
# 13 banana 1 8 1 all

# With wildcards
# This works - expected behaviour
dict <- dictionary(list(
  apple = c("apple*"),
  banana = c("banana*")
))
dfmat_apple_first <- dfm_lookup(dfmat,
  dictionary = dict, exclusive = FALSE, capkeys = FALSE
)
textstat_frequency(dfmat_apple_first) %>%
  filter(grepl("apple|banana", feature))
# feature frequency rank docfreq group
# 3 banana 3 3 2 all
# 4 apple 2 4 1 all

# This doesn't work - unexpected behaviour
dict <- dictionary(list(
  banana = c("banana*"),
  apple = c("apple*")
))
dfmat_banana_first <- dfm_lookup(dfmat,
  dictionary = dict, exclusive = FALSE, capkeys = FALSE
)
textstat_frequency(dfmat_banana_first) %>%
  filter(grepl("apple|banana", feature))
# feature frequency rank docfreq group
# 3 apple 3 3 2 all
# 4 banana 2 4 1 all

# Without wildcards - get the same (puzzling) behaviour
# This works
# dict <- dictionary(list(apple = c("apple", "apples"),
#                         banana = c("banana", "bananas")))
# This doesn't work
# dict <- dictionary(list(banana = c("banana", "bananas"),
#                         apple = c("apple", "apples")))
我对 Quanteda 没有太多经验,但无论如何我都会(也许不明智)分享我所发现的与这个问题相关的内容。
字典中条目的顺序似乎确实会影响 dfm_lookup() 的结果,这至少出乎我的意料(也许并非有意为之?)。
首先,这种意外的行为似乎并不是字母顺序造成的;相反,它似乎与文档-特征矩阵(dfm)中特征的排列顺序有关。请看下面的例子:
# Toy corpus: in the resulting dfm the three "foo" features of the first
# document come before the "bar" features of the second document.
feature_matrix <- dfm(
  tokens(
    c(
      "foo1 foo2 foo3",
      "bar1 bar2 bar3 foo4"
    )
  )
)

# Case 1: keys listed alphabetically (bar, foo) - the counts come out wrong.
dict1 <- dictionary(list(bar = "bar*", foo = "foo*"))
feature_matrix1 <- dfm_lookup(
  feature_matrix,
  dictionary = dict1,
  exclusive = FALSE,
  capkeys = FALSE
)
feature_matrix1
# docs  bar foo
# text1   3   0
# text2   0   4

# Case 2: keys listed in reverse alphabetical order (foo, bar) - the counts
# come out right.
dict2 <- dictionary(list(foo = "foo*", bar = "bar*"))
feature_matrix2 <- dfm_lookup(
  feature_matrix,
  dictionary = dict2,
  exclusive = FALSE,
  capkeys = FALSE
)
feature_matrix2
# docs  foo bar
# text1   3   0
# text2   1   3
我还粗略看了一下 dfm_lookup() 的源代码,并认为我找到了一个快速修复方案(尚未经过充分测试,我对此也不是非常有把握,但至少它对一些示例有效)。我只改动了一行代码,它似乎解决了上述两个示例中的意外行为。
#' Apply a dictionary to a dfm (patched copy of quanteda's dfm_lookup)
#'
#' Replaces the features of `x` that match dictionary values by the
#' corresponding dictionary keys and merges columns that end up with the same
#' name. Only single-feature, non-nested matches are used; phrasal and nested
#' patterns are dropped.
#'
#' The function expects quanteda to be attached (`library(quanteda)`).
#' Internal quanteda helpers are always reached through `quanteda:::`, so the
#' function can be defined and called from a user script.
#'
#' @param x Object coerced with `as.dfm()`.
#' @param dictionary A quanteda dictionary; anything else raises an error.
#' @param levels Passed on to `object2id()` to select dictionary levels.
#' @param exclusive If `TRUE`, keep only the dictionary keys as features
#'   (keys without any match are added as empty columns). If `FALSE`, keep
#'   unmatched features and replace matched ones by their keys.
#' @param valuetype One of `"glob"`, `"regex"` or `"fixed"`; passed on to
#'   `object2id()`.
#' @param case_insensitive Passed on to `object2id()`.
#' @param capkeys If `TRUE`, keys are upper-cased with `char_toupper()`.
#' @param nomatch Optional name of a feature collecting all unmatched
#'   features; only used when `exclusive = TRUE` (a warning is issued
#'   otherwise).
#' @param verbose If `TRUE`, report the number of dictionary keys.
#' @return A dfm rebuilt with the attributes of `x`. `x` itself is returned
#'   unchanged when it has no features or no documents.
my_dfm_lookup <- function(x, dictionary, levels = 1:5,
                          exclusive = TRUE,
                          valuetype = c("glob", "regex", "fixed"),
                          case_insensitive = TRUE,
                          capkeys = !exclusive,
                          nomatch = NULL,
                          verbose = quanteda_options("verbose")) {
  x <- as.dfm(x)
  exclusive <- quanteda:::check_logical(exclusive)
  capkeys <- quanteda:::check_logical(capkeys)
  verbose <- quanteda:::check_logical(verbose)

  if (!nfeat(x) || !ndoc(x)) return(x)

  if (!is.dictionary(dictionary))
    stop("dictionary must be a dictionary object")
  valuetype <- match.arg(valuetype)
  type <- colnames(x)
  attrs <- attributes(x)

  if (verbose)
    quanteda:::catm("applying a dictionary consisting of ",
                    length(dictionary), " key",
                    if (length(dictionary) > 1L) "s" else "", "\n", sep = "")

  # `ids` is a list named by dictionary key; each element holds the column
  # indices of `type` matched by one pattern.
  ids <- quanteda:::object2id(dictionary, type, valuetype, case_insensitive,
                              quanteda:::field_object(attrs, "concatenator"),
                              levels)

  # flag nested patterns (the same match listed twice under one key)
  if (length(ids)) {
    m <- factor(names(ids), levels = unique(names(ids)))
    dup <- unlist(lapply(split(ids, m), duplicated), use.names = FALSE)
  } else {
    dup <- logical()
  }

  key <- attr(ids, "key")
  ids <- ids[lengths(ids) == 1 & !dup] # drop phrasal and nested patterns
  id_key <- match(names(ids), key)     # key index of every match
  id <- unlist(ids, use.names = FALSE) # feature index of every match
  if (capkeys)
    key <- char_toupper(key)

  if (length(id)) {
    if (exclusive) {
      if (!is.null(nomatch)) {
        # collect every unmatched feature under one extra key
        id_nomatch <- setdiff(seq_len(nfeat(x)), id)
        id <- c(id, id_nomatch)
        id_key <- c(id_key, rep(length(key) + 1,
                                length(id_nomatch)))
        key <- c(key, nomatch[1])
      }
      # columns and names are both in match order here, so they stay aligned
      col_new <- key[id_key]
      x <- x[, id]
      quanteda:::set_dfm_featnames(x) <- col_new
      # merge identical keys and add non-existent keys
      result <- dfm_match(dfm_compress(x, margin = "features"), key)
    } else {
      if (!is.null(nomatch))
        warning("nomatch only applies if exclusive = TRUE")

      # Keep features in their original column order; a feature matched by
      # several keys is repeated once per matching key.
      n_feat <- nfeat(x)
      n_match <- tabulate(id, nbins = n_feat)
      id_rep <- rep(seq_len(n_feat), times = pmax(n_match, 1L))
      col_new <- type[id_rep]

      # The matched positions of `id_rep` are in ascending feature order,
      # whereas `id`/`id_key` are in dictionary order. Sorting the keys by
      # `id` lines each key up with the column it belongs to, whatever the
      # order of the dictionary entries.
      col_new[id_rep %in% id] <- key[id_key[order(id)]]

      x <- x[, id_rep]
      quanteda:::set_dfm_featnames(x) <- col_new
      result <- dfm_compress(x, margin = "features")
    }
  } else {
    # nothing matched
    if (exclusive) {
      if (!is.null(nomatch)) {
        # every token counts towards the nomatch feature
        result <- as.dfm(matrix(ntoken(x), ncol = 1,
                                dimnames = list(docnames(x), nomatch)))
      } else {
        result <- quanteda:::make_null_dfm(document = docnames(x),
                                           feature = key)
      }
    } else {
      result <- x
    }
  }

  if (exclusive)
    quanteda:::field_object(attrs, "what") <- "dictionary"
  quanteda:::rebuild_dfm(result, attrs)
}