我有两个csv文件,每个文件的第一列都是字符型数据,最多可达50000个条目。我需要计算这两个文件中这两列之间的余弦相似度。我尝试过使用R中的lsa包,但得到的结果有问题。谁能帮我?以下是我的代码。
# Question's attempt: write one text column from each of two CSV files into a
# temporary directory, build a term matrix from that directory with
# lsa::textmatrix(), and compute the cosine similarity of its first two columns.
library(lsa)
# tempfile() only returns a unique path; dir.create() turns it into a directory.
Gyan=tempfile()
dir.create(Gyan)
# First CSV is picked interactively.
single_tags=read.csv(file.choose(), sep = ',')
# At top level this only echoes the column; the result is not stored.
as.character(single_tags$CULTAGS)
options(max.print = 1000000)
# NOTE(review): sep = '1' produces the path "<Gyan>1D1", which lies outside the
# Gyan directory -- a "/" separator was presumably intended.
write(as.character(single_tags$CULTAGS),file = paste(Gyan, 'D1',sep = '1'))
# Second CSV is picked interactively.
Single_ASFA=read.csv(file.choose(),sep = ',')
options(max.print = 1000000)
# NOTE(review): this line reads column ASFACV while the write() below reads
# ASFCV -- confirm which name actually exists in the file.
as.character(Single_ASFA$ASFACV)
# NOTE(review): paste() defaults to sep = " ", so the target is "<Gyan> /" with
# no file name -- presumably "<Gyan>/D2" was intended.
write(as.character(Single_ASFA$ASFCV),file = paste(Gyan, '/'))
# Build the term matrix from the files found in the Gyan directory.
Mycomparison = textmatrix(Gyan, minWordLength = 1)
Mycomparison
# NOTE(review): myMatrix is never defined in this script; Mycomparison is the
# only matrix created above.
res = lsa::cosine(myMatrix[,1],myMatrix[,2])
res
`myMatrix` 与 `Mycomparison` 之间似乎没有任何关系:代码中从未定义过 `myMatrix`。
如果你把 `myMatrix` 替换为 `Mycomparison`,并在写文件时用 `sep = "/"` 拼接路径(分别写入 `D1` 和 `D2`),一切正常。见如下:
# Data simulation ----
# Two small single-column data frames stand in for the real CSV files.
single_tags_df <- data.frame(
  CULTAGS = c("dog", "cat", "sushi", "mouse", "leech")
)
Single_ASFA_df <- data.frame(ASFCV = c("hamster", "mouse", "sushi", "man"))
# row.names = FALSE keeps each file to its single tag column.
write.csv(single_tags_df, file = "single.csv", row.names = FALSE)
write.csv(Single_ASFA_df, file = "ASFA.csv", row.names = FALSE)

# Cosine similarity ----
library(lsa)
options(max.print = 1000000)

# textmatrix() treats each file in a directory as one document, so each tag
# column is written to its own file (D1, D2) inside a fresh temp directory.
Gyan <- tempfile()
dir.create(Gyan)

single_tags <- read.csv("single.csv")
writeLines(as.character(single_tags$CULTAGS), con = file.path(Gyan, "D1"))

Single_ASFA <- read.csv("ASFA.csv")
writeLines(as.character(Single_ASFA$ASFCV), con = file.path(Gyan, "D2"))

# Rows are terms, columns are the documents D1 and D2.
Mycomparison <- textmatrix(Gyan)
Mycomparison

# recursive = TRUE is required: unlink() does not delete directories otherwise,
# so the temp directory would be left behind.
unlink(Gyan, recursive = TRUE)

# Two shared terms out of 5 and 4: 2 / sqrt(5 * 4) = 0.4472136.
res <- lsa::cosine(Mycomparison[, 1], Mycomparison[, 2])
res
# [,1]
# [1,] 0.4472136