我正在构建一个逻辑回归模型来预测最有效的买家。我有一个数据表,并且已经能够写出直到拆分(训练集/测试集)这一步的代码;但是数据表中有一列的取值只是金额区间,例如 $100 - $200、$200 - $300。如何为这些区间分配数值,以便继续往下进行?下面是参考代码:
# Load the raw data and inspect it interactively.
file_data_0 <- read.csv('......csv')
View(file_data_0)

# The file marks missing values with a literal "?"; turn those cells into NA
# and drop every row that contains at least one NA.
file_data_0[file_data_0 == '?'] <- NA
filter_data_1 <- na.omit(file_data_0)
View(filter_data_1)
summary(filter_data_1)

# Distribution of the response variable: distinct values, counts, proportions.
unique(filter_data_1$spend)
table(filter_data_1$spend)
table(filter_data_1$spend) / nrow(filter_data_1)
class(filter_data_1)

# Encode history_segment as numbers BEFORE splitting, so that train_data and
# test_data both receive the encoded column.
# The column holds labelled ranges (e.g. "$100 - $200"), not numbers, so a
# plain as.numeric() on the strings returns NA for every row. Going through
# factor() assigns each distinct label an integer code (1, 2, ...) in sorted
# label order instead.
# NOTE(review): integer codes treat the segments as equally spaced; use
# factor(...) alone if the model should estimate one effect per segment.
class(filter_data_1$history_segment)
filter_data_1$history_segment <- as.numeric(
  factor(filter_data_1$history_segment)
)

# 75/25 train/test split; sample.split() keeps the ratio of `spend` values
# roughly equal in both parts. The seed makes the split reproducible.
library(caTools)
set.seed(64000)
split_data <- sample.split(filter_data_1$spend, SplitRatio = 0.75)
train_data <- subset(filter_data_1, split_data == TRUE)
test_data <- subset(filter_data_1, split_data == FALSE)

# Sanity check: train and test row counts should add up to the full data set.
dim(train_data)
dim(test_data)
dim(filter_data_1)
希望下面的内容对你有所帮助!
# Sample data
# Copied by hand from the attached image file; in future, always share your
# data using `dput(df)` so that others can reproduce it exactly.
df <- data.frame(
  recency = c(10, 6, 7, 9),
  history_segment = c(
    "2) $100 - $200", "3) $200 - $350",
    "2) $100 - $200", "5) $500 - $750"
  ),
  mens = c(1, 1, 0, 1)
)

# Clean the 'history_segment' labels so they become convenient column names
# later on (otherwise accessing these variables would be painful):
#   inner gsub: "-" becomes "_"
#   outer gsub: drops the leading "<digits>)" prefix, all whitespace and "$"
# e.g. "2) $100 - $200" becomes "his_seg_100_200".
# The ")" is escaped and the prefix may have several digits, so the pattern
# is valid in every regex engine and also handles labels such as "10) ...".
df$history_segment <- paste0(
  "his_seg_",
  gsub("^\\d+\\)|\\s|\\$", "", gsub("-", "_", df$history_segment))
)

# Dummy-code the 'history_segment' variable: one 0/1 indicator column per
# distinct segment, bound to the remaining columns of df.
# drop = FALSE keeps the left-hand side a data frame even if only a single
# column is left after removing history_segment.
library(psych)
df_new <- cbind(
  df[, names(df) != "history_segment", drop = FALSE],
  dummy.code(df$history_segment)
)
现在你可以使用 df_new(而不是 df)作为主要数据。
> df
recency history_segment mens
1 10 his_seg_100_200 1
2 6 his_seg_200_350 1
3 7 his_seg_100_200 0
4 9 his_seg_500_750 1
> df_new
recency mens his_seg_100_200 his_seg_200_350 his_seg_500_750
1 10 1 1 0 0
2 6 1 0 1 0
3 7 0 1 0 0
4 9 1 0 0 1