我正在尝试分块(chunk)读取一个巨大的 .csv 文件,并在读取的同时进行过滤。但是我的代码只读取了文件的一部分(2000万到4500万)。
我也已经尝试过使用 data.table,但没有成功。
# Filter a very large "#"-separated CSV in fixed-size chunks so the whole file
# never has to be held in memory. Rows whose `codigo_natureza_juridica` is one
# of `codigos_alvo` are written to `arq_saida`, one chunk at a time.
# Uses `%>%`, `filter()`, `select()` and `everything()`, which must already be
# attached (e.g. via dplyr).
arq_grande <- file("cnpj_dados_cadastrais_pj.csv", "r")
tam_chunk <- 5000
# NOTE(review): the leading "/" targets the filesystem root -- confirm intended.
arq_saida <- "/cnpj_dados_cadastrais_pj_filtrado_coop.csv"
codigos_alvo <- c("2143", "2330")

# The connection stays open between reads, so each read.csv() call continues
# where the previous one stopped. `finally` guarantees it is closed even when
# a read or write fails.
tryCatch(
  {
    # First read: header line plus 10 data rows. The column names found here
    # are reused for every later (header-less) chunk.
    df1 <- read.csv(
      arq_grande,
      nrows = 10, header = TRUE, sep = "#", dec = "."
    )
    # iconv() coerces its input with as.character(), so every column becomes
    # a character vector after this step.
    # NOTE(review): numeric-looking ids are parsed as numbers by read.csv()
    # before this conversion; consider colClasses = "character" if leading
    # zeros matter -- confirm against the data.
    df1[] <- lapply(df1, iconv, from = "UTF-8", to = "latin1")

    # `%in%` tests membership in the set of codes. The previous `==` recycled
    # the two codes across rows and silently dropped matching rows.
    df_filtrado <- df1 %>%
      filter(codigo_natureza_juridica %in% codigos_alvo) %>%
      select(cnpj, everything())
    write.table(
      df_filtrado, arq_saida,
      row.names = FALSE, sep = "#", dec = "."
    )
    print(names(df1))

    # Counts only the rows read inside the loop (not the first 10 rows).
    totalRows <- 0
    repeat {
      df <- read.csv(
        arq_grande,
        header = FALSE, sep = "#",
        col.names = names(df1), nrows = tam_chunk
      )
      totalRows <- totalRows + nrow(df)
      cat("Lendo", nrow(df), "linhas, total lido", totalRows, "\n")
      # A zero-row chunk means nothing is left to read.
      if (nrow(df) == 0) {
        break
      }

      df[] <- lapply(df, iconv, from = "UTF-8", to = "latin1")
      df_filtrado <- df %>%
        filter(codigo_natureza_juridica %in% codigos_alvo) %>%
        select(cnpj, everything())
      # Append without repeating the header written by the first write.
      write.table(
        df_filtrado, arq_saida,
        append = TRUE, col.names = FALSE, row.names = FALSE,
        sep = "#", dec = "."
      )
    }
  },
  finally = close(arq_grande)
)
我在这里看过其他示例,但都没有奏效。抱歉,我刚接触这类数据处理。
我只想读取我的 .csv 文件的所有行。
您可以使用 readr::read_csv,并配合 skip 和 n_max 这两个参数来分块读取 csv 文件:
skip 是开头要跳过的行数,n_max 是之后要读取的行数。
# Read a large CSV in chunks of `chunk_size` data rows using readr's
# `skip` / `n_max` arguments.
library("readr")

file <- "large.csv"
chunk_size <- 1e+6
# Number of data rows already consumed (the header line is not counted).
line_num <- 0

# Read the column names once. Every chunk after the first starts in the middle
# of the file, where read_csv() would otherwise take a data row as the header.
header <- names(read_csv(file, n_max = 0, col_types = cols()))

while (TRUE) {
  chunk <- read_csv(
    file,
    col_names = header,
    # + 1 skips the header line itself, since the names are supplied above.
    skip = line_num + 1, n_max = chunk_size
  )
  # NOTE(review): column types are guessed per chunk and may differ between
  # chunks; pass an explicit `col_types` if a stable schema is needed.

  # Do something with the chunk of data

  # If there are fewer than `chunk_size` lines, then reached end of file
  if (nrow(chunk) < chunk_size) {
    break
  }
  line_num <- line_num + chunk_size
}