按索引范围有效地将大型 CSV 文件拆分为较小的文件，无需将整个文件加载到 RAM 中

Question

我有一个很大的 CSV 文件 (60GB)，无法放入 RAM。第一列是一个已排序的索引，取值范围为 2000 到 2999，同一索引值可能在多行中重复出现。我想将这个 60GB 的文件拆分为 10 个文件，每个大约 6 GB，但同一索引范围内的行不能被“拆分”到两个文件中。

因此，第一个文件包含索引为 2000 到 2099 的行，下一个文件包含索引为 2100 到 2199 的行，依此类推。每个输出文件还应保留原始文件的标头行。

我无法使用像qsv的split这样的工具，因为10个文件中每个文件的行数可能不同。我也尝试过使用

qsv apply

，但它似乎试图将所有内容加载到RAM中。

更新：Miller看起来也很有希望。我不知道是否应该使用

split

或

tee

，因为我不想在表中添加额外的列。

Answer 1

我最终编写了一个 150 行的 C 程序。它假设 index 位于第一列，并且文件名的格式为

[0-9]+to[0-9]+_.*

，如

10000to10999_Blocks.csv

。

它仅存储当前行和上一行，并对它们进行比较以检查 index 如何更改。

输出文件将写入当前目录。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

// Use wc --max-line-length to discover max line length
// It returned 296
// Size of the line, header, "rest" and output-filename buffers declared in main.
#define MAX_LINE_LENGTH 512
// Size of the buffers that hold the textual "from"/"to" index values.
#define MAX_DIGITS_LENGTH 16
// Exchange the bytes of two lvalues x and y through a temporary byte buffer.
// The array-size expression evaluates to -1 when sizeof(x) != sizeof(y), so
// mismatched operand sizes are rejected at compile time.
// Both arguments are expanded several times and have their address taken,
// so they must be plain lvalues without side effects.
#define swap(x,y) do \
   { unsigned char swap_temp[sizeof(x) == sizeof(y) ? (signed)sizeof(x) : -1]; \
     memcpy(swap_temp,&y,sizeof(x)); \
     memcpy(&y,&x,       sizeof(x)); \
     memcpy(&x,swap_temp,sizeof(x)); \
    } while(0)
// Pass n to __davo_checkfile together with the current source file and line;
// that function terminates the program when n is NULL.
#define DAVO_CHECKPTR(n) __davo_checkfile(n, __FILE__, __LINE__)

/*
 * Terminate the program if `ptr` is NULL.
 *
 * On a NULL pointer the current errno description and number are printed to
 * stderr, followed by the source location given by `f` (file name) and `l`
 * (line number), and the process exits with status 1. A non-NULL pointer
 * makes the call a no-op.
 */
void __davo_checkfile(const void * ptr, const char f[], const int l) {
    if (ptr != NULL) {
        return;
    }

    /* Snapshot errno once so both uses below report the same value. */
    const int saved_errno = errno;

    fprintf(stderr, "Error: %s (%d)\n", strerror(saved_errno), saved_errno);
    fprintf(stderr, "@ %s:%d\n", f, l);
    exit(1);
}

/*
 * Split a file name of the form XXXXXtoYYYYY_ZZZZZZ into its three parts.
 *
 *   fname - NUL-terminated name to parse (at most the first 1024 characters
 *           are examined).
 *   from  - receives everything before the first 't'.
 *   to    - receives everything after the character that follows that 't'
 *           (assumed to be the 'o' of "to") up to the first '_'.
 *   rest  - receives everything after that '_'.
 *
 * All three outputs are always NUL-terminated, even when the name does not
 * contain "to" or "_"; parts that are missing come back as empty strings.
 *
 * No buffer sizes are passed in, so nothing is bounds-checked here: each
 * output buffer must be large enough for the part it may receive (in the
 * worst case the whole examined name plus the terminator).
 */
void parse_fname(const char * fname, char* from, char* to, char *rest) {
    enum fsname_fname_parser {
        FIRST_NUMBER,
        TO_2,
        SECOND_NUMBER,
        REST,
    } state = FIRST_NUMBER;
    /* One write cursor per output so each can be terminated independently. */
    int from_len = 0;
    int to_len = 0;
    int rest_len = 0;

    for (int i = 0; fname[i] != '\0' && i < 1024; i++) {
        const char c = fname[i];

        switch (state) {
            case FIRST_NUMBER:
                if (c == 't') {
                    state = TO_2;
                } else {
                    from[from_len++] = c;
                }
                break;
            case TO_2:
                /* Skip the character right after 't' (the 'o' of "to"). */
                state = SECOND_NUMBER;
                break;
            case SECOND_NUMBER:
                if (c == '_') {
                    state = REST;
                } else {
                    to[to_len++] = c;
                }
                break;
            case REST:
                rest[rest_len++] = c;
                break;
        }
    }

    /* Terminate every output regardless of the state the parser stopped in. */
    from[from_len] = '\0';
    to[to_len] = '\0';
    rest[rest_len] = '\0';
}

/*
 * Return the number of leading characters that str1 and str2 have in common.
 *
 * At most max_n characters are compared, and the comparison stops at the
 * first NUL terminator, so the result is never larger than max_n or than the
 * length of the shorter string. The bound is tested before either string is
 * indexed, so index max_n itself is never read.
 */
int common_prefix_length(const char *str1, const char *str2, const int max_n) {
    int i = 0;
    /* str1[i] != '\0' together with equality implies str2[i] != '\0' too. */
    while (i < max_n && str1[i] != '\0' && str1[i] == str2[i]) {
        i++;
    }
    return i;
}

/*
 * Build the output file name "<from>to<to>_<rest>" for the CSV row `line`.
 *
 *   line         - a CSV row; its first field (text before the first ',')
 *                  is the index. End of string or end of line also ends the
 *                  field, so a row without a comma cannot run off the buffer.
 *   fname        - receives the NUL-terminated file name. Its size is not
 *                  known here: the caller must provide room for twice the
 *                  index length, plus "to", "_", `rest` and the terminator.
 *   extra_digits - how many trailing characters of the index are replaced by
 *                  '9' to form <to>; values outside [0, index length] are
 *                  clamped into that range.
 *   rest         - suffix appended after the underscore.
 *
 * <from> is the index exactly as it appears in `line`; <to> is the same
 * index with its last `extra_digits` characters set to '9'.
 *
 * Returns the length of the generated name, or -1 if formatting failed.
 */
int line_to_fname(const char *line, char *fname, int extra_digits, const char * rest) {
    int index_len = 0;
    while (line[index_len] != ',' && line[index_len] != '\0' &&
           line[index_len] != '\n' && line[index_len] != '\r') {
        index_len++;
    }

    /* Clamp so the run of nines never starts before the index itself. */
    int nines = extra_digits;
    if (nines < 0) {
        nines = 0;
    }
    if (nines > index_len) {
        nines = index_len;
    }
    const int kept = index_len - nines;

    /* "<index>to<index without its last `nines` characters>" */
    const int head = sprintf(fname, "%.*sto%.*s", index_len, line, kept, line);
    if (head < 0) {
        return -1;
    }

    memset(fname + head, '9', (size_t)nines);

    const int tail = sprintf(fname + head + nines, "_%s", rest);
    if (tail < 0) {
        return -1;
    }

    return head + nines + tail;
}

/* Report a failed I/O operation on `name` with the errno text and exit. */
static void die_io(const char *what, const char *name) {
    fprintf(stderr, "Error %s %s: %s (%d)\n", what, name, strerror(errno), errno);
    exit(EXIT_FAILURE);
}

/*
 * Split a CSV file whose first column is a sorted index into several files,
 * one per index range, keeping only the current and previous row in memory.
 *
 * Usage: prog filename [extra_digits=1]
 *
 * The input name must look like <from>to<to>_<rest>. A new output file is
 * started whenever the first `required_cpl` characters of a row differ from
 * those of the previous row, where required_cpl is the length of the common
 * prefix of <from> and <to> plus extra_digits. Every output file starts with
 * a copy of the input header and is created in the current directory.
 *
 * Limitation: rows are read with fgets into MAX_LINE_LENGTH-byte buffers, so
 * a longer row is processed as several separate chunks.
 */
int main(int argc, char* argv[]) {
    FILE * fi = NULL, *fo = NULL;
    int required_cpl, prefix_len, extra_digits = 1;
    char header[MAX_LINE_LENGTH];
    char auxbuf1[MAX_LINE_LENGTH] = "\0";
    char auxbuf2[MAX_LINE_LENGTH] = "\0";
    char * line = auxbuf1, * lastline = auxbuf2;
    char from[MAX_DIGITS_LENGTH], to[MAX_DIGITS_LENGTH], rest[MAX_LINE_LENGTH];
    char output_fname[MAX_LINE_LENGTH] = "";

    if (argc < 2) {
        fprintf(stderr, "Usage: %s filename [extra_digits=1]\n", argv[0]);
        return EXIT_FAILURE;
    }

    if (argc >= 3) {
        /* strtol instead of atoi so that garbage and out-of-range values are rejected. */
        char *end = NULL;
        errno = 0;
        const long value = strtol(argv[2], &end, 10);
        if (end == argv[2] || *end != '\0' || errno == ERANGE ||
            value < 0 || value >= MAX_DIGITS_LENGTH) {
            fprintf(stderr, "Invalid extra_digits: %s\n", argv[2]);
            return EXIT_FAILURE;
        }
        extra_digits = (int)value;
    }

    parse_fname(argv[1], from, to, rest);
    prefix_len = common_prefix_length(from, to, MAX_DIGITS_LENGTH);
    required_cpl = extra_digits + prefix_len;
    printf("from: %s, to: %s, rest: %s, cpl: %d, rcpl: %d\n", from, to, rest, prefix_len, required_cpl);

    DAVO_CHECKPTR(fi = fopen(argv[1], "r"));
    // Save header to variable
    DAVO_CHECKPTR(fgets(header, MAX_LINE_LENGTH, fi));

    printf("Saved header: %s", header);

    while (fgets(line, MAX_LINE_LENGTH, fi) != NULL) {
        // If first line or should split
        if (lastline[0] == '\0' || common_prefix_length(line, lastline, required_cpl) < required_cpl) {
            if (fo != NULL && fclose(fo) != 0) {
                die_io("closing", output_fname);
            }
            fo = NULL;

            // Open new file and write header
            line_to_fname(line, output_fname, extra_digits, rest);
            printf("Split to %s\n", output_fname);

            /* Opening the input's own name for writing would truncate the
             * file that is still being read (only identical path strings
             * are detected here). */
            if (strcmp(output_fname, argv[1]) == 0) {
                fprintf(stderr, "Error: output file %s would overwrite the input file\n", output_fname);
                fclose(fi);
                return EXIT_FAILURE;
            }

            DAVO_CHECKPTR(fo = fopen(output_fname, "w"));
            // Write header
            if (fputs(header, fo) == EOF) {
                die_io("writing", output_fname);
            }
        }

        if (fputs(line, fo) == EOF) {
            die_io("writing", output_fname);
        }

        /* The row just written becomes the reference for the next comparison. */
        swap(line, lastline);
    }

    if (ferror(fi)) {
        die_io("reading", argv[1]);
    }
    fclose(fi);

    /* fo is still NULL when the input contained nothing but the header. */
    if (fo != NULL && fclose(fo) != 0) {
        die_io("closing", output_fname);
    }

    return EXIT_SUCCESS;
}

按索引范围有效地将大型 CSV 文件拆分为较小的文件，无需将整个文件加载到 RAM 中

问题描述投票：0回答：1

1个回答

最新问题

按索引范围有效地将大型 CSV 文件拆分为较小的文件，无需将整个文件加载到 RAM 中

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1