通用快速排序实现很难处理大量字符串

问题描述 投票:0回答:1

好吧,基本上我用 C 语言为 uni 实现了这个快速排序,它应该能够接收任何类型的数组并对其进行排序。为了进行压力测试,我获得了一个包含 2000 万行的 Records.csv 文件,其结构如下:,,,field3 [float]>

我的算法必须通过按 field1 或 field2 或 field3 (在用户输入中指定)排序来对文件行进行排序

问题是,即使按 field2 和 3 排序工作得很好(在我的机器上只需 12/13 秒),按 field1 排序会导致巨大的等待时间和后续的分段错误。如果我尝试对一组较小的行进行排序(我尝试了 1k)并且它工作得很好,则不会发生这种情况。我还在同一个文件中实现了合并排序,即使需要 25 秒也能正常工作,所以我认为这不是 csv 文件的问题。

我留下带有算法的 sortlib.c 文件和带有 csv 文件读取器/写入器的mass_test.c 文件。

每一个建议对我来说都是黄金,所以提前:)

附注 完整代码位于 github 上“https://github.com/SamueleTondaRoc/SortLib-for-uni”

sortlib.h仅包含merge_sort和quick_sort所需的一些库和原型

sortlib.c:

#include "sortlib.h"

#define BASE(i) ((base) + ((i) * size))

void swap (void *a, void *b, size_t size, void* temp)
{
    memcpy (temp, a, size);
    memcpy (a, b, size);
    memcpy (b, temp, size);
}

size_t split (int low, int hig, void* base, size_t size, int (*compar)(const void*, const void*), void* temp)
{
    int i = low - 1, j = low, mid = low + ((hig - low) / 2), pivot = hig;
    
    if (!((*compar)(BASE (low), BASE (mid)) + (*compar)(BASE (low), BASE (hig)))) swap (BASE (low), BASE (hig), size, temp);
    else if (!((*compar)(BASE (mid), BASE (low)) + (*compar)(BASE (mid), BASE (hig)))) swap (BASE (mid), BASE (hig), size, temp);

    while (j < hig)
    {
        if ((*compar)(BASE (j), BASE (pivot)) < 0) 
        {
            i++;
            swap (BASE (i), BASE (j), size, temp);
        }
        j++;
    }
    swap (BASE (i + 1), BASE (pivot), size, temp);
    pivot = i + 1;
    return pivot;
}

/*
    @brief recursive method for the quick sort
*/
void quick_sort_R (int low, int hig, void *base, size_t size, int (*compar)(const void*, const void*), void* temp)
{
    if (low < hig)
    {
        int pivot = split (low, hig, base, size, compar, temp);
        
        quick_sort_R (low, pivot - 1, base, size, compar, temp);
        quick_sort_R (pivot + 1, hig, base, size, compar, temp);
    }
}

/*
    @brief quick sort for any type of array
    @param base pointer to the first element of the arry
    @param nitems lenght of the array
    @param size size of array items
    @param compar compare function
    @warning needs a compare function such that if the first elem is "bigger" than the second it returns 1, if it's equal it ret 0, else -1
*/
void quick_sort (void *base, size_t nitems, size_t size, int (*compar)(const void*, const void*)) 
{
    void *temp = (void*) malloc (size);
    if (base && compar && nitems)
        quick_sort_R (0, nitems - 1, base, size, compar, temp);
    free (temp);
}

mass_tester.c:

#include "support.h"
#include "../src/sortlib.h"
#include <stdint.h>
#include <time.h>

#define MAX_SIZE 100

typedef struct data_line {
    int id;
    char field1[MAX_SIZE];
    int field2;
    float field3;
} Line, *Line_ptr;

size_t calc_csvFile_len (FILE *file)
{
    size_t len = 0;
    printf ("calculating file lenght...\n");
    __clock_t start = clock(), end;

        char s[MAX_SIZE];
    while(fscanf (file, " %99[^\n]", s) == 1) len++;

    end = clock();
    double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf ("file lenght: %ld\nTook: %.2fsec\n\n", len, algo_time);

    fseek (file, 0L, SEEK_SET);
    return len;
}

void gather_data (FILE *infile, Line_ptr *data)
{
    int id, field2;
    char field1[MAX_SIZE];
    float field3;

    printf ("reading data...\n");
    __clock_t start = clock(), end;

    for (size_t i = 0; !feof (infile); i++)
    {
        fscanf (infile, "%d,%[^,],%d,%f\n", &id, field1, &field2, &field3);
        (*data)[i].id = id;
        strcpy ((*data)[i].field1, field1);
        (*data)[i].field2 = field2;
        (*data)[i].field3 = field3;
    }
    fclose (infile);

    end = clock();
    double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf ("data loading complete!\nTook: %.2fsec\n\n", algo_time);
}

void write_data (FILE *outfile, Line_ptr *data, size_t len)
{
    printf ("writing data...\n");
    __clock_t start = clock(), end;
    
    for (size_t i = 0; i < len; i++)
    {
        fprintf (outfile, "%d,%s,%d,%f\n", (*data)[i].id, (*data)[i].field1, (*data)[i].field2, (*data)[i].field3);
    }
    fclose (outfile);

    end = clock();
    double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf ("data writing complete!\nTook: %.2fsec\n\n", algo_time);
}

int cmp_field1 (const void* a, const void* b)
{
    char A[15], B[15];
    strcpy (A, ((Line_ptr) a) -> field1);
    strcpy (B, ((Line_ptr) b) -> field1);
    int cmp = strcmp (A, B);
    return cmp > 0 ? 1 : (cmp < 0 ? -1 : 0);
}

int cmp_field2 (const void* a, const void* b)
{
    int A = ((Line_ptr) a) -> field2;
    int B = ((Line_ptr) b) -> field2;
    return A < B ? -1 : (A == B ? 0 : 1);
}

int cmp_field3 (const void* a, const void* b)
{
    float A = ((Line_ptr) a) -> field3;
    float B = ((Line_ptr) b) -> field3;
    return A < B ? -1 : (A == B ? 0 : 1);
}

void sort_records (FILE *infile, FILE *outfile, size_t field, size_t algo)
{

    // preparing the array 
    size_t len = calc_csvFile_len (infile);
    Line_ptr data = (Line_ptr) malloc (sizeof(Line) * len);

    // filling the array with data from file
    gather_data (infile, &data);

    printf ("sorting...\n");

    // defining compare func
    int (*cmp_func)(const void*, const void*) = field == 1 ? (*cmp_field1) : (field == 2 ? (*cmp_field2) : (*cmp_field3));  

    // chosing sorting algorithm
    __clock_t start = clock(), end;
    if (algo == 1) merge_sort ((void**) &data, len, sizeof(Line), cmp_func);
    else quick_sort ((void*) data, len, sizeof(Line), cmp_func);
    end = clock();
    double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf ("array sorted in %.2f seconds\n\n", algo_time);

    // dumping the array data in an output file
    write_data(outfile, &data, len);
    free (data);
}

int main (int argc, char **argv)
{
    // reading command
    char* in_file_path = argv[1];
    char* out_file_path = argv[2];
    size_t field = (int) (*argv[3] - '0'),
            algo = (int) (*argv[4] - '0');

    // opening files (the closing happens in the gather_data and write_data methods)
    FILE *infile = NULL, *outfile = NULL;
    if (in_file_path) infile = fopen (in_file_path, "r");
    if (out_file_path) outfile = fopen (out_file_path, "w");

    // checking inputs and launching sorter
    if (infile && outfile && (field > 0 && field < 4) && (algo > 0 && algo < 3)) 
        sort_records (infile, outfile, field, algo);    
    else 
        printf("error in passed data: \"%s %s %s %ld %ld\"\n", argv[0], in_file_path, out_file_path, field, algo);  // info dump in case of error

    return 0;
}
c string quicksort
1个回答
0
投票

这看起来很奇怪:

#define MAX_SIZE 100

typedef struct data_line {
    int id;
    char field1[MAX_SIZE];  // So field1 can hold 100 chars
    int field2;
    float field3;
} Line, *Line_ptr;

但后来你有:

int cmp_field1 (const void* a, const void* b)
{
    char A[15], B[15];                     // 15 chars per array
    strcpy (A, ((Line_ptr) a) -> field1);  // Potential copy of 100 chars
                                           // into a buffer with only 15 chars
    strcpy (B, ((Line_ptr) b) -> field1);
    int cmp = strcmp (A, B);
    return cmp > 0 ? 1 : (cmp < 0 ? -1 : 0);
}

因此,根据输入文件,您似乎可能会出现“缓冲区溢出”。

顺便说一句:基于字符串的排序不需要特殊的比较函数。

strcmp
应该自己完成这项工作。

© www.soinside.com 2019 - 2024. All rights reserved.