好吧,基本上我用 C 语言为 uni 实现了这个快速排序,它应该能够接收任何类型的数组并对其进行排序。为了进行压力测试,我获得了一个包含 2000 万行的 Records.csv 文件,其结构如下:
我的算法必须通过按 field1 或 field2 或 field3 (在用户输入中指定)排序来对文件行进行排序
问题是,即使按 field2 和 3 排序工作得很好(在我的机器上只需 12/13 秒),按 field1 排序会导致巨大的等待时间和后续的分段错误。如果我尝试对一组较小的行进行排序(我尝试了 1k)并且它工作得很好,则不会发生这种情况。我还在同一个文件中实现了合并排序,即使需要 25 秒也能正常工作,所以我认为这不是 csv 文件的问题。
我留下带有算法的 sortlib.c 文件和带有 csv 文件读取器/写入器的mass_test.c 文件。
每一个建议对我来说都是黄金,所以提前:)
附注 完整代码位于 github 上“https://github.com/SamueleTondaRoc/SortLib-for-uni”
sortlib.h仅包含merge_sort和quick_sort所需的一些库和原型
sortlib.c:
#include "sortlib.h"
#define BASE(i) ((base) + ((i) * size))
void swap (void *a, void *b, size_t size, void* temp)
{
memcpy (temp, a, size);
memcpy (a, b, size);
memcpy (b, temp, size);
}
size_t split (int low, int hig, void* base, size_t size, int (*compar)(const void*, const void*), void* temp)
{
int i = low - 1, j = low, mid = low + ((hig - low) / 2), pivot = hig;
if (!((*compar)(BASE (low), BASE (mid)) + (*compar)(BASE (low), BASE (hig)))) swap (BASE (low), BASE (hig), size, temp);
else if (!((*compar)(BASE (mid), BASE (low)) + (*compar)(BASE (mid), BASE (hig)))) swap (BASE (mid), BASE (hig), size, temp);
while (j < hig)
{
if ((*compar)(BASE (j), BASE (pivot)) < 0)
{
i++;
swap (BASE (i), BASE (j), size, temp);
}
j++;
}
swap (BASE (i + 1), BASE (pivot), size, temp);
pivot = i + 1;
return pivot;
}
/*
@brief recursive method for the quick sort
*/
void quick_sort_R (int low, int hig, void *base, size_t size, int (*compar)(const void*, const void*), void* temp)
{
if (low < hig)
{
int pivot = split (low, hig, base, size, compar, temp);
quick_sort_R (low, pivot - 1, base, size, compar, temp);
quick_sort_R (pivot + 1, hig, base, size, compar, temp);
}
}
/*
@brief quick sort for any type of array
@param base pointer to the first element of the arry
@param nitems lenght of the array
@param size size of array items
@param compar compare function
@warning needs a compare function such that if the first elem is "bigger" than the second it returns 1, if it's equal it ret 0, else -1
*/
void quick_sort (void *base, size_t nitems, size_t size, int (*compar)(const void*, const void*))
{
void *temp = (void*) malloc (size);
if (base && compar && nitems)
quick_sort_R (0, nitems - 1, base, size, compar, temp);
free (temp);
}
mass_tester.c:
#include "support.h"
#include "../src/sortlib.h"
#include <stdint.h>
#include <time.h>
#define MAX_SIZE 100
typedef struct data_line {
int id;
char field1[MAX_SIZE];
int field2;
float field3;
} Line, *Line_ptr;
size_t calc_csvFile_len (FILE *file)
{
size_t len = 0;
printf ("calculating file lenght...\n");
__clock_t start = clock(), end;
char s[MAX_SIZE];
while(fscanf (file, " %99[^\n]", s) == 1) len++;
end = clock();
double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
printf ("file lenght: %ld\nTook: %.2fsec\n\n", len, algo_time);
fseek (file, 0L, SEEK_SET);
return len;
}
void gather_data (FILE *infile, Line_ptr *data)
{
int id, field2;
char field1[MAX_SIZE];
float field3;
printf ("reading data...\n");
__clock_t start = clock(), end;
for (size_t i = 0; !feof (infile); i++)
{
fscanf (infile, "%d,%[^,],%d,%f\n", &id, field1, &field2, &field3);
(*data)[i].id = id;
strcpy ((*data)[i].field1, field1);
(*data)[i].field2 = field2;
(*data)[i].field3 = field3;
}
fclose (infile);
end = clock();
double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
printf ("data loading complete!\nTook: %.2fsec\n\n", algo_time);
}
void write_data (FILE *outfile, Line_ptr *data, size_t len)
{
printf ("writing data...\n");
__clock_t start = clock(), end;
for (size_t i = 0; i < len; i++)
{
fprintf (outfile, "%d,%s,%d,%f\n", (*data)[i].id, (*data)[i].field1, (*data)[i].field2, (*data)[i].field3);
}
fclose (outfile);
end = clock();
double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
printf ("data writing complete!\nTook: %.2fsec\n\n", algo_time);
}
int cmp_field1 (const void* a, const void* b)
{
char A[15], B[15];
strcpy (A, ((Line_ptr) a) -> field1);
strcpy (B, ((Line_ptr) b) -> field1);
int cmp = strcmp (A, B);
return cmp > 0 ? 1 : (cmp < 0 ? -1 : 0);
}
int cmp_field2 (const void* a, const void* b)
{
int A = ((Line_ptr) a) -> field2;
int B = ((Line_ptr) b) -> field2;
return A < B ? -1 : (A == B ? 0 : 1);
}
int cmp_field3 (const void* a, const void* b)
{
float A = ((Line_ptr) a) -> field3;
float B = ((Line_ptr) b) -> field3;
return A < B ? -1 : (A == B ? 0 : 1);
}
void sort_records (FILE *infile, FILE *outfile, size_t field, size_t algo)
{
// preparing the array
size_t len = calc_csvFile_len (infile);
Line_ptr data = (Line_ptr) malloc (sizeof(Line) * len);
// filling the array with data from file
gather_data (infile, &data);
printf ("sorting...\n");
// defining compare func
int (*cmp_func)(const void*, const void*) = field == 1 ? (*cmp_field1) : (field == 2 ? (*cmp_field2) : (*cmp_field3));
// chosing sorting algorithm
__clock_t start = clock(), end;
if (algo == 1) merge_sort ((void**) &data, len, sizeof(Line), cmp_func);
else quick_sort ((void*) data, len, sizeof(Line), cmp_func);
end = clock();
double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
printf ("array sorted in %.2f seconds\n\n", algo_time);
// dumping the array data in an output file
write_data(outfile, &data, len);
free (data);
}
int main (int argc, char **argv)
{
// reading command
char* in_file_path = argv[1];
char* out_file_path = argv[2];
size_t field = (int) (*argv[3] - '0'),
algo = (int) (*argv[4] - '0');
// opening files (the closing happens in the gather_data and write_data methods)
FILE *infile = NULL, *outfile = NULL;
if (in_file_path) infile = fopen (in_file_path, "r");
if (out_file_path) outfile = fopen (out_file_path, "w");
// checking inputs and launching sorter
if (infile && outfile && (field > 0 && field < 4) && (algo > 0 && algo < 3))
sort_records (infile, outfile, field, algo);
else
printf("error in passed data: \"%s %s %s %ld %ld\"\n", argv[0], in_file_path, out_file_path, field, algo); // info dump in case of error
return 0;
}
这看起来很奇怪:
#define MAX_SIZE 100
typedef struct data_line {
int id;
char field1[MAX_SIZE]; // So field1 can hold 100 chars
int field2;
float field3;
} Line, *Line_ptr;
但后来你有:
int cmp_field1 (const void* a, const void* b)
{
char A[15], B[15]; // 15 chars per array
strcpy (A, ((Line_ptr) a) -> field1); // Potential copy of 100 chars
// into a buffer with only 15 chars
strcpy (B, ((Line_ptr) b) -> field1);
int cmp = strcmp (A, B);
return cmp > 0 ? 1 : (cmp < 0 ? -1 : 0);
}
因此,根据输入文件,您似乎可能会出现“缓冲区溢出”。
顺便说一句:基于字符串的排序不需要特殊的比较函数。
strcmp
应该自己完成这项工作。