我已经写了这个函数。它对我输入的文件(要压缩的文件)起到了一些作用 - .txt(文本)文件,但是输出(压缩的)文件比原始文件大。例如:我输入了一个1169MB的文件,程序返回了一个1687MB的文件。
可以采取什么措施来解决这个问题?
也许我应该使用不同的方法或变体来使用霍夫曼算法来压缩文件?
struct HuffNode{
unsigned data;
struct HuffNode *left;
struct HuffNode *right;
struct HuffNode *parent;
int is_leaf;
};
typedef struct HuffNode HuffNode;
void count_frequency(FILE *fp, unsigned *freq) {
size_t original_pos = ftell(fp);
int ch;
while ((ch = fgetc(fp)) != EOF) {
if (ch >= 0 && ch < 256)
freq[ch]++;
}
fseek(fp, original_pos, SEEK_SET);
}
void construct_huffman(unsigned *freq_in, HuffNode *tree) {
int count = 256;
unsigned freq[256] = {0};
HuffNode *node[256];
for (int i = 0; i < 256; i++) {
freq[i] = freq_in[i];
tree[i].data = i;
tree[i].left = NULL;
tree[i].right = NULL;
tree[i].parent = NULL;
tree[i].is_leaf = 1;
node[i] = &tree[i];
}
for (int i = 0; i < 256; i++) {
for (int j = 0; j < 256 - i - 1; j++) {
if (j + 1 < 256 && (freq[j] < freq[j + 1] || (freq[j] == freq[j+1] && j < j + 1))) {
unsigned t = freq[j];
freq[j] = freq[j + 1];
freq[j + 1] = t;
HuffNode *p = node[j];
node[j] = node[j + 1];
node[j + 1] = p;
}
}
}
while (count > 1) {
int pos = 512 - count;
HuffNode *parent = &tree[pos];
int i = count - 2, j = count - 1;
parent->left = node[j];
parent->right = node[i];
node[j]->parent = parent;
node[i]->parent = parent;
node[i]->is_leaf = 0;
node[j]->is_leaf = 0;
node[i] = parent;
freq[i] += freq[j];
for (; i > 0 && freq[i] > freq[i - 1]; i--) {
unsigned t = freq[i];
freq[i] = freq[i - 1];
freq[i - 1] = t;
HuffNode *p = node[i];
node[i] = node[i - 1];
node[i - 1] = p;
}
count--;
}
node[0]->parent = NULL;
}
void encode_stream(FILE *fin, FILE *fout, HuffNode *tree, unsigned *padding) {
int n;
unsigned char ch;
unsigned char buff = 0, nbuf = 0;
HuffNode *p;
unsigned char code[256] = {0};
while ((n = fgetc(fin)) != EOF) {
if(n < 0 || n >= 256){
printf("invalid characters in the file");
return;
}
ch = n;
p = &tree[ch];
int code_length = 0;
while (p->parent) {
if (p == p->parent->right) {
code[code_length] = 1;
}
p = p->parent;
code_length++;
}
for (int i = code_length - 1; i >= 0; i--) {
buff |= code[i] << nbuf;
nbuf++;
if (nbuf == 8) {
fputc(buff, fout);
nbuf = 0;
buff = 0;
}
}
}
if (nbuf > 0) {
fputc(buff, fout);
}
*padding = 8 - nbuf;
}
void compress_file(FILE *fin, FILE *fout) {
unsigned freq[256] = {0}, padding = 0; //- хз
int flag = 0;
size_t padding_pos;
HuffNode tree[512];
if (flag == 0) {
count_frequency(fin, freq);
construct_huffman(freq, tree);
rewind(fin);
for (int i = 0; i < 256; i++) {
fwrite(&freq[i], sizeof(unsigned), 1, fout);
}
padding_pos = ftell(fout);
fwrite(&padding, sizeof(unsigned), 1, fout);
encode_stream(fin, fout, tree, &padding);
fseek(fout, padding_pos, SEEK_SET);
fwrite(&padding, sizeof(unsigned), 1, fout);
}
}
我不知道为什么它不能正确压缩文件。
霍夫曼算法产生比原始文件更大的压缩文件
原始文件可能已被压缩或只是大量随机数据。
并非所有文件都可压缩。
如果所有文件都是可压缩的,则可以对生成的文件重新应用压缩。 通过重复,我们最终可以达到 0 的大小。