霍夫曼算法生成的压缩文件比原始文件大

问题描述 投票:0回答:1

我已经写了这个函数。它对我输入的文件(要压缩的文件)起到了一些作用 - .txt(文本)文件,但是输出(压缩的)文件比原始文件大。例如:我输入了一个1169MB的文件,程序返回了一个1687MB的文件。

可以采取什么措施来解决这个问题?

也许我应该使用不同的方法或变体来使用霍夫曼算法来压缩文件?

struct HuffNode{
    unsigned data;
    struct HuffNode *left;
    struct HuffNode *right;
    struct HuffNode *parent;
    int is_leaf;
};
typedef struct HuffNode HuffNode;

void count_frequency(FILE *fp, unsigned *freq) {
  size_t original_pos = ftell(fp);
  int ch;
  while ((ch = fgetc(fp)) != EOF) {
    if (ch >= 0 && ch < 256)
      freq[ch]++;
  }
  fseek(fp, original_pos, SEEK_SET);
}

void construct_huffman(unsigned *freq_in, HuffNode *tree) {
  int count = 256; 
  unsigned freq[256] = {0}; 
  HuffNode *node[256]; 

  for (int i = 0; i < 256; i++) { 
    freq[i] = freq_in[i]; 
    tree[i].data = i; 
    tree[i].left = NULL;
    tree[i].right = NULL;
    tree[i].parent = NULL;
    tree[i].is_leaf = 1; 
    node[i] = &tree[i]; 
  }
  for (int i = 0; i < 256; i++) {  
    for (int j = 0; j < 256 - i - 1; j++) {
      if (j + 1 < 256 && (freq[j] < freq[j + 1] || (freq[j] == freq[j+1] && j < j + 1))) { 
        unsigned t = freq[j];
        freq[j] = freq[j + 1];
        freq[j + 1] = t;
        HuffNode *p = node[j]; 
        node[j] = node[j + 1];
        node[j + 1] = p;
      }
    }
  }

  while (count > 1) {
    int pos = 512 - count;
    HuffNode *parent = &tree[pos];
    int i = count - 2, j = count - 1;
    parent->left = node[j]; 
    parent->right = node[i]; 
    node[j]->parent = parent;
    node[i]->parent = parent;
    node[i]->is_leaf = 0;
    node[j]->is_leaf = 0;
    node[i] = parent;
    freq[i] += freq[j]; 
    for (; i > 0 && freq[i] > freq[i - 1]; i--) { 
      unsigned t = freq[i];
      freq[i] = freq[i - 1];
      freq[i - 1] = t;
      HuffNode *p = node[i];
      node[i] = node[i - 1];
      node[i - 1] = p;
    }
    count--;
  }
  node[0]->parent = NULL;
}

void encode_stream(FILE *fin, FILE *fout, HuffNode *tree, unsigned *padding) {
  int n;
  unsigned char ch;
  unsigned char buff = 0, nbuf = 0;
  HuffNode *p;
  unsigned char code[256] = {0};
  while ((n = fgetc(fin)) != EOF) {
    if(n < 0 || n >= 256){
      printf("invalid characters in the file");
      return;
    }
    ch = n;
    p = &tree[ch];
    int code_length = 0;
    while (p->parent) {
      if (p == p->parent->right) {
        code[code_length] = 1;
      }
      p = p->parent;
      code_length++;
    }
    for (int i = code_length - 1; i >= 0; i--) {
      buff |= code[i] << nbuf;
      nbuf++;
      if (nbuf == 8) {
        fputc(buff, fout);
        nbuf = 0;
        buff = 0;
      }
    }
  }
  if (nbuf > 0) {
    fputc(buff, fout);
  }
  *padding = 8 - nbuf;
}

void compress_file(FILE *fin, FILE *fout) {

  unsigned freq[256] = {0}, padding = 0; //- хз
  int flag = 0;
  size_t padding_pos;
  HuffNode tree[512]; 
  
  if (flag == 0) {
    count_frequency(fin, freq);
    construct_huffman(freq, tree);
    rewind(fin);
    for (int i = 0; i < 256; i++) {
      fwrite(&freq[i], sizeof(unsigned), 1, fout);
    }
    padding_pos = ftell(fout);
    fwrite(&padding, sizeof(unsigned), 1, fout);
    encode_stream(fin, fout, tree, &padding);
    fseek(fout, padding_pos, SEEK_SET);
    fwrite(&padding, sizeof(unsigned), 1, fout);
    
  }
}

我不知道为什么它不能正确压缩文件。

c compression text-files txt huffman-code
1个回答
0
投票

霍夫曼算法产生比原始文件更大的压缩文件

原始文件可能已被压缩或只是大量随机数据。

并非所有文件都可压缩。

如果所有文件都是可压缩的,则可以对生成的文件重新应用压缩。 通过重复,我们最终可以达到 0 的大小。

最新问题
© www.soinside.com 2019 - 2025. All rights reserved.