压缩算法之霍夫曼编码浅析与实现

简介

霍夫曼编码，属于熵编码的范畴，常用于各种信息压缩场合，如图像、视频、音频压缩领域。常与之对比的是算术编码，基本思想是一致的。霍夫曼本质是算术编码的一种特例。

基本原理：

核心原则：出现频率高的信息，分配少的比特，频率低的信息则分配多的比特
简单来讲：将一串信息压缩到一棵前缀二叉树上。

算法效果：

实现思路

编码思路：

统计字符出现概率，由低到高排列
动态构建二叉树（将字符放到对应二叉树上）
重新给二叉树赋值权重
遍历二叉树得到哈夫曼编码，也即码本
重新读一遍字符，将字符串进行编码

实现代码

该算法用C++语言实现，代码如下：

#include <iostream>
#include <vector>
#include <string.h>
#include <algorithm>
#include <stdio.h>
#include <stdlib.h>
#include <queue>
#include <hash_map>

using namespace std;

// Algorithm Coding Test
// 霍夫曼编码
typedef struct node {
   
    char ch;
    int val;
    struct node *left, *right, *self;
    friend bool operator <(const node &a, const node &b) {
   
        return a.val > b.val;   // 由大到小排列，优先出最右端的，最小的数； 
    }                           // 如果是 < 符号，则由小到大排列，优先出最右端的，最大的数
} node;                         // 如果新插入的值，与队列中某值相等，则按稳定排序，从左侧插入。所以两个值相同的时候，后插入的后出队
                                // 如A已经在队列，待插入的B值与A值大小相同，则插入后为优先队列为BA（而非AB），出序列的顺序为AB 
typedef struct mapnode {
   
    char ch;
    char code[256];
    struct mapnode* next;
} mapnode;

priority_queue<node> p; 
char res[30];
string codebook[26];  // codebook[0] 表示A对应的二进制编码字符串
string code;  // 单个字符对应的二进制编码
string strEncode;  // 将字符串编码后的
string strDecode;  // 解码后的字符串

void dfs(node *root, int level) {
   
    if (root->left == root->right) {
   
        if (level == 0) {
   
            res[level] = '0';
            level++;
        }
        res[level] = '\0';
        // printf("%c => %s\n", root->ch, res);
        cout << root->ch << " => " << res << endl;
        int i = 0;
        while (res[i]) {
   
            code.push_back(res[i++]);
        }
        codebook[root->ch -'A'] = code;
        // cout << codebook[root->ch -'A'] << endl;
        code.clear();  // 要临时清空下缓存，否则会一直累积，导致将整个码本的二进制编码都放进去
    } else {
   
        res[level] = '1';
        dfs(root->right, level + 1);
        res[level] = '0';
        dfs(root->left, level + 1);
    }
}

void huffman_encode(char* s) {
   
    int pos = 0;
    while (s[pos] != '\0') {
   
        strEncode.append(codebook[s[pos] - 'A']); //往后面追加
        pos++;
    }
}

void huffman_decode(node *root, string s) {
    
    int pos = 0;
    while (pos < s.size()) {
   
        node *tree = root;
        while (tree->left != tree->right) {
   
            if (s[pos] == '0') tree = tree->left;
            else tree = tree->right;
            pos++;
        }
        strDecode.push_back(tree->ch);
    }
}

node* huffman(int *hash) {
   
    node* root, fir, sec; // root为指针， fir, sec为节点
    // 先将hash的值放入节点中，同时压入优先队列
    for (int i = 0; i < 26; i++) {
   
        if (hash[i] == 0) continue;
        root = (node*) malloc(sizeof(struct node));
        root->ch = 'A' + i;
        root->val = hash[i];
        root->self = root;
        root->right = NULL;
        root->left = NULL;
        p.push(*root);  // 把root指向的节点直接push进去
        // cout << (p.top()).val << endl;
    }

    // 读取优先队列建立二叉树链表
    while (p.size() > 1) {
   
        fir = p.top(); p.pop();
        sec = p.top(); p.pop();
        root = (node *) malloc(sizeof(node));
        if (root != nullptr) {
   
            root->val = fir.val + sec.val;
            root->self = root;
            if (fir.val == sec.val) {
    // 两值相同时
                root->left = sec.self;  // 让新形成的树根节点在左侧
                root->right = fir.self;
            } else {
   
                root->left = fir.self;
                root->right = sec.self;
            }
            p.push(*root); // 形成的新节点再压入优先队列
        } else {
   
            cout << "malloc fail" << endl;
            return nullptr;
        }
    }
    fir = p.top(); p.pop();

    // 遍历该二叉树链表
    dfs(fir.self, 0);
    return fir.self;
}

int main(int argc, const char *argv[])
{
   
    // char text[100];
    int hash[30];
    memset(hash, 0, sizeof(hash));  //哈希数组初始化全为0
    // scanf("%s",text); //读入信息串text
    // char text[] = "ABACDAAB";
    // char text[] = "WEWILLWEWILLRU";
    char text[] = "IWANTTOMEETYOUAREYOUOK";
    int i = 0;
    while (text[i]) {
   
        hash[text[i] - 'A']++;
        i++;
    }
    
    node *root = huffman(hash);  // build codebook
    huffman_encode(text);  // encode
    if (root != nullptr) {
   
        huffman_decode(root, strEncode);  // decode
    } else {
   
        cout << "Decode Error! " << endl;
    }

    cout << "Before: " << text << endl;
    cout << "Encode: " << strEncode << endl;
    cout << "Decode: " << text << endl;

    system("pause");
    return 0;
}

可优化点

限于精力，总结有以下几点可改进之处，待有兴趣的读者深入：

去掉存储自己的指针地址，改为取地址：
- 尝试后发现，不能省，因为fir,sec是固定申请的两个节点，相当于副本临时用，可以把动态申请的内存节点所有内容复制到fir，sec中，但是树的根地址还是要用动态申请时内存的地址，而不能用fir，sec；fir和sec地址一直是固定的，直接取它的地址就建立不了树。
- 再试另一种方案：单独出两个node* 变量来存储取通过p.top()出来的两个地址；
  - =>该方案尝试也不通，top返回来的值不是变量，不能直接取地址firAdd = &(p.top());
根据码本对字符编码
对编码进行解码

更多的功能扩展：下步可扩展到对数字、标点符号进行编码，同时用更好的数据结构去处理，而不只是数组hash表。

参考资料

霍夫曼树及霍夫曼编码的C语言实现