如何创建一个哈夫曼树?这个问题很简单,哈夫曼树的带权路径长度是最小的,为了满足这一条件,我们总是要把权值小的节点放在树的底端,而权值越大,就应该越靠近根。

那么,一个创建哈夫曼树的方法已经呼之欲出了:创建一个以节点权值排序的小根堆,每次取出第一小和第二小的树(节点也是树)合并成一个新的二叉树,直到最后只剩下一棵树,这棵树就是哈夫曼树。

图片说明

下面给出一个基于STL的C++创建哈夫曼树的代码:

#include <algorithm>
#include <iostream>
#include <queue>
#include <string>
#include <vector>
class HuffmanTree {
    struct node {
        int weight;
        struct node *Lchild, *Rchild;
        bool operator<(const node &x) const { return weight < x.weight; }
        bool operator>(const node &x) const { return weight > x.weight; }
    };

    typedef std::priority_queue<node, std::vector<node>, std::greater<node> >
        HuffQueue;

    HuffQueue GetWeight() {
        std::cout << "Input nodes&#39; power, end with -1" << std::endl;
        HuffQueue q;
        int temp;
        while (std::cin >> temp, temp != -1) q.push({temp, NULL, NULL});
        return q;
    }

    node *CreatHuffmanTree(HuffQueue q) {
        node *root = new node;
        while (q.size() > 1) {
            node *Lch = new node, *Rch = new node, *P = new node;
            *Lch = q.top(), q.pop();
            *Rch = q.top(), q.pop();
            P->Lchild = Lch, P->Rchild = Rch,
            P->weight = Lch->weight + Rch->weight;
            q.push(*P);
        }
        *root = q.top();
        return root;
    }

    void PreOrderVisit(node *root) {
        if (root == NULL) return;
        if (root->Lchild == NULL && root->Rchild == NULL)
            std::cout << "LeafNode";
        std::cout << root->weight << &#39;\n&#39;;
        PreOrderVisit(root->Lchild);
        PreOrderVisit(root->Rchild);
    }

    void DestoryHuffmanTree(node *root) {
        if (root == NULL) return;
        DestoryHuffmanTree(root->Lchild);
        DestoryHuffmanTree(root->Rchild);
        delete root;
    }

    void HuffmanCoding(node *root, int len) {
        static bool Hcode[1024];
        if (root == NULL) return;
        if (root->Lchild == NULL && root->Rchild == NULL) {
            std::cout << "the node weigh " << root->weight << "&#39;s code is:";
            for (int i = 0; i < len; ++i) std::cout << Hcode[i];
            std::cout << &#39;\n&#39;;
        } else {
            Hcode[len] = 0;
            HuffmanCoding(root->Lchild, len + 1);
            Hcode[len] = 1;
            HuffmanCoding(root->Rchild, len + 1);
        }
    }

    void PrintAsBracket(node *root) {
        if (root == NULL) return;
        std::cout << root->weight;
        if (root->Lchild || root->Rchild) {
            std::cout << &#39;(&#39;;
            PrintAsBracket(root->Lchild);
            if (root->Rchild) std::cout << &#39;,&#39;;
            PrintAsBracket(root->Rchild);
            std::cout << &#39;)&#39;;
        }
    }

    int GetSumPathLength(node *root, int len) {
        if (!root) return 0;
        if (root->Lchild == NULL && root->Rchild == NULL)
            return root->weight * len;
        else
            return GetSumPathLength(root->Lchild, len + 1) +
                   GetSumPathLength(root->Rchild, len + 1);
    }

   public:
    int show() {
        try {
            node *root = CreatHuffmanTree(GetWeight());
            std::cout << &#39;\n&#39; << "PreOrderVisit:" << &#39;\n&#39;;
            PreOrderVisit(root);
            std::cout << &#39;\n&#39; << "Print as brackets: ";
            PrintAsBracket(root);
            std::cout << &#39;\n&#39; << &#39;\n&#39;;
            HuffmanCoding(root, 0);
            std::cout << &#39;\n&#39;
                      << "The sum of weighted path lengths of Huffman trees: "
                      << GetSumPathLength(root, 0);
            DestoryHuffmanTree(root);
            return 0;
        } catch (const std::exception &e) {
            std::cerr << e.what() << &#39;\n&#39;;
            return 1;
        }
    }
};

int main() {
    HuffmanTree test;
    test.show();
    return 0;
}

然后我们可以做点别的,比较ASCII码和HUFFMAN编码的编码效率:

#include <algorithm>
#include <cstdio>
#include <iostream>
#include <queue>
#include <string>
#include <vector>
using namespace std;
int main() {
    string s;
    priority_queue<int, vector<int>, greater<int> > Q;
    puts("ACSII coding and Huffman coding efficiency comparison\n");
    while (printf("Input string:\n"), getline(cin, s) && s != "END") {
        sort(s.begin(), s.end());
        int t = 1;
        for (int i = 1; i < s.length(); ++i)
            if (s[i] != s[i - 1])
                Q.push(t), t = 1;
            else
                ++t;
        Q.push(t);
        int ans = 0, a, b;
        if (Q.size() == 1) ans = Q.top();
        while (Q.size() > 1) {
            a = Q.top(), Q.pop();
            b = Q.top(), Q.pop();
            Q.push(a + b);
            ans += a + b;
        }
        Q.pop();
        printf(
            "\nASCII Coding\t\t%lu bit\nHuffman Coding\t\t%d "
            "bit\nCompression ratio\t%.2f\n\n",
            s.size() * 8, ans, (s.size() * 8.0) / ans);
    }
    return 0;
}

在这里,我们发现HUFFMAN编码的bit占用恰好和上面那一段代码的哈夫曼权值和相同,我们写了一个简易的程序来验证这一点:

#include <algorithm>
#include <cstdio>
#include <iostream>
#include <map>
#include <queue>
#include <string>
#include <vector>
using namespace std;
struct node {
    int weight;
    char ch;
    struct node *Lchild, *Rchild;
    bool operator<(const node &x) const { return weight < x.weight; }
    bool operator>(const node &x) const { return weight > x.weight; }
};
typedef std::priority_queue<node, std::vector<node>, std::greater<node> >
    HuffQueue;

HuffQueue GetWeight(string s) {
    HuffQueue q;
    map<char, int> cnt;
    for (int i = 0; i < s.size(); i++) ++cnt[s[i]];
    map<char, int>::iterator iter;
    for (iter = cnt.begin(); iter != cnt.end(); iter++)
        q.push({iter->second, iter->first, NULL, NULL});
    return q;
}

node *CreatHuffmanTree(HuffQueue q) {
    node *root = new node;
    while (q.size() > 1) {
        node *Lch = new node, *Rch = new node, *P = new node;
        *Lch = q.top(), q.pop();
        *Rch = q.top(), q.pop();
        P->Lchild = Lch, P->Rchild = Rch, P->weight = Lch->weight + Rch->weight;
        q.push(*P);
    }
    *root = q.top();
    return root;
}
int GetSumPathLength(node *root, int len) {
    if (!root) return 0;
    if (root->Lchild == NULL && root->Rchild == NULL)
        return root->weight * len;
    else
        return GetSumPathLength(root->Lchild, len + 1) +
               GetSumPathLength(root->Rchild, len + 1);
}

////////////////////////////////////////////////////////////////////////////
int res;
map<char, string> HuffCode;
void HuffmanCoding(node *root, int len) {
    static bool Hcode[1024];
    if (root == NULL) return;
    if (root->Lchild == NULL && root->Rchild == NULL) {
        string tmp = "";
        std::cout << "the node weigh " << root->weight << "&#39;s code is:";
        for (int i = 0; i < len; ++i)
            std::cout << Hcode[i], tmp += (&#39;0&#39; + Hcode[i]);
        std::cout << &#39;\n&#39;;
        HuffCode[root->ch] = tmp;
        res += (root->weight * (len));
    } else {
        Hcode[len] = 0;
        HuffmanCoding(root->Lchild, len + 1);
        Hcode[len] = 1;
        HuffmanCoding(root->Rchild, len + 1);
    }
}

void show(string s) {
    string ans = "";
    for (int i = 0; i < s.length(); ++i) ans += HuffCode[s[i]];
    cout << ans << endl;
    cout << &#39;\n&#39; << ans.length() << &#39;\n&#39; << endl;
}

int main() {
    string s;
    priority_queue<int, vector<int>, greater<int> > Q;
    puts("ACSII coding and Huffman coding efficiency comparison\n");
    while (printf("Input string:\n"), getline(cin, s) && s != "END") {
        sort(s.begin(), s.end());
        int t = 1;
        for (int i = 1; i < s.length(); ++i)
            if (s[i] != s[i - 1])
                Q.push(t), t = 1;
            else
                ++t;
        Q.push(t);
        int ans = 0, a, b;
        if (Q.size() == 1) ans = Q.top();
        while (Q.size() > 1) {
            a = Q.top(), Q.pop();
            b = Q.top(), Q.pop();
            Q.push(a + b);
            ans += a + b;
        }
        Q.pop();
        cout << "计算哈夫曼树中间节点的权值和 " << ans << endl;
        // Next Calculate Way
        res = 0;
        node *r = CreatHuffmanTree(GetWeight(s));
        HuffmanCoding(r, 0);
        cout << "计算哈夫曼树的带权路径和 " << res << endl;
        cout << &#39;\n&#39;
             << "The sum of weighted path lengths of Huffman trees: "
             << GetSumPathLength(r, 0) << &#39;\n&#39;
             << endl;
        show(s);
    }
    return 0;
}

​比如说这样的一棵哈夫曼树

图片说明

它对应的字符串可以是AAAABBBCCD

A对应的哈夫曼编码是0,1bit
B对应的哈夫曼编码是10,2bit
C对应的哈夫曼编码是110,3bit
D对应的哈夫曼编码是111,3bit
A出现4次,B出现3次,C出现两次,D出现1次,整个字符串占用

然后另一种计算方式,把树遍历一遍:

图片说明

我们先计算权重1的节点'D'对哈夫曼编码最后的总bit占用的贡献:三根线,每根代表一个bit,那么D所消耗的就是,同理,计算每一个节点,我们就可以得到和上面一模一样的式子:

也就是说,哈夫曼树的带权路径和也就是这种编码模式所对应的权重下占用的总bit。
图片说明

特化到一般情况,我们也会发现非叶子节点之和其实就是:a加了3次b加了3次c加了2次d加了1次,也就是

也就是说中间节点之和等于带权路径和