Import necessary libraries and namespaces
#include <bits/stdc++.h>
#define MAX_TREE_HT 256
using namespace std;
Underlying Data Structures :
1. MinHeap Node - a structure describing a node of the tree built by Huffman Encoding.
// A Huffman tree node
struct MinHeapNode
{
char data; // One of the input characters
int freq; // Frequency of the character
MinHeapNode *left, *right; // Left and right child
MinHeapNode(char data, int freq)
{
left = right = NULL;
this->data = data;
this->freq = freq;
}
};
2. A priority Queue - it stores all singleton nodes initially.
// utility function for the priority queue
struct compare
{
bool operator()(MinHeapNode* l, MinHeapNode* r)
{
return (l->freq > r->freq);
}
};
// STL priority queue to store heap tree, with respect
// to their heap root node value
priority_queue<MinHeapNode*, vector<MinHeapNode*>, compare> minHeap;
-Step 1 : Count Frequencies
// to store the frequency of character of the input data
map<char, int> freq;
// utility function to store map each character with its
// frequency in input string
void calcFreq(string str, int n)
{
for (int i=0; i<str.size(); i++)
freq[str[i]]++;
}
-Step 2: Build Encoding Tree
AND
-Step 3 : Build Encoding Map
// to map each character its huffman value
map<char, string> codes;
// utility function to store characters along with
// there huffman value in a hash table, here we
// have C++ STL map
void storeCodes(struct MinHeapNode* root, string str)
{
if (root==NULL)
return;
if (root->data != '$')
codes[root->data]=str;
storeCodes(root->left, str + "0");
storeCodes(root->right, str + "1");
}
// function to build the Huffman tree and store it
// in minHeap
void HuffmanCodes(int size)
{
struct MinHeapNode *left, *right, *top;
for (map<char, int>::iterator v=freq.begin(); v!=freq.end(); v++)
minHeap.push(new MinHeapNode(v->first, v->second));
while (minHeap.size() != 1)
{
left = minHeap.top();
minHeap.pop();
right = minHeap.top();
minHeap.pop();
top = new MinHeapNode('$', left->freq + right->freq);
top->left = left;
top->right = right;
minHeap.push(top);
}
storeCodes(minHeap.top(), "");
}
-Step 4 : Encode the data
// utility function to print characters along with
// there huffman value
void printCodes(struct MinHeapNode* root, string str)
{
if (!root)
return;
if (root->data != '$')
cout << root->data << ": " << str << "\n";
printCodes(root->left, str + "0");
printCodes(root->right, str + "1");
}
Decoding :
// function iterates through the encoded string s
// if s[i]=='1' then move to node->right
// if s[i]=='0' then move to node->left
// if leaf node append the node->data to our output string
string decode_file(struct MinHeapNode* root, string s)
{
string ans = "";
struct MinHeapNode* curr = root;
for (int i=0;i<s.size();i++)
{
if (s[i] == '0')
curr = curr->left;
else
curr = curr->right;
// reached leaf node
if (curr->left==NULL and curr->right==NULL)
{
ans += curr->data;
curr = root;
}
}
// cout<<ans<<endl;
return ans+'\0';
}
Main program to demonstrate :
string str = "mynameisanjishnu";
string encodedString, decodedString;
calcFreq(str, str.length());
HuffmanCodes(str.length());
cout << "Character With their Frequencies:\n";
for (auto v=codes.begin(); v!=codes.end(); v++)
cout << v->first <<' ' << v->second << endl;
for (auto i: str)
encodedString+=codes[i];
cout << "\nEncoded Huffman data:\n" << encodedString << endl;
decodedString = decode_file(minHeap.top(), encodedString);
cout << "\nDecoded Huffman Data:\n" << decodedString << endl;
string str1 = "thisgivesademo";
string encodedString1, decodedString1;
for (auto i: str1)
encodedString1+=codes[i];
cout << "\nEncoded Huffman data:\n" << encodedString1 << endl;
decodedString1 = decode_file(minHeap.top(), encodedString1);
cout << "\nDecoded Huffman Data:\n" << decodedString1 << endl;
The above example is used to demonstrate the fact that when apriori statistics of one file is used to compress another file, the huffman coding not only consumes more number of bits, but is also highly inaccurate.