In: Computer Science
Please explain how the frequencies of characters were gotten, how they were sorted, and how the codes were gotten for the program below.
#include <iostream>
#include <vector>
#include <queue>
#include <string>
using namespace std;
class Huffman_Code
{
struct New_Node
{
char value;
size_t frequencies;
New_Node* left;
New_Node* right;
New_Node(char value, size_t frequencies) : value(value),
frequencies(frequencies),
left(NULL),
right(NULL)
{}
~New_Node()
{
delete left;
delete right;
}
};
struct compare
{
bool operator()(New_Node* l, New_Node* r)
{
return (l->frequencies > r->frequencies);
}
};
New_Node* top;
void print_Code(New_Node* root, string str)
{
if(root == NULL)
return;
if(root->value == '$')
{
print_Code(root->left, str + "0");
print_Code(root->right, str + "1");
}
if(root->value != '$')
{
cout << root->value <<" : " << str << "\n";
print_Code(root->left, str + "0");
print_Code(root->right, str + "1");
}
}
public:
Huffman_Code() {};
~Huffman_Code()
{
delete top;
}
void Huffman_tree(vector<char>& value, vector<size_t>& frequencies, size_t size)
{
New_Node* left;
New_Node* right;
priority_queue<New_Node*, vector<New_Node*>, compare > minHeap;
for(size_t i = 0; i < size; ++i)
{
minHeap.push(new New_Node(value[i], frequencies[i]));
}
while(minHeap.size() != 1)
{
left = minHeap.top();
minHeap.pop();
right = minHeap.top();
minHeap.pop();
top = new New_Node('$', left->frequencies + right->frequencies);
top->left = left;
top->right = right;
minHeap.push(top);
}
print_Code(minHeap.top(), "");
}
};
int main()
{
int number, freq;
char ch;
Huffman_Code set1;
vector<char> value;
vector<size_t> frequencies;
cout<<"Length of elements \n";
cin>>number;
cout<<"Enter the elements \n";
for (int i=0;i<number;i++)
{
cin>>ch;
value.insert(value.end(), ch);
}
cout<<"Enter the frequencies \n";
for (int i=0;i<number;i++)
{
cin>>freq;
frequencies.insert(frequencies.end(), freq);
}
size_t size = value.size();
set1.Huffman_tree(value, frequencies, size);
return 0;
}
1. The above given code is of Huffman coding algorithm, most basicly used for COMPRESSION.
Question 1: How the frequencies of characters were found?
Imagine you have a some text file which have data i.e "ABCD".
Now usually these characters are stored in binary representation of their ASCII values.
These ASCII values when represented in Binary takes 16bit each, so total size of file=64bit.
Mr. Huffman came up with a technique saying that why not we represent A,B,C,D with binary number
less than 16 bit lets say 2bit but each character having distinct values.
Ex.A:00,B:01,C:10,D:11
So file size would be, 2x4=8bit
So here we saved 56bits.
But practically there would be many times A would be there, many times B would be there right.
So, what if A is represented 50 times, B is represented 60 times but c is 5 times. So again can
we minimise the file size further.
Huffman told may be yes, if most repeated characters have minimum bit representation rather than
those with least repeated characters
This technique further minimized the big file sizes.
So how these frequencies are obtained is,
we select a file which needs to be compressed,
We go through the each distinct character of file and note down how many times they are repeated.
Then we will find frequencies of characters.
2. What is Min heap?
3. Explanation of Huffman Algorithm, which is implemented by the program:
Output for the above example:
Code Explanation:
int main()
{
int number, freq;
char ch;
Huffman_Code set1;
vector<char> value;//Vector containing values a,b,c,d,e,f, vector is nothing but array
vector<size_t> frequencies;//Frequencies of character
cout<<"Length of elements \n";
cin>>number;//Number of characters
cout<<"Enter the elements \n";
for (int i=0;i<number;i++)
{
cin>>ch;
value.insert(value.end(), ch);//inserting from back of the vector
}//taking input
cout<<"Enter the frequencies \n";
for (int i=0;i<number;i++)
{
cin>>freq;
frequencies.insert(frequencies.end(), freq);
}
size_t size = value.size();
set1.Huffman_tree(value, frequencies, size);//calling huffman_tree creation
return 0;
}
In the main function you are just providing the values for the vector of char, vector of frequencies. You can check comments once in snippets.
struct New_Node
{
char value;
size_t frequencies;
New_Node* left;
New_Node* right;
New_Node(char value, size_t frequencies) : value(value),//when new node is created the Node creation calls the constructor with left and right pointer initialized as null
frequencies(frequencies), //the char value and respective frequency is given, to the constructor to be assigned to respective field
left(NULL),
right(NULL)
{}
~New_Node()//this is destructor after the program execution is over, memory is freed
{
delete left;
delete right;
}
};
//creation of new node of the tree
You create a new node by the above code for the tree.
void Huffman_tree(vector<char>& value, vector<size_t>& frequencies, size_t size)
{
New_Node* left;
New_Node* right;
priority_queue<New_Node*, vector<New_Node*>, compare > minHeap;
//for creation of minHeap a STL library is used called PRIOROTY QUEUE, now here is
//declaration but by default it creates a maxHeap(parent greater than child elements)
//But try to understand mean heap is stored in an vector but is implemented as tree and
//various operations are performed on tree. So we have to use the above implementation for
//a minHeap. Where compare defines its going to be a minHeap.
for(size_t i = 0; i < size; ++i)
{
//Initially all frequencies with their characters are pushed into minHeap as
//told in algorithm
minHeap.push(new New_Node(value[i], frequencies[i]));
}
while(minHeap.size() != 1)
{
left = minHeap.top();//minimum element from minHeap put it in new node left subtree
minHeap.pop();//pop it
right = minHeap.top();//2nd minimum element from minHeap put it in new node right subtree
minHeap.pop();
top = new New_Node('$', left->frequencies + right->frequencies);//create new node, with $ sign
top->left = left;//assign 1st min to left
top->right = right;//assign 2nd min to right
minHeap.push(top);//again push the new node to the minHeap
//continue this for all elements of min heap
}
print_Code(minHeap.top(), "");
}
The most important snippet of the code. Go through the comment once
void print_Code(New_Node* root, string str)
{
if(root == NULL)//if we reach root node then we need to return
return;
//code printing part
if(root->value == '$')//if we reach dollar sign then we need to go either left or right
{
print_Code(root->left, str + "0");
print_Code(root->right, str + "1");
}
if(root->value != '$')//if its not $ sign then, we have to print the code of character.
{
cout << root->value <<" : " << str << "\n";
print_Code(root->left, str + "0");
print_Code(root->right, str + "1");
}
}
The traversal for code part.
Note: If still there is any doubts remaining , please post them as comment of this question.