Huffman code

Huffman algorithm is a lossless data compression algorithm. Huffman codes are used for compressing data efficiently from 20% to 90%.
We consider the data to be a sequence of characters.
This algorithm uses a table of the frequencies of occurrence of the characters to build up an optimal way of representing each character as a binary string.



Huffman Algorithm

HUFFMAN(C)
1	n ← |C|
2	Q ← C
3	for i 1 to n - 1
4		do allocate a new node z
5			left[z] ← x ← EXTRACT-MIN (Q)
6 			right[z] ← y ← EXTRACT-MIN (Q)
7 			f [z] ← f [x] + f [y]
8 			INSERT(Q, z)
9 	return EXTRACT-MIN(Q)



Constructing a Huffman code

Example

Character Frequency Fixed-length codeword
a 45 000
b 13 001
c 12 010
d 16 011
e 9 100
f 5 101

There are each character is represented by a unique binary string. If we use a fixed-length code, we need 3 bits to represent the characters.
Total Size = (45+13+12+16+9+5)*10^3
Total Size = 10^5 bits.
Requires bits = 10^5*3 = 3,00,000.

Procedure for Construction of Huffman tree

Step 1.
Arrenge the given character in decending order of their frequency.


Step 2.


Step 3.


Step 4.


Step 5.


Step 6.


Step 7.


Steps to print codes from Huffman Tree

Traverse huffman tree from the root node. Create an array. While moving to the left child, write 0 to the array. While moving to the right child, write 1 to the array. Print the array when a leaf node is encountered.


Character Variable-length codeword Size of string in bit
a 0 1
b 101 3
c 100 3
d 111 3
e 1101 4
f 1100 4

Calculation for Compression ratio

Size = ((45*1) + (13*3) + (12*3) + (16*3) + (9*4) + (5*4))*10^3)
Requires bits = 224 * 10^3
Requires bits = 224000.
Compression ratio
η = ((3,00,000 - 2,24,000) / 3,00,000)*100
η = 25.33%.



Programe For Huffman Code

#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#define MAX_TREE_HT 100
struct MH_Node
{
	char character;
	unsigned frequency;
	struct MH_Node *l, *r;
};


struct M_Heap
{
	unsigned size;
	unsigned space;
	struct MH_Node **array;
};

struct MH_Node* newNode(char character, unsigned frequency)
{
    struct MH_Node* temp = (struct MH_Node*) malloc(sizeof(struct MH_Node));
    temp->l = temp->r = NULL;
    temp->character = character;
    temp->frequency = frequency;
    return temp;
}


struct M_Heap* createM_Heap(unsigned space)
{
    struct M_Heap* M_Heap = (struct M_Heap*) malloc(sizeof(struct M_Heap));
    M_Heap->size = 0;
    M_Heap->space = space;
    M_Heap->array = (struct MH_Node**)malloc(M_Heap->space * sizeof(struct MH_Node*));
    return M_Heap;
}


void swapMH_Node(struct MH_Node** a, struct MH_Node** b)
{
    struct MH_Node* t = *a;
    *a = *b;
    *b = t;
}


void M_Heapify(struct M_Heap* M_Heap, int idx)
{
    int smallest = idx;
    int l = 2 * idx + 1;
    int r = 2 * idx + 2;

    if (l < M_Heap->size && M_Heap->array[l]->frequency < M_Heap->array[smallest]->frequency)
		smallest = l;

	if (r < M_Heap->size && M_Heap->array[r]->frequency < M_Heap->array[smallest]->frequency)
		smallest = r;

    if (smallest != idx)
    {
		swapMH_Node(&M_Heap->array[smallest], &M_Heap->array[idx]);
		M_Heapify(M_Heap, smallest);
    }
}

int isSizeOne(struct M_Heap* M_Heap)
{
    return (M_Heap->size == 1);
}


struct MH_Node* extractMin(struct M_Heap* M_Heap)
{
    struct MH_Node* temp = M_Heap->array[0];
    M_Heap->array[0] = M_Heap->array[M_Heap->size - 1];
    --M_Heap->size;
    M_Heapify(M_Heap, 0);
    return temp;
}


void insertM_Heap(struct M_Heap* M_Heap, struct MH_Node* MH_Node)
{

    int i = M_Heap->size - 1;
    ++M_Heap->size;
    while (i && MH_Node->frequency < M_Heap->array[(i - 1)/2]->frequency)
    {
		M_Heap->array[i] = M_Heap->array[(i - 1)/2];
		i = (i - 1)/2;
    }
    M_Heap->array[i] = MH_Node;
}


void buildM_Heap(struct M_Heap* M_Heap)
{
    int n = M_Heap->size - 1;
    int i;
    for (i = (n - 1) / 2; i >= 0; --i)
		M_Heapify(M_Heap, i);
}


void printArr(int arr[], int n)
{
    int i;
    for (i = 0; i < n; ++i)
		printf("%d", arr[i]);
		printf("\n");
}


int isLeaf(struct MH_Node* root)
{
    return !(root->l) && !(root->r) ;
}


struct M_Heap* createAndBuildM_Heap(char character[], int frequency[], int size)
{
	int i;
    struct M_Heap* M_Heap = createM_Heap(size);
    for (i = 0; i < size; ++i)
        M_Heap->array[i] = newNode(character[i], frequency[i]);
    M_Heap->size = size;
    buildM_Heap(M_Heap);
    return M_Heap;
}
 

struct MH_Node* buildHuffmanTree(char character[], int frequency[], int size)
{
    struct MH_Node *l, *r, *top;
    struct M_Heap* M_Heap = createAndBuildM_Heap(character, frequency, size);

    while (!isSizeOne(M_Heap))
    {
        l = extractMin(M_Heap);
        r = extractMin(M_Heap);
        top = newNode('$', l->frequency + r->frequency);
        top->l = l;
        top->r = r;
        insertM_Heap(M_Heap, top);
    }
    return extractMin(M_Heap);
}
 
void printCodes(struct MH_Node* root, int arr[], int top)
{
    if (root->l)
    {
        arr[top] = 0;
        printCodes(root->l, arr, top + 1);
    }
 
    if (root->r)
    {
        arr[top] = 1;
        printCodes(root->r, arr, top + 1);
    }
 
    if (isLeaf(root))
    {
        printf("%c: ", root->character);
        printArr(arr, top);
    }
}
 
void HuffmanCodes(char character[], int frequency[], int size)
{
	struct MH_Node* root = buildHuffmanTree(character, frequency, size);
	int arr[MAX_TREE_HT], top = 0;
	printCodes(root, arr, top);
}

void main()
{
	char arr[] = {'a', 'b', 'c', 'd', 'e', 'f'};
	int frequency[] = {5, 9, 12, 13, 16, 45};
	int size; 
	clrscr();
	size = sizeof(arr)/sizeof(arr[0]);
	HuffmanCodes(arr, frequency, size);
	getch();
}