## Tuesday, December 15, 2015

### Huffman Compression - Variable Length Codes - Java Implementation

Concept - Use fewer number of bits to represent frequently occurring characters and more number of bits to represent less frequently occurring characters.

Source:

import java.util.PriorityQueue;

public class HuffmanCompression {
public static class Node implements Comparable<Node>{
private final char ch;
private final int freq;
private final Node left, right;

public Node(char ch, int freq, Node left, Node right){
this.ch = ch;
this.freq = freq;
this.left = left;
this.right = right;
}

public boolean isLeaf(){
return left==null && right==null;
}

public int compareTo(Node that){
return this.freq-that.freq;
}
}

private static final int R = 256;
private static final int lgR = 8;
private static int[] freq;
private static Node root;
private static String[] codes;

public static Node buildTrie(){
PriorityQueue<Node> pq = new PriorityQueue<>();
for(int i=0;i<freq.length;++i){
}

while(pq.size()>1){
Node left = pq.poll();
Node right = pq.poll();
System.out.println("Paired "+left.ch+" : "+left.freq+" with "+right.ch+" : "+right.freq);
Node parent = new Node('\0', left.freq+right.freq, left, right);
}
return pq.poll();
}

public static void writeTrie(StringBuilder out, Node node){
if(node.isLeaf()){
writeInt(out, 1, 1);
writeInt(out, node.ch, lgR);
return;
}
writeInt(out, 0, 1);
writeTrie(out, node.left);
writeTrie(out, node.right);
}

if(t==1){
//A leaf node
Node node = new Node((char)readInt(in, 8), 0, null, null);
return node;
}

//No need for frequencies
return new Node('\0', 0, x, y);
}

public static void constructCodeTable(Node n, String s){
if(n.isLeaf()){
System.out.println("Representation for "+n.ch+" = "+s);
codes[n.ch] = s;
return;
}
constructCodeTable(n.left, s+"0");
constructCodeTable(n.right, s+"1");
}

public static String compress(String input){
StringBuilder out = new StringBuilder();
for(int i=0;i<input.length();++i){
out.append(codes[input.charAt(i)]);
}
return out.toString();
}

public static String decompress(String input){
StringBuilder out = new StringBuilder();
Node traveler = root;
for(int i=0;i<input.length();++i){

if(input.charAt(i)=='1'){
traveler = traveler.right;
}else traveler = traveler.left;

if(traveler.isLeaf()){
out.append(traveler.ch);
traveler = root;
}
}
return out.toString();
}

//Helper Input/Output Functions
inputIndex = 0;
}

private static int inputIndex = 0;
private static int readInt(String in, int width){
String bin = in.substring(inputIndex, inputIndex+width);
inputIndex += width;
return Integer.parseInt(bin, 2);
}

public static void writeInt(StringBuilder out, int x, int width){
StringBuilder sb = new StringBuilder();
String bin = Integer.toBinaryString(x);
int fill = width - bin.length();
while(fill-->0)sb.append(0);
sb.append(bin);

//for viewing
if(x==0)x='0';else if(x==1)x ='1';

System.out.println("Writing "+sb+" for "+(char)x);
out.append(sb);
}

//Main
public static void main(String[] args){
testString("THIS IS A PRETTY LONG AND BORING STRING");
testString("I GOT NOTHING TO DO");
testString("SHE SELLS SEA SHELLS BY THE SEA THE SHELLS SHE SELLS ARE SURELY SEASHELLS");
}

public static void testString(String s){
System.out.println("\n\n\n\n###Testing String \""+s+"\" ###\n\n\n\n");
freq = new int[R];
codes = new String[R];

for(int i=0;i<s.length();++i)freq[s.charAt(i)]++;
//Building a trie
root = buildTrie();

System.out.println("Building trie...");
constructCodeTable(root, "");
System.out.println();

//Test writing and reading of tries
StringBuilder out = new StringBuilder();
System.out.println("\nWriting the trie for input string "+s+"\n");
writeTrie(out, root);

StringBuilder out2 = new StringBuilder();
System.out.println("\nWrote the previous trie, reading it, and writing again...");

System.out.println("\nRepresentation of trie that was written earlier\n"+out);
System.out.println("\nRepresentation of the new trie that was just written\n"+out2);
System.out.println("\nMatches? "+(out.toString().matches(out2.toString())));

//Test Compression and Decompression
System.out.println("\nTesting compression and decompression...");
String compressed = compress(s);
System.out.println("\nCompressed representation of "+s+" is :\n"+compressed);
String decompressed = decompress(compressed);
System.out.println("\nDecompressed -> \n"+decompressed);
System.out.println("\nOriginal string uses : "+s.length()*8+" bits");
System.out.println("Compressed version uses : "+compressed.length()+" bits");
}
}

Output:

Paired ! : 1 with C : 1
Paired D : 1 with   : 2
Paired R : 2 with B : 2
Paired   : 3 with   : 4
Paired A : 5 with   : 7
Building trie...
Representation for A = 0
Representation for D = 100
Representation for ! = 1010
Representation for C = 1011
Representation for R = 110
Representation for B = 111

Writing the trie for input string ABRACADABRA!

Writing 0 for 0
Writing 1 for 1
Writing 01000001 for A
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000100 for D
Writing 0 for 0
Writing 1 for 1
Writing 00100001 for !
Writing 1 for 1
Writing 01000011 for C
Writing 0 for 0
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 01000010 for B

Wrote the previous trie, reading it, and writing again...
Writing 0 for 0
Writing 1 for 1
Writing 01000001 for A
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000100 for D
Writing 0 for 0
Writing 1 for 1
Writing 00100001 for !
Writing 1 for 1
Writing 01000011 for C
Writing 0 for 0
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 01000010 for B

Representation of trie that was written earlier
01010000010010100010001001000011010000110101010010101000010

Representation of the new trie that was just written
01010000010010100010001001000011010000110101010010101000010

Matches? true

Testing compression and decompression...

Compressed representation of ABRACADABRA! is :
0111110010110100011111001010

Decompressed ->

Original string uses : 96 bits
Compressed version uses : 28 bits

###Testing String "THIS IS A PRETTY LONG AND BORING STRING" ###

Paired B : 1 with D : 1
Paired L : 1 with Y : 1
Paired E : 1 with H : 1
Paired P : 1 with   : 2
Paired O : 2 with   : 2
Paired   : 2 with A : 2
Paired G : 3 with R : 3
Paired   : 3 with S : 3
Paired   : 4 with T : 4
Paired I : 4 with N : 4
Paired   : 4 with   : 6
Paired   : 6 with   : 7
Paired   : 8 with   : 8
Paired   : 10 with   : 13
Paired   : 16 with   : 23
Building trie...
Representation for E = 00000
Representation for H = 00001
Representation for A = 0001
Representation for T = 001
Representation for I = 010
Representation for N = 011
Representation for O = 1000
Representation for L = 10010
Representation for Y = 10011
Representation for P = 10100
Representation for B = 101010
Representation for D = 101011
Representation for S = 1011
Representation for G = 1100
Representation for R = 1101
Representation for   = 111

Writing the trie for input string THIS IS A PRETTY LONG AND BORING STRING

Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000101 for E
Writing 1 for 1
Writing 01001000 for H
Writing 1 for 1
Writing 01000001 for A
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 1 for 1
Writing 01001001 for I
Writing 1 for 1
Writing 01001110 for N
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001111 for O
Writing 0 for 0
Writing 1 for 1
Writing 01001100 for L
Writing 1 for 1
Writing 01011001 for Y
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01010000 for P
Writing 0 for 0
Writing 1 for 1
Writing 01000010 for B
Writing 1 for 1
Writing 01000100 for D
Writing 1 for 1
Writing 01010011 for S
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000111 for G
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 00100000 for

Wrote the previous trie, reading it, and writing again...
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000101 for E
Writing 1 for 1
Writing 01001000 for H
Writing 1 for 1
Writing 01000001 for A
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 1 for 1
Writing 01001001 for I
Writing 1 for 1
Writing 01001110 for N
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001111 for O
Writing 0 for 0
Writing 1 for 1
Writing 01001100 for L
Writing 1 for 1
Writing 01011001 for Y
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01010000 for P
Writing 0 for 0
Writing 1 for 1
Writing 01000010 for B
Writing 1 for 1
Writing 01000100 for D
Writing 1 for 1
Writing 01010011 for S
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000111 for G
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 00100000 for

Representation of trie that was written earlier
000001010001011010010001010000011010101000101001001101001110000101001111010100110010101100100101010000010100001010100010010101001100101000111101010010100100000

Representation of the new trie that was just written
000001010001011010010001010000011010101000101001001101001110000101001111010100110010101100100101010000010100001010100010010101001100101000111101010010100100000

Matches? true

Testing compression and decompression...

Compressed representation of THIS IS A PRETTY LONG AND BORING STRING is :
00100001010101111101010111110001111101001101000000010011001111110010100001111001110001011101011111101010100011010100111100111101100111010100111100

Decompressed ->
THIS IS A PRETTY LONG AND BORING STRING

Original string uses : 312 bits
Compressed version uses : 146 bits

###Testing String "I GOT NOTHING TO DO" ###

Paired D : 1 with H : 1
Paired I : 2 with   : 2
Paired N : 2 with G : 2
Paired T : 3 with   : 4
Paired O : 4 with   : 4
Paired   : 4 with   : 7
Paired   : 8 with   : 11
Building trie...
Representation for O = 00
Representation for   = 01
Representation for I = 100
Representation for D = 1010
Representation for H = 1011
Representation for T = 110
Representation for N = 1110
Representation for G = 1111

Writing the trie for input string I GOT NOTHING TO DO

Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001111 for O
Writing 1 for 1
Writing 00100000 for
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001001 for I
Writing 0 for 0
Writing 1 for 1
Writing 01000100 for D
Writing 1 for 1
Writing 01001000 for H
Writing 0 for 0
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 1 for 1
Writing 01001110 for N
Writing 1 for 1
Writing 01000111 for G

Wrote the previous trie, reading it, and writing again...
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001111 for O
Writing 1 for 1
Writing 00100000 for
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001001 for I
Writing 0 for 0
Writing 1 for 1
Writing 01000100 for D
Writing 1 for 1
Writing 01001000 for H
Writing 0 for 0
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 1 for 1
Writing 01001110 for N
Writing 1 for 1
Writing 01000111 for G

Representation of trie that was written earlier
0010100111110010000000101001001010100010010100100001010101000101001110101000111

Representation of the new trie that was just written
0010100111110010000000101001001010100010010100100001010101000101001110101000111

Matches? true

Testing compression and decompression...

Compressed representation of I GOT NOTHING TO DO is :
1000111110011001111000110101110011101111011100001101000

Decompressed ->
I GOT NOTHING TO DO

Original string uses : 152 bits
Compressed version uses : 55 bits

###Testing String "SHE SELLS SEA SHELLS BY THE SEA THE SHELLS SHE SELLS ARE SURELY SEASHELLS" ###

Paired B : 1 with U : 1
Paired Y : 2 with T : 2
Paired   : 2 with R : 2
Paired   : 4 with   : 4
Paired A : 4 with H : 7
Paired   : 8 with L : 11
Paired   : 11 with   : 13
Paired E : 14 with S : 16
Paired   : 19 with   : 24
Paired   : 30 with   : 43
Building trie...
Representation for E = 00
Representation for S = 01
Representation for Y = 10000
Representation for T = 10001
Representation for B = 100100
Representation for U = 100101
Representation for R = 10011
Representation for L = 101
Representation for A = 1100
Representation for H = 1101
Representation for   = 111

Writing the trie for input string SHE SELLS SEA SHELLS BY THE SEA THE SHELLS SHE SELLS ARE SURELY SEASHELLS

Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000101 for E
Writing 1 for 1
Writing 01010011 for S
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01011001 for Y
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000010 for B
Writing 1 for 1
Writing 01010101 for U
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 01001100 for L
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000001 for A
Writing 1 for 1
Writing 01001000 for H
Writing 1 for 1
Writing 00100000 for

Wrote the previous trie, reading it, and writing again...
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000101 for E
Writing 1 for 1
Writing 01010011 for S
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01011001 for Y
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000010 for B
Writing 1 for 1
Writing 01010101 for U
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 01001100 for L
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000001 for A
Writing 1 for 1
Writing 01001000 for H
Writing 1 for 1
Writing 00100000 for

Representation of trie that was written earlier
0010100010110101001100001010110011010101000010100001010101010110101001010100110000101000001101001000100100000

Representation of the new trie that was just written
0010100010110101001100001010110011010101000010100001010101010110101001010100110000101000001101001000100100000

Matches? true

Testing compression and decompression...

Compressed representation of SHE SELLS SEA SHELLS BY THE SEA THE SHELLS SHE SELLS ARE SURELY SEASHELLS is :
01110100111010010110101111010011001110111010010110101111100100100001111000111010011101001100111100011101001110111010010110101111011101001110100101101011111100100110011101100101100110010110000111010011000111010010110101

Decompressed ->
SHE SELLS SEA SHELLS BY THE SEA THE SHELLS SHE SELLS ARE SURELY SEASHELLS

Original string uses : 584 bits
Compressed version uses : 218 bits