Tuesday, December 15, 2015

Huffman Compression - Variable Length Codes - Java Implementation

Concept - Use fewer number of bits to represent frequently occurring characters and more number of bits to represent less frequently occurring characters.

Source:

import java.util.PriorityQueue; 

public class HuffmanCompression {
    public static class Node implements Comparable<Node>{
        private final char ch;
        private final int freq;
        private final Node left, right;
                
        public Node(char ch, int freq, Node left, Node right){
            this.ch = ch;
            this.freq = freq;
            this.left = left;
            this.right = right;
        }
        
        public boolean isLeaf(){
            return left==null && right==null;
        }
        
        public int compareTo(Node that){
            return this.freq-that.freq;
        }
    }
    
    private static final int R = 256;
    private static final int lgR = 8;
    private static int[] freq;
    private static Node root;
    private static String[] codes;
    
    public static Node buildTrie(){
        PriorityQueue<Node> pq = new PriorityQueue<>();
        for(int i=0;i<freq.length;++i){
            if(freq[i]!=0)pq.add(new Node((char)i, freq[i], null, null));
        }
         
        while(pq.size()>1){
            Node left = pq.poll();
            Node right = pq.poll(); 
            System.out.println("Paired "+left.ch+" : "+left.freq+" with "+right.ch+" : "+right.freq);
            Node parent = new Node('\0', left.freq+right.freq, left, right); 
            pq.add(parent);
        }
        return pq.poll();
    }
    
    public static void writeTrie(StringBuilder out, Node node){
        if(node.isLeaf()){
            writeInt(out, 1, 1);
            writeInt(out, node.ch, lgR);
            return;
        }
        writeInt(out, 0, 1);
        writeTrie(out, node.left);
        writeTrie(out, node.right);
    }
    
    public static Node readTrie(String in){
        int t = readInt(in, 1); 
        if(t==1){
            //A leaf node
            Node node = new Node((char)readInt(in, 8), 0, null, null);
            return node;
        }
        Node x = readTrie(in);
        Node y = readTrie(in);
        
        //No need for frequencies
        return new Node('\0', 0, x, y); 
    }
     
    public static void constructCodeTable(Node n, String s){
        if(n.isLeaf()){
            System.out.println("Representation for "+n.ch+" = "+s);
            codes[n.ch] = s;
            return;
        }  
        constructCodeTable(n.left, s+"0");
        constructCodeTable(n.right, s+"1"); 
    }
    
    public static String compress(String input){
        StringBuilder out = new StringBuilder();
        for(int i=0;i<input.length();++i){
            out.append(codes[input.charAt(i)]);
        }
        return out.toString();
    }
    
    public static String decompress(String input){
        StringBuilder out = new StringBuilder();
        Node traveler = root;
        for(int i=0;i<input.length();++i){
            
            if(input.charAt(i)=='1'){
                traveler = traveler.right;
            }else traveler = traveler.left;
            
            if(traveler.isLeaf()){
                out.append(traveler.ch);
                traveler = root;
            }
        }
        return out.toString();
    }
    
    //Helper Input/Output Functions
    public static void resetReader(){
        inputIndex = 0;
    }
    
    private static int inputIndex = 0;
    private static int readInt(String in, int width){  
        String bin = in.substring(inputIndex, inputIndex+width);
        inputIndex += width;
        return Integer.parseInt(bin, 2);
    }
    
    public static void writeInt(StringBuilder out, int x, int width){
        StringBuilder sb = new StringBuilder();
        String bin = Integer.toBinaryString(x); 
        int fill = width - bin.length();
        while(fill-->0)sb.append(0);
        sb.append(bin);

        //for viewing
        if(x==0)x='0';else if(x==1)x ='1';

        System.out.println("Writing "+sb+" for "+(char)x);
        out.append(sb);
    } 
    
    //Main
    public static void main(String[] args){
        testString("ABRACADABRA!");
        testString("THIS IS A PRETTY LONG AND BORING STRING");
        testString("I GOT NOTHING TO DO");
        testString("SHE SELLS SEA SHELLS BY THE SEA THE SHELLS SHE SELLS ARE SURELY SEASHELLS");
    }
    
    public static void testString(String s){ 
        System.out.println("\n\n\n\n###Testing String \""+s+"\" ###\n\n\n\n");
        freq = new int[R];
        codes = new String[R];
        
        for(int i=0;i<s.length();++i)freq[s.charAt(i)]++; 
        //Building a trie
        root = buildTrie();
        
        System.out.println("Building trie...");
        constructCodeTable(root, "");
        System.out.println();
        
        //Test writing and reading of tries
        StringBuilder out = new StringBuilder();
        System.out.println("\nWriting the trie for input string "+s+"\n");
        writeTrie(out, root); 
        
        Node readRootNode = readTrie(out.toString());
        resetReader();
        StringBuilder out2 = new StringBuilder();
        System.out.println("\nWrote the previous trie, reading it, and writing again...");
        writeTrie(out2, readRootNode);
        
        System.out.println("\nRepresentation of trie that was written earlier\n"+out);
        System.out.println("\nRepresentation of the new trie that was just written\n"+out2);
        System.out.println("\nMatches? "+(out.toString().matches(out2.toString())));
        
        //Test Compression and Decompression
        System.out.println("\nTesting compression and decompression...");
        String compressed = compress(s);
        System.out.println("\nCompressed representation of "+s+" is :\n"+compressed);
        String decompressed = decompress(compressed);
        System.out.println("\nDecompressed -> \n"+decompressed);
        System.out.println("\nOriginal string uses : "+s.length()*8+" bits");
        System.out.println("Compressed version uses : "+compressed.length()+" bits");
    }
}

Output: 

###Testing String "ABRACADABRA!" ###




Paired ! : 1 with C : 1
Paired D : 1 with   : 2
Paired R : 2 with B : 2
Paired   : 3 with   : 4
Paired A : 5 with   : 7
Building trie...
Representation for A = 0
Representation for D = 100
Representation for ! = 1010
Representation for C = 1011
Representation for R = 110
Representation for B = 111


Writing the trie for input string ABRACADABRA!

Writing 0 for 0
Writing 1 for 1
Writing 01000001 for A
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000100 for D
Writing 0 for 0
Writing 1 for 1
Writing 00100001 for !
Writing 1 for 1
Writing 01000011 for C
Writing 0 for 0
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 01000010 for B

Wrote the previous trie, reading it, and writing again...
Writing 0 for 0
Writing 1 for 1
Writing 01000001 for A
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000100 for D
Writing 0 for 0
Writing 1 for 1
Writing 00100001 for !
Writing 1 for 1
Writing 01000011 for C
Writing 0 for 0
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 01000010 for B

Representation of trie that was written earlier
01010000010010100010001001000011010000110101010010101000010

Representation of the new trie that was just written
01010000010010100010001001000011010000110101010010101000010

Matches? true

Testing compression and decompression...

Compressed representation of ABRACADABRA! is :
0111110010110100011111001010

Decompressed -> 
ABRACADABRA!

Original string uses : 96 bits
Compressed version uses : 28 bits




###Testing String "THIS IS A PRETTY LONG AND BORING STRING" ###




Paired B : 1 with D : 1
Paired L : 1 with Y : 1
Paired E : 1 with H : 1
Paired P : 1 with   : 2
Paired O : 2 with   : 2
Paired   : 2 with A : 2
Paired G : 3 with R : 3
Paired   : 3 with S : 3
Paired   : 4 with T : 4
Paired I : 4 with N : 4
Paired   : 4 with   : 6
Paired   : 6 with   : 7
Paired   : 8 with   : 8
Paired   : 10 with   : 13
Paired   : 16 with   : 23
Building trie...
Representation for E = 00000
Representation for H = 00001
Representation for A = 0001
Representation for T = 001
Representation for I = 010
Representation for N = 011
Representation for O = 1000
Representation for L = 10010
Representation for Y = 10011
Representation for P = 10100
Representation for B = 101010
Representation for D = 101011
Representation for S = 1011
Representation for G = 1100
Representation for R = 1101
Representation for   = 111


Writing the trie for input string THIS IS A PRETTY LONG AND BORING STRING

Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000101 for E
Writing 1 for 1
Writing 01001000 for H
Writing 1 for 1
Writing 01000001 for A
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 1 for 1
Writing 01001001 for I
Writing 1 for 1
Writing 01001110 for N
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001111 for O
Writing 0 for 0
Writing 1 for 1
Writing 01001100 for L
Writing 1 for 1
Writing 01011001 for Y
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01010000 for P
Writing 0 for 0
Writing 1 for 1
Writing 01000010 for B
Writing 1 for 1
Writing 01000100 for D
Writing 1 for 1
Writing 01010011 for S
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000111 for G
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 00100000 for  

Wrote the previous trie, reading it, and writing again...
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000101 for E
Writing 1 for 1
Writing 01001000 for H
Writing 1 for 1
Writing 01000001 for A
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 1 for 1
Writing 01001001 for I
Writing 1 for 1
Writing 01001110 for N
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001111 for O
Writing 0 for 0
Writing 1 for 1
Writing 01001100 for L
Writing 1 for 1
Writing 01011001 for Y
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01010000 for P
Writing 0 for 0
Writing 1 for 1
Writing 01000010 for B
Writing 1 for 1
Writing 01000100 for D
Writing 1 for 1
Writing 01010011 for S
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000111 for G
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 00100000 for  

Representation of trie that was written earlier
000001010001011010010001010000011010101000101001001101001110000101001111010100110010101100100101010000010100001010100010010101001100101000111101010010100100000

Representation of the new trie that was just written
000001010001011010010001010000011010101000101001001101001110000101001111010100110010101100100101010000010100001010100010010101001100101000111101010010100100000

Matches? true

Testing compression and decompression...

Compressed representation of THIS IS A PRETTY LONG AND BORING STRING is :
00100001010101111101010111110001111101001101000000010011001111110010100001111001110001011101011111101010100011010100111100111101100111010100111100

Decompressed -> 
THIS IS A PRETTY LONG AND BORING STRING

Original string uses : 312 bits
Compressed version uses : 146 bits




###Testing String "I GOT NOTHING TO DO" ###




Paired D : 1 with H : 1
Paired I : 2 with   : 2
Paired N : 2 with G : 2
Paired T : 3 with   : 4
Paired O : 4 with   : 4
Paired   : 4 with   : 7
Paired   : 8 with   : 11
Building trie...
Representation for O = 00
Representation for   = 01
Representation for I = 100
Representation for D = 1010
Representation for H = 1011
Representation for T = 110
Representation for N = 1110
Representation for G = 1111


Writing the trie for input string I GOT NOTHING TO DO

Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001111 for O
Writing 1 for 1
Writing 00100000 for  
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001001 for I
Writing 0 for 0
Writing 1 for 1
Writing 01000100 for D
Writing 1 for 1
Writing 01001000 for H
Writing 0 for 0
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 1 for 1
Writing 01001110 for N
Writing 1 for 1
Writing 01000111 for G

Wrote the previous trie, reading it, and writing again...
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001111 for O
Writing 1 for 1
Writing 00100000 for  
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01001001 for I
Writing 0 for 0
Writing 1 for 1
Writing 01000100 for D
Writing 1 for 1
Writing 01001000 for H
Writing 0 for 0
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 1 for 1
Writing 01001110 for N
Writing 1 for 1
Writing 01000111 for G

Representation of trie that was written earlier
0010100111110010000000101001001010100010010100100001010101000101001110101000111

Representation of the new trie that was just written
0010100111110010000000101001001010100010010100100001010101000101001110101000111

Matches? true

Testing compression and decompression...

Compressed representation of I GOT NOTHING TO DO is :
1000111110011001111000110101110011101111011100001101000

Decompressed -> 
I GOT NOTHING TO DO

Original string uses : 152 bits
Compressed version uses : 55 bits




###Testing String "SHE SELLS SEA SHELLS BY THE SEA THE SHELLS SHE SELLS ARE SURELY SEASHELLS" ###




Paired B : 1 with U : 1
Paired Y : 2 with T : 2
Paired   : 2 with R : 2
Paired   : 4 with   : 4
Paired A : 4 with H : 7
Paired   : 8 with L : 11
Paired   : 11 with   : 13
Paired E : 14 with S : 16
Paired   : 19 with   : 24
Paired   : 30 with   : 43
Building trie...
Representation for E = 00
Representation for S = 01
Representation for Y = 10000
Representation for T = 10001
Representation for B = 100100
Representation for U = 100101
Representation for R = 10011
Representation for L = 101
Representation for A = 1100
Representation for H = 1101
Representation for   = 111


Writing the trie for input string SHE SELLS SEA SHELLS BY THE SEA THE SHELLS SHE SELLS ARE SURELY SEASHELLS

Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000101 for E
Writing 1 for 1
Writing 01010011 for S
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01011001 for Y
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000010 for B
Writing 1 for 1
Writing 01010101 for U
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 01001100 for L
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000001 for A
Writing 1 for 1
Writing 01001000 for H
Writing 1 for 1
Writing 00100000 for  

Wrote the previous trie, reading it, and writing again...
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000101 for E
Writing 1 for 1
Writing 01010011 for S
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01011001 for Y
Writing 1 for 1
Writing 01010100 for T
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000010 for B
Writing 1 for 1
Writing 01010101 for U
Writing 1 for 1
Writing 01010010 for R
Writing 1 for 1
Writing 01001100 for L
Writing 0 for 0
Writing 0 for 0
Writing 1 for 1
Writing 01000001 for A
Writing 1 for 1
Writing 01001000 for H
Writing 1 for 1
Writing 00100000 for  

Representation of trie that was written earlier
0010100010110101001100001010110011010101000010100001010101010110101001010100110000101000001101001000100100000

Representation of the new trie that was just written
0010100010110101001100001010110011010101000010100001010101010110101001010100110000101000001101001000100100000

Matches? true

Testing compression and decompression...

Compressed representation of SHE SELLS SEA SHELLS BY THE SEA THE SHELLS SHE SELLS ARE SURELY SEASHELLS is :
01110100111010010110101111010011001110111010010110101111100100100001111000111010011101001100111100011101001110111010010110101111011101001110100101101011111100100110011101100101100110010110000111010011000111010010110101

Decompressed -> 
SHE SELLS SEA SHELLS BY THE SEA THE SHELLS SHE SELLS ARE SURELY SEASHELLS

Original string uses : 584 bits
Compressed version uses : 218 bits 

No comments:

Post a Comment