Wednesday, December 16, 2015

LZW Compression - Variable Length Symbols, Fixed Length Codes - Java

LZW Compression has variable length symbols and fixed length codes.

Source:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.LinkedList;
import java.util.Queue;


public class LZW {
    private static int R = 256;
    private static int W = 12;   //Codeword width
    private static int L = 1<<W; //Number of codewords
    private static int c;        //No of code words written out compressed
    
    public static String compress(String in){
        TST<Integer> tst = new TST<>();
        for(int i=0;i<R;++i) tst.add((char)i+"", i); 
        int code = R+1;
        
        int i=0;
        c=1;
        StringBuilder out = new StringBuilder();
        while(i<in.length()){
            String longestPrefix = tst.longestPrefixOf(in.substring(i));
//            System.out.println("Substring : "+in.substring(i, i+1));
            int key = tst.getKey(longestPrefix);
//            System.out.println(longestPrefix+" "+key);
            out.append(key).append(" ");c++;
            int t = longestPrefix.length();
            if(i+t+1<=in.length() && code<L) {
                tst.add(in.substring(i, i+t+1), code++);
            } 
            i+=t;
        } 
        return out.append(R).toString();
    }
    
    private static String[] codes;
    
    public static String decompress(String input){
        String[] d = input.split(" ");
        int[] in = new int[d.length];
        for(int i=0;i<d.length;++i)in[i] = Integer.parseInt(d[i]); 
        codes = new String[L];
        for(int i=0;i<R;++i) codes[i] = (char)i+""; 
         
        StringBuilder out = new StringBuilder();
        int i=0, r = R; 
        String prev="";
        
        //256 is a special end character. Setting codes[256] to some value is not wrong. 
        //(This code does that in first iteration)
        while(true){
            int currentCode = in[i];
//            System.out.println("\nCurrent code = "+currentCode);
            String currentString = codes[currentCode];
//            System.out.println("Current string = "+currentString);
            if(in[i]==R)break;
            
            if(currentCode==r){ currentString = prev+prev.charAt(0);}
            if(r<L){
                codes[r++] = prev+currentString.charAt(0);
//                System.out.println("Set codes["+(r-1)+"] to "+codes[r-1]);
            }
//            System.out.println("Appending string "+currentString);
            out.append(currentString);
            prev = currentString;
            i++; 
        }
        return out.toString();
    }
    
    public static void testString(String s){
        String compressed = compress(s), decompressed;
        System.out.println("\n--> Compressed : "+compressed);
        System.out.println("\n--> Decompressed " +(decompressed = decompress(compressed)));
        
        System.out.println("\nUncompressed data uses "+s.length()*8+" bits. (Assuming 8-bit chars)");
        System.out.println("Compressed data uses "+c*W+" bits");
        
    }
    
    public static void main(String[] args) throws Exception{
        //compressed size more for small strings
        testString("ABABABA"); 
        
        //compression beneficial for larger strings
        testString("ABABABAABABABBABA"); 
        testString("AAAAABBABABBBBBBAAAAAABBBBBBAABABABABABABBABAAAABB");
        
        //Random Input File
        StringBuilder sb = new StringBuilder();
        String s;
        BufferedReader br = new BufferedReader(new FileReader(new File("d:\\input.txt")));
        while((s=br.readLine())!=null){
            sb.append(s);
        }
        testString(sb.toString());
    }
}  


//Ternary Search Trie used as a Symbol Table to match character values against codes.


class TST<T> {
    class Node{
        char c;
        T v;
        Node left, mid, right;
        
        Node(){}
        Node(char c){this.c = c;}
        Node(T v){this.v = v;}
        Node(char c, T v){this.c = c;this.v = v;}
    }
    
    Node root; 
    
    public void add(String s, T value){ 
       root = add(root, s, 0, value);
    }
    
    private Node add(Node n, String s, int i, T value){  
        if(n==null){ 
            n = new Node(s.charAt(i));
            if(i==s.length()-1){n.v = value; return n;}
        }
        if(s.charAt(i)==n.c){
            if(i==s.length()-1){n.v = value; return n;}
            n.mid = add(n.mid, s, i+1, value);
        }
        else if(s.charAt(i)<n.c){
            n.left = add(n.left, s, i, value);
        }
        else{
            n.right = add(n.right, s, i, value);
        }
        return n;
    }
    
    public T getKey(String s){ 
     
        return getKey(root, s, 0); 
    }
    
    private T getKey(Node n, String s, int i){  
        if(n==null) return null;
        if(i==s.length()-1){
            if(n.c==s.charAt(i)){
                if(n.v!=null)return n.v;
                else return null;
            }
        }else if(i>s.length())return null; 
        if(s.charAt(i)==n.c){
            return getKey(n.mid, s, i+1);
        }
        else if(s.charAt(i)<n.c){
            return getKey(n.left, s, i);
        }
        else {
            return getKey(n.right, s, i);
        }        
    }
    
    public void delete(String s){
        delete(root, s, 0);
    }
    
    private boolean delete(Node n, String s, int i){
        if(n==null) return false;
        if(i==s.length()-1){
            if(n.c==s.charAt(i)){
                if(n.v!=null){n.v = null;return true;}
                else return false;
            }
        }else if(i>s.length())return false; 
        
        boolean r;
        if(s.charAt(i)==n.c){
            if((r=delete(n.mid, s, i+1))){ 
                if(n.mid.v == null && n.mid.left == null 
                   && n.mid.mid == null && n.mid.right == null)n.mid = null;
            }
        }else if(s.charAt(i)<n.c){
            if((r=delete(n.left, s, i))){ 
                if(n.left.v == null && n.left.left==null 
                   && n.left.mid == null && n.left.right == null)n.left = null;
            }
        }else{
            if((r=delete(n.right, s, i))){ 
                if(n.right.v == null && n.right.left == null 
                   && n.right.mid == null && n.right.right == null)n.right = null; 
            } 
        }
        return r;
    }
    
    public Iterable<String> keys(){
        Queue<String> al = new LinkedList<>();
        String string = "";
        findKeys(root, al, string);  
        return al;
    }
    
    private void findKeys(Node n, Queue<String> al, String s){
        if(n==null)return;
        if(n.v!=null){al.add(s+n.c);}
        findKeys(n.left, al, s);
        String s2 = s+n.c;
        findKeys(n.mid, al, s2);
        findKeys(n.right,al, s);
    }
    
    public Iterable<String> keysWithPrefix(String prefix){
        Queue<String> al = new LinkedList<>();
        String string = "";
        findKeysP(root, al, string, prefix);  
        return al;
    }
    
    private void findKeysP(Node n, Queue<String> al, String s, String p){
        if(n==null)return;
        if(s.length()<=p.length() && s.length()!=0 && s.charAt(s.length()-1)!=p.charAt(s.length()-1)) return;
        if(n.v!=null){if(s.length()<p.length()&&n.c==p.charAt(s.length()))al.add(s+n.c);
        else if(s.length()>p.length())al.add(s+n.c);}
        findKeysP(n.left, al, s, p);
        
        String s2 = s+n.c;
        findKeysP(n.mid, al, s2, p);
        
        findKeysP(n.right,al, s, p);
    }
    
    public String longestPrefixOf(String s){
        String string = "";
        int l = longestPrefixOf(root, l=0, s);  
        return s.substring(0, l+1);
    }
    
    private int longestPrefixOf(Node n, int i, String p){ 
        if(n==null)return i-1;
        if(i>=p.length()) return i-1; 
         
        int l,m=Integer.MIN_VALUE,r;
        l = longestPrefixOf(n.left, i, p);
        if(n.c==p.charAt(i)) m = longestPrefixOf(n.mid, i+1, p);
        r = longestPrefixOf(n.right, i, p);
        return Math.max(l, Math.max(m, r));
    }
}

Output:

--> Compressed : 65 66 257 259 256

--> Decompressed ABABABA

Uncompressed data uses 56 bits. (Assuming 8-bit chars)
Compressed data uses 60 bits

--> Compressed : 65 66 257 259 259 258 66 262 65 256

--> Decompressed ABABABAABABABBABA

Uncompressed data uses 136 bits. (Assuming 8-bit chars)
Compressed data uses 120 bits

--> Compressed : 65 257 257 66 66 65 261 260 264 261 258 259 265 266 263 262 272 263 271 258 260 256

--> Decompressed AAAAABBABABBBBBBAAAAAABBBBBBAABABABABABABBABAAAABB

Uncompressed data uses 400 bits. (Assuming 8-bit chars)
Compressed data uses 264 bits

--> Compressed : 59 59 32 259 70 111 114 109 ... (output trimmed)

--> Decompressed ;;   Format of the site name database:;;   Column ...

Uncompressed data uses 528400 bits. (Assuming 8-bit chars)
Compressed data uses 238548 bits

No comments:

Post a Comment