LZW Compression has variable length symbols and fixed length codes.
Source:
Output:
--> Compressed : 65 66 257 259 256
--> Decompressed ABABABA
Uncompressed data uses 56 bits. (Assuming 8-bit chars)
Compressed data uses 60 bits
--> Compressed : 65 66 257 259 259 258 66 262 65 256
--> Decompressed ABABABAABABABBABA
Uncompressed data uses 136 bits. (Assuming 8-bit chars)
Compressed data uses 120 bits
--> Compressed : 65 257 257 66 66 65 261 260 264 261 258 259 265 266 263 262 272 263 271 258 260 256
--> Decompressed AAAAABBABABBBBBBAAAAAABBBBBBAABABABABABABBABAAAABB
Uncompressed data uses 400 bits. (Assuming 8-bit chars)
Compressed data uses 264 bits
Source:
import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.util.LinkedList; import java.util.Queue; public class LZW { private static int R = 256; private static int W = 12; //Codeword width private static int L = 1<<W; //Number of codewords private static int c; //No of code words written out compressed public static String compress(String in){ TST<Integer> tst = new TST<>(); for(int i=0;i<R;++i) tst.add((char)i+"", i); int code = R+1; int i=0; c=1; StringBuilder out = new StringBuilder(); while(i<in.length()){ String longestPrefix = tst.longestPrefixOf(in.substring(i)); // System.out.println("Substring : "+in.substring(i, i+1)); int key = tst.getKey(longestPrefix); // System.out.println(longestPrefix+" "+key); out.append(key).append(" ");c++; int t = longestPrefix.length(); if(i+t+1<=in.length() && code<L) { tst.add(in.substring(i, i+t+1), code++); } i+=t; } return out.append(R).toString(); } private static String[] codes; public static String decompress(String input){ String[] d = input.split(" "); int[] in = new int[d.length]; for(int i=0;i<d.length;++i)in[i] = Integer.parseInt(d[i]); codes = new String[L]; for(int i=0;i<R;++i) codes[i] = (char)i+""; StringBuilder out = new StringBuilder(); int i=0, r = R; String prev=""; //256 is a special end character. Setting codes[256] to some value is not wrong. //(This code does that in first iteration) while(true){ int currentCode = in[i]; // System.out.println("\nCurrent code = "+currentCode); String currentString = codes[currentCode]; // System.out.println("Current string = "+currentString); if(in[i]==R)break; if(currentCode==r){ currentString = prev+prev.charAt(0);} if(r<L){ codes[r++] = prev+currentString.charAt(0); // System.out.println("Set codes["+(r-1)+"] to "+codes[r-1]); } // System.out.println("Appending string "+currentString); out.append(currentString); prev = currentString; i++; } return out.toString(); } public static void testString(String s){ String compressed = compress(s), decompressed; System.out.println("\n--> Compressed : "+compressed); System.out.println("\n--> Decompressed " +(decompressed = decompress(compressed))); System.out.println("\nUncompressed data uses "+s.length()*8+" bits. (Assuming 8-bit chars)"); System.out.println("Compressed data uses "+c*W+" bits"); } public static void main(String[] args) throws Exception{ //compressed size more for small strings testString("ABABABA"); //compression beneficial for larger strings testString("ABABABAABABABBABA"); testString("AAAAABBABABBBBBBAAAAAABBBBBBAABABABABABABBABAAAABB"); //Random Input File StringBuilder sb = new StringBuilder(); String s; BufferedReader br = new BufferedReader(new FileReader(new File("d:\\input.txt"))); while((s=br.readLine())!=null){ sb.append(s); } testString(sb.toString()); } } //Ternary Search Trie used as a Symbol Table to match character values against codes. class TST<T> { class Node{ char c; T v; Node left, mid, right; Node(){} Node(char c){this.c = c;} Node(T v){this.v = v;} Node(char c, T v){this.c = c;this.v = v;} } Node root; public void add(String s, T value){ root = add(root, s, 0, value); } private Node add(Node n, String s, int i, T value){ if(n==null){ n = new Node(s.charAt(i)); if(i==s.length()-1){n.v = value; return n;} } if(s.charAt(i)==n.c){ if(i==s.length()-1){n.v = value; return n;} n.mid = add(n.mid, s, i+1, value); } else if(s.charAt(i)<n.c){ n.left = add(n.left, s, i, value); } else{ n.right = add(n.right, s, i, value); } return n; } public T getKey(String s){ return getKey(root, s, 0); } private T getKey(Node n, String s, int i){ if(n==null) return null; if(i==s.length()-1){ if(n.c==s.charAt(i)){ if(n.v!=null)return n.v; else return null; } }else if(i>s.length())return null; if(s.charAt(i)==n.c){ return getKey(n.mid, s, i+1); } else if(s.charAt(i)<n.c){ return getKey(n.left, s, i); } else { return getKey(n.right, s, i); } } public void delete(String s){ delete(root, s, 0); } private boolean delete(Node n, String s, int i){ if(n==null) return false; if(i==s.length()-1){ if(n.c==s.charAt(i)){ if(n.v!=null){n.v = null;return true;} else return false; } }else if(i>s.length())return false; boolean r; if(s.charAt(i)==n.c){ if((r=delete(n.mid, s, i+1))){ if(n.mid.v == null && n.mid.left == null && n.mid.mid == null && n.mid.right == null)n.mid = null; } }else if(s.charAt(i)<n.c){ if((r=delete(n.left, s, i))){ if(n.left.v == null && n.left.left==null && n.left.mid == null && n.left.right == null)n.left = null; } }else{ if((r=delete(n.right, s, i))){ if(n.right.v == null && n.right.left == null && n.right.mid == null && n.right.right == null)n.right = null; } } return r; } public Iterable<String> keys(){ Queue<String> al = new LinkedList<>(); String string = ""; findKeys(root, al, string); return al; } private void findKeys(Node n, Queue<String> al, String s){ if(n==null)return; if(n.v!=null){al.add(s+n.c);} findKeys(n.left, al, s); String s2 = s+n.c; findKeys(n.mid, al, s2); findKeys(n.right,al, s); } public Iterable<String> keysWithPrefix(String prefix){ Queue<String> al = new LinkedList<>(); String string = ""; findKeysP(root, al, string, prefix); return al; } private void findKeysP(Node n, Queue<String> al, String s, String p){ if(n==null)return; if(s.length()<=p.length() && s.length()!=0 && s.charAt(s.length()-1)!=p.charAt(s.length()-1)) return; if(n.v!=null){if(s.length()<p.length()&&n.c==p.charAt(s.length()))al.add(s+n.c); else if(s.length()>p.length())al.add(s+n.c);} findKeysP(n.left, al, s, p); String s2 = s+n.c; findKeysP(n.mid, al, s2, p); findKeysP(n.right,al, s, p); } public String longestPrefixOf(String s){ String string = ""; int l = longestPrefixOf(root, l=0, s); return s.substring(0, l+1); } private int longestPrefixOf(Node n, int i, String p){ if(n==null)return i-1; if(i>=p.length()) return i-1; int l,m=Integer.MIN_VALUE,r; l = longestPrefixOf(n.left, i, p); if(n.c==p.charAt(i)) m = longestPrefixOf(n.mid, i+1, p); r = longestPrefixOf(n.right, i, p); return Math.max(l, Math.max(m, r)); } }
Output:
--> Compressed : 65 66 257 259 256
--> Decompressed ABABABA
Uncompressed data uses 56 bits. (Assuming 8-bit chars)
Compressed data uses 60 bits
--> Compressed : 65 66 257 259 259 258 66 262 65 256
--> Decompressed ABABABAABABABBABA
Uncompressed data uses 136 bits. (Assuming 8-bit chars)
Compressed data uses 120 bits
--> Compressed : 65 257 257 66 66 65 261 260 264 261 258 259 265 266 263 262 272 263 271 258 260 256
--> Decompressed AAAAABBABABBBBBBAAAAAABBBBBBAABABABABABABBABAAAABB
Uncompressed data uses 400 bits. (Assuming 8-bit chars)
Compressed data uses 264 bits
--> Compressed : 59 59 32 259 70 111 114 109 ... (output trimmed)
--> Decompressed ;; Format of the site name database:;; Column ...
Uncompressed data uses 528400 bits. (Assuming 8-bit chars)
Compressed data uses 238548 bits
No comments:
Post a Comment