Huffman-Coding/Huffman.java at main · Zander073/Huffman-Coding · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
import java.util.*;
import java.util.Map.Entry;
import java.io.ByteArrayOutputStream; // Optional

/**
 * Huffman instances provide reusable Huffman Encoding Maps for
 * compressing and decompressing text corpi with comparable
 * distributions of characters.
 */
public class Huffman {

    // -----------------------------------------------
    // Construction
    // -----------------------------------------------

    private static HuffNode trieRoot;
    // TreeMap chosen here just to make debugging easier
    private static TreeMap<Character, String> encodingMap;
    // Character that represents the end of a compressed transmission
    private static final char ETB_CHAR = 23;
    // ETB key in encodingMap
    private static final char ETB_Key = '*';

    /**
     * Creates the Huffman Trie and Encoding Map using the character
     * distributions in the given text corpus
     * @param corpus A String representing a message / document corpus
     *        with distributions over characters that are implicitly used
     *        throughout the methods that follow. Note: this corpus ONLY
     *        establishes the Encoding Map; later compressed corpi may
     *        differ.
     */
    public Huffman (String corpus) {
        trieRoot = null;
    	encodingMap = new TreeMap<Character, String>();

    	// Gets the frequency of each character in the String 'corpus' and returns a
    	// mapping of each character and its associated frequency.
    	HashMap<Character, Integer> corpusChars = getChars(corpus);
    	PriorityQueue<HuffNode> pq = new PriorityQueue<HuffNode>();

    	// Adds the ETB_CHAR to the PQ to be encoded in the encoding map.
    	pq.add(new HuffNode(ETB_CHAR, 1));

    	// Putting the characters and their relative frequencies in a priority queue.
    	for (Entry<Character, Integer> entry : corpusChars.entrySet()) {
    	    HuffNode HuffNode = new HuffNode(entry.getKey(), entry.getValue());
    	    pq.add(HuffNode);
    	}

        while (pq.size() > 1) {
            HuffNode x = pq.peek();
            pq.poll();
            HuffNode y = pq.peek();
            pq.poll();

            // '-' char represents parent node
            HuffNode temp = new HuffNode('-', (x.count + y.count));
            temp.left = x;
            temp.right = y;
            trieRoot = temp;
            pq.add(temp);
        }
        System.out.println(encodingMap);
        getCode(trieRoot, "");
    }


    // -----------------------------------------------
    // Compression
    // -----------------------------------------------

    /**
     * Compresses the given String message / text corpus into its Huffman coded
     * bitstring, as represented by an array of bytes. Uses the encodingMap
     * field generated during construction for this purpose.
     * @param message String representing the corpus to compress.
     * @return {@code byte[]} representing the compressed corpus with the
     *         Huffman coded bytecode. Formatted as:
     *         (1) the bitstring containing the message itself, (2) possible
     *         0-padding on the final byte.
     */
    public byte[] compress (String message) {
    	ByteArrayOutputStream output = new ByteArrayOutputStream();
    	String compressedMessage = "";
    	int i;

    	for(i = 0; i < message.length(); i++) {
    	    char key = message.charAt(i);
    	    compressedMessage = compressedMessage + encodingMap.get(key);
    	}
    	compressedMessage = compressedMessage + encodingMap.get(ETB_Key);

    	// If the length of the bit-string is not divisible by 8, add right padding.
    	if(compressedMessage.length() % 8 != 0) {
    	    int closestNum = (compressedMessage.length() + 8) - (compressedMessage.length() % 8);
    	    compressedMessage = rightPadding(closestNum - compressedMessage.length(), compressedMessage);
    	}
   	for(i = 0; i < compressedMessage.length(); i++) {
    	    if((i + 1) % 8 == 0) {
    	        String temp = compressedMessage.substring((i + 1) - 8, (i + 1));
    		byte m = (byte) Integer.parseInt(temp, 2);
    		output.write(m);
    	    }
   	}
        return output.toByteArray();
    }


    // -----------------------------------------------
    // Decompression
    // -----------------------------------------------

    /**
     * Decompresses the given compressed array of bytes into their original,
     * String representation. Uses the trieRoot field (the Huffman Trie) that
     * generated the compressed message during decoding.
     * @param compressedMsg {@code byte[]} representing the compressed corpus with the
     *        Huffman coded bytecode. Formatted as:
     *        (1) the bitstring containing the message itself, (2) possible
     *        0-padding on the final byte.
     * @return Decompressed String representation of the compressed bytecode message.
     */
    public String decompress (byte[] compressedMsg) {
    	String cm = "";
    	String message = "";
    	int i;

    	// Gets the compressed message and formats the bits into a bit-string.
    	for(i = 0; i < compressedMsg.length; i++) {
    	    byte b = compressedMsg[i];
    	    String temp = String.format("%" + 8 + "s", Integer.toBinaryString(b)).replaceAll(" ", "0");
    	    if(temp.length() > 8) {
    	        int lower = temp.length() - 8;
    		int upper = temp.length();
            	temp = temp.substring(lower, upper);
            	cm = cm + temp;
            } else {
            	cm = cm + temp;
            }
    	}

    	// Traverses Huffman trie for each bit in the bit-string 'cm'.
    	for(i = 0; i < cm.length();) {
    	    HuffNode node = trieRoot;
    	    while(!node.isLeaf()) {
    	        if(cm.charAt(i) == '0') {
    	        	node = node.left;
    	        } else {
    	        	node = node.right;
    	        }
    	    i++;
    	    }
    	    if(node.character == ETB_CHAR) {
    	        break;
    	    } else {
    	    	message = message + node.character;
    	    }
    	}
        return message;
    }

    // Creates a HashMap for the given string's character's relative frequencies <Character, (Integer) Frequency>.
    public static HashMap<Character, Integer> getChars(String s){
    	HashMap<Character, Integer> chars = new HashMap<Character, Integer>();
    	for(int i = 0; i < s.length(); i++) {
    	    Character temp = s.charAt(i);
    	    if(!chars.containsKey(temp)) {
    		chars.put(temp, 1);
    	    } else {
    		chars.put(temp, chars.get(temp) + 1);
    	    }
    	}
    	return chars;
    }

    // Gets the bit string of each character. Left sub trees are assigned '0' and right sub trees are assigned '1'
    public static void getCode(HuffNode root, String s) {
        if(root.isLeaf() && Character.isDefined(root.character)) {
	    if(root.character == ETB_CHAR) {
		// If encountered character is ETB, represent as a '*'.
		encodingMap.put(ETB_Key, s);
	    } else {
		encodingMap.put(root.character, s);
	    }
	    return;
	}
	getCode(root.left, s + "0");
	getCode(root.right, s + "1");
    }

    // Adds n zeros to end of String s.
    public static String rightPadding(int n, String s) {
    	String output = "";
    	for(int i = 0; i < n; i++) {
    	    output = output + "0";
    	}
    	return s + output;
    }

    // -----------------------------------------------
    // Huffman Trie
    // -----------------------------------------------

    /**
     * Huffman Trie Node class used in construction of the Huffman Trie.
     * Each node is a binary (having at most a left and right child), contains
     * a character field that it represents, and a count field that holds the
     * number of times the node's character (or those in its subtrees) appear
     * in the corpus.
     */
    private static class HuffNode implements Comparable<HuffNode> {

        HuffNode left, right;
        char character;
        int count;

        HuffNode (char character, int count) {
            this.count = count;
            this.character = character;
        }

        public boolean isLeaf () {
            return this.left == null && this.right == null;
        }

        public int compareTo (HuffNode other) {
            if(this.count != other.count) {
            	return this.count - other.count;
            } else {
            	@SuppressWarnings("deprecation")
            	Character x = new Character(this.character);
            	@SuppressWarnings("deprecation")
            	Character y = new Character(other.character);
            	return x.compareTo(y);
            }
        }
    }

}