#!/usr/bin/env python2 ''' replicate huffman encoding from https://algs4.cs.princeton.edu/55compression/Huffman.java.html * Execution: cat input.txt | java Huffman - (compress) * Execution: cat file.huf | java Huffman + (expand) * Dependencies: BinaryIn.java BinaryOut.java * Data files: https://algs4.cs.princeton.edu/55compression/abra.txt * https://algs4.cs.princeton.edu/55compression/tinytinyTale.txt * https://algs4.cs.princeton.edu/55compression/medTale.txt * https://algs4.cs.princeton.edu/55compression/tale.txt * * Compress or expand a binary input stream using the Huffman algorithm. * * % cat abra.txt | java Huffman - | java BinaryDump 60 * 010100000100101000100010010000110100001101010100101010000100 * 000000000000000000000000000110001111100101101000111110010100 * 120 bits * * % cat abra.txt | java Huffman - | java Huffman + * ABRACADABRA! ''' from __future__ import print_function import sys class Trie: def __init__(self, char, freq, left, right): if char is not None: print('New node for', char) self.char = char self.freq = freq self.left = left self.right = right def isLeaf(self): return self.left is None and self.right is None class Codec: ''' https://stackoverflow.com/questions/10237926/convert-string-to-list-of-bits-and-viceversa ''' symbols = {} root = None compressed = '' def tobits(self, s): pad = lambda b: ('0'*8)[len(b):] + b c2b = lambda c: pad(bin(ord(c))[2:]) return ''.join([ c2b(c) for c in s ]) def tochars(self, bits): return ''.join([ chr(int(bit, 2)) for bit in bits ]) def readBit(self): if len(self.compressed) > 0: b = self.compressed[0] self.compressed = self.compressed[1:] return b def readByte(self): if len(self.compressed) >= 8: b = self.compressed[0:8] self.compressed = self.compressed[8:] return chr(int(b,2)) def readInt(self): if len(self.compressed) >= 8: b = self.compressed[0:32] self.compressed = self.compressed[32:] return int(b,2) def readTrie(codec): ''' returns Trie ''' is_leaf = lambda b: int(b,2) == 1 if is_leaf(codec.readBit()): return Trie(codec.readByte(), -1, None, None) else: return Trie(None, -1, codec.readTrie(), codec.readTrie()) def readRest(codec): ret = '' msg_len = codec.readInt() while len(codec.compressed) > 0 and len(ret) < msg_len: n = codec.root while not n.isLeaf(): b = codec.readBit() if int(b,2) == 1: n = n.right else: n = n.left if n != codec.root: ret += n.char return ret codec = Codec() f = open('test.huf', 'rb') codec.compressed = codec.tobits( f.read() ) f.close() codec.root = codec.readTrie() print(codec.readRest())