HuffmanTree.cs source code in C# .NET

                        Code:
                         / Net / Net / 3.5.50727.3053 / DEVDIV / depot / DevDiv / releases / whidbey / netfxsp / ndp / fx / src / Sys / System / IO / compression / HuffmanTree.cs / 1 / HuffmanTree.cs
                        
                        
                            namespace System.IO.Compression 
{
    using System;
    using System.Diagnostics;
 
    // Strictly speaking this class is not a HuffmanTree, this class is
    // a lookup table combined with a HuffmanTree. The idea is to speed up 
    // the lookup for short symbols (they should appear more frequently ideally.) 
    // However we don't want to create a huge table since it might take longer to
    // build the table than decoding (Deflate usually generates new tables frequently.) 
    //
    // Jean-loup Gailly and Mark Adler gave a very good explanation about this.
    // The full text (algorithm.txt) can be found inside
    // ftp://ftp.uu.net/pub/archiving/zip/zlib/zlib.zip. 
    //
    // Following paper explains decoding in details: 
    //   Hirschberg and Lelewer, "Efficient decoding of prefix codes," 
    //   Comm. ACM, 33,4, April 1990, pp. 449-459.
    // 

    internal class HuffmanTree {
        internal const int   MaxLiteralTreeElements         = 288;
        internal const int   MaxDistTreeElements            =  32; 
        internal const int   EndOfBlockCode                 = 256;
        internal const int   NumberOfCodeLengthTreeElements =  19; 
 
        int          tableBits;
        short[]      table; 
        short[]      left;
        short[]      right;
        byte[]       codeLengthArray;
#if DEBUG 
        uint[]       codeArrayDebug;
#endif 
 
        int tableMask;
 
        // huffman tree for static block
        static HuffmanTree staticLiteralLengthTree;
        static HuffmanTree staticDistanceTree;
 
        static HuffmanTree() {
            // construct the static literal tree and distance tree 
            staticLiteralLengthTree = new HuffmanTree(GetStaticLiteralTreeLength()); 
            staticDistanceTree = new HuffmanTree(GetStaticDistanceTreeLength());
        } 

        static public HuffmanTree StaticLiteralLengthTree {
            get {
                return staticLiteralLengthTree; 
            }
        } 
 
        static public HuffmanTree StaticDistanceTree {
            get { 
                return staticDistanceTree;
            }
        }
 
        public HuffmanTree(byte[] codeLengths) {
            Debug.Assert( codeLengths.Length == MaxLiteralTreeElements 
                          || codeLengths.Length == MaxDistTreeElements 
                          || codeLengths.Length == NumberOfCodeLengthTreeElements,
                          "we only expect three kinds of Length here"); 
            codeLengthArray = codeLengths;

            if (codeLengthArray.Length == MaxLiteralTreeElements) {  // bits for Literal/Length tree table
                tableBits = 9; 
            }
            else {          // bits for distance tree table and code length tree table 
                tableBits = 7; 
            }
            tableMask = (1 << tableBits) -1; 

            CreateTable();
        }
 

        // Generate the array contains huffman codes lengths for static huffman tree. 
        // The data is in RFC 1951. 
        static byte[] GetStaticLiteralTreeLength() {
            byte[] literalTreeLength = new byte[MaxLiteralTreeElements]; 
            for (int i = 0; i <= 143; i++)
                literalTreeLength[i] = 8;

            for (int i = 144; i <= 255; i++) 
                literalTreeLength[i] = 9;
 
            for (int i = 256; i <= 279; i++) 
                literalTreeLength[i] = 7;
 
            for (int i = 280; i <= 287; i++)
                literalTreeLength[i] = 8;

            return literalTreeLength; 
        }
 
        static byte[] GetStaticDistanceTreeLength() { 
            byte[]  staticDistanceTreeLength = new byte[MaxDistTreeElements];
            for (int i = 0; i < MaxDistTreeElements; i++) { 
                staticDistanceTreeLength[i] = 5;
            }
            return staticDistanceTreeLength;
        } 

 
        // Calculate the huffman code for each character based on the code length for each character. 
        // This algorithm is described in standard RFC 1951
        uint[] CalculateHuffmanCode() { 
            uint[]  bitLengthCount  = new uint[17];
            foreach( int codeLength in codeLengthArray) {
                bitLengthCount[codeLength]++;
            } 
            bitLengthCount[0] = 0;  // clear count for length 0
 
            uint[] nextCode = new uint[17]; 
            uint tempCode   = 0;
            for (int bits = 1; bits <= 16; bits++) { 
                tempCode = (tempCode + bitLengthCount[bits-1]) << 1;
                nextCode[bits] = tempCode;
            }
 
            uint[] code = new uint[MaxLiteralTreeElements];
            for (int i = 0; i < codeLengthArray.Length; i++) { 
                int len = codeLengthArray[i]; 

                if (len > 0) { 
                    code[i] = DecodeHelper.BitReverse(nextCode[len], len);
                    nextCode[len]++;
                }
            } 
            return code;
        } 
 
        private void CreateTable() {
 
            uint[] codeArray = CalculateHuffmanCode();
            table = new short[ 1 << tableBits];
#if DEBUG
            codeArrayDebug = codeArray; 
#endif
 
            // I need to find proof that left and right array will always be 
            // enough. I think they are.
            left  = new short[2* codeLengthArray.Length]; 
            right = new short[2* codeLengthArray.Length];
            short avail = (short)codeLengthArray.Length;

            for (int ch = 0; ch < codeLengthArray.Length; ch++) { 
                // length of this code
                int len = codeLengthArray[ch]; 
                if (len > 0) { 
                    // start value (bit reversed)
                    int start = (int)codeArray[ch]; 

                    if (len <= tableBits) {
                        // If a particular symbol is shorter than nine bits,
                        // then that symbol's translation is duplicated 
                        // in all those entries that start with that symbol's bits.
                        // For example, if the symbol is four bits, then it's duplicated 
                        // 32 times in a nine-bit table. If a symbol is nine bits long, 
                        // it appears in the table once.
                        // 
                        // Make sure that in the loop below, code is always
                        // less than table_size.
                        //
                        // On last iteration we store at array index: 
                        //    initial_start_at + (locs-1)*increment
                        //  = initial_start_at + locs*increment - increment 
                        //  = initial_start_at + (1 << tableBits) - increment 
                        //  = initial_start_at + table_size - increment
                        // 
                        // Therefore we must ensure:
                        //     initial_start_at + table_size - increment < table_size
                        // or: initial_start_at < increment
                        // 
                        int increment = 1 << len;
                        if (start >= increment) { 
                            throw new InvalidDataException(SR.GetString(SR.InvalidHuffmanData)); 
                        }
 
                        // Note the bits in the table are reverted.
                        int locs = 1 << (tableBits - len);
                        for (int j = 0; j < locs; j++) {
                            table[start] = (short) ch; 
                            start += increment;
                        } 
                    } else { 
                        // For any code which has length longer than num_elements,
                        // build a binary tree. 

                        int overflowBits = len - tableBits;    // the nodes we need to respent the data.
                        int codeBitMask = 1 << tableBits;    // mask to get current bit (the bits can't fit in the table)
 
                        // the left, right table is used to repesent the
                        // the rest bits. When we got the first part (number bits.) and look at 
                        // tbe table, we will need to follow the tree to find the real character. 
                        // This is in place to avoid bloating the table if there are
                        // a few ones with long code. 
                        int index = start & ((1 << tableBits) -1);
                        short[] array = table;

                        do { 
                            short value = array[index];
 
                            if (value == 0) {         // set up next pointer if this node is not used before. 
                                array[index] = (short)-avail;  // use next available slot.
                                value = (short)-avail; 
                                avail++;
                            }

                            Debug.Assert( value < 0, "Only negative number is used a tree pointer!"); 
                            if ((start & codeBitMask) == 0) {  // if current bit is 0, go change the left array
                                array  = left; 
                            } else {                // if current bit is 1, set value in the right array 
                                array = right;
                            } 
                            index = -value;         // go to next node

                            codeBitMask <<= 1;
                            overflowBits--; 
                        } while (overflowBits != 0);
 
                        array[index] = (short) ch; 
                    }
                } 
            }
        }

        // 
        // This function will try to get enough bits from input and
        // try to decode the bits. 
        // If there are no enought bits in the input, this function will return -1. 
        //
        public int GetNextSymbol(InputBuffer input) { 
            // Try to load 16 bits into input buffer if possible and get the bitBuffer value.
            // If there aren't 16 bits available we will return all we have in the
            // input buffer.
            uint bitBuffer = input.TryLoad16Bits(); 
            if( input.AvailableBits == 0) {    // running out of input.
                return -1; 
            } 

            // decode an element 
            int symbol = table[bitBuffer & tableMask];
            if( symbol < 0) {       //  this will be the start of the binary tree
                // navigate the tree
                uint mask = (uint)1 << tableBits; 
                do
                { 
                    symbol = -symbol; 
                    if ((bitBuffer & mask) == 0)
                        symbol = left[symbol]; 
                    else
                        symbol = right[symbol];
                    mask <<= 1;
                } while (symbol < 0); 
            }
 
            // 
            // If this code is longer than the # bits we had in the bit buffer (i.e.
            // we read only part of the code), we can hit the entry in the table or the tree 
            // for another symbol. However the length of another symbol will not match the
            // available bits count.
            if (codeLengthArray[symbol] > input.AvailableBits) {
                // We already tried to load 16 bits and maximum length is 15, 
                // so this means we are running out of input.
                return -1; 
            } 

            input.SkipBits(codeLengthArray[symbol]); 
            return symbol;
        }

    } 
}

// File provided for Reference Use Only by Microsoft Corporation (c) 2007.
// Copyright (c) Microsoft Corporation. All rights reserved.
namespace System.IO.Compression 
{
    using System;
    using System.Diagnostics;
 
    // Strictly speaking this class is not a HuffmanTree, this class is
    // a lookup table combined with a HuffmanTree. The idea is to speed up 
    // the lookup for short symbols (they should appear more frequently ideally.) 
    // However we don't want to create a huge table since it might take longer to
    // build the table than decoding (Deflate usually generates new tables frequently.) 
    //
    // Jean-loup Gailly and Mark Adler gave a very good explanation about this.
    // The full text (algorithm.txt) can be found inside
    // ftp://ftp.uu.net/pub/archiving/zip/zlib/zlib.zip. 
    //
    // Following paper explains decoding in details: 
    //   Hirschberg and Lelewer, "Efficient decoding of prefix codes," 
    //   Comm. ACM, 33,4, April 1990, pp. 449-459.
    // 

    internal class HuffmanTree {
        internal const int   MaxLiteralTreeElements         = 288;
        internal const int   MaxDistTreeElements            =  32; 
        internal const int   EndOfBlockCode                 = 256;
        internal const int   NumberOfCodeLengthTreeElements =  19; 
 
        int          tableBits;
        short[]      table; 
        short[]      left;
        short[]      right;
        byte[]       codeLengthArray;
#if DEBUG 
        uint[]       codeArrayDebug;
#endif 
 
        int tableMask;
 
        // huffman tree for static block
        static HuffmanTree staticLiteralLengthTree;
        static HuffmanTree staticDistanceTree;
 
        static HuffmanTree() {
            // construct the static literal tree and distance tree 
            staticLiteralLengthTree = new HuffmanTree(GetStaticLiteralTreeLength()); 
            staticDistanceTree = new HuffmanTree(GetStaticDistanceTreeLength());
        } 

        static public HuffmanTree StaticLiteralLengthTree {
            get {
                return staticLiteralLengthTree; 
            }
        } 
 
        static public HuffmanTree StaticDistanceTree {
            get { 
                return staticDistanceTree;
            }
        }
 
        public HuffmanTree(byte[] codeLengths) {
            Debug.Assert( codeLengths.Length == MaxLiteralTreeElements 
                          || codeLengths.Length == MaxDistTreeElements 
                          || codeLengths.Length == NumberOfCodeLengthTreeElements,
                          "we only expect three kinds of Length here"); 
            codeLengthArray = codeLengths;

            if (codeLengthArray.Length == MaxLiteralTreeElements) {  // bits for Literal/Length tree table
                tableBits = 9; 
            }
            else {          // bits for distance tree table and code length tree table 
                tableBits = 7; 
            }
            tableMask = (1 << tableBits) -1; 

            CreateTable();
        }
 

        // Generate the array contains huffman codes lengths for static huffman tree. 
        // The data is in RFC 1951. 
        static byte[] GetStaticLiteralTreeLength() {
            byte[] literalTreeLength = new byte[MaxLiteralTreeElements]; 
            for (int i = 0; i <= 143; i++)
                literalTreeLength[i] = 8;

            for (int i = 144; i <= 255; i++) 
                literalTreeLength[i] = 9;
 
            for (int i = 256; i <= 279; i++) 
                literalTreeLength[i] = 7;
 
            for (int i = 280; i <= 287; i++)
                literalTreeLength[i] = 8;

            return literalTreeLength; 
        }
 
        static byte[] GetStaticDistanceTreeLength() { 
            byte[]  staticDistanceTreeLength = new byte[MaxDistTreeElements];
            for (int i = 0; i < MaxDistTreeElements; i++) { 
                staticDistanceTreeLength[i] = 5;
            }
            return staticDistanceTreeLength;
        } 

 
        // Calculate the huffman code for each character based on the code length for each character. 
        // This algorithm is described in standard RFC 1951
        uint[] CalculateHuffmanCode() { 
            uint[]  bitLengthCount  = new uint[17];
            foreach( int codeLength in codeLengthArray) {
                bitLengthCount[codeLength]++;
            } 
            bitLengthCount[0] = 0;  // clear count for length 0
 
            uint[] nextCode = new uint[17]; 
            uint tempCode   = 0;
            for (int bits = 1; bits <= 16; bits++) { 
                tempCode = (tempCode + bitLengthCount[bits-1]) << 1;
                nextCode[bits] = tempCode;
            }
 
            uint[] code = new uint[MaxLiteralTreeElements];
            for (int i = 0; i < codeLengthArray.Length; i++) { 
                int len = codeLengthArray[i]; 

                if (len > 0) { 
                    code[i] = DecodeHelper.BitReverse(nextCode[len], len);
                    nextCode[len]++;
                }
            } 
            return code;
        } 
 
        private void CreateTable() {
 
            uint[] codeArray = CalculateHuffmanCode();
            table = new short[ 1 << tableBits];
#if DEBUG
            codeArrayDebug = codeArray; 
#endif
 
            // I need to find proof that left and right array will always be 
            // enough. I think they are.
            left  = new short[2* codeLengthArray.Length]; 
            right = new short[2* codeLengthArray.Length];
            short avail = (short)codeLengthArray.Length;

            for (int ch = 0; ch < codeLengthArray.Length; ch++) { 
                // length of this code
                int len = codeLengthArray[ch]; 
                if (len > 0) { 
                    // start value (bit reversed)
                    int start = (int)codeArray[ch]; 

                    if (len <= tableBits) {
                        // If a particular symbol is shorter than nine bits,
                        // then that symbol's translation is duplicated 
                        // in all those entries that start with that symbol's bits.
                        // For example, if the symbol is four bits, then it's duplicated 
                        // 32 times in a nine-bit table. If a symbol is nine bits long, 
                        // it appears in the table once.
                        // 
                        // Make sure that in the loop below, code is always
                        // less than table_size.
                        //
                        // On last iteration we store at array index: 
                        //    initial_start_at + (locs-1)*increment
                        //  = initial_start_at + locs*increment - increment 
                        //  = initial_start_at + (1 << tableBits) - increment 
                        //  = initial_start_at + table_size - increment
                        // 
                        // Therefore we must ensure:
                        //     initial_start_at + table_size - increment < table_size
                        // or: initial_start_at < increment
                        // 
                        int increment = 1 << len;
                        if (start >= increment) { 
                            throw new InvalidDataException(SR.GetString(SR.InvalidHuffmanData)); 
                        }
 
                        // Note the bits in the table are reverted.
                        int locs = 1 << (tableBits - len);
                        for (int j = 0; j < locs; j++) {
                            table[start] = (short) ch; 
                            start += increment;
                        } 
                    } else { 
                        // For any code which has length longer than num_elements,
                        // build a binary tree. 

                        int overflowBits = len - tableBits;    // the nodes we need to respent the data.
                        int codeBitMask = 1 << tableBits;    // mask to get current bit (the bits can't fit in the table)
 
                        // the left, right table is used to repesent the
                        // the rest bits. When we got the first part (number bits.) and look at 
                        // tbe table, we will need to follow the tree to find the real character. 
                        // This is in place to avoid bloating the table if there are
                        // a few ones with long code. 
                        int index = start & ((1 << tableBits) -1);
                        short[] array = table;

                        do { 
                            short value = array[index];
 
                            if (value == 0) {         // set up next pointer if this node is not used before. 
                                array[index] = (short)-avail;  // use next available slot.
                                value = (short)-avail; 
                                avail++;
                            }

                            Debug.Assert( value < 0, "Only negative number is used a tree pointer!"); 
                            if ((start & codeBitMask) == 0) {  // if current bit is 0, go change the left array
                                array  = left; 
                            } else {                // if current bit is 1, set value in the right array 
                                array = right;
                            } 
                            index = -value;         // go to next node

                            codeBitMask <<= 1;
                            overflowBits--; 
                        } while (overflowBits != 0);
 
                        array[index] = (short) ch; 
                    }
                } 
            }
        }

        // 
        // This function will try to get enough bits from input and
        // try to decode the bits. 
        // If there are no enought bits in the input, this function will return -1. 
        //
        public int GetNextSymbol(InputBuffer input) { 
            // Try to load 16 bits into input buffer if possible and get the bitBuffer value.
            // If there aren't 16 bits available we will return all we have in the
            // input buffer.
            uint bitBuffer = input.TryLoad16Bits(); 
            if( input.AvailableBits == 0) {    // running out of input.
                return -1; 
            } 

            // decode an element 
            int symbol = table[bitBuffer & tableMask];
            if( symbol < 0) {       //  this will be the start of the binary tree
                // navigate the tree
                uint mask = (uint)1 << tableBits; 
                do
                { 
                    symbol = -symbol; 
                    if ((bitBuffer & mask) == 0)
                        symbol = left[symbol]; 
                    else
                        symbol = right[symbol];
                    mask <<= 1;
                } while (symbol < 0); 
            }
 
            // 
            // If this code is longer than the # bits we had in the bit buffer (i.e.
            // we read only part of the code), we can hit the entry in the table or the tree 
            // for another symbol. However the length of another symbol will not match the
            // available bits count.
            if (codeLengthArray[symbol] > input.AvailableBits) {
                // We already tried to load 16 bits and maximum length is 15, 
                // so this means we are running out of input.
                return -1; 
            } 

            input.SkipBits(codeLengthArray[symbol]); 
            return symbol;
        }

    } 
}

// File provided for Reference Use Only by Microsoft Corporation (c) 2007.
// Copyright (c) Microsoft Corporation. All rights reserved.
Link Menu

Network programming in C#, Network Programming in VB.NET, Network Programming in .NET
This book is available now!
Buy at Amazon US or
Buy at Amazon UK
HuffmanTree.cs source code in C# .NET

Source code for the .NET framework in C#

Code:

Link Menu