// // Name: lz.c // Author: Marcus Geelnard // Description: LZ77 coder/decoder implementation. // Reentrant: Yes // $ATH_LICENSE_NULL$ // // The LZ77 compression scheme is a substitutional compression scheme // proposed by Abraham Lempel and Jakob Ziv in 1977. It is very simple in // its design, and uses no fancy bit level compression. // // This is my first attempt at an implementation of a LZ77 code/decoder. // // The principle of the LZ77 compression algorithm is to store repeated // occurrences of strings as references to previous occurrences of the same // string. The point is that the reference consumes less space than the // string itself, provided that the string is long enough (in this // implementation, the string has to be at least 4 bytes long, since the // minimum coded reference is 3 bytes long). Also note that the term // "string" refers to any kind of byte sequence (it does not have to be // an ASCII string, for instance). // // The coder uses a brute force approach to finding string matches in the // history buffer (or "sliding window", if you wish), which is very, very // slow. I recon the complexity is somewhere between O(n^2) and O(n^3), // depending on the input data. // // There is also a faster implementation that uses a large working buffer // in which a "jump table" is stored, which is used to quickly find // possible string matches (see the source code for LZ_CompressFast() for // more information). The faster method is an order of magnitude faster, // but still quite slow compared to other compression methods. // // The upside is that decompression is very fast, and the compression ratio // is often very good. // // The reference to a string is coded as a (length,offset) pair, where the // length indicates the length of the string, and the offset gives the // offset from the current data position. To distinguish between string // references and literal strings (uncompressed bytes), a string reference // is preceded by a marker byte, which is chosen as the least common byte // symbol in the input data stream (this marker byte is stored in the // output stream as the first byte). // // Occurrences of the marker byte in the stream are encoded as the marker // byte followed by a zero byte, which means that occurrences of the marker // byte have to be coded with two bytes. // // The lengths and offsets are coded in a variable length fashion, allowing // values of any magnitude (up to 4294967295 in this implementation). // // With this compression scheme, the worst case compression result is // (257/256)*insize + 1. // //------------------------------------------------------------------------ // Copyright (c) 2003-2006 Marcus Geelnard // // This software is provided 'as-is', without any express or implied // warranty. In no event will the authors be held liable for any damages // arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it // freely, subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; you must not // claim that you wrote the original software. If you use this software // in a product, an acknowledgment in the product documentation would // be appreciated but is not required. // // 2. Altered source versions must be plainly marked as such, and must not // be misrepresented as being the original software. // // 3. This notice may not be removed or altered from any source // distribution. // // Marcus Geelnard // marcus.geelnard at home.se // // // This file has been altered from the original version. // /************************************************************************* * Constants used for LZ77 coding *************************************************************************/ /* Maximum offset (can be any size < 2^31). Lower values give faster compression, while higher values gives better compression. The default value of 100000 is quite high. Experiment to see what works best for you. */ #define LZ_MAX_OFFSET 100000 /************************************************************************* * INTERNAL FUNCTIONS * *************************************************************************/ /************************************************************************* * _LZ_StringCompare() - Return maximum length string match. *************************************************************************/ static unsigned int _LZ_StringCompare( unsigned char * str1, unsigned char * str2, unsigned int minlen, unsigned int maxlen ) { unsigned int len; for( len = minlen; (len < maxlen) && (str1[len] == str2[len]); ++ len ); return len; } /************************************************************************* * _LZ_WriteVarSize() - Write unsigned integer with variable number of * bytes depending on value. *************************************************************************/ static int _LZ_WriteVarSize( unsigned int x, unsigned char * buf ) { unsigned int y; int num_bytes, i, b; /* Determine number of bytes needed to store the number x */ y = x >> 3; for( num_bytes = 5; num_bytes >= 2; -- num_bytes ) { if( y & 0xfe000000 ) break; y <<= 7; } /* Write all bytes, seven bits in each, with 8:th bit set for all */ /* but the last byte. */ for( i = num_bytes-1; i >= 0; -- i ) { b = (x >> (i*7)) & 0x0000007f; if( i > 0 ) { b |= 0x00000080; } *buf ++ = (unsigned char) b; } /* Return number of bytes written */ return num_bytes; } /************************************************************************* * _LZ_ReadVarSize() - Read unsigned integer with variable number of * bytes depending on value. *************************************************************************/ static int _LZ_ReadVarSize( unsigned int * x, unsigned char * buf ) { unsigned int y, b, num_bytes; /* Read complete value (stop when byte contains zero in 8:th bit) */ y = 0; num_bytes = 0; do { b = (unsigned int) (*buf ++); y = (y << 7) | (b & 0x0000007f); ++ num_bytes; } while( b & 0x00000080 ); /* Store value in x */ *x = y; /* Return number of bytes read */ return num_bytes; } /************************************************************************* * PUBLIC FUNCTIONS * *************************************************************************/ /************************************************************************* * LZ_Compress() - Compress a block of data using an LZ77 coder. * in - Input (uncompressed) buffer. * out - Output (compressed) buffer. This buffer must be 0.4% larger * than the input buffer, plus one byte. * insize - Number of input bytes. * The function returns the size of the compressed data. *************************************************************************/ int LZ_Compress( unsigned char *in, unsigned char *out, unsigned int insize ) { unsigned char marker, symbol; unsigned int inpos, outpos, bytesleft, i; unsigned int maxoffset, offset, bestoffset; unsigned int maxlength, length, bestlength; unsigned int histogram[ 256 ]; unsigned char *ptr1, *ptr2; /* Do we have anything to compress? */ if( insize < 1 ) { return 0; } /* Create histogram */ for( i = 0; i < 256; ++ i ) { histogram[ i ] = 0; } for( i = 0; i < insize; ++ i ) { ++ histogram[ in[ i ] ]; } /* Find the least common byte, and use it as the marker symbol */ marker = 0; for( i = 1; i < 256; ++ i ) { if( histogram[ i ] < histogram[ marker ] ) { marker = i; } } /* Remember the marker symbol for the decoder */ out[ 0 ] = marker; /* Start of compression */ inpos = 0; outpos = 1; /* Main compression loop */ bytesleft = insize; do { /* Determine most distant position */ if( inpos > LZ_MAX_OFFSET ) maxoffset = LZ_MAX_OFFSET; else maxoffset = inpos; /* Get pointer to current position */ ptr1 = &in[ inpos ]; /* Search history window for maximum length string match */ bestlength = 3; bestoffset = 0; for( offset = 3; offset <= maxoffset; ++ offset ) { /* Get pointer to candidate string */ ptr2 = &ptr1[ -(int)offset ]; /* Quickly determine if this is a candidate (for speed) */ if( (ptr1[ 0 ] == ptr2[ 0 ]) && (ptr1[ bestlength ] == ptr2[ bestlength ]) ) { /* Determine maximum length for this offset */ maxlength = (bytesleft < offset ? bytesleft : offset); /* Count maximum length match at this offset */ length = _LZ_StringCompare( ptr1, ptr2, 0, maxlength ); /* Better match than any previous match? */ if( length > bestlength ) { bestlength = length; bestoffset = offset; } } } /* Was there a good enough match? */ if( (bestlength >= 8) || ((bestlength == 4) && (bestoffset <= 0x0000007f)) || ((bestlength == 5) && (bestoffset <= 0x00003fff)) || ((bestlength == 6) && (bestoffset <= 0x001fffff)) || ((bestlength == 7) && (bestoffset <= 0x0fffffff)) ) { out[ outpos ++ ] = (unsigned char) marker; outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] ); outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] ); inpos += bestlength; bytesleft -= bestlength; } else { /* Output single byte (or two bytes if marker byte) */ symbol = in[ inpos ++ ]; out[ outpos ++ ] = symbol; if( symbol == marker ) { out[ outpos ++ ] = 0; } -- bytesleft; } } while( bytesleft > 3 ); /* Dump remaining bytes, if any */ while( inpos < insize ) { if( in[ inpos ] == marker ) { out[ outpos ++ ] = marker; out[ outpos ++ ] = 0; } else { out[ outpos ++ ] = in[ inpos ]; } ++ inpos; } return outpos; } /************************************************************************* * LZ_CompressFast() - Compress a block of data using an LZ77 coder. * in - Input (uncompressed) buffer. * out - Output (compressed) buffer. This buffer must be 0.4% larger * than the input buffer, plus one byte. * insize - Number of input bytes. * work - Pointer to a temporary buffer (internal working buffer), which * must be able to hold (insize+65536) unsigned integers. * The function returns the size of the compressed data. *************************************************************************/ int LZ_CompressFast( unsigned char *in, unsigned char *out, unsigned int insize, unsigned int *work ) { unsigned char marker, symbol; unsigned int inpos, outpos, bytesleft, i, index, symbols; unsigned int offset, bestoffset; unsigned int maxlength, length, bestlength; unsigned int histogram[ 256 ], *lastindex, *jumptable; unsigned char *ptr1, *ptr2; /* Do we have anything to compress? */ if( insize < 1 ) { return 0; } /* Assign arrays to the working area */ lastindex = work; jumptable = &work[ 65536 ]; /* Build a "jump table". Here is how the jump table works: jumptable[i] points to the nearest previous occurrence of the same symbol pair as in[i]:in[i+1], so in[i] == in[jumptable[i]] and in[i+1] == in[jumptable[i]+1], and so on... Following the jump table gives a dramatic boost for the string search'n'match loop compared to doing a brute force search. The jump table is built in O(n) time, so it is a cheap operation in terms of time, but it is expensice in terms of memory consumption. */ for( i = 0; i < 65536; ++ i ) { lastindex[ i ] = 0xffffffff; } for( i = 0; i < insize-1; ++ i ) { symbols = (((unsigned int)in[i]) << 8) | ((unsigned int)in[i+1]); index = lastindex[ symbols ]; lastindex[ symbols ] = i; jumptable[ i ] = index; } jumptable[ insize-1 ] = 0xffffffff; /* Create histogram */ for( i = 0; i < 256; ++ i ) { histogram[ i ] = 0; } for( i = 0; i < insize; ++ i ) { ++ histogram[ in[ i ] ]; } /* Find the least common byte, and use it as the marker symbol */ marker = 0; for( i = 1; i < 256; ++ i ) { if( histogram[ i ] < histogram[ marker ] ) { marker = i; } } /* Remember the marker symbol for the decoder */ out[ 0 ] = marker; /* Start of compression */ inpos = 0; outpos = 1; /* Main compression loop */ bytesleft = insize; do { /* Get pointer to current position */ ptr1 = &in[ inpos ]; /* Search history window for maximum length string match */ bestlength = 3; bestoffset = 0; index = jumptable[ inpos ]; while( (index != 0xffffffff) && ((inpos - index) < LZ_MAX_OFFSET) ) { /* Get pointer to candidate string */ ptr2 = &in[ index ]; /* Quickly determine if this is a candidate (for speed) */ if( ptr2[ bestlength ] == ptr1[ bestlength ] ) { /* Determine maximum length for this offset */ offset = inpos - index; maxlength = (bytesleft < offset ? bytesleft : offset); /* Count maximum length match at this offset */ length = _LZ_StringCompare( ptr1, ptr2, 2, maxlength ); /* Better match than any previous match? */ if( length > bestlength ) { bestlength = length; bestoffset = offset; } } /* Get next possible index from jump table */ index = jumptable[ index ]; } /* Was there a good enough match? */ if( (bestlength >= 8) || ((bestlength == 4) && (bestoffset <= 0x0000007f)) || ((bestlength == 5) && (bestoffset <= 0x00003fff)) || ((bestlength == 6) && (bestoffset <= 0x001fffff)) || ((bestlength == 7) && (bestoffset <= 0x0fffffff)) ) { out[ outpos ++ ] = (unsigned char) marker; outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] ); outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] ); inpos += bestlength; bytesleft -= bestlength; } else { /* Output single byte (or two bytes if marker byte) */ symbol = in[ inpos ++ ]; out[ outpos ++ ] = symbol; if( symbol == marker ) { out[ outpos ++ ] = 0; } -- bytesleft; } } while( bytesleft > 3 ); /* Dump remaining bytes, if any */ while( inpos < insize ) { if( in[ inpos ] == marker ) { out[ outpos ++ ] = marker; out[ outpos ++ ] = 0; } else { out[ outpos ++ ] = in[ inpos ]; } ++ inpos; } return outpos; } /************************************************************************* * LZ_Uncompress() - Uncompress a block of data using an LZ77 decoder. * in - Input (compressed) buffer. * out - Output (uncompressed) buffer. This buffer must be large * enough to hold the uncompressed data. * insize - Number of input bytes. *************************************************************************/ int LZ_Uncompress( unsigned char *in, unsigned char *out, unsigned int insize ) { unsigned char marker, symbol; unsigned int i, inpos, outpos, length, offset; /* Do we have anything to uncompress? */ if( insize < 1 ) { return 0; } /* Get marker symbol from input stream */ marker = in[ 0 ]; inpos = 1; /* Main decompression loop */ outpos = 0; do { symbol = in[ inpos ++ ]; if( symbol == marker ) { /* We had a marker byte */ if( in[ inpos ] == 0 ) { /* It was a single occurrence of the marker byte */ out[ outpos ++ ] = marker; ++ inpos; } else { /* Extract true length and offset */ inpos += _LZ_ReadVarSize( &length, &in[ inpos ] ); inpos += _LZ_ReadVarSize( &offset, &in[ inpos ] ); /* Copy corresponding data from history window */ for( i = 0; i < length; ++ i ) { out[ outpos ] = out[ outpos - offset ]; ++ outpos; } } } else { /* No marker, plain copy */ out[ outpos ++ ] = symbol; } } while( inpos < insize ); return outpos; }