PennMUSH Community

root/1.8.3/trunk/src/comp_w8.c

Revision 919, 11.6 kB (checked in by shawnw, 1 year ago)

1.8.3p3

Line 
1 /**
2  * \file comp_w8.c
3  *
4  * \brief Word-based compression.
5  *
6  * Table-lookup (word) compress, written by Nick Gammon. 21/Oct/95.
7  * 8bit clean version by Shawn Wagner.
8  *
9  * This method maintains a table of 32768 words, where a "word" is defined
10  * as a sequence of alpha or numeric characters (such as "dog123").
11  *
12  * Compression
13  * -----------
14  *
15  * The text to be compressed is broken up into "words" and other symbols
16  * (e.g. commas, brackets, spaces and so on).
17  *
18  * Words are looked up in the word table by using a hash-table lookup. If
19  * found, the "index" into the table is emitted with the high order bit set.
20  * If not found, the word is added to the table, and is then emitted as
21  * described above.
22  *
23  * For example: "The dog and the other dog sat on the mat (eating fish)."
24  * would compress as:
25  *
26  * 0x8001      The
27  * 0x8002      dog
28  * 0x8003      and
29  * 0x8004      the
30  * 0x8005      other
31  * 0x8002      dog
32  * 0x8007      sat
33  * 0x8008      on
34  * 0x8004      the
35  * 0x8009      mat
36  * 0x28        (
37  * 0x800A      eating
38  * 0x800B      fish)
39  * 0x2E        .
40  *
41  * In the above example, the uncompressed text is 55 bytes, and the compressed
42  * text is 26 bytes.
43  *
44  * For simplicity, the above example assumes that the words hash to consecutive
45  * table positions.
46  *
47  * Note that the trailing punctuation character (space, period or whatever) is
48  * considered _part_ of the word. This is to save having to store multiple
49  * spaces between each word, and is relying on the fact that a certain word is
50  * usually followed by the same punctuation. For example, the word "and" would
51  * normally be followed by a space.
52  *
53  * In the above example, the space following "mat" is stored in the table, and
54  * thus the "(" character had to be output separately. The ")" following the
55  * word "fish" is considered part of the word and is stored in the table,
56  * however the last period was output as a separate character.
57  *
58  * Note how the high-order bit is turned on for words in the table, this is so
59  * the decompression routine can detect table lookup characters from ordinary
60  * ones. Also, any table index with a low-order byte of zero cannot be used,
61  * as this would cause the resulting "string" to be prematurely truncated (by
62  * strcpy, strcmp etc.). Thus the lookup routine skips any positions that hash
63  * to the value 0xXX00. (There are only 256 such indices).
64  *
65  * Also, to prevent characters which the user might type in with the high order
66  * bit set causing decompression confusion, all text is stripped of its high
67  * order bit before being added to the table or emitted.
68  *
69  * In the event of two words hashing to the same value, the compression routine
70  * scans forwards for COLLISION_LIMIT entries, looking for a match or a spare
71  * table entry. If none is found, the word is output "as is" (i.e.
72  * uncompressed). You might speed up compression slightly by lowering
73  * COLLISION_LIMIT, at the cost of slightly lower compression ratios.
74  *
75  * The collision limit does not affect _decompression_ as that merely involves
76  * a direct table lookup.
77  *
78  * Decompression
79  * -------------
80  *
81  * Decompression involves a simple loop, in which each byte of the compressed
82  * text is examined. If it has the high-order bit clear, it is just emitted.
83  * If the high order bit is set it, along with the next byte in the compressed
84  * text, are used to index into the word lookup table. The word found there is
85  * then emitted.
86  *
87  * Speed
88  * -----
89  *
90  * Tests conducted on (hopefully typical) text show that decompression is about
91  * 4 times as fast as Huffman decompression.
92  *
93  * Compression however is about 3.5 times as _slow_ as Huffman compression.
94  *
95  * The slower compression time is considered acceptable on the grounds that
96  * text is much more often _decompressed_ in a MUSH than compressed. Compression
97  * mainly takes part at database load time (say, once a week) whereas
98  * decompression take part every hour, as the database is dumped to disk, and
99  * whenever an object description is displayed, or an attribute searched for,
100  *
101  * Compression ratio
102  * -----------------
103  *
104  * The "table compression" method has a poor compression ratio for small amounts
105  * of text, because of the overhead of the 32768 pointers, and the data stored
106  * in them. However, as the database size increases, the ratio improves because
107  * the table overhead becomes progressively less significant.
108  *
109  * The break-even points is with about 1.5 Mb of text, where both the table
110  * compression and Huffman compress to about 63% of the size of the original.
111  *
112  * After that, the compression ratio gradually improves until reaching somewhere
113  * between 40% and 50% of the size of the original, as the amount of text to
114  * compress reaches 10 Mb.
115  *
116  * The nature of Huffman compression however is such that it will always be
117  * fixed at about 63% regardless of the amount of data compressed.
118  */
119
120 #include "copyrite.h"
121 #include "config.h"
122 #include <string.h>
123 #include <stdlib.h>
124 #include <ctype.h>
125 #include "conf.h"
126 #include "externs.h"
127 #include "mushdb.h"
128 #include "mymalloc.h"
129 #include "confmagic.h"
130
131 #define MAXTABLE 32768          /**< Maximum words in the table */
132 #define MAXWORDS 100            /**< Maximum length of a word */
133 #define COLLISION_LIMIT 20      /**< Maximum allowed collisions */
134
135 #define COMPRESS_HASH_MASK 0x7FFF       /**< 32767 in hex */
136
137 #define MARKER_CHAR 0x06        /**< Separates words. This char is the only one that can't be represented */
138 #define TABLE_FLAG 0x80         /**< Distinguishes a table */
139 #define TABLE_MASK 0x7F         /**< Mask out words within a table */
140
141 /* Table of words */
142
143 static char *words[MAXTABLE];
144 static size_t words_len[MAXTABLE];
145
146 /* The word we are currently compressing */
147
148 static char word[MAXWORDS + 2];
149 static size_t wordpos = 0;
150
151 /* Stats */
152 #ifdef COMP_STATS
153 static long total_mallocs = 0;
154 static long total_uncomp = 0;
155 static long total_comp = 0;
156 static long total_entries = 0;
157 #endif
158
159 /* Work pointer for compression */
160
161 static unsigned char *b;
162
163 static void output_previous_word(void);
164 int init_compress(FILE * f);
165 #ifdef COMP_STATS
166 void compress_stats(long *entries, long *mem_used,
167                     long *total_uncompressed, long *total_compressed);
168 #endif
169 static unsigned int hash_fn(const char *s, int hashtab_mask);
170
171 static void
172 output_previous_word(void)
173 {
174   char *p;
175   int i, j;
176
177   word[wordpos++] = 0;          /* word's trailing null */
178
179 /* Don't bother putting few-letter words in the table */
180
181   if (wordpos <= 4) {
182     p = word;
183     while (*p)
184       *b++ = *p++;
185     return;
186   }
187 /* search table to see if word is already in it; */
188
189   for (i = hash_fn(word, COMPRESS_HASH_MASK), j = 0;
190        i < MAXTABLE &&
191        (words[i] || (i & 0xFF) == 0) && j < COLLISION_LIMIT; i++, j++)
192     if (words[i])
193       if (strcmp(word, words[i]) == 0) {
194         *b++ = MARKER_CHAR;
195         *b++ = (i >> 8) | TABLE_FLAG;
196         *b++ = i & 0xFF;
197         return;
198       }
199 /* not in table, add to it */
200
201   if ((i & 0xFF) == 0) {
202     i++;                        /* make sure we don't have a null in the message */
203     j++;
204   }
205 /* Can't add to table if full */
206
207   if (i >= MAXTABLE || j >= COLLISION_LIMIT) {
208     p = word;
209     while (*p)
210       *b++ = *p++;
211     return;
212   }
213   words[i] = malloc(wordpos);
214
215   if (!words[i])
216     mush_panic("Out of memory in string compression routine");
217
218 #ifdef COMP_STATS
219   total_mallocs += wordpos;
220   total_entries++;
221 #endif
222
223   strncpy(words[i], word, wordpos);
224   words_len[i] = wordpos;
225
226   *b++ = MARKER_CHAR;
227   *b++ = (i >> 8) | TABLE_FLAG;
228   *b++ = i & 0xFF;
229
230 }                               /* end of output_previous_word */
231
232 /** Word-compress a string.
233  *
234  * Important notes:
235  *   This function mallocs memory that should be freed by the caller!
236  *   The caller is also currently responsible for adding mem checks
237  *   Don't use it to compress strings longer than BUFFER_LEN or the
238  *     later uncompression will not go well.
239  *
240  * \param s string to be compressed.
241  * \return newly allocated compressed string.
242  */
243 unsigned char *
244 text_compress(char const *s)
245 {
246   const unsigned char *p;
247   static unsigned char buf[BUFFER_LEN];
248
249   p = (unsigned char *) s;
250   b = buf;
251
252   wordpos = 0;
253
254 /* break up input into words */
255   while (*p) {
256     if (!isalnum(*p) || wordpos >= MAXWORDS) {
257       if (wordpos) {
258         word[wordpos++] = *p;   /* add trailing punctuation */
259         output_previous_word();
260         wordpos = 0;
261       } else
262         *b++ = *p;
263     } else
264       word[wordpos++] = *p;
265     p++;
266   }
267
268   if (wordpos)
269     output_previous_word();
270
271   *b = 0;                       /* trailing null */
272
273 #ifdef COMP_STATS
274   total_comp += u_strlen(buf);  /* calculate size of compressed   text */
275   total_uncomp += strlen(s);    /* calculate size of uncompressed text */
276 #endif
277
278   return u_strdup(buf);
279 }                               /* end of compress; */
280
281
282 /** Word-uncompress a string.
283  * To avoid generating memory problems, this function should be
284  * used with something of the format
285  * \verbatim
286  * char tbuf1[BUFFER_LEN];
287  * strcpy(tbuf1, text_uncompress(a->value));
288  * \endverbatim
289  * if you are using something of type char *buff, use the
290  * safe_uncompress function instead.
291  *
292  * \param s a compressed string.
293  * \return a pointer to a static buffer containing the uncompressed string.
294  */
295 char *
296 text_uncompress(unsigned char const *s)
297 {
298
299   const unsigned char *p;
300   char c;
301   int i;
302   static char buf[BUFFER_LEN];
303
304   buf[0] = '\0';
305   if (!s || !*s)
306     return buf;
307   p = (unsigned char *) s;
308   b = (unsigned char *) buf;
309
310   while (*p) {
311     c = *p;
312     if (c == MARKER_CHAR) {
313       p++;
314       c = *p;
315       i = ((c & TABLE_MASK) << 8) | *(++p);
316       if (i >= MAXTABLE || words[i] == NULL) {
317         static int panicking = 0;
318         if (panicking) {
319           fprintf(stderr,
320                   "Error in string decompression occurred during panic dump.\n");
321           exit(1);
322         } else {
323           panicking = 1;        /* don't panic from within panic */
324           fprintf(stderr, "Error in string decompression, i = %i\n", i);
325           mush_panic("Fatal error in decompression");
326         }
327       }
328       strncpy((char *) b, words[i], words_len[i]);
329       b += words_len[i] - 1;
330     } else
331       *b++ = c;
332     p++;
333   }
334
335   *b++ = 0;                     /* trailing null */
336
337   return buf;
338
339 }                               /* end of uncompress; */
340
341 /** Word-uncompress a string, allocating memory.
342  * this function should be used when you're doing something like
343  * \verbatim
344  * char *attrib = safe_uncompress(a->value);
345  *
346  * NEVER use it with something like
347  *
348  * char tbuf1[BUFFER_LEN];
349  * strcpy(tbuf1, safe_uncompress(a->value));
350  * \endverbatim
351  * or you will create a horrendous memory leak.
352  *
353  * \param s compressed string to uncompress.
354  * \return pointer to newly allocated string containing uncompressed text.
355  */
356 char *
357 safe_uncompress(unsigned char const *s)
358 {
359   return (char *) strdup((char *) uncompress(s));
360 }
361
362
363 /** Initialize the word compression.
364  * This function clears the words table the first time through.
365  * \param f (unused).
366  */
367 int
368 init_compress(FILE * f __attribute__ ((__unused__)))
369 {
370   memset(words, 0, sizeof words);
371   memset(words_len, 0, sizeof words_len);
372   return 0;
373 }
374
375 #ifdef COMP_STATS
376 /** Return word-compression statistics.
377  */
378 void
379 compress_stats(long *entries, long *mem_used, long *total_uncompressed,
380                long *total_compressed)
381 {
382
383   *entries = total_entries;
384   *mem_used = total_mallocs;
385   *total_uncompressed = total_uncomp;
386   *total_compressed = total_comp;
387
388 }
389 #endif
390
391 static unsigned
392 hash_fn(const char *s, int hashtab_mask)
393 {
394   /* hash function, using masks (based on TinyMUSH 2.0) */
395
396   unsigned hashval;
397   const char *p;
398
399   for (hashval = 0, p = s; *p; p++)
400     hashval = (hashval << 5) + hashval + *p;
401   return (hashval & hashtab_mask);
402 }
Note: See TracBrowser for help on using the browser.