PennMUSH Community

root/1.8.3/trunk/src/comp_w.c

Revision 919, 11.4 kB (checked in by shawnw, 1 year ago)

1.8.3p3

Line 
1 /**
2  * \file comp_w.c
3  *
4  * \brief Word-based compression.
5  *
6  * Table-lookup (word) compress, written by Nick Gammon. 21/Oct/95.
7  *
8  * This method maintains a table of 32768 words, where a "word" is defined
9  * as a sequence of alpha or numeric characters (such as "dog123").
10  *
11  * Compression
12  * -----------
13  *
14  * The text to be compressed is broken up into "words" and other symbols
15  * (e.g. commas, brackets, spaces and so on).
16  *
17  * Words are looked up in the word table by using a hash-table lookup. If
18  * found, the "index" into the table is emitted with the high order bit set.
19  * If not found, the word is added to the table, and is then emitted as
20  * described above.
21  *
22  * For example: "The dog and the other dog sat on the mat (eating fish)."
23  * would compress as:
24  *
25  * 0x8001      The
26  * 0x8002      dog
27  * 0x8003      and
28  * 0x8004      the
29  * 0x8005      other
30  * 0x8002      dog
31  * 0x8007      sat
32  * 0x8008      on
33  * 0x8004      the
34  * 0x8009      mat
35  * 0x28        (
36  * 0x800A      eating
37  * 0x800B      fish)
38  * 0x2E        .
39  *
40  * In the above example, the uncompressed text is 55 bytes, and the compressed
41  * text is 26 bytes.
42  *
43  * For simplicity, the above example assumes that the words hash to consecutive
44  * table positions.
45  *
46  * Note that the trailing punctuation character (space, period or whatever) is
47  * considered _part_ of the word. This is to save having to store multiple
48  * spaces between each word, and is relying on the fact that a certain word is
49  * usually followed by the same punctuation. For example, the word "and" would
50  * normally be followed by a space.
51  *
52  * In the above example, the space following "mat" is stored in the table, and
53  * thus the "(" character had to be output separately. The ")" following the
54  * word "fish" is considered part of the word and is stored in the table,
55  * however the last period was output as a separate character.
56  *
57  * Note how the high-order bit is turned on for words in the table, this is so
58  * the decompression routine can detect table lookup characters from ordinary
59  * ones. Also, any table index with a low-order byte of zero cannot be used,
60  * as this would cause the resulting "string" to be prematurely truncated (by
61  * strcpy, strcmp etc.). Thus the lookup routine skips any positions that hash
62  * to the value 0xXX00. (There are only 256 such indices).
63  *
64  * Also, to prevent characters which the user might type in with the high order
65  * bit set causing decompression confusion, all text is stripped of its high
66  * order bit before being added to the table or emitted.
67  *
68  * In the event of two words hashing to the same value, the compression routine
69  * scans forwards for COLLISION_LIMIT entries, looking for a match or a spare
70  * table entry. If none is found, the word is output "as is" (i.e.
71  * uncompressed). You might speed up compression slightly by lowering
72  * COLLISION_LIMIT, at the cost of slightly lower compression ratios.
73  *
74  * The collision limit does not affect _decompression_ as that merely involves
75  * a direct table lookup.
76  *
77  * Decompression
78  * -------------
79  *
80  * Decompression involves a simple loop, in which each byte of the compressed
81  * text is examined. If it has the high-order bit clear, it is just emitted.
82  * If the high order bit is set it, along with the next byte in the compressed
83  * text, are used to index into the word lookup table. The word found there is
84  * then emitted.
85  *
86  * Speed
87  * -----
88  *
89  * Tests conducted on (hopefully typical) text show that decompression is about
90  * 4 times as fast as Huffman decompression.
91  *
92  * Compression however is about 3.5 times as _slow_ as Huffman compression.
93  *
94  * The slower compression time is considered acceptable on the grounds that
95  * text is much more often _decompressed_ in a MUSH than compressed. Compression
96  * mainly takes part at database load time (say, once a week) whereas
97  * decompression take part every hour, as the database is dumped to disk, and
98  * whenever an object description is displayed, or an attribute searched for,
99  *
100  * Compression ratio
101  * -----------------
102  *
103  * The "table compression" method has a poor compression ratio for small amounts
104  * of text, because of the overhead of the 32768 pointers, and the data stored
105  * in them. However, as the database size increases, the ratio improves because
106  * the table overhead becomes progressively less significant.
107  *
108  * The break-even points is with about 1.5 Mb of text, where both the table
109  * compression and Huffman compress to about 63% of the size of the original.
110  *
111  * After that, the compression ratio gradually improves until reaching somewhere
112  * between 40% and 50% of the size of the original, as the amount of text to
113  * compress reaches 10 Mb.
114  *
115  * The nature of Huffman compression however is such that it will always be
116  * fixed at about 63% regardless of the amount of data compressed.
117  */
118
119 #include "copyrite.h"
120 #include "config.h"
121 #include <string.h>
122 #include <stdlib.h>
123 #include <ctype.h>
124 #include "conf.h"
125 #include "externs.h"
126 #include "mushdb.h"
127 #include "mymalloc.h"
128 #include "confmagic.h"
129
130 #define MAXTABLE 32768          /**< Maximum words in the table */
131 #define MAXWORDS 100            /**< Maximum size of a word */
132 #define COLLISION_LIMIT 20      /**< Maximum allowed collisions */
133
134 #define COMPRESS_HASH_MASK 0x7FFF       /**< 32767 in hex */
135
136 #define TABLE_FLAG 0x80         /**< Distinguish a table */
137 #define TABLE_MASK 0x7F         /**< Mask out words within a table */
138
139 /* Table of words */
140
141 static char *words[MAXTABLE];
142 static size_t words_len[MAXTABLE];
143
144 /* The word we are currently compressing */
145
146 static char word[MAXWORDS + 2];
147 static size_t wordpos = 0;
148
149 /* Stats */
150 #ifdef COMP_STATS
151 static long total_mallocs = 0;
152 static long total_uncomp = 0;
153 static long total_comp = 0;
154 static long total_entries = 0;
155 #endif
156
157 /* Work pointer for compression */
158
159 static unsigned char *b;
160
161 static void output_previous_word(void);
162 int init_compress(FILE * f);
163 #ifdef COMP_STATS
164 void compress_stats(long *entries, long *mem_used,
165                     long *total_uncompressed, long *total_compressed);
166 #endif
167 static unsigned int hash_fn(const char *s, int hashtab_mask);
168
169 static void
170 output_previous_word(void)
171 {
172   char *p;
173   int i, j;
174
175   word[wordpos++] = 0;          /* word's trailing null */
176
177   /* Don't bother putting single or double letter words in the table */
178
179   if (wordpos <= 3) {
180     p = word;
181     while (*p)
182       *b++ = *p++;
183     return;
184   }
185   /* search table to see if word is already in it; */
186
187   for (i = hash_fn(word, COMPRESS_HASH_MASK), j = 0;
188        i < MAXTABLE &&
189        (words[i] || (i & 0xFF) == 0) && j < COLLISION_LIMIT; i++, j++)
190     if (words[i])
191       if (strcmp(word, words[i]) == 0) {
192         *b++ = (i >> 8) | TABLE_FLAG;
193         *b++ = i & 0xFF;
194         return;
195       }
196   /* not in table, add to it */
197
198   if ((i & 0xFF) == 0) {
199     i++;                        /* make sure we don't have a null in the message */
200     j++;
201   }
202   /* Can't add to table if full */
203
204   if (i >= MAXTABLE || j >= COLLISION_LIMIT) {
205     p = word;
206     while (*p)
207       *b++ = *p++;
208     return;
209   }
210   words[i] = malloc(wordpos);
211
212   if (!words[i])
213     mush_panic("Out of memory in string compression routine");
214
215 #ifdef COMP_STATS
216   total_mallocs += wordpos;
217   total_entries++;
218 #endif
219
220   strncpy(words[i], word, wordpos);
221   words_len[i] = wordpos;
222
223   *b++ = (i >> 8) | TABLE_FLAG;
224   *b++ = i & 0xFF;
225
226 }                               /* end of output_previous_word */
227
228 /** Word-compress a string.
229  *
230  * Important notes:
231  *   This function mallocs memory that should be freed by the caller!
232  *   The caller is also currently responsible for adding mem checks
233  *   Don't use it to compress strings longer than BUFFER_LEN or the
234  *     later uncompression will not go well.
235  *
236  * \param s string to be compressed.
237  * \return newly allocated compressed string.
238  */
239 unsigned char *
240 text_compress(char const *s)
241 {
242   const unsigned char *p;
243   static unsigned char buf[BUFFER_LEN];
244
245   p = (unsigned char *) s;
246   b = buf;
247
248   wordpos = 0;
249
250 /* break up input into words */
251   while (*p) {
252     if (!isalnum(*p) || wordpos >= MAXWORDS) {
253       if (wordpos) {
254         word[wordpos++] = *p & 0x7F;    /* add trailing punctuation */
255         output_previous_word();
256         wordpos = 0;
257       } else
258         *b++ = *p & 0x7F;
259     } else
260       word[wordpos++] = *p & 0x7F;
261     p++;
262   }
263
264   if (wordpos)
265     output_previous_word();
266
267   *b = 0;                       /* trailing null */
268
269 #ifdef COMP_STATS
270   total_comp += u_strlen(buf);  /* calculate size of compressed   text */
271   total_uncomp += strlen(s);    /* calculate size of uncompressed text */
272 #endif
273
274   return u_strdup(buf);
275 }                               /* end of compress; */
276
277
278 /** Word-uncompress a string.
279  * To avoid generating memory problems, this function should be
280  * used with something of the format
281  * \verbatim
282  * char tbuf1[BUFFER_LEN];
283  * strcpy(tbuf1, text_uncompress(a->value));
284  * \endverbatim
285  * if you are using something of type char *buff, use the
286  * safe_uncompress function instead.
287  *
288  * \param s a compressed string.
289  * \return a pointer to a static buffer containing the uncompressed string.
290  */
291 char *
292 text_uncompress(unsigned char const *s)
293 {
294
295   const unsigned char *p;
296   char c;
297   int i;
298   static char buf[BUFFER_LEN];
299
300   buf[0] = '\0';
301   if (!s || !*s)
302     return buf;
303   p = (unsigned char *) s;
304   b = buf;
305
306   while (*p) {
307     c = *p;
308     if (c & TABLE_FLAG) {
309       i = ((c & TABLE_MASK) << 8) | *(++p);
310       if (i >= MAXTABLE || words[i] == NULL) {
311         static int panicking = 0;
312         if (panicking) {
313           do_rawlog(LT_ERR,
314                     "Error in string decompression occurred during panic dump.");
315           exit(1);
316         } else {
317           panicking = 1;        /* don't panic from within panic */
318           do_rawlog(LT_ERR, "Error in string decompression, i = %i", i);
319           mush_panic("Fatal error in decompression");
320         }
321       }
322       strncpy((char *) b, words[i], words_len[i]);
323       b += words_len[i] - 1;
324     } else
325       *b++ = c;
326     p++;
327   }
328
329   *b++ = 0;                     /* trailing null */
330
331   return buf;
332
333 }                               /* end of uncompress; */
334
335 /** Word-uncompress a string, allocating memory.
336  * this function should be used when you're doing something like
337  * \verbatim
338  * char *attrib = safe_uncompress(a->value);
339  *
340  * NEVER use it with something like
341  *
342  * char tbuf1[BUFFER_LEN];
343  * strcpy(tbuf1, safe_uncompress(a->value));
344  * \endverbatim
345  * or you will create a horrendous memory leak.
346  *
347  * \param s compressed string to uncompress.
348  * \return pointer to newly allocated string containing uncompressed text.
349  */
350 char *
351 safe_uncompress(unsigned char const *s)
352 {
353   return (char *) strdup((char *) uncompress(s));
354 }
355
356
357 /** Initialize the word compression.
358  * This function clears the words table the first time through.
359  * \param f (unused).
360  */
361 int
362 init_compress(FILE * f __attribute__ ((__unused__)))
363 {
364   memset(words, 0, sizeof words);
365   memset(words_len, 0, sizeof words_len);
366   return 0;
367 }
368
369 #ifdef COMP_STATS
370 /** Return word-compression statistics.
371  */
372 void
373 compress_stats(long *entries, long *mem_used, long *total_uncompressed,
374                long *total_compressed)
375 {
376
377   *entries = total_entries;
378   *mem_used = total_mallocs;
379   *total_uncompressed = total_uncomp;
380   *total_compressed = total_comp;
381
382 }
383 #endif
384
385 static unsigned
386 hash_fn(const char *s, int hashtab_mask)
387 {
388   /* hash function, using masks (based on TinyMUSH 2.0) */
389
390   unsigned hashval;
391   const char *p;
392
393   for (hashval = 0, p = s; *p; p++)
394     hashval = (hashval << 5) + hashval + *p;
395   return (hashval & hashtab_mask);
396 }
Note: See TracBrowser for help on using the browser.