PennMUSH Community

root/1.8.3/trunk/src/comp_h.c

Revision 1032, 16.6 kB (checked in by shawnw, 1 year ago)

Merged 1.8.3p4 into trunk

Line 
1 /**
2  * \file comp_h.c
3  *
4  * \brief Huffman compression routines.
5  *
6  * One of several options for attribute compression. Usually the best.
7  * Talek's rewrite of compress.c, using a Huffman compression routine.
8  * This routine adds some time to the MUSH startup, since it reads a file
9  * in order to auto-tune the compression at each restart. The SAMPLE_SIZE
10  * define can trade efficiency for speed.
11  * This rewrite was inspired by Javelin's rewrite on a similar vein.
12  * Most of the comments are his. The nasty ones are Talek's.
13  *
14  */
15
16 /* Compression routines */
17 #include "copyrite.h"
18 #include "config.h"
19 #include <string.h>
20 #include <stdlib.h>
21 #include <ctype.h>
22 #include "conf.h"
23 #include "externs.h"
24 #include "mushdb.h"
25 #include "mymalloc.h"
26 #include "confmagic.h"
27 #ifdef WIN32
28 #pragma warning( disable : 4244)        /* NJG: disable warning re conversion */
29 #endif
30
31 #define TABLE_SIZE      256     /**< allow all characters */
32 #define EOS             0       /**< use null code for end of string */
33 #define CHAR_BITS       8       /**< number of bits in char */
34 #define CHAR_MASK       255     /**< mask for just one char */
35 #define CODE_BITS       25      /**< max number of bits in code */
36 #ifndef SAMPLE_SIZE
37 #define SAMPLE_SIZE     0       /**< sample entire database */
38 #endif
39
40
41 /** Type for a huffman code. It must be at least CODE_BITS+CHAR_BITS-1
42  * bits long.
43  */
44 typedef unsigned long CType;
45
46 /** A node in the huffman compression tree. */
47 typedef struct cnode {
48   struct cnode *left;           /**< Left child node. */
49   struct cnode *right;          /**< Right child node. */
50   unsigned char c;              /**< character at this node. */
51 } CNode;
52
53 static CNode *ctop;
54 static CType ctable[TABLE_SIZE];
55 static char ltable[TABLE_SIZE];
56
57 slab *huffman_slab = NULL;
58
59 static int fix_tree_depth(CNode *node, int height, int zeros);
60 static void add_ones(CNode *node);
61 static void build_ctable(CNode *root, CType code, int numbits);
62 int init_compress(FILE * f);
63
64
65 /** Huffman-compress a string.
66  * Compress a string: this is pretty easy. For each char in the string,
67  * look up its code in ctable and add it to the compressed string we
68  * build, keeping careful track of the number of bits we add.
69  * Then stick the EOS character at the end.
70  *
71  * Important notes:
72  *   This function mallocs memory that should be freed by the caller!
73  *   The caller is also currently responsible for adding mem checks
74  *   Don't use it to compress strings longer than BUFFER_LEN or the
75  *     later uncompression will not go well.
76  *
77  * \param s string to be compressed.
78  * \return newly allocated compressed string.
79  */
80 unsigned char *
81 text_compress(const char *s)
82 {
83   CType stage;
84   int bits = 0;
85   const unsigned char *p;
86   unsigned char *b, *buf;
87   int needed_length;
88
89   /* Part 1 - how long will the compressed string be? */
90   for (p = (const unsigned char *) s; p && *p; p++)
91     bits += ltable[*p];
92   bits += CHAR_BITS * 2 - 1;    /* add space for the ending \0 */
93   needed_length = bits / CHAR_BITS;
94
95   /* Part 2 - Actually get around to compressing the data... */
96   p = (const unsigned char *) s;
97   b = buf = (unsigned char *) malloc(needed_length);
98   stage = 0;
99   bits = 0;
100
101   while (p && *p) {
102     /* Put code on stage */
103     stage |= ctable[*p] << bits;
104     bits += ltable[*p];
105     /* Put any full bytes of stage into the compressed string */
106     while (bits >= CHAR_BITS) {
107       *b++ = stage & CHAR_MASK;
108       stage = stage >> CHAR_BITS;
109       bits -= CHAR_BITS;
110     }
111     p++;
112   }
113   /* Put in EOS, and put the rest of the stage into the compressed string */
114   /* This relies on EOS == 00000000 */
115   bits += ltable[EOS] + CHAR_BITS - 1;
116   while (bits >= CHAR_BITS) {
117     *b++ = stage & CHAR_MASK;
118     stage = stage >> CHAR_BITS;
119     bits -= CHAR_BITS;
120   }
121
122   return buf;
123 }
124
125 /** Walk the huffman tree.
126  * This macro is used for walking the compression tree.
127  * It is a macro for efficiency.
128  */
129 #define WALK_TREE(bitpos) \
130 do { \
131   if (*p & (bitpos)) \
132     node = node->right; \
133   else \
134     node = node->left; \
135   if (!node->left && !node->right) { \
136     /* Got a char */ \
137     *b++ = node->c; \
138     if (!*p || ((long)(b - buf) >= (long)(sizeof(buf) - 1))) { \
139       *b++ = EOS; \
140       return buf; \
141     } \
142     if (node->c == EOS) \
143       return buf; \
144     node = ctop; \
145   } \
146 } while (0)
147
148 /** Huffman uncompress a string.
149  * Uncompression is a snap, too. Go bit by bit, using the
150  * bits to traverse the binary tree (0=left, 1=right) until reaching
151  * a leaf node, which is the uncompressed character.
152  * Stop when the leaf node turns out to be EOS.
153  *
154  * To avoid generating memory problems, this function should be
155  * used with something of the format
156  * \verbatim
157  * char tbuf1[BUFFER_LEN];
158  * strcpy(tbuf1, text_uncompress(a->value));
159  * \endverbatim
160  * if you are using something of type char *buff, use the
161  * safe_uncompress function instead.
162  *
163  * \param s a compressed string.
164  * \return a pointer to a static buffer containing the uncompressed string.
165  */
166 char *
167 text_uncompress(const unsigned char *s)
168 {
169
170   static char buf[BUFFER_LEN];
171   const unsigned char *p;
172   char *b;
173   CNode *node;
174
175   buf[0] = '\0';
176   if (!s || !*s)
177     return buf;
178   p = s;
179   b = buf;
180   /* Finally start decompressing the string... */
181   node = ctop;
182   for (;;) {
183     WALK_TREE(1);
184     WALK_TREE(2);
185     WALK_TREE(4);
186     WALK_TREE(8);
187     WALK_TREE(16);
188     WALK_TREE(32);
189     WALK_TREE(64);
190     WALK_TREE(128);
191     p++;
192   }
193 }
194
195 /** Huffman uncompress a string, allocating memory.
196  * this function should be used when you're doing something like
197  * \verbatim
198  * char *attrib = safe_uncompress(a->value);
199  *
200  * NEVER use it with something like
201  *
202  * char tbuf1[BUFFER_LEN];
203  * strcpy(tbuf1, safe_uncompress(a->value));
204  * \endverbatim
205  * or you will create a horrendous memory leak.
206  *
207  * \param s compressed string to uncompress.
208  * \return pointer to newly allocated string containing uncompressed text.
209  */
210 char *
211 safe_uncompress(unsigned char const *s)
212 {
213   return (char *) strdup((char *) uncompress(s));
214 }
215
216
217 static int
218 fix_tree_depth(CNode *node, int height, int zeros)
219 {
220   int a, b;
221   CNode *temp;
222
223   if (!node)
224     return height + (zeros > 2);
225   a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
226   b = fix_tree_depth(node->right, height + 1, 0);
227   if ((a > CODE_BITS) && (b < (a - 1))) {
228 #ifdef STANDALONE
229     printf("Rotate right at depth %d.\n", height);
230 #endif
231     temp = node->right;
232     node->right = node->left;
233     node->left = node->right->left;
234     node->right->left = node->right->right;
235     node->right->right = temp;
236     a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
237     b = fix_tree_depth(node->right, height + 1, 0);
238   } else if ((b > CODE_BITS) && (a < (b - 1))) {
239 #ifdef STANDALONE
240     printf("Rotate left at depth %d.\n", height);
241 #endif
242     temp = node->left;
243     node->left = node->right;
244     node->right = node->left->right;
245     node->left->right = node->left->left;
246     node->left->left = temp;
247     a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
248     b = fix_tree_depth(node->right, height + 1, 0);
249   }
250   return ((a > b) ? a : b);
251 }
252
253 /* Add 1s to the tree, recursively */
254 static void
255 add_ones(CNode *node)
256 {
257   int count;
258
259   count = 0;
260   do {
261     if (node->right)
262       add_ones(node->right);
263     if ((count >= 7) || ((count >= 3) && !node->left && !node->right)) {
264       ctop = slab_malloc(huffman_slab, node);
265       if (!ctop) {
266         do_rawlog(LT_ERR,
267                   "Cannot allocate memory for compression tree. Aborting.");
268         exit(1);
269       }
270       ctop->left = node->left;
271       ctop->right = node->right;
272       ctop->c = node->c;
273       node->left = (CNode *) NULL;
274       node->right = ctop;
275       node = ctop;
276       count = 0;
277     }
278     node = node->left;
279     count++;
280   } while (node);
281 }
282
283 /* Build ctable and ltable from the tree, recursively */
284 static void
285 build_ctable(CNode *root, CType code, int numbits)
286 {
287 #ifdef STANDALONE
288   int i;
289 #endif
290
291   if (!root->left && !root->right) {
292     ctable[root->c] = code;
293     ltable[root->c] = numbits;
294 #ifdef STANDALONE
295     printf(isprint(root->c) ? "Code for '%c':\t" : "Code for %d:\t", root->c);
296     for (i = 0; i < numbits; i++)
297       printf("%d", (code >> i) & 1);
298     printf("\n");
299 #endif
300     if (numbits > CODE_BITS) {
301       do_rawlog(LT_ERR, "Illegal compression code length (%d). Aborting.",
302                 numbits);
303       exit(1);
304     }
305   } else {
306     if (root->left)
307       build_ctable(root->left, code | (0 << numbits), numbits + 1);
308     if (root->right)
309       build_ctable(root->right, code | (1 << numbits), numbits + 1);
310   }
311 }
312
313 /** Initialize huffman compression.
314  * Initialize the compression tree and table in 5 steps:
315  * 1. Initialize arrays and things
316  * 2. Read indb (up to SAMPLE_SIZE chars, if defined) and count
317  *    the frequency of every character
318  * 3. Cheat the relative frequency of some known special chars
319  *    and upper-case letters
320  * 4. Construct an (un)compression tree based on frequencies
321  * 5. Construct a compression table by searching the tree
322  * \param f filehandle to read from to build the tree.
323  */
324 int
325 init_compress(FILE * f)
326 {
327   int total;
328   unsigned char c;
329   struct {
330     long freq;
331     CNode *node;
332   } table[TABLE_SIZE];
333   int indx, count;
334   long temp;
335   CNode *node;
336
337 #ifdef STANDALONE
338   printf("init_compress: Part 1\n");
339 #endif
340
341   huffman_slab = slab_create("huffman attribute compression", sizeof(CNode));
342   slab_set_opt(huffman_slab, SLAB_ALLOC_BEST_FIT, 1);
343
344   /* Part 1: initialize */
345   for (total = 0; total < TABLE_SIZE; total++) {
346     table[total].freq = 0;
347     table[total].node = slab_malloc(huffman_slab, NULL);
348     if (!table[total].node) {
349       do_rawlog(LT_ERR,
350                 "Cannot allocate memory for compression tree. Aborting.");
351       exit(1);
352     }
353     table[total].node->c = total;
354     table[total].node->left = (CNode *) NULL;
355     table[total].node->right = (CNode *) NULL;
356   }
357
358 #ifdef STANDALONE
359   printf("init_compress: Part 2\n");
360 #endif
361
362   /* Part 2: count frequencies */
363   if (f) {
364     total = 0;
365     while (!feof(f) && (!SAMPLE_SIZE || (total++ < SAMPLE_SIZE))) {
366       c = fgetc(f);
367       table[c].freq++;
368     }
369   }
370 #ifdef STANDALONE
371   for (indx = 0; indx < TABLE_SIZE; indx++) {
372     printf(isprint(indx) ? "Frequency for '%c': %d\n"
373            : "Frequency for %d: %d\n", (unsigned char) indx, table[indx].freq);
374   }
375 #endif
376
377 #ifdef STANDALONE
378   printf("init_compress: Part 3\n");
379 #endif
380
381   /* Part 3: Cheat the frequencies. Because there's a lot of wierd
382    * stuff in indb (like ]'s and upper-case letters), we downplay it
383    * by cutting frequencies. Actually, we shouldn't need to much.
384    */
385
386   /* The ']' character is artificially raised by being the
387    * start-of-attribute marker in indb.  Set it back to '[',
388    * which it should be balancing...
389    */
390   table[']'].freq = table['['].freq;
391
392   /* The DEL character is returned once for no apparent reason (I think
393    * it is returned at EOF), so remove that one count...
394    */
395   if (table[255].freq)
396     table[255].freq--;
397
398   /* Newlines really aren't all that common in the attributes, so
399    * chop the value substantially.
400    */
401   table['\n'].freq /= 16;
402
403 #ifdef STANDALONE
404   printf("init_compress: Part 4(a)\n");
405 #endif
406
407   /* Part 4(a): Sort the table.  I'm using a stupid insert sort here
408    * because this only gets called once, and I don't want to conform
409    * to the qsort interface.  NOTE: I don't sort in EOS.
410    */
411   for (indx = 2; indx < TABLE_SIZE; indx++) {
412     for (count = indx;
413          (count > 1) && (table[count - 1].freq < table[count].freq); count--) {
414       temp = table[count].freq;
415       table[count].freq = table[count - 1].freq;
416       table[count - 1].freq = temp;
417       node = table[count].node;
418       table[count].node = table[count - 1].node;
419       table[count - 1].node = node;
420     }
421   }
422
423 #ifdef STANDALONE
424   printf("init_compress: Part 4(b)\n");
425 #endif
426
427   /* Part 4(b): Now we've got a list sorted from most freq (table[0]) to
428    * least freq. We build a binary tree by traversing the list, making
429    * a subtree out of the two least frequent nodes (creating a parent
430    * node whose frequency is the sum of its children's frequency),
431    * and reinserting the subtree's parent into the list. We repeat
432    * until there's only one node in the list, the root of the tree.
433    * NOTE: I'm still not dealing with EOS.
434    */
435   for (indx = TABLE_SIZE - 1; indx > 0; indx--) {
436 #ifdef NEVER
437     printf("Freq. table:\n");
438     for (count = indx; count >= 0; count--)
439       printf("%3d: %d\t", table[count].node->c, table[count].freq);
440     printf("\n");
441 #endif
442     node = slab_malloc(huffman_slab, table[indx].node);
443     if (!node) {
444       do_rawlog(LT_ERR,
445                 "Cannot allocate memory for compression tree. Aborting.");
446       exit(1);
447     }
448     node->left = table[indx].node;
449     node->right = table[indx - 1].node;
450     table[indx - 1].freq += table[indx].freq;
451     table[indx - 1].node = node;
452     for (count = indx - 1;
453          (count > 1) && (table[count - 1].freq <= table[count].freq); count--) {
454       temp = table[count].freq;
455       table[count].freq = table[count - 1].freq;
456       table[count - 1].freq = temp;
457       node = table[count].node;
458       table[count].node = table[count - 1].node;
459       table[count - 1].node = node;
460     }
461   }
462
463 #ifdef NEVER
464   build_ctable(table[1].node, 0, 0);
465 #endif
466
467 #ifdef STANDALONE
468   printf("init_compress: Part 4(c)\n");
469 #endif
470
471   /* Part 4(c): If necessary, squash the tree so that it obeys
472    * the code length limitations (CODE_BITS et all).  This is
473    * done recursively, with rotations.
474    */
475   (void) fix_tree_depth(table[1].node, 0, 2);
476
477 #ifdef STANDALONE
478   printf("init_compress: Part 4(d)\n");
479 #endif
480
481   /* Part 4(d): It is now time to insure that sequences of eight 0s
482    * never occur in the output data, because having nulls in the
483    * output would royally confuse strcpy et all.
484    */
485
486   /* Force a 1 at fifth position on the left edge of tree. (Or terminating
487    * 1 for the all 0 code.)
488    */
489   node = table[1].node;         /* top of tree */
490   for (count = 0; node->left && (count < 4); count++)
491     node = node->left;
492   ctop = slab_malloc(huffman_slab, node);
493   if (!ctop) {
494     do_rawlog(LT_ERR, "Cannot allocate memory for compression tree. Aborting.");
495     exit(1);
496   }
497   ctop->left = node->left;
498   ctop->right = node->right;
499   ctop->c = node->c;
500   node->left = (CNode *) NULL;
501   node->right = ctop;
502
503   /* Recursively descend tree adding 1s where needed. */
504
505   add_ones(table[1].node);
506
507 #ifdef STANDALONE
508   printf("init_compress: Part 4(e)\n");
509 #endif
510
511   /* Part 4(e): Finally add in EOS as 00000000.
512    */
513   node = table[1].node;         /* top of tree */
514   for (count = 0; count < 8; count++) {
515     if (!node->left) {
516       ctop = slab_malloc(huffman_slab, node);
517       if (!ctop) {
518         do_rawlog(LT_ERR,
519                   "Cannot allocate memory for compression tree. Aborting.");
520         exit(1);
521       }
522       ctop->left = (CNode *) NULL;
523       ctop->right = (CNode *) NULL;
524       ctop->c = EOS;
525       node->left = ctop;
526     }
527     node = node->left;
528   }
529
530 #ifdef STANDALONE
531   printf("init_compress: Part 5\n");
532 #endif
533
534   /* Part 5: Now traverse the tree, depth-first, and construct
535    * the compression table.
536    */
537
538   ctop = table[1].node;
539   build_ctable(ctop, 0, 0);
540
541 #ifdef STANDALONE
542   printf("init_compress: Done\n");
543 #endif
544
545   /* Whew */
546   return 0;
547 }
548
549 #ifdef STANDALONE
550 void
551 main(argc, argv)
552     int argc;
553     char *argv[];
554 {
555   FILE *input;