root/1.8.3/tags/p6/src/comp_h.c

Revision 1032, 16.6 KB (checked in by shawnw, 18 months ago)

Merged 1.8.3p4 into trunk

Line 
1/**
2 * \file comp_h.c
3 *
4 * \brief Huffman compression routines.
5 *
6 * One of several options for attribute compression. Usually the best.
7 * Talek's rewrite of compress.c, using a Huffman compression routine.
8 * This routine adds some time to the MUSH startup, since it reads a file
9 * in order to auto-tune the compression at each restart. The SAMPLE_SIZE
10 * define can trade efficiency for speed.
11 * This rewrite was inspired by Javelin's rewrite on a similar vein.
12 * Most of the comments are his. The nasty ones are Talek's.
13 *
14 */
15
16/* Compression routines */
17#include "copyrite.h"
18#include "config.h"
19#include <string.h>
20#include <stdlib.h>
21#include <ctype.h>
22#include "conf.h"
23#include "externs.h"
24#include "mushdb.h"
25#include "mymalloc.h"
26#include "confmagic.h"
27#ifdef WIN32
28#pragma warning( disable : 4244)        /* NJG: disable warning re conversion */
29#endif
30
31#define TABLE_SIZE      256     /**< allow all characters */
32#define EOS             0       /**< use null code for end of string */
33#define CHAR_BITS       8       /**< number of bits in char */
34#define CHAR_MASK       255     /**< mask for just one char */
35#define CODE_BITS       25      /**< max number of bits in code */
36#ifndef SAMPLE_SIZE
37#define SAMPLE_SIZE     0       /**< sample entire database */
38#endif
39
40
41/** Type for a huffman code. It must be at least CODE_BITS+CHAR_BITS-1
42 * bits long.
43 */
44typedef unsigned long CType;
45
46/** A node in the huffman compression tree. */
47typedef struct cnode {
48  struct cnode *left;           /**< Left child node. */
49  struct cnode *right;          /**< Right child node. */
50  unsigned char c;              /**< character at this node. */
51} CNode;
52
53static CNode *ctop;
54static CType ctable[TABLE_SIZE];
55static char ltable[TABLE_SIZE];
56
57slab *huffman_slab = NULL;
58
59static int fix_tree_depth(CNode *node, int height, int zeros);
60static void add_ones(CNode *node);
61static void build_ctable(CNode *root, CType code, int numbits);
62int init_compress(FILE * f);
63
64
65/** Huffman-compress a string.
66 * Compress a string: this is pretty easy. For each char in the string,
67 * look up its code in ctable and add it to the compressed string we
68 * build, keeping careful track of the number of bits we add.
69 * Then stick the EOS character at the end.
70 *
71 * Important notes:
72 *   This function mallocs memory that should be freed by the caller!
73 *   The caller is also currently responsible for adding mem checks
74 *   Don't use it to compress strings longer than BUFFER_LEN or the
75 *     later uncompression will not go well.
76 *
77 * \param s string to be compressed.
78 * \return newly allocated compressed string.
79 */
80unsigned char *
81text_compress(const char *s)
82{
83  CType stage;
84  int bits = 0;
85  const unsigned char *p;
86  unsigned char *b, *buf;
87  int needed_length;
88
89  /* Part 1 - how long will the compressed string be? */
90  for (p = (const unsigned char *) s; p && *p; p++)
91    bits += ltable[*p];
92  bits += CHAR_BITS * 2 - 1;    /* add space for the ending \0 */
93  needed_length = bits / CHAR_BITS;
94
95  /* Part 2 - Actually get around to compressing the data... */
96  p = (const unsigned char *) s;
97  b = buf = (unsigned char *) malloc(needed_length);
98  stage = 0;
99  bits = 0;
100
101  while (p && *p) {
102    /* Put code on stage */
103    stage |= ctable[*p] << bits;
104    bits += ltable[*p];
105    /* Put any full bytes of stage into the compressed string */
106    while (bits >= CHAR_BITS) {
107      *b++ = stage & CHAR_MASK;
108      stage = stage >> CHAR_BITS;
109      bits -= CHAR_BITS;
110    }
111    p++;
112  }
113  /* Put in EOS, and put the rest of the stage into the compressed string */
114  /* This relies on EOS == 00000000 */
115  bits += ltable[EOS] + CHAR_BITS - 1;
116  while (bits >= CHAR_BITS) {
117    *b++ = stage & CHAR_MASK;
118    stage = stage >> CHAR_BITS;
119    bits -= CHAR_BITS;
120  }
121
122  return buf;
123}
124
125/** Walk the huffman tree.
126 * This macro is used for walking the compression tree.
127 * It is a macro for efficiency.
128 */
129#define WALK_TREE(bitpos) \
130do { \
131  if (*p & (bitpos)) \
132    node = node->right; \
133  else \
134    node = node->left; \
135  if (!node->left && !node->right) { \
136    /* Got a char */ \
137    *b++ = node->c; \
138    if (!*p || ((long)(b - buf) >= (long)(sizeof(buf) - 1))) { \
139      *b++ = EOS; \
140      return buf; \
141    } \
142    if (node->c == EOS) \
143      return buf; \
144    node = ctop; \
145  } \
146} while (0)
147
148/** Huffman uncompress a string.
149 * Uncompression is a snap, too. Go bit by bit, using the
150 * bits to traverse the binary tree (0=left, 1=right) until reaching
151 * a leaf node, which is the uncompressed character.
152 * Stop when the leaf node turns out to be EOS.
153 *
154 * To avoid generating memory problems, this function should be
155 * used with something of the format
156 * \verbatim
157 * char tbuf1[BUFFER_LEN];
158 * strcpy(tbuf1, text_uncompress(a->value));
159 * \endverbatim
160 * if you are using something of type char *buff, use the
161 * safe_uncompress function instead.
162 *
163 * \param s a compressed string.
164 * \return a pointer to a static buffer containing the uncompressed string.
165 */
166char *
167text_uncompress(const unsigned char *s)
168{
169
170  static char buf[BUFFER_LEN];
171  const unsigned char *p;
172  char *b;
173  CNode *node;
174
175  buf[0] = '\0';
176  if (!s || !*s)
177    return buf;
178  p = s;
179  b = buf;
180  /* Finally start decompressing the string... */
181  node = ctop;
182  for (;;) {
183    WALK_TREE(1);
184    WALK_TREE(2);
185    WALK_TREE(4);
186    WALK_TREE(8);
187    WALK_TREE(16);
188    WALK_TREE(32);
189    WALK_TREE(64);
190    WALK_TREE(128);
191    p++;
192  }
193}
194
195/** Huffman uncompress a string, allocating memory.
196 * this function should be used when you're doing something like
197 * \verbatim
198 * char *attrib = safe_uncompress(a->value);
199 *
200 * NEVER use it with something like
201 *
202 * char tbuf1[BUFFER_LEN];
203 * strcpy(tbuf1, safe_uncompress(a->value));
204 * \endverbatim
205 * or you will create a horrendous memory leak.
206 *
207 * \param s compressed string to uncompress.
208 * \return pointer to newly allocated string containing uncompressed text.
209 */
210char *
211safe_uncompress(unsigned char const *s)
212{
213  return (char *) strdup((char *) uncompress(s));
214}
215
216
217static int
218fix_tree_depth(CNode *node, int height, int zeros)
219{
220  int a, b;
221  CNode *temp;
222
223  if (!node)
224    return height + (zeros > 2);
225  a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
226  b = fix_tree_depth(node->right, height + 1, 0);
227  if ((a > CODE_BITS) && (b < (a - 1))) {
228#ifdef STANDALONE
229    printf("Rotate right at depth %d.\n", height);
230#endif
231    temp = node->right;
232    node->right = node->left;
233    node->left = node->right->left;
234    node->right->left = node->right->right;
235    node->right->right = temp;
236    a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
237    b = fix_tree_depth(node->right, height + 1, 0);
238  } else if ((b > CODE_BITS) && (a < (b - 1))) {
239#ifdef STANDALONE
240    printf("Rotate left at depth %d.\n", height);
241#endif
242    temp = node->left;
243    node->left = node->right;
244    node->right = node->left->right;
245    node->left->right = node->left->left;
246    node->left->left = temp;
247    a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
248    b = fix_tree_depth(node->right, height + 1, 0);
249  }
250  return ((a > b) ? a : b);
251}
252
253/* Add 1s to the tree, recursively */
254static void
255add_ones(CNode *node)
256{
257  int count;
258
259  count = 0;
260  do {
261    if (node->right)
262      add_ones(node->right);
263    if ((count >= 7) || ((count >= 3) && !node->left && !node->right)) {
264      ctop = slab_malloc(huffman_slab, node);
265      if (!ctop) {
266        do_rawlog(LT_ERR,
267                  "Cannot allocate memory for compression tree. Aborting.");
268        exit(1);
269      }
270      ctop->left = node->left;
271      ctop->right = node->right;
272      ctop->c = node->c;
273      node->left = (CNode *) NULL;
274      node->right = ctop;
275      node = ctop;
276      count = 0;
277    }
278    node = node->left;
279    count++;
280  } while (node);
281}
282
283/* Build ctable and ltable from the tree, recursively */
284static void
285build_ctable(CNode *root, CType code, int numbits)
286{
287#ifdef STANDALONE
288  int i;
289#endif
290
291  if (!root->left && !root->right) {
292    ctable[root->c] = code;
293    ltable[root->c] = numbits;
294#ifdef STANDALONE
295    printf(isprint(root->c) ? "Code for '%c':\t" : "Code for %d:\t", root->c);
296    for (i = 0; i < numbits; i++)
297      printf("%d", (code >> i) & 1);
298    printf("\n");
299#endif
300    if (numbits > CODE_BITS) {
301      do_rawlog(LT_ERR, "Illegal compression code length (%d). Aborting.",
302                numbits);
303      exit(1);
304    }
305  } else {
306    if (root->left)
307      build_ctable(root->left, code | (0 << numbits), numbits + 1);
308    if (root->right)
309      build_ctable(root->right, code | (1 << numbits), numbits + 1);
310  }
311}
312
313/** Initialize huffman compression.
314 * Initialize the compression tree and table in 5 steps:
315 * 1. Initialize arrays and things
316 * 2. Read indb (up to SAMPLE_SIZE chars, if defined) and count
317 *    the frequency of every character
318 * 3. Cheat the relative frequency of some known special chars
319 *    and upper-case letters
320 * 4. Construct an (un)compression tree based on frequencies
321 * 5. Construct a compression table by searching the tree
322 * \param f filehandle to read from to build the tree.
323 */
324int
325init_compress(FILE * f)
326{
327  int total;
328  unsigned char c;
329  struct {
330    long freq;
331    CNode *node;
332  } table[TABLE_SIZE];
333  int indx, count;
334  long temp;
335  CNode *node;
336
337#ifdef STANDALONE
338  printf("init_compress: Part 1\n");
339#endif
340
341  huffman_slab = slab_create("huffman attribute compression", sizeof(CNode));
342  slab_set_opt(huffman_slab, SLAB_ALLOC_BEST_FIT, 1);
343
344  /* Part 1: initialize */
345  for (total = 0; total < TABLE_SIZE; total++) {
346    table[total].freq = 0;
347    table[total].node = slab_malloc(huffman_slab, NULL);
348    if (!table[total].node) {
349      do_rawlog(LT_ERR,
350                "Cannot allocate memory for compression tree. Aborting.");
351      exit(1);
352    }
353    table[total].node->c = total;
354    table[total].node->left = (CNode *) NULL;
355    table[total].node->right = (CNode *) NULL;
356  }
357
358#ifdef STANDALONE
359  printf("init_compress: Part 2\n");
360#endif
361
362  /* Part 2: count frequencies */
363  if (f) {
364    total = 0;
365    while (!feof(f) && (!SAMPLE_SIZE || (total++ < SAMPLE_SIZE))) {
366      c = fgetc(f);
367      table[c].freq++;
368    }
369  }
370#ifdef STANDALONE
371  for (indx = 0; indx < TABLE_SIZE; indx++) {
372    printf(isprint(indx) ? "Frequency for '%c': %d\n"
373           : "Frequency for %d: %d\n", (unsigned char) indx, table[indx].freq);
374  }
375#endif
376
377#ifdef STANDALONE
378  printf("init_compress: Part 3\n");
379#endif
380
381  /* Part 3: Cheat the frequencies. Because there's a lot of wierd
382   * stuff in indb (like ]'s and upper-case letters), we downplay it
383   * by cutting frequencies. Actually, we shouldn't need to much.
384   */
385
386  /* The ']' character is artificially raised by being the
387   * start-of-attribute marker in indb.  Set it back to '[',
388   * which it should be balancing...
389   */
390  table[']'].freq = table['['].freq;
391
392  /* The DEL character is returned once for no apparent reason (I think
393   * it is returned at EOF), so remove that one count...
394   */
395  if (table[255].freq)
396    table[255].freq--;
397
398  /* Newlines really aren't all that common in the attributes, so
399   * chop the value substantially.
400   */
401  table['\n'].freq /= 16;
402
403#ifdef STANDALONE
404  printf("init_compress: Part 4(a)\n");
405#endif
406
407  /* Part 4(a): Sort the table.  I'm using a stupid insert sort here
408   * because this only gets called once, and I don't want to conform
409   * to the qsort interface.  NOTE: I don't sort in EOS.
410   */
411  for (indx = 2; indx < TABLE_SIZE; indx++) {
412    for (count = indx;
413         (count > 1) && (table[count - 1].freq < table[count].freq); count--) {
414      temp = table[count].freq;
415      table[count].freq = table[count - 1].freq;
416      table[count - 1].freq = temp;
417      node = table[count].node;
418      table[count].node = table[count - 1].node;
419      table[count - 1].node = node;
420    }
421  }
422
423#ifdef STANDALONE
424  printf("init_compress: Part 4(b)\n");
425#endif
426
427  /* Part 4(b): Now we've got a list sorted from most freq (table[0]) to
428   * least freq. We build a binary tree by traversing the list, making
429   * a subtree out of the two least frequent nodes (creating a parent
430   * node whose frequency is the sum of its children's frequency),
431   * and reinserting the subtree's parent into the list. We repeat
432   * until there's only one node in the list, the root of the tree.
433   * NOTE: I'm still not dealing with EOS.
434   */
435  for (indx = TABLE_SIZE - 1; indx > 0; indx--) {
436#ifdef NEVER
437    printf("Freq. table:\n");
438    for (count = indx; count >= 0; count--)
439      printf("%3d: %d\t", table[count].node->c, table[count].freq);
440    printf("\n");
441#endif
442    node = slab_malloc(huffman_slab, table[indx].node);
443    if (!node) {
444      do_rawlog(LT_ERR,
445                "Cannot allocate memory for compression tree. Aborting.");
446      exit(1);
447    }
448    node->left = table[indx].node;
449    node->right = table[indx - 1].node;
450    table[indx - 1].freq += table[indx].freq;
451    table[indx - 1].node = node;
452    for (count = indx - 1;
453         (count > 1) && (table[count - 1].freq <= table[count].freq); count--) {
454      temp = table[count].freq;
455      table[count].freq = table[count - 1].freq;
456      table[count - 1].freq = temp;
457      node = table[count].node;
458      table[count].node = table[count - 1].node;
459      table[count - 1].node = node;
460    }
461  }
462
463#ifdef NEVER
464  build_ctable(table[1].node, 0, 0);
465#endif
466
467#ifdef STANDALONE
468  printf("init_compress: Part 4(c)\n");
469#endif
470
471  /* Part 4(c): If necessary, squash the tree so that it obeys
472   * the code length limitations (CODE_BITS et all).  This is
473   * done recursively, with rotations.
474   */
475  (void) fix_tree_depth(table[1].node, 0, 2);
476
477#ifdef STANDALONE
478  printf("init_compress: Part 4(d)\n");
479#endif
480
481  /* Part 4(d): It is now time to insure that sequences of eight 0s
482   * never occur in the output data, because having nulls in the
483   * output would royally confuse strcpy et all.
484   */
485
486  /* Force a 1 at fifth position on the left edge of tree. (Or terminating
487   * 1 for the all 0 code.)
488   */
489  node = table[1].node;         /* top of tree */
490  for (count = 0; node->left && (count < 4); count++)
491    node = node->left;
492  ctop = slab_malloc(huffman_slab, node);
493  if (!ctop) {
494    do_rawlog(LT_ERR, "Cannot allocate memory for compression tree. Aborting.");
495    exit(1);
496  }
497  ctop->left = node->left;
498  ctop->right = node->right;
499  ctop->c = node->c;
500  node->left = (CNode *) NULL;
501  node->right = ctop;
502
503  /* Recursively descend tree adding 1s where needed. */
504
505  add_ones(table[1].node);
506
507#ifdef STANDALONE
508  printf("init_compress: Part 4(e)\n");
509#endif
510
511  /* Part 4(e): Finally add in EOS as 00000000.
512   */
513  node = table[1].node;         /* top of tree */
514  for (count = 0; count < 8; count++) {
515    if (!node->left) {
516      ctop = slab_malloc(huffman_slab, node);
517      if (!ctop) {
518        do_rawlog(LT_ERR,
519                  "Cannot allocate memory for compression tree. Aborting.");
520        exit(1);
521      }
522      ctop->left = (CNode *) NULL;
523      ctop->right = (CNode *) NULL;
524      ctop->c = EOS;
525      node->left = ctop;
526    }
527    node = node->left;
528  }
529
530#ifdef STANDALONE
531  printf("init_compress: Part 5\n");
532#endif
533
534  /* Part 5: Now traverse the tree, depth-first, and construct
535   * the compression table.
536   */
537
538  ctop = table[1].node;
539  build_ctable(ctop, 0, 0);
540
541#ifdef STANDALONE
542  printf("init_compress: Done\n");
543#endif
544
545  /* Whew */
546  return 0;
547}
548
549#ifdef STANDALONE
550void
551main(argc, argv)
552    int argc;
553    char *argv[];
554{
555  FILE *input;
556  unsigned char buffer[BUFFER_LEN];
557  unsigned char otherbuf[BUFFER_LEN];
558  unsigned char newbuffer[BUFFER_LEN];
559  unsigned char *p1, *p2;
560  int count;
561
562  if ((input = fopen(argv[1], "rb")) == NULL) {
563    printf("Can't open %s.\n", argv[1]);
564    exit(1);
565  }
566  init_compress(input);
567  fclose(input);
568  do {
569    printf("Enter text: ");
570    fgets(buffer, 4095, stdin);
571    if ((buffer[0] == '\n') || (buffer[0] == '\r') || (buffer[0] == '\0'))
572      exit(0);
573    printf("Text: %s!\n", buffer);
574    printf("Compressing\n");
575    strcpy(otherbuf, compress(buffer));
576    printf("Compressed: ");
577    p1 = otherbuf;
578    while (p1 && *p1) {
579      for (count = 0; count < 8; count++)
580        printf("%d", (*p1 >> count) & 1);
581      p1++;
582    }
583    printf("\n");
584    printf("Length: %d, Complength: %d\n", strlen(buffer), strlen(otherbuf));
585    printf("Uncompressing\n");
586    strcpy(newbuffer, uncompress(otherbuf));
587    printf("Text: %s!\n", newbuffer);
588    printf("Strcoll(orig,uncomp) = %d\n", strcoll(newbuffer, buffer));
589    printf("strlen(orig) = %d, strlen(uncomp) = %d\n", strlen(buffer),
590           strlen(newbuffer));
591    p1 = buffer;
592    p2 = newbuffer;
593/*
594 * while (p1 && p2 && *p1 && *p2) {
595 * if (*p1 != *p2) printf("Unequal: %d and %d\n",*p1,*p2);
596 * else printf("Equal: %c and %c\n",*p1,*p2);
597 * p1++; p2++;
598 * }
599 */
600    strcpy(newbuffer, otherbuf);
601/*
602 * printf("Trying safe.\n");
603 * buf = safe_uncompress(newbuffer);
604 * printf("Safe uncompress: %s!\n",buf);
605 */
606  } while (1);
607}
608#endif
Note: See TracBrowser for help on using the browser.