root/1.8.3/tags/p3rc1/src/comp_h.c

Revision 905, 16.4 KB (checked in by shawnw, 19 months ago)

Have indent convert tabs to spaces

Line 
1/**
2 * \file comp_h.c
3 *
4 * \brief Huffman compression routines.
5 *
6 * One of several options for attribute compression. Usually the best.
7 * Talek's rewrite of compress.c, using a Huffman compression routine.
8 * This routine adds some time to the MUSH startup, since it reads a file
9 * in order to auto-tune the compression at each restart. The SAMPLE_SIZE
10 * define can trade efficiency for speed.
11 * This rewrite was inspired by Javelin's rewrite on a similar vein.
12 * Most of the comments are his. The nasty ones are Talek's.
13 *
14 */
15
16/* Compression routines */
17#include "copyrite.h"
18#include "config.h"
19#include <string.h>
20#include <stdlib.h>
21#include <ctype.h>
22#include "conf.h"
23#include "externs.h"
24#include "mushdb.h"
25#include "mymalloc.h"
26#include "confmagic.h"
27#ifdef WIN32
28#pragma warning( disable : 4244)        /* NJG: disable warning re conversion */
29#endif
30
31#define TABLE_SIZE      256     /**< allow all characters */
32#define EOS             0       /**< use null code for end of string */
33#define CHAR_BITS       8       /**< number of bits in char */
34#define CHAR_MASK       255     /**< mask for just one char */
35#define CODE_BITS       25      /**< max number of bits in code */
36#ifndef SAMPLE_SIZE
37#define SAMPLE_SIZE     0       /**< sample entire database */
38#endif
39
40
41/** Type for a huffman code. It must be at least CODE_BITS+CHAR_BITS-1
42 * bits long.
43 */
44typedef unsigned long CType;
45
46/** A node in the huffman compression tree. */
47typedef struct cnode {
48  struct cnode *left;           /**< Left child node. */
49  struct cnode *right;          /**< Right child node. */
50  unsigned char c;              /**< character at this node. */
51} CNode;
52
53static CNode *ctop;
54static CType ctable[TABLE_SIZE];
55static char ltable[TABLE_SIZE];
56
57static int fix_tree_depth(CNode *node, int height, int zeros);
58static void add_ones(CNode *node);
59static void build_ctable(CNode *root, CType code, int numbits);
60int init_compress(FILE * f);
61
62
63/** Huffman-compress a string.
64 * Compress a string: this is pretty easy. For each char in the string,
65 * look up its code in ctable and add it to the compressed string we
66 * build, keeping careful track of the number of bits we add.
67 * Then stick the EOS character at the end.
68 *
69 * Important notes:
70 *   This function mallocs memory that should be freed by the caller!
71 *   The caller is also currently responsible for adding mem checks
72 *   Don't use it to compress strings longer than BUFFER_LEN or the
73 *     later uncompression will not go well.
74 *
75 * \param s string to be compressed.
76 * \return newly allocated compressed string.
77 */
78unsigned char *
79text_compress(const char *s)
80{
81  CType stage;
82  int bits = 0;
83  const unsigned char *p;
84  unsigned char *b, *buf;
85  int needed_length;
86
87  /* Part 1 - how long will the compressed string be? */
88  for (p = (const unsigned char *) s; p && *p; p++)
89    bits += ltable[*p];
90  bits += CHAR_BITS * 2 - 1;    /* add space for the ending \0 */
91  needed_length = bits / CHAR_BITS;
92
93  /* Part 2 - Actually get around to compressing the data... */
94  p = (const unsigned char *) s;
95  b = buf = (unsigned char *) malloc(needed_length);
96  stage = 0;
97  bits = 0;
98
99  while (p && *p) {
100    /* Put code on stage */
101    stage |= ctable[*p] << bits;
102    bits += ltable[*p];
103    /* Put any full bytes of stage into the compressed string */
104    while (bits >= CHAR_BITS) {
105      *b++ = stage & CHAR_MASK;
106      stage = stage >> CHAR_BITS;
107      bits -= CHAR_BITS;
108    }
109    p++;
110  }
111  /* Put in EOS, and put the rest of the stage into the compressed string */
112  /* This relies on EOS == 00000000 */
113  bits += ltable[EOS] + CHAR_BITS - 1;
114  while (bits >= CHAR_BITS) {
115    *b++ = stage & CHAR_MASK;
116    stage = stage >> CHAR_BITS;
117    bits -= CHAR_BITS;
118  }
119
120  return buf;
121}
122
123/** Walk the huffman tree.
124 * This macro is used for walking the compression tree.
125 * It is a macro for efficiency.
126 */
127#define WALK_TREE(bitpos) \
128do { \
129  if (*p & (bitpos)) \
130    node = node->right; \
131  else \
132    node = node->left; \
133  if (!node->left && !node->right) { \
134    /* Got a char */ \
135    *b++ = node->c; \
136    if (!*p || ((long)(b - buf) >= (long)(sizeof(buf) - 1))) { \
137      *b++ = EOS; \
138      return buf; \
139    } \
140    if (node->c == EOS) \
141      return buf; \
142    node = ctop; \
143  } \
144} while (0)
145
146/** Huffman uncompress a string.
147 * Uncompression is a snap, too. Go bit by bit, using the
148 * bits to traverse the binary tree (0=left, 1=right) until reaching
149 * a leaf node, which is the uncompressed character.
150 * Stop when the leaf node turns out to be EOS.
151 *
152 * To avoid generating memory problems, this function should be
153 * used with something of the format
154 * \verbatim
155 * char tbuf1[BUFFER_LEN];
156 * strcpy(tbuf1, text_uncompress(a->value));
157 * \endverbatim
158 * if you are using something of type char *buff, use the
159 * safe_uncompress function instead.
160 *
161 * \param s a compressed string.
162 * \return a pointer to a static buffer containing the uncompressed string.
163 */
164char *
165text_uncompress(const unsigned char *s)
166{
167
168  static char buf[BUFFER_LEN];
169  const unsigned char *p;
170  char *b;
171  CNode *node;
172
173  buf[0] = '\0';
174  if (!s || !*s)
175    return buf;
176  p = s;
177  b = buf;
178  /* Finally start decompressing the string... */
179  node = ctop;
180  for (;;) {
181    WALK_TREE(1);
182    WALK_TREE(2);
183    WALK_TREE(4);
184    WALK_TREE(8);
185    WALK_TREE(16);
186    WALK_TREE(32);
187    WALK_TREE(64);
188    WALK_TREE(128);
189    p++;
190  }
191}
192
193/** Huffman uncompress a string, allocating memory.
194 * this function should be used when you're doing something like
195 * \verbatim
196 * char *attrib = safe_uncompress(a->value);
197 *
198 * NEVER use it with something like
199 *
200 * char tbuf1[BUFFER_LEN];
201 * strcpy(tbuf1, safe_uncompress(a->value));
202 * \endverbatim
203 * or you will create a horrendous memory leak.
204 *
205 * \param s compressed string to uncompress.
206 * \return pointer to newly allocated string containing uncompressed text.
207 */
208char *
209safe_uncompress(unsigned char const *s)
210{
211  return (char *) strdup((char *) uncompress(s));
212}
213
214
215static int
216fix_tree_depth(CNode *node, int height, int zeros)
217{
218  int a, b;
219  CNode *temp;
220
221  if (!node)
222    return height + (zeros > 2);
223  a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
224  b = fix_tree_depth(node->right, height + 1, 0);
225  if ((a > CODE_BITS) && (b < (a - 1))) {
226#ifdef STANDALONE
227    printf("Rotate right at depth %d.\n", height);
228#endif
229    temp = node->right;
230    node->right = node->left;
231    node->left = node->right->left;
232    node->right->left = node->right->right;
233    node->right->right = temp;
234    a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
235    b = fix_tree_depth(node->right, height + 1, 0);
236  } else if ((b > CODE_BITS) && (a < (b - 1))) {
237#ifdef STANDALONE
238    printf("Rotate left at depth %d.\n", height);
239#endif
240    temp = node->left;
241    node->left = node->right;
242    node->right = node->left->right;
243    node->left->right = node->left->left;
244    node->left->left = temp;
245    a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
246    b = fix_tree_depth(node->right, height + 1, 0);
247  }
248  return ((a > b) ? a : b);
249}
250
251/* Add 1s to the tree, recursively */
252static void
253add_ones(CNode *node)
254{
255  int count;
256
257  count = 0;
258  do {
259    if (node->right)
260      add_ones(node->right);
261    if ((count >= 7) || ((count >= 3) && !node->left && !node->right)) {
262      ctop = (CNode *) malloc((unsigned) sizeof(CNode));
263      if (!ctop) {
264        do_rawlog(LT_ERR,
265                  "Cannot allocate memory for compression tree. Aborting.");
266        exit(1);
267      }
268      ctop->left = node->left;
269      ctop->right = node->right;
270      ctop->c = node->c;
271      node->left = (CNode *) NULL;
272      node->right = ctop;
273      node = ctop;
274      count = 0;
275    }
276    node = node->left;
277    count++;
278  } while (node);
279}
280
281/* Build ctable and ltable from the tree, recursively */
282static void
283build_ctable(CNode *root, CType code, int numbits)
284{
285#ifdef STANDALONE
286  int i;
287#endif
288
289  if (!root->left && !root->right) {
290    ctable[root->c] = code;
291    ltable[root->c] = numbits;
292#ifdef STANDALONE
293    printf(isprint(root->c) ? "Code for '%c':\t" : "Code for %d:\t", root->c);
294    for (i = 0; i < numbits; i++)
295      printf("%d", (code >> i) & 1);
296    printf("\n");
297#endif
298    if (numbits > CODE_BITS) {
299      do_rawlog(LT_ERR, "Illegal compression code length (%d). Aborting.",
300                numbits);
301      exit(1);
302    }
303  } else {
304    if (root->left)
305      build_ctable(root->left, code | (0 << numbits), numbits + 1);
306    if (root->right)
307      build_ctable(root->right, code | (1 << numbits), numbits + 1);
308  }
309}
310
311/** Initialize huffman compression.
312 * Initialize the compression tree and table in 5 steps:
313 * 1. Initialize arrays and things
314 * 2. Read indb (up to SAMPLE_SIZE chars, if defined) and count
315 *    the frequency of every character
316 * 3. Cheat the relative frequency of some known special chars
317 *    and upper-case letters
318 * 4. Construct an (un)compression tree based on frequencies
319 * 5. Construct a compression table by searching the tree
320 * \param f filehandle to read from to build the tree.
321 */
322int
323init_compress(FILE * f)
324{
325  int total;
326  unsigned char c;
327  struct {
328    long freq;
329    CNode *node;
330  } table[TABLE_SIZE];
331  int indx, count;
332  long temp;
333  CNode *node;
334
335#ifdef STANDALONE
336  printf("init_compress: Part 1\n");
337#endif
338
339  /* Part 1: initialize */
340  for (total = 0; total < TABLE_SIZE; total++) {
341    table[total].freq = 0;
342    table[total].node = (CNode *) malloc((unsigned) sizeof(CNode));
343    if (!table[total].node) {
344      do_rawlog(LT_ERR,
345                "Cannot allocate memory for compression tree. Aborting.");
346      exit(1);
347    }
348    table[total].node->c = total;
349    table[total].node->left = (CNode *) NULL;
350    table[total].node->right = (CNode *) NULL;
351  }
352
353#ifdef STANDALONE
354  printf("init_compress: Part 2\n");
355#endif
356
357  /* Part 2: count frequencies */
358  if (f) {
359    total = 0;
360    while (!feof(f) && (!SAMPLE_SIZE || (total++ < SAMPLE_SIZE))) {
361      c = fgetc(f);
362      table[c].freq++;
363    }
364  }
365#ifdef STANDALONE
366  for (indx = 0; indx < TABLE_SIZE; indx++) {
367    printf(isprint(indx) ? "Frequency for '%c': %d\n"
368           : "Frequency for %d: %d\n", (unsigned char) indx, table[indx].freq);
369  }
370#endif
371
372#ifdef STANDALONE
373  printf("init_compress: Part 3\n");
374#endif
375
376  /* Part 3: Cheat the frequencies. Because there's a lot of wierd
377   * stuff in indb (like ]'s and upper-case letters), we downplay it
378   * by cutting frequencies. Actually, we shouldn't need to much.
379   */
380
381  /* The ']' character is artificially raised by being the
382   * start-of-attribute marker in indb.  Set it back to '[',
383   * which it should be balancing...
384   */
385  table[']'].freq = table['['].freq;
386
387  /* The DEL character is returned once for no apparent reason (I think
388   * it is returned at EOF), so remove that one count...
389   */
390  if (table[255].freq)
391    table[255].freq--;
392
393  /* Newlines really aren't all that common in the attributes, so
394   * chop the value substantially.
395   */
396  table['\n'].freq /= 16;
397
398#ifdef STANDALONE
399  printf("init_compress: Part 4(a)\n");
400#endif
401
402  /* Part 4(a): Sort the table.  I'm using a stupid insert sort here
403   * because this only gets called once, and I don't want to conform
404   * to the qsort interface.  NOTE: I don't sort in EOS.
405   */
406  for (indx = 2; indx < TABLE_SIZE; indx++) {
407    for (count = indx;
408         (count > 1) && (table[count - 1].freq < table[count].freq); count--) {
409      temp = table[count].freq;
410      table[count].freq = table[count - 1].freq;
411      table[count - 1].freq = temp;
412      node = table[count].node;
413      table[count].node = table[count - 1].node;
414      table[count - 1].node = node;
415    }
416  }
417
418#ifdef STANDALONE
419  printf("init_compress: Part 4(b)\n");
420#endif
421
422  /* Part 4(b): Now we've got a list sorted from most freq (table[0]) to
423   * least freq. We build a binary tree by traversing the list, making
424   * a subtree out of the two least frequent nodes (creating a parent
425   * node whose frequency is the sum of its children's frequency),
426   * and reinserting the subtree's parent into the list. We repeat
427   * until there's only one node in the list, the root of the tree.
428   * NOTE: I'm still not dealing with EOS.
429   */
430  for (indx = TABLE_SIZE - 1; indx > 0; indx--) {
431#ifdef NEVER
432    printf("Freq. table:\n");
433    for (count = indx; count >= 0; count--)
434      printf("%3d: %d\t", table[count].node->c, table[count].freq);
435    printf("\n");
436#endif
437    node = (CNode *) malloc((unsigned) sizeof(CNode));
438    if (!node) {
439      do_rawlog(LT_ERR,
440                "Cannot allocate memory for compression tree. Aborting.");
441      exit(1);
442    }
443    node->left = table[indx].node;
444    node->right = table[indx - 1].node;
445    table[indx - 1].freq += table[indx].freq;
446    table[indx - 1].node = node;
447    for (count = indx - 1;
448         (count > 1) && (table[count - 1].freq <= table[count].freq); count--) {
449      temp = table[count].freq;
450      table[count].freq = table[count - 1].freq;
451      table[count - 1].freq = temp;
452      node = table[count].node;
453      table[count].node = table[count - 1].node;
454      table[count - 1].node = node;
455    }
456  }
457
458#ifdef NEVER
459  build_ctable(table[1].node, 0, 0);
460#endif
461
462#ifdef STANDALONE
463  printf("init_compress: Part 4(c)\n");
464#endif
465
466  /* Part 4(c): If necessary, squash the tree so that it obeys
467   * the code length limitations (CODE_BITS et all).  This is
468   * done recursively, with rotations.
469   */
470  (void) fix_tree_depth(table[1].node, 0, 2);
471
472#ifdef STANDALONE
473  printf("init_compress: Part 4(d)\n");
474#endif
475
476  /* Part 4(d): It is now time to insure that sequences of eight 0s
477   * never occur in the output data, because having nulls in the
478   * output would royally confuse strcpy et all.
479   */
480
481  /* Force a 1 at fifth position on the left edge of tree. (Or terminating
482   * 1 for the all 0 code.)
483   */
484  node = table[1].node;         /* top of tree */
485  for (count = 0; node->left && (count < 4); count++)
486    node = node->left;
487  ctop = (CNode *) malloc((unsigned) sizeof(CNode));
488  if (!ctop) {
489    do_rawlog(LT_ERR, "Cannot allocate memory for compression tree. Aborting.");
490    exit(1);
491  }
492  ctop->left = node->left;
493  ctop->right = node->right;
494  ctop->c = node->c;
495  node->left = (CNode *) NULL;
496  node->right = ctop;
497
498  /* Recursively descend tree adding 1s where needed. */
499
500  add_ones(table[1].node);
501
502#ifdef STANDALONE
503  printf("init_compress: Part 4(e)\n");
504#endif
505
506  /* Part 4(e): Finally add in EOS as 00000000.
507   */
508  node = table[1].node;         /* top of tree */
509  for (count = 0; count < 8; count++) {
510    if (!node->left) {
511      ctop = (CNode *) malloc((unsigned) sizeof(CNode));
512      if (!ctop) {
513        do_rawlog(LT_ERR,
514                  "Cannot allocate memory for compression tree. Aborting.");
515        exit(1);
516      }
517      ctop->left = (CNode *) NULL;
518      ctop->right = (CNode *) NULL;
519      ctop->c = EOS;
520      node->left = ctop;
521    }
522    node = node->left;
523  }
524
525#ifdef STANDALONE
526  printf("init_compress: Part 5\n");
527#endif
528
529  /* Part 5: Now traverse the tree, depth-first, and construct
530   * the compression table.
531   */
532
533  ctop = table[1].node;
534  build_ctable(ctop, 0, 0);
535
536#ifdef STANDALONE
537  printf("init_compress: Done\n");
538#endif
539
540  /* Whew */
541  return 0;
542}
543
544#ifdef STANDALONE
545void
546main(argc, argv)
547    int argc;
548    char *argv[];
549{
550  FILE *input;
551  unsigned char buffer[BUFFER_LEN];
552  unsigned char otherbuf[BUFFER_LEN];
553  unsigned char newbuffer[BUFFER_LEN];
554  unsigned char *p1, *p2;
555  int count;
556
557  if ((input = fopen(argv[1], "rb")) == NULL) {
558    printf("Can't open %s.\n", argv[1]);
559    exit(1);
560  }
561  init_compress(input);
562  fclose(input);
563  do {
564    printf("Enter text: ");
565    fgets(buffer, 4095, stdin);
566    if ((buffer[0] == '\n') || (buffer[0] == '\r') || (buffer[0] == '\0'))
567      exit(0);
568    printf("Text: %s!\n", buffer);
569    printf("Compressing\n");
570    strcpy(otherbuf, compress(buffer));
571    printf("Compressed: ");
572    p1 = otherbuf;
573    while (p1 && *p1) {
574      for (count = 0; count < 8; count++)
575        printf("%d", (*p1 >> count) & 1);
576      p1++;
577    }
578    printf("\n");
579    printf("Length: %d, Complength: %d\n", strlen(buffer), strlen(otherbuf));
580    printf("Uncompressing\n");
581    strcpy(newbuffer, uncompress(otherbuf));
582    printf("Text: %s!\n", newbuffer);
583    printf("Strcoll(orig,uncomp) = %d\n", strcoll(newbuffer, buffer));
584    printf("strlen(orig) = %d, strlen(uncomp) = %d\n", strlen(buffer),
585           strlen(newbuffer));
586    p1 = buffer;
587    p2 = newbuffer;
588/*
589 * while (p1 && p2 && *p1 && *p2) {
590 * if (*p1 != *p2) printf("Unequal: %d and %d\n",*p1,*p2);
591 * else printf("Equal: %c and %c\n",*p1,*p2);
592 * p1++; p2++;
593 * }
594 */
595    strcpy(newbuffer, otherbuf);
596/*
597 * printf("Trying safe.\n");
598 * buf = safe_uncompress(newbuffer);
599 * printf("Safe uncompress: %s!\n",buf);
600 */
601  } while (1);
602}
603#endif
Note: See TracBrowser for help on using the browser.