root/1.8.3/tags/p3/src/comp_w.c

Revision 919, 11.4 KB (checked in by shawnw, 19 months ago)

1.8.3p3

Line 
1/**
2 * \file comp_w.c
3 *
4 * \brief Word-based compression.
5 *
6 * Table-lookup (word) compress, written by Nick Gammon. 21/Oct/95.
7 *
8 * This method maintains a table of 32768 words, where a "word" is defined
9 * as a sequence of alpha or numeric characters (such as "dog123").
10 *
11 * Compression
12 * -----------
13 *
14 * The text to be compressed is broken up into "words" and other symbols
15 * (e.g. commas, brackets, spaces and so on).
16 *
17 * Words are looked up in the word table by using a hash-table lookup. If
18 * found, the "index" into the table is emitted with the high order bit set.
19 * If not found, the word is added to the table, and is then emitted as
20 * described above.
21 *
22 * For example: "The dog and the other dog sat on the mat (eating fish)."
23 * would compress as:
24 *
25 * 0x8001      The
26 * 0x8002      dog
27 * 0x8003      and
28 * 0x8004      the
29 * 0x8005      other
30 * 0x8002      dog
31 * 0x8007      sat
32 * 0x8008      on
33 * 0x8004      the
34 * 0x8009      mat
35 * 0x28        (
36 * 0x800A      eating
37 * 0x800B      fish)
38 * 0x2E        .
39 *
40 * In the above example, the uncompressed text is 55 bytes, and the compressed
41 * text is 26 bytes.
42 *
43 * For simplicity, the above example assumes that the words hash to consecutive
44 * table positions.
45 *
46 * Note that the trailing punctuation character (space, period or whatever) is
47 * considered _part_ of the word. This is to save having to store multiple
48 * spaces between each word, and is relying on the fact that a certain word is
49 * usually followed by the same punctuation. For example, the word "and" would
50 * normally be followed by a space.
51 *
52 * In the above example, the space following "mat" is stored in the table, and
53 * thus the "(" character had to be output separately. The ")" following the
54 * word "fish" is considered part of the word and is stored in the table,
55 * however the last period was output as a separate character.
56 *
57 * Note how the high-order bit is turned on for words in the table, this is so
58 * the decompression routine can detect table lookup characters from ordinary
59 * ones. Also, any table index with a low-order byte of zero cannot be used,
60 * as this would cause the resulting "string" to be prematurely truncated (by
61 * strcpy, strcmp etc.). Thus the lookup routine skips any positions that hash
62 * to the value 0xXX00. (There are only 256 such indices).
63 *
64 * Also, to prevent characters which the user might type in with the high order
65 * bit set causing decompression confusion, all text is stripped of its high
66 * order bit before being added to the table or emitted.
67 *
68 * In the event of two words hashing to the same value, the compression routine
69 * scans forwards for COLLISION_LIMIT entries, looking for a match or a spare
70 * table entry. If none is found, the word is output "as is" (i.e.
71 * uncompressed). You might speed up compression slightly by lowering
72 * COLLISION_LIMIT, at the cost of slightly lower compression ratios.
73 *
74 * The collision limit does not affect _decompression_ as that merely involves
75 * a direct table lookup.
76 *
77 * Decompression
78 * -------------
79 *
80 * Decompression involves a simple loop, in which each byte of the compressed
81 * text is examined. If it has the high-order bit clear, it is just emitted.
82 * If the high order bit is set it, along with the next byte in the compressed
83 * text, are used to index into the word lookup table. The word found there is
84 * then emitted.
85 *
86 * Speed
87 * -----
88 *
89 * Tests conducted on (hopefully typical) text show that decompression is about
90 * 4 times as fast as Huffman decompression.
91 *
92 * Compression however is about 3.5 times as _slow_ as Huffman compression.
93 *
94 * The slower compression time is considered acceptable on the grounds that
95 * text is much more often _decompressed_ in a MUSH than compressed. Compression
96 * mainly takes part at database load time (say, once a week) whereas
97 * decompression take part every hour, as the database is dumped to disk, and
98 * whenever an object description is displayed, or an attribute searched for,
99 *
100 * Compression ratio
101 * -----------------
102 *
103 * The "table compression" method has a poor compression ratio for small amounts
104 * of text, because of the overhead of the 32768 pointers, and the data stored
105 * in them. However, as the database size increases, the ratio improves because
106 * the table overhead becomes progressively less significant.
107 *
108 * The break-even points is with about 1.5 Mb of text, where both the table
109 * compression and Huffman compress to about 63% of the size of the original.
110 *
111 * After that, the compression ratio gradually improves until reaching somewhere
112 * between 40% and 50% of the size of the original, as the amount of text to
113 * compress reaches 10 Mb.
114 *
115 * The nature of Huffman compression however is such that it will always be
116 * fixed at about 63% regardless of the amount of data compressed.
117 */
118
119#include "copyrite.h"
120#include "config.h"
121#include <string.h>
122#include <stdlib.h>
123#include <ctype.h>
124#include "conf.h"
125#include "externs.h"
126#include "mushdb.h"
127#include "mymalloc.h"
128#include "confmagic.h"
129
130#define MAXTABLE 32768          /**< Maximum words in the table */
131#define MAXWORDS 100            /**< Maximum size of a word */
132#define COLLISION_LIMIT 20      /**< Maximum allowed collisions */
133
134#define COMPRESS_HASH_MASK 0x7FFF       /**< 32767 in hex */
135
136#define TABLE_FLAG 0x80         /**< Distinguish a table */
137#define TABLE_MASK 0x7F         /**< Mask out words within a table */
138
139/* Table of words */
140
141static char *words[MAXTABLE];
142static size_t words_len[MAXTABLE];
143
144/* The word we are currently compressing */
145
146static char word[MAXWORDS + 2];
147static size_t wordpos = 0;
148
149/* Stats */
150#ifdef COMP_STATS
151static long total_mallocs = 0;
152static long total_uncomp = 0;
153static long total_comp = 0;
154static long total_entries = 0;
155#endif
156
157/* Work pointer for compression */
158
159static unsigned char *b;
160
161static void output_previous_word(void);
162int init_compress(FILE * f);
163#ifdef COMP_STATS
164void compress_stats(long *entries, long *mem_used,
165                    long *total_uncompressed, long *total_compressed);
166#endif
167static unsigned int hash_fn(const char *s, int hashtab_mask);
168
169static void
170output_previous_word(void)
171{
172  char *p;
173  int i, j;
174
175  word[wordpos++] = 0;          /* word's trailing null */
176
177  /* Don't bother putting single or double letter words in the table */
178
179  if (wordpos <= 3) {
180    p = word;
181    while (*p)
182      *b++ = *p++;
183    return;
184  }
185  /* search table to see if word is already in it; */
186
187  for (i = hash_fn(word, COMPRESS_HASH_MASK), j = 0;
188       i < MAXTABLE &&
189       (words[i] || (i & 0xFF) == 0) && j < COLLISION_LIMIT; i++, j++)
190    if (words[i])
191      if (strcmp(word, words[i]) == 0) {
192        *b++ = (i >> 8) | TABLE_FLAG;
193        *b++ = i & 0xFF;
194        return;
195      }
196  /* not in table, add to it */
197
198  if ((i & 0xFF) == 0) {
199    i++;                        /* make sure we don't have a null in the message */
200    j++;
201  }
202  /* Can't add to table if full */
203
204  if (i >= MAXTABLE || j >= COLLISION_LIMIT) {
205    p = word;
206    while (*p)
207      *b++ = *p++;
208    return;
209  }
210  words[i] = malloc(wordpos);
211
212  if (!words[i])
213    mush_panic("Out of memory in string compression routine");
214
215#ifdef COMP_STATS
216  total_mallocs += wordpos;
217  total_entries++;
218#endif
219
220  strncpy(words[i], word, wordpos);
221  words_len[i] = wordpos;
222
223  *b++ = (i >> 8) | TABLE_FLAG;
224  *b++ = i & 0xFF;
225
226}                               /* end of output_previous_word */
227
228/** Word-compress a string.
229 *
230 * Important notes:
231 *   This function mallocs memory that should be freed by the caller!
232 *   The caller is also currently responsible for adding mem checks
233 *   Don't use it to compress strings longer than BUFFER_LEN or the
234 *     later uncompression will not go well.
235 *
236 * \param s string to be compressed.
237 * \return newly allocated compressed string.
238 */
239unsigned char *
240text_compress(char const *s)
241{
242  const unsigned char *p;
243  static unsigned char buf[BUFFER_LEN];
244
245  p = (unsigned char *) s;
246  b = buf;
247
248  wordpos = 0;
249
250/* break up input into words */
251  while (*p) {
252    if (!isalnum(*p) || wordpos >= MAXWORDS) {
253      if (wordpos) {
254        word[wordpos++] = *p & 0x7F;    /* add trailing punctuation */
255        output_previous_word();
256        wordpos = 0;
257      } else
258        *b++ = *p & 0x7F;
259    } else
260      word[wordpos++] = *p & 0x7F;
261    p++;
262  }
263
264  if (wordpos)
265    output_previous_word();
266
267  *b = 0;                       /* trailing null */
268
269#ifdef COMP_STATS
270  total_comp += u_strlen(buf);  /* calculate size of compressed   text */
271  total_uncomp += strlen(s);    /* calculate size of uncompressed text */
272#endif
273
274  return u_strdup(buf);
275}                               /* end of compress; */
276
277
278/** Word-uncompress a string.
279 * To avoid generating memory problems, this function should be
280 * used with something of the format
281 * \verbatim
282 * char tbuf1[BUFFER_LEN];
283 * strcpy(tbuf1, text_uncompress(a->value));
284 * \endverbatim
285 * if you are using something of type char *buff, use the
286 * safe_uncompress function instead.
287 *
288 * \param s a compressed string.
289 * \return a pointer to a static buffer containing the uncompressed string.
290 */
291char *
292text_uncompress(unsigned char const *s)
293{
294
295  const unsigned char *p;
296  char c;
297  int i;
298  static char buf[BUFFER_LEN];
299
300  buf[0] = '\0';
301  if (!s || !*s)
302    return buf;
303  p = (unsigned char *) s;
304  b = buf;
305
306  while (*p) {
307    c = *p;
308    if (c & TABLE_FLAG) {
309      i = ((c & TABLE_MASK) << 8) | *(++p);
310      if (i >= MAXTABLE || words[i] == NULL) {
311        static int panicking = 0;
312        if (panicking) {
313          do_rawlog(LT_ERR,
314                    "Error in string decompression occurred during panic dump.");
315          exit(1);
316        } else {
317          panicking = 1;        /* don't panic from within panic */
318          do_rawlog(LT_ERR, "Error in string decompression, i = %i", i);
319          mush_panic("Fatal error in decompression");
320        }
321      }
322      strncpy((char *) b, words[i], words_len[i]);
323      b += words_len[i] - 1;
324    } else
325      *b++ = c;
326    p++;
327  }
328
329  *b++ = 0;                     /* trailing null */
330
331  return buf;
332
333}                               /* end of uncompress; */
334
335/** Word-uncompress a string, allocating memory.
336 * this function should be used when you're doing something like
337 * \verbatim
338 * char *attrib = safe_uncompress(a->value);
339 *
340 * NEVER use it with something like
341 *
342 * char tbuf1[BUFFER_LEN];
343 * strcpy(tbuf1, safe_uncompress(a->value));
344 * \endverbatim
345 * or you will create a horrendous memory leak.
346 *
347 * \param s compressed string to uncompress.
348 * \return pointer to newly allocated string containing uncompressed text.
349 */
350char *
351safe_uncompress(unsigned char const *s)
352{
353  return (char *) strdup((char *) uncompress(s));
354}
355
356
357/** Initialize the word compression.
358 * This function clears the words table the first time through.
359 * \param f (unused).
360 */
361int
362init_compress(FILE * f __attribute__ ((__unused__)))
363{
364  memset(words, 0, sizeof words);
365  memset(words_len, 0, sizeof words_len);
366  return 0;
367}
368
369#ifdef COMP_STATS
370/** Return word-compression statistics.
371 */
372void
373compress_stats(long *entries, long *mem_used, long *total_uncompressed,
374               long *total_compressed)
375{
376
377  *entries = total_entries;
378  *mem_used = total_mallocs;
379  *total_uncompressed = total_uncomp;
380  *total_compressed = total_comp;
381
382}
383#endif
384
385static unsigned
386hash_fn(const char *s, int hashtab_mask)
387{
388  /* hash function, using masks (based on TinyMUSH 2.0) */
389
390  unsigned hashval;
391  const char *p;
392
393  for (hashval = 0, p = s; *p; p++)
394    hashval = (hashval << 5) + hashval + *p;
395  return (hashval & hashtab_mask);
396}
Note: See TracBrowser for help on using the browser.