/src/postgres/src/common/pg_lzcompress.c
Line | Count | Source |
1 | | /* ---------- |
2 | | * pg_lzcompress.c - |
3 | | * |
4 | | * This is an implementation of LZ compression for PostgreSQL. |
5 | | * It uses a simple history table and generates 2-3 byte tags |
6 | | * that encode backward copies of 3-273 bytes within a maximum |
7 | | * offset of 4095. |
8 | | * |
9 | | * Entry routines: |
10 | | * |
11 | | * int32 |
12 | | * pglz_compress(const char *source, int32 slen, char *dest, |
13 | | * const PGLZ_Strategy *strategy); |
14 | | * |
15 | | * source is the input data to be compressed. |
16 | | * |
17 | | * slen is the length of the input data. |
18 | | * |
19 | | * dest is the output area for the compressed result. |
20 | | * It must be at least as big as PGLZ_MAX_OUTPUT(slen). |
21 | | * |
22 | | * strategy is a pointer to some information controlling |
23 | | * the compression algorithm. If NULL, the compiled |
24 | | * in default strategy is used. |
25 | | * |
26 | | * The return value is the number of bytes written in the |
27 | | * buffer dest, or -1 if compression fails; in the latter |
28 | | * case the contents of dest are undefined. |
29 | | * |
30 | | * int32 |
31 | | * pglz_decompress(const char *source, int32 slen, char *dest, |
32 | | * int32 rawsize, bool check_complete) |
33 | | * |
34 | | * source is the compressed input. |
35 | | * |
36 | | * slen is the length of the compressed input. |
37 | | * |
38 | | * dest is the area where the uncompressed data will be |
39 | | * written to. It is the caller's responsibility to |
40 | | * provide enough space. |
41 | | * |
42 | | * The data is written to dest exactly as it was handed |
43 | | * to pglz_compress(). No terminating zero byte is added. |
44 | | * |
45 | | * rawsize is the length of the uncompressed data. |
46 | | * |
47 | | * check_complete is a flag telling whether -1 should be |
48 | | * returned if we fail to reach the end of both the |
49 | | * source and dest buffers. It should be false |
50 | | * if the caller is asking for only a partial result and |
51 | | * true otherwise. |
52 | | * |
53 | | * The return value is the number of bytes written in the |
54 | | * buffer dest, or -1 if decompression fails. |
55 | | * |
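| | * Example usage (an editor's sketch, not part of the original |
| | * sources; "src", "back" and "slen" are caller-provided, and a |
| | * backend context with palloc() is assumed): |
| | * |
| | * char *comp = palloc(PGLZ_MAX_OUTPUT(slen)); |
| | * int32 clen; |
| | * |
| | * clen = pglz_compress(src, slen, comp, PGLZ_strategy_default); |
| | * if (clen < 0) |
| | * ... store src uncompressed instead ... |
| | * else if (pglz_decompress(comp, clen, back, slen, true) != slen) |
| | * ... report corrupt data ... |
| | * |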
56 | | * The decompression algorithm and internal data format: |
57 | | * |
58 | | * Everything needed to decode is contained in the compressed data itself. |
59 | | * |
60 | | * The data representation is easiest explained by describing |
61 | | * the process of decompression. |
62 | | * |
63 | | * If compressed_size == rawsize, then the data |
64 | | * is stored uncompressed as plain bytes. Thus, the decompressor |
65 | | * simply copies rawsize bytes to the destination. |
66 | | * |
67 | | * Otherwise the first byte tells what to do for each of the |
68 | | * next 8 items. We call this the control byte. |
69 | | * |
70 | | * An unset bit in the control byte means that one uncompressed |
71 | | * byte follows, which is copied from input to output. |
72 | | * |
73 | | * A set bit in the control byte means that a tag of 2-3 bytes |
74 | | * follows. A tag contains the information needed to copy bytes |
75 | | * that are already in the output buffer to the current location |
76 | | * in the output. Let's call the three tag bytes T1, T2 and T3. The |
77 | | * position of the data to copy is coded as an offset from the |
78 | | * current output position. |
79 | | * |
80 | | * The offset is in the upper nibble of T1 and in T2. |
81 | | * The length is in the lower nibble of T1. |
82 | | * |
83 | | * So the 16 bits of a 2 byte tag are coded as |
84 | | * |
85 | | * 7---T1--0 7---T2--0 |
86 | | * OOOO LLLL OOOO OOOO |
87 | | * |
88 | | * This limits the offset to 1-4095 (12 bits) and the length |
89 | | * to 3-18 (4 bits), because 3 is always added to the stored |
90 | | * value. Emitting a 2-byte tag for a 2-byte match would save |
91 | | * only one control bit, at the cost of one byte of maximum tag length. |
92 | | * |
93 | | * In the actual implementation, the 2 byte tag's length is |
94 | | * limited to 3-17, because the value 0xF in the length nibble |
95 | | * has a special meaning: the next byte (T3) is added to a |
96 | | * base length of 18. That makes the total limits |
97 | | * 1-4095 for offset and 3-273 for length. |
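| | * |
| | * A worked example (added in editing): the two-byte tag 0x4d 0x20 |
| | * decodes to length = (0x4d & 0x0f) + 3 = 16 and offset = |
| | * ((0x4d & 0xf0) << 4) | 0x20 = 0x420 = 1056, i.e. "copy 16 bytes |
| | * starting 1056 bytes back". With T1 = 0x4f the length nibble is |
| | * 0xF, so a third byte T3 follows and the length becomes 18 + T3. |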
98 | | * |
99 | | * Once we have successfully decoded a tag, we simply copy |
100 | | * the output that occurred <offset> bytes back to the current |
101 | | * output location for the specified <length>. Thus, a |
102 | | * sequence of 200 spaces (think about bpchar fields) could be |
103 | | * coded in 4 bytes: one literal space and a three byte tag to |
104 | | * copy 199 bytes with a -1 offset. Wow - that's a compression |
105 | | * rate of 98%! Well, the implementation needs to save the |
106 | | * original data size too, so we need another 4 bytes for it |
107 | | * and end up with a total compression rate of 96%, which is |
108 | | * still worth a wow. |
109 | | * |
110 | | * The compression algorithm |
111 | | * |
112 | | * The following description uses the numbers of the default strategy. |
113 | | * |
114 | | * The compressor works best for attributes of a size between |
115 | | * 1K and 1M. For smaller items there's not that much chance of |
116 | | * redundancy in the character sequence (except for large areas |
117 | | * of identical bytes like trailing spaces) and for bigger ones |
118 | | * our 4K maximum look-back distance is too small. |
119 | | * |
120 | | * The compressor creates a table for lists of positions. |
121 | | * For each input position (except the last 3), a hash key is |
122 | | * built from the next 4 input bytes and the position remembered |
123 | | * in the appropriate list. Thus, the table points to linked |
124 | | * lists of strings that are likely to match in at least their |
125 | | * first 4 characters. This is done on the fly while the input |
126 | | * is compressed into the output area. Table entries are only |
127 | | * kept for the last 4096 input positions, since we cannot use |
128 | | * back-pointers larger than that anyway. The size of the hash |
129 | | * table is chosen based on the size of the input - a larger table |
130 | | * has a larger startup cost, as it needs to be initialized to |
131 | | * zero, but reduces the number of hash collisions on long inputs. |
132 | | * |
133 | | * For each byte in the input, its hash key (built from this |
134 | | * byte and the next 3) is used to find the appropriate list |
135 | | * in the table. The lists remember the positions of all bytes |
136 | | * that had the same hash key in the past in increasing backward |
137 | | * offset order. Now for all entries in the used lists, the |
138 | | * match length is computed by comparing the characters at the |
139 | | * entry's position with the characters at the current input |
140 | | * position. |
141 | | * |
142 | | * The compressor starts with a so-called "good_match" of 128. |
143 | | * It is a "prefer speed against compression ratio" optimizer. |
144 | | * So if the first entry looked at already has 128 or more |
145 | | * matching characters, the lookup stops and that position is |
146 | | * used for the next tag in the output. |
147 | | * |
148 | | * For each subsequent entry in the history list, the "good_match" |
149 | | * is lowered by 10% (128, 116, 105, 95, ...). So the compressor |
150 | | * will be happier with short matches the further it has to go back |
151 | | * in the history. Another "speed against ratio" preference |
152 | | * characteristic of the algorithm. |
153 | | * |
154 | | * Thus there are 3 stop conditions for the lookup of matches: |
155 | | * |
156 | | * - a match >= good_match is found |
157 | | * - there are no more history entries to look at |
158 | | * - the next history entry is already too far back |
159 | | * to be coded into a tag. |
160 | | * |
161 | | * Finally the match algorithm checks that a match of at least |
162 | | * 3 bytes has been found, because that is the smallest |
163 | | * amount of copy information a tag can encode. If so, a tag |
164 | | * is emitted and the input bytes covered by it are merely |
165 | | * scanned for history additions; otherwise a literal character |
166 | | * is emitted and only its history entry is added. |
167 | | * |
168 | | * Acknowledgments: |
169 | | * |
170 | | * Many thanks to Adisak Pochanayon, whose article about SLZ |
171 | | * inspired me to write the PostgreSQL compression this way. |
172 | | * |
173 | | * Jan Wieck |
174 | | * |
175 | | * Copyright (c) 1999-2025, PostgreSQL Global Development Group |
176 | | * |
177 | | * src/common/pg_lzcompress.c |
178 | | * ---------- |
179 | | */ |
180 | | #ifndef FRONTEND |
181 | | #include "postgres.h" |
182 | | #else |
183 | | #include "postgres_fe.h" |
184 | | #endif |
185 | | |
186 | | #include <limits.h> |
187 | | |
188 | | #include "common/pg_lzcompress.h" |
189 | | |
190 | | |
191 | | /* ---------- |
192 | | * Local definitions |
193 | | * ---------- |
194 | | */ |
195 | | #define PGLZ_MAX_HISTORY_LISTS 8192 /* must be power of 2 */ |
196 | 0 | #define PGLZ_HISTORY_SIZE 4096 |
197 | 0 | #define PGLZ_MAX_MATCH 273 |
198 | | |
199 | | |
200 | | /* ---------- |
201 | | * PGLZ_HistEntry - |
202 | | * |
203 | | * Linked list for the backward history lookup |
204 | | * |
205 | | * All the entries sharing a hash key are linked in a doubly linked list. |
206 | | * This makes it easy to remove an entry when it's time to recycle it |
207 | | * (because it's more than 4K positions old). |
208 | | * ---------- |
209 | | */ |
210 | | typedef struct PGLZ_HistEntry |
211 | | { |
212 | | struct PGLZ_HistEntry *next; /* links for my hash key's list */ |
213 | | struct PGLZ_HistEntry *prev; |
214 | | int hindex; /* my current hash key */ |
215 | | const char *pos; /* my input position */ |
216 | | } PGLZ_HistEntry; |
217 | | |
218 | | |
219 | | /* ---------- |
220 | | * The provided standard strategies |
221 | | * ---------- |
222 | | */ |
223 | | static const PGLZ_Strategy strategy_default_data = { |
224 | | 32, /* Data chunks less than 32 bytes are not |
225 | | * compressed */ |
226 | | INT_MAX, /* No upper limit on what we'll try to |
227 | | * compress */ |
228 | | 25, /* Require 25% compression rate, or not worth |
229 | | * it */ |
230 | | 1024, /* Give up if no compression in the first 1KB */ |
231 | | 128, /* Stop history lookup if a match of 128 bytes |
232 | | * is found */ |
233 | | 10 /* Lower good match size by 10% at every loop |
234 | | * iteration */ |
235 | | }; |
236 | | const PGLZ_Strategy *const PGLZ_strategy_default = &strategy_default_data; |
237 | | |
238 | | |
239 | | static const PGLZ_Strategy strategy_always_data = { |
240 | | 0, /* Chunks of any size are compressed */ |
241 | | INT_MAX, |
242 | | 0, /* It's enough to save one single byte */ |
243 | | INT_MAX, /* Never give up early */ |
244 | | 128, /* Stop history lookup if a match of 128 bytes |
245 | | * is found */ |
246 | | 6 /* Look harder for a good match */ |
247 | | }; |
248 | | const PGLZ_Strategy *const PGLZ_strategy_always = &strategy_always_data; |
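| | |
| | /* |
| | * An editor's sketch (not in the original file): callers may supply |
| | * their own strategy. The initializers above fill the PGLZ_Strategy |
| | * fields in declaration order: min_input_size, max_input_size, |
| | * min_comp_rate, first_success_by, match_size_good and |
| | * match_size_drop (see common/pg_lzcompress.h). A hypothetical |
| | * speed-leaning strategy could look like: |
| | * |
| | * static const PGLZ_Strategy strategy_fast_data = { |
| | * 256, INT_MAX, 10, 512, 64, 20 |
| | * }; |
| | * |
| | * pglz_compress(source, slen, dest, &strategy_fast_data); |
| | */ |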
249 | | |
250 | | |
251 | | /* ---------- |
252 | | * Statically allocated work arrays for history |
253 | | * ---------- |
254 | | */ |
255 | | static int16 hist_start[PGLZ_MAX_HISTORY_LISTS]; |
256 | | static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE + 1]; |
257 | | |
258 | | /* |
259 | | * Element 0 in hist_entries is unused, and means 'invalid'. Likewise, |
260 | | * INVALID_ENTRY_PTR in next/prev pointers mean 'invalid'. |
261 | | */ |
262 | 0 | #define INVALID_ENTRY 0 |
263 | 0 | #define INVALID_ENTRY_PTR (&hist_entries[INVALID_ENTRY]) |
264 | | |
265 | | /* ---------- |
266 | | * pglz_hist_idx - |
267 | | * |
268 | | * Computes the history table slot for the lookup by the next 4 |
269 | | * characters in the input. |
270 | | * |
271 | | * NB: because we use the next 4 characters, we are not guaranteed to |
272 | | * find 3-character matches; they very possibly will be in the wrong |
273 | | * hash list. This seems an acceptable tradeoff for spreading out the |
274 | | * hash keys more. |
275 | | * ---------- |
276 | | */ |
277 | 0 | #define pglz_hist_idx(_s,_e, _mask) ( \ |
278 | 0 | ((((_e) - (_s)) < 4) ? (int) (_s)[0] : \ |
279 | 0 | (((_s)[0] << 6) ^ ((_s)[1] << 4) ^ \ |
280 | 0 | ((_s)[2] << 2) ^ (_s)[3])) & (_mask) \ |
281 | 0 | ) |
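| | |
| | /* |
| | * Worked example (added in editing): for the input bytes "abcd" |
| | * (0x61 0x62 0x63 0x64) and an 8192-slot table (mask 0x1fff), the |
| | * slot is ((0x61 << 6) ^ (0x62 << 4) ^ (0x63 << 2) ^ 0x64) & 0x1fff |
| | * = 0x1f88. With fewer than 4 bytes of input left, only the first |
| | * byte is used. |
| | */ |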
282 | | |
283 | | |
284 | | /* ---------- |
285 | | * pglz_hist_add - |
286 | | * |
287 | | * Adds a new entry to the history table. |
288 | | * |
289 | | * If _recycle is true, then we are recycling a previously used entry, |
290 | | * and must first delink it from its old hashcode's linked list. |
291 | | * |
292 | | * NOTE: beware of multiple evaluations of macro's arguments, and note that |
293 | | * _hn and _recycle are modified in the macro. |
294 | | * ---------- |
295 | | */ |
296 | 0 | #define pglz_hist_add(_hs,_he,_hn,_recycle,_s,_e, _mask) \ |
297 | 0 | do { \ |
298 | 0 | int __hindex = pglz_hist_idx((_s),(_e), (_mask)); \ |
299 | 0 | int16 *__myhsp = &(_hs)[__hindex]; \ |
300 | 0 | PGLZ_HistEntry *__myhe = &(_he)[_hn]; \ |
301 | 0 | if (_recycle) { \ |
302 | 0 | if (__myhe->prev == NULL) \ |
303 | 0 | (_hs)[__myhe->hindex] = __myhe->next - (_he); \ |
304 | 0 | else \ |
305 | 0 | __myhe->prev->next = __myhe->next; \ |
306 | 0 | if (__myhe->next != NULL) \ |
307 | 0 | __myhe->next->prev = __myhe->prev; \ |
308 | 0 | } \ |
309 | 0 | __myhe->next = &(_he)[*__myhsp]; \ |
310 | 0 | __myhe->prev = NULL; \ |
311 | 0 | __myhe->hindex = __hindex; \ |
312 | 0 | __myhe->pos = (_s); \ |
313 | 0 | /* If there was an existing entry in this hash slot, link */ \ |
314 | 0 | /* this new entry to it. However, the 0th entry in the */ \ |
315 | 0 | /* entries table is unused, so we can freely scribble on it. */ \ |
316 | 0 | /* So don't bother checking if the slot was used - we'll */ \ |
317 | 0 | /* scribble on the unused entry if it was not, but that's */ \ |
318 | 0 | /* harmless. Avoiding the branch in this critical path */ \ |
319 | 0 | /* speeds this up a little bit. */ \ |
320 | 0 | /* if (*__myhsp != INVALID_ENTRY) */ \ |
321 | 0 | (_he)[(*__myhsp)].prev = __myhe; \ |
322 | 0 | *__myhsp = _hn; \ |
323 | 0 | if (++(_hn) >= PGLZ_HISTORY_SIZE + 1) { \ |
324 | 0 | (_hn) = 1; \ |
325 | 0 | (_recycle) = true; \ |
326 | 0 | } \ |
327 | 0 | } while (0) |
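| | |
| | /* |
| | * Editor's note: _hn walks through entries 1..PGLZ_HISTORY_SIZE and |
| | * then wraps back to 1, setting _recycle. From that point on, every |
| | * add first unlinks the entry it is about to overwrite from its old |
| | * hash list, which is what limits the history to the most recent |
| | * 4096 input positions. |
| | */ |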
328 | | |
329 | | |
330 | | /* ---------- |
331 | | * pglz_out_ctrl - |
332 | | * |
333 | | * Writes out the finished control byte and allocates a new one if needed. |
334 | | * ---------- |
335 | | */ |
336 | 0 | #define pglz_out_ctrl(__ctrlp,__ctrlb,__ctrl,__buf) \ |
337 | 0 | do { \ |
338 | 0 | if ((__ctrl & 0xff) == 0) \ |
339 | 0 | { \ |
340 | 0 | *(__ctrlp) = __ctrlb; \ |
341 | 0 | __ctrlp = (__buf)++; \ |
342 | 0 | __ctrlb = 0; \ |
343 | 0 | __ctrl = 1; \ |
344 | 0 | } \ |
345 | 0 | } while (0) |
346 | | |
347 | | |
348 | | /* ---------- |
349 | | * pglz_out_literal - |
350 | | * |
351 | | * Outputs a literal byte to the destination buffer including the |
352 | | * appropriate control bit. |
353 | | * ---------- |
354 | | */ |
355 | 0 | #define pglz_out_literal(_ctrlp,_ctrlb,_ctrl,_buf,_byte) \ |
356 | 0 | do { \ |
357 | 0 | pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \ |
358 | 0 | *(_buf)++ = (unsigned char)(_byte); \ |
359 | 0 | _ctrl <<= 1; \ |
360 | 0 | } while (0) |
361 | | |
362 | | |
363 | | /* ---------- |
364 | | * pglz_out_tag - |
365 | | * |
366 | | * Outputs a backward reference tag of 2-4 bytes (depending on |
367 | | * offset and length) to the destination buffer including the |
368 | | * appropriate control bit. |
369 | | * ---------- |
370 | | */ |
371 | 0 | #define pglz_out_tag(_ctrlp,_ctrlb,_ctrl,_buf,_len,_off) \ |
372 | 0 | do { \ |
373 | 0 | pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \ |
374 | 0 | _ctrlb |= _ctrl; \ |
375 | 0 | _ctrl <<= 1; \ |
376 | 0 | if (_len > 17) \ |
377 | 0 | { \ |
378 | 0 | (_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | 0x0f); \ |
379 | 0 | (_buf)[1] = (unsigned char)(((_off) & 0xff)); \ |
380 | 0 | (_buf)[2] = (unsigned char)((_len) - 18); \ |
381 | 0 | (_buf) += 3; \ |
382 | 0 | } else { \ |
383 | 0 | (_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | ((_len) - 3)); \ |
384 | 0 | (_buf)[1] = (unsigned char)((_off) & 0xff); \ |
385 | 0 | (_buf) += 2; \ |
386 | 0 | } \ |
387 | 0 | } while (0) |
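| | |
| | /* |
| | * Worked example (added in editing): the 199-byte, offset-1 copy |
| | * from the header comment uses the long form (_len > 17) and emits |
| | * 0x0f (offset high nibble 0 plus the 0x0f length marker), 0x01 |
| | * (the low offset byte) and 0xb5 (199 - 18 = 181). |
| | */ |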
388 | | |
389 | | |
390 | | /* ---------- |
391 | | * pglz_find_match - |
392 | | * |
393 | | * Searches the history table for a match between the current input |
394 | | * position and a sequence of characters starting somewhere earlier |
395 | | * in the input buffer. |
396 | | * ---------- |
397 | | */ |
398 | | static inline int |
399 | | pglz_find_match(int16 *hstart, const char *input, const char *end, |
400 | | int *lenp, int *offp, int good_match, int good_drop, int mask) |
401 | 0 | { |
402 | 0 | PGLZ_HistEntry *hent; |
403 | 0 | int16 hentno; |
404 | 0 | int32 len = 0; |
405 | 0 | int32 off = 0; |
406 | | |
407 | | /* |
408 | | * Traverse the linked history list until a good enough match is found. |
409 | | */ |
410 | 0 | hentno = hstart[pglz_hist_idx(input, end, mask)]; |
411 | 0 | hent = &hist_entries[hentno]; |
412 | 0 | while (hent != INVALID_ENTRY_PTR) |
413 | 0 | { |
414 | 0 | const char *ip = input; |
415 | 0 | const char *hp = hent->pos; |
416 | 0 | int32 thisoff; |
417 | 0 | int32 thislen; |
418 | | |
419 | | /* |
420 | | * Stop if the offset does not fit into our tag anymore. |
421 | | */ |
422 | 0 | thisoff = ip - hp; |
423 | 0 | if (thisoff >= 0x0fff) |
424 | 0 | break; |
425 | | |
426 | | /* |
427 | | * Determine length of match. A better match must be larger than the |
428 | | * best so far. And if we already have a match of 16 or more bytes, |
429 | | * it's worth the call overhead to use memcmp() to check whether this |
430 | | * match reaches the same size. After that we must fall back to |
431 | | * character-by-character comparison to find the exact position where |
432 | | * the mismatch occurs. |
433 | | */ |
434 | 0 | thislen = 0; |
435 | 0 | if (len >= 16) |
436 | 0 | { |
437 | 0 | if (memcmp(ip, hp, len) == 0) |
438 | 0 | { |
439 | 0 | thislen = len; |
440 | 0 | ip += len; |
441 | 0 | hp += len; |
442 | 0 | while (ip < end && *ip == *hp && thislen < PGLZ_MAX_MATCH) |
443 | 0 | { |
444 | 0 | thislen++; |
445 | 0 | ip++; |
446 | 0 | hp++; |
447 | 0 | } |
448 | 0 | } |
449 | 0 | } |
450 | 0 | else |
451 | 0 | { |
452 | 0 | while (ip < end && *ip == *hp && thislen < PGLZ_MAX_MATCH) |
453 | 0 | { |
454 | 0 | thislen++; |
455 | 0 | ip++; |
456 | 0 | hp++; |
457 | 0 | } |
458 | 0 | } |
459 | | |
460 | | /* |
461 | | * Remember this match as the best (if it is) |
462 | | */ |
463 | 0 | if (thislen > len) |
464 | 0 | { |
465 | 0 | len = thislen; |
466 | 0 | off = thisoff; |
467 | 0 | } |
468 | | |
469 | | /* |
470 | | * Advance to the next history entry |
471 | | */ |
472 | 0 | hent = hent->next; |
473 | | |
474 | | /* |
475 | | * Accept lesser matches the more entries we have visited. But |
476 | | * there's no point in doing the calculation at the end of the list. |
477 | | */ |
478 | 0 | if (hent != INVALID_ENTRY_PTR) |
479 | 0 | { |
480 | 0 | if (len >= good_match) |
481 | 0 | break; |
482 | 0 | good_match -= (good_match * good_drop) / 100; |
483 | 0 | } |
484 | 0 | } |
485 | | |
486 | | /* |
487 | | * Return match information only if it results in at least a one-byte |
488 | | * reduction. |
489 | | */ |
490 | 0 | if (len > 2) |
491 | 0 | { |
492 | 0 | *lenp = len; |
493 | 0 | *offp = off; |
494 | 0 | return 1; |
495 | 0 | } |
496 | | |
497 | 0 | return 0; |
498 | 0 | } |
499 | | |
500 | | |
501 | | /* ---------- |
502 | | * pglz_compress - |
503 | | * |
504 | | * Compresses source into dest using strategy. Returns the number of |
505 | | * bytes written in buffer dest, or -1 if compression fails. |
506 | | * ---------- |
507 | | */ |
508 | | int32 |
509 | | pglz_compress(const char *source, int32 slen, char *dest, |
510 | | const PGLZ_Strategy *strategy) |
511 | 0 | { |
512 | 0 | unsigned char *bp = (unsigned char *) dest; |
513 | 0 | unsigned char *bstart = bp; |
514 | 0 | int hist_next = 1; |
515 | 0 | bool hist_recycle = false; |
516 | 0 | const char *dp = source; |
517 | 0 | const char *dend = source + slen; |
518 | 0 | unsigned char ctrl_dummy = 0; |
519 | 0 | unsigned char *ctrlp = &ctrl_dummy; |
520 | 0 | unsigned char ctrlb = 0; |
521 | 0 | unsigned char ctrl = 0; |
522 | 0 | bool found_match = false; |
523 | 0 | int32 match_len; |
524 | 0 | int32 match_off; |
525 | 0 | int32 good_match; |
526 | 0 | int32 good_drop; |
527 | 0 | int32 result_size; |
528 | 0 | int32 result_max; |
529 | 0 | int32 need_rate; |
530 | 0 | int hashsz; |
531 | 0 | int mask; |
532 | | |
533 | | /* |
534 | | * Our fallback strategy is the default. |
535 | | */ |
536 | 0 | if (strategy == NULL) |
537 | 0 | strategy = PGLZ_strategy_default; |
538 | | |
539 | | /* |
540 | | * If the strategy forbids compression (entirely, or because the source |
541 | | * size is out of range), fail. |
542 | | */ |
543 | 0 | if (strategy->match_size_good <= 0 || |
544 | 0 | slen < strategy->min_input_size || |
545 | 0 | slen > strategy->max_input_size) |
546 | 0 | return -1; |
547 | | |
548 | | /* |
549 | | * Limit the match parameters to the supported range. |
550 | | */ |
551 | 0 | good_match = strategy->match_size_good; |
552 | 0 | if (good_match > PGLZ_MAX_MATCH) |
553 | 0 | good_match = PGLZ_MAX_MATCH; |
554 | 0 | else if (good_match < 17) |
555 | 0 | good_match = 17; |
556 | | |
557 | 0 | good_drop = strategy->match_size_drop; |
558 | 0 | if (good_drop < 0) |
559 | 0 | good_drop = 0; |
560 | 0 | else if (good_drop > 100) |
561 | 0 | good_drop = 100; |
562 | | |
563 | 0 | need_rate = strategy->min_comp_rate; |
564 | 0 | if (need_rate < 0) |
565 | 0 | need_rate = 0; |
566 | 0 | else if (need_rate > 99) |
567 | 0 | need_rate = 99; |
568 | | |
569 | | /* |
570 | | * Compute the maximum result size allowed by the strategy, namely the |
571 | | * input size minus the minimum wanted compression rate. This had better |
572 | | * be <= slen, else we might overrun the provided output buffer. |
573 | | */ |
574 | 0 | if (slen > (INT_MAX / 100)) |
575 | 0 | { |
576 | | /* Approximate to avoid overflow */ |
577 | 0 | result_max = (slen / 100) * (100 - need_rate); |
578 | 0 | } |
579 | 0 | else |
580 | 0 | result_max = (slen * (100 - need_rate)) / 100; |
581 | | |
582 | | /* |
583 | | * Experiments suggest that these hash sizes work pretty well. A large |
584 | | * hash table minimizes collisions, but has a higher startup cost. For a |
585 | | * small input, the startup cost dominates. The table size must be a power |
586 | | * of two. |
587 | | */ |
588 | 0 | if (slen < 128) |
589 | 0 | hashsz = 512; |
590 | 0 | else if (slen < 256) |
591 | 0 | hashsz = 1024; |
592 | 0 | else if (slen < 512) |
593 | 0 | hashsz = 2048; |
594 | 0 | else if (slen < 1024) |
595 | 0 | hashsz = 4096; |
596 | 0 | else |
597 | 0 | hashsz = 8192; |
598 | 0 | mask = hashsz - 1; |
599 | | |
600 | | /* |
601 | | * Initialize the history lists to empty. We do not need to zero the |
602 | | * hist_entries[] array; its entries are initialized as they are used. |
603 | | */ |
604 | 0 | memset(hist_start, 0, hashsz * sizeof(int16)); |
605 | | |
606 | | /* |
607 | | * Compress the source directly into the output buffer. |
608 | | */ |
609 | 0 | while (dp < dend) |
610 | 0 | { |
611 | | /* |
612 | | * If we already exceeded the maximum result size, fail. |
613 | | * |
614 | | * We check once per loop; since the loop body could emit as many as 4 |
615 | | * bytes (a control byte and 3-byte tag), PGLZ_MAX_OUTPUT() had better |
616 | | * allow 4 slop bytes. |
617 | | */ |
618 | 0 | if (bp - bstart >= result_max) |
619 | 0 | return -1; |
620 | | |
621 | | /* |
622 | | * If we've emitted more than first_success_by bytes without finding |
623 | | * anything compressible at all, fail. This lets us fall out |
624 | | * reasonably quickly when looking at incompressible input (such as |
625 | | * pre-compressed data). |
626 | | */ |
627 | 0 | if (!found_match && bp - bstart >= strategy->first_success_by) |
628 | 0 | return -1; |
629 | | |
630 | | /* |
631 | | * Try to find a match in the history |
632 | | */ |
633 | 0 | if (pglz_find_match(hist_start, dp, dend, &match_len, |
634 | 0 | &match_off, good_match, good_drop, mask)) |
635 | 0 | { |
636 | | /* |
637 | | * Create the tag and add history entries for all matched |
638 | | * characters. |
639 | | */ |
640 | 0 | pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off); |
641 | 0 | while (match_len--) |
642 | 0 | { |
643 | 0 | pglz_hist_add(hist_start, hist_entries, |
644 | 0 | hist_next, hist_recycle, |
645 | 0 | dp, dend, mask); |
646 | 0 | dp++; /* Do not do this ++ in the line above! */ |
647 | | /* The macro would do it four times - Jan. */ |
648 | 0 | } |
649 | 0 | found_match = true; |
650 | 0 | } |
651 | 0 | else |
652 | 0 | { |
653 | | /* |
654 | | * No match found. Copy one literal byte. |
655 | | */ |
656 | 0 | pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp); |
657 | 0 | pglz_hist_add(hist_start, hist_entries, |
658 | 0 | hist_next, hist_recycle, |
659 | 0 | dp, dend, mask); |
660 | 0 | dp++; /* Do not do this ++ in the line above! */ |
661 | | /* The macro would do it four times - Jan. */ |
662 | 0 | } |
663 | 0 | } |
664 | | |
665 | | /* |
666 | | * Write out the last control byte and check that we haven't overrun the |
667 | | * output size allowed by the strategy. |
668 | | */ |
669 | 0 | *ctrlp = ctrlb; |
670 | 0 | result_size = bp - bstart; |
671 | 0 | if (result_size >= result_max) |
672 | 0 | return -1; |
673 | | |
674 | | /* success */ |
675 | 0 | return result_size; |
676 | 0 | } |
677 | | |
678 | | |
679 | | /* ---------- |
680 | | * pglz_decompress - |
681 | | * |
682 | | * Decompresses source into dest. Returns the number of bytes |
683 | | * decompressed into the destination buffer, or -1 if the |
684 | | * compressed data is corrupted. |
685 | | * |
686 | | * If check_complete is true, the data is considered corrupted |
687 | | * if we don't exactly fill the destination buffer. Callers that |
688 | | * are extracting a slice typically can't apply this check. |
689 | | * ---------- |
690 | | */ |
691 | | int32 |
692 | | pglz_decompress(const char *source, int32 slen, char *dest, |
693 | | int32 rawsize, bool check_complete) |
694 | 0 | { |
695 | 0 | const unsigned char *sp; |
696 | 0 | const unsigned char *srcend; |
697 | 0 | unsigned char *dp; |
698 | 0 | unsigned char *destend; |
699 | | |
700 | 0 | sp = (const unsigned char *) source; |
701 | 0 | srcend = ((const unsigned char *) source) + slen; |
702 | 0 | dp = (unsigned char *) dest; |
703 | 0 | destend = dp + rawsize; |
704 | | |
705 | 0 | while (sp < srcend && dp < destend) |
706 | 0 | { |
707 | | /* |
708 | | * Read one control byte and process the next 8 items (or as many as |
709 | | * remain in the compressed input). |
710 | | */ |
711 | 0 | unsigned char ctrl = *sp++; |
712 | 0 | int ctrlc; |
713 | | |
714 | 0 | for (ctrlc = 0; ctrlc < 8 && sp < srcend && dp < destend; ctrlc++) |
715 | 0 | { |
716 | 0 | if (ctrl & 1) |
717 | 0 | { |
718 | | /* |
719 | | * Set control bit means we must read a match tag. The match |
720 | | * is coded with two bytes. First byte uses lower nibble to |
721 | | * code length - 3. Higher nibble contains upper 4 bits of the |
722 | | * offset. The next following byte contains the lower 8 bits |
723 | | * offset. The following byte contains the lower 8 bits |
724 | | * extension tag byte tells how much longer the match really |
725 | | * was (0-255). |
726 | | */ |
727 | 0 | int32 len; |
728 | 0 | int32 off; |
729 | | |
730 | 0 | len = (sp[0] & 0x0f) + 3; |
731 | 0 | off = ((sp[0] & 0xf0) << 4) | sp[1]; |
732 | 0 | sp += 2; |
733 | 0 | if (len == 18) |
734 | 0 | len += *sp++; |
735 | | |
736 | | /* |
737 | | * Check for corrupt data: if we fell off the end of the |
738 | | * source, or if we obtained off = 0, or if off is more than |
739 | | * the distance back to the buffer start, we have problems. |
740 | | * (We must check for off = 0, else we risk an infinite loop |
741 | | * below in the face of corrupt data. Likewise, the upper |
742 | | * limit on off prevents accessing outside the buffer |
743 | | * boundaries.) |
744 | | */ |
745 | 0 | if (unlikely(sp > srcend || off == 0 || |
746 | 0 | off > (dp - (unsigned char *) dest))) |
747 | 0 | return -1; |
748 | | |
749 | | /* |
750 | | * Don't emit more data than requested. |
751 | | */ |
752 | 0 | len = Min(len, destend - dp); |
753 | | |
754 | | /* |
755 | | * Now we copy the bytes specified by the tag from OUTPUT to |
756 | | * OUTPUT (copy len bytes from dp - off to dp). The copied |
757 | | * areas could overlap, so to avoid undefined behavior in |
758 | | * memcpy(), be careful to copy only non-overlapping regions. |
759 | | * |
760 | | * Note that we cannot use memmove() instead, since while its |
761 | | * behavior is well-defined, it's also not what we want. |
762 | | */ |
763 | 0 | while (off < len) |
764 | 0 | { |
765 | | /* |
766 | | * We can safely copy "off" bytes since that clearly |
767 | | * results in non-overlapping source and destination. |
768 | | */ |
769 | 0 | memcpy(dp, dp - off, off); |
770 | 0 | len -= off; |
771 | 0 | dp += off; |
772 | | |
773 | | /*---------- |
774 | | * This bit is less obvious: we can double "off" after |
775 | | * each such step. Consider this raw input: |
776 | | * 112341234123412341234 |
777 | | * This will be encoded as 5 literal bytes "11234" and |
778 | | * then a match tag with length 16 and offset 4. After |
779 | | * memcpy'ing the first 4 bytes, we will have emitted |
780 | | * 112341234 |
781 | | * so we can double "off" to 8, then after the next step |
782 | | * we have emitted |
783 | | * 11234123412341234 |
784 | | * Then we can double "off" again, after which it is more |
785 | | * than the remaining "len" so we fall out of this loop |
786 | | * and finish with a non-overlapping copy of the |
787 | | * remainder. In general, a match tag with off < len |
788 | | * implies that the decoded data has a repeat length of |
789 | | * "off". We can handle 1, 2, 4, etc repetitions of the |
790 | | * repeated string per memcpy until we get to a situation |
791 | | * where the final copy step is non-overlapping. |
792 | | * |
793 | | * (Another way to understand this is that we are keeping |
794 | | * the copy source point dp - off the same throughout.) |
795 | | *---------- |
796 | | */ |
797 | 0 | off += off; |
798 | 0 | } |
799 | 0 | memcpy(dp, dp - off, len); |
800 | 0 | dp += len; |
801 | 0 | } |
802 | 0 | else |
803 | 0 | { |
804 | | /* |
805 | | * An unset control bit means LITERAL BYTE. So we just copy |
806 | | * one from INPUT to OUTPUT. |
807 | | */ |
808 | 0 | *dp++ = *sp++; |
809 | 0 | } |
810 | | |
811 | | /* |
812 | | * Advance the control bit |
813 | | */ |
814 | 0 | ctrl >>= 1; |
815 | 0 | } |
816 | 0 | } |
817 | | |
818 | | /* |
819 | | * If requested, check we decompressed the right amount. |
820 | | */ |
821 | 0 | if (check_complete && (dp != destend || sp != srcend)) |
822 | 0 | return -1; |
823 | | |
824 | | /* |
825 | | * That's it. |
826 | | */ |
827 | 0 | return (char *) dp - dest; |
828 | 0 | } |
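| | |
| | /* |
| | * Worked example (added in editing): the 5-byte input |
| | * 0x02 0x20 0x0f 0x01 0xb5 decompresses to 200 spaces. The control |
| | * byte 0x02 says "literal, then tag"; the literal emits ' ' (0x20) |
| | * and the tag 0x0f 0x01 0xb5 copies len = 18 + 0xb5 = 199 bytes |
| | * from offset 1, doubling "off" as described above. |
| | */ |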
829 | | |
830 | | |
831 | | /* ---------- |
832 | | * pglz_maximum_compressed_size - |
833 | | * |
834 | | * Calculate the maximum compressed size for a given amount of raw data. |
835 | | * Returns the maximum size, or the total compressed size if that is |
836 | | * smaller than the maximum size. |
837 | | * |
838 | | * We can't use PGLZ_MAX_OUTPUT for this purpose, because that's used to size |
839 | | * the compression buffer (and abort the compression). It does not really say |
840 | | * what the maximum compressed size is for an input of a given length, and it |
841 | | * may happen that while the whole value is compressible (and thus fits into |
842 | | * PGLZ_MAX_OUTPUT nicely), the prefix is not compressible at all. |
843 | | * ---------- |
844 | | */ |
845 | | int32 |
846 | | pglz_maximum_compressed_size(int32 rawsize, int32 total_compressed_size) |
847 | 0 | { |
848 | 0 | int64 compressed_size; |
849 | | |
850 | | /* |
851 | | * pglz uses one control bit per byte, so if the entire desired prefix is |
852 | | * represented as literal bytes, we'll need (rawsize * 9) bits. We care |
853 | | * about bytes though, so be sure to round up not down. |
854 | | * |
855 | | * Use int64 here to prevent overflow during calculation. |
856 | | */ |
857 | 0 | compressed_size = ((int64) rawsize * 9 + 7) / 8; |
858 | | |
859 | | /* |
860 | | * The above fails to account for a corner case: we could have compressed |
861 | | * data that starts with N-1 or N-2 literal bytes and then has a match tag |
862 | | * of 2 or 3 bytes. It's therefore possible that we need to fetch 1 or 2 |
863 | | * more bytes in order to have the whole match tag. (Match tags earlier |
864 | | * in the compressed data don't cause a problem, since they should |
865 | | * represent more decompressed bytes than they occupy themselves.) |
866 | | */ |
867 | 0 | compressed_size += 2; |
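| | |
| | /* |
| | * Worked example (added in editing): rawsize = 1024 gives |
| | * (1024 * 9 + 7) / 8 + 2 = 1152 + 2 = 1154 bytes. |
| | */ |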
868 | | |
869 | | /* |
870 | | * Maximum compressed size can't be larger than total compressed size. |
871 | | * (This also ensures that our result fits in int32.) |
872 | | */ |
873 | 0 | compressed_size = Min(compressed_size, total_compressed_size); |
874 | | |
875 | 0 | return (int32) compressed_size; |
876 | 0 | } |