Coverage Report

Created: 2025-08-28 07:58

/src/duckdb/third_party/snappy/snappy.cc
Line | Count | Source
1
// Copyright 2005 Google Inc. All Rights Reserved.
2
//
3
// Redistribution and use in source and binary forms, with or without
4
// modification, are permitted provided that the following conditions are
5
// met:
6
//
7
//     * Redistributions of source code must retain the above copyright
8
// notice, this list of conditions and the following disclaimer.
9
//     * Redistributions in binary form must reproduce the above
10
// copyright notice, this list of conditions and the following disclaimer
11
// in the documentation and/or other materials provided with the
12
// distribution.
13
//     * Neither the name of Google Inc. nor the names of its
14
// contributors may be used to endorse or promote products derived from
15
// this software without specific prior written permission.
16
//
17
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29
#include "snappy_version.hpp"
30
31
#if SNAPPY_NEW_VERSION
32
33
#include "snappy-internal.h"
34
#include "snappy-sinksource.h"
35
#include "snappy.h"
36
#if !defined(SNAPPY_HAVE_BMI2)
37
// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
38
// specifically, but it does define __AVX2__ when AVX2 support is available.
39
// Fortunately, AVX2 was introduced in Haswell, just like BMI2.
40
//
41
// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
42
// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
43
// case issuing BMI2 instructions results in a compiler error.
44
#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
45
#define SNAPPY_HAVE_BMI2 1
46
#else
47
#define SNAPPY_HAVE_BMI2 0
48
#endif
49
#endif  // !defined(SNAPPY_HAVE_BMI2)
50
51
#if !defined(SNAPPY_HAVE_X86_CRC32)
52
#if defined(__SSE4_2__)
53
#define SNAPPY_HAVE_X86_CRC32 1
54
#else
55
#define SNAPPY_HAVE_X86_CRC32 0
56
#endif
57
#endif  // !defined(SNAPPY_HAVE_X86_CRC32)
58
59
#if !defined(SNAPPY_HAVE_NEON_CRC32)
60
#if SNAPPY_HAVE_NEON && defined(__ARM_FEATURE_CRC32)
61
#define SNAPPY_HAVE_NEON_CRC32 1
62
#else
63
#define SNAPPY_HAVE_NEON_CRC32 0
64
#endif
65
#endif  // !defined(SNAPPY_HAVE_NEON_CRC32)
66
67
#if SNAPPY_HAVE_BMI2 || SNAPPY_HAVE_X86_CRC32
68
// Please do not replace with <x86intrin.h> or with headers that assume more
69
// advanced SSE versions without checking with all the OWNERS.
70
#include <immintrin.h>
71
#elif SNAPPY_HAVE_NEON_CRC32
72
#include <arm_acle.h>
73
#endif
74
75
#include <algorithm>
76
#include <array>
77
#include <cstddef>
78
#include <cstdint>
79
#include <cstdio>
80
#include <cstring>
81
#include <memory>
82
#include <string>
83
#include <utility>
84
#include <vector>
85
86
namespace duckdb_snappy {
87
88
namespace {
89
90
// The number of slop bytes writers use for unconditional copies.
91
constexpr int kSlopBytes = 64;
92
93
using internal::char_table;
94
using internal::COPY_1_BYTE_OFFSET;
95
using internal::COPY_2_BYTE_OFFSET;
96
using internal::COPY_4_BYTE_OFFSET;
97
using internal::kMaximumTagLength;
98
using internal::LITERAL;
99
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
100
using internal::V128;
101
using internal::V128_Load;
102
using internal::V128_LoadU;
103
using internal::V128_Shuffle;
104
using internal::V128_StoreU;
105
using internal::V128_DupChar;
106
#endif
107
108
// We translate the information encoded in a tag through a lookup table to a
109
// format that requires fewer instructions to decode. Effectively we store
110
// the length minus the tag part of the offset. The lowest significant byte
111
// thus stores the length. While total length - offset is given by
112
// entry - ExtractOffset(type). The nice thing is that the subtraction
113
// immediately sets the flags for the necessary check that offset >= length.
114
// This folds the cmp with sub. We engineer the long literals and copy-4 to
115
// always fail this check, so their presence doesn't affect the fast path.
116
// To prevent literals from triggering the guard against offset < length (offset
117
// does not apply to literals) the table is giving them a spurious offset of
118
// 256.
119
0
inline constexpr int16_t MakeEntry(int16_t len, int16_t offset) {
120
0
  return len - (offset << 8);
121
0
}
122
123
0
inline constexpr int16_t LengthMinusOffset(int data, int type) {
124
0
  return type == 3   ? 0xFF                    // copy-4 (or type == 3)
125
0
         : type == 2 ? MakeEntry(data + 1, 0)  // copy-2
126
0
         : type == 1 ? MakeEntry((data & 7) + 4, data >> 3)  // copy-1
127
0
         : data < 60 ? MakeEntry(data + 1, 1)  // note spurious offset.
128
0
                     : 0xFF;                   // long literal
129
0
}
130
131
0
inline constexpr int16_t LengthMinusOffset(uint8_t tag) {
132
0
  return LengthMinusOffset(tag >> 2, tag & 3);
133
0
}
134
135
template <size_t... Ints>
136
struct index_sequence {};
137
138
template <std::size_t N, size_t... Is>
139
struct make_index_sequence : make_index_sequence<N - 1, N - 1, Is...> {};
140
141
template <size_t... Is>
142
struct make_index_sequence<0, Is...> : index_sequence<Is...> {};
143
144
template <size_t... seq>
145
0
constexpr std::array<int16_t, 256> MakeTable(index_sequence<seq...>) {
146
0
  return std::array<int16_t, 256>{LengthMinusOffset(seq)...};
147
0
}
148
149
alignas(64) const std::array<int16_t, 256> kLengthMinusOffset =
150
    MakeTable(make_index_sequence<256>{});
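// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// A worked example of the packing described above, using the functions just
// defined. For a copy-1 tag the length is (data & 7) + 4 and the high three
// offset bits are data >> 3, so the entry's low byte recovers the length and
// subtracting the remaining low offset byte yields length - offset, whose
// sign is exactly the "offset >= length" check the decoder needs.
#include <cassert>
#include <cstdint>

void SketchCopy1TagEntry() {
  const int len = 7;         // copy-1 tags encode lengths 4..11
  const int offset = 0x234;  // 11-bit offset: high three bits live in the tag
  const uint8_t tag =
      static_cast<uint8_t>(((((offset >> 8) << 3) | (len - 4)) << 2) | 1);
  const int16_t entry = LengthMinusOffset(tag);
  assert((entry & 0xFF) == len);                    // low byte stores the length
  assert(entry - (offset & 0xFF) == len - offset);  // sign answers offset >= length
  (void)entry;
}
// --- end editorial sketch ---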
151
152
// Given a table of uint16_t whose size is mask / 2 + 1, return a pointer to the
153
// relevant entry, if any, for the given bytes.  Any hash function will do,
154
// but a good hash function reduces the number of collisions and thus yields
155
// better compression for compressible input.
156
//
157
// REQUIRES: mask is 2 * (table_size - 1), and table_size is a power of two.
158
0
inline uint16_t* TableEntry(uint16_t* table, uint32_t bytes, uint32_t mask) {
159
  // Our choice is quicker-and-dirtier than the typical hash function;
160
  // empirically, that seems beneficial.  The upper bits of kMagic * bytes are a
161
  // higher-quality hash than the lower bits, so when using kMagic * bytes we
162
  // also shift right to get a higher-quality end result.  There's no similar
163
  // issue with a CRC because all of the output bits of a CRC are equally good
164
  // "hashes." So, a CPU instruction for CRC, if available, tends to be a good
165
  // choice.
166
#if SNAPPY_HAVE_NEON_CRC32
167
  // We use mask as the second arg to the CRC function, as it's about to
168
  // be used anyway; it'd be equally correct to use 0 or some constant.
169
  // Mathematically, _mm_crc32_u32 (or similar) is a function of the
170
  // xor of its arguments.
171
  const uint32_t hash = __crc32cw(bytes, mask);
172
#elif SNAPPY_HAVE_X86_CRC32
173
  const uint32_t hash = _mm_crc32_u32(bytes, mask);
174
#else
175
0
  constexpr uint32_t kMagic = 0x1e35a7bd;
176
0
  const uint32_t hash = (kMagic * bytes) >> (31 - kMaxHashTableBits);
177
0
#endif
178
0
  return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
179
0
                                     (hash & mask));
180
0
}
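// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// How the REQUIRES clause above plays out: for a power-of-two table_size the
// mask 2 * (table_size - 1) is an even byte offset, so the pointer returned
// by TableEntry() stays uint16_t-aligned and lands inside the table. The
// table size below is an arbitrary value chosen for the sketch.
#include <cassert>
#include <cstdint>

void SketchTableEntryLookup() {
  constexpr uint32_t kSketchTableSize = 1 << 8;      // power of two (sketch value)
  uint16_t table[kSketchTableSize] = {};             // positions of earlier input
  const uint32_t mask = 2 * (kSketchTableSize - 1);  // byte-offset mask, bit 0 clear
  uint16_t* entry = TableEntry(table, /*bytes=*/0x64636261u, mask);
  assert(entry >= table && entry < table + kSketchTableSize);
  *entry = 42;  // remember that these four bytes were last seen at offset 42
}
// --- end editorial sketch ---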
181
182
inline uint16_t* TableEntry4ByteMatch(uint16_t* table, uint32_t bytes,
183
0
                                      uint32_t mask) {
184
0
  constexpr uint32_t kMagic = 2654435761U;
185
0
  const uint32_t hash = (kMagic * bytes) >> (32 - kMaxHashTableBits);
186
0
  return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
187
0
                                     (hash & mask));
188
0
}
189
190
inline uint16_t* TableEntry8ByteMatch(uint16_t* table, uint64_t bytes,
191
0
                                      uint32_t mask) {
192
0
  constexpr uint64_t kMagic = 58295818150454627ULL;
193
0
  const uint32_t hash = (kMagic * bytes) >> (64 - kMaxHashTableBits);
194
0
  return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
195
0
                                     (hash & mask));
196
0
}
197
198
}  // namespace
199
200
0
size_t MaxCompressedLength(size_t source_bytes) {
201
  // Compressed data can be defined as:
202
  //    compressed := item* literal*
203
  //    item       := literal* copy
204
  //
205
  // The trailing literal sequence has a space blowup of at most 62/60
206
  // since a literal of length 60 needs one tag byte + one extra byte
207
  // for length information.
208
  //
209
  // Item blowup is trickier to measure.  Suppose the "copy" op copies
210
  // 4 bytes of data.  Because of a special check in the encoding code,
211
  // we produce a 4-byte copy only if the offset is < 65536.  Therefore
212
  // the copy op takes 3 bytes to encode, and this type of item leads
213
  // to at most the 62/60 blowup for representing literals.
214
  //
215
  // Suppose the "copy" op copies 5 bytes of data.  If the offset is big
216
  // enough, it will take 5 bytes to encode the copy op.  Therefore the
217
  // worst case here is a one-byte literal followed by a five-byte copy.
218
  // I.e., 6 bytes of input turn into 7 bytes of "compressed" data.
219
  //
220
  // This last factor dominates the blowup, so the final estimate is:
221
0
  return 32 + source_bytes + source_bytes / 6;
222
0
}
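// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// Typical use of the bound derived above: size the destination buffer before
// compressing, then shrink it to the bytes actually produced.
// MaxCompressedLength() and RawCompress() are the public entry points declared
// in snappy.h; the buffer handling here is illustrative only.
#include <cstddef>
#include <vector>

void SketchCompressBuffer(const char* input, size_t input_len,
                          std::vector<char>* out) {
  out->resize(MaxCompressedLength(input_len));  // worst case: 32 + n + n/6
  size_t compressed_len = 0;
  RawCompress(input, input_len, out->data(), &compressed_len);
  out->resize(compressed_len);  // keep only the bytes actually written
}
// --- end editorial sketch ---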
223
224
namespace {
225
226
0
void UnalignedCopy64(const void* src, void* dst) {
227
0
  char tmp[8];
228
0
  std::memcpy(tmp, src, 8);
229
0
  std::memcpy(dst, tmp, 8);
230
0
}
231
232
0
void UnalignedCopy128(const void* src, void* dst) {
233
  // std::memcpy() gets vectorized when the appropriate compiler options are
234
  // used. For example, x86 compilers targeting SSE2+ will optimize to an SSE2
235
  // load and store.
236
0
  char tmp[16];
237
0
  std::memcpy(tmp, src, 16);
238
0
  std::memcpy(dst, tmp, 16);
239
0
}
240
241
template <bool use_16bytes_chunk>
242
0
inline void ConditionalUnalignedCopy128(const char* src, char* dst) {
243
0
  if (use_16bytes_chunk) {
244
0
    UnalignedCopy128(src, dst);
245
0
  } else {
246
0
    UnalignedCopy64(src, dst);
247
0
    UnalignedCopy64(src + 8, dst + 8);
248
0
  }
249
0
}
250
251
// Copy [src, src+(op_limit-op)) to [op, op_limit) a byte at a time. Used
252
// for handling COPY operations where the input and output regions may overlap.
253
// For example, suppose:
254
//    src       == "ab"
255
//    op        == src + 2
256
//    op_limit  == op + 20
257
// After IncrementalCopySlow(src, op, op_limit), the result will have eleven
258
// copies of "ab"
259
//    ababababababababababab
260
// Note that this does not match the semantics of either std::memcpy() or
261
// std::memmove().
262
inline char* IncrementalCopySlow(const char* src, char* op,
263
0
                                 char* const op_limit) {
264
  // TODO: Remove pragma when LLVM is aware this
265
  // function is only called in cold regions and when cold regions don't get
266
  // vectorized or unrolled.
267
0
#ifdef __clang__
268
0
#pragma clang loop unroll(disable)
269
0
#endif
270
0
  while (op < op_limit) {
271
0
    *op++ = *src++;
272
0
  }
273
0
  return op_limit;
274
0
}
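// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// The example from the comment above, spelled out: a 2-byte pattern copied 20
// bytes forward replicates itself, which is what neither std::memcpy() nor
// std::memmove() would produce here.
#include <cassert>
#include <cstring>

void SketchIncrementalCopySlow() {
  char buf[32] = "ab";                  // src == buf, op == buf + 2
  char* const op_limit = buf + 2 + 20;  // copy 20 bytes starting at buf + 2
  char* end = IncrementalCopySlow(buf, buf + 2, op_limit);
  assert(end == op_limit);
  (void)end;
  assert(std::memcmp(buf, "ababababababababababab", 22) == 0);  // eleven "ab"s
}
// --- end editorial sketch ---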
275
276
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
277
278
// Computes the bytes for shuffle control mask (please read comments on
279
// 'pattern_generation_masks' as well) for the given index_offset and
280
// pattern_size. For example, when the 'offset' is 6, it will generate a
281
// repeating pattern of size 6. So, the first 16 byte indexes will correspond to
282
// the pattern-bytes {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3} and the
283
// next 16 byte indexes will correspond to the pattern-bytes {4, 5, 0, 1, 2, 3,
284
// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
285
// calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and
286
// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.
287
template <size_t... indexes>
288
inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
289
    int index_offset, int pattern_size, index_sequence<indexes...>) {
290
  return {static_cast<char>((index_offset + indexes) % pattern_size)...};
291
}
292
293
// Computes the shuffle control mask bytes array for given pattern-sizes and
294
// returns an array.
295
template <size_t... pattern_sizes_minus_one>
296
inline constexpr std::array<std::array<char, sizeof(V128)>,
297
                            sizeof...(pattern_sizes_minus_one)>
298
MakePatternMaskBytesTable(int index_offset,
299
                          index_sequence<pattern_sizes_minus_one...>) {
300
  return {
301
      MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,
302
                           make_index_sequence</*indexes=*/sizeof(V128)>())...};
303
}
304
305
// This is an array of shuffle control masks that can be used as the source
306
// operand for PSHUFB to permute the contents of the destination XMM register
307
// into a repeating byte pattern.
308
alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
309
                                 16> pattern_generation_masks =
310
    MakePatternMaskBytesTable(
311
        /*index_offset=*/0,
312
        /*pattern_sizes_minus_one=*/make_index_sequence<16>());
313
314
// Similar to 'pattern_generation_masks', this table is used to "rotate" the
315
// pattern so that we can copy the *next 16 bytes* consistent with the pattern.
316
// Basically, pattern_reshuffle_masks is a continuation of
317
// pattern_generation_masks. It follows that pattern_reshuffle_masks is the same as
318
// pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
319
alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
320
                                 16> pattern_reshuffle_masks =
321
    MakePatternMaskBytesTable(
322
        /*index_offset=*/16,
323
        /*pattern_sizes_minus_one=*/make_index_sequence<16>());
324
325
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
326
static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
327
  V128 generation_mask = V128_Load(reinterpret_cast<const V128*>(
328
      pattern_generation_masks[pattern_size - 1].data()));
329
  // Uninitialized bytes are masked out by the shuffle mask.
330
  // TODO: remove annotation and macro defs once MSan is fixed.
331
  SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(src + pattern_size, 16 - pattern_size);
332
  return V128_Shuffle(V128_LoadU(reinterpret_cast<const V128*>(src)),
333
                      generation_mask);
334
}
335
336
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
337
static inline std::pair<V128 /* pattern */, V128 /* reshuffle_mask */>
338
LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
339
  V128 pattern = LoadPattern(src, pattern_size);
340
341
  // This mask will generate the next 16 bytes in-place. Doing so enables us to
342
  // write data by at most 4 V128_StoreU.
343
  //
344
  // For example, suppose pattern is:        abcdefabcdefabcd
345
  // Shuffling with this mask will generate: efabcdefabcdefab
346
  // Shuffling again will generate:          cdefabcdefabcdef
347
  V128 reshuffle_mask = V128_Load(reinterpret_cast<const V128*>(
348
      pattern_reshuffle_masks[pattern_size - 1].data()));
349
  return {pattern, reshuffle_mask};
350
}
351
352
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
353
354
// Fallback for when we need to copy while extending the pattern, for example
355
// copying 10 bytes from 3 positions back abc -> abcabcabcabca.
356
//
357
// REQUIRES: [dst - offset, dst + 64) is a valid address range.
358
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
359
0
static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
360
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
361
  if (SNAPPY_PREDICT_TRUE(offset <= 16)) {
362
    switch (offset) {
363
      case 0:
364
        return false;
365
      case 1: {
366
        // TODO: Ideally we should memset, move back once the
367
        // codegen issues are fixed.
368
        V128 pattern = V128_DupChar(dst[-1]);
369
        for (int i = 0; i < 4; i++) {
370
          V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
371
        }
372
        return true;
373
      }
374
      case 2:
375
      case 4:
376
      case 8:
377
      case 16: {
378
        V128 pattern = LoadPattern(dst - offset, offset);
379
        for (int i = 0; i < 4; i++) {
380
          V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
381
        }
382
        return true;
383
      }
384
      default: {
385
        auto pattern_and_reshuffle_mask =
386
            LoadPatternAndReshuffleMask(dst - offset, offset);
387
        V128 pattern = pattern_and_reshuffle_mask.first;
388
        V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
389
        for (int i = 0; i < 4; i++) {
390
          V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
391
          pattern = V128_Shuffle(pattern, reshuffle_mask);
392
        }
393
        return true;
394
      }
395
    }
396
  }
397
#else
398
0
  if (SNAPPY_PREDICT_TRUE(offset < 16)) {
399
0
    if (SNAPPY_PREDICT_FALSE(offset == 0)) return false;
400
    // Extend the pattern to the first 16 bytes.
401
    // The simpler formulation of `dst[i - offset]` induces undefined behavior.
402
0
    for (int i = 0; i < 16; i++) dst[i] = (dst - offset)[i];
403
    // Find a multiple of pattern >= 16.
404
0
    static std::array<uint8_t, 16> pattern_sizes = []() {
405
0
      std::array<uint8_t, 16> res;
406
0
      for (int i = 1; i < 16; i++) res[i] = (16 / i + 1) * i;
407
0
      return res;
408
0
    }();
409
0
    offset = pattern_sizes[offset];
410
0
    for (int i = 1; i < 4; i++) {
411
0
      std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
412
0
    }
413
0
    return true;
414
0
  }
415
0
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
416
417
  // Very rare.
418
0
  for (int i = 0; i < 4; i++) {
419
0
    std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
420
0
  }
421
0
  return true;
422
0
}
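// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// What the function above produces for a small offset: with "abc" sitting
// immediately before dst, the 64 written bytes continue the 3-byte pattern,
// the "abc -> abcabcabcabca" case from the comment. Only the offset bytes in
// front of dst need to hold meaningful data; buffer sizes are sketch values.
#include <cassert>
#include <cstring>

void SketchPatternExtension() {
  char buf[3 + 64];
  std::memcpy(buf, "abc", 3);
  char* dst = buf + 3;  // REQUIRES: [dst - 3, dst + 64) is a valid range
  const bool extended = Copy64BytesWithPatternExtension(dst, /*offset=*/3);
  assert(extended);
  (void)extended;
  assert(std::memcmp(buf, "abcabcabcabc", 12) == 0);  // pattern keeps repeating
}
// --- end editorial sketch ---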
423
424
// Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than
425
// IncrementalCopySlow. buf_limit is the address past the end of the writable
426
// region of the buffer.
427
inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
428
0
                             char* const buf_limit) {
429
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
430
  constexpr int big_pattern_size_lower_bound = 16;
431
#else
432
0
  constexpr int big_pattern_size_lower_bound = 8;
433
0
#endif
434
435
  // Terminology:
436
  //
437
  // slop = buf_limit - op
438
  // pat  = op - src
439
  // len  = op_limit - op
440
0
  assert(src < op);
441
0
  assert(op < op_limit);
442
0
  assert(op_limit <= buf_limit);
443
  // NOTE: The copy tags use 3 or 6 bits to store the copy length, so len <= 64.
444
0
  assert(op_limit - op <= 64);
445
  // NOTE: In practice the compressor always emits len >= 4, so it is ok to
446
  // assume that to optimize this function, but this is not guaranteed by the
447
  // compression format, so we have to also handle len < 4 in case the input
448
  // does not satisfy these conditions.
449
450
0
  size_t pattern_size = op - src;
451
  // The cases are split into different branches to allow the branch predictor,
452
  // FDO, and static prediction hints to work better. For each input we list the
453
  // ratio of invocations that match each condition.
454
  //
455
  // input        slop < 16   pat < 8  len > 16
456
  // ------------------------------------------
457
  // html|html4|cp   0%         1.01%    27.73%
458
  // urls            0%         0.88%    14.79%
459
  // jpg             0%        64.29%     7.14%
460
  // pdf             0%         2.56%    58.06%
461
  // txt[1-4]        0%         0.23%     0.97%
462
  // pb              0%         0.96%    13.88%
463
  // bin             0.01%     22.27%    41.17%
464
  //
465
  // It is very rare that we don't have enough slop for doing block copies. It
466
  // is also rare that we need to expand a pattern. Small patterns are common
467
  // for incompressible formats and for those we are plenty fast already.
468
  // Lengths are normally not greater than 16 but they vary depending on the
469
  // input. In general if we always predict len <= 16 it would be an ok
470
  // prediction.
471
  //
472
  // In order to be fast we want a pattern >= 16 bytes (or 8 bytes in non-SSE)
473
  // and an unrolled loop copying 1x 16 bytes (or 2x 8 bytes in non-SSE) at a
474
  // time.
475
476
  // Handle the uncommon case where pattern is less than 16 (or 8 in non-SSE)
477
  // bytes.
478
0
  if (pattern_size < big_pattern_size_lower_bound) {
479
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
480
    // Load the first eight bytes into a 128-bit XMM register, then use PSHUFB
481
    // to permute the register's contents in-place into a repeating sequence of
482
    // the first "pattern_size" bytes.
483
    // For example, suppose:
484
    //    src       == "abc"
485
    //    op        == src + 3
486
    // After V128_Shuffle(), "pattern" will have five copies of "abc"
487
    // followed by one byte of slop: abcabcabcabcabca.
488
    //
489
    // The non-SSE fallback implementation suffers from store-forwarding stalls
490
    // because its loads and stores partly overlap. By expanding the pattern
491
    // in-place, we avoid the penalty.
492
493
    // Typically, the op_limit is the gating factor so try to simplify the loop
494
    // based on that.
495
    if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
496
      auto pattern_and_reshuffle_mask =
497
          LoadPatternAndReshuffleMask(src, pattern_size);
498
      V128 pattern = pattern_and_reshuffle_mask.first;
499
      V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
500
501
      // There is at least one, and at most four 16-byte blocks. Writing four
502
      // conditionals instead of a loop allows FDO to layout the code with
503
      // respect to the actual probabilities of each length.
504
      // TODO: Replace with loop with trip count hint.
505
      V128_StoreU(reinterpret_cast<V128*>(op), pattern);
506
507
      if (op + 16 < op_limit) {
508
        pattern = V128_Shuffle(pattern, reshuffle_mask);
509
        V128_StoreU(reinterpret_cast<V128*>(op + 16), pattern);
510
      }
511
      if (op + 32 < op_limit) {
512
        pattern = V128_Shuffle(pattern, reshuffle_mask);
513
        V128_StoreU(reinterpret_cast<V128*>(op + 32), pattern);
514
      }
515
      if (op + 48 < op_limit) {
516
        pattern = V128_Shuffle(pattern, reshuffle_mask);
517
        V128_StoreU(reinterpret_cast<V128*>(op + 48), pattern);
518
      }
519
      return op_limit;
520
    }
521
    char* const op_end = buf_limit - 15;
522
    if (SNAPPY_PREDICT_TRUE(op < op_end)) {
523
      auto pattern_and_reshuffle_mask =
524
          LoadPatternAndReshuffleMask(src, pattern_size);
525
      V128 pattern = pattern_and_reshuffle_mask.first;
526
      V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
527
528
      // This code path is relatively cold however so we save code size
529
      // by avoiding unrolling and vectorizing.
530
      //
531
      // TODO: Remove pragma when cold regions don't get
532
      // vectorized or unrolled.
533
#ifdef __clang__
534
#pragma clang loop unroll(disable)
535
#endif
536
      do {
537
        V128_StoreU(reinterpret_cast<V128*>(op), pattern);
538
        pattern = V128_Shuffle(pattern, reshuffle_mask);
539
        op += 16;
540
      } while (SNAPPY_PREDICT_TRUE(op < op_end));
541
    }
542
    return IncrementalCopySlow(op - pattern_size, op, op_limit);
543
#else   // !SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
544
    // If plenty of buffer space remains, expand the pattern to at least 8
545
    // bytes. The way the following loop is written, we need 8 bytes of buffer
546
    // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
547
    // bytes if pattern_size is 2.  Precisely encoding that is probably not
548
    // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
549
    // (because 11 are required in the worst case).
550
0
    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
551
0
      while (pattern_size < 8) {
552
0
        UnalignedCopy64(src, op);
553
0
        op += pattern_size;
554
0
        pattern_size *= 2;
555
0
      }
556
0
      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
557
0
    } else {
558
0
      return IncrementalCopySlow(src, op, op_limit);
559
0
    }
560
0
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
561
0
  }
562
0
  assert(pattern_size >= big_pattern_size_lower_bound);
563
0
  constexpr bool use_16bytes_chunk = big_pattern_size_lower_bound == 16;
564
565
  // Copy 1x 16 bytes (or 2x 8 bytes in non-SSE) at a time. Because op - src can
566
  // be < 16 in non-SSE, a single UnalignedCopy128 might overwrite data in op.
567
  // UnalignedCopy64 is safe because expanding the pattern to at least 8 bytes
568
  // guarantees that op - src >= 8.
569
  //
570
  // Typically, the op_limit is the gating factor so try to simplify the loop
571
  // based on that.
572
0
  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
573
    // There is at least one, and at most four 16-byte blocks. Writing four
574
    // conditionals instead of a loop allows FDO to layout the code with respect
575
    // to the actual probabilities of each length.
576
    // TODO: Replace with loop with trip count hint.
577
0
    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
578
0
    if (op + 16 < op_limit) {
579
0
      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 16, op + 16);
580
0
    }
581
0
    if (op + 32 < op_limit) {
582
0
      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 32, op + 32);
583
0
    }
584
0
    if (op + 48 < op_limit) {
585
0
      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 48, op + 48);
586
0
    }
587
0
    return op_limit;
588
0
  }
589
590
  // Fall back to doing as much as we can with the available slop in the
591
  // buffer. This code path is relatively cold however so we save code size by
592
  // avoiding unrolling and vectorizing.
593
  //
594
  // TODO: Remove pragma when cold regions don't get vectorized
595
  // or unrolled.
596
0
#ifdef __clang__
597
0
#pragma clang loop unroll(disable)
598
0
#endif
599
0
  for (char* op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
600
0
    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
601
0
  }
602
0
  if (op >= op_limit) return op_limit;
603
604
  // We only take this branch if we didn't have enough slop and we can do a
605
  // single 8 byte copy.
606
0
  if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
607
0
    UnalignedCopy64(src, op);
608
0
    src += 8;
609
0
    op += 8;
610
0
  }
611
0
  return IncrementalCopySlow(src, op, op_limit);
612
0
}
613
614
}  // namespace
615
616
template <bool allow_fast_path>
617
0
static inline char* EmitLiteral(char* op, const char* literal, int len) {
618
  // The vast majority of copies are below 16 bytes, for which a
619
  // call to std::memcpy() is overkill. This fast path can sometimes
620
  // copy up to 15 bytes too much, but that is okay in the
621
  // main loop, since we have a bit to go on for both sides:
622
  //
623
  //   - The input will always have kInputMarginBytes = 15 extra
624
  //     available bytes, as long as we're in the main loop, and
625
  //     if not, allow_fast_path = false.
626
  //   - The output will always have 32 spare bytes (see
627
  //     MaxCompressedLength).
628
0
  assert(len > 0);  // Zero-length literals are disallowed
629
0
  int n = len - 1;
630
0
  if (allow_fast_path && len <= 16) {
631
    // Fits in tag byte
632
0
    *op++ = LITERAL | (n << 2);
633
634
0
    UnalignedCopy128(literal, op);
635
0
    return op + len;
636
0
  }
637
638
0
  if (n < 60) {
639
    // Fits in tag byte
640
0
    *op++ = LITERAL | (n << 2);
641
0
  } else {
642
0
    int count = (Bits::Log2Floor(n) >> 3) + 1;
643
0
    assert(count >= 1);
644
0
    assert(count <= 4);
645
0
    *op++ = LITERAL | ((59 + count) << 2);
646
    // Encode in upcoming bytes.
647
    // Write 4 bytes, though we may care about only 1 of them. The output buffer
648
    // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
649
    // here and there is a std::memcpy() of size 'len' below.
650
0
    LittleEndian::Store32(op, n);
651
0
    op += count;
652
0
  }
653
  // When allow_fast_path is true, we can overwrite up to 16 bytes.
654
0
  if (allow_fast_path) {
655
0
    char* destination = op;
656
0
    const char* source = literal;
657
0
    const char* end = destination + len;
658
0
    do {
659
0
      std::memcpy(destination, source, 16);
660
0
      destination += 16;
661
0
      source += 16;
662
0
    } while (destination < end);
663
0
  } else {
664
0
    std::memcpy(op, literal, len);
665
0
  }
666
0
  return op + len;
667
0
}
Unexecuted instantiation: snappy.cc:char* duckdb_snappy::EmitLiteral<true>(char*, char const*, int)
Unexecuted instantiation: snappy.cc:char* duckdb_snappy::EmitLiteral<false>(char*, char const*, int)
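// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// The two literal encodings EmitLiteral() produces, checked byte by byte: a
// length of up to 60 is stored as (len - 1) in the tag's upper six bits, while
// longer lengths put (len - 1) into 1..4 trailing little-endian bytes. Buffer
// sizes are arbitrary sketch values.
#include <cassert>
#include <cstdint>

void SketchLiteralTags() {
  char out[300];
  const char payload[16] = {};
  // Short literal: length 16 fits in the tag byte.
  char* end = EmitLiteral</*allow_fast_path=*/false>(out, payload, 16);
  assert(end == out + 1 + 16);
  assert(static_cast<uint8_t>(out[0]) == (LITERAL | ((16 - 1) << 2)));
  // Long literal: length 200 stores n = 199 in one trailing byte.
  const char big[200] = {};
  end = EmitLiteral</*allow_fast_path=*/false>(out, big, 200);
  assert(static_cast<uint8_t>(out[0]) == (LITERAL | ((59 + 1) << 2)));
  assert(static_cast<uint8_t>(out[1]) == 200 - 1);
  assert(end == out + 2 + 200);
  (void)end;
}
// --- end editorial sketch ---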
668
669
template <bool len_less_than_12>
670
0
static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
671
0
  assert(len <= 64);
672
0
  assert(len >= 4);
673
0
  assert(offset < 65536);
674
0
  assert(len_less_than_12 == (len < 12));
675
676
0
  if (len_less_than_12) {
677
0
    uint32_t u = (len << 2) + (offset << 8);
678
0
    uint32_t copy1 = COPY_1_BYTE_OFFSET - (4 << 2) + ((offset >> 3) & 0xe0);
679
0
    uint32_t copy2 = COPY_2_BYTE_OFFSET - (1 << 2);
680
    // It turns out that offset < 2048 is a difficult-to-predict branch.
681
    // `perf record` shows this is the highest percentage of branch misses in
682
    // benchmarks. This code is branch-free; the data dependency
683
    // chain that bottlenecks the throughput is so long that a few extra
684
    // instructions are completely free (IPC << 6 because of data deps).
685
0
    u += offset < 2048 ? copy1 : copy2;
686
0
    LittleEndian::Store32(op, u);
687
0
    op += offset < 2048 ? 2 : 3;
688
0
  } else {
689
    // Write 4 bytes, though we only care about 3 of them.  The output buffer
690
    // is required to have some slack, so the extra byte won't overrun it.
691
0
    uint32_t u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
692
0
    LittleEndian::Store32(op, u);
693
0
    op += 3;
694
0
  }
695
0
  return op;
696
0
}
Unexecuted instantiation: snappy.cc:char* duckdb_snappy::EmitCopyAtMost64<true>(char*, unsigned long, unsigned long)
Unexecuted instantiation: snappy.cc:char* duckdb_snappy::EmitCopyAtMost64<false>(char*, unsigned long, unsigned long)
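// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// The byte layout the branch-free arithmetic above produces for the common
// copy-1 case (offset < 2048, 4 <= len < 12), checked against an actual call.
// The output buffer has slack because Store32 writes four bytes even when only
// two are kept.
#include <cassert>
#include <cstdint>

void SketchCopy1Tag() {
  char out[8];
  const size_t offset = 0x234, len = 7;
  char* end = EmitCopyAtMost64</*len_less_than_12=*/true>(out, offset, len);
  assert(end == out + 2);  // a copy-1 takes two bytes
  (void)end;
  // Tag byte: COPY_1_BYTE_OFFSET | (len - 4) << 2 | top three offset bits << 5,
  // followed by one byte holding the low eight offset bits.
  assert(static_cast<uint8_t>(out[0]) ==
         (COPY_1_BYTE_OFFSET | ((len - 4) << 2) | ((offset >> 8) << 5)));
  assert(static_cast<uint8_t>(out[1]) == (offset & 0xFF));
  // With offset >= 2048 the same call would emit a three-byte copy-2 tag:
  // (len - 1) in the tag, then the offset as two little-endian bytes.
}
// --- end editorial sketch ---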
697
698
template <bool len_less_than_12>
699
0
static inline char* EmitCopy(char* op, size_t offset, size_t len) {
700
0
  assert(len_less_than_12 == (len < 12));
701
0
  if (len_less_than_12) {
702
0
    return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
703
0
  } else {
704
    // A special case for len <= 64 might help, but so far measurements suggest
705
    // it's in the noise.
706
707
    // Emit 64 byte copies but make sure to keep at least four bytes reserved.
708
0
    while (SNAPPY_PREDICT_FALSE(len >= 68)) {
709
0
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
710
0
      len -= 64;
711
0
    }
712
713
    // One or two copies will now finish the job.
714
0
    if (len > 64) {
715
0
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
716
0
      len -= 60;
717
0
    }
718
719
    // Emit remainder.
720
0
    if (len < 12) {
721
0
      op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
722
0
    } else {
723
0
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
724
0
    }
725
0
    return op;
726
0
  }
727
0
}
Unexecuted instantiation: snappy.cc:char* duckdb_snappy::EmitCopy<true>(char*, unsigned long, unsigned long)
Unexecuted instantiation: snappy.cc:char* duckdb_snappy::EmitCopy<false>(char*, unsigned long, unsigned long)
728
729
0
bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
730
0
  uint32_t v = 0;
731
0
  const char* limit = start + n;
732
0
  if (Varint::Parse32WithLimit(start, limit, &v) != NULL) {
733
0
    *result = v;
734
0
    return true;
735
0
  } else {
736
0
    return false;
737
0
  }
738
0
}
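// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// How the length prefix parsed above is normally used: read it first to size
// the output buffer, then decompress in one shot. GetUncompressedLength() and
// RawUncompress() are the public entry points declared in snappy.h; the error
// handling here is illustrative only.
#include <cstddef>
#include <vector>

bool SketchDecompress(const char* compressed, size_t compressed_len,
                      std::vector<char>* out) {
  size_t uncompressed_len = 0;
  if (!GetUncompressedLength(compressed, compressed_len, &uncompressed_len)) {
    return false;  // corrupt or truncated varint prefix
  }
  out->resize(uncompressed_len);
  return RawUncompress(compressed, compressed_len, out->data());
}
// --- end editorial sketch ---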
739
740
namespace {
741
0
uint32_t CalculateTableSize(uint32_t input_size) {
742
0
  static_assert(
743
0
      kMaxHashTableSize >= kMinHashTableSize,
744
0
      "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
745
0
  if (input_size > kMaxHashTableSize) {
746
0
    return kMaxHashTableSize;
747
0
  }
748
0
  if (input_size < kMinHashTableSize) {
749
0
    return kMinHashTableSize;
750
0
  }
751
  // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
752
  // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
753
0
  return 2u << Bits::Log2Floor(input_size - 1);
754
0
}
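// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// The rounding rule above on concrete inputs: 2 << Log2Floor(x - 1) is the
// smallest power of two >= x (for x > 1), clamped to the table-size limits.
// The expected values assume kMinHashTableSize == 256 and
// kMaxHashTableSize == 1 << 14, the usual snappy constants.
#include <cassert>

void SketchTableSizes() {
  assert(CalculateTableSize(1000) == 1024);        // rounded up to a power of two
  assert(CalculateTableSize(4096) == 4096);        // already a power of two
  assert(CalculateTableSize(100) == 256);          // clamped to kMinHashTableSize
  assert(CalculateTableSize(1 << 20) == 1 << 14);  // clamped to kMaxHashTableSize
}
// --- end editorial sketch ---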
755
}  // namespace
756
757
namespace internal {
758
0
WorkingMemory::WorkingMemory(size_t input_size) {
759
0
  const size_t max_fragment_size = std::min(input_size, kBlockSize);
760
0
  const size_t table_size = CalculateTableSize(max_fragment_size);
761
0
  size_ = table_size * sizeof(*table_) + max_fragment_size +
762
0
          MaxCompressedLength(max_fragment_size);
763
0
  mem_ = std::allocator<char>().allocate(size_);
764
0
  table_ = reinterpret_cast<uint16_t*>(mem_);
765
0
  input_ = mem_ + table_size * sizeof(*table_);
766
0
  output_ = input_ + max_fragment_size;
767
0
}
768
769
0
WorkingMemory::~WorkingMemory() {
770
0
  std::allocator<char>().deallocate(mem_, size_);
771
0
}
772
773
uint16_t* WorkingMemory::GetHashTable(size_t fragment_size,
774
0
                                      int* table_size) const {
775
0
  const size_t htsize = CalculateTableSize(fragment_size);
776
0
  memset(table_, 0, htsize * sizeof(*table_));
777
0
  *table_size = htsize;
778
0
  return table_;
779
0
}
780
}  // end namespace internal
781
782
// Flat array compression that does not emit the "uncompressed length"
783
// prefix. Compresses "input" string to the "*op" buffer.
784
//
785
// REQUIRES: "input" is at most "kBlockSize" bytes long.
786
// REQUIRES: "op" points to an array of memory that is at least
787
// "MaxCompressedLength(input.size())" in size.
788
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
789
// REQUIRES: "table_size" is a power of two
790
//
791
// Returns an "end" pointer into "op" buffer.
792
// "end - op" is the compressed size of "input".
793
namespace internal {
794
char* CompressFragment(const char* input, size_t input_size, char* op,
795
0
                       uint16_t* table, const int table_size) {
796
  // "ip" is the input pointer, and "op" is the output pointer.
797
0
  const char* ip = input;
798
0
  assert(input_size <= kBlockSize);
799
0
  assert((table_size & (table_size - 1)) == 0);  // table must be power of two
800
0
  const uint32_t mask = 2 * (table_size - 1);
801
0
  const char* ip_end = input + input_size;
802
0
  const char* base_ip = ip;
803
804
0
  const size_t kInputMarginBytes = 15;
805
0
  if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
806
0
    const char* ip_limit = input + input_size - kInputMarginBytes;
807
808
0
    for (uint32_t preload = LittleEndian::Load32(ip + 1);;) {
809
      // Bytes in [next_emit, ip) will be emitted as literal bytes.  Or
810
      // [next_emit, ip_end) after the main loop.
811
0
      const char* next_emit = ip++;
812
0
      uint64_t data = LittleEndian::Load64(ip);
813
      // The body of this loop calls EmitLiteral once and then EmitCopy one or
814
      // more times.  (The exception is that when we're close to exhausting
815
      // the input we goto emit_remainder.)
816
      //
817
      // In the first iteration of this loop we're just starting, so
818
      // there's nothing to copy, so calling EmitLiteral once is
819
      // necessary.  And we only start a new iteration when the
820
      // current iteration has determined that a call to EmitLiteral will
821
      // precede the next call to EmitCopy (if any).
822
      //
823
      // Step 1: Scan forward in the input looking for a 4-byte-long match.
824
      // If we get close to exhausting the input then goto emit_remainder.
825
      //
826
      // Heuristic match skipping: If 32 bytes are scanned with no matches
827
      // found, start looking only at every other byte. If 32 more bytes are
828
      // scanned (or skipped), look at every third byte, etc.. When a match is
829
      // found, immediately go back to looking at every byte. This is a small
830
      // loss (~5% performance, ~0.1% density) for compressible data due to more
831
      // bookkeeping, but for non-compressible data (such as JPEG) it's a huge
832
      // win since the compressor quickly "realizes" the data is incompressible
833
      // and doesn't bother looking for matches everywhere.
834
      //
835
      // The "skip" variable keeps track of how many bytes there are since the
836
      // last match; dividing it by 32 (ie. right-shifting by five) gives the
837
      // number of bytes to move ahead for each iteration.
838
0
      uint32_t skip = 32;
839
840
0
      const char* candidate;
841
0
      if (ip_limit - ip >= 16) {
842
0
        auto delta = ip - base_ip;
843
0
        for (int j = 0; j < 4; ++j) {
844
0
          for (int k = 0; k < 4; ++k) {
845
0
            int i = 4 * j + k;
846
            // These for-loops are meant to be unrolled. So we can freely
847
            // special case the first iteration to use the value already
848
            // loaded in preload.
849
0
            uint32_t dword = i == 0 ? preload : static_cast<uint32_t>(data);
850
0
            assert(dword == LittleEndian::Load32(ip + i));
851
0
            uint16_t* table_entry = TableEntry(table, dword, mask);
852
0
            candidate = base_ip + *table_entry;
853
0
            assert(candidate >= base_ip);
854
0
            assert(candidate < ip + i);
855
0
            *table_entry = delta + i;
856
0
            if (SNAPPY_PREDICT_FALSE(LittleEndian::Load32(candidate) == dword)) {
857
0
              *op = LITERAL | (i << 2);
858
0
              UnalignedCopy128(next_emit, op + 1);
859
0
              ip += i;
860
0
              op = op + i + 2;
861
0
              goto emit_match;
862
0
            }
863
0
            data >>= 8;
864
0
          }
865
0
          data = LittleEndian::Load64(ip + 4 * j + 4);
866
0
        }
867
0
        ip += 16;
868
0
        skip += 16;
869
0
      }
870
0
      while (true) {
871
0
        assert(static_cast<uint32_t>(data) == LittleEndian::Load32(ip));
872
0
        uint16_t* table_entry = TableEntry(table, data, mask);
873
0
        uint32_t bytes_between_hash_lookups = skip >> 5;
874
0
        skip += bytes_between_hash_lookups;
875
0
        const char* next_ip = ip + bytes_between_hash_lookups;
876
0
        if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
877
0
          ip = next_emit;
878
0
          goto emit_remainder;
879
0
        }
880
0
        candidate = base_ip + *table_entry;
881
0
        assert(candidate >= base_ip);
882
0
        assert(candidate < ip);
883
884
0
        *table_entry = ip - base_ip;
885
0
        if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==
886
0
                                LittleEndian::Load32(candidate))) {
887
0
          break;
888
0
        }
889
0
        data = LittleEndian::Load32(next_ip);
890
0
        ip = next_ip;
891
0
      }
892
893
      // Step 2: A 4-byte match has been found.  We'll later see if more
894
      // than 4 bytes match.  But, prior to the match, input
895
      // bytes [next_emit, ip) are unmatched.  Emit them as "literal bytes."
896
0
      assert(next_emit + 16 <= ip_end);
897
0
      op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);
898
899
      // Step 3: Call EmitCopy, and then see if another EmitCopy could
900
      // be our next move.  Repeat until we find no match for the
901
      // input immediately after what was consumed by the last EmitCopy call.
902
      //
903
      // If we exit this loop normally then we need to call EmitLiteral next,
904
      // though we don't yet know how big the literal will be.  We handle that
905
      // by proceeding to the next iteration of the main loop.  We also can exit
906
      // this loop via goto if we get close to exhausting the input.
907
0
    emit_match:
908
0
      do {
909
        // We have a 4-byte match at ip, and no need to emit any
910
        // "literal bytes" prior to ip.
911
0
        const char* base = ip;
912
0
        std::pair<size_t, bool> p =
913
0
            FindMatchLength(candidate + 4, ip + 4, ip_end, &data);
914
0
        size_t matched = 4 + p.first;
915
0
        ip += matched;
916
0
        size_t offset = base - candidate;
917
0
        assert(0 == memcmp(base, candidate, matched));
918
0
        if (p.second) {
919
0
          op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
920
0
        } else {
921
0
          op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
922
0
        }
923
0
        if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
924
0
          goto emit_remainder;
925
0
        }
926
        // Expect 5 bytes to match
927
0
        assert((data & 0xFFFFFFFFFF) ==
928
0
               (LittleEndian::Load64(ip) & 0xFFFFFFFFFF));
929
        // We are now looking for a 4-byte match again.  We read
930
        // table[Hash(ip, mask)] for that.  To improve compression,
931
        // we also update table[Hash(ip - 1, mask)] and table[Hash(ip, mask)].
932
0
        *TableEntry(table, LittleEndian::Load32(ip - 1), mask) =
933
0
            ip - base_ip - 1;
934
0
        uint16_t* table_entry = TableEntry(table, data, mask);
935
0
        candidate = base_ip + *table_entry;
936
0
        *table_entry = ip - base_ip;
937
        // Measurements on the benchmarks have shown the following probabilities
938
        // for the loop to exit (ie. avg. number of iterations is reciprocal).
939
        // BM_Flat/6  txt1    p = 0.3-0.4
940
        // BM_Flat/7  txt2    p = 0.35
941
        // BM_Flat/8  txt3    p = 0.3-0.4
942
        // BM_Flat/9  txt3    p = 0.34-0.4
943
        // BM_Flat/10 pb      p = 0.4
944
        // BM_Flat/11 gaviota p = 0.1
945
        // BM_Flat/12 cp      p = 0.5
946
        // BM_Flat/13 c       p = 0.3
947
0
      } while (static_cast<uint32_t>(data) == LittleEndian::Load32(candidate));
948
      // Because the least significant 5 bytes matched, we can utilize data
949
      // for the next iteration.
950
0
      preload = data >> 8;
951
0
    }
952
0
  }
953
954
0
emit_remainder:
955
  // Emit the remaining bytes as a literal
956
0
  if (ip < ip_end) {
957
0
    op = EmitLiteral</*allow_fast_path=*/false>(op, ip, ip_end - ip);
958
0
  }
959
960
0
  return op;
961
0
}
962
963
char* CompressFragmentDoubleHash(const char* input, size_t input_size, char* op,
964
                                 uint16_t* table, const int table_size,
965
0
                                 uint16_t* table2, const int table_size2) {
966
0
  (void)table_size2;
967
0
  assert(table_size == table_size2);
968
  // "ip" is the input pointer, and "op" is the output pointer.
969
0
  const char* ip = input;
970
0
  assert(input_size <= kBlockSize);
971
0
  assert((table_size & (table_size - 1)) == 0);  // table must be power of two
972
0
  const uint32_t mask = 2 * (table_size - 1);
973
0
  const char* ip_end = input + input_size;
974
0
  const char* base_ip = ip;
975
976
0
  const size_t kInputMarginBytes = 15;
977
0
  if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
978
0
    const char* ip_limit = input + input_size - kInputMarginBytes;
979
980
0
    for (;;) {
981
0
      const char* next_emit = ip++;
982
0
      uint64_t data = LittleEndian::Load64(ip);
983
0
      uint32_t skip = 512;
984
985
0
      const char* candidate;
986
0
      uint32_t candidate_length;
987
0
      while (true) {
988
0
        assert(static_cast<uint32_t>(data) == LittleEndian::Load32(ip));
989
0
        uint16_t* table_entry2 = TableEntry8ByteMatch(table2, data, mask);
990
0
        uint32_t bytes_between_hash_lookups = skip >> 9;
991
0
        skip++;
992
0
        const char* next_ip = ip + bytes_between_hash_lookups;
993
0
        if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
994
0
          ip = next_emit;
995
0
          goto emit_remainder;
996
0
        }
997
0
        candidate = base_ip + *table_entry2;
998
0
        assert(candidate >= base_ip);
999
0
        assert(candidate < ip);
1000
1001
0
        *table_entry2 = ip - base_ip;
1002
0
        if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==
1003
0
                                LittleEndian::Load32(candidate))) {
1004
0
          candidate_length =
1005
0
              FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;
1006
0
          break;
1007
0
        }
1008
1009
0
        uint16_t* table_entry = TableEntry4ByteMatch(table, data, mask);
1010
0
        candidate = base_ip + *table_entry;
1011
0
        assert(candidate >= base_ip);
1012
0
        assert(candidate < ip);
1013
1014
0
        *table_entry = ip - base_ip;
1015
0
        if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==
1016
0
                                LittleEndian::Load32(candidate))) {
1017
0
          candidate_length =
1018
0
              FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;
1019
0
          table_entry2 =
1020
0
              TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 1), mask);
1021
0
          auto candidate2 = base_ip + *table_entry2;
1022
0
          size_t candidate_length2 =
1023
0
              FindMatchLengthPlain(candidate2, ip + 1, ip_end);
1024
0
          if (candidate_length2 > candidate_length) {
1025
0
            *table_entry2 = ip - base_ip;
1026
0
            candidate = candidate2;
1027
0
            candidate_length = candidate_length2;
1028
0
            ++ip;
1029
0
          }
1030
0
          break;
1031
0
        }
1032
0
        data = LittleEndian::Load64(next_ip);
1033
0
        ip = next_ip;
1034
0
      }
1035
      // Backtrack to the point it matches fully.
1036
0
      while (ip > next_emit && candidate > base_ip &&
1037
0
             *(ip - 1) == *(candidate - 1)) {
1038
0
        --ip;
1039
0
        --candidate;
1040
0
        ++candidate_length;
1041
0
      }
1042
0
      *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 1), mask) =
1043
0
          ip - base_ip + 1;
1044
0
      *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 2), mask) =
1045
0
          ip - base_ip + 2;
1046
0
      *TableEntry4ByteMatch(table, LittleEndian::Load32(ip + 1), mask) =
1047
0
          ip - base_ip + 1;
1048
      // Step 2: A 4-byte or 8-byte match has been found.
1049
      // We'll later see if more than 4 bytes match.  But, prior to the match,
1050
      // input bytes [next_emit, ip) are unmatched.  Emit them as
1051
      // "literal bytes."
1052
0
      assert(next_emit + 16 <= ip_end);
1053
0
      if (ip - next_emit > 0) {
1054
0
        op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit,
1055
0
                                                   ip - next_emit);
1056
0
      }
1057
      // Step 3: Call EmitCopy, and then see if another EmitCopy could
1058
      // be our next move.  Repeat until we find no match for the
1059
      // input immediately after what was consumed by the last EmitCopy call.
1060
      //
1061
      // If we exit this loop normally then we need to call EmitLiteral next,
1062
      // though we don't yet know how big the literal will be.  We handle that
1063
      // by proceeding to the next iteration of the main loop.  We also can exit
1064
      // this loop via goto if we get close to exhausting the input.
1065
0
      do {
1066
        // We have a 4-byte match at ip, and no need to emit any
1067
        // "literal bytes" prior to ip.
1068
0
        const char* base = ip;
1069
0
        ip += candidate_length;
1070
0
        size_t offset = base - candidate;
1071
0
        if (candidate_length < 12) {
1072
0
          op =
1073
0
              EmitCopy</*len_less_than_12=*/true>(op, offset, candidate_length);
1074
0
        } else {
1075
0
          op = EmitCopy</*len_less_than_12=*/false>(op, offset,
1076
0
                                                    candidate_length);
1077
0
        }
1078
0
        if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
1079
0
          goto emit_remainder;
1080
0
        }
1081
        // We are now looking for a 4-byte match again.  We read
1082
        // table[Hash(ip, mask)] for that. To improve compression,
1083
        // we also update several previous table entries.
1084
0
        if (ip - base_ip > 7) {
1085
0
          *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 7), mask) =
1086
0
              ip - base_ip - 7;
1087
0
          *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 4), mask) =
1088
0
              ip - base_ip - 4;
1089
0
        }
1090
0
        *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 3), mask) =
1091
0
            ip - base_ip - 3;
1092
0
        *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 2), mask) =
1093
0
            ip - base_ip - 2;
1094
0
        *TableEntry4ByteMatch(table, LittleEndian::Load32(ip - 2), mask) =
1095
0
            ip - base_ip - 2;
1096
0
        *TableEntry4ByteMatch(table, LittleEndian::Load32(ip - 1), mask) =
1097
0
            ip - base_ip - 1;
1098
1099
0
        uint16_t* table_entry =
1100
0
            TableEntry8ByteMatch(table2, LittleEndian::Load64(ip), mask);
1101
0
        candidate = base_ip + *table_entry;
1102
0
        *table_entry = ip - base_ip;
1103
0
        if (LittleEndian::Load32(ip) == LittleEndian::Load32(candidate)) {
1104
0
          candidate_length =
1105
0
              FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;
1106
0
          continue;
1107
0
        }
1108
0
        table_entry =
1109
0
            TableEntry4ByteMatch(table, LittleEndian::Load32(ip), mask);
1110
0
        candidate = base_ip + *table_entry;
1111
0
        *table_entry = ip - base_ip;
1112
0
        if (LittleEndian::Load32(ip) == LittleEndian::Load32(candidate)) {
1113
0
          candidate_length =
1114
0
              FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;
1115
0
          continue;
1116
0
        }
1117
0
        break;
1118
0
      } while (true);
1119
0
    }
1120
0
  }
1121
1122
0
emit_remainder:
1123
  // Emit the remaining bytes as a literal
1124
0
  if (ip < ip_end) {
1125
0
    op = EmitLiteral</*allow_fast_path=*/false>(op, ip, ip_end - ip);
1126
0
  }
1127
1128
0
  return op;
1129
0
}
1130
}  // end namespace internal
1131
1132
static inline void Report(int token, const char *algorithm, size_t
1133
0
compressed_size, size_t uncompressed_size) {
1134
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
1135
0
  (void)token;
1136
0
  (void)algorithm;
1137
0
  (void)compressed_size;
1138
0
  (void)uncompressed_size;
1139
0
}
1140
1141
// Signature of output types needed by decompression code.
1142
// The decompression code is templatized on a type that obeys this
1143
// signature so that we do not pay virtual function call overhead in
1144
// the middle of a tight decompression loop.
1145
//
1146
// class DecompressionWriter {
1147
//  public:
1148
//   // Called before decompression
1149
//   void SetExpectedLength(size_t length);
1150
//
1151
//   // For performance a writer may choose to donate the cursor variable to the
1152
//   // decompression function. The decompression will inject it in all its
1153
//   // function calls to the writer. Keeping the important output cursor as a
1154
//   // function local stack variable allows the compiler to keep it in
1155
//   // register, which greatly aids performance by avoiding loads and stores of
1156
//   // this variable in the fast path loop iterations.
1157
//   T GetOutputPtr() const;
1158
//
1159
//   // At end of decompression the loop donates the ownership of the cursor
1160
//   // variable back to the writer by calling this function.
1161
//   void SetOutputPtr(T op);
1162
//
1163
//   // Called after decompression
1164
//   bool CheckLength() const;
1165
//
1166
//   // Called repeatedly during decompression
1167
//   // Each function gets a pointer to the op (output pointer) that the writer
1168
//   // can use and update. Note it's important that these functions get fully
1169
//   // inlined so that no actual address of the local variable needs to be
1170
//   // taken.
1171
//   bool Append(const char* ip, size_t length, T* op);
1172
//   bool AppendFromSelf(uint32_t offset, size_t length, T* op);
1173
//
1174
//   // The rules for how TryFastAppend differs from Append are somewhat
1175
//   // convoluted:
1176
//   //
1177
//   //  - TryFastAppend is allowed to decline (return false) at any
1178
//   //    time, for any reason -- just "return false" would be
1179
//   //    a perfectly legal implementation of TryFastAppend.
1180
//   //    The intention is for TryFastAppend to allow a fast path
1181
//   //    in the common case of a small append.
1182
//   //  - TryFastAppend is allowed to read up to <available> bytes
1183
//   //    from the input buffer, whereas Append is allowed to read
1184
//   //    <length>. However, if it returns true, it must leave
1185
//   //    at least five (kMaximumTagLength) bytes in the input buffer
1186
//   //    afterwards, so that there is always enough space to read the
1187
//   //    next tag without checking for a refill.
1188
//   //  - TryFastAppend must always decline (return false)
1189
//   //    if <length> is 61 or more, as in this case the literal length is not
1190
//   //    decoded fully. In practice, this should not be a big problem,
1191
//   //    as it is unlikely that one would implement a fast path accepting
1192
//   //    this much data.
1193
//   //
1194
//   bool TryFastAppend(const char* ip, size_t available, size_t length, T* op);
1195
// };
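// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// A minimal type obeying the signature above, with T == size_t: it only counts
// output bytes (roughly what a length-validating writer needs) and declines
// every fast-path append. Hypothetical; the real writers in this file carry
// buffers and perform actual copies.
#include <cstddef>
#include <cstdint>

class SketchCountingWriter {
 public:
  void SetExpectedLength(size_t length) { expected_ = length; }
  size_t GetOutputPtr() const { return produced_; }
  void SetOutputPtr(size_t op) { produced_ = op; }
  bool CheckLength() const { return produced_ == expected_; }
  bool Append(const char* /*ip*/, size_t length, size_t* op) {
    *op += length;
    return *op <= expected_;
  }
  bool AppendFromSelf(uint32_t offset, size_t length, size_t* op) {
    if (offset == 0 || offset > *op) return false;  // nothing valid to copy from
    *op += length;
    return *op <= expected_;
  }
  bool TryFastAppend(const char*, size_t, size_t, size_t*) { return false; }

 private:
  size_t expected_ = 0;
  size_t produced_ = 0;
};
// --- end editorial sketch ---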
1196
1197
0
static inline uint32_t ExtractLowBytes(const uint32_t& v, int n) {
1198
0
  assert(n >= 0);
1199
0
  assert(n <= 4);
1200
#if SNAPPY_HAVE_BMI2
1201
  return _bzhi_u32(v, 8 * n);
1202
#else
1203
  // This needs to be wider than uint32_t otherwise `mask << 32` will be
1204
  // undefined.
1205
0
  uint64_t mask = 0xffffffff;
1206
0
  return v & ~(mask << (8 * n));
1207
0
#endif
1208
0
}
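// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// Concrete values for the helper above: keep only the low n bytes of v.
#include <cassert>

void SketchExtractLowBytes() {
  assert(ExtractLowBytes(0xAABBCCDD, 0) == 0x00000000u);
  assert(ExtractLowBytes(0xAABBCCDD, 2) == 0x0000CCDDu);
  assert(ExtractLowBytes(0xAABBCCDD, 4) == 0xAABBCCDDu);
}
// --- end editorial sketch ---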
1209
1210
0
static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) {
1211
0
  assert(shift < 32);
1212
0
  static const uint8_t masks[] = {
1213
0
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
1214
0
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
1215
0
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
1216
0
      0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1217
0
  return (value & masks[shift]) != 0;
1218
0
}
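// --- Editorial sketch (not part of snappy.cc or the coverage data) ---
// What the mask table above encodes: whether an 8-bit value shifted left by
// `shift` still fits in 32 bits.
#include <cassert>

void SketchLeftShiftOverflows() {
  assert(!LeftShiftOverflows(0xFF, 24));  // bit 7 lands on bit 31 and still fits
  assert(LeftShiftOverflows(0x80, 25));   // bit 7 would land on bit 32
  assert(!LeftShiftOverflows(0x7F, 25));  // only bits 0..6 set, so it fits
}
// --- end editorial sketch ---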
1219
1220
0
inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) {
1221
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
1222
0
  (void)dst;
1223
0
  return offset != 0;
1224
0
}
1225
1226
// Copies between size bytes and 64 bytes from src to dst.  size cannot exceed
1227
// 64.  More than size bytes, but never exceeding 64, might be copied if doing
1228
// so gives better performance.  [src, src + size) must not overlap with
1229
// [dst, dst + size), but [src, src + 64) may overlap with [dst, dst + 64).
1230
0
void MemCopy64(char* dst, const void* src, size_t size) {
1231
  // Always copy this many bytes.  If that's below size then copy the full 64.
1232
0
  constexpr int kShortMemCopy = 32;
1233
1234
0
  assert(size <= 64);
1235
0
  assert(std::less_equal<const void*>()(static_cast<const char*>(src) + size,
1236
0
                                        dst) ||
1237
0
         std::less_equal<const void*>()(dst + size, src));
1238
1239
  // We know that src and dst are at least size bytes apart. However, because we
1240
  // might copy more than size bytes the copy still might overlap past size.
1241
  // E.g. if src and dst appear consecutively in memory (src + size >= dst).
1242
  // TODO: Investigate wider copies on other platforms.
1243
#if defined(__x86_64__) && defined(__AVX__)
1244
  assert(kShortMemCopy <= 32);
1245
  __m256i data = _mm256_lddqu_si256(static_cast<const __m256i *>(src));
1246
  _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), data);
1247
  // Profiling shows that nearly all copies are short.
1248
  if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
1249
    data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
1250
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
1251
  }
1252
#else
1253
0
  std::memmove(dst, src, kShortMemCopy);
1254
  // Profiling shows that nearly all copies are short.
1255
0
  if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
1256
0
    std::memmove(dst + kShortMemCopy,
1257
0
                 static_cast<const uint8_t*>(src) + kShortMemCopy,
1258
0
                 64 - kShortMemCopy);
1259
0
  }
1260
0
#endif
1261
0
}
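In caller terms the contract above means both buffers must provide 64 bytes of slop even for a short copy, because up to 64 bytes may be read and written. A standalone sketch of a conforming call (MemCopy64Portable is a hypothetical stand-in for the non-AVX fallback, not the decompressor's real call site):

// Sketch of the caller-side contract: src and dst each expose 64 readable /
// writable bytes even though only `size` of them are meaningful.
#include <cstddef>
#include <cstring>

static void MemCopy64Portable(char* dst, const void* src, size_t size) {
  // Mirrors the non-AVX fallback above: always move 32 bytes, then the rest.
  std::memmove(dst, src, 32);
  if (size > 32) {
    std::memmove(dst + 32, static_cast<const char*>(src) + 32, 32);
  }
}

int main() {
  char src[64] = "twenty bytes of data";  // 64-byte readable slop
  char dst[64] = {};                      // 64-byte writable slop
  MemCopy64Portable(dst, src, 20);        // may legally touch up to 64 bytes
  return 0;
}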
1262
1263
0
void MemCopy64(ptrdiff_t dst, const void* src, size_t size) {
1264
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
1265
0
  (void)dst;
1266
0
  (void)src;
1267
0
  (void)size;
1268
0
}
1269
1270
void ClearDeferred(const void** deferred_src, size_t* deferred_length,
1271
0
                   uint8_t* safe_source) {
1272
0
  *deferred_src = safe_source;
1273
0
  *deferred_length = 0;
1274
0
}
1275
1276
void DeferMemCopy(const void** deferred_src, size_t* deferred_length,
1277
0
                  const void* src, size_t length) {
1278
0
  *deferred_src = src;
1279
0
  *deferred_length = length;
1280
0
}
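ClearDeferred and DeferMemCopy implement a one-slot deferral: the decompression loop below remembers at most one pending copy and materializes it only when the slot is about to be reused, or at loop exit. A minimal sketch of that discipline outside the real loop (DeferredCopy and Flush are hypothetical names; the real loop flushes with MemCopy64):

// Sketch of the defer/flush discipline used by DecompressBranchless: flush
// the previously deferred copy right before deferring a new one.
#include <cstddef>
#include <cstdint>
#include <cstring>

struct DeferredCopy {
  const void* src;
  size_t length;
  uint8_t safe_source[64];  // src points here whenever nothing is deferred
};

static size_t Flush(DeferredCopy* d, char* dst) {
  std::memcpy(dst, d->src, d->length);  // the real loop uses MemCopy64
  const size_t flushed = d->length;
  d->src = d->safe_source;
  d->length = 0;
  return flushed;
}

int main() {
  char out[64] = {};
  size_t op = 0;
  DeferredCopy d;
  d.src = d.safe_source;
  d.length = 0;

  // Iteration 1: flush whatever is pending (nothing yet), then defer "hello".
  op += Flush(&d, out + op);
  d.src = "hello"; d.length = 5;

  // Iteration 2: flush the pending "hello", then defer " world".
  op += Flush(&d, out + op);
  d.src = " world"; d.length = 6;

  // Loop exit: one final flush, mirroring the epilogue of the real loop.
  op += Flush(&d, out + op);
  return 0;  // out now holds "hello world", op == 11
}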
1281
1282
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
1283
0
inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
1284
0
  const uint8_t*& ip = *ip_p;
1285
0
  // This section is crucial for the throughput of the decompression loop.
1286
0
  // The latency of an iteration is fundamentally constrained by the
1287
0
  // following data chain on ip.
1288
0
  // ip -> c = Load(ip) -> delta1 = (c & 3)        -> ip += delta1 or delta2
1289
0
  //                       delta2 = ((c >> 2) + 1)    ip++
1290
0
  // This is different from X86 optimizations because ARM has conditional add
1291
0
  // instruction (csinc) and it removes several register moves.
1292
0
  const size_t tag_type = *tag & 3;
1293
0
  const bool is_literal = (tag_type == 0);
1294
0
  if (is_literal) {
1295
0
    size_t next_literal_tag = (*tag >> 2) + 1;
1296
0
    *tag = ip[next_literal_tag];
1297
0
    ip += next_literal_tag + 1;
1298
0
  } else {
1299
0
    *tag = ip[tag_type];
1300
0
    ip += tag_type + 1;
1301
0
  }
1302
0
  return tag_type;
1303
0
}
1304
1305
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
1306
0
inline size_t AdvanceToNextTagX86Optimized(const uint8_t** ip_p, size_t* tag) {
1307
0
  const uint8_t*& ip = *ip_p;
1308
  // This section is crucial for the throughput of the decompression loop.
1309
  // The latency of an iteration is fundamentally constrained by the
1310
  // following data chain on ip.
1311
  // ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2
1312
  //                       ip2 = ip + 2 + (c >> 2)
1313
  // This amounts to 8 cycles.
1314
  // 5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)
1315
0
  size_t literal_len = *tag >> 2;
1316
0
  size_t tag_type = *tag;
1317
0
  bool is_literal;
1318
0
#if defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(__x86_64__)
1319
  // TODO clang misses the fact that the (c & 3) already correctly
1320
  // sets the zero flag.
1321
0
  asm("and $3, %k[tag_type]\n\t"
1322
0
      : [tag_type] "+r"(tag_type), "=@ccz"(is_literal)
1323
0
      :: "cc");
1324
#else
1325
  tag_type &= 3;
1326
  is_literal = (tag_type == 0);
1327
#endif
1328
  // TODO
1329
  // This code is subtle. Loading the values first and then cmov has less
1330
  // latency than cmov ip and then load. However clang would move the loads
1331
  // in an optimization phase; volatile prevents this transformation.
1332
  // Note that we have enough slop bytes (64) that the loads are always valid.
1333
0
  size_t tag_literal =
1334
0
      static_cast<const volatile uint8_t*>(ip)[1 + literal_len];
1335
0
  size_t tag_copy = static_cast<const volatile uint8_t*>(ip)[tag_type];
1336
0
  *tag = is_literal ? tag_literal : tag_copy;
1337
0
  const uint8_t* ip_copy = ip + 1 + tag_type;
1338
0
  const uint8_t* ip_literal = ip + 2 + literal_len;
1339
0
  ip = is_literal ? ip_literal : ip_copy;
1340
0
#if defined(__GNUC__) && defined(__x86_64__)
1341
  // TODO Clang is "optimizing" zero-extension (a totally free
1342
  // operation); this means that after the cmov of tag, it emits another movzb
1343
  // tag, byte(tag). It really matters as it's on the core chain. This dummy
1344
  // asm, persuades clang to do the zero-extension at the load (it's automatic)
1345
  // removing the expensive movzb.
1346
0
  asm("" ::"r"(tag_copy));
1347
0
#endif
1348
0
  return tag_type;
1349
0
}
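Both AdvanceToNextTag variants compute the same thing. Under the precondition stated in the loop below (the current tag byte is in *tag and ip already points one byte past it), a branchy reference formulation would look like this (sketch only; AdvanceToNextTagReference is a hypothetical name, and the speculative load relies on the same 64 slop bytes as the real loop):

// Branchy reference formulation (sketch, not in snappy.cc) of what the two
// AdvanceToNextTag* helpers above compute. Precondition: *tag is the current
// tag byte and ip points one byte past it; slop bytes make ip[skip] readable.
#include <cstddef>
#include <cstdint>

inline size_t AdvanceToNextTagReference(const uint8_t** ip_p, size_t* tag) {
  const uint8_t*& ip = *ip_p;
  const size_t tag_type = *tag & 3;
  // A literal tag (tag_type == 0) is followed by (*tag >> 2) + 1 payload
  // bytes; copy-1/copy-2 tags are followed by tag_type offset bytes.
  const size_t skip = (tag_type == 0) ? (*tag >> 2) + 1 : tag_type;
  *tag = ip[skip];  // speculatively load the next tag byte
  ip += skip + 1;   // advance past the payload/offset bytes and the next tag
  return tag_type;
}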
1350
1351
// Extract the offset for copy-1 and copy-2 returns 0 for literals or copy-4.
1352
0
inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) {
1353
  // For x86 non-static storage works better. For ARM static storage is better.
1354
  // TODO: Once the array is recognized as a register, improve the
1355
  // readability for x86.
1356
0
#if defined(__x86_64__)
1357
0
  constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
1358
0
  uint16_t result;
1359
0
  memcpy(&result,
1360
0
         reinterpret_cast<const char*>(&kExtractMasksCombined) + 2 * tag_type,
1361
0
         sizeof(result));
1362
0
  return val & result;
1363
#elif defined(__aarch64__)
1364
  constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
1365
  return val & static_cast<uint32_t>(
1366
      (kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF);
1367
#else
1368
  static constexpr uint32_t kExtractMasks[4] = {0, 0xFF, 0xFFFF, 0};
1369
  return val & kExtractMasks[tag_type];
1370
#endif
1371
0
};
1372
1373
// Core decompression loop, when there is enough data available.
1374
// Decompresses the input buffer [ip, ip_limit) into the output buffer
1375
// [op, op_limit_min_slop). Returning when either we are too close to the end
1376
// of the input buffer, or we exceed op_limit_min_slop or when a exceptional
1377
// tag is encountered (literal of length > 60) or a copy-4.
1378
// Returns {ip, op} at the points it stopped decoding.
1379
// TODO This function probably does not need to be inlined, as it
1380
// should decode large chunks at a time. This allows runtime dispatch to
1381
// implementations based on CPU capability (BMI2 / perhaps 32 / 64 byte memcpy).
1382
template <typename T>
1383
std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
1384
    const uint8_t* ip, const uint8_t* ip_limit, ptrdiff_t op, T op_base,
1385
0
    ptrdiff_t op_limit_min_slop) {
1386
  // If deferred_src is invalid point it here.
1387
0
  uint8_t safe_source[64];
1388
0
  const void* deferred_src;
1389
0
  size_t deferred_length;
1390
0
  ClearDeferred(&deferred_src, &deferred_length, safe_source);
1391
1392
  // We unroll the inner loop twice so we need twice the spare room.
1393
0
  op_limit_min_slop -= kSlopBytes;
1394
0
  if (2 * (kSlopBytes + 1) < ip_limit - ip && op < op_limit_min_slop) {
1395
0
    const uint8_t* const ip_limit_min_slop = ip_limit - 2 * kSlopBytes - 1;
1396
0
    ip++;
1397
    // ip points just past the tag and we are touching at most kSlopBytes
1398
    // in an iteration.
1399
0
    size_t tag = ip[-1];
1400
#if defined(__clang__) && defined(__aarch64__)
1401
    // Workaround for https://bugs.llvm.org/show_bug.cgi?id=51317
1402
    // when loading 1 byte, clang for aarch64 doesn't realize that it(ldrb)
1403
    // comes with free zero-extension, so clang generates another
1404
    // 'and xn, xm, 0xff' before it use that as the offset. This 'and' is
1405
    // redundant and can be removed by adding this dummy asm, which gives
1406
    // clang a hint that we're doing the zero-extension at the load.
1407
    asm("" ::"r"(tag));
1408
#endif
1409
0
    do {
1410
      // The throughput is limited by instructions; unrolling the inner loop
1411
      // twice reduces the number of instructions checking limits and also
1412
      // leads to fewer mov's.
1413
1414
0
      SNAPPY_PREFETCH(ip + 128);
1415
0
      for (int i = 0; i < 2; i++) {
1416
0
        const uint8_t* old_ip = ip;
1417
0
        assert(tag == ip[-1]);
1418
        // For literals tag_type = 0, hence we will always obtain 0 from
1419
        // ExtractLowBytes. For literals offset will thus be kLiteralOffset.
1420
0
        ptrdiff_t len_minus_offset = kLengthMinusOffset[tag];
1421
0
        uint32_t next;
1422
#if defined(__aarch64__)
1423
        size_t tag_type = AdvanceToNextTagARMOptimized(&ip, &tag);
1424
        // We never need more than 16 bits. Doing a Load16 allows the compiler
1425
        // to elide the masking operation in ExtractOffset.
1426
        next = LittleEndian::Load16(old_ip);
1427
#else
1428
0
        size_t tag_type = AdvanceToNextTagX86Optimized(&ip, &tag);
1429
0
        next = LittleEndian::Load32(old_ip);
1430
0
#endif
1431
0
        size_t len = len_minus_offset & 0xFF;
1432
0
        ptrdiff_t extracted = ExtractOffset(next, tag_type);
1433
0
        ptrdiff_t len_min_offset = len_minus_offset - extracted;
1434
0
        if (SNAPPY_PREDICT_FALSE(len_minus_offset > extracted)) {
1435
0
          if (SNAPPY_PREDICT_FALSE(len & 0x80)) {
1436
            // Exceptional case (long literal or copy 4).
1437
            // Actually doing the copy here is negatively impacting the main
1438
            // loop due to compiler incorrectly allocating a register for
1439
            // this fallback. Hence we just break.
1440
0
          break_loop:
1441
0
            ip = old_ip;
1442
0
            goto exit;
1443
0
          }
1444
          // Only copy-1 or copy-2 tags can get here.
1445
0
          assert(tag_type == 1 || tag_type == 2);
1446
0
          std::ptrdiff_t delta = (op + deferred_length) + len_min_offset - len;
1447
          // Guard against copies before the buffer start.
1448
          // Execute any deferred MemCopy since we write to dst here.
1449
0
          MemCopy64(op_base + op, deferred_src, deferred_length);
1450
0
          op += deferred_length;
1451
0
          ClearDeferred(&deferred_src, &deferred_length, safe_source);
1452
0
          if (SNAPPY_PREDICT_FALSE(delta < 0 ||
1453
0
                                  !Copy64BytesWithPatternExtension(
1454
0
                                      op_base + op, len - len_min_offset))) {
1455
0
            goto break_loop;
1456
0
          }
1457
          // We aren't deferring this copy so add length right away.
1458
0
          op += len;
1459
0
          continue;
1460
0
        }
1461
0
        std::ptrdiff_t delta = (op + deferred_length) + len_min_offset - len;
1462
0
        if (SNAPPY_PREDICT_FALSE(delta < 0)) {
1463
          // Due to the spurious offset in literals, this will trigger
1464
          // at the start of a block when op is still smaller than 256.
1465
0
          if (tag_type != 0) goto break_loop;
1466
0
          MemCopy64(op_base + op, deferred_src, deferred_length);
1467
0
          op += deferred_length;
1468
0
          DeferMemCopy(&deferred_src, &deferred_length, old_ip, len);
1469
0
          continue;
1470
0
        }
1471
1472
        // For copies we need to copy from op_base + delta, for literals
1473
        // we need to copy from ip instead of from the stream.
1474
0
        const void* from =
1475
0
            tag_type ? reinterpret_cast<void*>(op_base + delta) : old_ip;
1476
0
        MemCopy64(op_base + op, deferred_src, deferred_length);
1477
0
        op += deferred_length;
1478
0
        DeferMemCopy(&deferred_src, &deferred_length, from, len);
1479
0
      }
1480
0
    } while (ip < ip_limit_min_slop &&
1481
0
             static_cast<ptrdiff_t>(op + deferred_length) < op_limit_min_slop);
1482
0
  exit:
1483
0
    ip--;
1484
0
    assert(ip <= ip_limit);
1485
0
  }
1486
  // If we deferred a copy then we can perform it.  If we are up to date then we
1487
  // might not have enough slop bytes and could run past the end.
1488
0
  if (deferred_length) {
1489
0
    MemCopy64(op_base + op, deferred_src, deferred_length);
1490
0
    op += deferred_length;
1491
0
    ClearDeferred(&deferred_src, &deferred_length, safe_source);
1492
0
  }
1493
0
  return {ip, op};
1494
0
}
Unexecuted instantiation: std::__1::pair<unsigned char const*, long> duckdb_snappy::DecompressBranchless<char*>(unsigned char const*, unsigned char const*, long, char*, long)
Unexecuted instantiation: std::__1::pair<unsigned char const*, long> duckdb_snappy::DecompressBranchless<unsigned long>(unsigned char const*, unsigned char const*, long, unsigned long, long)
1495
1496
// Helper class for decompression
1497
class SnappyDecompressor {
1498
 private:
1499
  Source* reader_;        // Underlying source of bytes to decompress
1500
  const char* ip_;        // Points to next buffered byte
1501
  const char* ip_limit_;  // Points just past buffered bytes
1502
  // If ip < ip_limit_min_maxtaglen_ it's safe to read kMaxTagLength from
1503
  // buffer.
1504
  const char* ip_limit_min_maxtaglen_;
1505
  uint32_t peeked_;                  // Bytes peeked from reader (need to skip)
1506
  bool eof_;                         // Hit end of input without an error?
1507
  char scratch_[kMaximumTagLength];  // See RefillTag().
1508
1509
  // Ensure that all of the tag metadata for the next tag is available
1510
  // in [ip_..ip_limit_-1].  Also ensures that [ip,ip+4] is readable even
1511
  // if (ip_limit_ - ip_ < 5).
1512
  //
1513
  // Returns true on success, false on error or end of input.
1514
  bool RefillTag();
1515
1516
0
  void ResetLimit(const char* ip) {
1517
0
    ip_limit_min_maxtaglen_ =
1518
0
        ip_limit_ - std::min<ptrdiff_t>(ip_limit_ - ip, kMaximumTagLength - 1);
1519
0
  }
1520
1521
 public:
1522
  explicit SnappyDecompressor(Source* reader)
1523
0
      : reader_(reader), ip_(NULL), ip_limit_(NULL), peeked_(0), eof_(false) {}
1524
1525
0
  ~SnappyDecompressor() {
1526
    // Advance past any bytes we peeked at from the reader
1527
0
    reader_->Skip(peeked_);
1528
0
  }
1529
1530
  // Returns true iff we have hit the end of the input without an error.
1531
0
  bool eof() const { return eof_; }
1532
1533
  // Read the uncompressed length stored at the start of the compressed data.
1534
  // On success, stores the length in *result and returns true.
1535
  // On failure, returns false.
1536
0
  bool ReadUncompressedLength(uint32_t* result) {
1537
0
    assert(ip_ == NULL);  // Must not have read anything yet
1538
    // Length is encoded in 1..5 bytes
1539
0
    *result = 0;
1540
0
    uint32_t shift = 0;
1541
0
    while (true) {
1542
0
      if (shift >= 32) return false;
1543
0
      size_t n;
1544
0
      const char* ip = reader_->Peek(&n);
1545
0
      if (n == 0) return false;
1546
0
      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
1547
0
      reader_->Skip(1);
1548
0
      uint32_t val = c & 0x7f;
1549
0
      if (LeftShiftOverflows(static_cast<uint8_t>(val), shift)) return false;
1550
0
      *result |= val << shift;
1551
0
      if (c < 128) {
1552
0
        break;
1553
0
      }
1554
0
      shift += 7;
1555
0
    }
1556
0
    return true;
1557
0
  }
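The length prefix read here is a little-endian base-128 varint of at most five bytes; the LeftShiftOverflows check above additionally rejects encodings whose value would not fit in 32 bits. A standalone sketch of the same decoding over a flat buffer (DecodeVarint32 is a hypothetical helper, and it omits that overflow check):

// Sketch: decode the 1..5-byte varint length prefix from a flat buffer.
// Returns the number of bytes consumed, or 0 on truncated input. Unlike
// ReadUncompressedLength above, it does not reject 32-bit overflow.
#include <cstddef>
#include <cstdint>

inline size_t DecodeVarint32(const uint8_t* p, size_t n, uint32_t* result) {
  uint32_t value = 0;
  for (size_t i = 0; i < n && i < 5; ++i) {
    const uint32_t byte = p[i];
    value |= (byte & 0x7f) << (7 * i);
    if (byte < 0x80) {  // high bit clear: this was the last byte
      *result = value;
      return i + 1;
    }
  }
  return 0;
}

// Example: the bytes {0xFE, 0xFF, 0x07} decode to 0x1FFFE (131070).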
1558
1559
  // Process the next item found in the input.
1560
  // Returns true if successful, false on error or end of input.
1561
  template <class Writer>
1562
#if defined(__GNUC__) && defined(__x86_64__)
1563
  __attribute__((aligned(32)))
1564
#endif
1565
  void
1566
0
  DecompressAllTags(Writer* writer) {
1567
0
    const char* ip = ip_;
1568
0
    ResetLimit(ip);
1569
0
    auto op = writer->GetOutputPtr();
1570
    // We could have put this refill fragment only at the beginning of the loop.
1571
    // However, duplicating it at the end of each branch gives the compiler more
1572
    // scope to optimize the <ip_limit_ - ip> expression based on the local
1573
    // context, which overall increases speed.
1574
0
#define MAYBE_REFILL()                                      \
1575
0
  if (SNAPPY_PREDICT_FALSE(ip >= ip_limit_min_maxtaglen_)) { \
1576
0
    ip_ = ip;                                               \
1577
0
    if (SNAPPY_PREDICT_FALSE(!RefillTag())) goto exit;       \
1578
0
    ip = ip_;                                               \
1579
0
    ResetLimit(ip);                                         \
1580
0
  }                                                         \
1581
0
  preload = static_cast<uint8_t>(*ip)
1582
1583
    // At the start of the for loop below the least significant byte of preload
1584
    // contains the tag.
1585
0
    uint32_t preload;
1586
0
    MAYBE_REFILL();
1587
0
    for (;;) {
1588
0
      {
1589
0
        ptrdiff_t op_limit_min_slop;
1590
0
        auto op_base = writer->GetBase(&op_limit_min_slop);
1591
0
        if (op_base) {
1592
0
          auto res =
1593
0
              DecompressBranchless(reinterpret_cast<const uint8_t*>(ip),
1594
0
                                   reinterpret_cast<const uint8_t*>(ip_limit_),
1595
0
                                   op - op_base, op_base, op_limit_min_slop);
1596
0
          ip = reinterpret_cast<const char*>(res.first);
1597
0
          op = op_base + res.second;
1598
0
          MAYBE_REFILL();
1599
0
        }
1600
0
      }
1601
0
      const uint8_t c = static_cast<uint8_t>(preload);
1602
0
      ip++;
1603
1604
      // Ratio of iterations that have LITERAL vs non-LITERAL for different
1605
      // inputs.
1606
      //
1607
      // input          LITERAL  NON_LITERAL
1608
      // -----------------------------------
1609
      // html|html4|cp   23%        77%
1610
      // urls            36%        64%
1611
      // jpg             47%        53%
1612
      // pdf             19%        81%
1613
      // txt[1-4]        25%        75%
1614
      // pb              24%        76%
1615
      // bin             24%        76%
1616
0
      if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
1617
0
        size_t literal_length = (c >> 2) + 1u;
1618
0
        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length, &op)) {
1619
0
          assert(literal_length < 61);
1620
0
          ip += literal_length;
1621
          // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
1622
          // will not return true unless there's already at least five spare
1623
          // bytes in addition to the literal.
1624
0
          preload = static_cast<uint8_t>(*ip);
1625
0
          continue;
1626
0
        }
1627
0
        if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
1628
          // Long literal.
1629
0
          const size_t literal_length_length = literal_length - 60;
1630
0
          literal_length =
1631
0
              ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
1632
0
              1;
1633
0
          ip += literal_length_length;
1634
0
        }
1635
1636
0
        size_t avail = ip_limit_ - ip;
1637
0
        while (avail < literal_length) {
1638
0
          if (!writer->Append(ip, avail, &op)) goto exit;
1639
0
          literal_length -= avail;
1640
0
          reader_->Skip(peeked_);
1641
0
          size_t n;
1642
0
          ip = reader_->Peek(&n);
1643
0
          avail = n;
1644
0
          peeked_ = avail;
1645
0
          if (avail == 0) goto exit;
1646
0
          ip_limit_ = ip + avail;
1647
0
          ResetLimit(ip);
1648
0
        }
1649
0
        if (!writer->Append(ip, literal_length, &op)) goto exit;
1650
0
        ip += literal_length;
1651
0
        MAYBE_REFILL();
1652
0
      } else {
1653
0
        if (SNAPPY_PREDICT_FALSE((c & 3) == COPY_4_BYTE_OFFSET)) {
1654
0
          const size_t copy_offset = LittleEndian::Load32(ip);
1655
0
          const size_t length = (c >> 2) + 1;
1656
0
          ip += 4;
1657
1658
0
          if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
1659
0
        } else {
1660
0
          const ptrdiff_t entry = kLengthMinusOffset[c];
1661
0
          preload = LittleEndian::Load32(ip);
1662
0
          const uint32_t trailer = ExtractLowBytes(preload, c & 3);
1663
0
          const uint32_t length = entry & 0xff;
1664
0
          assert(length > 0);
1665
1666
          // copy_offset/256 is encoded in bits 8..10.  By just fetching
1667
          // those bits, we get copy_offset (since the bit-field starts at
1668
          // bit 8).
1669
0
          const uint32_t copy_offset = trailer - entry + length;
1670
0
          if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
1671
1672
0
          ip += (c & 3);
1673
          // By using the result of the previous load we reduce the critical
1674
          // dependency chain of ip to 4 cycles.
1675
0
          preload >>= (c & 3) * 8;
1676
0
          if (ip < ip_limit_min_maxtaglen_) continue;
1677
0
        }
1678
0
        MAYBE_REFILL();
1679
0
      }
1680
0
    }
1681
0
#undef MAYBE_REFILL
1682
0
  exit:
1683
0
    writer->SetOutputPtr(op);
1684
0
  }
Unexecuted instantiation: void duckdb_snappy::SnappyDecompressor::DecompressAllTags<duckdb_snappy::SnappyIOVecWriter>(duckdb_snappy::SnappyIOVecWriter*)
Unexecuted instantiation: void duckdb_snappy::SnappyDecompressor::DecompressAllTags<duckdb_snappy::SnappyDecompressionValidator>(duckdb_snappy::SnappyDecompressionValidator*)
Unexecuted instantiation: void duckdb_snappy::SnappyDecompressor::DecompressAllTags<duckdb_snappy::SnappyArrayWriter>(duckdb_snappy::SnappyArrayWriter*)
Unexecuted instantiation: void duckdb_snappy::SnappyDecompressor::DecompressAllTags<duckdb_snappy::SnappyScatteredWriter<duckdb_snappy::SnappySinkAllocator> >(duckdb_snappy::SnappyScatteredWriter<duckdb_snappy::SnappySinkAllocator>*)
1685
};
1686
1687
0
constexpr uint32_t CalculateNeeded(uint8_t tag) {
1688
0
  return ((tag & 3) == 0 && tag >= (60 * 4))
1689
0
             ? (tag >> 2) - 58
1690
0
             : (0x05030201 >> ((tag * 8) & 31)) & 0xFF;
1691
0
}
1692
1693
#if __cplusplus >= 201402L
1694
constexpr bool VerifyCalculateNeeded() {
1695
  for (int i = 0; i < 1; i++) {
1696
    if (CalculateNeeded(i) != (char_table[i] >> 11) + 1) return false;
1697
  }
1698
  return true;
1699
}
1700
1701
// Make sure CalculateNeeded is correct by verifying it against the established
1702
// table encoding the number of added bytes needed.
1703
static_assert(VerifyCalculateNeeded(), "");
1704
#endif  // c++14
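Concretely, the packed constant 0x05030201 supplies the byte counts {1, 2, 3, 5} for the four tag types, while the first branch covers literal tags with length codes 60..63, which carry 1..4 explicit length bytes. A few hand-checked spot values (sketch only; these asserts are not in the source and assume CalculateNeeded is in scope):

// Hand-checked spot values for CalculateNeeded (sketch only).
//   0x04: short literal              -> 1 byte (the tag itself)
//   0xF0: literal, length code 60    -> 2 bytes (tag + 1 length byte)
//   0xFC: literal, length code 63    -> 5 bytes (tag + 4 length bytes)
//   0x01: copy with 1-byte offset    -> 2 bytes (tag + 1 offset byte)
//   0x02: copy with 2-byte offset    -> 3 bytes (tag + 2 offset bytes)
//   0x03: copy with 4-byte offset    -> 5 bytes (tag + 4 offset bytes)
static_assert(CalculateNeeded(0x04) == 1, "");
static_assert(CalculateNeeded(0xF0) == 2, "");
static_assert(CalculateNeeded(0xFC) == 5, "");
static_assert(CalculateNeeded(0x01) == 2, "");
static_assert(CalculateNeeded(0x02) == 3, "");
static_assert(CalculateNeeded(0x03) == 5, "");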
1705
1706
0
bool SnappyDecompressor::RefillTag() {
1707
0
  const char* ip = ip_;
1708
0
  if (ip == ip_limit_) {
1709
    // Fetch a new fragment from the reader
1710
0
    reader_->Skip(peeked_);  // All peeked bytes are used up
1711
0
    size_t n;
1712
0
    ip = reader_->Peek(&n);
1713
0
    peeked_ = n;
1714
0
    eof_ = (n == 0);
1715
0
    if (eof_) return false;
1716
0
    ip_limit_ = ip + n;
1717
0
  }
1718
1719
  // Read the tag character
1720
0
  assert(ip < ip_limit_);
1721
0
  const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
1722
  // At this point make sure that the data for the next tag is consecutive.
1723
  // For copy 1 this means the next 2 bytes (tag and 1 byte offset)
1724
  // For copy 2 the next 3 bytes (tag and 2 byte offset)
1725
  // For copy 4 the next 5 bytes (tag and 4 byte offset)
1726
  // For all small literals we only need 1 byte, but for literals 60...63 the
1727
  // length is encoded in 1...4 extra bytes.
1728
0
  const uint32_t needed = CalculateNeeded(c);
1729
0
  assert(needed <= sizeof(scratch_));
1730
1731
  // Read more bytes from reader if needed
1732
0
  uint32_t nbuf = ip_limit_ - ip;
1733
0
  if (nbuf < needed) {
1734
    // Stitch together bytes from ip and reader to form the word
1735
    // contents.  We store the needed bytes in "scratch_".  They
1736
    // will be consumed immediately by the caller since we do not
1737
    // read more than we need.
1738
0
    std::memmove(scratch_, ip, nbuf);
1739
0
    reader_->Skip(peeked_);  // All peeked bytes are used up
1740
0
    peeked_ = 0;
1741
0
    while (nbuf < needed) {
1742
0
      size_t length;
1743
0
      const char* src = reader_->Peek(&length);
1744
0
      if (length == 0) return false;
1745
0
      uint32_t to_add = std::min<uint32_t>(needed - nbuf, length);
1746
0
      std::memcpy(scratch_ + nbuf, src, to_add);
1747
0
      nbuf += to_add;
1748
0
      reader_->Skip(to_add);
1749
0
    }
1750
0
    assert(nbuf == needed);
1751
0
    ip_ = scratch_;
1752
0
    ip_limit_ = scratch_ + needed;
1753
0
  } else if (nbuf < kMaximumTagLength) {
1754
    // Have enough bytes, but move into scratch_ so that we do not
1755
    // read past end of input
1756
0
    std::memmove(scratch_, ip, nbuf);
1757
0
    reader_->Skip(peeked_);  // All peeked bytes are used up
1758
0
    peeked_ = 0;
1759
0
    ip_ = scratch_;
1760
0
    ip_limit_ = scratch_ + nbuf;
1761
0
  } else {
1762
    // Pass pointer to buffer returned by reader_.
1763
0
    ip_ = ip;
1764
0
  }
1765
0
  return true;
1766
0
}
1767
1768
template <typename Writer>
1769
0
static bool InternalUncompress(Source* r, Writer* writer) {
1770
  // Read the uncompressed length from the front of the compressed input
1771
0
  SnappyDecompressor decompressor(r);
1772
0
  uint32_t uncompressed_len = 0;
1773
0
  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
1774
1775
0
  return InternalUncompressAllTags(&decompressor, writer, r->Available(),
1776
0
                                   uncompressed_len);
1777
0
}
Unexecuted instantiation: snappy.cc:bool duckdb_snappy::InternalUncompress<duckdb_snappy::SnappyIOVecWriter>(duckdb_snappy::Source*, duckdb_snappy::SnappyIOVecWriter*)
Unexecuted instantiation: snappy.cc:bool duckdb_snappy::InternalUncompress<duckdb_snappy::SnappyArrayWriter>(duckdb_snappy::Source*, duckdb_snappy::SnappyArrayWriter*)
Unexecuted instantiation: snappy.cc:bool duckdb_snappy::InternalUncompress<duckdb_snappy::SnappyDecompressionValidator>(duckdb_snappy::Source*, duckdb_snappy::SnappyDecompressionValidator*)
Unexecuted instantiation: snappy.cc:bool duckdb_snappy::InternalUncompress<duckdb_snappy::SnappyScatteredWriter<duckdb_snappy::SnappySinkAllocator> >(duckdb_snappy::Source*, duckdb_snappy::SnappyScatteredWriter<duckdb_snappy::SnappySinkAllocator>*)
1778
1779
template <typename Writer>
1780
static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
1781
                                      Writer* writer, uint32_t compressed_len,
1782
0
                                      uint32_t uncompressed_len) {
1783
0
    int token = 0;
1784
0
  Report(token, "snappy_uncompress", compressed_len, uncompressed_len);
1785
1786
0
  writer->SetExpectedLength(uncompressed_len);
1787
1788
  // Process the entire input
1789
0
  decompressor->DecompressAllTags(writer);
1790
0
  writer->Flush();
1791
0
  return (decompressor->eof() && writer->CheckLength());
1792
0
}
Unexecuted instantiation: snappy.cc:bool duckdb_snappy::InternalUncompressAllTags<duckdb_snappy::SnappyIOVecWriter>(duckdb_snappy::SnappyDecompressor*, duckdb_snappy::SnappyIOVecWriter*, unsigned int, unsigned int)
Unexecuted instantiation: snappy.cc:bool duckdb_snappy::InternalUncompressAllTags<duckdb_snappy::SnappyDecompressionValidator>(duckdb_snappy::SnappyDecompressor*, duckdb_snappy::SnappyDecompressionValidator*, unsigned int, unsigned int)
Unexecuted instantiation: snappy.cc:bool duckdb_snappy::InternalUncompressAllTags<duckdb_snappy::SnappyArrayWriter>(duckdb_snappy::SnappyDecompressor*, duckdb_snappy::SnappyArrayWriter*, unsigned int, unsigned int)
Unexecuted instantiation: snappy.cc:bool duckdb_snappy::InternalUncompressAllTags<duckdb_snappy::SnappyScatteredWriter<duckdb_snappy::SnappySinkAllocator> >(duckdb_snappy::SnappyDecompressor*, duckdb_snappy::SnappyScatteredWriter<duckdb_snappy::SnappySinkAllocator>*, unsigned int, unsigned int)
1793
1794
0
bool GetUncompressedLength(Source* source, uint32_t* result) {
1795
0
  SnappyDecompressor decompressor(source);
1796
0
  return decompressor.ReadUncompressedLength(result);
1797
0
}
1798
1799
0
size_t Compress(Source* reader, Sink* writer) {
1800
0
  return Compress(reader, writer, CompressionOptions{});
1801
0
}
1802
1803
0
size_t Compress(Source* reader, Sink* writer, CompressionOptions options) {
1804
0
  assert(options.level == 1 || options.level == 2);
1805
0
  int token = 0;
1806
0
  size_t written = 0;
1807
0
  size_t N = reader->Available();
1808
0
  const size_t uncompressed_size = N;
1809
0
  char ulength[Bignum::kMax32];
1810
0
  char* p = Bignum::Encode32(ulength, N);
1811
0
  writer->Append(ulength, p - ulength);
1812
0
  written += (p - ulength);
1813
1814
0
  internal::WorkingMemory wmem(N);
1815
1816
0
  while (N > 0) {
1817
    // Get next block to compress (without copying if possible)
1818
0
    size_t fragment_size;
1819
0
    const char* fragment = reader->Peek(&fragment_size);
1820
0
    assert(fragment_size != 0);  // premature end of input
1821
0
    const size_t num_to_read = std::min(N, kBlockSize);
1822
0
    size_t bytes_read = fragment_size;
1823
1824
0
    size_t pending_advance = 0;
1825
0
    if (bytes_read >= num_to_read) {
1826
      // Buffer returned by reader is large enough
1827
0
      pending_advance = num_to_read;
1828
0
      fragment_size = num_to_read;
1829
0
    } else {
1830
0
      char* scratch = wmem.GetScratchInput();
1831
0
      std::memcpy(scratch, fragment, bytes_read);
1832
0
      reader->Skip(bytes_read);
1833
1834
0
      while (bytes_read < num_to_read) {
1835
0
        fragment = reader->Peek(&fragment_size);
1836
0
        size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
1837
0
        std::memcpy(scratch + bytes_read, fragment, n);
1838
0
        bytes_read += n;
1839
0
        reader->Skip(n);
1840
0
      }
1841
0
      assert(bytes_read == num_to_read);
1842
0
      fragment = scratch;
1843
0
      fragment_size = num_to_read;
1844
0
    }
1845
0
    assert(fragment_size == num_to_read);
1846
1847
    // Get encoding table for compression
1848
0
    int table_size;
1849
0
    uint16_t* table = wmem.GetHashTable(num_to_read, &table_size);
1850
1851
    // Compress input_fragment and append to dest
1852
0
    int max_output = MaxCompressedLength(num_to_read);
1853
1854
    // Since we encode kBlockSize regions followed by a region
1855
    // which is <= kBlockSize in length, a previously allocated
1856
    // scratch_output[] region is big enough for this iteration.
1857
    // Need a scratch buffer for the output, in case the byte sink doesn't
1858
    // have room for us directly.
1859
0
    char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
1860
0
    char* end = nullptr;
1861
0
    if (options.level == 1) {
1862
0
      end = internal::CompressFragment(fragment, fragment_size, dest, table,
1863
0
                                       table_size);
1864
0
    } else if (options.level == 2) {
1865
0
      end = internal::CompressFragmentDoubleHash(
1866
0
          fragment, fragment_size, dest, table, table_size >> 1,
1867
0
          table + (table_size >> 1), table_size >> 1);
1868
0
    }
1869
0
    writer->Append(dest, end - dest);
1870
0
    written += (end - dest);
1871
1872
0
    N -= num_to_read;
1873
0
    reader->Skip(pending_advance);
1874
0
  }
1875
1876
0
  Report(token, "snappy_compress", written, uncompressed_size);
1877
0
  return written;
1878
0
}
1879
1880
// -----------------------------------------------------------------------
1881
// IOVec interfaces
1882
// -----------------------------------------------------------------------
1883
1884
// A `Source` implementation that yields the contents of an `iovec` array. Note
1885
// that `total_size` is the total number of bytes to be read from the elements
1886
// of `iov` (_not_ the total number of elements in `iov`).
1887
class SnappyIOVecReader : public Source {
1888
 public:
1889
  SnappyIOVecReader(const struct iovec* iov, size_t total_size)
1890
0
      : curr_iov_(iov),
1891
0
        curr_pos_(total_size > 0 ? reinterpret_cast<const char*>(iov->iov_base)
1892
0
                                 : nullptr),
1893
0
        curr_size_remaining_(total_size > 0 ? iov->iov_len : 0),
1894
0
        total_size_remaining_(total_size) {
1895
    // Skip empty leading `iovec`s.
1896
0
    if (total_size > 0 && curr_size_remaining_ == 0) Advance();
1897
0
  }
1898
1899
  ~SnappyIOVecReader() override = default;
1900
1901
0
  size_t Available() const override { return total_size_remaining_; }
1902
1903
0
  const char* Peek(size_t* len) override {
1904
0
    *len = curr_size_remaining_;
1905
0
    return curr_pos_;
1906
0
  }
1907
1908
0
  void Skip(size_t n) override {
1909
0
    while (n >= curr_size_remaining_ && n > 0) {
1910
0
      n -= curr_size_remaining_;
1911
0
      Advance();
1912
0
    }
1913
0
    curr_size_remaining_ -= n;
1914
0
    total_size_remaining_ -= n;
1915
0
    curr_pos_ += n;
1916
0
  }
1917
1918
 private:
1919
  // Advances to the next nonempty `iovec` and updates related variables.
1920
0
  void Advance() {
1921
0
    do {
1922
0
      assert(total_size_remaining_ >= curr_size_remaining_);
1923
0
      total_size_remaining_ -= curr_size_remaining_;
1924
0
      if (total_size_remaining_ == 0) {
1925
0
        curr_pos_ = nullptr;
1926
0
        curr_size_remaining_ = 0;
1927
0
        return;
1928
0
      }
1929
0
      ++curr_iov_;
1930
0
      curr_pos_ = reinterpret_cast<const char*>(curr_iov_->iov_base);
1931
0
      curr_size_remaining_ = curr_iov_->iov_len;
1932
0
    } while (curr_size_remaining_ == 0);
1933
0
  }
1934
1935
  // The `iovec` currently being read.
1936
  const struct iovec* curr_iov_;
1937
  // The location in `curr_iov_` currently being read.
1938
  const char* curr_pos_;
1939
  // The amount of unread data in `curr_iov_`.
1940
  size_t curr_size_remaining_;
1941
  // The amount of unread data in the entire input array.
1942
  size_t total_size_remaining_;
1943
};
1944
1945
// A type that writes to an iovec.
1946
// Note that this is not a "ByteSink", but a type that matches the
1947
// Writer template argument to SnappyDecompressor::DecompressAllTags().
1948
class SnappyIOVecWriter {
1949
 private:
1950
  // output_iov_end_ is set to iov + count and used to determine when
1951
  // the end of the iovs is reached.
1952
  const struct iovec* output_iov_end_;
1953
1954
#if !defined(NDEBUG)
1955
  const struct iovec* output_iov_;
1956
#endif  // !defined(NDEBUG)
1957
1958
  // Current iov that is being written into.
1959
  const struct iovec* curr_iov_;
1960
1961
  // Pointer to current iov's write location.
1962
  char* curr_iov_output_;
1963
1964
  // Remaining bytes to write into curr_iov_output.
1965
  size_t curr_iov_remaining_;
1966
1967
  // Total bytes decompressed into output_iov_ so far.
1968
  size_t total_written_;
1969
1970
  // Maximum number of bytes that will be decompressed into output_iov_.
1971
  size_t output_limit_;
1972
1973
0
  static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
1974
0
    return reinterpret_cast<char*>(iov->iov_base) + offset;
1975
0
  }
1976
1977
 public:
1978
  // Does not take ownership of iov. iov must be valid during the
1979
  // entire lifetime of the SnappyIOVecWriter.
1980
  inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
1981
0
      : output_iov_end_(iov + iov_count),
1982
#if !defined(NDEBUG)
1983
        output_iov_(iov),
1984
#endif  // !defined(NDEBUG)
1985
0
        curr_iov_(iov),
1986
0
        curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
1987
0
                                   : nullptr),
1988
0
        curr_iov_remaining_(iov_count ? iov->iov_len : 0),
1989
0
        total_written_(0),
1990
0
        output_limit_(-1) {
1991
0
  }
1992
1993
0
  inline void SetExpectedLength(size_t len) { output_limit_ = len; }
1994
1995
0
  inline bool CheckLength() const { return total_written_ == output_limit_; }
1996
1997
0
  inline bool Append(const char* ip, size_t len, char**) {
1998
0
    if (total_written_ + len > output_limit_) {
1999
0
      return false;
2000
0
    }
2001
2002
0
    return AppendNoCheck(ip, len);
2003
0
  }
2004
2005
0
  char* GetOutputPtr() { return nullptr; }
2006
0
  char* GetBase(ptrdiff_t*) { return nullptr; }
2007
0
  void SetOutputPtr(char* op) {
2008
    // TODO: Switch to [[maybe_unused]] when we can assume C++17.
2009
0
    (void)op;
2010
0
  }
2011
2012
0
  inline bool AppendNoCheck(const char* ip, size_t len) {
2013
0
    while (len > 0) {
2014
0
      if (curr_iov_remaining_ == 0) {
2015
        // This iovec is full. Go to the next one.
2016
0
        if (curr_iov_ + 1 >= output_iov_end_) {
2017
0
          return false;
2018
0
        }
2019
0
        ++curr_iov_;
2020
0
        curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
2021
0
        curr_iov_remaining_ = curr_iov_->iov_len;
2022
0
      }
2023
2024
0
      const size_t to_write = std::min(len, curr_iov_remaining_);
2025
0
      std::memcpy(curr_iov_output_, ip, to_write);
2026
0
      curr_iov_output_ += to_write;
2027
0
      curr_iov_remaining_ -= to_write;
2028
0
      total_written_ += to_write;
2029
0
      ip += to_write;
2030
0
      len -= to_write;
2031
0
    }
2032
2033
0
    return true;
2034
0
  }
2035
2036
  inline bool TryFastAppend(const char* ip, size_t available, size_t len,
2037
0
                            char**) {
2038
0
    const size_t space_left = output_limit_ - total_written_;
2039
0
    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
2040
0
        curr_iov_remaining_ >= 16) {
2041
      // Fast path, used for the majority (about 95%) of invocations.
2042
0
      UnalignedCopy128(ip, curr_iov_output_);
2043
0
      curr_iov_output_ += len;
2044
0
      curr_iov_remaining_ -= len;
2045
0
      total_written_ += len;
2046
0
      return true;
2047
0
    }
2048
2049
0
    return false;
2050
0
  }
2051
2052
0
  inline bool AppendFromSelf(size_t offset, size_t len, char**) {
2053
    // See SnappyArrayWriter::AppendFromSelf for an explanation of
2054
    // the "offset - 1u" trick.
2055
0
    if (offset - 1u >= total_written_) {
2056
0
      return false;
2057
0
    }
2058
0
    const size_t space_left = output_limit_ - total_written_;
2059
0
    if (len > space_left) {
2060
0
      return false;
2061
0
    }
2062
2063
    // Locate the iovec from which we need to start the copy.
2064
0
    const iovec* from_iov = curr_iov_;
2065
0
    size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
2066
0
    while (offset > 0) {
2067
0
      if (from_iov_offset >= offset) {
2068
0
        from_iov_offset -= offset;
2069
0
        break;
2070
0
      }
2071
2072
0
      offset -= from_iov_offset;
2073
0
      --from_iov;
2074
#if !defined(NDEBUG)
2075
      assert(from_iov >= output_iov_);
2076
#endif  // !defined(NDEBUG)
2077
0
      from_iov_offset = from_iov->iov_len;
2078
0
    }
2079
2080
    // Copy <len> bytes starting from the iovec pointed to by from_iov to
2081
    // the current iovec.
2082
0
    while (len > 0) {
2083
0
      assert(from_iov <= curr_iov_);
2084
0
      if (from_iov != curr_iov_) {
2085
0
        const size_t to_copy =
2086
0
            std::min(from_iov->iov_len - from_iov_offset, len);
2087
0
        AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
2088
0
        len -= to_copy;
2089
0
        if (len > 0) {
2090
0
          ++from_iov;
2091
0
          from_iov_offset = 0;
2092
0
        }
2093
0
      } else {
2094
0
        size_t to_copy = curr_iov_remaining_;
2095
0
        if (to_copy == 0) {
2096
          // This iovec is full. Go to the next one.
2097
0
          if (curr_iov_ + 1 >= output_iov_end_) {
2098
0
            return false;
2099
0
          }
2100
0
          ++curr_iov_;
2101
0
          curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
2102
0
          curr_iov_remaining_ = curr_iov_->iov_len;
2103
0
          continue;
2104
0
        }
2105
0
        if (to_copy > len) {
2106
0
          to_copy = len;
2107
0
        }
2108
0
        assert(to_copy > 0);
2109
2110
0
        IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
2111
0
                        curr_iov_output_, curr_iov_output_ + to_copy,
2112
0
                        curr_iov_output_ + curr_iov_remaining_);
2113
0
        curr_iov_output_ += to_copy;
2114
0
        curr_iov_remaining_ -= to_copy;
2115
0
        from_iov_offset += to_copy;
2116
0
        total_written_ += to_copy;
2117
0
        len -= to_copy;
2118
0
      }
2119
0
    }
2120
2121
0
    return true;
2122
0
  }
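The "offset - 1u" comparison used above (and again in SnappyDecompressionValidator and SlowAppendFromSelf) folds two checks into one: with unsigned arithmetic, offset == 0 wraps to a huge value and fails the bound test exactly like an offset that reaches before the start of the output. A small illustration (sketch only; ValidBackReference is a hypothetical name):

// Sketch of the "offset - 1u" trick: a single unsigned comparison rejects
// both offset == 0 and offset > produced.
#include <cassert>
#include <cstddef>

static bool ValidBackReference(size_t offset, size_t produced) {
  // Equivalent to: offset != 0 && offset <= produced.
  return offset - 1u < produced;
}

int main() {
  assert(!ValidBackReference(0, 100));    // offset 0 wraps to SIZE_MAX
  assert(ValidBackReference(1, 100));     // copy of the previous byte
  assert(ValidBackReference(100, 100));   // may reach back to the first byte
  assert(!ValidBackReference(101, 100));  // reaches before the output start
  return 0;
}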
2123
2124
0
  inline void Flush() {}
2125
};
2126
2127
bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
2128
0
                          const struct iovec* iov, size_t iov_cnt) {
2129
0
  ByteArraySource reader(compressed, compressed_length);
2130
0
  return RawUncompressToIOVec(&reader, iov, iov_cnt);
2131
0
}
2132
2133
bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
2134
0
                          size_t iov_cnt) {
2135
0
  SnappyIOVecWriter output(iov, iov_cnt);
2136
0
  return InternalUncompress(compressed, &output);
2137
0
}
2138
2139
// -----------------------------------------------------------------------
2140
// Flat array interfaces
2141
// -----------------------------------------------------------------------
2142
2143
// A type that writes to a flat array.
2144
// Note that this is not a "ByteSink", but a type that matches the
2145
// Writer template argument to SnappyDecompressor::DecompressAllTags().
2146
class SnappyArrayWriter {
2147
 private:
2148
  char* base_;
2149
  char* op_;
2150
  char* op_limit_;
2151
  // If op < op_limit_min_slop_ then it's safe to unconditionally write
2152
  // kSlopBytes starting at op.
2153
  char* op_limit_min_slop_;
2154
2155
 public:
2156
  inline explicit SnappyArrayWriter(char* dst)
2157
0
      : base_(dst),
2158
0
        op_(dst),
2159
0
        op_limit_(dst),
2160
0
        op_limit_min_slop_(dst) {}  // Safe default see invariant.
2161
2162
0
  inline void SetExpectedLength(size_t len) {
2163
0
    op_limit_ = op_ + len;
2164
    // Prevent pointer from being past the buffer.
2165
0
    op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, len);
2166
0
  }
2167
2168
0
  inline bool CheckLength() const { return op_ == op_limit_; }
2169
2170
0
  char* GetOutputPtr() { return op_; }
2171
0
  char* GetBase(ptrdiff_t* op_limit_min_slop) {
2172
0
    *op_limit_min_slop = op_limit_min_slop_ - base_;
2173
0
    return base_;
2174
0
  }
2175
0
  void SetOutputPtr(char* op) { op_ = op; }
2176
2177
0
  inline bool Append(const char* ip, size_t len, char** op_p) {
2178
0
    char* op = *op_p;
2179
0
    const size_t space_left = op_limit_ - op;
2180
0
    if (space_left < len) return false;
2181
0
    std::memcpy(op, ip, len);
2182
0
    *op_p = op + len;
2183
0
    return true;
2184
0
  }
2185
2186
  inline bool TryFastAppend(const char* ip, size_t available, size_t len,
2187
0
                            char** op_p) {
2188
0
    char* op = *op_p;
2189
0
    const size_t space_left = op_limit_ - op;
2190
0
    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
2191
      // Fast path, used for the majority (about 95%) of invocations.
2192
0
      UnalignedCopy128(ip, op);
2193
0
      *op_p = op + len;
2194
0
      return true;
2195
0
    } else {
2196
0
      return false;
2197
0
    }
2198
0
  }
2199
2200
  SNAPPY_ATTRIBUTE_ALWAYS_INLINE
2201
0
  inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
2202
0
    assert(len > 0);
2203
0
    char* const op = *op_p;
2204
0
    assert(op >= base_);
2205
0
    char* const op_end = op + len;
2206
2207
    // Check if we try to append from before the start of the buffer.
2208
0
    if (SNAPPY_PREDICT_FALSE(static_cast<size_t>(op - base_) < offset))
2209
0
      return false;
2210
2211
0
    if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) ||
2212
0
                            op >= op_limit_min_slop_ || offset < len)) {
2213
0
      if (op_end > op_limit_ || offset == 0) return false;
2214
0
      *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);
2215
0
      return true;
2216
0
    }
2217
0
    std::memmove(op, op - offset, kSlopBytes);
2218
0
    *op_p = op_end;
2219
0
    return true;
2220
0
  }
2221
0
  inline size_t Produced() const {
2222
0
    assert(op_ >= base_);
2223
0
    return op_ - base_;
2224
0
  }
2225
0
  inline void Flush() {}
2226
};
2227
2228
bool RawUncompress(const char* compressed, size_t compressed_length,
2229
0
                   char* uncompressed) {
2230
0
  ByteArraySource reader(compressed, compressed_length);
2231
0
  return RawUncompress(&reader, uncompressed);
2232
0
}
2233
2234
0
bool RawUncompress(Source* compressed, char* uncompressed) {
2235
0
  SnappyArrayWriter output(uncompressed);
2236
0
  return InternalUncompress(compressed, &output);
2237
0
}
2238
2239
bool Uncompress(const char* compressed, size_t compressed_length,
2240
0
                std::string* uncompressed) {
2241
0
  size_t ulength;
2242
0
  if (!GetUncompressedLength(compressed, compressed_length, &ulength)) {
2243
0
    return false;
2244
0
  }
2245
  // On 32-bit builds: max_size() < kuint32max.  Check for that instead
2246
  // of crashing (e.g., consider externally specified compressed data).
2247
0
  if (ulength > uncompressed->max_size()) {
2248
0
    return false;
2249
0
  }
2250
0
  STLStringResizeUninitialized(uncompressed, ulength);
2251
0
  return RawUncompress(compressed, compressed_length,
2252
0
                       string_as_array(uncompressed));
2253
0
}
2254
2255
// A Writer that drops everything on the floor and just does validation
2256
class SnappyDecompressionValidator {
2257
 private:
2258
  size_t expected_;
2259
  size_t produced_;
2260
2261
 public:
2262
0
  inline SnappyDecompressionValidator() : expected_(0), produced_(0) {}
2263
0
  inline void SetExpectedLength(size_t len) { expected_ = len; }
2264
0
  size_t GetOutputPtr() { return produced_; }
2265
0
  size_t GetBase(ptrdiff_t* op_limit_min_slop) {
2266
0
    *op_limit_min_slop = std::numeric_limits<ptrdiff_t>::max() - kSlopBytes + 1;
2267
0
    return 1;
2268
0
  }
2269
0
  void SetOutputPtr(size_t op) { produced_ = op; }
2270
0
  inline bool CheckLength() const { return expected_ == produced_; }
2271
0
  inline bool Append(const char* ip, size_t len, size_t* produced) {
2272
    // TODO: Switch to [[maybe_unused]] when we can assume C++17.
2273
0
    (void)ip;
2274
2275
0
    *produced += len;
2276
0
    return *produced <= expected_;
2277
0
  }
2278
  inline bool TryFastAppend(const char* ip, size_t available, size_t length,
2279
0
                            size_t* produced) {
2280
    // TODO: Switch to [[maybe_unused]] when we can assume C++17.
2281
0
    (void)ip;
2282
0
    (void)available;
2283
0
    (void)length;
2284
0
    (void)produced;
2285
2286
0
    return false;
2287
0
  }
2288
0
  inline bool AppendFromSelf(size_t offset, size_t len, size_t* produced) {
2289
    // See SnappyArrayWriter::AppendFromSelf for an explanation of
2290
    // the "offset - 1u" trick.
2291
0
    if (*produced <= offset - 1u) return false;
2292
0
    *produced += len;
2293
0
    return *produced <= expected_;
2294
0
  }
2295
0
  inline void Flush() {}
2296
};
2297
2298
0
bool IsValidCompressedBuffer(const char* compressed, size_t compressed_length) {
2299
0
  ByteArraySource reader(compressed, compressed_length);
2300
0
  SnappyDecompressionValidator writer;
2301
0
  return InternalUncompress(&reader, &writer);
2302
0
}
2303
2304
0
bool IsValidCompressed(Source* compressed) {
2305
0
  SnappyDecompressionValidator writer;
2306
0
  return InternalUncompress(compressed, &writer);
2307
0
}
2308
2309
void RawCompress(const char* input, size_t input_length, char* compressed,
2310
0
                 size_t* compressed_length) {
2311
0
  RawCompress(input, input_length, compressed, compressed_length,
2312
0
              CompressionOptions{});
2313
0
}
2314
2315
void RawCompress(const char* input, size_t input_length, char* compressed,
2316
0
                 size_t* compressed_length, CompressionOptions options) {
2317
0
  ByteArraySource reader(input, input_length);
2318
0
  UncheckedByteArraySink writer(compressed);
2319
0
  Compress(&reader, &writer, options);
2320
2321
  // Compute how many bytes were added
2322
0
  *compressed_length = (writer.CurrentDestination() - compressed);
2323
0
}
2324
2325
void RawCompressFromIOVec(const struct iovec* iov, size_t uncompressed_length,
2326
0
                          char* compressed, size_t* compressed_length) {
2327
0
  RawCompressFromIOVec(iov, uncompressed_length, compressed, compressed_length,
2328
0
                       CompressionOptions{});
2329
0
}
2330
2331
void RawCompressFromIOVec(const struct iovec* iov, size_t uncompressed_length,
2332
                          char* compressed, size_t* compressed_length,
2333
0
                          CompressionOptions options) {
2334
0
  SnappyIOVecReader reader(iov, uncompressed_length);
2335
0
  UncheckedByteArraySink writer(compressed);
2336
0
  Compress(&reader, &writer, options);
2337
2338
  // Compute how many bytes were added.
2339
0
  *compressed_length = writer.CurrentDestination() - compressed;
2340
0
}
2341
2342
size_t Compress(const char* input, size_t input_length,
2343
0
                std::string* compressed) {
2344
0
  return Compress(input, input_length, compressed, CompressionOptions{});
2345
0
}
2346
2347
size_t Compress(const char* input, size_t input_length, std::string* compressed,
2348
0
                CompressionOptions options) {
2349
  // Pre-grow the buffer to the max length of the compressed output
2350
0
  STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
2351
2352
0
  size_t compressed_length;
2353
0
  RawCompress(input, input_length, string_as_array(compressed),
2354
0
              &compressed_length, options);
2355
0
  compressed->erase(compressed_length);
2356
0
  return compressed_length;
2357
0
}
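A round trip through the std::string entry points defined in this file (usage sketch; assumes the bundled snappy.h header and this file's duckdb_snappy namespace):

// Usage sketch for the std::string entry points defined above.
#include <cassert>
#include <string>
#include "snappy.h"

int main() {
  const std::string original(10000, 'x');  // highly compressible input

  std::string compressed;
  duckdb_snappy::Compress(original.data(), original.size(), &compressed);
  assert(duckdb_snappy::IsValidCompressedBuffer(compressed.data(),
                                                compressed.size()));

  std::string restored;
  assert(duckdb_snappy::Uncompress(compressed.data(), compressed.size(),
                                   &restored));
  assert(restored == original);
  return 0;
}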
2358
2359
size_t CompressFromIOVec(const struct iovec* iov, size_t iov_cnt,
2360
0
                         std::string* compressed) {
2361
0
  return CompressFromIOVec(iov, iov_cnt, compressed, CompressionOptions{});
2362
0
}
2363
2364
size_t CompressFromIOVec(const struct iovec* iov, size_t iov_cnt,
2365
0
                         std::string* compressed, CompressionOptions options) {
2366
  // Compute the number of bytes to be compressed.
2367
0
  size_t uncompressed_length = 0;
2368
0
  for (size_t i = 0; i < iov_cnt; ++i) {
2369
0
    uncompressed_length += iov[i].iov_len;
2370
0
  }
2371
2372
  // Pre-grow the buffer to the max length of the compressed output.
2373
0
  STLStringResizeUninitialized(compressed, MaxCompressedLength(
2374
0
      uncompressed_length));
2375
2376
0
  size_t compressed_length;
2377
0
  RawCompressFromIOVec(iov, uncompressed_length, string_as_array(compressed),
2378
0
                       &compressed_length, options);
2379
0
  compressed->erase(compressed_length);
2380
0
  return compressed_length;
2381
0
}
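The iovec entry points compose the same way; a usage sketch that compresses two scattered input buffers and decompresses into two scattered output buffers (assumes the POSIX struct iovec and the bundled headers):

// Usage sketch for CompressFromIOVec (defined above) and RawUncompressToIOVec
// (defined earlier in this file).
#include <cassert>
#include <cstring>
#include <string>
#include <sys/uio.h>
#include "snappy.h"

int main() {
  char part1[6] = "hello";
  char part2[7] = " world";
  struct iovec in[2] = {{part1, 5}, {part2, 6}};

  std::string compressed;
  duckdb_snappy::CompressFromIOVec(in, 2, &compressed);

  char out1[5], out2[6];
  struct iovec out[2] = {{out1, sizeof(out1)}, {out2, sizeof(out2)}};
  assert(duckdb_snappy::RawUncompressToIOVec(compressed.data(),
                                             compressed.size(), out, 2));
  assert(std::memcmp(out1, "hello", 5) == 0);
  assert(std::memcmp(out2, " world", 6) == 0);
  return 0;
}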
2382
2383
// -----------------------------------------------------------------------
2384
// Sink interface
2385
// -----------------------------------------------------------------------
2386
2387
// A type that decompresses into a Sink. The template parameter
2388
// Allocator must export one method "char* Allocate(int size);", which
2389
// allocates a buffer of "size" and appends that to the destination.
2390
template <typename Allocator>
2391
class SnappyScatteredWriter {
2392
  Allocator allocator_;
2393
2394
  // We need random access into the data generated so far.  Therefore
2395
  // we keep track of all of the generated data as an array of blocks.
2396
  // All of the blocks except the last have length kBlockSize.
2397
  std::vector<char*> blocks_;
2398
  size_t expected_;
2399
2400
  // Total size of all fully generated blocks so far
2401
  size_t full_size_;
2402
2403
  // Pointer into current output block
2404
  char* op_base_;   // Base of output block
2405
  char* op_ptr_;    // Pointer to next unfilled byte in block
2406
  char* op_limit_;  // Pointer just past block
2407
  // If op < op_limit_min_slop_ then it's safe to unconditionally write
2408
  // kSlopBytes starting at op.
2409
  char* op_limit_min_slop_;
2410
2411
0
  inline size_t Size() const { return full_size_ + (op_ptr_ - op_base_); }
2412
2413
  bool SlowAppend(const char* ip, size_t len);
2414
  bool SlowAppendFromSelf(size_t offset, size_t len);
2415
2416
 public:
2417
  inline explicit SnappyScatteredWriter(const Allocator& allocator)
2418
0
      : allocator_(allocator),
2419
0
        full_size_(0),
2420
        op_base_(NULL),
2421
        op_ptr_(NULL),
2422
        op_limit_(NULL),
2423
0
        op_limit_min_slop_(NULL) {}
2424
0
  char* GetOutputPtr() { return op_ptr_; }
2425
0
  char* GetBase(ptrdiff_t* op_limit_min_slop) {
2426
0
    *op_limit_min_slop = op_limit_min_slop_ - op_base_;
2427
0
    return op_base_;
2428
0
  }
2429
0
  void SetOutputPtr(char* op) { op_ptr_ = op; }
2430
2431
0
  inline void SetExpectedLength(size_t len) {
2432
0
    assert(blocks_.empty());
2433
0
    expected_ = len;
2434
0
  }
2435
2436
0
  inline bool CheckLength() const { return Size() == expected_; }
2437
2438
  // Return the number of bytes actually uncompressed so far
2439
0
  inline size_t Produced() const { return Size(); }
2440
2441
0
  inline bool Append(const char* ip, size_t len, char** op_p) {
2442
0
    char* op = *op_p;
2443
0
    size_t avail = op_limit_ - op;
2444
0
    if (len <= avail) {
2445
      // Fast path
2446
0
      std::memcpy(op, ip, len);
2447
0
      *op_p = op + len;
2448
0
      return true;
2449
0
    } else {
2450
0
      op_ptr_ = op;
2451
0
      bool res = SlowAppend(ip, len);
2452
0
      *op_p = op_ptr_;
2453
0
      return res;
2454
0
    }
2455
0
  }
2456
2457
  inline bool TryFastAppend(const char* ip, size_t available, size_t length,
2458
0
                            char** op_p) {
2459
0
    char* op = *op_p;
2460
0
    const int space_left = op_limit_ - op;
2461
0
    if (length <= 16 && available >= 16 + kMaximumTagLength &&
2462
0
        space_left >= 16) {
2463
      // Fast path, used for the majority (about 95%) of invocations.
2464
0
      UnalignedCopy128(ip, op);
2465
0
      *op_p = op + length;
2466
0
      return true;
2467
0
    } else {
2468
0
      return false;
2469
0
    }
2470
0
  }
2471
2472
0
  inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
2473
0
    char* op = *op_p;
2474
0
    assert(op >= op_base_);
2475
    // Check if we try to append from before the start of the buffer.
2476
0
    if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) ||
2477
0
                            static_cast<size_t>(op - op_base_) < offset ||
2478
0
                            op >= op_limit_min_slop_ || offset < len)) {
2479
0
      if (offset == 0) return false;
2480
0
      if (SNAPPY_PREDICT_FALSE(static_cast<size_t>(op - op_base_) < offset ||
2481
0
                              op + len > op_limit_)) {
2482
0
        op_ptr_ = op;
2483
0
        bool res = SlowAppendFromSelf(offset, len);
2484
0
        *op_p = op_ptr_;
2485
0
        return res;
2486
0
      }
2487
0
      *op_p = IncrementalCopy(op - offset, op, op + len, op_limit_);
2488
0
      return true;
2489
0
    }
2490
    // Fast path
2491
0
    char* const op_end = op + len;
2492
0
    std::memmove(op, op - offset, kSlopBytes);
2493
0
    *op_p = op_end;
2494
0
    return true;
2495
0
  }
2496
2497
  // Called at the end of the decompress. We ask the allocator
2498
  // to write all blocks to the sink.
2499
0
  inline void Flush() { allocator_.Flush(Produced()); }
2500
};
2501
2502
template <typename Allocator>
2503
0
bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
2504
0
  size_t avail = op_limit_ - op_ptr_;
2505
0
  while (len > avail) {
2506
    // Completely fill this block
2507
0
    std::memcpy(op_ptr_, ip, avail);
2508
0
    op_ptr_ += avail;
2509
0
    assert(op_limit_ - op_ptr_ == 0);
2510
0
    full_size_ += (op_ptr_ - op_base_);
2511
0
    len -= avail;
2512
0
    ip += avail;
2513
2514
    // Bounds check
2515
0
    if (full_size_ + len > expected_) return false;
2516
2517
    // Make new block
2518
0
    size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);
2519
0
    op_base_ = allocator_.Allocate(bsize);
2520
0
    op_ptr_ = op_base_;
2521
0
    op_limit_ = op_base_ + bsize;
2522
0
    op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, bsize);
2523
2524
0
    blocks_.push_back(op_base_);
2525
0
    avail = bsize;
2526
0
  }
2527
2528
0
  std::memcpy(op_ptr_, ip, len);
2529
0
  op_ptr_ += len;
2530
0
  return true;
2531
0
}
2532
2533
template <typename Allocator>
2534
bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
2535
0
                                                         size_t len) {
2536
  // Overflow check
2537
  // See SnappyArrayWriter::AppendFromSelf for an explanation of
2538
  // the "offset - 1u" trick.
2539
0
  const size_t cur = Size();
2540
0
  if (offset - 1u >= cur) return false;
2541
0
  if (expected_ - cur < len) return false;
2542
2543
  // Currently we shouldn't ever hit this path because Compress() chops the
2544
  // input into blocks and does not create cross-block copies. However, it is
2545
  // nice if we do not rely on that, since we can get better compression if we
2546
  // allow cross-block copies and thus might want to change the compressor in
2547
  // the future.
2548
  // TODO Replace this with a properly optimized path. This is not
2549
  // triggered right now. But this is so slow that it would regress
2550
  // performance unacceptably if triggered.
2551
0
  size_t src = cur - offset;
2552
0
  char* op = op_ptr_;
2553
0
  while (len-- > 0) {
2554
0
    char c = blocks_[src >> kBlockLog][src & (kBlockSize - 1)];
2555
0
    if (!Append(&c, 1, &op)) {
2556
0
      op_ptr_ = op;
2557
0
      return false;
2558
0
    }
2559
0
    src++;
2560
0
  }
2561
0
  op_ptr_ = op;
2562
0
  return true;
2563
0
}
2564
2565
class SnappySinkAllocator {
2566
 public:
2567
0
  explicit SnappySinkAllocator(Sink* dest) : dest_(dest) {}
2568
2569
0
  char* Allocate(int size) {
2570
0
    Datablock block(new char[size], size);
2571
0
    blocks_.push_back(block);
2572
0
    return block.data;
2573
0
  }
2574
2575
  // We flush only at the end, because the writer wants
2576
  // random access to the blocks and once we hand the
2577
  // block over to the sink, we can't access it anymore.
2578
  // Also we don't write more than has been actually written
2579
  // to the blocks.
2580
0
  void Flush(size_t size) {
2581
0
    size_t size_written = 0;
2582
0
    for (Datablock& block : blocks_) {
2583
0
      size_t block_size = std::min<size_t>(block.size, size - size_written);
2584
0
      dest_->AppendAndTakeOwnership(block.data, block_size,
2585
0
                                    &SnappySinkAllocator::Deleter, NULL);
2586
0
      size_written += block_size;
2587
0
    }
2588
0
    blocks_.clear();
2589
0
  }
2590
2591
 private:
2592
  struct Datablock {
2593
    char* data;
2594
    size_t size;
2595
0
    Datablock(char* p, size_t s) : data(p), size(s) {}
2596
  };
2597
2598
0
  static void Deleter(void* arg, const char* bytes, size_t size) {
2599
    // TODO: Switch to [[maybe_unused]] when we can assume C++17.
2600
0
    (void)arg;
2601
0
    (void)size;
2602
2603
0
    delete[] bytes;
2604
0
  }
2605
2606
  Sink* dest_;
2607
  std::vector<Datablock> blocks_;
2608
2609
  // Note: copying this object is allowed
2610
};
2611
2612
0
size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) {
2613
0
  SnappySinkAllocator allocator(uncompressed);
2614
0
  SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
2615
0
  InternalUncompress(compressed, &writer);
2616
0
  return writer.Produced();
2617
0
}
2618
2619
0
bool Uncompress(Source* compressed, Sink* uncompressed) {
2620
  // Read the uncompressed length from the front of the compressed input
2621
0
  SnappyDecompressor decompressor(compressed);
2622
0
  uint32_t uncompressed_len = 0;
2623
0
  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) {
2624
0
    return false;
2625
0
  }
2626
2627
0
  char c;
2628
0
  size_t allocated_size;
2629
0
  char* buf = uncompressed->GetAppendBufferVariable(1, uncompressed_len, &c, 1,
2630
0
                                                    &allocated_size);
2631
2632
0
  const size_t compressed_len = compressed->Available();
2633
  // If we can get a flat buffer, then use it, otherwise do block by block
2634
  // uncompression
2635
0
  if (allocated_size >= uncompressed_len) {
2636
0
    SnappyArrayWriter writer(buf);
2637
0
    bool result = InternalUncompressAllTags(&decompressor, &writer,
2638
0
                                            compressed_len, uncompressed_len);
2639
0
    uncompressed->Append(buf, writer.Produced());
2640
0
    return result;
2641
0
  } else {
2642
0
    SnappySinkAllocator allocator(uncompressed);
2643
0
    SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
2644
0
    return InternalUncompressAllTags(&decompressor, &writer, compressed_len,
2645
0
                                     uncompressed_len);
2646
0
  }
2647
0
}
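A hedged usage sketch of the Source/Sink entry point above, assuming the ByteArraySource and UncheckedByteArraySink helpers declared in snappy-sinksource.h and an output buffer already large enough for the uncompressed data; this wrapper is an illustration, not library code.

#include "snappy.h"
#include "snappy-sinksource.h"

// Illustrative wrapper (assumed helper, not part of the library): decompress
// a flat compressed buffer into a caller-provided output buffer.
bool UncompressToBuffer(const char* compressed, size_t compressed_len,
                        char* output) {
  duckdb_snappy::ByteArraySource source(compressed, compressed_len);
  duckdb_snappy::UncheckedByteArraySink sink(output);
  return duckdb_snappy::Uncompress(&source, &sink);
}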
2648
2649
}  // namespace duckdb_snappy
2650
2651
#else // #if SNAPPY_NEW_VERSION
2652
2653
#include "snappy.h"
2654
#include "snappy-internal.h"
2655
#include "snappy-sinksource.h"
2656
2657
#if !defined(SNAPPY_HAVE_SSSE3)
2658
// __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
2659
// support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
2660
// defines __AVX__ when AVX support is available.
2661
#if defined(__SSSE3__) || defined(__AVX__)
2662
#define SNAPPY_HAVE_SSSE3 1
2663
#else
2664
#define SNAPPY_HAVE_SSSE3 0
2665
#endif
2666
#endif  // !defined(SNAPPY_HAVE_SSSE3)
2667
2668
#if !defined(SNAPPY_HAVE_BMI2)
2669
// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
2670
// specifically, but it does define __AVX2__ when AVX2 support is available.
2671
// Fortunately, AVX2 was introduced in Haswell, just like BMI2.
2672
//
2673
// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
2674
// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
2675
// case issuing BMI2 instructions results in a compiler error.
2676
#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
2677
#define SNAPPY_HAVE_BMI2 1
2678
#else
2679
#define SNAPPY_HAVE_BMI2 0
2680
#endif
2681
#endif  // !defined(SNAPPY_HAVE_BMI2)
2682
2683
#if SNAPPY_HAVE_SSSE3
2684
// Please do not replace with <x86intrin.h>. or with headers that assume more
2685
// advanced SSE versions without checking with all the OWNERS.
2686
#include <tmmintrin.h>
2687
#endif
2688
2689
#if SNAPPY_HAVE_BMI2
2690
// Please do not replace with <x86intrin.h>. or with headers that assume more
2691
// advanced SSE versions without checking with all the OWNERS.
2692
#include <immintrin.h>
2693
#endif
2694
2695
#include <stdio.h>
2696
2697
#include <algorithm>
2698
#include <string>
2699
#include <vector>
2700
2701
namespace duckdb_snappy {
2702
2703
using internal::COPY_1_BYTE_OFFSET;
2704
using internal::COPY_2_BYTE_OFFSET;
2705
using internal::LITERAL;
2706
using internal::char_table;
2707
using internal::kMaximumTagLength;
2708
2709
// Any hash function will produce a valid compressed bitstream, but a good
2710
// hash function reduces the number of collisions and thus yields better
2711
// compression for compressible input, and more speed for incompressible
2712
// input. Of course, it doesn't hurt if the hash function is reasonably fast
2713
// either, as it gets called a lot.
2714
static inline uint32 HashBytes(uint32 bytes, int shift) {
2715
  uint32 kMul = 0x1e35a7bd;
2716
  return (bytes * kMul) >> shift;
2717
}
2718
static inline uint32 Hash(const char* p, int shift) {
2719
  return HashBytes(UNALIGNED_LOAD32(p), shift);
2720
}
2721
2722
size_t MaxCompressedLength(size_t source_len) {
2723
  // Compressed data can be defined as:
2724
  //    compressed := item* literal*
2725
  //    item       := literal* copy
2726
  //
2727
  // The trailing literal sequence has a space blowup of at most 62/60
2728
  // since a literal of length 60 needs one tag byte + one extra byte
2729
  // for length information.
2730
  //
2731
  // Item blowup is trickier to measure.  Suppose the "copy" op copies
2732
  // 4 bytes of data.  Because of a special check in the encoding code,
2733
  // we produce a 4-byte copy only if the offset is < 65536.  Therefore
2734
  // the copy op takes 3 bytes to encode, and this type of item leads
2735
  // to at most the 62/60 blowup for representing literals.
2736
  //
2737
  // Suppose the "copy" op copies 5 bytes of data.  If the offset is big
2738
  // enough, it will take 5 bytes to encode the copy op.  Therefore the
2739
  // worst case here is a one-byte literal followed by a five-byte copy.
2740
  // I.e., 6 bytes of input turn into 7 bytes of "compressed" data.
2741
  //
2742
  // This last factor dominates the blowup, so the final estimate is:
2743
  return 32 + source_len + source_len/6;
2744
}
2745
2746
namespace {
2747
2748
void UnalignedCopy64(const void* src, void* dst) {
2749
  char tmp[8];
2750
  memcpy(tmp, src, 8);
2751
  memcpy(dst, tmp, 8);
2752
}
2753
2754
void UnalignedCopy128(const void* src, void* dst) {
2755
  // memcpy gets vectorized when the appropriate compiler options are used.
2756
  // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load
2757
  // and store.
2758
  char tmp[16];
2759
  memcpy(tmp, src, 16);
2760
  memcpy(dst, tmp, 16);
2761
}
2762
2763
// Copy [src, src+(op_limit-op)) to [op, op_limit) a byte at a time. Used
2764
// for handling COPY operations where the input and output regions may overlap.
2765
// For example, suppose:
2766
//    src       == "ab"
2767
//    op        == src + 2
2768
//    op_limit  == op + 20
2769
// After IncrementalCopySlow(src, op, op_limit), the result will have eleven
2770
// copies of "ab"
2771
//    ababababababababababab
2772
// Note that this does not match the semantics of either memcpy() or memmove().
2773
inline char* IncrementalCopySlow(const char* src, char* op,
2774
                                 char* const op_limit) {
2775
  // TODO: Remove pragma when LLVM is aware this
2776
  // function is only called in cold regions and when cold regions don't get
2777
  // vectorized or unrolled.
2778
#ifdef __clang__
2779
#pragma clang loop unroll(disable)
2780
#endif
2781
  while (op < op_limit) {
2782
    *op++ = *src++;
2783
  }
2784
  return op_limit;
2785
}
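A self-contained sketch of the overlapping byte-at-a-time copy described above, expanding a two-byte pattern in place; the buffer size and pattern are arbitrary example values.

#include <cstdio>

int main() {
  char buf[32] = "ab";                    // pattern sits at the start of the buffer
  const char* src = buf;
  char* op = buf + 2;                     // op == src + 2
  char* const op_limit = op + 20;
  while (op < op_limit) *op++ = *src++;   // same loop as IncrementalCopySlow
  std::printf("%s\n", buf);               // prints eleven copies of "ab"
  return 0;
}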
2786
2787
#if SNAPPY_HAVE_SSSE3
2788
2789
// This is a table of shuffle control masks that can be used as the source
2790
// operand for PSHUFB to permute the contents of the destination XMM register
2791
// into a repeating byte pattern.
2792
alignas(16) const char pshufb_fill_patterns[7][16] = {
2793
  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
2794
  {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
2795
  {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
2796
  {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
2797
  {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
2798
  {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
2799
  {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
2800
};
2801
2802
#endif  // SNAPPY_HAVE_SSSE3
2803
2804
// Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than
2805
// IncrementalCopySlow. buf_limit is the address past the end of the writable
2806
// region of the buffer.
2807
inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
2808
                             char* const buf_limit) {
2809
  // Terminology:
2810
  //
2811
  // slop = buf_limit - op
2812
  // pat  = op - src
2813
  // len  = limit - op
2814
  assert(src < op);
2815
  assert(op <= op_limit);
2816
  assert(op_limit <= buf_limit);
2817
  // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
2818
  // to optimize this function but we have to also handle other cases in case
2819
  // the input does not satisfy these conditions.
2820
2821
  size_t pattern_size = op - src;
2822
  // The cases are split into different branches to allow the branch predictor,
2823
  // FDO, and static prediction hints to work better. For each input we list the
2824
  // ratio of invocations that match each condition.
2825
  //
2826
  // input        slop < 16   pat < 8  len > 16
2827
  // ------------------------------------------
2828
  // html|html4|cp   0%         1.01%    27.73%
2829
  // urls            0%         0.88%    14.79%
2830
  // jpg             0%        64.29%     7.14%
2831
  // pdf             0%         2.56%    58.06%
2832
  // txt[1-4]        0%         0.23%     0.97%
2833
  // pb              0%         0.96%    13.88%
2834
  // bin             0.01%     22.27%    41.17%
2835
  //
2836
  // It is very rare that we don't have enough slop for doing block copies. It
2837
  // is also rare that we need to expand a pattern. Small patterns are common
2838
  // for incompressible formats and for those we are plenty fast already.
2839
  // Lengths are normally not greater than 16 but they vary depending on the
2840
  // input. In general if we always predict len <= 16 it would be an ok
2841
  // prediction.
2842
  //
2843
  // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
2844
  // copying 2x 8 bytes at a time.
2845
2846
  // Handle the uncommon case where pattern is less than 8 bytes.
2847
  if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
2848
#if SNAPPY_HAVE_SSSE3
2849
    // Load the first eight bytes into a 128-bit XMM register, then use PSHUFB
2850
    // to permute the register's contents in-place into a repeating sequence of
2851
    // the first "pattern_size" bytes.
2852
    // For example, suppose:
2853
    //    src       == "abc"
2854
    //    op        == src + 3
2855
    // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
2856
    // followed by one byte of slop: abcabcabcabcabca.
2857
    //
2858
    // The non-SSE fallback implementation suffers from store-forwarding stalls
2859
    // because its loads and stores partly overlap. By expanding the pattern
2860
    // in-place, we avoid the penalty.
2861
    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
2862
      const __m128i shuffle_mask = _mm_load_si128(
2863
          reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
2864
          + pattern_size - 1);
2865
      const __m128i pattern = _mm_shuffle_epi8(
2866
          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
2867
      // Uninitialized bytes are masked out by the shuffle mask.
2868
      // TODO: remove annotation and macro defs once MSan is fixed.
2869
      SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
2870
      pattern_size *= 16 / pattern_size;
2871
      char* op_end = std::min(op_limit, buf_limit - 15);
2872
      while (op < op_end) {
2873
        _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
2874
        op += pattern_size;
2875
      }
2876
      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
2877
    }
2878
    return IncrementalCopySlow(src, op, op_limit);
2879
#else  // !SNAPPY_HAVE_SSSE3
2880
    // If plenty of buffer space remains, expand the pattern to at least 8
2881
    // bytes. The way the following loop is written, we need 8 bytes of buffer
2882
    // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
2883
    // bytes if pattern_size is 2.  Precisely encoding that is probably not
2884
    // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
2885
    // (because 11 are required in the worst case).
2886
    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
2887
      while (pattern_size < 8) {
2888
        UnalignedCopy64(src, op);
2889
        op += pattern_size;
2890
        pattern_size *= 2;
2891
      }
2892
      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
2893
    } else {
2894
      return IncrementalCopySlow(src, op, op_limit);
2895
    }
2896
#endif  // SNAPPY_HAVE_SSSE3
2897
  }
2898
  assert(pattern_size >= 8);
2899
2900
  // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
2901
  // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
2902
  // because expanding the pattern to at least 8 bytes guarantees that
2903
  // op - src >= 8.
2904
  //
2905
  // Typically, the op_limit is the gating factor so try to simplify the loop
2906
  // based on that.
2907
  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
2908
    // Factor the displacement from op to the source into a variable. This helps
2909
    // simplify the loop below by only varying the op pointer which we need to
2910
    // test for the end. Note that this was done after carefully examining the
2911
    // generated code to allow the addressing modes in the loop below to
2912
    // maximize micro-op fusion where possible on modern Intel processors. The
2913
    // generated code should be checked carefully for new processors or with
2914
    // major changes to the compiler.
2915
    // TODO: Simplify this code when the compiler reliably produces
2916
    // the correct x86 instruction sequence.
2917
    ptrdiff_t op_to_src = src - op;
2918
2919
    // The trip count of this loop is not large and so unrolling will only hurt
2920
    // code size without helping performance.
2921
    //
2922
    // TODO: Replace with loop trip count hint.
2923
#ifdef __clang__
2924
#pragma clang loop unroll(disable)
2925
#endif
2926
    do {
2927
      UnalignedCopy64(op + op_to_src, op);
2928
      UnalignedCopy64(op + op_to_src + 8, op + 8);
2929
      op += 16;
2930
    } while (op < op_limit);
2931
    return op_limit;
2932
  }
2933
2934
  // Fall back to doing as much as we can with the available slop in the
2935
  // buffer. This code path is relatively cold however so we save code size by
2936
  // avoiding unrolling and vectorizing.
2937
  //
2938
  // TODO: Remove pragma when cold regions don't get vectorized
2939
  // or unrolled.
2940
#ifdef __clang__
2941
#pragma clang loop unroll(disable)
2942
#endif
2943
  for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
2944
    UnalignedCopy64(src, op);
2945
    UnalignedCopy64(src + 8, op + 8);
2946
  }
2947
  if (op >= op_limit)
2948
    return op_limit;
2949
2950
  // We only take this branch if we didn't have enough slop and we can do a
2951
  // single 8 byte copy.
2952
  if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
2953
    UnalignedCopy64(src, op);
2954
    src += 8;
2955
    op += 8;
2956
  }
2957
  return IncrementalCopySlow(src, op, op_limit);
2958
}
2959
2960
}  // namespace
2961
2962
template <bool allow_fast_path>
2963
static inline char* EmitLiteral(char* op,
2964
                                const char* literal,
2965
                                int len) {
2966
  // The vast majority of copies are below 16 bytes, for which a
2967
  // call to memcpy is overkill. This fast path can sometimes
2968
  // copy up to 15 bytes too much, but that is okay in the
2969
  // main loop, since we have a bit to go on for both sides:
2970
  //
2971
  //   - The input will always have kInputMarginBytes = 15 extra
2972
  //     available bytes, as long as we're in the main loop, and
2973
  //     if not, allow_fast_path = false.
2974
  //   - The output will always have 32 spare bytes (see
2975
  //     MaxCompressedLength).
2976
  assert(len > 0);      // Zero-length literals are disallowed
2977
  int n = len - 1;
2978
  if (allow_fast_path && len <= 16) {
2979
    // Fits in tag byte
2980
    *op++ = LITERAL | (n << 2);
2981
2982
    UnalignedCopy128(literal, op);
2983
    return op + len;
2984
  }
2985
2986
  if (n < 60) {
2987
    // Fits in tag byte
2988
    *op++ = LITERAL | (n << 2);
2989
  } else {
2990
    int count = (Bits::Log2Floor(n) >> 3) + 1;
2991
    assert(count >= 1);
2992
    assert(count <= 4);
2993
    *op++ = LITERAL | ((59 + count) << 2);
2994
    // Encode in upcoming bytes.
2995
    // Write 4 bytes, though we may care about only 1 of them. The output buffer
2996
    // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
2997
    // here and there is a memcpy of size 'len' below.
2998
    LittleEndian::Store32(op, n);
2999
    op += count;
3000
  }
3001
  memcpy(op, literal, len);
3002
  return op + len;
3003
}
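A small standalone illustration of the literal tag bytes produced above; the concrete lengths 5 and 100 are just example values.

#include <cstdio>

int main() {
  // 5-byte literal: n = 4 fits in the tag, so tag = LITERAL(0) | (4 << 2) = 0x10,
  // followed directly by the 5 data bytes.
  std::printf("short literal tag: %02x\n", unsigned((5 - 1) << 2));

  // 100-byte literal: n = 99 needs one extra length byte, so the tag encodes
  // 59 + count = 60, i.e. (60 << 2) = 0xf0, followed by 0x63 (= 99), then the data.
  std::printf("long literal tag:  %02x %02x\n", unsigned((59 + 1) << 2), unsigned(99));
  return 0;
}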
3004
3005
template <bool len_less_than_12>
3006
static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
3007
  assert(len <= 64);
3008
  assert(len >= 4);
3009
  assert(offset < 65536);
3010
  assert(len_less_than_12 == (len < 12));
3011
3012
  if (len_less_than_12 && SNAPPY_PREDICT_TRUE(offset < 2048)) {
3013
    // offset fits in 11 bits.  The 3 highest go in the top of the first byte,
3014
    // and the rest go in the second byte.
3015
    *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
3016
    *op++ = offset & 0xff;
3017
  } else {
3018
    // Write 4 bytes, though we only care about 3 of them.  The output buffer
3019
    // is required to have some slack, so the extra byte won't overrun it.
3020
    uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
3021
    LittleEndian::Store32(op, u);
3022
    op += 3;
3023
  }
3024
  return op;
3025
}
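A standalone sketch of the two copy encodings emitted above, using the example values offset=10/len=5 and offset=3000/len=20; the tag type constants 1 and 2 correspond to COPY_1_BYTE_OFFSET and COPY_2_BYTE_OFFSET.

#include <cstdint>
#include <cstdio>

int main() {
  // 1-byte-offset copy (offset < 2048, 4 <= len < 12): offset = 10, len = 5.
  unsigned tag1 = 1 + ((5 - 4) << 2) + ((10 >> 3) & 0xe0);
  std::printf("1-byte-offset copy: %02x %02x\n", tag1, 10u & 0xffu);  // 05 0a

  // 2-byte-offset copy: offset = 3000, len = 20; three bytes are kept from
  // the little-endian 32-bit store.
  uint32_t u = 2 + ((20 - 1) << 2) + (3000u << 8);
  std::printf("2-byte-offset copy: %02x %02x %02x\n",
              unsigned(u & 0xff), unsigned((u >> 8) & 0xff),
              unsigned((u >> 16) & 0xff));                            // 4e b8 0b
  return 0;
}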
3026
3027
template <bool len_less_than_12>
3028
static inline char* EmitCopy(char* op, size_t offset, size_t len) {
3029
  assert(len_less_than_12 == (len < 12));
3030
  if (len_less_than_12) {
3031
    return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
3032
  } else {
3033
    // A special case for len <= 64 might help, but so far measurements suggest
3034
    // it's in the noise.
3035
3036
    // Emit 64 byte copies but make sure to keep at least four bytes reserved.
3037
    while (SNAPPY_PREDICT_FALSE(len >= 68)) {
3038
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
3039
      len -= 64;
3040
    }
3041
3042
    // One or two copies will now finish the job.
3043
    if (len > 64) {
3044
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
3045
      len -= 60;
3046
    }
3047
3048
    // Emit remainder.
3049
    if (len < 12) {
3050
      op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
3051
    } else {
3052
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
3053
    }
3054
    return op;
3055
  }
3056
}
3057
3058
bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
3059
  uint32 v = 0;
3060
  const char* limit = start + n;
3061
  if (Varint::Parse32WithLimit(start, limit, &v) != NULL) {
3062
    *result = v;
3063
    return true;
3064
  } else {
3065
    return false;
3066
  }
3067
}
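The length prefix parsed above is a little-endian base-128 varint. A minimal standalone encoder sketch (not the library's own Varint class) shows the byte layout; the value 64000 is just an example.

#include <cstdint>
#include <cstdio>

// 7 payload bits per byte, most-significant bit set on every byte but the last.
static int EncodeVarint32Sketch(uint32_t v, uint8_t* out) {
  int n = 0;
  while (v >= 0x80) {
    out[n++] = static_cast<uint8_t>((v & 0x7f) | 0x80);
    v >>= 7;
  }
  out[n++] = static_cast<uint8_t>(v);
  return n;
}

int main() {
  uint8_t buf[5];
  int n = EncodeVarint32Sketch(64000, buf);  // encodes as 80 f4 03
  for (int i = 0; i < n; ++i) std::printf("%02x ", unsigned(buf[i]));
  std::printf("\n");
  return 0;
}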
3068
3069
namespace {
3070
uint32 CalculateTableSize(uint32 input_size) {
3071
  assert(kMaxHashTableSize >= 256);
3072
  if (input_size > kMaxHashTableSize) {
3073
    return kMaxHashTableSize;
3074
  }
3075
  if (input_size < 256) {
3076
    return 256;
3077
  }
3078
  // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
3079
  // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
3080
  return 2u << Bits::Log2Floor(input_size - 1);
3081
}
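A small standalone illustration of the rounding performed above: clamp to the [256, kMaxHashTableSize] range, then round up to a power of two. The 16384 cap used here is only an assumed stand-in for kMaxHashTableSize.

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t kCap = 16384;  // assumption standing in for kMaxHashTableSize
  const uint32_t inputs[] = {100, 1000, 1024, 70000};
  for (uint32_t in : inputs) {
    uint32_t table = in > kCap ? kCap : (in < 256u ? 256u : 0u);
    if (table == 0) {           // round up to the next power of two
      table = 1;
      while (table < in) table <<= 1;
    }
    std::printf("input %-6u -> table size %u\n", in, table);
  }
  return 0;
}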
3082
}  // namespace
3083
3084
namespace internal {
3085
WorkingMemory::WorkingMemory(size_t input_size) {
3086
  const size_t max_fragment_size = std::min(input_size, kBlockSize);
3087
  const size_t table_size = CalculateTableSize(max_fragment_size);
3088
  size_ = table_size * sizeof(*table_) + max_fragment_size +
3089
          MaxCompressedLength(max_fragment_size);
3090
  mem_ = std::allocator<char>().allocate(size_);
3091
  table_ = reinterpret_cast<uint16*>(mem_);
3092
  input_ = mem_ + table_size * sizeof(*table_);
3093
  output_ = input_ + max_fragment_size;
3094
}
3095
3096
WorkingMemory::~WorkingMemory() {
3097
  std::allocator<char>().deallocate(mem_, size_);
3098
}
3099
3100
uint16* WorkingMemory::GetHashTable(size_t fragment_size,
3101
                                    int* table_size) const {
3102
  const size_t htsize = CalculateTableSize(fragment_size);
3103
  memset(table_, 0, htsize * sizeof(*table_));
3104
  *table_size = htsize;
3105
  return table_;
3106
}
3107
}  // end namespace internal
3108
3109
// For 0 <= offset <= 4, GetUint32AtOffset(GetEightBytesAt(p), offset) will
3110
// equal UNALIGNED_LOAD32(p + offset).  Motivation: On x86-64 hardware we have
3111
// empirically found that overlapping loads such as
3112
//  UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
3113
// are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
3114
//
3115
// We have different versions for 64- and 32-bit; ideally we would avoid the
3116
// two functions and just inline the UNALIGNED_LOAD64 call into
3117
// GetUint32AtOffset, but GCC (at least not as of 4.6) is seemingly not clever
3118
// enough to avoid loading the value multiple times then. For 64-bit, the load
3119
// is done when GetEightBytesAt() is called, whereas for 32-bit, the load is
3120
// done at GetUint32AtOffset() time.
3121
3122
#ifdef ARCH_K8
3123
3124
typedef uint64 EightBytesReference;
3125
3126
static inline EightBytesReference GetEightBytesAt(const char* ptr) {
3127
  return UNALIGNED_LOAD64(ptr);
3128
}
3129
3130
static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
3131
  assert(offset >= 0);
3132
  assert(offset <= 4);
3133
  return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
3134
}
3135
3136
#else
3137
3138
typedef const char* EightBytesReference;
3139
3140
static inline EightBytesReference GetEightBytesAt(const char* ptr) {
3141
  return ptr;
3142
}
3143
3144
static inline uint32 GetUint32AtOffset(const char* v, int offset) {
3145
  assert(offset >= 0);
3146
  assert(offset <= 4);
3147
  return UNALIGNED_LOAD32(v + offset);
3148
}
3149
3150
#endif
3151
3152
// Flat array compression that does not emit the "uncompressed length"
3153
// prefix. Compresses "input" string to the "*op" buffer.
3154
//
3155
// REQUIRES: "input" is at most "kBlockSize" bytes long.
3156
// REQUIRES: "op" points to an array of memory that is at least
3157
// "MaxCompressedLength(input.size())" in size.
3158
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
3159
// REQUIRES: "table_size" is a power of two
3160
//
3161
// Returns an "end" pointer into "op" buffer.
3162
// "end - op" is the compressed size of "input".
3163
namespace internal {
3164
char* CompressFragment(const char* input,
3165
                       size_t input_size,
3166
                       char* op,
3167
                       uint16* table,
3168
                       const int table_size) {
3169
  // "ip" is the input pointer, and "op" is the output pointer.
3170
  const char* ip = input;
3171
  assert(input_size <= kBlockSize);
3172
  assert((table_size & (table_size - 1)) == 0);  // table must be power of two
3173
  const int shift = 32 - Bits::Log2Floor(table_size);
3174
  // assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
3175
  const char* ip_end = input + input_size;
3176
  const char* base_ip = ip;
3177
  // Bytes in [next_emit, ip) will be emitted as literal bytes.  Or
3178
  // [next_emit, ip_end) after the main loop.
3179
  const char* next_emit = ip;
3180
3181
  const size_t kInputMarginBytes = 15;
3182
  if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
3183
    const char* ip_limit = input + input_size - kInputMarginBytes;
3184
3185
    for (uint32 next_hash = Hash(++ip, shift); ; ) {
3186
      assert(next_emit < ip);
3187
      // The body of this loop calls EmitLiteral once and then EmitCopy one or
3188
      // more times.  (The exception is that when we're close to exhausting
3189
      // the input we goto emit_remainder.)
3190
      //
3191
      // In the first iteration of this loop we're just starting, so
3192
      // there's nothing to copy, so calling EmitLiteral once is
3193
      // necessary.  And we only start a new iteration when the
3194
      // current iteration has determined that a call to EmitLiteral will
3195
      // precede the next call to EmitCopy (if any).
3196
      //
3197
      // Step 1: Scan forward in the input looking for a 4-byte-long match.
3198
      // If we get close to exhausting the input then goto emit_remainder.
3199
      //
3200
      // Heuristic match skipping: If 32 bytes are scanned with no matches
3201
      // found, start looking only at every other byte. If 32 more bytes are
3202
      // scanned (or skipped), look at every third byte, etc.. When a match is
3203
      // found, immediately go back to looking at every byte. This is a small
3204
      // loss (~5% performance, ~0.1% density) for compressible data due to more
3205
      // bookkeeping, but for non-compressible data (such as JPEG) it's a huge
3206
      // win since the compressor quickly "realizes" the data is incompressible
3207
      // and doesn't bother looking for matches everywhere.
3208
      //
3209
      // The "skip" variable keeps track of how many bytes there are since the
3210
      // last match; dividing it by 32 (ie. right-shifting by five) gives the
3211
      // number of bytes to move ahead for each iteration.
3212
      uint32 skip = 32;
3213
3214
      const char* next_ip = ip;
3215
      const char* candidate;
3216
      do {
3217
        ip = next_ip;
3218
        uint32 hash = next_hash;
3219
        assert(hash == Hash(ip, shift));
3220
        uint32 bytes_between_hash_lookups = skip >> 5;
3221
        skip += bytes_between_hash_lookups;
3222
        next_ip = ip + bytes_between_hash_lookups;
3223
        if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
3224
          goto emit_remainder;
3225
        }
3226
        next_hash = Hash(next_ip, shift);
3227
        candidate = base_ip + table[hash];
3228
        assert(candidate >= base_ip);
3229
        assert(candidate < ip);
3230
3231
        table[hash] = ip - base_ip;
3232
      } while (SNAPPY_PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
3233
                                 UNALIGNED_LOAD32(candidate)));
3234
3235
      // Step 2: A 4-byte match has been found.  We'll later see if more
3236
      // than 4 bytes match.  But, prior to the match, input
3237
      // bytes [next_emit, ip) are unmatched.  Emit them as "literal bytes."
3238
      assert(next_emit + 16 <= ip_end);
3239
      op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);
3240
3241
      // Step 3: Call EmitCopy, and then see if another EmitCopy could
3242
      // be our next move.  Repeat until we find no match for the
3243
      // input immediately after what was consumed by the last EmitCopy call.
3244
      //
3245
      // If we exit this loop normally then we need to call EmitLiteral next,
3246
      // though we don't yet know how big the literal will be.  We handle that
3247
      // by proceeding to the next iteration of the main loop.  We also can exit
3248
      // this loop via goto if we get close to exhausting the input.
3249
      EightBytesReference input_bytes;
3250
      uint32 candidate_bytes = 0;
3251
3252
      do {
3253
        // We have a 4-byte match at ip, and no need to emit any
3254
        // "literal bytes" prior to ip.
3255
        const char* base = ip;
3256
        std::pair<size_t, bool> p =
3257
            FindMatchLength(candidate + 4, ip + 4, ip_end);
3258
        size_t matched = 4 + p.first;
3259
        ip += matched;
3260
        size_t offset = base - candidate;
3261
        assert(0 == memcmp(base, candidate, matched));
3262
        if (p.second) {
3263
          op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
3264
        } else {
3265
          op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
3266
        }
3267
        next_emit = ip;
3268
        if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
3269
          goto emit_remainder;
3270
        }
3271
        // We are now looking for a 4-byte match again.  We read
3272
        // table[Hash(ip, shift)] for that.  To improve compression,
3273
        // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
3274
        input_bytes = GetEightBytesAt(ip - 1);
3275
        uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
3276
        table[prev_hash] = ip - base_ip - 1;
3277
        uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
3278
        candidate = base_ip + table[cur_hash];
3279
        candidate_bytes = UNALIGNED_LOAD32(candidate);
3280
        table[cur_hash] = ip - base_ip;
3281
      } while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes);
3282
3283
      next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift);
3284
      ++ip;
3285
    }
3286
  }
3287
3288
 emit_remainder:
3289
  // Emit the remaining bytes as a literal
3290
  if (next_emit < ip_end) {
3291
    op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
3292
                                                ip_end - next_emit);
3293
  }
3294
3295
  return op;
3296
}
3297
}  // end namespace internal
3298
3299
// Called back at every compression call to trace parameters and sizes.
3300
static inline void Report(const char *algorithm, size_t compressed_size,
3301
                          size_t uncompressed_size) {}
3302
3303
// Signature of output types needed by decompression code.
3304
// The decompression code is templatized on a type that obeys this
3305
// signature so that we do not pay virtual function call overhead in
3306
// the middle of a tight decompression loop.
3307
//
3308
// class DecompressionWriter {
3309
//  public:
3310
//   // Called before decompression
3311
//   void SetExpectedLength(size_t length);
3312
//
3313
//   // Called after decompression
3314
//   bool CheckLength() const;
3315
//
3316
//   // Called repeatedly during decompression
3317
//   bool Append(const char* ip, size_t length);
3318
//   bool AppendFromSelf(uint32 offset, size_t length);
3319
//
3320
//   // The rules for how TryFastAppend differs from Append are somewhat
3321
//   // convoluted:
3322
//   //
3323
//   //  - TryFastAppend is allowed to decline (return false) at any
3324
//   //    time, for any reason -- just "return false" would be
3325
//   //    a perfectly legal implementation of TryFastAppend.
3326
//   //    The intention is for TryFastAppend to allow a fast path
3327
//   //    in the common case of a small append.
3328
//   //  - TryFastAppend is allowed to read up to <available> bytes
3329
//   //    from the input buffer, whereas Append is allowed to read
3330
//   //    <length>. However, if it returns true, it must leave
3331
//   //    at least five (kMaximumTagLength) bytes in the input buffer
3332
//   //    afterwards, so that there is always enough space to read the
3333
//   //    next tag without checking for a refill.
3334
//   //  - TryFastAppend must always decline (return false)
3335
//   //    if <length> is 61 or more, as in this case the literal length is not
3336
//   //    decoded fully. In practice, this should not be a big problem,
3337
//   //    as it is unlikely that one would implement a fast path accepting
3338
//   //    this much data.
3339
//   //
3340
//   bool TryFastAppend(const char* ip, size_t available, size_t length);
3341
// };
3342
3343
static inline uint32 ExtractLowBytes(uint32 v, int n) {
3344
  assert(n >= 0);
3345
  assert(n <= 4);
3346
#if SNAPPY_HAVE_BMI2
3347
  return _bzhi_u32(v, 8 * n);
3348
#else
3349
  // This needs to be wider than uint32 otherwise `mask << 32` will be
3350
  // undefined.
3351
  uint64 mask = 0xffffffff;
3352
  return v & ~(mask << (8 * n));
3353
#endif
3354
}
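A quick standalone illustration of the portable branch above, keeping only the low n bytes of a 32-bit value; 0xAABBCCDD is just a sample input.

#include <cstdint>
#include <cstdio>

static uint32_t ExtractLowBytesSketch(uint32_t v, int n) {
  uint64_t mask = 0xffffffff;            // wider than 32 bits so mask << 32 is defined
  return static_cast<uint32_t>(v & ~(mask << (8 * n)));
}

int main() {
  std::printf("%08x\n", ExtractLowBytesSketch(0xAABBCCDDu, 2));  // 0000ccdd
  std::printf("%08x\n", ExtractLowBytesSketch(0xAABBCCDDu, 4));  // aabbccdd
  return 0;
}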
3355
3356
static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
3357
  assert(shift < 32);
3358
  static const uint8 masks[] = {
3359
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
3360
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
3361
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
3362
      0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
3363
  return (value & masks[shift]) != 0;
3364
}
3365
3366
// Helper class for decompression
3367
class SnappyDecompressor {
3368
 private:
3369
  Source*       reader_;         // Underlying source of bytes to decompress
3370
  const char*   ip_;             // Points to next buffered byte
3371
  const char*   ip_limit_;       // Points just past buffered bytes
3372
  uint32        peeked_;         // Bytes peeked from reader (need to skip)
3373
  bool          eof_;            // Hit end of input without an error?
3374
  char          scratch_[kMaximumTagLength];  // See RefillTag().
3375
3376
  // Ensure that all of the tag metadata for the next tag is available
3377
  // in [ip_..ip_limit_-1].  Also ensures that [ip,ip+4] is readable even
3378
  // if (ip_limit_ - ip_ < 5).
3379
  //
3380
  // Returns true on success, false on error or end of input.
3381
  bool RefillTag();
3382
3383
 public:
3384
  explicit SnappyDecompressor(Source* reader)
3385
      : reader_(reader),
3386
        ip_(NULL),
3387
        ip_limit_(NULL),
3388
        peeked_(0),
3389
        eof_(false) {
3390
  }
3391
3392
  ~SnappyDecompressor() {
3393
    // Advance past any bytes we peeked at from the reader
3394
    reader_->Skip(peeked_);
3395
  }
3396
3397
  // Returns true iff we have hit the end of the input without an error.
3398
  bool eof() const {
3399
    return eof_;
3400
  }
3401
3402
  // Read the uncompressed length stored at the start of the compressed data.
3403
  // On success, stores the length in *result and returns true.
3404
  // On failure, returns false.
3405
  bool ReadUncompressedLength(uint32* result) {
3406
    assert(ip_ == NULL);       // Must not have read anything yet
3407
    // Length is encoded in 1..5 bytes
3408
    *result = 0;
3409
    uint32 shift = 0;
3410
    while (true) {
3411
      if (shift >= 32) return false;
3412
      size_t n;
3413
      const char* ip = reader_->Peek(&n);
3414
      if (n == 0) return false;
3415
      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
3416
      reader_->Skip(1);
3417
      uint32 val = c & 0x7f;
3418
      if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
3419
      *result |= val << shift;
3420
      if (c < 128) {
3421
        break;
3422
      }
3423
      shift += 7;
3424
    }
3425
    return true;
3426
  }
3427
3428
  // Process the next item found in the input.
3429
  // Returns true if successful, false on error or end of input.
3430
  template <class Writer>
3431
#if defined(__GNUC__) && defined(__x86_64__)
3432
  __attribute__((aligned(32)))
3433
#endif
3434
  void DecompressAllTags(Writer* writer) {
3435
    // In x86, pad the function body to start 16 bytes later. This function has
3436
    // a couple of hotspots that are highly sensitive to alignment: we have
3437
    // observed regressions by more than 20% in some metrics just by moving the
3438
    // exact same code to a different position in the benchmark binary.
3439
    //
3440
    // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
3441
    // the "lucky" case consistently. Unfortunately, this is a very brittle
3442
    // workaround, and future differences in code generation may reintroduce
3443
    // this regression. If you experience a big, difficult to explain, benchmark
3444
    // performance regression here, first try removing this hack.
3445
#if defined(__GNUC__) && defined(__x86_64__)
3446
    // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
3447
    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
3448
    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
3449
#endif
3450
3451
    const char* ip = ip_;
3452
    // We could have put this refill fragment only at the beginning of the loop.
3453
    // However, duplicating it at the end of each branch gives the compiler more
3454
    // scope to optimize the <ip_limit_ - ip> expression based on the local
3455
    // context, which overall increases speed.
3456
    #define MAYBE_REFILL() \
3457
        if (ip_limit_ - ip < kMaximumTagLength) { \
3458
          ip_ = ip; \
3459
          if (!RefillTag()) return; \
3460
          ip = ip_; \
3461
        }
3462
3463
    MAYBE_REFILL();
3464
    for ( ;; ) {
3465
      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
3466
3467
      // Ratio of iterations that have LITERAL vs non-LITERAL for different
3468
      // inputs.
3469
      //
3470
      // input          LITERAL  NON_LITERAL
3471
      // -----------------------------------
3472
      // html|html4|cp   23%        77%
3473
      // urls            36%        64%
3474
      // jpg             47%        53%
3475
      // pdf             19%        81%
3476
      // txt[1-4]        25%        75%
3477
      // pb              24%        76%
3478
      // bin             24%        76%
3479
      if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
3480
        size_t literal_length = (c >> 2) + 1u;
3481
        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
3482
          assert(literal_length < 61);
3483
          ip += literal_length;
3484
          // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
3485
          // will not return true unless there's already at least five spare
3486
          // bytes in addition to the literal.
3487
          continue;
3488
        }
3489
        if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
3490
          // Long literal.
3491
          const size_t literal_length_length = literal_length - 60;
3492
          literal_length =
3493
              ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
3494
              1;
3495
          ip += literal_length_length;
3496
        }
3497
3498
        size_t avail = ip_limit_ - ip;
3499
        while (avail < literal_length) {
3500
          if (!writer->Append(ip, avail)) return;
3501
          literal_length -= avail;
3502
          reader_->Skip(peeked_);
3503
          size_t n;
3504
          ip = reader_->Peek(&n);
3505
          avail = n;
3506
          peeked_ = avail;
3507
          if (avail == 0) return;  // Premature end of input
3508
          ip_limit_ = ip + avail;
3509
        }
3510
        if (!writer->Append(ip, literal_length)) {
3511
          return;
3512
        }
3513
        ip += literal_length;
3514
        MAYBE_REFILL();
3515
      } else {
3516
        const size_t entry = char_table[c];
3517
        const size_t trailer =
3518
            ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
3519
        const size_t length = entry & 0xff;
3520
        ip += entry >> 11;
3521
3522
        // copy_offset/256 is encoded in bits 8..10.  By just fetching
3523
        // those bits, we get copy_offset (since the bit-field starts at
3524
        // bit 8).
3525
        const size_t copy_offset = entry & 0x700;
3526
        if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
3527
          return;
3528
        }
3529
        MAYBE_REFILL();
3530
      }
3531
    }
3532
3533
#undef MAYBE_REFILL
3534
  }
3535
};
3536
3537
bool SnappyDecompressor::RefillTag() {
3538
  const char* ip = ip_;
3539
  if (ip == ip_limit_) {
3540
    // Fetch a new fragment from the reader
3541
    reader_->Skip(peeked_);   // All peeked bytes are used up
3542
    size_t n;
3543
    ip = reader_->Peek(&n);
3544
    peeked_ = n;
3545
    eof_ = (n == 0);
3546
    if (eof_) return false;
3547
    ip_limit_ = ip + n;
3548
  }
3549
3550
  // Read the tag character
3551
  assert(ip < ip_limit_);
3552
  const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
3553
  const uint32 entry = char_table[c];
3554
  const uint32 needed = (entry >> 11) + 1;  // +1 byte for 'c'
3555
  assert(needed <= sizeof(scratch_));
3556
3557
  // Read more bytes from reader if needed
3558
  uint32 nbuf = ip_limit_ - ip;
3559
  if (nbuf < needed) {
3560
    // Stitch together bytes from ip and reader to form the word
3561
    // contents.  We store the needed bytes in "scratch_".  They
3562
    // will be consumed immediately by the caller since we do not
3563
    // read more than we need.
3564
    memmove(scratch_, ip, nbuf);
3565
    reader_->Skip(peeked_);  // All peeked bytes are used up
3566
    peeked_ = 0;
3567
    while (nbuf < needed) {
3568
      size_t length;
3569
      const char* src = reader_->Peek(&length);
3570
      if (length == 0) return false;
3571
      uint32 to_add = std::min<uint32>(needed - nbuf, length);
3572
      memcpy(scratch_ + nbuf, src, to_add);
3573
      nbuf += to_add;
3574
      reader_->Skip(to_add);
3575
    }
3576
    assert(nbuf == needed);
3577
    ip_ = scratch_;
3578
    ip_limit_ = scratch_ + needed;
3579
  } else if (nbuf < kMaximumTagLength) {
3580
    // Have enough bytes, but move into scratch_ so that we do not
3581
    // read past end of input
3582
    memmove(scratch_, ip, nbuf);
3583
    reader_->Skip(peeked_);  // All peeked bytes are used up
3584
    peeked_ = 0;
3585
    ip_ = scratch_;
3586
    ip_limit_ = scratch_ + nbuf;
3587
  } else {
3588
    // Pass pointer to buffer returned by reader_.
3589
    ip_ = ip;
3590
  }
3591
  return true;
3592
}
3593
3594
template <typename Writer>
3595
static bool InternalUncompress(Source* r, Writer* writer) {
3596
  // Read the uncompressed length from the front of the compressed input
3597
  SnappyDecompressor decompressor(r);
3598
  uint32 uncompressed_len = 0;
3599
  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
3600
3601
  return InternalUncompressAllTags(&decompressor, writer, r->Available(),
3602
                                   uncompressed_len);
3603
}
3604
3605
template <typename Writer>
3606
static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
3607
                                      Writer* writer,
3608
                                      uint32 compressed_len,
3609
                                      uint32 uncompressed_len) {
3610
  Report("snappy_uncompress", compressed_len, uncompressed_len);
3611
3612
  writer->SetExpectedLength(uncompressed_len);
3613
3614
  // Process the entire input
3615
  decompressor->DecompressAllTags(writer);
3616
  writer->Flush();
3617
  return (decompressor->eof() && writer->CheckLength());
3618
}
3619
3620
bool GetUncompressedLength(Source* source, uint32* result) {
3621
  SnappyDecompressor decompressor(source);
3622
  return decompressor.ReadUncompressedLength(result);
3623
}
3624
3625
size_t Compress(Source* reader, Sink* writer) {
3626
  size_t written = 0;
3627
  size_t N = reader->Available();
3628
  const size_t uncompressed_size = N;
3629
  char ulength[Varint::kMax32];
3630
  char* p = Varint::Encode32(ulength, N);
3631
  writer->Append(ulength, p-ulength);
3632
  written += (p - ulength);
3633
3634
  internal::WorkingMemory wmem(N);
3635
3636
  while (N > 0) {
3637
    // Get next block to compress (without copying if possible)
3638
    size_t fragment_size;
3639
    const char* fragment = reader->Peek(&fragment_size);
3640
    assert(fragment_size != 0);  // premature end of input
3641
    const size_t num_to_read = std::min(N, kBlockSize);
3642
    size_t bytes_read = fragment_size;
3643
3644
    size_t pending_advance = 0;
3645
    if (bytes_read >= num_to_read) {
3646
      // Buffer returned by reader is large enough
3647
      pending_advance = num_to_read;
3648
      fragment_size = num_to_read;
3649
    } else {
3650
      char* scratch = wmem.GetScratchInput();
3651
      memcpy(scratch, fragment, bytes_read);
3652
      reader->Skip(bytes_read);
3653
3654
      while (bytes_read < num_to_read) {
3655
        fragment = reader->Peek(&fragment_size);
3656
        size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
3657
        memcpy(scratch + bytes_read, fragment, n);
3658
        bytes_read += n;
3659
        reader->Skip(n);
3660
      }
3661
      assert(bytes_read == num_to_read);
3662
      fragment = scratch;
3663
      fragment_size = num_to_read;
3664
    }
3665
    assert(fragment_size == num_to_read);
3666
3667
    // Get encoding table for compression
3668
    int table_size;
3669
    uint16* table = wmem.GetHashTable(num_to_read, &table_size);
3670
3671
    // Compress input_fragment and append to dest
3672
    const int max_output = MaxCompressedLength(num_to_read);
3673
3674
    // Need a scratch buffer for the output, in case the byte sink doesn't
3675
    // have room for us directly.
3676
3677
    // Since we encode kBlockSize regions followed by a region
3678
    // which is <= kBlockSize in length, a previously allocated
3679
    // scratch_output[] region is big enough for this iteration.
3680
    char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
3681
    char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
3682
                                           table_size);
3683
    writer->Append(dest, end - dest);
3684
    written += (end - dest);
3685
3686
    N -= num_to_read;
3687
    reader->Skip(pending_advance);
3688
  }
3689
3690
  Report("snappy_compress", written, uncompressed_size);
3691
3692
  return written;
3693
}
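A hedged usage sketch of the Source/Sink Compress entry point above, wired up the same way the flat-array compression path does it (ByteArraySource in, UncheckedByteArraySink out); the caller must provide at least MaxCompressedLength(input_length) bytes of output space.

#include "snappy.h"
#include "snappy-sinksource.h"

// Illustrative wrapper (assumed helper, not part of the library).
size_t CompressToBuffer(const char* input, size_t input_length, char* out) {
  duckdb_snappy::ByteArraySource reader(input, input_length);
  duckdb_snappy::UncheckedByteArraySink writer(out);
  return duckdb_snappy::Compress(&reader, &writer);
}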
3694
3695
// -----------------------------------------------------------------------
3696
// IOVec interfaces
3697
// -----------------------------------------------------------------------
3698
3699
// A type that writes to an iovec.
3700
// Note that this is not a "ByteSink", but a type that matches the
3701
// Writer template argument to SnappyDecompressor::DecompressAllTags().
3702
class SnappyIOVecWriter {
3703
 private:
3704
  // output_iov_end_ is set to iov + count and used to determine when
3705
  // the end of the iovs is reached.
3706
  const struct iovec* output_iov_end_;
3707
3708
#if !defined(NDEBUG)
3709
  const struct iovec* output_iov_;
3710
#endif  // !defined(NDEBUG)
3711
3712
  // Current iov that is being written into.
3713
  const struct iovec* curr_iov_;
3714
3715
  // Pointer to current iov's write location.
3716
  char* curr_iov_output_;
3717
3718
  // Remaining bytes to write into curr_iov_output.
3719
  size_t curr_iov_remaining_;
3720
3721
  // Total bytes decompressed into output_iov_ so far.
3722
  size_t total_written_;
3723
3724
  // Maximum number of bytes that will be decompressed into output_iov_.
3725
  size_t output_limit_;
3726
3727
  static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
3728
    return reinterpret_cast<char*>(iov->iov_base) + offset;
3729
  }
3730
3731
 public:
3732
  // Does not take ownership of iov. iov must be valid during the
3733
  // entire lifetime of the SnappyIOVecWriter.
3734
  inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
3735
      : output_iov_end_(iov + iov_count),
3736
#if !defined(NDEBUG)
3737
        output_iov_(iov),
3738
#endif  // !defined(NDEBUG)
3739
        curr_iov_(iov),
3740
        curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
3741
                                   : nullptr),
3742
        curr_iov_remaining_(iov_count ? iov->iov_len : 0),
3743
        total_written_(0),
3744
        output_limit_(-1) {}
3745
3746
  inline void SetExpectedLength(size_t len) {
3747
    output_limit_ = len;
3748
  }
3749
3750
  inline bool CheckLength() const {
3751
    return total_written_ == output_limit_;
3752
  }
3753
3754
  inline bool Append(const char* ip, size_t len) {
3755
    if (total_written_ + len > output_limit_) {
3756
      return false;
3757
    }
3758
3759
    return AppendNoCheck(ip, len);
3760
  }
3761
3762
  inline bool AppendNoCheck(const char* ip, size_t len) {
3763
    while (len > 0) {
3764
      if (curr_iov_remaining_ == 0) {
3765
        // This iovec is full. Go to the next one.
3766
        if (curr_iov_ + 1 >= output_iov_end_) {
3767
          return false;
3768
        }
3769
        ++curr_iov_;
3770
        curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
3771
        curr_iov_remaining_ = curr_iov_->iov_len;
3772
      }
3773
3774
      const size_t to_write = std::min(len, curr_iov_remaining_);
3775
      memcpy(curr_iov_output_, ip, to_write);
3776
      curr_iov_output_ += to_write;
3777
      curr_iov_remaining_ -= to_write;
3778
      total_written_ += to_write;
3779
      ip += to_write;
3780
      len -= to_write;
3781
    }
3782
3783
    return true;
3784
  }
3785
3786
  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
3787
    const size_t space_left = output_limit_ - total_written_;
3788
    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
3789
        curr_iov_remaining_ >= 16) {
3790
      // Fast path, used for the majority (about 95%) of invocations.
3791
      UnalignedCopy128(ip, curr_iov_output_);
3792
      curr_iov_output_ += len;
3793
      curr_iov_remaining_ -= len;
3794
      total_written_ += len;
3795
      return true;
3796
    }
3797
3798
    return false;
3799
  }
3800
3801
  inline bool AppendFromSelf(size_t offset, size_t len) {
3802
    // See SnappyArrayWriter::AppendFromSelf for an explanation of
3803
    // the "offset - 1u" trick.
3804
    if (offset - 1u >= total_written_) {
3805
      return false;
3806
    }
3807
    const size_t space_left = output_limit_ - total_written_;
3808
    if (len > space_left) {
3809
      return false;
3810
    }
3811
3812
    // Locate the iovec from which we need to start the copy.
3813
    const iovec* from_iov = curr_iov_;
3814
    size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
3815
    while (offset > 0) {
3816
      if (from_iov_offset >= offset) {
3817
        from_iov_offset -= offset;
3818
        break;
3819
      }
3820
3821
      offset -= from_iov_offset;
3822
      --from_iov;
3823
#if !defined(NDEBUG)
3824
      assert(from_iov >= output_iov_);
3825
#endif  // !defined(NDEBUG)
3826
      from_iov_offset = from_iov->iov_len;
3827
    }
3828
3829
    // Copy <len> bytes starting from the iovec pointed to by from_iov to
3830
    // the current iovec.
3831
    while (len > 0) {
3832
      assert(from_iov <= curr_iov_);
3833
      if (from_iov != curr_iov_) {
3834
        const size_t to_copy =
3835
            std::min((unsigned long)(from_iov->iov_len - from_iov_offset), (unsigned long)len);
3836
        AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
3837
        len -= to_copy;
3838
        if (len > 0) {
3839
          ++from_iov;
3840
          from_iov_offset = 0;
3841
        }
3842
      } else {
3843
        size_t to_copy = curr_iov_remaining_;
3844
        if (to_copy == 0) {
3845
          // This iovec is full. Go to the next one.
3846
          if (curr_iov_ + 1 >= output_iov_end_) {
3847
            return false;
3848
          }
3849
          ++curr_iov_;
3850
          curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
3851
          curr_iov_remaining_ = curr_iov_->iov_len;
3852
          continue;
3853
        }
3854
        if (to_copy > len) {
3855
          to_copy = len;
3856
        }
3857
3858
        IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
3859
                        curr_iov_output_, curr_iov_output_ + to_copy,
3860
                        curr_iov_output_ + curr_iov_remaining_);
3861
        curr_iov_output_ += to_copy;
3862
        curr_iov_remaining_ -= to_copy;
3863
        from_iov_offset += to_copy;
3864
        total_written_ += to_copy;
3865
        len -= to_copy;
3866
      }
3867
    }
3868
3869
    return true;
3870
  }
3871
3872
  inline void Flush() {}
3873
};
3874
3875
bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
3876
                          const struct iovec* iov, size_t iov_cnt) {
3877
  ByteArraySource reader(compressed, compressed_length);
3878
  return RawUncompressToIOVec(&reader, iov, iov_cnt);
3879
}
3880
3881
bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
3882
                          size_t iov_cnt) {
3883
  SnappyIOVecWriter output(iov, iov_cnt);
3884
  return InternalUncompress(compressed, &output);
3885
}
3886
3887
// -----------------------------------------------------------------------
3888
// Flat array interfaces
3889
// -----------------------------------------------------------------------
3890
3891
// A type that writes to a flat array.
3892
// Note that this is not a "ByteSink", but a type that matches the
3893
// Writer template argument to SnappyDecompressor::DecompressAllTags().
3894
class SnappyArrayWriter {
3895
 private:
3896
  char* base_;
3897
  char* op_;
3898
  char* op_limit_;
3899
3900
 public:
3901
  inline explicit SnappyArrayWriter(char* dst)
3902
      : base_(dst),
3903
        op_(dst),
3904
        op_limit_(dst) {
3905
  }
3906
3907
  inline void SetExpectedLength(size_t len) {
3908
    op_limit_ = op_ + len;
3909
  }
3910
3911
  inline bool CheckLength() const {
3912
    return op_ == op_limit_;
3913
  }
3914
3915
  inline bool Append(const char* ip, size_t len) {
3916
    char* op = op_;
3917
    const size_t space_left = op_limit_ - op;
3918
    if (space_left < len) {
3919
      return false;
3920
    }
3921
    memcpy(op, ip, len);
3922
    op_ = op + len;
3923
    return true;
3924
  }
3925
3926
  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
3927
    char* op = op_;
3928
    const size_t space_left = op_limit_ - op;
3929
    if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
3930
      // Fast path, used for the majority (about 95%) of invocations.
3931
      UnalignedCopy128(ip, op);
3932
      op_ = op + len;
3933
      return true;
3934
    } else {
3935
      return false;
3936
    }
3937
  }
3938
3939
  inline bool AppendFromSelf(size_t offset, size_t len) {
3940
    char* const op_end = op_ + len;
3941
3942
    // Check if we try to append from before the start of the buffer.
3943
    // Normally this would just be a check for "produced < offset",
3944
    // but "produced <= offset - 1u" is equivalent for every case
3945
    // except the one where offset==0, where the right side will wrap around
3946
    // to a very big number. This is convenient, as offset==0 is another
3947
    // invalid case that we also want to catch, so that we do not go
3948
    // into an infinite loop.
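    // Worked example of the wrap-around (illustrative): with Produced() == 10,
    //   offset == 0  -> offset - 1u wraps to SIZE_MAX, 10 <= SIZE_MAX -> reject;
    //   offset == 12 -> 10 <= 11                                      -> reject;
    //   offset == 5  -> 10 <= 4 is false                              -> copy proceeds.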
3949
    if (Produced() <= offset - 1u || op_end > op_limit_) return false;
3950
    op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_);
3951
3952
    return true;
3953
  }
3954
  inline size_t Produced() const {
3955
    assert(op_ >= base_);
3956
    return op_ - base_;
3957
  }
3958
  inline void Flush() {}
3959
};
3960
3961
bool RawUncompress(const char* compressed, size_t n, char* uncompressed) {
3962
  ByteArraySource reader(compressed, n);
3963
  return RawUncompress(&reader, uncompressed);
3964
}
3965
3966
bool RawUncompress(Source* compressed, char* uncompressed) {
3967
  SnappyArrayWriter output(uncompressed);
3968
  return InternalUncompress(compressed, &output);
3969
}
3970
3971
bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
3972
  size_t ulength;
3973
  if (!GetUncompressedLength(compressed, n, &ulength)) {
3974
    return false;
3975
  }
3976
  // On 32-bit builds: max_size() < kuint32max.  Check for that instead
3977
  // of crashing (e.g., consider externally specified compressed data).
3978
  if (ulength > uncompressed->max_size()) {
3979
    return false;
3980
  }
3981
  STLStringResizeUninitialized(uncompressed, ulength);
3982
  return RawUncompress(compressed, n, string_as_array(uncompressed));
3983
}
3984
3985
// A Writer that drops everything on the floor and just does validation
3986
class SnappyDecompressionValidator {
3987
 private:
3988
  size_t expected_;
3989
  size_t produced_;
3990
3991
 public:
3992
  inline SnappyDecompressionValidator() : expected_(0), produced_(0) { }
3993
  inline void SetExpectedLength(size_t len) {
3994
    expected_ = len;
3995
  }
3996
  inline bool CheckLength() const {
3997
    return expected_ == produced_;
3998
  }
3999
  inline bool Append(const char* ip, size_t len) {
4000
    produced_ += len;
4001
    return produced_ <= expected_;
4002
  }
4003
  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
4004
    return false;
4005
  }
4006
  inline bool AppendFromSelf(size_t offset, size_t len) {
4007
    // See SnappyArrayWriter::AppendFromSelf for an explanation of
4008
    // the "offset - 1u" trick.
4009
    if (produced_ <= offset - 1u) return false;
4010
    produced_ += len;
4011
    return produced_ <= expected_;
4012
  }
4013
  inline void Flush() {}
4014
};
4015
4016
bool IsValidCompressedBuffer(const char* compressed, size_t n) {
4017
  ByteArraySource reader(compressed, n);
4018
  SnappyDecompressionValidator writer;
4019
  return InternalUncompress(&reader, &writer);
4020
}
4021
4022
bool IsValidCompressed(Source* compressed) {
4023
  SnappyDecompressionValidator writer;
4024
  return InternalUncompress(compressed, &writer);
4025
}
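// Usage sketch (illustrative only; `data`, `n` and `output` are hypothetical
// names): IsValidCompressedBuffer() walks the stream without producing any
// output, so it can serve as a cheap sanity check before decompressing.
#if 0  // example only, not compiled
static bool ExampleValidateThenUncompress(const char* data, size_t n,
                                          string* output) {
  if (!IsValidCompressedBuffer(data, n)) {
    return false;  // reject corrupt input before touching `output`
  }
  return Uncompress(data, n, output);
}
#endif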
4026
4027
void RawCompress(const char* input,
4028
                 size_t input_length,
4029
                 char* compressed,
4030
                 size_t* compressed_length) {
4031
  ByteArraySource reader(input, input_length);
4032
  UncheckedByteArraySink writer(compressed);
4033
  Compress(&reader, &writer);
4034
4035
  // Compute how many bytes were added
4036
  *compressed_length = (writer.CurrentDestination() - compressed);
4037
}
4038
4039
size_t Compress(const char* input, size_t input_length, string* compressed) {
4040
  // Pre-grow the buffer to the max length of the compressed output
4041
  STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
4042
4043
  size_t compressed_length;
4044
  RawCompress(input, input_length, string_as_array(compressed),
4045
              &compressed_length);
4046
  compressed->resize(compressed_length);
4047
  return compressed_length;
4048
}
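// Round-trip sketch for the raw interfaces above (illustrative only; the
// vector-backed buffers and names are hypothetical, and a non-empty input is
// assumed for brevity). MaxCompressedLength() bounds the scratch space that
// RawCompress() may need.
#if 0  // example only, not compiled
static bool ExampleRawRoundTrip(const char* input, size_t input_length) {
  std::vector<char> buffer(MaxCompressedLength(input_length));
  size_t compressed_length = 0;
  RawCompress(input, input_length, buffer.data(), &compressed_length);

  std::vector<char> restored(input_length);
  return RawUncompress(buffer.data(), compressed_length, restored.data()) &&
         memcmp(input, restored.data(), input_length) == 0;
}
#endif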
4049
4050
// -----------------------------------------------------------------------
4051
// Sink interface
4052
// -----------------------------------------------------------------------
4053
4054
// A type that decompresses into a Sink. The template parameter
4055
// Allocator must export one method "char* Allocate(int size);", which
4056
// allocates a buffer of "size" and appends that to the destination.
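// Minimal sketch of a type satisfying this Allocator contract (illustrative
// only; SnappySinkAllocator below is the implementation actually used here).
// Note that the writer also calls Flush(size) once decompression finishes.
#if 0  // example only, not compiled
class ExampleHeapAllocator {
 public:
  char* Allocate(int size) {
    blocks_.push_back(new char[size]);
    return blocks_.back();
  }
  // A real allocator would hand the blocks to their final destination here;
  // this sketch simply releases them.
  void Flush(size_t /*size*/) {
    for (size_t i = 0; i < blocks_.size(); ++i) delete[] blocks_[i];
    blocks_.clear();
  }

 private:
  std::vector<char*> blocks_;
};
#endif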
4057
template <typename Allocator>
4058
class SnappyScatteredWriter {
4059
  Allocator allocator_;
4060
4061
  // We need random access into the data generated so far.  Therefore
4062
  // we keep track of all of the generated data as an array of blocks.
4063
  // All of the blocks except the last have length kBlockSize.
4064
  std::vector<char*> blocks_;
4065
  size_t expected_;
4066
4067
  // Total size of all fully generated blocks so far
4068
  size_t full_size_;
4069
4070
  // Pointer into current output block
4071
  char* op_base_;       // Base of output block
4072
  char* op_ptr_;        // Pointer to next unfilled byte in block
4073
  char* op_limit_;      // Pointer just past block
4074
4075
  inline size_t Size() const {
4076
    return full_size_ + (op_ptr_ - op_base_);
4077
  }
4078
4079
  bool SlowAppend(const char* ip, size_t len);
4080
  bool SlowAppendFromSelf(size_t offset, size_t len);
4081
4082
 public:
4083
  inline explicit SnappyScatteredWriter(const Allocator& allocator)
4084
      : allocator_(allocator),
4085
        full_size_(0),
4086
        op_base_(NULL),
4087
        op_ptr_(NULL),
4088
        op_limit_(NULL) {
4089
  }
4090
4091
  inline void SetExpectedLength(size_t len) {
4092
    assert(blocks_.empty());
4093
    expected_ = len;
4094
  }
4095
4096
  inline bool CheckLength() const {
4097
    return Size() == expected_;
4098
  }
4099
4100
  // Return the number of bytes actually uncompressed so far
4101
  inline size_t Produced() const {
4102
    return Size();
4103
  }
4104
4105
  inline bool Append(const char* ip, size_t len) {
4106
    size_t avail = op_limit_ - op_ptr_;
4107
    if (len <= avail) {
4108
      // Fast path
4109
      memcpy(op_ptr_, ip, len);
4110
      op_ptr_ += len;
4111
      return true;
4112
    } else {
4113
      return SlowAppend(ip, len);
4114
    }
4115
  }
4116
4117
  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
4118
    char* op = op_ptr_;
4119
    const size_t space_left = op_limit_ - op;
4120
    if (length <= 16 && available >= 16 + kMaximumTagLength &&
4121
        space_left >= 16) {
4122
      // Fast path, used for the majority (about 95%) of invocations.
4123
      UnalignedCopy128(ip, op);
4124
      op_ptr_ = op + length;
4125
      return true;
4126
    } else {
4127
      return false;
4128
    }
4129
  }
4130
4131
  inline bool AppendFromSelf(size_t offset, size_t len) {
4132
    char* const op_end = op_ptr_ + len;
4133
    // See SnappyArrayWriter::AppendFromSelf for an explanation of
4134
    // the "offset - 1u" trick.
4135
    if (SNAPPY_PREDICT_TRUE(offset - 1u < (size_t)(op_ptr_ - op_base_) &&
4136
                          op_end <= op_limit_)) {
4137
      // Fast path: src and dst in current block.
4138
      op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
4139
      return true;
4140
    }
4141
    return SlowAppendFromSelf(offset, len);
4142
  }
4143
4144
  // Called at the end of decompression. We ask the allocator to
4145
  // write all blocks to the sink.
4146
  inline void Flush() { allocator_.Flush(Produced()); }
4147
};
4148
4149
template<typename Allocator>
4150
bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
4151
  size_t avail = op_limit_ - op_ptr_;
4152
  while (len > avail) {
4153
    // Completely fill this block
4154
    memcpy(op_ptr_, ip, avail);
4155
    op_ptr_ += avail;
4156
    assert(op_limit_ - op_ptr_ == 0);
4157
    full_size_ += (op_ptr_ - op_base_);
4158
    len -= avail;
4159
    ip += avail;
4160
4161
    // Bounds check
4162
    if (full_size_ + len > expected_) {
4163
      return false;
4164
    }
4165
4166
    // Make new block
4167
    size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);
4168
    op_base_ = allocator_.Allocate(bsize);
4169
    op_ptr_ = op_base_;
4170
    op_limit_ = op_base_ + bsize;
4171
    blocks_.push_back(op_base_);
4172
    avail = bsize;
4173
  }
4174
4175
  memcpy(op_ptr_, ip, len);
4176
  op_ptr_ += len;
4177
  return true;
4178
}
4179
4180
template<typename Allocator>
4181
bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
4182
                                                         size_t len) {
4183
  // Overflow check
4184
  // See SnappyArrayWriter::AppendFromSelf for an explanation of
4185
  // the "offset - 1u" trick.
4186
  const size_t cur = Size();
4187
  if (offset - 1u >= cur) return false;
4188
  if (expected_ - cur < len) return false;
4189
4190
  // Currently we shouldn't ever hit this path because Compress() chops the
4191
  // input into blocks and does not create cross-block copies. However, it is
4192
  // nice if we do not rely on that, since we can get better compression if we
4193
  // allow cross-block copies and thus might want to change the compressor in
4194
  // the future.
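  // Random access across blocks: every block except the last has length
  // kBlockSize (a power of two), so output byte `src` lives in
  // blocks_[src >> kBlockLog] at offset (src & (kBlockSize - 1)).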
4195
  size_t src = cur - offset;
4196
  while (len-- > 0) {
4197
    char c = blocks_[src >> kBlockLog][src & (kBlockSize-1)];
4198
    Append(&c, 1);
4199
    src++;
4200
  }
4201
  return true;
4202
}
4203
4204
class SnappySinkAllocator {
4205
 public:
4206
  explicit SnappySinkAllocator(Sink* dest): dest_(dest) {}
4207
  ~SnappySinkAllocator() {}
4208
4209
  char* Allocate(int size) {
4210
    Datablock block(new char[size], size);
4211
    blocks_.push_back(block);
4212
    return block.data;
4213
  }
4214
4215
  // We flush only at the end, because the writer wants
4216
  // random access to the blocks and once we hand the
4217
  // block over to the sink, we can't access it anymore.
4218
  // Also, we don't write more than what has actually been written
4219
  // to the blocks.
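  // Each block is handed off via AppendAndTakeOwnership() together with
  // Deleter below, so the sink becomes responsible for freeing it; Flush()
  // only clears blocks_ and never deletes the data itself.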
4220
  void Flush(size_t size) {
4221
    size_t size_written = 0;
4222
    size_t block_size;
4223
    for (size_t i = 0; i < blocks_.size(); ++i) {
4224
      block_size = std::min<size_t>(blocks_[i].size, size - size_written);
4225
      dest_->AppendAndTakeOwnership(blocks_[i].data, block_size,
4226
                                    &SnappySinkAllocator::Deleter, NULL);
4227
      size_written += block_size;
4228
    }
4229
    blocks_.clear();
4230
  }
4231
4232
 private:
4233
  struct Datablock {
4234
    char* data;
4235
    size_t size;
4236
    Datablock(char* p, size_t s) : data(p), size(s) {}
4237
  };
4238
4239
  static void Deleter(void* arg, const char* bytes, size_t size) {
4240
    delete[] bytes;
4241
  }
4242
4243
  Sink* dest_;
4244
  std::vector<Datablock> blocks_;
4245
4246
  // Note: copying this object is allowed
4247
};
4248
4249
size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) {
4250
  SnappySinkAllocator allocator(uncompressed);
4251
  SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
4252
  InternalUncompress(compressed, &writer);
4253
  return writer.Produced();
4254
}
4255
4256
bool Uncompress(Source* compressed, Sink* uncompressed) {
4257
  // Read the uncompressed length from the front of the compressed input
4258
  SnappyDecompressor decompressor(compressed);
4259
  uint32 uncompressed_len = 0;
4260
  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) {
4261
    return false;
4262
  }
4263
4264
  char c;
4265
  size_t allocated_size;
4266
  char* buf = uncompressed->GetAppendBufferVariable(
4267
      1, uncompressed_len, &c, 1, &allocated_size);
4268
4269
  const size_t compressed_len = compressed->Available();
4270
  // If we can get a flat buffer, then use it; otherwise, do block-by-block
4271
  // uncompression.
4272
  if (allocated_size >= uncompressed_len) {
4273
    SnappyArrayWriter writer(buf);
4274
    bool result = InternalUncompressAllTags(&decompressor, &writer,
4275
                                            compressed_len, uncompressed_len);
4276
    uncompressed->Append(buf, writer.Produced());
4277
    return result;
4278
  } else {
4279
    SnappySinkAllocator allocator(uncompressed);
4280
    SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
4281
    return InternalUncompressAllTags(&decompressor, &writer, compressed_len,
4282
                                     uncompressed_len);
4283
  }
4284
}
4285
4286
}  // namespace duckdb_snappy
4287
4288
#endif  // SNAPPY_NEW_VERSION