/work/snappy_ep-prefix/src/snappy_ep/snappy.cc
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // Copyright 2005 Google Inc. All Rights Reserved.  | 
2  |  | //  | 
3  |  | // Redistribution and use in source and binary forms, with or without  | 
4  |  | // modification, are permitted provided that the following conditions are  | 
5  |  | // met:  | 
6  |  | //  | 
7  |  | //     * Redistributions of source code must retain the above copyright  | 
8  |  | // notice, this list of conditions and the following disclaimer.  | 
9  |  | //     * Redistributions in binary form must reproduce the above  | 
10  |  | // copyright notice, this list of conditions and the following disclaimer  | 
11  |  | // in the documentation and/or other materials provided with the  | 
12  |  | // distribution.  | 
13  |  | //     * Neither the name of Google Inc. nor the names of its  | 
14  |  | // contributors may be used to endorse or promote products derived from  | 
15  |  | // this software without specific prior written permission.  | 
16  |  | //  | 
17  |  | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS  | 
18  |  | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT  | 
19  |  | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR  | 
20  |  | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT  | 
21  |  | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,  | 
22  |  | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT  | 
23  |  | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  | 
24  |  | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  | 
25  |  | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  | 
26  |  | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  | 
27  |  | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
28  |  |  | 
29  |  | #include "snappy-internal.h"  | 
30  |  | #include "snappy-sinksource.h"  | 
31  |  | #include "snappy.h"  | 
32  |  | #if !defined(SNAPPY_HAVE_BMI2)  | 
33  |  | // __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2  | 
34  |  | // specifically, but it does define __AVX2__ when AVX2 support is available.  | 
35  |  | // Fortunately, AVX2 was introduced in Haswell, just like BMI2.  | 
36  |  | //  | 
37  |  | // BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,  | 
38  |  | // GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which  | 
39  |  | // case issuing BMI2 instructions results in a compiler error.  | 
40  |  | #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))  | 
41  |  | #define SNAPPY_HAVE_BMI2 1  | 
42  |  | #else  | 
43  |  | #define SNAPPY_HAVE_BMI2 0  | 
44  |  | #endif  | 
45  |  | #endif  // !defined(SNAPPY_HAVE_BMI2)  | 
46  |  |  | 
47  |  | #if !defined(SNAPPY_HAVE_X86_CRC32)  | 
48  |  | #if defined(__SSE4_2__)  | 
49  |  | #define SNAPPY_HAVE_X86_CRC32 1  | 
50  |  | #else  | 
51  |  | #define SNAPPY_HAVE_X86_CRC32 0  | 
52  |  | #endif  | 
53  |  | #endif  // !defined(SNAPPY_HAVE_X86_CRC32)  | 
54  |  |  | 
55  |  | #if !defined(SNAPPY_HAVE_NEON_CRC32)  | 
56  |  | #if SNAPPY_HAVE_NEON && defined(__ARM_FEATURE_CRC32)  | 
57  |  | #define SNAPPY_HAVE_NEON_CRC32 1  | 
58  |  | #else  | 
59  |  | #define SNAPPY_HAVE_NEON_CRC32 0  | 
60  |  | #endif  | 
61  |  | #endif  // !defined(SNAPPY_HAVE_NEON_CRC32)  | 
62  |  |  | 
63  |  | #if SNAPPY_HAVE_BMI2 || SNAPPY_HAVE_X86_CRC32  | 
64  |  | // Please do not replace with <x86intrin.h> or with headers that assume more  | 
65  |  | // advanced SSE versions without checking with all the OWNERS.  | 
66  |  | #include <immintrin.h>  | 
67  |  | #elif SNAPPY_HAVE_NEON_CRC32  | 
68  |  | #include <arm_acle.h>  | 
69  |  | #endif  | 
70  |  |  | 
71  |  | #include <algorithm>  | 
72  |  | #include <array>  | 
73  |  | #include <cstddef>  | 
74  |  | #include <cstdint>  | 
75  |  | #include <cstdio>  | 
76  |  | #include <cstring>  | 
77  |  | #include <functional>  | 
78  |  | #include <memory>  | 
79  |  | #include <string>  | 
80  |  | #include <utility>  | 
81  |  | #include <vector>  | 
82  |  |  | 
83  |  | namespace snappy { | 
84  |  |  | 
85  |  | namespace { | 
86  |  |  | 
87  |  | // The number of slop bytes writers use for unconditional copies.  | 
88  |  | constexpr int kSlopBytes = 64;  | 
89  |  |  | 
90  |  | using internal::char_table;  | 
91  |  | using internal::COPY_1_BYTE_OFFSET;  | 
92  |  | using internal::COPY_2_BYTE_OFFSET;  | 
93  |  | using internal::COPY_4_BYTE_OFFSET;  | 
94  |  | using internal::kMaximumTagLength;  | 
95  |  | using internal::LITERAL;  | 
96  |  | #if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE  | 
97  |  | using internal::V128;  | 
98  |  | using internal::V128_Load;  | 
99  |  | using internal::V128_LoadU;  | 
100  |  | using internal::V128_Shuffle;  | 
101  |  | using internal::V128_StoreU;  | 
102  |  | using internal::V128_DupChar;  | 
103  |  | #endif  | 
104  |  |  | 
105  |  | // We translate the information encoded in a tag through a lookup table to a  | 
106  |  | // format that requires fewer instructions to decode. Effectively we store  | 
107  |  | // the length minus the tag part of the offset. The least significant byte  | 
108  |  | // thus stores the length, while (length - offset) is given by  | 
109  |  | // entry - ExtractOffset(type). The nice thing is that the subtraction  | 
110  |  | // immediately sets the flags for the necessary check that offset >= length.  | 
111  |  | // This folds the cmp with the sub. We engineer the long literals and copy-4  | 
112  |  | // tags to always fail this check, so their presence doesn't affect the fast  | 
113  |  | // path. To prevent literals from triggering the guard against offset < length  | 
114  |  | // (offset does not apply to literals), the table gives them a spurious offset  | 
115  |  | // of 256.  | 
116  | 0  | inline constexpr int16_t MakeEntry(int16_t len, int16_t offset) { | 
117  | 0  |   return len - (offset << 8);  | 
118  | 0  | }  | 
119  |  |  | 
120  | 0  | inline constexpr int16_t LengthMinusOffset(int data, int type) { | 
121  | 0  |   return type == 3   ? 0xFF                    // copy-4 (or type == 3)  | 
122  | 0  |          : type == 2 ? MakeEntry(data + 1, 0)  // copy-2  | 
123  | 0  |          : type == 1 ? MakeEntry((data & 7) + 4, data >> 3)  // copy-1  | 
124  | 0  |          : data < 60 ? MakeEntry(data + 1, 1)  // note spurious offset.  | 
125  | 0  |                      : 0xFF;                   // long literal  | 
126  | 0  | }  | 
127  |  |  | 
128  | 0  | inline constexpr int16_t LengthMinusOffset(uint8_t tag) { | 
129  | 0  |   return LengthMinusOffset(tag >> 2, tag & 3);  | 
130  | 0  | }  | 
131  |  |  | 
132  |  | template <size_t... Ints>  | 
133  |  | struct index_sequence {}; | 
134  |  |  | 
135  |  | template <std::size_t N, size_t... Is>  | 
136  |  | struct make_index_sequence : make_index_sequence<N - 1, N - 1, Is...> {}; | 
137  |  |  | 
138  |  | template <size_t... Is>  | 
139  |  | struct make_index_sequence<0, Is...> : index_sequence<Is...> {}; | 
140  |  |  | 
141  |  | template <size_t... seq>  | 
142  | 0  | constexpr std::array<int16_t, 256> MakeTable(index_sequence<seq...>) { | 
143  | 0  |   return std::array<int16_t, 256>{LengthMinusOffset(seq)...}; | 
144  | 0  | }  | 
145  |  |  | 
146  |  | alignas(64) const std::array<int16_t, 256> kLengthMinusOffset =  | 
147  |  |     MakeTable(make_index_sequence<256>{}); | 
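As a quick illustration of the table above, here is a standalone sketch (names local to this example, not part of the snappy build) that decodes one copy-1 tag byte through the same MakeEntry arithmetic: the low byte of the entry carries the length, and the high byte carries the negated tag part of the offset.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Same arithmetic as MakeEntry/LengthMinusOffset above.
    constexpr int16_t MakeEntry(int16_t len, int16_t offset) {
      return len - (offset << 8);
    }

    int main() {
      // Copy-1 tag layout: [offset>>8 : 3 bits][len-4 : 3 bits][type=01 : 2 bits].
      // Tag 0xA9 = 0b101'010'01: len = 2 + 4 = 6, high offset bits = 5.
      const uint8_t tag = 0xA9;
      const int data = tag >> 2, type = tag & 3;
      assert(type == 1);  // copy with 1-byte offset
      const int16_t entry = MakeEntry((data & 7) + 4, data >> 3);
      const int len = entry & 0xFF;                // low byte: length
      const int offset_high = (len - entry) >> 8;  // high byte: tag part of offset
      std::printf("len=%d offset_high_bits=%d\n", len, offset_high);  // 6 and 5
    }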
148  |  |  | 
149  |  | // Given a table of uint16_t whose size is mask / 2 + 1, return a pointer to the  | 
150  |  | // relevant entry, if any, for the given bytes.  Any hash function will do,  | 
151  |  | // but a good hash function reduces the number of collisions and thus yields  | 
152  |  | // better compression for compressible input.  | 
153  |  | //  | 
154  |  | // REQUIRES: mask is 2 * (table_size - 1), and table_size is a power of two.  | 
155  | 0  | inline uint16_t* TableEntry(uint16_t* table, uint32_t bytes, uint32_t mask) { | 
156  |  |   // Our choice is quicker-and-dirtier than the typical hash function;  | 
157  |  |   // empirically, that seems beneficial.  The upper bits of kMagic * bytes are a  | 
158  |  |   // higher-quality hash than the lower bits, so when using kMagic * bytes we  | 
159  |  |   // also shift right to get a higher-quality end result.  There's no similar  | 
160  |  |   // issue with a CRC because all of the output bits of a CRC are equally good  | 
161  |  |   // "hashes." So, a CPU instruction for CRC, if available, tends to be a good  | 
162  |  |   // choice.  | 
163  |  | #if SNAPPY_HAVE_NEON_CRC32  | 
164  |  |   // We use mask as the second arg to the CRC function, as it's about to  | 
165  |  |   // be used anyway; it'd be equally correct to use 0 or some constant.  | 
166  |  |   // Mathematically, _mm_crc32_u32 (or similar) is a function of the  | 
167  |  |   // xor of its arguments.  | 
168  |  |   const uint32_t hash = __crc32cw(bytes, mask);  | 
169  |  | #elif SNAPPY_HAVE_X86_CRC32  | 
170  |  |   const uint32_t hash = _mm_crc32_u32(bytes, mask);  | 
171  |  | #else  | 
172  | 0  |   constexpr uint32_t kMagic = 0x1e35a7bd;  | 
173  | 0  |   const uint32_t hash = (kMagic * bytes) >> (31 - kMaxHashTableBits);  | 
174  | 0  | #endif  | 
175  | 0  |   return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +  | 
176  | 0  |                                      (hash & mask));  | 
177  | 0  | }  | 
178  |  |  | 
179  |  | inline uint16_t* TableEntry4ByteMatch(uint16_t* table, uint32_t bytes,  | 
180  | 0  |                                       uint32_t mask) { | 
181  | 0  |   constexpr uint32_t kMagic = 2654435761U;  | 
182  | 0  |   const uint32_t hash = (kMagic * bytes) >> (32 - kMaxHashTableBits);  | 
183  | 0  |   return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +  | 
184  | 0  |                                      (hash & mask));  | 
185  | 0  | }  | 
186  |  |  | 
187  |  | inline uint16_t* TableEntry8ByteMatch(uint16_t* table, uint64_t bytes,  | 
188  | 0  |                                       uint32_t mask) { | 
189  | 0  |   constexpr uint64_t kMagic = 58295818150454627ULL;  | 
190  | 0  |   const uint32_t hash = (kMagic * bytes) >> (64 - kMaxHashTableBits);  | 
191  | 0  |   return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +  | 
192  | 0  |                                      (hash & mask));  | 
193  | 0  | }  | 
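For reference, a standalone sketch of the portable (non-CRC) hash path above. kMaxHashTableBits = 14 is an assumption for this sketch, matching typical snappy builds; the multiply-and-shift keeps the better-mixed high bits, and masking with 2 * (table_size - 1) yields an even byte offset into the uint16_t table.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    constexpr int kMaxHashTableBits = 14;  // assumption for this sketch

    // Mirrors the fallback branch of TableEntry above.
    uint16_t* HashTableEntry(uint16_t* table, uint32_t bytes, uint32_t mask) {
      constexpr uint32_t kMagic = 0x1e35a7bd;
      const uint32_t hash = (kMagic * bytes) >> (31 - kMaxHashTableBits);
      return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
                                         (hash & mask));
    }

    int main() {
      constexpr uint32_t kTableSize = 1 << 10;  // must be a power of two
      std::vector<uint16_t> table(kTableSize, 0);
      const uint32_t mask = 2 * (kTableSize - 1);  // even byte offsets only
      uint16_t* slot =
          HashTableEntry(table.data(), /*bytes=*/0x64636261 /* "abcd" */, mask);
      std::printf("slot index = %td\n", slot - table.data());
    }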
194  |  |  | 
195  |  | }  // namespace  | 
196  |  |  | 
197  | 0  | size_t MaxCompressedLength(size_t source_bytes) { | 
198  |  |   // Compressed data can be defined as:  | 
199  |  |   //    compressed := item* literal*  | 
200  |  |   //    item       := literal* copy  | 
201  |  |   //  | 
202  |  |   // The trailing literal sequence has a space blowup of at most 62/60  | 
203  |  |   // since a literal of length 60 needs one tag byte + one extra byte  | 
204  |  |   // for length information.  | 
205  |  |   //  | 
206  |  |   // Item blowup is trickier to measure.  Suppose the "copy" op copies  | 
207  |  |   // 4 bytes of data.  Because of a special check in the encoding code,  | 
208  |  |   // we produce a 4-byte copy only if the offset is < 65536.  Therefore  | 
209  |  |   // the copy op takes 3 bytes to encode, and this type of item leads  | 
210  |  |   // to at most the 62/60 blowup for representing literals.  | 
211  |  |   //  | 
212  |  |   // Suppose the "copy" op copies 5 bytes of data.  If the offset is big  | 
213  |  |   // enough, it will take 5 bytes to encode the copy op.  Therefore the  | 
214  |  |   // worst case here is a one-byte literal followed by a five-byte copy.  | 
215  |  |   // I.e., 6 bytes of input turn into 7 bytes of "compressed" data.  | 
216  |  |   //  | 
217  |  |   // This last factor dominates the blowup, so the final estimate is:  | 
218  | 0  |   return 32 + source_bytes + source_bytes / 6;  | 
219  | 0  | }  | 
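Applying the bound above to a concrete size (a standalone check using the same formula): a 1 MiB input can expand to at most 32 + 1048576 + 174762 = 1,223,370 bytes.

    #include <cstddef>
    #include <cstdio>

    // Same formula as MaxCompressedLength above: worst case is 7 output bytes
    // per 6 input bytes (1-byte literal + 5-byte copy), plus 32 bytes of slack.
    size_t MaxCompressedLength(size_t source_bytes) {
      return 32 + source_bytes + source_bytes / 6;
    }

    int main() {
      std::printf("%zu\n", MaxCompressedLength(1 << 20));  // prints 1223370
    }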
220  |  |  | 
221  |  | namespace { | 
222  |  |  | 
223  | 5.72k  | void UnalignedCopy64(const void* src, void* dst) { | 
224  | 5.72k  |   char tmp[8];  | 
225  | 5.72k  |   std::memcpy(tmp, src, 8);  | 
226  | 5.72k  |   std::memcpy(dst, tmp, 8);  | 
227  | 5.72k  | }  | 
228  |  |  | 
229  | 1.46k  | void UnalignedCopy128(const void* src, void* dst) { | 
230  |  |   // std::memcpy() gets vectorized when the appropriate compiler options are  | 
231  |  |   // used. For example, x86 compilers targeting SSE2+ will optimize to an SSE2  | 
232  |  |   // load and store.  | 
233  | 1.46k  |   char tmp[16];  | 
234  | 1.46k  |   std::memcpy(tmp, src, 16);  | 
235  | 1.46k  |   std::memcpy(dst, tmp, 16);  | 
236  | 1.46k  | }  | 
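A small usage sketch of the idiom above (illustrative only): routing the copy through std::memcpy and a local buffer is well-defined for misaligned and even overlapping ranges, whereas casting to uint64_t* and dereferencing can break alignment and aliasing rules.

    #include <cstdio>
    #include <cstring>

    // Same shape as UnalignedCopy64 above; each memcpy has non-overlapping
    // operands because the data passes through tmp.
    void CopyUnaligned64(const void* src, void* dst) {
      char tmp[8];
      std::memcpy(tmp, src, 8);
      std::memcpy(dst, tmp, 8);
    }

    int main() {
      char buf[16] = "abcdefghijklmno";
      CopyUnaligned64(buf + 1, buf + 5);  // misaligned, overlapping regions
      std::printf("%.15s\n", buf);        // prints "abcdebcdefghino"
    }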
237  |  |  | 
238  |  | template <bool use_16bytes_chunk>  | 
239  | 964  | inline void ConditionalUnalignedCopy128(const char* src, char* dst) { | 
240  | 964  |   if (use_16bytes_chunk) { | 
241  | 0  |     UnalignedCopy128(src, dst);  | 
242  | 964  |   } else { | 
243  | 964  |     UnalignedCopy64(src, dst);  | 
244  | 964  |     UnalignedCopy64(src + 8, dst + 8);  | 
245  | 964  |   }  | 
246  | 964  | }  | 
247  |  |  | 
248  |  | // Copy [src, src+(op_limit-op)) to [op, op_limit) a byte at a time. Used  | 
249  |  | // for handling COPY operations where the input and output regions may overlap.  | 
250  |  | // For example, suppose:  | 
251  |  | //    src       == "ab"  | 
252  |  | //    op        == src + 2  | 
253  |  | //    op_limit  == op + 20  | 
254  |  | // After IncrementalCopySlow(src, op, op_limit), the result will have eleven  | 
255  |  | // copies of "ab"  | 
256  |  | //    ababababababababababab  | 
257  |  | // Note that this does not match the semantics of either std::memcpy() or  | 
258  |  | // std::memmove().  | 
259  |  | inline char* IncrementalCopySlow(const char* src, char* op,  | 
260  | 136  |                                  char* const op_limit) { | 
261  |  |   // TODO: Remove pragma when LLVM is aware this  | 
262  |  |   // function is only called in cold regions and when cold regions don't get  | 
263  |  |   // vectorized or unrolled.  | 
264  | 136  | #ifdef __clang__  | 
265  | 136  | #pragma clang loop unroll(disable)  | 
266  | 136  | #endif  | 
267  | 468  |   while (op < op_limit) { | 
268  | 332  |     *op++ = *src++;  | 
269  | 332  |   }  | 
270  | 136  |   return op_limit;  | 
271  | 136  | }  | 
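The "ab" example from the comment can be reproduced with a few lines (a standalone sketch of the same byte-at-a-time overlap semantics):

    #include <cstdio>

    // Byte-at-a-time overlapping copy, as in IncrementalCopySlow above.
    char* IncrementalCopySlow(const char* src, char* op, char* const op_limit) {
      while (op < op_limit) *op++ = *src++;
      return op_limit;
    }

    int main() {
      char buf[32] = "ab";  // pattern already in the buffer
      IncrementalCopySlow(buf, buf + 2, buf + 2 + 20);
      buf[22] = '\0';
      std::printf("%s\n", buf);  // ababababababababababab (11 x "ab")
    }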
272  |  |  | 
273  |  | #if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE  | 
274  |  |  | 
275  |  | // Computes the bytes for shuffle control mask (please read comments on  | 
276  |  | // 'pattern_generation_masks' as well) for the given index_offset and  | 
277  |  | // pattern_size. For example, when the 'offset' is 6, it will generate a  | 
278  |  | // repeating pattern of size 6. So, the first 16 byte indexes will correspond to  | 
279  |  | // the pattern-bytes {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3} and the | 
280  |  | // next 16 byte indexes will correspond to the pattern-bytes {4, 5, 0, 1, 2, 3, | 
281  |  | // 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by  | 
282  |  | // calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and  | 
283  |  | // MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.  | 
284  |  | template <size_t... indexes>  | 
285  |  | inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(  | 
286  |  |     int index_offset, int pattern_size, index_sequence<indexes...>) { | 
287  |  |   return {static_cast<char>((index_offset + indexes) % pattern_size)...}; | 
288  |  | }  | 
289  |  |  | 
290  |  | // Computes the shuffle control mask bytes array for given pattern-sizes and  | 
291  |  | // returns an array.  | 
292  |  | template <size_t... pattern_sizes_minus_one>  | 
293  |  | inline constexpr std::array<std::array<char, sizeof(V128)>,  | 
294  |  |                             sizeof...(pattern_sizes_minus_one)>  | 
295  |  | MakePatternMaskBytesTable(int index_offset,  | 
296  |  |                           index_sequence<pattern_sizes_minus_one...>) { | 
297  |  |   return { | 
298  |  |       MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,  | 
299  |  |                            make_index_sequence</*indexes=*/sizeof(V128)>())...};  | 
300  |  | }  | 
301  |  |  | 
302  |  | // This is an array of shuffle control masks that can be used as the source  | 
303  |  | // operand for PSHUFB to permute the contents of the destination XMM register  | 
304  |  | // into a repeating byte pattern.  | 
305  |  | alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,  | 
306  |  |                                  16> pattern_generation_masks =  | 
307  |  |     MakePatternMaskBytesTable(  | 
308  |  |         /*index_offset=*/0,  | 
309  |  |         /*pattern_sizes_minus_one=*/make_index_sequence<16>());  | 
310  |  |  | 
311  |  | // Similar to 'pattern_generation_masks', this table is used to "rotate" the  | 
312  |  | // pattern so that we can copy the *next 16 bytes* consistent with the pattern.  | 
313  |  | // Basically, pattern_reshuffle_masks is a continuation of  | 
314  |  | // pattern_generation_masks. It follows that pattern_reshuffle_masks is the  | 
315  |  | // same as pattern_generation_masks for offsets 1, 2, 4, 8 and 16.  | 
316  |  | alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,  | 
317  |  |                                  16> pattern_reshuffle_masks =  | 
318  |  |     MakePatternMaskBytesTable(  | 
319  |  |         /*index_offset=*/16,  | 
320  |  |         /*pattern_sizes_minus_one=*/make_index_sequence<16>());  | 
321  |  |  | 
322  |  | SNAPPY_ATTRIBUTE_ALWAYS_INLINE  | 
323  |  | static inline V128 LoadPattern(const char* src, const size_t pattern_size) { | 
324  |  |   V128 generation_mask = V128_Load(reinterpret_cast<const V128*>(  | 
325  |  |       pattern_generation_masks[pattern_size - 1].data()));  | 
326  |  |   // Uninitialized bytes are masked out by the shuffle mask.  | 
327  |  |   // TODO: remove annotation and macro defs once MSan is fixed.  | 
328  |  |   SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(src + pattern_size, 16 - pattern_size);  | 
329  |  |   return V128_Shuffle(V128_LoadU(reinterpret_cast<const V128*>(src)),  | 
330  |  |                       generation_mask);  | 
331  |  | }  | 
332  |  |  | 
333  |  | SNAPPY_ATTRIBUTE_ALWAYS_INLINE  | 
334  |  | static inline std::pair<V128 /* pattern */, V128 /* reshuffle_mask */>  | 
335  |  | LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) { | 
336  |  |   V128 pattern = LoadPattern(src, pattern_size);  | 
337  |  |  | 
338  |  |   // This mask will generate the next 16 bytes in-place. Doing so enables us to  | 
339  |  |   // write data by at most 4 V128_StoreU.  | 
340  |  |   //  | 
341  |  |   // For example, suppose pattern is:        abcdefabcdefabcd  | 
342  |  |   // Shuffling with this mask will generate: efabcdefabcdefab  | 
343  |  |   // Shuffling again will generate:          cdefabcdefabcdef  | 
344  |  |   V128 reshuffle_mask = V128_Load(reinterpret_cast<const V128*>(  | 
345  |  |       pattern_reshuffle_masks[pattern_size - 1].data()));  | 
346  |  |   return {pattern, reshuffle_mask}; | 
347  |  | }  | 
348  |  |  | 
349  |  | #endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE  | 
350  |  |  | 
351  |  | // Fallback for when we need to copy while extending the pattern, for example  | 
352  |  | // copying 10 bytes from 3 positions back abc -> abcabcabcabca.  | 
353  |  | //  | 
354  |  | // REQUIRES: [dst - offset, dst + 64) is a valid address range.  | 
355  |  | SNAPPY_ATTRIBUTE_ALWAYS_INLINE  | 
356  | 672  | static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) { | 
357  |  | #if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE  | 
358  |  |   if (SNAPPY_PREDICT_TRUE(offset <= 16)) { | 
359  |  |     switch (offset) { | 
360  |  |       case 0:  | 
361  |  |         return false;  | 
362  |  |       case 1: { | 
363  |  |         // TODO: Ideally we should memset, move back once the  | 
364  |  |         // codegen issues are fixed.  | 
365  |  |         V128 pattern = V128_DupChar(dst[-1]);  | 
366  |  |         for (int i = 0; i < 4; i++) { | 
367  |  |           V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);  | 
368  |  |         }  | 
369  |  |         return true;  | 
370  |  |       }  | 
371  |  |       case 2:  | 
372  |  |       case 4:  | 
373  |  |       case 8:  | 
374  |  |       case 16: { | 
375  |  |         V128 pattern = LoadPattern(dst - offset, offset);  | 
376  |  |         for (int i = 0; i < 4; i++) { | 
377  |  |           V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);  | 
378  |  |         }  | 
379  |  |         return true;  | 
380  |  |       }  | 
381  |  |       default: { | 
382  |  |         auto pattern_and_reshuffle_mask =  | 
383  |  |             LoadPatternAndReshuffleMask(dst - offset, offset);  | 
384  |  |         V128 pattern = pattern_and_reshuffle_mask.first;  | 
385  |  |         V128 reshuffle_mask = pattern_and_reshuffle_mask.second;  | 
386  |  |         for (int i = 0; i < 4; i++) { | 
387  |  |           V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);  | 
388  |  |           pattern = V128_Shuffle(pattern, reshuffle_mask);  | 
389  |  |         }  | 
390  |  |         return true;  | 
391  |  |       }  | 
392  |  |     }  | 
393  |  |   }  | 
394  |  | #else  | 
395  | 672  |   if (SNAPPY_PREDICT_TRUE(offset < 16)) { | 
396  | 608  |     if (SNAPPY_PREDICT_FALSE(offset == 0)) return false;  | 
397  |  |     // Extend the pattern to the first 16 bytes.  | 
398  |  |     // The simpler formulation of `dst[i - offset]` induces undefined behavior.  | 
399  | 10.0k  |     for (int i = 0; i < 16; i++) dst[i] = (dst - offset)[i];  | 
400  |  |     // Find a multiple of pattern >= 16.  | 
401  | 592  |     static std::array<uint8_t, 16> pattern_sizes = []() { | 
402  | 1  |       std::array<uint8_t, 16> res;  | 
403  | 16  |       for (int i = 1; i < 16; i++) res[i] = (16 / i + 1) * i;  | 
404  | 1  |       return res;  | 
405  | 1  |     }();  | 
406  | 592  |     offset = pattern_sizes[offset];  | 
407  | 2.36k  |     for (int i = 1; i < 4; i++) { | 
408  | 1.77k  |       std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);  | 
409  | 1.77k  |     }  | 
410  | 592  |     return true;  | 
411  | 608  |   }  | 
412  | 64  | #endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE  | 
413  |  |  | 
414  |  |   // Very rare.  | 
415  | 320  |   for (int i = 0; i < 4; i++) { | 
416  | 256  |     std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);  | 
417  | 256  |   }  | 
418  | 64  |   return true;  | 
419  | 672  | }  | 
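The pattern_sizes initializer above can be checked in isolation: for each offset i in [1, 16) it picks the multiple (16 / i + 1) * i of the pattern size, which is at least 16, so the later 16-byte block copies always read from a fully formed pattern (standalone recomputation):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Recomputes the pattern_sizes table above.
      std::array<uint8_t, 16> pattern_sizes{};
      for (int i = 1; i < 16; i++) pattern_sizes[i] = (16 / i + 1) * i;
      // e.g. offset 3 -> 18, 5 -> 20, 7 -> 21, 8 -> 24, 15 -> 30.
      for (int i = 1; i < 16; i++) std::printf("%2d -> %d\n", i, pattern_sizes[i]);
    }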
420  |  |  | 
421  |  | // Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than  | 
422  |  | // IncrementalCopySlow. buf_limit is the address past the end of the writable  | 
423  |  | // region of the buffer.  | 
424  |  | inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,  | 
425  | 1.87k  |                              char* const buf_limit) { | 
426  |  | #if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE  | 
427  |  |   constexpr int big_pattern_size_lower_bound = 16;  | 
428  |  | #else  | 
429  | 1.87k  |   constexpr int big_pattern_size_lower_bound = 8;  | 
430  | 1.87k  | #endif  | 
431  |  |  | 
432  |  |   // Terminology:  | 
433  |  |   //  | 
434  |  |   // slop = buf_limit - op  | 
435  |  |   // pat  = op - src  | 
436  |  |   // len  = op_limit - op  | 
437  | 1.87k  |   assert(src < op);  | 
438  | 1.87k  |   assert(op < op_limit);  | 
439  | 1.87k  |   assert(op_limit <= buf_limit);  | 
440  |  |   // NOTE: The copy tags use 3 or 6 bits to store the copy length, so len <= 64.  | 
441  | 1.87k  |   assert(op_limit - op <= 64);  | 
442  |  |   // NOTE: In practice the compressor always emits len >= 4, so it is ok to  | 
443  |  |   // assume that to optimize this function, but this is not guaranteed by the  | 
444  |  |   // compression format, so we have to also handle len < 4 in case the input  | 
445  |  |   // does not satisfy these conditions.  | 
446  |  |  | 
447  | 1.87k  |   size_t pattern_size = op - src;  | 
448  |  |   // The cases are split into different branches to allow the branch predictor,  | 
449  |  |   // FDO, and static prediction hints to work better. For each input we list the  | 
450  |  |   // ratio of invocations that match each condition.  | 
451  |  |   //  | 
452  |  |   // input        slop < 16   pat < 8  len > 16  | 
453  |  |   // ------------------------------------------  | 
454  |  |   // html|html4|cp   0%         1.01%    27.73%  | 
455  |  |   // urls            0%         0.88%    14.79%  | 
456  |  |   // jpg             0%        64.29%     7.14%  | 
457  |  |   // pdf             0%         2.56%    58.06%  | 
458  |  |   // txt[1-4]        0%         0.23%     0.97%  | 
459  |  |   // pb              0%         0.96%    13.88%  | 
460  |  |   // bin             0.01%     22.27%    41.17%  | 
461  |  |   //  | 
462  |  |   // It is very rare that we don't have enough slop for doing block copies. It  | 
463  |  |   // is also rare that we need to expand a pattern. Small patterns are common  | 
464  |  |   // for incompressible formats and for those we are plenty fast already.  | 
465  |  |   // Lengths are normally not greater than 16 but they vary depending on the  | 
466  |  |   // input. In general if we always predict len <= 16 it would be an ok  | 
467  |  |   // prediction.  | 
468  |  |   //  | 
469  |  |   // In order to be fast we want a pattern >= 16 bytes (or 8 bytes in non-SSE)  | 
470  |  |   // and an unrolled loop copying 1x 16 bytes (or 2x 8 bytes in non-SSE) at a  | 
471  |  |   // time.  | 
472  |  |  | 
473  |  |   // Handle the uncommon case where pattern is less than 16 (or 8 in non-SSE)  | 
474  |  |   // bytes.  | 
475  | 1.87k  |   if (pattern_size < big_pattern_size_lower_bound) { | 
476  |  | #if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE  | 
477  |  |     // Load the first eight bytes into a 128-bit XMM register, then use PSHUFB  | 
478  |  |     // to permute the register's contents in-place into a repeating sequence of  | 
479  |  |     // the first "pattern_size" bytes.  | 
480  |  |     // For example, suppose:  | 
481  |  |     //    src       == "abc"  | 
482  |  |     //    op        == src + 3  | 
483  |  |     // After V128_Shuffle(), "pattern" will have five copies of "abc"  | 
484  |  |     // followed by one byte of slop: abcabcabcabcabca.  | 
485  |  |     //  | 
486  |  |     // The non-SSE fallback implementation suffers from store-forwarding stalls  | 
487  |  |     // because its loads and stores partly overlap. By expanding the pattern  | 
488  |  |     // in-place, we avoid the penalty.  | 
489  |  |  | 
490  |  |     // Typically, the op_limit is the gating factor so try to simplify the loop  | 
491  |  |     // based on that.  | 
492  |  |     if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) { | 
493  |  |       auto pattern_and_reshuffle_mask =  | 
494  |  |           LoadPatternAndReshuffleMask(src, pattern_size);  | 
495  |  |       V128 pattern = pattern_and_reshuffle_mask.first;  | 
496  |  |       V128 reshuffle_mask = pattern_and_reshuffle_mask.second;  | 
497  |  |  | 
498  |  |       // There is at least one, and at most four 16-byte blocks. Writing four  | 
499  |  |       // conditionals instead of a loop allows FDO to layout the code with  | 
500  |  |       // respect to the actual probabilities of each length.  | 
501  |  |       // TODO: Replace with loop with trip count hint.  | 
502  |  |       V128_StoreU(reinterpret_cast<V128*>(op), pattern);  | 
503  |  |  | 
504  |  |       if (op + 16 < op_limit) { | 
505  |  |         pattern = V128_Shuffle(pattern, reshuffle_mask);  | 
506  |  |         V128_StoreU(reinterpret_cast<V128*>(op + 16), pattern);  | 
507  |  |       }  | 
508  |  |       if (op + 32 < op_limit) { | 
509  |  |         pattern = V128_Shuffle(pattern, reshuffle_mask);  | 
510  |  |         V128_StoreU(reinterpret_cast<V128*>(op + 32), pattern);  | 
511  |  |       }  | 
512  |  |       if (op + 48 < op_limit) { | 
513  |  |         pattern = V128_Shuffle(pattern, reshuffle_mask);  | 
514  |  |         V128_StoreU(reinterpret_cast<V128*>(op + 48), pattern);  | 
515  |  |       }  | 
516  |  |       return op_limit;  | 
517  |  |     }  | 
518  |  |     char* const op_end = buf_limit - 15;  | 
519  |  |     if (SNAPPY_PREDICT_TRUE(op < op_end)) { | 
520  |  |       auto pattern_and_reshuffle_mask =  | 
521  |  |           LoadPatternAndReshuffleMask(src, pattern_size);  | 
522  |  |       V128 pattern = pattern_and_reshuffle_mask.first;  | 
523  |  |       V128 reshuffle_mask = pattern_and_reshuffle_mask.second;  | 
524  |  |  | 
525  |  |       // This code path is relatively cold however so we save code size  | 
526  |  |       // by avoiding unrolling and vectorizing.  | 
527  |  |       //  | 
528  |  |       // TODO: Remove pragma when cold regions don't get  | 
529  |  |       // vectorized or unrolled.  | 
530  |  | #ifdef __clang__  | 
531  |  | #pragma clang loop unroll(disable)  | 
532  |  | #endif  | 
533  |  |       do { | 
534  |  |         V128_StoreU(reinterpret_cast<V128*>(op), pattern);  | 
535  |  |         pattern = V128_Shuffle(pattern, reshuffle_mask);  | 
536  |  |         op += 16;  | 
537  |  |       } while (SNAPPY_PREDICT_TRUE(op < op_end));  | 
538  |  |     }  | 
539  |  |     return IncrementalCopySlow(op - pattern_size, op, op_limit);  | 
540  |  | #else   // !SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE  | 
541  |  |     // If plenty of buffer space remains, expand the pattern to at least 8  | 
542  |  |     // bytes. The way the following loop is written, we need 8 bytes of buffer  | 
543  |  |     // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10  | 
544  |  |     // bytes if pattern_size is 2.  Precisely encoding that is probably not  | 
545  |  |     // worthwhile; instead, invoke the slow path if we cannot write 11 bytes  | 
546  |  |     // (because 11 are required in the worst case).  | 
547  | 1.53k  |     if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) { | 
548  | 5.22k  |       while (pattern_size < 8) { | 
549  | 3.72k  |         UnalignedCopy64(src, op);  | 
550  | 3.72k  |         op += pattern_size;  | 
551  | 3.72k  |         pattern_size *= 2;  | 
552  | 3.72k  |       }  | 
553  | 1.49k  |       if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;  | 
554  | 1.49k  |     } else { | 
555  | 40  |       return IncrementalCopySlow(src, op, op_limit);  | 
556  | 40  |     }  | 
557  | 1.53k  | #endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE  | 
558  | 1.53k  |   }  | 
559  | 472  |   assert(pattern_size >= big_pattern_size_lower_bound);  | 
560  | 472  |   constexpr bool use_16bytes_chunk = big_pattern_size_lower_bound == 16;  | 
561  |  |  | 
562  |  |   // Copy 1x 16 bytes (or 2x 8 bytes in non-SSE) at a time. Because op - src can  | 
563  |  |   // be < 16 in non-SSE, a single UnalignedCopy128 might overwrite data in op.  | 
564  |  |   // UnalignedCopy64 is safe because expanding the pattern to at least 8 bytes  | 
565  |  |   // guarantees that op - src >= 8.  | 
566  |  |   //  | 
567  |  |   // Typically, the op_limit is the gating factor so try to simplify the loop  | 
568  |  |   // based on that.  | 
569  | 472  |   if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) { | 
570  |  |     // There is at least one, and at most four 16-byte blocks. Writing four  | 
571  |  |     // conditionals instead of a loop allows FDO to layout the code with respect  | 
572  |  |     // to the actual probabilities of each length.  | 
573  |  |     // TODO: Replace with loop with trip count hint.  | 
574  | 328  |     ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);  | 
575  | 328  |     if (op + 16 < op_limit) { | 
576  | 156  |       ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 16, op + 16);  | 
577  | 156  |     }  | 
578  | 328  |     if (op + 32 < op_limit) { | 
579  | 128  |       ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 32, op + 32);  | 
580  | 128  |     }  | 
581  | 328  |     if (op + 48 < op_limit) { | 
582  | 100  |       ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 48, op + 48);  | 
583  | 100  |     }  | 
584  | 328  |     return op_limit;  | 
585  | 328  |   }  | 
586  |  |  | 
587  |  |   // Fall back to doing as much as we can with the available slop in the  | 
588  |  |   // buffer. This code path is relatively cold however so we save code size by  | 
589  |  |   // avoiding unrolling and vectorizing.  | 
590  |  |   //  | 
591  |  |   // TODO: Remove pragma when when cold regions don't get vectorized  | 
592  |  |   // or unrolled.  | 
593  | 144  | #ifdef __clang__  | 
594  | 144  | #pragma clang loop unroll(disable)  | 
595  | 144  | #endif  | 
596  | 396  |   for (char* op_end = buf_limit - 16; op < op_end; op += 16, src += 16) { | 
597  | 252  |     ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);  | 
598  | 252  |   }  | 
599  | 144  |   if (op >= op_limit) return op_limit;  | 
600  |  |  | 
601  |  |   // We only take this branch if we didn't have enough slop and we can do a  | 
602  |  |   // single 8 byte copy.  | 
603  | 96  |   if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) { | 
604  | 68  |     UnalignedCopy64(src, op);  | 
605  | 68  |     src += 8;  | 
606  | 68  |     op += 8;  | 
607  | 68  |   }  | 
608  | 96  |   return IncrementalCopySlow(src, op, op_limit);  | 
609  | 144  | }  | 
610  |  |  | 
611  |  | }  // namespace  | 
612  |  |  | 
613  |  | template <bool allow_fast_path>  | 
614  | 0  | static inline char* EmitLiteral(char* op, const char* literal, int len) { | 
615  |  |   // The vast majority of copies are below 16 bytes, for which a  | 
616  |  |   // call to std::memcpy() is overkill. This fast path can sometimes  | 
617  |  |   // copy up to 15 bytes too much, but that is okay in the  | 
618  |  |   // main loop, since we have a bit to go on for both sides:  | 
619  |  |   //  | 
620  |  |   //   - The input will always have kInputMarginBytes = 15 extra  | 
621  |  |   //     available bytes, as long as we're in the main loop, and  | 
622  |  |   //     if not, allow_fast_path = false.  | 
623  |  |   //   - The output will always have 32 spare bytes (see  | 
624  |  |   //     MaxCompressedLength).  | 
625  | 0  |   assert(len > 0);  // Zero-length literals are disallowed  | 
626  | 0  |   int n = len - 1;  | 
627  | 0  |   if (allow_fast_path && len <= 16) { | 
628  |  |     // Fits in tag byte  | 
629  | 0  |     *op++ = LITERAL | (n << 2);  | 
630  |  |  | 
631  | 0  |     UnalignedCopy128(literal, op);  | 
632  | 0  |     return op + len;  | 
633  | 0  |   }  | 
634  |  |  | 
635  | 0  |   if (n < 60) { | 
636  |  |     // Fits in tag byte  | 
637  | 0  |     *op++ = LITERAL | (n << 2);  | 
638  | 0  |   } else { | 
639  | 0  |     int count = (Bits::Log2Floor(n) >> 3) + 1;  | 
640  | 0  |     assert(count >= 1);  | 
641  | 0  |     assert(count <= 4);  | 
642  | 0  |     *op++ = LITERAL | ((59 + count) << 2);  | 
643  |  |     // Encode in upcoming bytes.  | 
644  |  |     // Write 4 bytes, though we may care about only 1 of them. The output buffer  | 
645  |  |     // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds  | 
646  |  |     // here and there is a std::memcpy() of size 'len' below.  | 
647  | 0  |     LittleEndian::Store32(op, n);  | 
648  | 0  |     op += count;  | 
649  | 0  |   }  | 
650  |  |   // When allow_fast_path is true, we can overwrite up to 16 bytes.  | 
651  | 0  |   if (allow_fast_path) { | 
652  | 0  |     char* destination = op;  | 
653  | 0  |     const char* source = literal;  | 
654  | 0  |     const char* end = destination + len;  | 
655  | 0  |     do { | 
656  | 0  |       std::memcpy(destination, source, 16);  | 
657  | 0  |       destination += 16;  | 
658  | 0  |       source += 16;  | 
659  | 0  |     } while (destination < end);  | 
660  | 0  |   } else { | 
661  | 0  |     std::memcpy(op, literal, len);  | 
662  | 0  |   }  | 
663  | 0  |   return op + len;  | 
664  | 0  | } Unexecuted instantiation: snappy.cc:char* snappy::EmitLiteral<true>(char*, char const*, int) Unexecuted instantiation: snappy.cc:char* snappy::EmitLiteral<false>(char*, char const*, int)  | 
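As a standalone sketch of the literal framing EmitLiteral produces (without the 16-byte overcopy fast path; EmitLiteralPlain is a name local to this example): lengths up to 60 are stored directly in the tag byte, and longer literals store len - 1 in the next one to four little-endian bytes.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    // Sketch of the literal framing above (LITERAL tag type is 0b00).
    char* EmitLiteralPlain(char* op, const char* literal, int len) {
      const int n = len - 1;
      if (n < 60) {
        *op++ = static_cast<char>(n << 2);  // tag: type 00, length n
      } else {
        int count = 0;
        for (int rest = n; rest > 0; rest >>= 8) count++;     // bytes needed for n
        *op++ = static_cast<char>((59 + count) << 2);         // 60..63 => 1..4 bytes
        for (int i = 0; i < count; i++) *op++ = static_cast<char>(n >> (8 * i));
      }
      std::memcpy(op, literal, len);
      return op + len;
    }

    int main() {
      std::vector<char> out(200);
      char* end = EmitLiteralPlain(out.data(), "hello", 5);
      // Tag byte is (5-1) << 2 = 0x10, followed by the 5 literal bytes.
      std::printf("tag=0x%02x total=%td bytes\n",
                  static_cast<unsigned char>(out[0]), end - out.data());
    }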
665  |  |  | 
666  |  | template <bool len_less_than_12>  | 
667  | 0  | static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) { | 
668  | 0  |   assert(len <= 64);  | 
669  | 0  |   assert(len >= 4);  | 
670  | 0  |   assert(offset < 65536);  | 
671  | 0  |   assert(len_less_than_12 == (len < 12));  | 
672  |  |  | 
673  | 0  |   if (len_less_than_12) { | 
674  | 0  |     uint32_t u = (len << 2) + (offset << 8);  | 
675  | 0  |     uint32_t copy1 = COPY_1_BYTE_OFFSET - (4 << 2) + ((offset >> 3) & 0xe0);  | 
676  | 0  |     uint32_t copy2 = COPY_2_BYTE_OFFSET - (1 << 2);  | 
677  |  |     // It turns out that offset < 2048 is a difficult-to-predict branch.  | 
678  |  |     // `perf record` shows this is the highest percentage of branch misses in  | 
679  |  |     // benchmarks. This code is branch-free; the data-dependency chain that  | 
680  |  |     // bottlenecks the throughput is so long that a few extra instructions  | 
681  |  |     // are completely free (IPC << 6 because of data deps).  | 
682  | 0  |     u += offset < 2048 ? copy1 : copy2;  | 
683  | 0  |     LittleEndian::Store32(op, u);  | 
684  | 0  |     op += offset < 2048 ? 2 : 3;  | 
685  | 0  |   } else { | 
686  |  |     // Write 4 bytes, though we only care about 3 of them.  The output buffer  | 
687  |  |     // is required to have some slack, so the extra byte won't overrun it.  | 
688  | 0  |     uint32_t u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);  | 
689  | 0  |     LittleEndian::Store32(op, u);  | 
690  | 0  |     op += 3;  | 
691  | 0  |   }  | 
692  | 0  |   return op;  | 
693  | 0  | } Unexecuted instantiation: snappy.cc:char* snappy::EmitCopyAtMost64<true>(char*, unsigned long, unsigned long) Unexecuted instantiation: snappy.cc:char* snappy::EmitCopyAtMost64<false>(char*, unsigned long, unsigned long)  | 
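The two encodings chosen branchlessly above look like this when written out with explicit branches (standalone sketch; EmitShortCopy is local to this example):

    #include <cstdint>
    #include <cstdio>

    // Sketch of the short-copy encoding above. Tag types: copy-1 = 0b01 (3-bit
    // length, 11-bit offset), copy-2 = 0b10 (6-bit length, 16-bit offset).
    char* EmitShortCopy(char* op, uint32_t offset, uint32_t len) {
      if (offset < 2048 && len >= 4 && len < 12) {
        // 2-byte form: tag holds (len - 4) and the high 3 offset bits.
        op[0] = static_cast<char>(1 | ((len - 4) << 2) | ((offset >> 8) << 5));
        op[1] = static_cast<char>(offset & 0xFF);
        return op + 2;
      }
      // 3-byte form: tag holds (len - 1); the offset follows little-endian.
      op[0] = static_cast<char>(2 | ((len - 1) << 2));
      op[1] = static_cast<char>(offset & 0xFF);
      op[2] = static_cast<char>(offset >> 8);
      return op + 3;
    }

    int main() {
      char buf[4];
      char* end = EmitShortCopy(buf, /*offset=*/300, /*len=*/7);
      std::printf("copy-1: %td bytes, tag=0x%02x\n", end - buf,
                  static_cast<unsigned char>(buf[0]));
      end = EmitShortCopy(buf, /*offset=*/5000, /*len=*/40);
      std::printf("copy-2: %td bytes, tag=0x%02x\n", end - buf,
                  static_cast<unsigned char>(buf[0]));
    }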
694  |  |  | 
695  |  | template <bool len_less_than_12>  | 
696  | 0  | static inline char* EmitCopy(char* op, size_t offset, size_t len) { | 
697  | 0  |   assert(len_less_than_12 == (len < 12));  | 
698  | 0  |   if (len_less_than_12) { | 
699  | 0  |     return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);  | 
700  | 0  |   } else { | 
701  |  |     // A special case for len <= 64 might help, but so far measurements suggest  | 
702  |  |     // it's in the noise.  | 
703  |  |  | 
704  |  |     // Emit 64 byte copies but make sure to keep at least four bytes reserved.  | 
705  | 0  |     while (SNAPPY_PREDICT_FALSE(len >= 68)) { | 
706  | 0  |       op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);  | 
707  | 0  |       len -= 64;  | 
708  | 0  |     }  | 
709  |  |  | 
710  |  |     // One or two copies will now finish the job.  | 
711  | 0  |     if (len > 64) { | 
712  | 0  |       op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);  | 
713  | 0  |       len -= 60;  | 
714  | 0  |     }  | 
715  |  |  | 
716  |  |     // Emit remainder.  | 
717  | 0  |     if (len < 12) { | 
718  | 0  |       op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);  | 
719  | 0  |     } else { | 
720  | 0  |       op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);  | 
721  | 0  |     }  | 
722  | 0  |     return op;  | 
723  | 0  |   }  | 
724  | 0  | } Unexecuted instantiation: snappy.cc:char* snappy::EmitCopy<true>(char*, unsigned long, unsigned long) Unexecuted instantiation: snappy.cc:char* snappy::EmitCopy<false>(char*, unsigned long, unsigned long)  | 
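A worked example of the length chunking above (standalone sketch; ChunkCopyLengths is local to this example): emitting 64-byte copies only while at least 68 bytes remain, then 60 if more than 64 remain, guarantees the final copy is still at least 4 bytes long.

    #include <cstdio>
    #include <initializer_list>
    #include <vector>

    // Reproduces the chunking logic of EmitCopy above.
    std::vector<int> ChunkCopyLengths(int len) {
      std::vector<int> chunks;
      while (len >= 68) { chunks.push_back(64); len -= 64; }
      if (len > 64)     { chunks.push_back(60); len -= 60; }
      chunks.push_back(len);  // always >= 4 by construction
      return chunks;
    }

    int main() {
      for (int len : {70, 130, 150}) {
        std::printf("%d ->", len);  // 70 -> 64 6, 130 -> 64 60 6, 150 -> 64 64 22
        for (int c : ChunkCopyLengths(len)) std::printf(" %d", c);
        std::printf("\n");
      }
    }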
725  |  |  | 
726  | 3.50k  | bool GetUncompressedLength(const char* start, size_t n, size_t* result) { | 
727  | 3.50k  |   uint32_t v = 0;  | 
728  | 3.50k  |   const char* limit = start + n;  | 
729  | 3.50k  |   if (Varint::Parse32WithLimit(start, limit, &v) != NULL) { | 
730  | 3.47k  |     *result = v;  | 
731  | 3.47k  |     return true;  | 
732  | 3.47k  |   } else { | 
733  | 28  |     return false;  | 
734  | 28  |   }  | 
735  | 3.50k  | }  | 
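GetUncompressedLength reads a little-endian base-128 varint prefix. A standalone parse of that encoding (ParseVarint32 here stands in for Varint::Parse32WithLimit, which lives elsewhere in snappy):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Little-endian base-128 varint: 7 payload bits per byte, high bit set on
    // all but the last byte. At most 5 bytes for a 32-bit value.
    const char* ParseVarint32(const char* p, const char* limit, uint32_t* value) {
      uint32_t result = 0;
      for (int shift = 0; shift <= 28 && p < limit; shift += 7) {
        const uint32_t byte = static_cast<unsigned char>(*p++);
        result |= (byte & 0x7F) << shift;
        if ((byte & 0x80) == 0) {  // high bit clear: last byte of the varint
          *value = result;
          return p;
        }
      }
      return nullptr;  // truncated or over-long
    }

    int main() {
      // Bytes {0xE8, 0x07} encode the length 1000.
      const char buf[] = {'\xE8', '\x07'};
      uint32_t len = 0;
      if (ParseVarint32(buf, buf + sizeof(buf), &len)) std::printf("%u\n", len);
    }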
736  |  |  | 
737  |  | namespace { | 
738  | 0  | uint32_t CalculateTableSize(uint32_t input_size) { | 
739  | 0  |   static_assert(  | 
740  | 0  |       kMaxHashTableSize >= kMinHashTableSize,  | 
741  | 0  |       "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");  | 
742  | 0  |   if (input_size > kMaxHashTableSize) { | 
743  | 0  |     return kMaxHashTableSize;  | 
744  | 0  |   }  | 
745  | 0  |   if (input_size < kMinHashTableSize) { | 
746  | 0  |     return kMinHashTableSize;  | 
747  | 0  |   }  | 
748  |  |   // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.  | 
749  |  |   // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).  | 
750  | 0  |   return 2u << Bits::Log2Floor(input_size - 1);  | 
751  | 0  | }  | 
752  |  | }  // namespace  | 
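Worked through standalone (the clamp constants below are assumptions for this sketch; the real kMinHashTableSize/kMaxHashTableSize come from the snappy headers): CalculateTableSize rounds the fragment size up to the next power of two and clamps it.

    #include <cstdint>
    #include <cstdio>

    uint32_t Log2Floor(uint32_t n) {
      uint32_t r = 0;
      while (n >>= 1) ++r;
      return r;
    }

    uint32_t CalculateTableSize(uint32_t input_size) {
      constexpr uint32_t kMinHashTableSize = 256;      // assumed for this sketch
      constexpr uint32_t kMaxHashTableSize = 1 << 14;  // assumed for this sketch
      if (input_size > kMaxHashTableSize) return kMaxHashTableSize;
      if (input_size < kMinHashTableSize) return kMinHashTableSize;
      // Round up to the next power of two: 2 << Log2Floor(x - 1).
      return 2u << Log2Floor(input_size - 1);
    }

    int main() {
      // 5000 -> 8192, 256 -> 256, 100000 -> 16384.
      std::printf("%u %u %u\n", CalculateTableSize(5000), CalculateTableSize(256),
                  CalculateTableSize(100000));
    }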
753  |  |  | 
754  |  | namespace internal { | 
755  | 0  | WorkingMemory::WorkingMemory(size_t input_size) { | 
756  | 0  |   const size_t max_fragment_size = std::min(input_size, kBlockSize);  | 
757  | 0  |   const size_t table_size = CalculateTableSize(max_fragment_size);  | 
758  | 0  |   size_ = table_size * sizeof(*table_) + max_fragment_size +  | 
759  | 0  |           MaxCompressedLength(max_fragment_size);  | 
760  | 0  |   mem_ = std::allocator<char>().allocate(size_);  | 
761  | 0  |   table_ = reinterpret_cast<uint16_t*>(mem_);  | 
762  | 0  |   input_ = mem_ + table_size * sizeof(*table_);  | 
763  | 0  |   output_ = input_ + max_fragment_size;  | 
764  | 0  | }  | 
765  |  |  | 
766  | 0  | WorkingMemory::~WorkingMemory() { | 
767  | 0  |   std::allocator<char>().deallocate(mem_, size_);  | 
768  | 0  | }  | 
769  |  |  | 
770  |  | uint16_t* WorkingMemory::GetHashTable(size_t fragment_size,  | 
771  | 0  |                                       int* table_size) const { | 
772  | 0  |   const size_t htsize = CalculateTableSize(fragment_size);  | 
773  | 0  |   memset(table_, 0, htsize * sizeof(*table_));  | 
774  | 0  |   *table_size = htsize;  | 
775  | 0  |   return table_;  | 
776  | 0  | }  | 
777  |  | }  // end namespace internal  | 
778  |  |  | 
779  |  | // Flat array compression that does not emit the "uncompressed length"  | 
780  |  | // prefix. Compresses "input" string to the "*op" buffer.  | 
781  |  | //  | 
782  |  | // REQUIRES: "input" is at most "kBlockSize" bytes long.  | 
783  |  | // REQUIRES: "op" points to an array of memory that is at least  | 
784  |  | // "MaxCompressedLength(input.size())" in size.  | 
785  |  | // REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.  | 
786  |  | // REQUIRES: "table_size" is a power of two  | 
787  |  | //  | 
788  |  | // Returns an "end" pointer into "op" buffer.  | 
789  |  | // "end - op" is the compressed size of "input".  | 
790  |  | namespace internal { | 
791  |  | char* CompressFragment(const char* input, size_t input_size, char* op,  | 
792  | 0  |                        uint16_t* table, const int table_size) { | 
793  |  |   // "ip" is the input pointer, and "op" is the output pointer.  | 
794  | 0  |   const char* ip = input;  | 
795  | 0  |   assert(input_size <= kBlockSize);  | 
796  | 0  |   assert((table_size & (table_size - 1)) == 0);  // table must be power of two  | 
797  | 0  |   const uint32_t mask = 2 * (table_size - 1);  | 
798  | 0  |   const char* ip_end = input + input_size;  | 
799  | 0  |   const char* base_ip = ip;  | 
800  |  |  | 
801  | 0  |   const size_t kInputMarginBytes = 15;  | 
802  | 0  |   if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) { | 
803  | 0  |     const char* ip_limit = input + input_size - kInputMarginBytes;  | 
804  |  | 
  | 
805  | 0  |     for (uint32_t preload = LittleEndian::Load32(ip + 1);;) { | 
806  |  |       // Bytes in [next_emit, ip) will be emitted as literal bytes.  Or  | 
807  |  |       // [next_emit, ip_end) after the main loop.  | 
808  | 0  |       const char* next_emit = ip++;  | 
809  | 0  |       uint64_t data = LittleEndian::Load64(ip);  | 
810  |  |       // The body of this loop calls EmitLiteral once and then EmitCopy one or  | 
811  |  |       // more times.  (The exception is that when we're close to exhausting  | 
812  |  |       // the input we goto emit_remainder.)  | 
813  |  |       //  | 
814  |  |       // In the first iteration of this loop we're just starting, so  | 
815  |  |       // there's nothing to copy, so calling EmitLiteral once is  | 
816  |  |       // necessary.  And we only start a new iteration when the  | 
817  |  |       // current iteration has determined that a call to EmitLiteral will  | 
818  |  |       // precede the next call to EmitCopy (if any).  | 
819  |  |       //  | 
820  |  |       // Step 1: Scan forward in the input looking for a 4-byte-long match.  | 
821  |  |       // If we get close to exhausting the input then goto emit_remainder.  | 
822  |  |       //  | 
823  |  |       // Heuristic match skipping: If 32 bytes are scanned with no matches  | 
824  |  |       // found, start looking only at every other byte. If 32 more bytes are  | 
825  |  |       // scanned (or skipped), look at every third byte, etc. When a match is  | 
826  |  |       // found, immediately go back to looking at every byte. This is a small  | 
827  |  |       // loss (~5% performance, ~0.1% density) for compressible data due to more  | 
828  |  |       // bookkeeping, but for non-compressible data (such as JPEG) it's a huge  | 
829  |  |       // win since the compressor quickly "realizes" the data is incompressible  | 
830  |  |       // and doesn't bother looking for matches everywhere.  | 
831  |  |       //  | 
832  |  |       // The "skip" variable keeps track of how many bytes there are since the  | 
833  |  |       // last match; dividing it by 32 (i.e. right-shifting by five) gives the  | 
834  |  |       // number of bytes to move ahead for each iteration.  | 
835  | 0  |       uint32_t skip = 32;  | 
836  |  |  | 
837  | 0  |       const char* candidate;  | 
838  | 0  |       if (ip_limit - ip >= 16) { | 
839  | 0  |         auto delta = ip - base_ip;  | 
840  | 0  |         for (int j = 0; j < 4; ++j) { | 
841  | 0  |           for (int k = 0; k < 4; ++k) { | 
842  | 0  |             int i = 4 * j + k;  | 
843  |  |             // These for-loops are meant to be unrolled. So we can freely  | 
844  |  |             // special case the first iteration to use the value already  | 
845  |  |             // loaded in preload.  | 
846  | 0  |             uint32_t dword = i == 0 ? preload : static_cast<uint32_t>(data);  | 
847  | 0  |             assert(dword == LittleEndian::Load32(ip + i));  | 
848  | 0  |             uint16_t* table_entry = TableEntry(table, dword, mask);  | 
849  | 0  |             candidate = base_ip + *table_entry;  | 
850  | 0  |             assert(candidate >= base_ip);  | 
851  | 0  |             assert(candidate < ip + i);  | 
852  | 0  |             *table_entry = delta + i;  | 
853  | 0  |             if (SNAPPY_PREDICT_FALSE(LittleEndian::Load32(candidate) == dword)) { | 
854  | 0  |               *op = LITERAL | (i << 2);  | 
855  | 0  |               UnalignedCopy128(next_emit, op + 1);  | 
856  | 0  |               ip += i;  | 
857  | 0  |               op = op + i + 2;  | 
858  | 0  |               goto emit_match;  | 
859  | 0  |             }  | 
860  | 0  |             data >>= 8;  | 
861  | 0  |           }  | 
862  | 0  |           data = LittleEndian::Load64(ip + 4 * j + 4);  | 
863  | 0  |         }  | 
864  | 0  |         ip += 16;  | 
865  | 0  |         skip += 16;  | 
866  | 0  |       }  | 
867  | 0  |       while (true) { | 
868  | 0  |         assert(static_cast<uint32_t>(data) == LittleEndian::Load32(ip));  | 
869  | 0  |         uint16_t* table_entry = TableEntry(table, data, mask);  | 
870  | 0  |         uint32_t bytes_between_hash_lookups = skip >> 5;  | 
871  | 0  |         skip += bytes_between_hash_lookups;  | 
872  | 0  |         const char* next_ip = ip + bytes_between_hash_lookups;  | 
873  | 0  |         if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) { | 
874  | 0  |           ip = next_emit;  | 
875  | 0  |           goto emit_remainder;  | 
876  | 0  |         }  | 
877  | 0  |         candidate = base_ip + *table_entry;  | 
878  | 0  |         assert(candidate >= base_ip);  | 
879  | 0  |         assert(candidate < ip);  | 
880  |  |  | 
881  | 0  |         *table_entry = ip - base_ip;  | 
882  | 0  |         if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==  | 
883  | 0  |                                 LittleEndian::Load32(candidate))) { | 
884  | 0  |           break;  | 
885  | 0  |         }  | 
886  | 0  |         data = LittleEndian::Load32(next_ip);  | 
887  | 0  |         ip = next_ip;  | 
888  | 0  |       }  | 
889  |  |  | 
890  |  |       // Step 2: A 4-byte match has been found.  We'll later see if more  | 
891  |  |       // than 4 bytes match.  But, prior to the match, input  | 
892  |  |       // bytes [next_emit, ip) are unmatched.  Emit them as "literal bytes."  | 
893  | 0  |       assert(next_emit + 16 <= ip_end);  | 
894  | 0  |       op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);  | 
895  |  |  | 
896  |  |       // Step 3: Call EmitCopy, and then see if another EmitCopy could  | 
897  |  |       // be our next move.  Repeat until we find no match for the  | 
898  |  |       // input immediately after what was consumed by the last EmitCopy call.  | 
899  |  |       //  | 
900  |  |       // If we exit this loop normally then we need to call EmitLiteral next,  | 
901  |  |       // though we don't yet know how big the literal will be.  We handle that  | 
902  |  |       // by proceeding to the next iteration of the main loop.  We also can exit  | 
903  |  |       // this loop via goto if we get close to exhausting the input.  | 
904  | 0  |     emit_match:  | 
905  | 0  |       do { | 
906  |  |         // We have a 4-byte match at ip, and no need to emit any  | 
907  |  |         // "literal bytes" prior to ip.  | 
908  | 0  |         const char* base = ip;  | 
909  | 0  |         std::pair<size_t, bool> p =  | 
910  | 0  |             FindMatchLength(candidate + 4, ip + 4, ip_end, &data);  | 
911  | 0  |         size_t matched = 4 + p.first;  | 
912  | 0  |         ip += matched;  | 
913  | 0  |         size_t offset = base - candidate;  | 
914  | 0  |         assert(0 == memcmp(base, candidate, matched));  | 
915  | 0  |         if (p.second) { | 
916  | 0  |           op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);  | 
917  | 0  |         } else { | 
918  | 0  |           op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);  | 
919  | 0  |         }  | 
920  | 0  |         if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) { | 
921  | 0  |           goto emit_remainder;  | 
922  | 0  |         }  | 
923  |  |         // Expect 5 bytes to match  | 
924  | 0  |         assert((data & 0xFFFFFFFFFF) ==  | 
925  | 0  |                (LittleEndian::Load64(ip) & 0xFFFFFFFFFF));  | 
926  |  |         // We are now looking for a 4-byte match again.  We read  | 
927  |  |         // table[Hash(ip, mask)] for that.  To improve compression,  | 
928  |  |         // we also update table[Hash(ip - 1, mask)] and table[Hash(ip, mask)].  | 
929  | 0  |         *TableEntry(table, LittleEndian::Load32(ip - 1), mask) =  | 
930  | 0  |             ip - base_ip - 1;  | 
931  | 0  |         uint16_t* table_entry = TableEntry(table, data, mask);  | 
932  | 0  |         candidate = base_ip + *table_entry;  | 
933  | 0  |         *table_entry = ip - base_ip;  | 
934  |  |         // Measurements on the benchmarks have shown the following probabilities  | 
935  |  |         // for the loop to exit (ie. avg. number of iterations is reciprocal).  | 
936  |  |         // BM_Flat/6  txt1    p = 0.3-0.4  | 
937  |  |         // BM_Flat/7  txt2    p = 0.35  | 
938  |  |         // BM_Flat/8  txt3    p = 0.3-0.4  | 
939  |  |         // BM_Flat/9  txt4    p = 0.34-0.4  | 
940  |  |         // BM_Flat/10 pb      p = 0.4  | 
941  |  |         // BM_Flat/11 gaviota p = 0.1  | 
942  |  |         // BM_Flat/12 cp      p = 0.5  | 
943  |  |         // BM_Flat/13 c       p = 0.3  | 
944  | 0  |       } while (static_cast<uint32_t>(data) == LittleEndian::Load32(candidate));  | 
945  |  |       // Because the least significant 5 bytes matched, we can utilize data  | 
946  |  |       // for the next iteration.  | 
947  | 0  |       preload = data >> 8;  | 
948  | 0  |     }  | 
949  | 0  |   }  | 
950  |  |  | 
951  | 0  | emit_remainder:  | 
952  |  |   // Emit the remaining bytes as a literal  | 
953  | 0  |   if (ip < ip_end) { | 
954  | 0  |     op = EmitLiteral</*allow_fast_path=*/false>(op, ip, ip_end - ip);  | 
955  | 0  |   }  | 
956  |  |  | 
957  | 0  |   return op;  | 
958  | 0  | }  | 
959  |  |  | 
960  |  | char* CompressFragmentDoubleHash(const char* input, size_t input_size, char* op,  | 
961  |  |                                  uint16_t* table, const int table_size,  | 
962  | 0  |                                  uint16_t* table2, const int table_size2) { | 
963  | 0  |   (void)table_size2;  | 
964  | 0  |   assert(table_size == table_size2);  | 
965  |  |   // "ip" is the input pointer, and "op" is the output pointer.  | 
966  | 0  |   const char* ip = input;  | 
967  | 0  |   assert(input_size <= kBlockSize);  | 
968  | 0  |   assert((table_size & (table_size - 1)) == 0);  // table must be power of two  | 
969  | 0  |   const uint32_t mask = 2 * (table_size - 1);  | 
970  | 0  |   const char* ip_end = input + input_size;  | 
971  | 0  |   const char* base_ip = ip;  | 
972  |  | 
972  |  |  | 
973  | 0  |   const size_t kInputMarginBytes = 15;  | 
974  | 0  |   if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) { | 
975  | 0  |     const char* ip_limit = input + input_size - kInputMarginBytes;  | 
976  |  | 
  | 
977  | 0  |     for (;;) { | 
978  | 0  |       const char* next_emit = ip++;  | 
979  | 0  |       uint64_t data = LittleEndian::Load64(ip);  | 
980  | 0  |       uint32_t skip = 512;  | 
981  |  | 
982  | 0  |       const char* candidate;  | 
983  | 0  |       uint32_t candidate_length;  | 
984  | 0  |       while (true) { | 
985  | 0  |         assert(static_cast<uint32_t>(data) == LittleEndian::Load32(ip));  | 
986  | 0  |         uint16_t* table_entry2 = TableEntry8ByteMatch(table2, data, mask);  | 
987  | 0  |         uint32_t bytes_between_hash_lookups = skip >> 9;  | 
988  | 0  |         skip++;  | 
989  | 0  |         const char* next_ip = ip + bytes_between_hash_lookups;  | 
990  | 0  |         if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) { | 
991  | 0  |           ip = next_emit;  | 
992  | 0  |           goto emit_remainder;  | 
993  | 0  |         }  | 
994  | 0  |         candidate = base_ip + *table_entry2;  | 
995  | 0  |         assert(candidate >= base_ip);  | 
996  | 0  |         assert(candidate < ip);  | 
997  |  |  | 
998  | 0  |         *table_entry2 = ip - base_ip;  | 
999  | 0  |         if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==  | 
1000  | 0  |                                 LittleEndian::Load32(candidate))) { | 
1001  | 0  |           candidate_length =  | 
1002  | 0  |               FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;  | 
1003  | 0  |           break;  | 
1004  | 0  |         }  | 
1005  |  |  | 
1006  | 0  |         uint16_t* table_entry = TableEntry4ByteMatch(table, data, mask);  | 
1007  | 0  |         candidate = base_ip + *table_entry;  | 
1008  | 0  |         assert(candidate >= base_ip);  | 
1009  | 0  |         assert(candidate < ip);  | 
1010  |  |  | 
1011  | 0  |         *table_entry = ip - base_ip;  | 
1012  | 0  |         if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==  | 
1013  | 0  |                                 LittleEndian::Load32(candidate))) { | 
1014  | 0  |           candidate_length =  | 
1015  | 0  |               FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;  | 
1016  | 0  |           table_entry2 =  | 
1017  | 0  |               TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 1), mask);  | 
1018  | 0  |           auto candidate2 = base_ip + *table_entry2;  | 
1019  | 0  |           size_t candidate_length2 =  | 
1020  | 0  |               FindMatchLengthPlain(candidate2, ip + 1, ip_end);  | 
1021  | 0  |           if (candidate_length2 > candidate_length) { | 
1022  | 0  |             *table_entry2 = ip - base_ip;  | 
1023  | 0  |             candidate = candidate2;  | 
1024  | 0  |             candidate_length = candidate_length2;  | 
1025  | 0  |             ++ip;  | 
1026  | 0  |           }  | 
1027  | 0  |           break;  | 
1028  | 0  |         }  | 
1029  | 0  |         data = LittleEndian::Load64(next_ip);  | 
1030  | 0  |         ip = next_ip;  | 
1031  | 0  |       }  | 
1032  |  |       // Backtrack to the point it matches fully.  | 
1033  | 0  |       while (ip > next_emit && candidate > base_ip &&  | 
1034  | 0  |              *(ip - 1) == *(candidate - 1)) { | 
1035  | 0  |         --ip;  | 
1036  | 0  |         --candidate;  | 
1037  | 0  |         ++candidate_length;  | 
1038  | 0  |       }  | 
1039  | 0  |       *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 1), mask) =  | 
1040  | 0  |           ip - base_ip + 1;  | 
1041  | 0  |       *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 2), mask) =  | 
1042  | 0  |           ip - base_ip + 2;  | 
1043  | 0  |       *TableEntry4ByteMatch(table, LittleEndian::Load32(ip + 1), mask) =  | 
1044  | 0  |           ip - base_ip + 1;  | 
1045  |  |       // Step 2: A 4-byte or 8-byte match has been found.  | 
1046  |  |       // We'll later see if more than 4 bytes match.  But, prior to the match,  | 
1047  |  |       // input bytes [next_emit, ip) are unmatched.  Emit them as  | 
1048  |  |       // "literal bytes."  | 
1049  | 0  |       assert(next_emit + 16 <= ip_end);  | 
1050  | 0  |       if (ip - next_emit > 0) { | 
1051  | 0  |         op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit,  | 
1052  | 0  |                                                    ip - next_emit);  | 
1053  | 0  |       }  | 
1054  |  |       // Step 3: Call EmitCopy, and then see if another EmitCopy could  | 
1055  |  |       // be our next move.  Repeat until we find no match for the  | 
1056  |  |       // input immediately after what was consumed by the last EmitCopy call.  | 
1057  |  |       //  | 
1058  |  |       // If we exit this loop normally then we need to call EmitLiteral next,  | 
1059  |  |       // though we don't yet know how big the literal will be.  We handle that  | 
1060  |  |       // by proceeding to the next iteration of the main loop.  We also can exit  | 
1061  |  |       // this loop via goto if we get close to exhausting the input.  | 
1062  | 0  |       do { | 
1063  |  |         // We have a 4-byte match at ip, and no need to emit any  | 
1064  |  |         // "literal bytes" prior to ip.  | 
1065  | 0  |         const char* base = ip;  | 
1066  | 0  |         ip += candidate_length;  | 
1067  | 0  |         size_t offset = base - candidate;  | 
1068  | 0  |         if (candidate_length < 12) { | 
1069  | 0  |           op =  | 
1070  | 0  |               EmitCopy</*len_less_than_12=*/true>(op, offset, candidate_length);  | 
1071  | 0  |         } else { | 
1072  | 0  |           op = EmitCopy</*len_less_than_12=*/false>(op, offset,  | 
1073  | 0  |                                                     candidate_length);  | 
1074  | 0  |         }  | 
1075  | 0  |         if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) { | 
1076  | 0  |           goto emit_remainder;  | 
1077  | 0  |         }  | 
1078  |  |         // We are now looking for a 4-byte match again.  We read  | 
1079  |  |         // table[Hash(ip, mask)] for that. To improve compression,  | 
1080  |  |         // we also update several previous table entries.  | 
1081  | 0  |         if (ip - base_ip > 7) { | 
1082  | 0  |           *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 7), mask) =  | 
1083  | 0  |               ip - base_ip - 7;  | 
1084  | 0  |           *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 4), mask) =  | 
1085  | 0  |               ip - base_ip - 4;  | 
1086  | 0  |         }  | 
1087  | 0  |         *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 3), mask) =  | 
1088  | 0  |             ip - base_ip - 3;  | 
1089  | 0  |         *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 2), mask) =  | 
1090  | 0  |             ip - base_ip - 2;  | 
1091  | 0  |         *TableEntry4ByteMatch(table, LittleEndian::Load32(ip - 2), mask) =  | 
1092  | 0  |             ip - base_ip - 2;  | 
1093  | 0  |         *TableEntry4ByteMatch(table, LittleEndian::Load32(ip - 1), mask) =  | 
1094  | 0  |             ip - base_ip - 1;  | 
1095  |  |  | 
1096  | 0  |         uint16_t* table_entry =  | 
1097  | 0  |             TableEntry8ByteMatch(table2, LittleEndian::Load64(ip), mask);  | 
1098  | 0  |         candidate = base_ip + *table_entry;  | 
1099  | 0  |         *table_entry = ip - base_ip;  | 
1100  | 0  |         if (LittleEndian::Load32(ip) == LittleEndian::Load32(candidate)) { | 
1101  | 0  |           candidate_length =  | 
1102  | 0  |               FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;  | 
1103  | 0  |           continue;  | 
1104  | 0  |         }  | 
1105  | 0  |         table_entry =  | 
1106  | 0  |             TableEntry4ByteMatch(table, LittleEndian::Load32(ip), mask);  | 
1107  | 0  |         candidate = base_ip + *table_entry;  | 
1108  | 0  |         *table_entry = ip - base_ip;  | 
1109  | 0  |         if (LittleEndian::Load32(ip) == LittleEndian::Load32(candidate)) { | 
1110  | 0  |           candidate_length =  | 
1111  | 0  |               FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;  | 
1112  | 0  |           continue;  | 
1113  | 0  |         }  | 
1114  | 0  |         break;  | 
1115  | 0  |       } while (true);  | 
1116  | 0  |     }  | 
1117  | 0  |   }  | 
1118  |  |  | 
1119  | 0  | emit_remainder:  | 
1120  |  |   // Emit the remaining bytes as a literal  | 
1121  | 0  |   if (ip < ip_end) { | 
1122  | 0  |     op = EmitLiteral</*allow_fast_path=*/false>(op, ip, ip_end - ip);  | 
1123  | 0  |   }  | 
1124  |  |  | 
1125  | 0  |   return op;  | 
1126  | 0  | }  | 
1127  |  | }  // end namespace internal  | 
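Note: CompressFragmentDoubleHash above maintains two hash tables over the current block: `table` is keyed on a 4-byte hash and `table2` on an 8-byte hash, so the stronger hash yields longer, more reliable candidates while the weaker one still catches short repeats. A minimal standalone sketch of that two-table candidate lookup (hash constants, table sizing, and bounds handling here are illustrative, not snappy's):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Illustrative multiplicative hashes; snappy's real constants and shifts differ.
    static inline uint32_t Hash4(const char* p, uint32_t mask) {
      uint32_t v;
      std::memcpy(&v, p, sizeof(v));
      return ((v * 0x9E3779B1u) >> 16) & mask;
    }

    static inline uint32_t Hash8(const char* p, uint32_t mask) {
      uint64_t v;
      std::memcpy(&v, p, sizeof(v));
      return static_cast<uint32_t>((v * 0x9E3779B97F4A7C15ull) >> 48) & mask;
    }

    // Returns the offset of a previous position whose first 4 bytes match `ip`,
    // or -1. The 8-byte table is consulted first because its hits are more
    // reliable; both tables are updated with the current position. The tables
    // must have (mask + 1) entries and positions must fit in uint16_t.
    static int FindCandidate(const char* base, const char* ip,
                             std::vector<uint16_t>& table4,
                             std::vector<uint16_t>& table8, uint32_t mask) {
      uint16_t* entry8 = &table8[Hash8(ip, mask)];
      int candidate = *entry8;
      *entry8 = static_cast<uint16_t>(ip - base);
      if (std::memcmp(base + candidate, ip, 4) == 0) return candidate;

      uint16_t* entry4 = &table4[Hash4(ip, mask)];
      candidate = *entry4;
      *entry4 = static_cast<uint16_t>(ip - base);
      if (std::memcmp(base + candidate, ip, 4) == 0) return candidate;
      return -1;
    }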
1128  |  |  | 
1129  |  | static inline void Report(int token, const char *algorithm, size_t  | 
1130  | 3.42k  | compressed_size, size_t uncompressed_size) { | 
1131  |  |   // TODO: Switch to [[maybe_unused]] when we can assume C++17.  | 
1132  | 3.42k  |   (void)token;  | 
1133  | 3.42k  |   (void)algorithm;  | 
1134  | 3.42k  |   (void)compressed_size;  | 
1135  | 3.42k  |   (void)uncompressed_size;  | 
1136  | 3.42k  | }  | 
1137  |  |  | 
1138  |  | // Signature of output types needed by decompression code.  | 
1139  |  | // The decompression code is templatized on a type that obeys this  | 
1140  |  | // signature so that we do not pay virtual function call overhead in  | 
1141  |  | // the middle of a tight decompression loop.  | 
1142  |  | //  | 
1143  |  | // class DecompressionWriter { | 
1144  |  | //  public:  | 
1145  |  | //   // Called before decompression  | 
1146  |  | //   void SetExpectedLength(size_t length);  | 
1147  |  | //  | 
1148  |  | //   // For performance a writer may choose to donate the cursor variable to the  | 
1149  |  | //   // decompression function. The decompression will inject it in all its  | 
1150  |  | //   // function calls to the writer. Keeping the important output cursor as a  | 
1151  |  | //   // function local stack variable allows the compiler to keep it in  | 
1152  |  | //   // register, which greatly aids performance by avoiding loads and stores of  | 
1153  |  | //   // this variable in the fast path loop iterations.  | 
1154  |  | //   T GetOutputPtr() const;  | 
1155  |  | //  | 
1156  |  | //   // At end of decompression the loop donates the ownership of the cursor  | 
1157  |  | //   // variable back to the writer by calling this function.  | 
1158  |  | //   void SetOutputPtr(T op);  | 
1159  |  | //  | 
1160  |  | //   // Called after decompression  | 
1161  |  | //   bool CheckLength() const;  | 
1162  |  | //  | 
1163  |  | //   // Called repeatedly during decompression  | 
1164  |  | //   // Each function gets a pointer to the op (output pointer), which the writer  | 
1165  |  | //   // can use and update. Note it's important that these functions get fully  | 
1166  |  | //   // inlined so that no actual address of the local variable needs to be  | 
1167  |  | //   // taken.  | 
1168  |  | //   bool Append(const char* ip, size_t length, T* op);  | 
1169  |  | //   bool AppendFromSelf(uint32_t offset, size_t length, T* op);  | 
1170  |  | //  | 
1171  |  | //   // The rules for how TryFastAppend differs from Append are somewhat  | 
1172  |  | //   // convoluted:  | 
1173  |  | //   //  | 
1174  |  | //   //  - TryFastAppend is allowed to decline (return false) at any  | 
1175  |  | //   //    time, for any reason -- just "return false" would be  | 
1176  |  | //   //    a perfectly legal implementation of TryFastAppend.  | 
1177  |  | //   //    The intention is for TryFastAppend to allow a fast path  | 
1178  |  | //   //    in the common case of a small append.  | 
1179  |  | //   //  - TryFastAppend is allowed to read up to <available> bytes  | 
1180  |  | //   //    from the input buffer, whereas Append is allowed to read  | 
1181  |  | //   //    <length>. However, if it returns true, it must leave  | 
1182  |  | //   //    at least five (kMaximumTagLength) bytes in the input buffer  | 
1183  |  | //   //    afterwards, so that there is always enough space to read the  | 
1184  |  | //   //    next tag without checking for a refill.  | 
1185  |  | //   //  - TryFastAppend must always decline (return false)  | 
1186  |  | //   //    if <length> is 61 or more, as in this case the literal length is not  | 
1187  |  | //   //    decoded fully. In practice, this should not be a big problem,  | 
1188  |  | //   //    as it is unlikely that one would implement a fast path accepting  | 
1189  |  | //   //    this much data.  | 
1190  |  | //   //  | 
1191  |  | //   bool TryFastAppend(const char* ip, size_t available, size_t length, T* op);  | 
1192  |  | // };  | 
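Note: for reference, a writer that satisfies the signature above and decodes into a flat buffer could look roughly like the following. This is a hypothetical PlainArrayWriter, not one of snappy's writers; it declines every TryFastAppend and keeps error handling to plain bounds checks:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Hypothetical writer satisfying the signature above with T = char*.
    class PlainArrayWriter {
      char* base_;
      char* limit_;
      char* op_;  // authoritative cursor while it is not donated to the caller

     public:
      PlainArrayWriter(char* dst, size_t capacity)
          : base_(dst), limit_(dst + capacity), op_(dst) {}

      void SetExpectedLength(size_t length) { limit_ = base_ + length; }
      char* GetOutputPtr() const { return op_; }
      void SetOutputPtr(char* op) { op_ = op; }
      bool CheckLength() const { return op_ == limit_; }

      bool Append(const char* ip, size_t length, char** op) {
        if (static_cast<size_t>(limit_ - *op) < length) return false;
        std::memcpy(*op, ip, length);
        *op += length;
        return true;
      }

      // Declining is always legal; Append then handles the literal.
      bool TryFastAppend(const char*, size_t, size_t, char**) { return false; }

      bool AppendFromSelf(uint32_t offset, size_t length, char** op) {
        if (offset == 0 || *op - base_ < static_cast<ptrdiff_t>(offset)) return false;
        if (static_cast<size_t>(limit_ - *op) < length) return false;
        char* dst = *op;
        const char* src = dst - offset;
        // Byte-wise copy so overlapping runs (offset < length) replicate patterns.
        for (size_t i = 0; i < length; ++i) dst[i] = src[i];
        *op += length;
        return true;
      }
    };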
1193  |  |  | 
1194  | 4.57k  | static inline uint32_t ExtractLowBytes(const uint32_t& v, int n) { | 
1195  | 4.57k  |   assert(n >= 0);  | 
1196  | 4.57k  |   assert(n <= 4);  | 
1197  |  | #if SNAPPY_HAVE_BMI2  | 
1198  |  |   return _bzhi_u32(v, 8 * n);  | 
1199  |  | #else  | 
1200  |  |   // This needs to be wider than uint32_t otherwise `mask << 32` will be  | 
1201  |  |   // undefined.  | 
1202  | 4.57k  |   uint64_t mask = 0xffffffff;  | 
1203  | 4.57k  |   return v & ~(mask << (8 * n));  | 
1204  | 4.57k  | #endif  | 
1205  | 4.57k  | }  | 
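Note: ExtractLowBytes(v, n) keeps the n least-significant bytes of v and clears the rest; the BMI2 path does the same with a single _bzhi_u32. A quick standalone check of the portable expression (a sketch, independent of the snappy build flags):

    #include <cassert>
    #include <cstdint>

    static uint32_t ExtractLowBytesPortable(uint32_t v, int n) {
      uint64_t mask = 0xffffffff;  // wider than 32 bits so n == 4 stays defined
      return v & ~(mask << (8 * n));
    }

    int main() {
      assert(ExtractLowBytesPortable(0xAABBCCDD, 0) == 0x00000000);
      assert(ExtractLowBytesPortable(0xAABBCCDD, 1) == 0x000000DD);
      assert(ExtractLowBytesPortable(0xAABBCCDD, 2) == 0x0000CCDD);
      assert(ExtractLowBytesPortable(0xAABBCCDD, 4) == 0xAABBCCDD);
      return 0;
    }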
1206  |  |  | 
1207  | 3.81k  | static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) { | 
1208  | 3.81k  |   assert(shift < 32);  | 
1209  | 3.81k  |   static const uint8_t masks[] = { | 
1210  | 3.81k  |       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //  | 
1211  | 3.81k  |       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //  | 
1212  | 3.81k  |       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //  | 
1213  | 3.81k  |       0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};  | 
1214  | 3.81k  |   return (value & masks[shift]) != 0;  | 
1215  | 3.81k  | }  | 
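Note: the mask table records, per shift amount, which bits of the byte would be shifted past bit 31; e.g. at shift == 28 the top four bits (0xf0) no longer fit in a uint32_t. The same predicate can be written arithmetically, which makes a convenient cross-check of the table (standalone sketch, not part of the library):

    #include <cassert>
    #include <cstdint>

    // True iff (value << shift) would lose set bits of an 8-bit value in 32 bits.
    static bool LeftShiftOverflowsRef(uint8_t value, uint32_t shift) {
      return shift > 24 && (value >> (32 - shift)) != 0;
    }

    int main() {
      for (uint32_t shift = 0; shift < 32; ++shift) {
        for (uint32_t value = 0; value < 256; ++value) {
          uint64_t wide = static_cast<uint64_t>(value) << shift;
          bool overflows = wide > 0xffffffffull;
          assert(LeftShiftOverflowsRef(static_cast<uint8_t>(value), shift) ==
                 overflows);
        }
      }
      return 0;
    }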
1216  |  |  | 
1217  | 0  | inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) { | 
1218  |  |   // TODO: Switch to [[maybe_unused]] when we can assume C++17.  | 
1219  | 0  |   (void)dst;  | 
1220  | 0  |   return offset != 0;  | 
1221  | 0  | }  | 
1222  |  |  | 
1223  |  | // Copies between size bytes and 64 bytes from src to dest.  size cannot exceed  | 
1224  |  | // 64.  More than size bytes, but never exceeding 64, might be copied if doing  | 
1225  |  | // so gives better performance.  [src, src + size) must not overlap with  | 
1226  |  | // [dst, dst + size), but [src, src + 64) may overlap with [dst, dst + 64).  | 
1227  | 10.3k  | void MemCopy64(char* dst, const void* src, size_t size) { | 
1228  |  |   // Always copy this many bytes.  If that's below size then copy the full 64.  | 
1229  | 10.3k  |   constexpr int kShortMemCopy = 32;  | 
1230  |  |  | 
1231  | 10.3k  |   assert(size <= 64);  | 
1232  | 10.3k  |   assert(std::less_equal<const void*>()(static_cast<const char*>(src) + size,  | 
1233  | 10.3k  |                                         dst) ||  | 
1234  | 10.3k  |          std::less_equal<const void*>()(dst + size, src));  | 
1235  |  |  | 
1236  |  |   // We know that src and dst are at least size bytes apart. However, because we  | 
1237  |  |   // might copy more than size bytes, the copy may still overlap past size,  | 
1238  |  |   // e.g. if src and dst appear consecutively in memory (src + size >= dst).  | 
1239  |  |   // TODO: Investigate wider copies on other platforms.  | 
1240  |  | #if defined(__x86_64__) && defined(__AVX__)  | 
1241  |  |   assert(kShortMemCopy <= 32);  | 
1242  |  |   __m256i data = _mm256_lddqu_si256(static_cast<const __m256i *>(src));  | 
1243  |  |   _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), data);  | 
1244  |  |   // Profiling shows that nearly all copies are short.  | 
1245  |  |   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) { | 
1246  |  |     data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);  | 
1247  |  |     _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);  | 
1248  |  |   }  | 
1249  |  | #else  | 
1250  | 10.3k  |   std::memmove(dst, src, kShortMemCopy);  | 
1251  |  |   // Profiling shows that nearly all copies are short.  | 
1252  | 10.3k  |   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) { | 
1253  | 5.58k  |     std::memmove(dst + kShortMemCopy,  | 
1254  | 5.58k  |                  static_cast<const uint8_t*>(src) + kShortMemCopy,  | 
1255  | 5.58k  |                  64 - kShortMemCopy);  | 
1256  | 5.58k  |   }  | 
1257  | 10.3k  | #endif  | 
1258  | 10.3k  | }  | 
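Note: because the callee may copy a full 32 or 64 bytes regardless of size, callers must provide 64 readable bytes at src and 64 writable bytes at dst; in the decompressor the slop bytes guarantee exactly that. A standalone sketch of the contract, using the same 32-byte-then-maybe-32-more pattern on plain memmove (buffer names are illustrative):

    #include <cassert>
    #include <cstddef>
    #include <cstring>

    int main() {
      // 64 meaningful bytes at most, plus 64 bytes of slop on both sides so an
      // over-copy of up to 64 bytes stays in bounds.
      char src[64 + 64] = {};
      char dst[64 + 64] = {};
      std::memset(src, 'x', 10);
      const size_t size = 10;  // only these bytes are meaningful

      // Stand-in for MemCopy64(dst, src, size): always move one 32-byte chunk,
      // and a second one only for the rare longer copies.
      std::memmove(dst, src, 32);
      if (size > 32) std::memmove(dst + 32, src + 32, 32);

      assert(std::memcmp(dst, src, size) == 0);  // first `size` bytes are what matter
      return 0;
    }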
1259  |  |  | 
1260  | 0  | void MemCopy64(ptrdiff_t dst, const void* src, size_t size) { | 
1261  |  |   // TODO: Switch to [[maybe_unused]] when we can assume C++17.  | 
1262  | 0  |   (void)dst;  | 
1263  | 0  |   (void)src;  | 
1264  | 0  |   (void)size;  | 
1265  | 0  | }  | 
1266  |  |  | 
1267  |  | void ClearDeferred(const void** deferred_src, size_t* deferred_length,  | 
1268  | 10.9k  |                    uint8_t* safe_source) { | 
1269  | 10.9k  |   *deferred_src = safe_source;  | 
1270  | 10.9k  |   *deferred_length = 0;  | 
1271  | 10.9k  | }  | 
1272  |  |  | 
1273  |  | void DeferMemCopy(const void** deferred_src, size_t* deferred_length,  | 
1274  | 9.44k  |                   const void* src, size_t length) { | 
1275  | 9.44k  |   *deferred_src = src;  | 
1276  | 9.44k  |   *deferred_length = length;  | 
1277  | 9.44k  | }  | 
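Note: together, ClearDeferred and DeferMemCopy form a one-entry write buffer: rather than copying a fragment immediately, the decompression loop records its source and length and flushes the pending copy just before the destination is written again, which lets consecutive small copies overlap their latency. A simplified standalone sketch of that pattern (the struct and Flush helper are illustrative, not snappy API):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    struct DeferredCopy {
      const void* src;
      size_t length;
      uint8_t safe_source[64];  // harmless target to point at when nothing is pending
    };

    static void Clear(DeferredCopy* d) {
      d->src = d->safe_source;
      d->length = 0;
    }

    static void Defer(DeferredCopy* d, const void* src, size_t length) {
      d->src = src;  // remember the copy instead of performing it now
      d->length = length;
    }

    // Flush the pending copy right before the destination is written again.
    static char* Flush(DeferredCopy* d, char* dst) {
      std::memcpy(dst, d->src, d->length);
      dst += d->length;
      Clear(d);
      return dst;
    }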
1278  |  |  | 
1279  |  | SNAPPY_ATTRIBUTE_ALWAYS_INLINE  | 
1280  | 0  | inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) { | 
1281  | 0  |   const uint8_t*& ip = *ip_p;  | 
1282  | 0  |   // This section is crucial for the throughput of the decompression loop.  | 
1283  | 0  |   // The latency of an iteration is fundamentally constrained by the  | 
1284  | 0  |   // following data chain on ip.  | 
1285  | 0  |   // ip -> c = Load(ip) -> delta1 = (c & 3)        -> ip += delta1 or delta2  | 
1286  | 0  |   //                       delta2 = ((c >> 2) + 1)    ip++  | 
1287  | 0  |   // This is different from X86 optimizations because ARM has conditional add  | 
1288  | 0  |   // instruction (csinc) and it removes several register moves.  | 
1289  | 0  |   const size_t tag_type = *tag & 3;  | 
1290  | 0  |   const bool is_literal = (tag_type == 0);  | 
1291  | 0  |   if (is_literal) { | 
1292  | 0  |     size_t next_literal_tag = (*tag >> 2) + 1;  | 
1293  | 0  |     *tag = ip[next_literal_tag];  | 
1294  | 0  |     ip += next_literal_tag + 1;  | 
1295  | 0  |   } else { | 
1296  | 0  |     *tag = ip[tag_type];  | 
1297  | 0  |     ip += tag_type + 1;  | 
1298  | 0  |   }  | 
1299  | 0  |   return tag_type;  | 
1300  | 0  | }  | 
1301  |  |  | 
1302  |  | SNAPPY_ATTRIBUTE_ALWAYS_INLINE  | 
1303  | 10.3k  | inline size_t AdvanceToNextTagX86Optimized(const uint8_t** ip_p, size_t* tag) { | 
1304  | 10.3k  |   const uint8_t*& ip = *ip_p;  | 
1305  |  |   // This section is crucial for the throughput of the decompression loop.  | 
1306  |  |   // The latency of an iteration is fundamentally constrained by the  | 
1307  |  |   // following data chain on ip.  | 
1308  |  |   // ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2  | 
1309  |  |   //                       ip2 = ip + 2 + (c >> 2)  | 
1310  |  |   // This amounts to 8 cycles.  | 
1311  |  |   // 5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)  | 
1312  | 10.3k  |   size_t literal_len = *tag >> 2;  | 
1313  | 10.3k  |   size_t tag_type = *tag;  | 
1314  | 10.3k  |   bool is_literal;  | 
1315  | 10.3k  | #if defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(__x86_64__)  | 
1316  |  |   // TODO clang misses the fact that the (c & 3) already correctly  | 
1317  |  |   // sets the zero flag.  | 
1318  | 10.3k  |   asm("and $3, %k[tag_type]\n\t" | 
1319  | 10.3k  |       : [tag_type] "+r"(tag_type), "=@ccz"(is_literal)  | 
1320  | 10.3k  |       :: "cc");  | 
1321  |  | #else  | 
1322  |  |   tag_type &= 3;  | 
1323  |  |   is_literal = (tag_type == 0);  | 
1324  |  | #endif  | 
1325  |  |   // TODO  | 
1326  |  |   // This code is subtle. Loading the values first and then doing the cmov has  | 
1327  |  |   // less latency than cmov'ing ip and then loading. However, clang would move  | 
1328  |  |   // the loads in an optimization phase; volatile prevents this transformation.  | 
1329  |  |   // Note that we have enough slop bytes (64) that the loads are always valid.  | 
1330  | 10.3k  |   size_t tag_literal =  | 
1331  | 10.3k  |       static_cast<const volatile uint8_t*>(ip)[1 + literal_len];  | 
1332  | 10.3k  |   size_t tag_copy = static_cast<const volatile uint8_t*>(ip)[tag_type];  | 
1333  | 10.3k  |   *tag = is_literal ? tag_literal : tag_copy;  | 
1334  | 10.3k  |   const uint8_t* ip_copy = ip + 1 + tag_type;  | 
1335  | 10.3k  |   const uint8_t* ip_literal = ip + 2 + literal_len;  | 
1336  | 10.3k  |   ip = is_literal ? ip_literal : ip_copy;  | 
1337  | 10.3k  | #if defined(__GNUC__) && defined(__x86_64__)  | 
1338  |  |   // TODO Clang is "optimizing" away the zero-extension (a totally free  | 
1339  |  |   // operation); this means that after the cmov of tag, it emits another movzb  | 
1340  |  |   // tag, byte(tag). It really matters as it's on the core chain. This dummy  | 
1341  |  |   // asm persuades clang to do the zero-extension at the load (it's automatic),  | 
1342  |  |   // removing the expensive movzb.  | 
1343  | 10.3k  |   asm("" ::"r"(tag_copy)); | 
1344  | 10.3k  | #endif  | 
1345  | 10.3k  |   return tag_type;  | 
1346  | 10.3k  | }  | 
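Note: stripped of the inline asm and the volatile loads (which only steer codegen), the tag-advance step reduces to a little pointer arithmetic. A portable restatement follows (a sketch; copy-4 tags take the exceptional exit in the caller, so the value computed for them here is never used):

    #include <cstddef>
    #include <cstdint>

    // On entry *ip_p points just past the current tag byte and *tag holds it.
    // On exit they describe the next tag; the current tag's type is returned.
    static size_t AdvanceToNextTagPortable(const uint8_t** ip_p, size_t* tag) {
      const uint8_t*& ip = *ip_p;
      const size_t tag_type = *tag & 3;  // 0 = literal, 1/2/3 = copy tags
      if (tag_type == 0) {
        // Small literal: (tag >> 2) + 1 payload bytes follow, then the next tag.
        const size_t literal_len = (*tag >> 2) + 1;
        *tag = ip[literal_len];
        ip += literal_len + 1;
      } else {
        // Copy-1/copy-2 carry 1 or 2 offset bytes; the next tag follows them.
        *tag = ip[tag_type];
        ip += tag_type + 1;
      }
      return tag_type;
    }

The invariant throughout is that ip always points one byte past the tag it describes, which is why the caller asserts tag == ip[-1].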
1347  |  |  | 
1348  |  | // Extract the offset for copy-1 and copy-2 returns 0 for literals or copy-4.  | 
1349  | 10.3k  | inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) { | 
1350  |  |   // For x86 non-static storage works better. For ARM static storage is better.  | 
1351  |  |   // TODO: Once the array is recognized as a register, improve the  | 
1352  |  |   // readability for x86.  | 
1353  | 10.3k  | #if defined(__x86_64__)  | 
1354  | 10.3k  |   constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;  | 
1355  | 10.3k  |   uint16_t result;  | 
1356  | 10.3k  |   memcpy(&result,  | 
1357  | 10.3k  |          reinterpret_cast<const char*>(&kExtractMasksCombined) + 2 * tag_type,  | 
1358  | 10.3k  |          sizeof(result));  | 
1359  | 10.3k  |   return val & result;  | 
1360  |  | #elif defined(__aarch64__)  | 
1361  |  |   constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;  | 
1362  |  |   return val & static_cast<uint32_t>(  | 
1363  |  |       (kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF);  | 
1364  |  | #else  | 
1365  |  |   static constexpr uint32_t kExtractMasks[4] = {0, 0xFF, 0xFFFF, 0}; | 
1366  |  |   return val & kExtractMasks[tag_type];  | 
1367  |  | #endif  | 
1368  | 10.3k  | };  | 
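Note: all three variants pull a 16-bit mask out of the packed constant 0x0000FFFF00FF0000: tag types 0 (literal) and 3 (copy-4) select 0x0000, copy-1 selects 0x00FF for its one-byte offset, and copy-2 selects 0xFFFF for its two-byte offset. A quick standalone check of that equivalence on a little-endian machine:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    int main() {
      constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
      const uint32_t kExtractMasks[4] = {0, 0xFF, 0xFFFF, 0};
      for (size_t tag_type = 0; tag_type < 4; ++tag_type) {
        uint16_t mask16;
        std::memcpy(&mask16,
                    reinterpret_cast<const char*>(&kExtractMasksCombined) +
                        2 * tag_type,
                    sizeof(mask16));
        // Both forms select the same mask for every tag type (little-endian).
        assert(mask16 == kExtractMasks[tag_type]);
      }
      return 0;
    }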
1369  |  |  | 
1370  |  | // Core decompression loop, when there is enough data available.  | 
1371  |  | // Decompresses the input buffer [ip, ip_limit) into the output buffer  | 
1372  |  | // [op, op_limit_min_slop). It returns when we are too close to the end of the  | 
1373  |  | // input buffer, when we exceed op_limit_min_slop, or when an exceptional tag  | 
1374  |  | // is encountered (a literal of length > 60, or a copy-4).  | 
1375  |  | // Returns {ip, op} at the points it stopped decoding. | 
1376  |  | // TODO This function probably does not need to be inlined, as it  | 
1377  |  | // should decode large chunks at a time. This allows runtime dispatch to  | 
1378  |  | // implementations based on CPU capability (BMI2 / perhaps 32 / 64 byte memcpy).  | 
1379  |  | template <typename T>  | 
1380  |  | std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(  | 
1381  |  |     const uint8_t* ip, const uint8_t* ip_limit, ptrdiff_t op, T op_base,  | 
1382  | 10.0k  |     ptrdiff_t op_limit_min_slop) { | 
1383  |  |   // If deferred_src is invalid point it here.  | 
1384  | 10.0k  |   uint8_t safe_source[64];  | 
1385  | 10.0k  |   const void* deferred_src;  | 
1386  | 10.0k  |   size_t deferred_length;  | 
1387  | 10.0k  |   ClearDeferred(&deferred_src, &deferred_length, safe_source);  | 
1388  |  |  | 
1389  |  |   // We unroll the inner loop twice so we need twice the spare room.  | 
1390  | 10.0k  |   op_limit_min_slop -= kSlopBytes;  | 
1391  | 10.0k  |   if (2 * (kSlopBytes + 1) < ip_limit - ip && op < op_limit_min_slop) { | 
1392  | 372  |     const uint8_t* const ip_limit_min_slop = ip_limit - 2 * kSlopBytes - 1;  | 
1393  | 372  |     ip++;  | 
1394  |  |     // ip points just past the tag and we are touching at maximum kSlopBytes  | 
1395  |  |     // in an iteration.  | 
1396  | 372  |     size_t tag = ip[-1];  | 
1397  |  | #if defined(__clang__) && defined(__aarch64__)  | 
1398  |  |     // Workaround for https://bugs.llvm.org/show_bug.cgi?id=51317  | 
1399  |  |     // when loading 1 byte, clang for aarch64 doesn't realize that it (ldrb)  | 
1400  |  |     // comes with free zero-extension, so clang generates another  | 
1401  |  |     // 'and xn, xm, 0xff' before it uses that as the offset. This 'and' is  | 
1402  |  |     // redundant and can be removed by adding this dummy asm, which gives  | 
1403  |  |     // clang a hint that we're doing the zero-extension at the load.  | 
1404  |  |     asm("" ::"r"(tag)); | 
1405  |  | #endif  | 
1406  | 5.29k  |     do { | 
1407  |  |       // The throughput is limited by instructions; unrolling the inner loop  | 
1408  |  |       // twice reduces the number of instructions checking limits and also  | 
1409  |  |       // leads to fewer mov's.  | 
1410  |  |  | 
1411  | 5.29k  |       SNAPPY_PREFETCH(ip + 128);  | 
1412  | 15.4k  |       for (int i = 0; i < 2; i++) { | 
1413  | 10.3k  |         const uint8_t* old_ip = ip;  | 
1414  | 10.3k  |         assert(tag == ip[-1]);  | 
1415  |  |         // For literals tag_type = 0, hence we will always obtain 0 from  | 
1416  |  |         // ExtractLowBytes. For literals offset will thus be kLiteralOffset.  | 
1417  | 10.3k  |         ptrdiff_t len_minus_offset = kLengthMinusOffset[tag];  | 
1418  | 10.3k  |         uint32_t next;  | 
1419  |  | #if defined(__aarch64__)  | 
1420  |  |         size_t tag_type = AdvanceToNextTagARMOptimized(&ip, &tag);  | 
1421  |  |         // We never need more than 16 bits. Doing a Load16 allows the compiler  | 
1422  |  |         // to elide the masking operation in ExtractOffset.  | 
1423  |  |         next = LittleEndian::Load16(old_ip);  | 
1424  |  | #else  | 
1425  | 10.3k  |         size_t tag_type = AdvanceToNextTagX86Optimized(&ip, &tag);  | 
1426  | 10.3k  |         next = LittleEndian::Load32(old_ip);  | 
1427  | 10.3k  | #endif  | 
1428  | 10.3k  |         size_t len = len_minus_offset & 0xFF;  | 
1429  | 10.3k  |         ptrdiff_t extracted = ExtractOffset(next, tag_type);  | 
1430  | 10.3k  |         ptrdiff_t len_min_offset = len_minus_offset - extracted;  | 
1431  | 10.3k  |         if (SNAPPY_PREDICT_FALSE(len_minus_offset > extracted)) { | 
1432  | 880  |           if (SNAPPY_PREDICT_FALSE(len & 0x80)) { | 
1433  |  |             // Exceptional case (long literal or copy 4).  | 
1434  |  |             // Actually doing the copy here is negatively impacting the main  | 
1435  |  |             // loop due to compiler incorrectly allocating a register for  | 
1436  |  |             // this fallback. Hence we just break.  | 
1437  | 288  |           break_loop:  | 
1438  | 288  |             ip = old_ip;  | 
1439  | 288  |             goto exit;  | 
1440  | 200  |           }  | 
1441  |  |           // Only copy-1 or copy-2 tags can get here.  | 
1442  | 680  |           assert(tag_type == 1 || tag_type == 2);  | 
1443  | 680  |           std::ptrdiff_t delta = (op + deferred_length) + len_min_offset - len;  | 
1444  |  |           // Guard against copies before the buffer start.  | 
1445  |  |           // Execute any deferred MemCopy since we write to dst here.  | 
1446  | 680  |           MemCopy64(op_base + op, deferred_src, deferred_length);  | 
1447  | 680  |           op += deferred_length;  | 
1448  | 680  |           ClearDeferred(&deferred_src, &deferred_length, safe_source);  | 
1449  | 680  |           if (SNAPPY_PREDICT_FALSE(delta < 0 ||  | 
1450  | 680  |                                   !Copy64BytesWithPatternExtension(  | 
1451  | 680  |                                       op_base + op, len - len_min_offset))) { | 
1452  | 24  |             goto break_loop;  | 
1453  | 24  |           }  | 
1454  |  |           // We aren't deferring this copy so add length right away.  | 
1455  | 656  |           op += len;  | 
1456  | 656  |           continue;  | 
1457  | 680  |         }  | 
1458  | 9.51k  |         std::ptrdiff_t delta = (op + deferred_length) + len_min_offset - len;  | 
1459  | 9.51k  |         if (SNAPPY_PREDICT_FALSE(delta < 0)) { | 
1460  |  |           // Due to the spurious offset that literals have, this will trigger  | 
1461  |  |           // at the start of a block when op is still smaller than 256.  | 
1462  | 1.44k  |           if (tag_type != 0) goto break_loop;  | 
1463  | 1.38k  |           MemCopy64(op_base + op, deferred_src, deferred_length);  | 
1464  | 1.38k  |           op += deferred_length;  | 
1465  | 1.38k  |           DeferMemCopy(&deferred_src, &deferred_length, old_ip, len);  | 
1466  | 1.38k  |           continue;  | 
1467  | 1.44k  |         }  | 
1468  |  |  | 
1469  |  |         // For copies we need to copy from op_base + delta, for literals  | 
1470  |  |         // we need to copy from ip instead of from the stream.  | 
1471  | 8.06k  |         const void* from =  | 
1472  | 8.06k  |             tag_type ? reinterpret_cast<void*>(op_base + delta) : old_ip;  | 
1473  | 8.06k  |         MemCopy64(op_base + op, deferred_src, deferred_length);  | 
1474  | 8.06k  |         op += deferred_length;  | 
1475  | 8.06k  |         DeferMemCopy(&deferred_src, &deferred_length, from, len);  | 
1476  | 8.06k  |       }  | 
1477  | 5.29k  |     } while (ip < ip_limit_min_slop &&  | 
1478  | 5.00k  |              static_cast<ptrdiff_t>(op + deferred_length) < op_limit_min_slop);  | 
1479  | 372  |   exit:  | 
1480  | 372  |     ip--;  | 
1481  | 372  |     assert(ip <= ip_limit);  | 
1482  | 372  |   }  | 
1483  |  |   // If we deferred a copy then we can perform it now.  If we are up to date  | 
1484  |  |   // then we might not have enough slop bytes and could run past the end.  | 
1485  | 10.0k  |   if (deferred_length) { | 
1486  | 240  |     MemCopy64(op_base + op, deferred_src, deferred_length);  | 
1487  | 240  |     op += deferred_length;  | 
1488  | 240  |     ClearDeferred(&deferred_src, &deferred_length, safe_source);  | 
1489  | 240  |   }  | 
1490  | 10.0k  |   return {ip, op}; | 
1491  | 10.0k  | }  | 
 std::__1::pair<unsigned char const*, long> snappy::DecompressBranchless<char*>(unsigned char const*, unsigned char const*, long, char*, long)
 Unexecuted instantiation: std::__1::pair<unsigned char const*, long> snappy::DecompressBranchless<unsigned long>(unsigned char const*, unsigned char const*, long, unsigned long, long)  | 
1492  |  |  | 
1493  |  | // Helper class for decompression  | 
1494  |  | class SnappyDecompressor { | 
1495  |  |  private:  | 
1496  |  |   Source* reader_;        // Underlying source of bytes to decompress  | 
1497  |  |   const char* ip_;        // Points to next buffered byte  | 
1498  |  |   const char* ip_limit_;  // Points just past buffered bytes  | 
1499  |  |   // If ip < ip_limit_min_maxtaglen_ it's safe to read kMaxTagLength from  | 
1500  |  |   // buffer.  | 
1501  |  |   const char* ip_limit_min_maxtaglen_;  | 
1502  |  |   uint64_t peeked_;                  // Bytes peeked from reader (need to skip)  | 
1503  |  |   bool eof_;                         // Hit end of input without an error?  | 
1504  |  |   char scratch_[kMaximumTagLength];  // See RefillTag().  | 
1505  |  |  | 
1506  |  |   // Ensure that all of the tag metadata for the next tag is available  | 
1507  |  |   // in [ip_..ip_limit_-1].  Also ensures that [ip,ip+4] is readable even  | 
1508  |  |   // if (ip_limit_ - ip_ < 5).  | 
1509  |  |   //  | 
1510  |  |   // Returns true on success, false on error or end of input.  | 
1511  |  |   bool RefillTag();  | 
1512  |  |  | 
1513  | 7.25k  |   void ResetLimit(const char* ip) { | 
1514  | 7.25k  |     ip_limit_min_maxtaglen_ =  | 
1515  | 7.25k  |         ip_limit_ - std::min<ptrdiff_t>(ip_limit_ - ip, kMaximumTagLength - 1);  | 
1516  | 7.25k  |   }  | 
1517  |  |  | 
1518  |  |  public:  | 
1519  |  |   explicit SnappyDecompressor(Source* reader)  | 
1520  | 3.42k  |       : reader_(reader), ip_(NULL), ip_limit_(NULL), peeked_(0), eof_(false) {} | 
1521  |  |  | 
1522  | 3.42k  |   ~SnappyDecompressor() { | 
1523  |  |     // Advance past any bytes we peeked at from the reader  | 
1524  | 3.42k  |     reader_->Skip(peeked_);  | 
1525  | 3.42k  |   }  | 
1526  |  |  | 
1527  |  |   // Returns true iff we have hit the end of the input without an error.  | 
1528  | 3.42k  |   bool eof() const { return eof_; } | 
1529  |  |  | 
1530  |  |   // Read the uncompressed length stored at the start of the compressed data.  | 
1531  |  |   // On success, stores the length in *result and returns true.  | 
1532  |  |   // On failure, returns false.  | 
1533  | 3.42k  |   bool ReadUncompressedLength(uint32_t* result) { | 
1534  | 3.42k  |     assert(ip_ == NULL);  // Must not have read anything yet  | 
1535  |  |     // Length is encoded in 1..5 bytes  | 
1536  | 3.42k  |     *result = 0;  | 
1537  | 3.42k  |     uint32_t shift = 0;  | 
1538  | 3.81k  |     while (true) { | 
1539  | 3.81k  |       if (shift >= 32) return false;  | 
1540  | 3.81k  |       size_t n;  | 
1541  | 3.81k  |       const char* ip = reader_->Peek(&n);  | 
1542  | 3.81k  |       if (n == 0) return false;  | 
1543  | 3.81k  |       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));  | 
1544  | 3.81k  |       reader_->Skip(1);  | 
1545  | 3.81k  |       uint32_t val = c & 0x7f;  | 
1546  | 3.81k  |       if (LeftShiftOverflows(static_cast<uint8_t>(val), shift)) return false;  | 
1547  | 3.81k  |       *result |= val << shift;  | 
1548  | 3.81k  |       if (c < 128) { | 
1549  | 3.42k  |         break;  | 
1550  | 3.42k  |       }  | 
1551  | 384  |       shift += 7;  | 
1552  | 384  |     }  | 
1553  | 3.42k  |     return true;  | 
1554  | 3.42k  |   }  | 
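Note: ReadUncompressedLength is a bounded little-endian base-128 (varint) decoder: each byte contributes 7 payload bits, the high bit means "more bytes follow", at most 5 bytes are accepted, and LeftShiftOverflows rejects a fifth byte that would spill past 32 bits. A standalone sketch of the same decoding over a flat buffer (helper name and signature are illustrative):

    #include <cstddef>
    #include <cstdint>

    // Standalone sketch of the 1..5 byte varint32 format used for the
    // uncompressed length: 7 payload bits per byte, MSB set means "more bytes".
    // Returns the number of bytes consumed, or 0 on malformed/overflowing input.
    static size_t DecodeVarint32(const uint8_t* p, size_t n, uint32_t* result) {
      uint32_t value = 0;
      for (size_t i = 0; i < n && i < 5; ++i) {
        uint32_t byte = p[i];
        // The fifth byte may only contribute 4 bits (32 = 4 * 7 + 4).
        if (i == 4 && (byte & 0x7f) > 0x0f) return 0;
        value |= (byte & 0x7f) << (7 * i);
        if ((byte & 0x80) == 0) {
          *result = value;
          return i + 1;
        }
      }
      return 0;  // ran out of input or more than 5 bytes
    }

For example, the byte sequence FE FF 7F decodes to 0x1FFFFE and consumes three bytes.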
1555  |  |  | 
1556  |  |   // Process the next item found in the input.  | 
1557  |  |   // Returns true if successful, false on error or end of input.  | 
1558  |  |   template <class Writer>  | 
1559  |  | #if defined(__GNUC__) && defined(__x86_64__)  | 
1560  |  |   __attribute__((aligned(32)))  | 
1561  |  | #endif  | 
1562  |  |   void  | 
1563  | 3.42k  |   DecompressAllTags(Writer* writer) { | 
1564  | 3.42k  |     const char* ip = ip_;  | 
1565  | 3.42k  |     ResetLimit(ip);  | 
1566  | 3.42k  |     auto op = writer->GetOutputPtr();  | 
1567  |  |     // We could have put this refill fragment only at the beginning of the loop.  | 
1568  |  |     // However, duplicating it at the end of each branch gives the compiler more  | 
1569  |  |     // scope to optimize the <ip_limit_ - ip> expression based on the local  | 
1570  |  |     // context, which overall increases speed.  | 
1571  | 3.42k  | #define MAYBE_REFILL()                                      \  | 
1572  | 17.5k  |   if (SNAPPY_PREDICT_FALSE(ip >= ip_limit_min_maxtaglen_)) { \ | 
1573  | 6.82k  |     ip_ = ip;                                               \  | 
1574  | 6.82k  |     if (SNAPPY_PREDICT_FALSE(!RefillTag())) goto exit;       \  | 
1575  | 6.82k  |     ip = ip_;                                               \  | 
1576  | 3.82k  |     ResetLimit(ip);                                         \  | 
1577  | 3.82k  |   }                                                         \  | 
1578  | 17.5k  |   preload = static_cast<uint8_t>(*ip)  | 
1579  |  |  | 
1580  |  |     // At the start of the for loop below the least significant byte of preload  | 
1581  |  |     // contains the tag.  | 
1582  | 3.42k  |     uint32_t preload;  | 
1583  | 3.42k  |     MAYBE_REFILL();  | 
1584  | 10.0k  |     for (;;) { | 
1585  | 10.0k  |       { | 
1586  | 10.0k  |         ptrdiff_t op_limit_min_slop;  | 
1587  | 10.0k  |         auto op_base = writer->GetBase(&op_limit_min_slop);  | 
1588  | 10.0k  |         if (op_base) { | 
1589  | 10.0k  |           auto res =  | 
1590  | 10.0k  |               DecompressBranchless(reinterpret_cast<const uint8_t*>(ip),  | 
1591  | 10.0k  |                                    reinterpret_cast<const uint8_t*>(ip_limit_),  | 
1592  | 10.0k  |                                    op - op_base, op_base, op_limit_min_slop);  | 
1593  | 10.0k  |           ip = reinterpret_cast<const char*>(res.first);  | 
1594  | 10.0k  |           op = op_base + res.second;  | 
1595  | 10.0k  |           MAYBE_REFILL();  | 
1596  | 10.0k  |         }  | 
1597  | 10.0k  |       }  | 
1598  | 10.0k  |       const uint8_t c = static_cast<uint8_t>(preload);  | 
1599  | 10.0k  |       ip++;  | 
1600  |  |  | 
1601  |  |       // Ratio of iterations that have LITERAL vs non-LITERAL for different  | 
1602  |  |       // inputs.  | 
1603  |  |       //  | 
1604  |  |       // input          LITERAL  NON_LITERAL  | 
1605  |  |       // -----------------------------------  | 
1606  |  |       // html|html4|cp   23%        77%  | 
1607  |  |       // urls            36%        64%  | 
1608  |  |       // jpg             47%        53%  | 
1609  |  |       // pdf             19%        81%  | 
1610  |  |       // txt[1-4]        25%        75%  | 
1611  |  |       // pb              24%        76%  | 
1612  |  |       // bin             24%        76%  | 
1613  | 10.0k  |       if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) { | 
1614  | 5.31k  |         size_t literal_length = (c >> 2) + 1u;  | 
1615  | 5.31k  |         if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length, &op)) { | 
1616  | 1.46k  |           assert(literal_length < 61);  | 
1617  | 1.46k  |           ip += literal_length;  | 
1618  |  |           // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()  | 
1619  |  |           // will not return true unless there's already at least five spare  | 
1620  |  |           // bytes in addition to the literal.  | 
1621  | 1.46k  |           preload = static_cast<uint8_t>(*ip);  | 
1622  | 1.46k  |           continue;  | 
1623  | 1.46k  |         }  | 
1624  | 3.84k  |         if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) { | 
1625  |  |           // Long literal.  | 
1626  | 128  |           const size_t literal_length_length = literal_length - 60;  | 
1627  | 128  |           literal_length =  | 
1628  | 128  |               ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +  | 
1629  | 128  |               1;  | 
1630  | 128  |           ip += literal_length_length;  | 
1631  | 128  |         }  | 
1632  |  |  | 
1633  | 3.84k  |         size_t avail = ip_limit_ - ip;  | 
1634  | 3.84k  |         while (avail < literal_length) { | 
1635  | 24  |           if (!writer->Append(ip, avail, &op)) goto exit;  | 
1636  | 16  |           literal_length -= avail;  | 
1637  | 16  |           reader_->Skip(peeked_);  | 
1638  | 16  |           size_t n;  | 
1639  | 16  |           ip = reader_->Peek(&n);  | 
1640  | 16  |           avail = n;  | 
1641  | 16  |           peeked_ = avail;  | 
1642  | 16  |           if (avail == 0) goto exit;  | 
1643  | 0  |           ip_limit_ = ip + avail;  | 
1644  | 0  |           ResetLimit(ip);  | 
1645  | 0  |         }  | 
1646  | 3.82k  |         if (!writer->Append(ip, literal_length, &op)) goto exit;  | 
1647  | 3.73k  |         ip += literal_length;  | 
1648  | 3.73k  |         MAYBE_REFILL();  | 
1649  | 4.72k  |       } else { | 
1650  | 4.72k  |         if (SNAPPY_PREDICT_FALSE((c & 3) == COPY_4_BYTE_OFFSET)) { | 
1651  | 276  |           const size_t copy_offset = LittleEndian::Load32(ip);  | 
1652  | 276  |           const size_t length = (c >> 2) + 1;  | 
1653  | 276  |           ip += 4;  | 
1654  |  |  | 
1655  | 276  |           if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;  | 
1656  | 4.44k  |         } else { | 
1657  | 4.44k  |           const ptrdiff_t entry = kLengthMinusOffset[c];  | 
1658  | 4.44k  |           preload = LittleEndian::Load32(ip);  | 
1659  | 4.44k  |           const uint32_t trailer = ExtractLowBytes(preload, c & 3);  | 
1660  | 4.44k  |           const uint32_t length = entry & 0xff;  | 
1661  | 4.44k  |           assert(length > 0);  | 
1662  |  |  | 
1663  |  |           // copy_offset/256 is encoded in bits 8..10.  By just fetching  | 
1664  |  |           // those bits, we get copy_offset (since the bit-field starts at  | 
1665  |  |           // bit 8).  | 
1666  | 4.44k  |           const uint32_t copy_offset = trailer - entry + length;  | 
1667  | 4.44k  |           if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;  | 
1668  |  |  | 
1669  | 4.24k  |           ip += (c & 3);  | 
1670  |  |           // By using the result of the previous load we reduce the critical  | 
1671  |  |           // dependency chain of ip to 4 cycles.  | 
1672  | 4.24k  |           preload >>= (c & 3) * 8;  | 
1673  | 4.24k  |           if (ip < ip_limit_min_maxtaglen_) continue;  | 
1674  | 4.24k  |         }  | 
1675  | 592  |         MAYBE_REFILL();  | 
1676  | 592  |       }  | 
1677  | 10.0k  |     }  | 
1678  | 0  | #undef MAYBE_REFILL  | 
1679  | 3.42k  |   exit:  | 
1680  | 3.42k  |     writer->SetOutputPtr(op);  | 
1681  | 3.42k  |   }  | 
 Unexecuted instantiation: void snappy::SnappyDecompressor::DecompressAllTags<snappy::SnappyIOVecWriter>(snappy::SnappyIOVecWriter*)
 Unexecuted instantiation: void snappy::SnappyDecompressor::DecompressAllTags<snappy::SnappyDecompressionValidator>(snappy::SnappyDecompressionValidator*)
 void snappy::SnappyDecompressor::DecompressAllTags<snappy::SnappyArrayWriter>(snappy::SnappyArrayWriter*)
 Unexecuted instantiation: void snappy::SnappyDecompressor::DecompressAllTags<snappy::SnappyScatteredWriter<snappy::SnappySinkAllocator> >(snappy::SnappyScatteredWriter<snappy::SnappySinkAllocator>*)  | 
1682  |  | };  | 
1683  |  |  | 
1684  | 3.85k  | constexpr uint32_t CalculateNeeded(uint8_t tag) { | 
1685  | 3.85k  |   return ((tag & 3) == 0 && tag >= (60 * 4))  | 
1686  | 3.85k  |              ? (tag >> 2) - 58  | 
1687  | 3.85k  |              : (0x05030201 >> ((tag * 8) & 31)) & 0xFF;  | 
1688  | 3.85k  | }  | 
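Note: the packed constant 0x05030201, read one byte at a time by tag type, gives the number of tag-metadata bytes (the tag itself plus its trailing offset/length bytes): 1 for ordinary literals, 2 for copy-1, 3 for copy-2, and 5 for copy-4; long literals (length codes 60..63) instead need (tag >> 2) - 58, i.e. 2..5 bytes. A quick standalone check of the byte extraction:

    #include <cassert>
    #include <cstdint>

    int main() {
      // (0x05030201 >> ((tag * 8) & 31)) & 0xFF picks one byte based on tag & 3:
      // tag type 0 -> 1, 1 -> 2, 2 -> 3, 3 -> 5 bytes of tag metadata.
      const uint32_t packed = 0x05030201;
      const uint32_t expected[4] = {1, 2, 3, 5};
      for (uint32_t tag = 0; tag < 256; ++tag) {
        if ((tag & 3) == 0 && tag >= 60 * 4) continue;  // long literals take the other branch
        assert(((packed >> ((tag * 8) & 31)) & 0xFF) == expected[tag & 3]);
      }
      return 0;
    }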
1689  |  |  | 
1690  |  | #if __cplusplus >= 201402L  | 
1691  | 0  | constexpr bool VerifyCalculateNeeded() { | 
1692  | 0  |   for (int i = 0; i < 1; i++) { | 
1693  | 0  |     if (CalculateNeeded(i) != static_cast<uint32_t>((char_table[i] >> 11)) + 1)  | 
1694  | 0  |       return false;  | 
1695  | 0  |   }  | 
1696  | 0  |   return true;  | 
1697  | 0  | }  | 
1698  |  |  | 
1699  |  | // Make sure CalculateNeeded is correct by verifying it against the established  | 
1700  |  | // table encoding the number of added bytes needed.  | 
1701  |  | static_assert(VerifyCalculateNeeded(), "");  | 
1702  |  | #endif  // c++14  | 
1703  |  |  | 
1704  | 6.82k  | bool SnappyDecompressor::RefillTag() { | 
1705  | 6.82k  |   const char* ip = ip_;  | 
1706  | 6.82k  |   if (ip == ip_limit_) { | 
1707  |  |     // Fetch a new fragment from the reader  | 
1708  | 6.38k  |     reader_->Skip(peeked_);  // All peeked bytes are used up  | 
1709  | 6.38k  |     size_t n;  | 
1710  | 6.38k  |     ip = reader_->Peek(&n);  | 
1711  | 6.38k  |     peeked_ = n;  | 
1712  | 6.38k  |     eof_ = (n == 0);  | 
1713  | 6.38k  |     if (eof_) return false;  | 
1714  | 3.40k  |     ip_limit_ = ip + n;  | 
1715  | 3.40k  |   }  | 
1716  |  |  | 
1717  |  |   // Read the tag character  | 
1718  | 3.85k  |   assert(ip < ip_limit_);  | 
1719  | 3.85k  |   const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));  | 
1720  |  |   // At this point make sure that the data for the next tag is consecutive.  | 
1721  |  |   // For copy 1 this means the next 2 bytes (tag and 1 byte offset)  | 
1722  |  |   // For copy 2 the next 3 bytes (tag and 2 byte offset)  | 
1723  |  |   // For copy 4 the next 5 bytes (tag and 4 byte offset)  | 
1724  |  |   // For all small literals we only need 1 byte, but for literals 60...63 the  | 
1725  |  |   // length is encoded in 1...4 extra bytes.  | 
1726  | 3.85k  |   const uint32_t needed = CalculateNeeded(c);  | 
1727  | 3.85k  |   assert(needed <= sizeof(scratch_));  | 
1728  |  |  | 
1729  |  |   // Read more bytes from reader if needed  | 
1730  | 3.85k  |   uint64_t nbuf = ip_limit_ - ip;  | 
1731  | 3.85k  |   if (nbuf < needed) { | 
1732  |  |     // Stitch together bytes from ip and reader to form the word  | 
1733  |  |     // contents.  We store the needed bytes in "scratch_".  They  | 
1734  |  |     // will be consumed immediately by the caller since we do not  | 
1735  |  |     // read more than we need.  | 
1736  | 24  |     std::memmove(scratch_, ip, nbuf);  | 
1737  | 24  |     reader_->Skip(peeked_);  // All peeked bytes are used up  | 
1738  | 24  |     peeked_ = 0;  | 
1739  | 24  |     while (nbuf < needed) { | 
1740  | 24  |       size_t length;  | 
1741  | 24  |       const char* src = reader_->Peek(&length);  | 
1742  | 24  |       if (length == 0) return false;  | 
1743  | 0  |       uint64_t to_add = std::min<uint64_t>(needed - nbuf, length);  | 
1744  | 0  |       std::memcpy(scratch_ + nbuf, src, to_add);  | 
1745  | 0  |       nbuf += to_add;  | 
1746  | 0  |       reader_->Skip(to_add);  | 
1747  | 0  |     }  | 
1748  | 0  |     assert(nbuf == needed);  | 
1749  | 0  |     ip_ = scratch_;  | 
1750  | 0  |     ip_limit_ = scratch_ + needed;  | 
1751  | 3.82k  |   } else if (nbuf < kMaximumTagLength) { | 
1752  |  |     // Have enough bytes, but move into scratch_ so that we do not  | 
1753  |  |     // read past end of input  | 
1754  | 544  |     std::memmove(scratch_, ip, nbuf);  | 
1755  | 544  |     reader_->Skip(peeked_);  // All peeked bytes are used up  | 
1756  | 544  |     peeked_ = 0;  | 
1757  | 544  |     ip_ = scratch_;  | 
1758  | 544  |     ip_limit_ = scratch_ + nbuf;  | 
1759  | 3.28k  |   } else { | 
1760  |  |     // Pass pointer to buffer returned by reader_.  | 
1761  | 3.28k  |     ip_ = ip;  | 
1762  | 3.28k  |   }  | 
1763  | 3.82k  |   return true;  | 
1764  | 3.85k  | }  | 
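The slow path above stitches a tag that straddles input fragments into scratch_ so the decompression loop can always read a tag and its operands from contiguous memory. A simplified, self-contained sketch of that gathering step; GatherIntoScratch and its parameters are illustrative, not members of SnappyDecompressor:

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <utility>
#include <vector>

// Illustrative only: gather `needed` bytes that may span several input
// fragments into one contiguous scratch buffer, the way RefillTag() above
// stitches a boundary-straddling tag into scratch_.
bool GatherIntoScratch(
    const std::vector<std::pair<const char*, size_t>>& fragments,
    size_t start_fragment, size_t start_offset, size_t needed, char* scratch) {
  size_t have = 0;
  for (size_t i = start_fragment; i < fragments.size() && have < needed; ++i) {
    size_t skip = (i == start_fragment) ? start_offset : 0;
    const char* base = fragments[i].first + skip;
    size_t avail = fragments[i].second - skip;
    size_t take = std::min(needed - have, avail);
    std::memcpy(scratch + have, base, take);
    have += take;
  }
  return have == needed;  // false means the input ran out, like RefillTag().
}

int main() {
  // A two-byte tag (literal with a one-byte length) split across fragments.
  const char first_fragment[] = {'\xF0'};
  const char second_fragment[] = {'\x0A'};
  std::vector<std::pair<const char*, size_t>> fragments = {
      {first_fragment, 1}, {second_fragment, 1}};
  char scratch[8];
  return GatherIntoScratch(fragments, 0, 0, 2, scratch) ? 0 : 1;
}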
1765  |  |  | 
1766  |  | template <typename Writer>  | 
1767  | 3.42k  | static bool InternalUncompress(Source* r, Writer* writer) { | 
1768  |  |   // Read the uncompressed length from the front of the compressed input  | 
1769  | 3.42k  |   SnappyDecompressor decompressor(r);  | 
1770  | 3.42k  |   uint32_t uncompressed_len = 0;  | 
1771  | 3.42k  |   if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;  | 
1772  |  |  | 
1773  | 3.42k  |   return InternalUncompressAllTags(&decompressor, writer, r->Available(),  | 
1774  | 3.42k  |                                    uncompressed_len);  | 
1775  | 3.42k  | }  | 
Unexecuted instantiation: snappy.cc:bool snappy::InternalUncompress<snappy::SnappyIOVecWriter>(snappy::Source*, snappy::SnappyIOVecWriter*)
snappy.cc:bool snappy::InternalUncompress<snappy::SnappyArrayWriter>(snappy::Source*, snappy::SnappyArrayWriter*)
 Unexecuted instantiation: snappy.cc:bool snappy::InternalUncompress<snappy::SnappyDecompressionValidator>(snappy::Source*, snappy::SnappyDecompressionValidator*) Unexecuted instantiation: snappy.cc:bool snappy::InternalUncompress<snappy::SnappyScatteredWriter<snappy::SnappySinkAllocator> >(snappy::Source*, snappy::SnappyScatteredWriter<snappy::SnappySinkAllocator>*)  | 
1776  |  |  | 
1777  |  | template <typename Writer>  | 
1778  |  | static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,  | 
1779  |  |                                       Writer* writer, uint32_t compressed_len,  | 
1780  | 3.42k  |                                       uint32_t uncompressed_len) { | 
1781  | 3.42k  |   int token = 0;  | 
1782  | 3.42k  |   Report(token, "snappy_uncompress", compressed_len, uncompressed_len);  | 
1783  |  |  | 
1784  | 3.42k  |   writer->SetExpectedLength(uncompressed_len);  | 
1785  |  |  | 
1786  |  |   // Process the entire input  | 
1787  | 3.42k  |   decompressor->DecompressAllTags(writer);  | 
1788  | 3.42k  |   writer->Flush();  | 
1789  | 3.42k  |   return (decompressor->eof() && writer->CheckLength());  | 
1790  | 3.42k  | }  | 
Unexecuted instantiation: snappy.cc:bool snappy::InternalUncompressAllTags<snappy::SnappyIOVecWriter>(snappy::SnappyDecompressor*, snappy::SnappyIOVecWriter*, unsigned int, unsigned int)
Unexecuted instantiation: snappy.cc:bool snappy::InternalUncompressAllTags<snappy::SnappyDecompressionValidator>(snappy::SnappyDecompressor*, snappy::SnappyDecompressionValidator*, unsigned int, unsigned int)
snappy.cc:bool snappy::InternalUncompressAllTags<snappy::SnappyArrayWriter>(snappy::SnappyDecompressor*, snappy::SnappyArrayWriter*, unsigned int, unsigned int)
 Unexecuted instantiation: snappy.cc:bool snappy::InternalUncompressAllTags<snappy::SnappyScatteredWriter<snappy::SnappySinkAllocator> >(snappy::SnappyDecompressor*, snappy::SnappyScatteredWriter<snappy::SnappySinkAllocator>*, unsigned int, unsigned int)  | 
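InternalUncompressAllTags is parameterized on the Writer type, so each of the writers defined below (SnappyIOVecWriter, SnappyArrayWriter, SnappyDecompressionValidator, SnappyScatteredWriter) gets its own specialized copy of the decompression loop. The rough duck-typed contract those classes satisfy, inferred from this file, looks like the sketch below; NullWriter is a hypothetical stand-in, not part of snappy:

#include <cstddef>

// Hypothetical stand-in showing the Writer contract that DecompressAllTags()
// expects. The "output position" is char* here; SnappyDecompressionValidator
// shows it can be any copyable position type (it uses size_t).
class NullWriter {
 public:
  void SetExpectedLength(size_t len) { expected_ = len; }
  bool CheckLength() const { return produced_ == expected_; }
  char* GetOutputPtr() { return nullptr; }
  char* GetBase(ptrdiff_t* op_limit_min_slop) {
    *op_limit_min_slop = 0;
    return nullptr;
  }
  void SetOutputPtr(char* /*op*/) {}
  // Literal bytes copied straight from the input stream.
  bool Append(const char* /*ip*/, size_t len, char** /*op_p*/) {
    produced_ += len;
    return produced_ <= expected_;
  }
  // Optional fast path; returning false falls back to Append().
  bool TryFastAppend(const char*, size_t, size_t, char**) { return false; }
  // Back-reference copy of `len` bytes from `offset` bytes behind the output.
  bool AppendFromSelf(size_t offset, size_t len, char** /*op_p*/) {
    if (offset == 0 || offset > produced_) return false;
    produced_ += len;
    return produced_ <= expected_;
  }
  void Flush() {}

 private:
  size_t expected_ = 0;
  size_t produced_ = 0;
};

int main() {
  NullWriter writer;
  writer.SetExpectedLength(3);
  char* op = writer.GetOutputPtr();
  writer.Append("abc", 3, &op);
  writer.Flush();
  return writer.CheckLength() ? 0 : 1;
}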
1791  |  |  | 
1792  | 0  | bool GetUncompressedLength(Source* source, uint32_t* result) { | 
1793  | 0  |   SnappyDecompressor decompressor(source);  | 
1794  | 0  |   return decompressor.ReadUncompressedLength(result);  | 
1795  | 0  | }  | 
1796  |  |  | 
1797  | 0  | size_t Compress(Source* reader, Sink* writer) { | 
1798  | 0  |   return Compress(reader, writer, CompressionOptions{}); | 
1799  | 0  | }  | 
1800  |  |  | 
1801  | 0  | size_t Compress(Source* reader, Sink* writer, CompressionOptions options) { | 
1802  | 0  |   assert(options.level == 1 || options.level == 2);  | 
1803  | 0  |   int token = 0;  | 
1804  | 0  |   size_t written = 0;  | 
1805  | 0  |   size_t N = reader->Available();  | 
1806  | 0  |   assert(N <= 0xFFFFFFFFu);  | 
1807  | 0  |   const size_t uncompressed_size = N;  | 
1808  | 0  |   char ulength[Varint::kMax32];  | 
1809  | 0  |   char* p = Varint::Encode32(ulength, N);  | 
1810  | 0  |   writer->Append(ulength, p - ulength);  | 
1811  | 0  |   written += (p - ulength);  | 
1812  |  |  | 
1813  | 0  |   internal::WorkingMemory wmem(N);  | 
1814  |  |  | 
1815  | 0  |   while (N > 0) { | 
1816  |  |     // Get next block to compress (without copying if possible)  | 
1817  | 0  |     size_t fragment_size;  | 
1818  | 0  |     const char* fragment = reader->Peek(&fragment_size);  | 
1819  | 0  |     assert(fragment_size != 0);  // premature end of input  | 
1820  | 0  |     const size_t num_to_read = std::min(N, kBlockSize);  | 
1821  | 0  |     size_t bytes_read = fragment_size;  | 
1822  |  |  | 
1823  | 0  |     size_t pending_advance = 0;  | 
1824  | 0  |     if (bytes_read >= num_to_read) { | 
1825  |  |       // Buffer returned by reader is large enough  | 
1826  | 0  |       pending_advance = num_to_read;  | 
1827  | 0  |       fragment_size = num_to_read;  | 
1828  | 0  |     } else { | 
1829  | 0  |       char* scratch = wmem.GetScratchInput();  | 
1830  | 0  |       std::memcpy(scratch, fragment, bytes_read);  | 
1831  | 0  |       reader->Skip(bytes_read);  | 
1832  |  |  | 
1833  | 0  |       while (bytes_read < num_to_read) { | 
1834  | 0  |         fragment = reader->Peek(&fragment_size);  | 
1835  | 0  |         size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);  | 
1836  | 0  |         std::memcpy(scratch + bytes_read, fragment, n);  | 
1837  | 0  |         bytes_read += n;  | 
1838  | 0  |         reader->Skip(n);  | 
1839  | 0  |       }  | 
1840  | 0  |       assert(bytes_read == num_to_read);  | 
1841  | 0  |       fragment = scratch;  | 
1842  | 0  |       fragment_size = num_to_read;  | 
1843  | 0  |     }  | 
1844  | 0  |     assert(fragment_size == num_to_read);  | 
1845  |  |  | 
1846  |  |     // Get encoding table for compression  | 
1847  | 0  |     int table_size;  | 
1848  | 0  |     uint16_t* table = wmem.GetHashTable(num_to_read, &table_size);  | 
1849  |  |  | 
1850  |  |     // Compress input_fragment and append to dest  | 
1851  | 0  |     int max_output = MaxCompressedLength(num_to_read);  | 
1852  |  |  | 
1853  |  |     // Since we encode kBlockSize regions followed by a region  | 
1854  |  |     // which is <= kBlockSize in length, a previously allocated  | 
1855  |  |     // scratch_output[] region is big enough for this iteration.  | 
1856  |  |     // Need a scratch buffer for the output, in case the byte sink doesn't  | 
1857  |  |     // have room for us directly.  | 
1858  | 0  |     char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());  | 
1859  | 0  |     char* end = nullptr;  | 
1860  | 0  |     if (options.level == 1) { | 
1861  | 0  |       end = internal::CompressFragment(fragment, fragment_size, dest, table,  | 
1862  | 0  |                                        table_size);  | 
1863  | 0  |     } else if (options.level == 2) { | 
1864  | 0  |       end = internal::CompressFragmentDoubleHash(  | 
1865  | 0  |           fragment, fragment_size, dest, table, table_size >> 1,  | 
1866  | 0  |           table + (table_size >> 1), table_size >> 1);  | 
1867  | 0  |     }  | 
1868  | 0  |     writer->Append(dest, end - dest);  | 
1869  | 0  |     written += (end - dest);  | 
1870  |  |  | 
1871  | 0  |     N -= num_to_read;  | 
1872  | 0  |     reader->Skip(pending_advance);  | 
1873  | 0  |   }  | 
1874  |  |  | 
1875  | 0  |   Report(token, "snappy_compress", written, uncompressed_size);  | 
1876  | 0  |   return written;  | 
1877  | 0  | }  | 
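The output produced above is the varint-encoded uncompressed length followed by blocks of at most kBlockSize input bytes, each compressed independently. A minimal round trip through the flat std::string overloads declared in snappy.h (assuming only the public Compress/Uncompress entry points):

#include <cassert>
#include <string>

#include "snappy.h"

int main() {
  const std::string input(100000, 'x');  // highly compressible

  std::string compressed;
  snappy::Compress(input.data(), input.size(), &compressed);

  std::string restored;
  bool ok = snappy::Uncompress(compressed.data(), compressed.size(), &restored);
  assert(ok && restored == input);
  return 0;
}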
1878  |  |  | 
1879  |  | // -----------------------------------------------------------------------  | 
1880  |  | // IOVec interfaces  | 
1881  |  | // -----------------------------------------------------------------------  | 
1882  |  |  | 
1883  |  | // A `Source` implementation that yields the contents of an `iovec` array. Note  | 
1884  |  | // that `total_size` is the total number of bytes to be read from the elements  | 
1885  |  | // of `iov` (_not_ the total number of elements in `iov`).  | 
1886  |  | class SnappyIOVecReader : public Source { | 
1887  |  |  public:  | 
1888  |  |   SnappyIOVecReader(const struct iovec* iov, size_t total_size)  | 
1889  | 0  |       : curr_iov_(iov),  | 
1890  | 0  |         curr_pos_(total_size > 0 ? reinterpret_cast<const char*>(iov->iov_base)  | 
1891  | 0  |                                  : nullptr),  | 
1892  | 0  |         curr_size_remaining_(total_size > 0 ? iov->iov_len : 0),  | 
1893  | 0  |         total_size_remaining_(total_size) { | 
1894  |  |     // Skip empty leading `iovec`s.  | 
1895  | 0  |     if (total_size > 0 && curr_size_remaining_ == 0) Advance();  | 
1896  | 0  |   }  | 
1897  |  |  | 
1898  |  |   ~SnappyIOVecReader() override = default;  | 
1899  |  |  | 
1900  | 0  |   size_t Available() const override { return total_size_remaining_; } | 
1901  |  |  | 
1902  | 0  |   const char* Peek(size_t* len) override { | 
1903  | 0  |     *len = curr_size_remaining_;  | 
1904  | 0  |     return curr_pos_;  | 
1905  | 0  |   }  | 
1906  |  |  | 
1907  | 0  |   void Skip(size_t n) override { | 
1908  | 0  |     while (n >= curr_size_remaining_ && n > 0) { | 
1909  | 0  |       n -= curr_size_remaining_;  | 
1910  | 0  |       Advance();  | 
1911  | 0  |     }  | 
1912  | 0  |     curr_size_remaining_ -= n;  | 
1913  | 0  |     total_size_remaining_ -= n;  | 
1914  | 0  |     curr_pos_ += n;  | 
1915  | 0  |   }  | 
1916  |  |  | 
1917  |  |  private:  | 
1918  |  |   // Advances to the next nonempty `iovec` and updates related variables.  | 
1919  | 0  |   void Advance() { | 
1920  | 0  |     do { | 
1921  | 0  |       assert(total_size_remaining_ >= curr_size_remaining_);  | 
1922  | 0  |       total_size_remaining_ -= curr_size_remaining_;  | 
1923  | 0  |       if (total_size_remaining_ == 0) { | 
1924  | 0  |         curr_pos_ = nullptr;  | 
1925  | 0  |         curr_size_remaining_ = 0;  | 
1926  | 0  |         return;  | 
1927  | 0  |       }  | 
1928  | 0  |       ++curr_iov_;  | 
1929  | 0  |       curr_pos_ = reinterpret_cast<const char*>(curr_iov_->iov_base);  | 
1930  | 0  |       curr_size_remaining_ = curr_iov_->iov_len;  | 
1931  | 0  |     } while (curr_size_remaining_ == 0);  | 
1932  | 0  |   }  | 
1933  |  |  | 
1934  |  |   // The `iovec` currently being read.  | 
1935  |  |   const struct iovec* curr_iov_;  | 
1936  |  |   // The location in `curr_iov_` currently being read.  | 
1937  |  |   const char* curr_pos_;  | 
1938  |  |   // The amount of unread data in `curr_iov_`.  | 
1939  |  |   size_t curr_size_remaining_;  | 
1940  |  |   // The amount of unread data in the entire input array.  | 
1941  |  |   size_t total_size_remaining_;  | 
1942  |  | };  | 
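SnappyIOVecReader is what lets CompressFromIOVec, defined later in this file, treat a scattered iovec array as one logical input stream. A short usage sketch of that entry point, assuming the CompressFromIOVec overload declared in snappy.h:

#include <sys/uio.h>  // struct iovec

#include <cassert>
#include <string>

#include "snappy.h"

int main() {
  char part1[] = "hello ";
  char part2[] = "iovec world";
  struct iovec iov[2] = {
      {part1, sizeof(part1) - 1},  // exclude the trailing '\0'
      {part2, sizeof(part2) - 1},
  };

  std::string compressed;
  snappy::CompressFromIOVec(iov, 2, &compressed);

  std::string restored;
  assert(snappy::Uncompress(compressed.data(), compressed.size(), &restored));
  assert(restored == "hello iovec world");
  return 0;
}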
1943  |  |  | 
1944  |  | // A type that writes to an iovec.  | 
1945  |  | // Note that this is not a "ByteSink", but a type that matches the  | 
1946  |  | // Writer template argument to SnappyDecompressor::DecompressAllTags().  | 
1947  |  | class SnappyIOVecWriter { | 
1948  |  |  private:  | 
1949  |  |   // output_iov_end_ is set to iov + count and used to determine when  | 
1950  |  |   // the end of the iovs is reached.  | 
1951  |  |   const struct iovec* output_iov_end_;  | 
1952  |  |  | 
1953  |  | #if !defined(NDEBUG)  | 
1954  |  |   const struct iovec* output_iov_;  | 
1955  |  | #endif  // !defined(NDEBUG)  | 
1956  |  |  | 
1957  |  |   // Current iov that is being written into.  | 
1958  |  |   const struct iovec* curr_iov_;  | 
1959  |  |  | 
1960  |  |   // Pointer to current iov's write location.  | 
1961  |  |   char* curr_iov_output_;  | 
1962  |  |  | 
1963  |  |   // Remaining bytes to write into curr_iov_output.  | 
1964  |  |   size_t curr_iov_remaining_;  | 
1965  |  |  | 
1966  |  |   // Total bytes decompressed into output_iov_ so far.  | 
1967  |  |   size_t total_written_;  | 
1968  |  |  | 
1969  |  |   // Maximum number of bytes that will be decompressed into output_iov_.  | 
1970  |  |   size_t output_limit_;  | 
1971  |  |  | 
1972  | 0  |   static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) { | 
1973  | 0  |     return reinterpret_cast<char*>(iov->iov_base) + offset;  | 
1974  | 0  |   }  | 
1975  |  |  | 
1976  |  |  public:  | 
1977  |  |   // Does not take ownership of iov. iov must be valid during the  | 
1978  |  |   // entire lifetime of the SnappyIOVecWriter.  | 
1979  |  |   inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)  | 
1980  | 0  |       : output_iov_end_(iov + iov_count),  | 
1981  |  | #if !defined(NDEBUG)  | 
1982  |  |         output_iov_(iov),  | 
1983  |  | #endif  // !defined(NDEBUG)  | 
1984  | 0  |         curr_iov_(iov),  | 
1985  | 0  |         curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)  | 
1986  | 0  |                                    : nullptr),  | 
1987  | 0  |         curr_iov_remaining_(iov_count ? iov->iov_len : 0),  | 
1988  | 0  |         total_written_(0),  | 
1989  | 0  |         output_limit_(-1) { | 
1990  | 0  |   }  | 
1991  |  |  | 
1992  | 0  |   inline void SetExpectedLength(size_t len) { output_limit_ = len; } | 
1993  |  |  | 
1994  | 0  |   inline bool CheckLength() const { return total_written_ == output_limit_; } | 
1995  |  |  | 
1996  | 0  |   inline bool Append(const char* ip, size_t len, char**) { | 
1997  | 0  |     if (total_written_ + len > output_limit_) { | 
1998  | 0  |       return false;  | 
1999  | 0  |     }  | 
2000  |  |  | 
2001  | 0  |     return AppendNoCheck(ip, len);  | 
2002  | 0  |   }  | 
2003  |  |  | 
2004  | 0  |   char* GetOutputPtr() { return nullptr; } | 
2005  | 0  |   char* GetBase(ptrdiff_t*) { return nullptr; } | 
2006  | 0  |   void SetOutputPtr(char* op) { | 
2007  |  |     // TODO: Switch to [[maybe_unused]] when we can assume C++17.  | 
2008  | 0  |     (void)op;  | 
2009  | 0  |   }  | 
2010  |  |  | 
2011  | 0  |   inline bool AppendNoCheck(const char* ip, size_t len) { | 
2012  | 0  |     while (len > 0) { | 
2013  | 0  |       if (curr_iov_remaining_ == 0) { | 
2014  |  |         // This iovec is full. Go to the next one.  | 
2015  | 0  |         if (curr_iov_ + 1 >= output_iov_end_) { | 
2016  | 0  |           return false;  | 
2017  | 0  |         }  | 
2018  | 0  |         ++curr_iov_;  | 
2019  | 0  |         curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);  | 
2020  | 0  |         curr_iov_remaining_ = curr_iov_->iov_len;  | 
2021  | 0  |       }  | 
2022  |  |  | 
2023  | 0  |       const size_t to_write = std::min(len, curr_iov_remaining_);  | 
2024  | 0  |       std::memcpy(curr_iov_output_, ip, to_write);  | 
2025  | 0  |       curr_iov_output_ += to_write;  | 
2026  | 0  |       curr_iov_remaining_ -= to_write;  | 
2027  | 0  |       total_written_ += to_write;  | 
2028  | 0  |       ip += to_write;  | 
2029  | 0  |       len -= to_write;  | 
2030  | 0  |     }  | 
2031  |  |  | 
2032  | 0  |     return true;  | 
2033  | 0  |   }  | 
2034  |  |  | 
2035  |  |   inline bool TryFastAppend(const char* ip, size_t available, size_t len,  | 
2036  | 0  |                             char**) { | 
2037  | 0  |     const size_t space_left = output_limit_ - total_written_;  | 
2038  | 0  |     if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&  | 
2039  | 0  |         curr_iov_remaining_ >= 16) { | 
2040  |  |       // Fast path, used for the majority (about 95%) of invocations.  | 
2041  | 0  |       UnalignedCopy128(ip, curr_iov_output_);  | 
2042  | 0  |       curr_iov_output_ += len;  | 
2043  | 0  |       curr_iov_remaining_ -= len;  | 
2044  | 0  |       total_written_ += len;  | 
2045  | 0  |       return true;  | 
2046  | 0  |     }  | 
2047  |  |  | 
2048  | 0  |     return false;  | 
2049  | 0  |   }  | 
2050  |  |  | 
2051  | 0  |   inline bool AppendFromSelf(size_t offset, size_t len, char**) { | 
2052  |  |     // See SnappyArrayWriter::AppendFromSelf for an explanation of  | 
2053  |  |     // the "offset - 1u" trick.  | 
2054  | 0  |     if (offset - 1u >= total_written_) { | 
2055  | 0  |       return false;  | 
2056  | 0  |     }  | 
2057  | 0  |     const size_t space_left = output_limit_ - total_written_;  | 
2058  | 0  |     if (len > space_left) { | 
2059  | 0  |       return false;  | 
2060  | 0  |     }  | 
2061  |  |  | 
2062  |  |     // Locate the iovec from which we need to start the copy.  | 
2063  | 0  |     const iovec* from_iov = curr_iov_;  | 
2064  | 0  |     size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;  | 
2065  | 0  |     while (offset > 0) { | 
2066  | 0  |       if (from_iov_offset >= offset) { | 
2067  | 0  |         from_iov_offset -= offset;  | 
2068  | 0  |         break;  | 
2069  | 0  |       }  | 
2070  |  |  | 
2071  | 0  |       offset -= from_iov_offset;  | 
2072  | 0  |       --from_iov;  | 
2073  |  | #if !defined(NDEBUG)  | 
2074  |  |       assert(from_iov >= output_iov_);  | 
2075  |  | #endif  // !defined(NDEBUG)  | 
2076  | 0  |       from_iov_offset = from_iov->iov_len;  | 
2077  | 0  |     }  | 
2078  |  |  | 
2079  |  |     // Copy <len> bytes starting from the iovec pointed to by from_iov to  | 
2080  |  |     // the current iovec.  | 
2081  | 0  |     while (len > 0) { | 
2082  | 0  |       assert(from_iov <= curr_iov_);  | 
2083  | 0  |       if (from_iov != curr_iov_) { | 
2084  | 0  |         const size_t to_copy =  | 
2085  | 0  |             std::min(from_iov->iov_len - from_iov_offset, len);  | 
2086  | 0  |         AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);  | 
2087  | 0  |         len -= to_copy;  | 
2088  | 0  |         if (len > 0) { | 
2089  | 0  |           ++from_iov;  | 
2090  | 0  |           from_iov_offset = 0;  | 
2091  | 0  |         }  | 
2092  | 0  |       } else { | 
2093  | 0  |         size_t to_copy = curr_iov_remaining_;  | 
2094  | 0  |         if (to_copy == 0) { | 
2095  |  |           // This iovec is full. Go to the next one.  | 
2096  | 0  |           if (curr_iov_ + 1 >= output_iov_end_) { | 
2097  | 0  |             return false;  | 
2098  | 0  |           }  | 
2099  | 0  |           ++curr_iov_;  | 
2100  | 0  |           curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);  | 
2101  | 0  |           curr_iov_remaining_ = curr_iov_->iov_len;  | 
2102  | 0  |           continue;  | 
2103  | 0  |         }  | 
2104  | 0  |         if (to_copy > len) { | 
2105  | 0  |           to_copy = len;  | 
2106  | 0  |         }  | 
2107  | 0  |         assert(to_copy > 0);  | 
2108  |  |  | 
2109  | 0  |         IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),  | 
2110  | 0  |                         curr_iov_output_, curr_iov_output_ + to_copy,  | 
2111  | 0  |                         curr_iov_output_ + curr_iov_remaining_);  | 
2112  | 0  |         curr_iov_output_ += to_copy;  | 
2113  | 0  |         curr_iov_remaining_ -= to_copy;  | 
2114  | 0  |         from_iov_offset += to_copy;  | 
2115  | 0  |         total_written_ += to_copy;  | 
2116  | 0  |         len -= to_copy;  | 
2117  | 0  |       }  | 
2118  | 0  |     }  | 
2119  |  |  | 
2120  | 0  |     return true;  | 
2121  | 0  |   }  | 
2122  |  |  | 
2123  | 0  |   inline void Flush() {} | 
2124  |  | };  | 
2125  |  |  | 
2126  |  | bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,  | 
2127  | 0  |                           const struct iovec* iov, size_t iov_cnt) { | 
2128  | 0  |   ByteArraySource reader(compressed, compressed_length);  | 
2129  | 0  |   return RawUncompressToIOVec(&reader, iov, iov_cnt);  | 
2130  | 0  | }  | 
2131  |  |  | 
2132  |  | bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,  | 
2133  | 0  |                           size_t iov_cnt) { | 
2134  | 0  |   SnappyIOVecWriter output(iov, iov_cnt);  | 
2135  | 0  |   return InternalUncompress(compressed, &output);  | 
2136  | 0  | }  | 
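RawUncompressToIOVec goes the other way and scatters decompressed output across caller-provided buffers via SnappyIOVecWriter. A sketch, assuming the public flat-buffer Compress overload to produce the input:

#include <sys/uio.h>  // struct iovec

#include <cassert>
#include <cstring>
#include <string>

#include "snappy.h"

int main() {
  const std::string input = "scatter me across two output buffers, please";
  std::string compressed;
  snappy::Compress(input.data(), input.size(), &compressed);

  // Two output buffers whose combined size equals the uncompressed length.
  char first[10];
  std::string rest(input.size() - sizeof(first), '\0');
  struct iovec iov[2] = {{first, sizeof(first)}, {&rest[0], rest.size()}};

  assert(snappy::RawUncompressToIOVec(compressed.data(), compressed.size(),
                                      iov, 2));
  assert(std::memcmp(first, input.data(), sizeof(first)) == 0);
  assert(rest == input.substr(sizeof(first)));
  return 0;
}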
2137  |  |  | 
2138  |  | // -----------------------------------------------------------------------  | 
2139  |  | // Flat array interfaces  | 
2140  |  | // -----------------------------------------------------------------------  | 
2141  |  |  | 
2142  |  | // A type that writes to a flat array.  | 
2143  |  | // Note that this is not a "ByteSink", but a type that matches the  | 
2144  |  | // Writer template argument to SnappyDecompressor::DecompressAllTags().  | 
2145  |  | class SnappyArrayWriter { | 
2146  |  |  private:  | 
2147  |  |   char* base_;  | 
2148  |  |   char* op_;  | 
2149  |  |   char* op_limit_;  | 
2150  |  |   // If op < op_limit_min_slop_ then it's safe to unconditionally write  | 
2151  |  |   // kSlopBytes starting at op.  | 
2152  |  |   char* op_limit_min_slop_;  | 
2153  |  |  | 
2154  |  |  public:  | 
2155  |  |   inline explicit SnappyArrayWriter(char* dst)  | 
2156  | 3.42k  |       : base_(dst),  | 
2157  | 3.42k  |         op_(dst),  | 
2158  | 3.42k  |         op_limit_(dst),  | 
2159  | 3.42k  |         op_limit_min_slop_(dst) {}  // Safe default see invariant. | 
2160  |  |  | 
2161  | 3.42k  |   inline void SetExpectedLength(size_t len) { | 
2162  | 3.42k  |     op_limit_ = op_ + len;  | 
2163  |  |     // Prevent pointer from being past the buffer.  | 
2164  | 3.42k  |     op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, len);  | 
2165  | 3.42k  |   }  | 
2166  |  |  | 
2167  | 2.97k  |   inline bool CheckLength() const { return op_ == op_limit_; } | 
2168  |  |  | 
2169  | 3.42k  |   char* GetOutputPtr() { return op_; } | 
2170  | 10.0k  |   char* GetBase(ptrdiff_t* op_limit_min_slop) { | 
2171  | 10.0k  |     *op_limit_min_slop = op_limit_min_slop_ - base_;  | 
2172  | 10.0k  |     return base_;  | 
2173  | 10.0k  |   }  | 
2174  | 3.42k  |   void SetOutputPtr(char* op) { op_ = op; } | 
2175  |  |  | 
2176  | 3.84k  |   inline bool Append(const char* ip, size_t len, char** op_p) { | 
2177  | 3.84k  |     char* op = *op_p;  | 
2178  | 3.84k  |     const size_t space_left = op_limit_ - op;  | 
2179  | 3.84k  |     if (space_left < len) return false;  | 
2180  | 3.75k  |     std::memcpy(op, ip, len);  | 
2181  | 3.75k  |     *op_p = op + len;  | 
2182  | 3.75k  |     return true;  | 
2183  | 3.84k  |   }  | 
2184  |  |  | 
2185  |  |   inline bool TryFastAppend(const char* ip, size_t available, size_t len,  | 
2186  | 5.31k  |                             char** op_p) { | 
2187  | 5.31k  |     char* op = *op_p;  | 
2188  | 5.31k  |     const size_t space_left = op_limit_ - op;  | 
2189  | 5.31k  |     if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) { | 
2190  |  |       // Fast path, used for the majority (about 95%) of invocations.  | 
2191  | 1.46k  |       UnalignedCopy128(ip, op);  | 
2192  | 1.46k  |       *op_p = op + len;  | 
2193  | 1.46k  |       return true;  | 
2194  | 3.84k  |     } else { | 
2195  | 3.84k  |       return false;  | 
2196  | 3.84k  |     }  | 
2197  | 5.31k  |   }  | 
2198  |  |  | 
2199  |  |   SNAPPY_ATTRIBUTE_ALWAYS_INLINE  | 
2200  | 4.72k  |   inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) { | 
2201  | 4.72k  |     assert(len > 0);  | 
2202  | 4.72k  |     char* const op = *op_p;  | 
2203  | 4.72k  |     assert(op >= base_);  | 
2204  | 4.72k  |     char* const op_end = op + len;  | 
2205  |  |  | 
2206  |  |     // Check if we try to append from before the start of the buffer.  | 
2207  | 4.72k  |     if (SNAPPY_PREDICT_FALSE(static_cast<size_t>(op - base_) < offset))  | 
2208  | 208  |       return false;  | 
2209  |  |  | 
2210  | 4.51k  |     if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) ||  | 
2211  | 4.51k  |                             op >= op_limit_min_slop_ || offset < len)) { | 
2212  | 1.98k  |       if (op_end > op_limit_ || offset == 0) return false;  | 
2213  | 1.87k  |       *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_);  | 
2214  | 1.87k  |       return true;  | 
2215  | 1.98k  |     }  | 
2216  | 2.52k  |     std::memmove(op, op - offset, kSlopBytes);  | 
2217  | 2.52k  |     *op_p = op_end;  | 
2218  | 2.52k  |     return true;  | 
2219  | 4.51k  |   }  | 
2220  | 0  |   inline size_t Produced() const { | 
2221  | 0  |     assert(op_ >= base_);  | 
2222  | 0  |     return op_ - base_;  | 
2223  | 0  |   }  | 
2224  | 3.42k  |   inline void Flush() {} | 
2225  |  | };  | 
2226  |  |  | 
2227  |  | bool RawUncompress(const char* compressed, size_t compressed_length,  | 
2228  | 3.42k  |                    char* uncompressed) { | 
2229  | 3.42k  |   ByteArraySource reader(compressed, compressed_length);  | 
2230  | 3.42k  |   return RawUncompress(&reader, uncompressed);  | 
2231  | 3.42k  | }  | 
2232  |  |  | 
2233  | 3.42k  | bool RawUncompress(Source* compressed, char* uncompressed) { | 
2234  | 3.42k  |   SnappyArrayWriter output(uncompressed);  | 
2235  | 3.42k  |   return InternalUncompress(compressed, &output);  | 
2236  | 3.42k  | }  | 
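SnappyArrayWriter backs the allocation-free path: the caller sizes an output buffer with GetUncompressedLength and hands RawUncompress a flat array. A sketch of that flow using the raw entry points declared in snappy.h:

#include <cassert>
#include <string>
#include <vector>

#include "snappy.h"

int main() {
  const std::string input(4096, 'a');

  // Compress into a caller-owned buffer sized with MaxCompressedLength().
  std::vector<char> compressed(snappy::MaxCompressedLength(input.size()));
  size_t compressed_length = 0;
  snappy::RawCompress(input.data(), input.size(), compressed.data(),
                      &compressed_length);

  // Recover the exact output size, then decompress into a flat buffer.
  size_t uncompressed_length = 0;
  assert(snappy::GetUncompressedLength(compressed.data(), compressed_length,
                                       &uncompressed_length));
  std::vector<char> restored(uncompressed_length);
  assert(snappy::RawUncompress(compressed.data(), compressed_length,
                               restored.data()));
  assert(std::string(restored.begin(), restored.end()) == input);
  return 0;
}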
2237  |  |  | 
2238  |  | bool Uncompress(const char* compressed, size_t compressed_length,  | 
2239  | 0  |                 std::string* uncompressed) { | 
2240  | 0  |   size_t ulength;  | 
2241  | 0  |   if (!GetUncompressedLength(compressed, compressed_length, &ulength)) { | 
2242  | 0  |     return false;  | 
2243  | 0  |   }  | 
2244  |  |   // On 32-bit builds: max_size() < kuint32max.  Check for that instead  | 
2245  |  |   // of crashing (e.g., consider externally specified compressed data).  | 
2246  | 0  |   if (ulength > uncompressed->max_size()) { | 
2247  | 0  |     return false;  | 
2248  | 0  |   }  | 
2249  | 0  |   STLStringResizeUninitialized(uncompressed, ulength);  | 
2250  | 0  |   return RawUncompress(compressed, compressed_length,  | 
2251  | 0  |                        string_as_array(uncompressed));  | 
2252  | 0  | }  | 
2253  |  |  | 
2254  |  | // A Writer that drops everything on the floor and just does validation  | 
2255  |  | class SnappyDecompressionValidator { | 
2256  |  |  private:  | 
2257  |  |   size_t expected_;  | 
2258  |  |   size_t produced_;  | 
2259  |  |  | 
2260  |  |  public:  | 
2261  | 0  |   inline SnappyDecompressionValidator() : expected_(0), produced_(0) {} | 
2262  | 0  |   inline void SetExpectedLength(size_t len) { expected_ = len; } | 
2263  | 0  |   size_t GetOutputPtr() { return produced_; } | 
2264  | 0  |   size_t GetBase(ptrdiff_t* op_limit_min_slop) { | 
2265  | 0  |     *op_limit_min_slop = std::numeric_limits<ptrdiff_t>::max() - kSlopBytes + 1;  | 
2266  | 0  |     return 1;  | 
2267  | 0  |   }  | 
2268  | 0  |   void SetOutputPtr(size_t op) { produced_ = op; } | 
2269  | 0  |   inline bool CheckLength() const { return expected_ == produced_; } | 
2270  | 0  |   inline bool Append(const char* ip, size_t len, size_t* produced) { | 
2271  |  |     // TODO: Switch to [[maybe_unused]] when we can assume C++17.  | 
2272  | 0  |     (void)ip;  | 
2273  |  |  | 
2274  | 0  |     *produced += len;  | 
2275  | 0  |     return *produced <= expected_;  | 
2276  | 0  |   }  | 
2277  |  |   inline bool TryFastAppend(const char* ip, size_t available, size_t length,  | 
2278  | 0  |                             size_t* produced) { | 
2279  |  |     // TODO: Switch to [[maybe_unused]] when we can assume C++17.  | 
2280  | 0  |     (void)ip;  | 
2281  | 0  |     (void)available;  | 
2282  | 0  |     (void)length;  | 
2283  | 0  |     (void)produced;  | 
2284  |  |  | 
2285  | 0  |     return false;  | 
2286  | 0  |   }  | 
2287  | 0  |   inline bool AppendFromSelf(size_t offset, size_t len, size_t* produced) { | 
2288  |  |     // See SnappyArrayWriter::AppendFromSelf for an explanation of  | 
2289  |  |     // the "offset - 1u" trick.  | 
2290  | 0  |     if (*produced <= offset - 1u) return false;  | 
2291  | 0  |     *produced += len;  | 
2292  | 0  |     return *produced <= expected_;  | 
2293  | 0  |   }  | 
2294  | 0  |   inline void Flush() {} | 
2295  |  | };  | 
2296  |  |  | 
2297  | 0  | bool IsValidCompressedBuffer(const char* compressed, size_t compressed_length) { | 
2298  | 0  |   ByteArraySource reader(compressed, compressed_length);  | 
2299  | 0  |   SnappyDecompressionValidator writer;  | 
2300  | 0  |   return InternalUncompress(&reader, &writer);  | 
2301  | 0  | }  | 
2302  |  |  | 
2303  | 0  | bool IsValidCompressed(Source* compressed) { | 
2304  | 0  |   SnappyDecompressionValidator writer;  | 
2305  | 0  |   return InternalUncompress(compressed, &writer);  | 
2306  | 0  | }  | 
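Because SnappyDecompressionValidator only tracks lengths, IsValidCompressedBuffer can reject malformed input without allocating any output. A short sketch; corrupting an arbitrary byte is likely, but not guaranteed, to be detected, so the example flips the length header:

#include <cassert>
#include <string>

#include "snappy.h"

int main() {
  std::string compressed;
  snappy::Compress("some payload", 12, &compressed);
  assert(snappy::IsValidCompressedBuffer(compressed.data(), compressed.size()));

  // Corrupt the varint length header so the declared and actual output
  // lengths disagree; validation then fails without producing any output.
  std::string corrupted = compressed;
  corrupted[0] ^= 0x7F;
  assert(!snappy::IsValidCompressedBuffer(corrupted.data(), corrupted.size()));
  return 0;
}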
2307  |  |  | 
2308  |  | void RawCompress(const char* input, size_t input_length, char* compressed,  | 
2309  | 0  |                  size_t* compressed_length) { | 
2310  | 0  |   RawCompress(input, input_length, compressed, compressed_length,  | 
2311  | 0  |               CompressionOptions{}); | 
2312  | 0  | }  | 
2313  |  |  | 
2314  |  | void RawCompress(const char* input, size_t input_length, char* compressed,  | 
2315  | 0  |                  size_t* compressed_length, CompressionOptions options) { | 
2316  | 0  |   ByteArraySource reader(input, input_length);  | 
2317  | 0  |   UncheckedByteArraySink writer(compressed);  | 
2318  | 0  |   Compress(&reader, &writer, options);  | 
2319  |  |  | 
2320  |  |   // Compute how many bytes were added  | 
2321  | 0  |   *compressed_length = (writer.CurrentDestination() - compressed);  | 
2322  | 0  | }  | 
2323  |  |  | 
2324  |  | void RawCompressFromIOVec(const struct iovec* iov, size_t uncompressed_length,  | 
2325  | 0  |                           char* compressed, size_t* compressed_length) { | 
2326  | 0  |   RawCompressFromIOVec(iov, uncompressed_length, compressed, compressed_length,  | 
2327  | 0  |                        CompressionOptions{}); | 
2328  | 0  | }  | 
2329  |  |  | 
2330  |  | void RawCompressFromIOVec(const struct iovec* iov, size_t uncompressed_length,  | 
2331  |  |                           char* compressed, size_t* compressed_length,  | 
2332  | 0  |                           CompressionOptions options) { | 
2333  | 0  |   SnappyIOVecReader reader(iov, uncompressed_length);  | 
2334  | 0  |   UncheckedByteArraySink writer(compressed);  | 
2335  | 0  |   Compress(&reader, &writer, options);  | 
2336  |  |  | 
2337  |  |   // Compute how many bytes were added.  | 
2338  | 0  |   *compressed_length = writer.CurrentDestination() - compressed;  | 
2339  | 0  | }  | 
2340  |  |  | 
2341  |  | size_t Compress(const char* input, size_t input_length,  | 
2342  | 0  |                 std::string* compressed) { | 
2343  | 0  |   return Compress(input, input_length, compressed, CompressionOptions{}); | 
2344  | 0  | }  | 
2345  |  |  | 
2346  |  | size_t Compress(const char* input, size_t input_length, std::string* compressed,  | 
2347  | 0  |                 CompressionOptions options) { | 
2348  |  |   // Pre-grow the buffer to the max length of the compressed output  | 
2349  | 0  |   STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));  | 
2350  |  |  | 
2351  | 0  |   size_t compressed_length;  | 
2352  | 0  |   RawCompress(input, input_length, string_as_array(compressed),  | 
2353  | 0  |               &compressed_length, options);  | 
2354  | 0  |   compressed->erase(compressed_length);  | 
2355  | 0  |   return compressed_length;  | 
2356  | 0  | }  | 
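The CompressionOptions overloads select between the level-1 (CompressFragment) and level-2 (CompressFragmentDoubleHash) paths in Compress() above. A sketch, assuming CompressionOptions from snappy.h is default-constructible with level 1 and exposes the `level` member used in this file:

#include <cassert>
#include <string>

#include "snappy.h"

int main() {
  const std::string input(1 << 16, 'z');

  // The three-argument overload compresses with default options (level 1).
  std::string level1;
  snappy::Compress(input.data(), input.size(), &level1);

  // Level 2 selects the CompressFragmentDoubleHash path seen above.
  snappy::CompressionOptions options;
  options.level = 2;
  std::string level2;
  snappy::Compress(input.data(), input.size(), &level2, options);

  // Both encodings decode to identical bytes.
  std::string out1, out2;
  assert(snappy::Uncompress(level1.data(), level1.size(), &out1));
  assert(snappy::Uncompress(level2.data(), level2.size(), &out2));
  assert(out1 == input && out2 == input);
  return 0;
}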
2357  |  |  | 
2358  |  | size_t CompressFromIOVec(const struct iovec* iov, size_t iov_cnt,  | 
2359  | 0  |                          std::string* compressed) { | 
2360  | 0  |   return CompressFromIOVec(iov, iov_cnt, compressed, CompressionOptions{}); | 
2361  | 0  | }  | 
2362  |  |  | 
2363  |  | size_t CompressFromIOVec(const struct iovec* iov, size_t iov_cnt,  | 
2364  | 0  |                          std::string* compressed, CompressionOptions options) { | 
2365  |  |   // Compute the number of bytes to be compressed.  | 
2366  | 0  |   size_t uncompressed_length = 0;  | 
2367  | 0  |   for (size_t i = 0; i < iov_cnt; ++i) { | 
2368  | 0  |     uncompressed_length += iov[i].iov_len;  | 
2369  | 0  |   }  | 
2370  |  |  | 
2371  |  |   // Pre-grow the buffer to the max length of the compressed output.  | 
2372  | 0  |   STLStringResizeUninitialized(compressed, MaxCompressedLength(  | 
2373  | 0  |       uncompressed_length));  | 
2374  |  |  | 
2375  | 0  |   size_t compressed_length;  | 
2376  | 0  |   RawCompressFromIOVec(iov, uncompressed_length, string_as_array(compressed),  | 
2377  | 0  |                        &compressed_length, options);  | 
2378  | 0  |   compressed->erase(compressed_length);  | 
2379  | 0  |   return compressed_length;  | 
2380  | 0  | }  | 
2381  |  |  | 
2382  |  | // -----------------------------------------------------------------------  | 
2383  |  | // Sink interface  | 
2384  |  | // -----------------------------------------------------------------------  | 
2385  |  |  | 
2386  |  | // A type that decompresses into a Sink. The template parameter  | 
2387  |  | // Allocator must export one method "char* Allocate(int size);", which  | 
2388  |  | // allocates a buffer of "size" and appends that to the destination.  | 
2389  |  | template <typename Allocator>  | 
2390  |  | class SnappyScatteredWriter { | 
2391  |  |   Allocator allocator_;  | 
2392  |  |  | 
2393  |  |   // We need random access into the data generated so far.  Therefore  | 
2394  |  |   // we keep track of all of the generated data as an array of blocks.  | 
2395  |  |   // All of the blocks except the last have length kBlockSize.  | 
2396  |  |   std::vector<char*> blocks_;  | 
2397  |  |   size_t expected_;  | 
2398  |  |  | 
2399  |  |   // Total size of all fully generated blocks so far  | 
2400  |  |   size_t full_size_;  | 
2401  |  |  | 
2402  |  |   // Pointer into current output block  | 
2403  |  |   char* op_base_;   // Base of output block  | 
2404  |  |   char* op_ptr_;    // Pointer to next unfilled byte in block  | 
2405  |  |   char* op_limit_;  // Pointer just past block  | 
2406  |  |   // If op < op_limit_min_slop_ then it's safe to unconditionally write  | 
2407  |  |   // kSlopBytes starting at op.  | 
2408  |  |   char* op_limit_min_slop_;  | 
2409  |  |  | 
2410  | 0  |   inline size_t Size() const { return full_size_ + (op_ptr_ - op_base_); } | 
2411  |  |  | 
2412  |  |   bool SlowAppend(const char* ip, size_t len);  | 
2413  |  |   bool SlowAppendFromSelf(size_t offset, size_t len);  | 
2414  |  |  | 
2415  |  |  public:  | 
2416  |  |   inline explicit SnappyScatteredWriter(const Allocator& allocator)  | 
2417  | 0  |       : allocator_(allocator),  | 
2418  | 0  |         full_size_(0),  | 
2419  |  |         op_base_(NULL),  | 
2420  |  |         op_ptr_(NULL),  | 
2421  |  |         op_limit_(NULL),  | 
2422  | 0  |         op_limit_min_slop_(NULL) {} | 
2423  | 0  |   char* GetOutputPtr() { return op_ptr_; } | 
2424  | 0  |   char* GetBase(ptrdiff_t* op_limit_min_slop) { | 
2425  | 0  |     *op_limit_min_slop = op_limit_min_slop_ - op_base_;  | 
2426  | 0  |     return op_base_;  | 
2427  | 0  |   }  | 
2428  | 0  |   void SetOutputPtr(char* op) { op_ptr_ = op; } | 
2429  |  |  | 
2430  | 0  |   inline void SetExpectedLength(size_t len) { | 
2431  | 0  |     assert(blocks_.empty());  | 
2432  | 0  |     expected_ = len;  | 
2433  | 0  |   }  | 
2434  |  |  | 
2435  | 0  |   inline bool CheckLength() const { return Size() == expected_; } | 
2436  |  |  | 
2437  |  |   // Return the number of bytes actually uncompressed so far  | 
2438  | 0  |   inline size_t Produced() const { return Size(); } | 
2439  |  |  | 
2440  | 0  |   inline bool Append(const char* ip, size_t len, char** op_p) { | 
2441  | 0  |     char* op = *op_p;  | 
2442  | 0  |     size_t avail = op_limit_ - op;  | 
2443  | 0  |     if (len <= avail) { | 
2444  |  |       // Fast path  | 
2445  | 0  |       std::memcpy(op, ip, len);  | 
2446  | 0  |       *op_p = op + len;  | 
2447  | 0  |       return true;  | 
2448  | 0  |     } else { | 
2449  | 0  |       op_ptr_ = op;  | 
2450  | 0  |       bool res = SlowAppend(ip, len);  | 
2451  | 0  |       *op_p = op_ptr_;  | 
2452  | 0  |       return res;  | 
2453  | 0  |     }  | 
2454  | 0  |   }  | 
2455  |  |  | 
2456  |  |   inline bool TryFastAppend(const char* ip, size_t available, size_t length,  | 
2457  | 0  |                             char** op_p) { | 
2458  | 0  |     char* op = *op_p;  | 
2459  | 0  |     const int space_left = op_limit_ - op;  | 
2460  | 0  |     if (length <= 16 && available >= 16 + kMaximumTagLength &&  | 
2461  | 0  |         space_left >= 16) { | 
2462  |  |       // Fast path, used for the majority (about 95%) of invocations.  | 
2463  | 0  |       UnalignedCopy128(ip, op);  | 
2464  | 0  |       *op_p = op + length;  | 
2465  | 0  |       return true;  | 
2466  | 0  |     } else { | 
2467  | 0  |       return false;  | 
2468  | 0  |     }  | 
2469  | 0  |   }  | 
2470  |  |  | 
2471  | 0  |   inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) { | 
2472  | 0  |     char* op = *op_p;  | 
2473  | 0  |     assert(op >= op_base_);  | 
2474  |  |     // Check if we try to append from before the start of the buffer.  | 
2475  | 0  |     if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) ||  | 
2476  | 0  |                             static_cast<size_t>(op - op_base_) < offset ||  | 
2477  | 0  |                             op >= op_limit_min_slop_ || offset < len)) { | 
2478  | 0  |       if (offset == 0) return false;  | 
2479  | 0  |       if (SNAPPY_PREDICT_FALSE(static_cast<size_t>(op - op_base_) < offset ||  | 
2480  | 0  |                               op + len > op_limit_)) { | 
2481  | 0  |         op_ptr_ = op;  | 
2482  | 0  |         bool res = SlowAppendFromSelf(offset, len);  | 
2483  | 0  |         *op_p = op_ptr_;  | 
2484  | 0  |         return res;  | 
2485  | 0  |       }  | 
2486  | 0  |       *op_p = IncrementalCopy(op - offset, op, op + len, op_limit_);  | 
2487  | 0  |       return true;  | 
2488  | 0  |     }  | 
2489  |  |     // Fast path  | 
2490  | 0  |     char* const op_end = op + len;  | 
2491  | 0  |     std::memmove(op, op - offset, kSlopBytes);  | 
2492  | 0  |     *op_p = op_end;  | 
2493  | 0  |     return true;  | 
2494  | 0  |   }  | 
2495  |  |  | 
2496  |  |   // Called at the end of decompression. We ask the allocator to  | 
2497  |  |   // write all blocks to the sink.  | 
2498  | 0  |   inline void Flush() { allocator_.Flush(Produced()); } | 
2499  |  | };  | 
2500  |  |  | 
2501  |  | template <typename Allocator>  | 
2502  | 0  | bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) { | 
2503  | 0  |   size_t avail = op_limit_ - op_ptr_;  | 
2504  | 0  |   while (len > avail) { | 
2505  |  |     // Completely fill this block  | 
2506  | 0  |     std::memcpy(op_ptr_, ip, avail);  | 
2507  | 0  |     op_ptr_ += avail;  | 
2508  | 0  |     assert(op_limit_ - op_ptr_ == 0);  | 
2509  | 0  |     full_size_ += (op_ptr_ - op_base_);  | 
2510  | 0  |     len -= avail;  | 
2511  | 0  |     ip += avail;  | 
2512  |  |  | 
2513  |  |     // Bounds check  | 
2514  | 0  |     if (full_size_ + len > expected_) return false;  | 
2515  |  |  | 
2516  |  |     // Make new block  | 
2517  | 0  |     size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);  | 
2518  | 0  |     op_base_ = allocator_.Allocate(bsize);  | 
2519  | 0  |     op_ptr_ = op_base_;  | 
2520  | 0  |     op_limit_ = op_base_ + bsize;  | 
2521  | 0  |     op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, bsize);  | 
2522  |  |  | 
2523  | 0  |     blocks_.push_back(op_base_);  | 
2524  | 0  |     avail = bsize;  | 
2525  | 0  |   }  | 
2526  |  |  | 
2527  | 0  |   std::memcpy(op_ptr_, ip, len);  | 
2528  | 0  |   op_ptr_ += len;  | 
2529  | 0  |   return true;  | 
2530  | 0  | }  | 
2531  |  |  | 
2532  |  | template <typename Allocator>  | 
2533  |  | bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,  | 
2534  | 0  |                                                          size_t len) { | 
2535  |  |   // Overflow check  | 
2536  |  |   // See SnappyArrayWriter::AppendFromSelf for an explanation of  | 
2537  |  |   // the "offset - 1u" trick.  | 
2538  | 0  |   const size_t cur = Size();  | 
2539  | 0  |   if (offset - 1u >= cur) return false;  | 
2540  | 0  |   if (expected_ - cur < len) return false;  | 
2541  |  |  | 
2542  |  |   // Currently we shouldn't ever hit this path because Compress() chops the  | 
2543  |  |   // input into blocks and does not create cross-block copies. However, it is  | 
2544  |  |   // nice if we do not rely on that, since we can get better compression if we  | 
2545  |  |   // allow cross-block copies and thus might want to change the compressor in  | 
2546  |  |   // the future.  | 
2547  |  |   // TODO: Replace this with a properly optimized path. This is not  | 
2548  |  |   // triggered right now, but it is so slow that it would regress  | 
2549  |  |   // performance unacceptably if triggered.  | 
2550  | 0  |   size_t src = cur - offset;  | 
2551  | 0  |   char* op = op_ptr_;  | 
2552  | 0  |   while (len-- > 0) { | 
2553  | 0  |     char c = blocks_[src >> kBlockLog][src & (kBlockSize - 1)];  | 
2554  | 0  |     if (!Append(&c, 1, &op)) { | 
2555  | 0  |       op_ptr_ = op;  | 
2556  | 0  |       return false;  | 
2557  | 0  |     }  | 
2558  | 0  |     src++;  | 
2559  | 0  |   }  | 
2560  | 0  |   op_ptr_ = op;  | 
2561  | 0  |   return true;  | 
2562  | 0  | }  | 
2563  |  |  | 
2564  |  | class SnappySinkAllocator { | 
2565  |  |  public:  | 
2566  | 0  |   explicit SnappySinkAllocator(Sink* dest) : dest_(dest) {} | 
2567  |  |  | 
2568  | 0  |   char* Allocate(int size) { | 
2569  | 0  |     Datablock block(new char[size], size);  | 
2570  | 0  |     blocks_.push_back(block);  | 
2571  | 0  |     return block.data;  | 
2572  | 0  |   }  | 
2573  |  |  | 
2574  |  |   // We flush only at the end, because the writer wants  | 
2575  |  |   // random access to the blocks and once we hand the  | 
2576  |  |   // block over to the sink, we can't access it anymore.  | 
2577  |  |   // Also, we only hand the sink as many bytes as were actually  | 
2578  |  |   // written to the blocks.  | 
2579  | 0  |   void Flush(size_t size) { | 
2580  | 0  |     size_t size_written = 0;  | 
2581  | 0  |     for (Datablock& block : blocks_) { | 
2582  | 0  |       size_t block_size = std::min<size_t>(block.size, size - size_written);  | 
2583  | 0  |       dest_->AppendAndTakeOwnership(block.data, block_size,  | 
2584  | 0  |                                     &SnappySinkAllocator::Deleter, NULL);  | 
2585  | 0  |       size_written += block_size;  | 
2586  | 0  |     }  | 
2587  | 0  |     blocks_.clear();  | 
2588  | 0  |   }  | 
2589  |  |  | 
2590  |  |  private:  | 
2591  |  |   struct Datablock { | 
2592  |  |     char* data;  | 
2593  |  |     size_t size;  | 
2594  | 0  |     Datablock(char* p, size_t s) : data(p), size(s) {} | 
2595  |  |   };  | 
2596  |  |  | 
2597  | 0  |   static void Deleter(void* arg, const char* bytes, size_t size) { | 
2598  |  |     // TODO: Switch to [[maybe_unused]] when we can assume C++17.  | 
2599  | 0  |     (void)arg;  | 
2600  | 0  |     (void)size;  | 
2601  |  |  | 
2602  | 0  |     delete[] bytes;  | 
2603  | 0  |   }  | 
2604  |  |  | 
2605  |  |   Sink* dest_;  | 
2606  |  |   std::vector<Datablock> blocks_;  | 
2607  |  |  | 
2608  |  |   // Note: copying this object is allowed  | 
2609  |  | };  | 
2610  |  |  | 
2611  | 0  | size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) { | 
2612  | 0  |   SnappySinkAllocator allocator(uncompressed);  | 
2613  | 0  |   SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);  | 
2614  | 0  |   InternalUncompress(compressed, &writer);  | 
2615  | 0  |   return writer.Produced();  | 
2616  | 0  | }  | 
2617  |  |  | 
2618  | 0  | bool Uncompress(Source* compressed, Sink* uncompressed) { | 
2619  |  |   // Read the uncompressed length from the front of the compressed input  | 
2620  | 0  |   SnappyDecompressor decompressor(compressed);  | 
2621  | 0  |   uint32_t uncompressed_len = 0;  | 
2622  | 0  |   if (!decompressor.ReadUncompressedLength(&uncompressed_len)) { | 
2623  | 0  |     return false;  | 
2624  | 0  |   }  | 
2625  |  |  | 
2626  | 0  |   char c;  | 
2627  | 0  |   size_t allocated_size;  | 
2628  | 0  |   char* buf = uncompressed->GetAppendBufferVariable(1, uncompressed_len, &c, 1,  | 
2629  | 0  |                                                     &allocated_size);  | 
2630  |  |  | 
2631  | 0  |   const size_t compressed_len = compressed->Available();  | 
2632  |  |   // If we can get a flat buffer, then use it, otherwise do block by block  | 
2633  |  |   // uncompression  | 
2634  | 0  |   if (allocated_size >= uncompressed_len) { | 
2635  | 0  |     SnappyArrayWriter writer(buf);  | 
2636  | 0  |     bool result = InternalUncompressAllTags(&decompressor, &writer,  | 
2637  | 0  |                                             compressed_len, uncompressed_len);  | 
2638  | 0  |     uncompressed->Append(buf, writer.Produced());  | 
2639  | 0  |     return result;  | 
2640  | 0  |   } else { | 
2641  | 0  |     SnappySinkAllocator allocator(uncompressed);  | 
2642  | 0  |     SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);  | 
2643  | 0  |     return InternalUncompressAllTags(&decompressor, &writer, compressed_len,  | 
2644  | 0  |                                      uncompressed_len);  | 
2645  | 0  |   }  | 
2646  | 0  | }  | 
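Uncompress(Source*, Sink*) needs only Sink::Append from the destination; when the sink cannot hand back a flat buffer, the scattered-writer path delivers blocks through AppendAndTakeOwnership, for which the Sink base class provides a default. A sketch with a hypothetical string-backed Sink (StringSink is not part of snappy):

#include <cassert>
#include <cstddef>
#include <string>

#include "snappy-sinksource.h"
#include "snappy.h"

// Hypothetical Sink that accumulates output in a std::string. Only Append()
// must be overridden; GetAppendBuffer* and AppendAndTakeOwnership have
// default implementations in the base class.
class StringSink : public snappy::Sink {
 public:
  void Append(const char* bytes, size_t n) override { data_.append(bytes, n); }
  const std::string& data() const { return data_; }

 private:
  std::string data_;
};

int main() {
  const std::string input(50000, 'q');
  std::string compressed;
  snappy::Compress(input.data(), input.size(), &compressed);

  snappy::ByteArraySource source(compressed.data(), compressed.size());
  StringSink sink;
  assert(snappy::Uncompress(&source, &sink));
  assert(sink.data() == input);
  return 0;
}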
2647  |  |  | 
2648  |  | }  // namespace snappy  |