/work/_deps/deflate-src/lib/hc_matchfinder.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists |
3 | | * |
4 | | * Copyright 2016 Eric Biggers |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person |
7 | | * obtaining a copy of this software and associated documentation |
8 | | * files (the "Software"), to deal in the Software without |
9 | | * restriction, including without limitation the rights to use, |
10 | | * copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | | * copies of the Software, and to permit persons to whom the |
12 | | * Software is furnished to do so, subject to the following |
13 | | * conditions: |
14 | | * |
15 | | * The above copyright notice and this permission notice shall be |
16 | | * included in all copies or substantial portions of the Software. |
17 | | * |
18 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
19 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
20 | | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
21 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
22 | | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
23 | | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
24 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
25 | | * OTHER DEALINGS IN THE SOFTWARE. |
26 | | * |
27 | | * --------------------------------------------------------------------------- |
28 | | * |
29 | | * Algorithm |
30 | | * |
31 | | * This is a Hash Chains (hc) based matchfinder. |
32 | | * |
33 | | * The main data structure is a hash table where each hash bucket contains a |
34 | | * linked list (or "chain") of sequences whose first 4 bytes share the same hash |
35 | | * code. Each sequence is identified by its starting position in the input |
36 | | * buffer. |
37 | | * |
38 | | * The algorithm processes the input buffer sequentially. At each byte |
39 | | * position, the hash code of the first 4 bytes of the sequence beginning at |
40 | | * that position (the sequence being matched against) is computed. This |
41 | | * identifies the hash bucket to use for that position. Then, this hash |
42 | | * bucket's linked list is searched for matches. Then, a new linked list node |
43 | | * is created to represent the current sequence and is prepended to the list. |
44 | | * |
45 | | * This algorithm has several useful properties: |
46 | | * |
47 | | * - It only finds true Lempel-Ziv matches; i.e., those where the matching |
48 | | * sequence occurs prior to the sequence being matched against. |
49 | | * |
50 | | * - The sequences in each linked list are always sorted by decreasing starting |
51 | | * position. Therefore, the closest (smallest offset) matches are found |
52 | | * first, which in many compression formats tend to be the cheapest to encode. |
53 | | * |
54 | | * - Although fast running time is not guaranteed due to the possibility of the |
55 | | * lists getting very long, the worst degenerate behavior can be easily |
56 | | * prevented by capping the number of nodes searched at each position. |
57 | | * |
58 | | * - If the compressor decides not to search for matches at a certain position, |
59 | | * then that position can be quickly inserted without searching the list. |
60 | | * |
61 | | * - The algorithm is adaptable to sliding windows: just store the positions |
62 | | * relative to a "base" value that is updated from time to time, and stop |
63 | | * searching each list when the sequences get too far away. |
64 | | * |
65 | | * ---------------------------------------------------------------------------- |
66 | | * |
67 | | * Optimizations |
68 | | * |
69 | | * The main hash table and chains handle length 4+ matches. Length 3 matches |
70 | | * are handled by a separate hash table with no chains. This works well for |
71 | | * typical "greedy" or "lazy"-style compressors, where length 3 matches are |
72 | | * often only helpful if they have small offsets. Instead of searching a full |
73 | | * chain for length 3+ matches, the algorithm just checks for one close length 3 |
74 | | * match, then focuses on finding length 4+ matches. |
75 | | * |
76 | | * The longest_match() and skip_bytes() functions are inlined into the |
77 | | * compressors that use them. This isn't just about saving the overhead of a |
78 | | * function call. These functions are intended to be called from the inner |
79 | | * loops of compressors, where giving the compiler more control over register |
80 | | * allocation is very helpful. There is also significant benefit to be gained |
81 | | * from allowing the CPU to predict branches independently at each call site. |
82 | | * For example, "lazy"-style compressors can be written with two calls to |
83 | | * longest_match(), each of which starts with a different 'best_len' and |
84 | | * therefore has significantly different performance characteristics. |
85 | | * |
86 | | * Although any hash function can be used, a multiplicative hash is fast and |
87 | | * works well. |
88 | | * |
89 | | * On some processors, it is significantly faster to extend matches by whole |
90 | | * words (32 or 64 bits) instead of by individual bytes. For this to be the |
91 | | * case, the processor must implement unaligned memory accesses efficiently and |
92 | | * must have either a fast "find first set bit" instruction or a fast "find last |
93 | | * set bit" instruction, depending on the processor's endianness. |
94 | | * |
95 | | * The code uses one loop for finding the first match and one loop for finding a |
96 | | * longer match. Each of these loops is tuned for its respective task and in |
97 | | * combination are faster than a single generalized loop that handles both |
98 | | * tasks. |
99 | | * |
100 | | * The code also uses a tight inner loop that only compares the last and first |
101 | | * bytes of a potential match. It is only when these bytes match that a full |
102 | | * match extension is attempted. |
103 | | * |
104 | | * ---------------------------------------------------------------------------- |
105 | | */ |
106 | | |
107 | | #ifndef LIB_HC_MATCHFINDER_H |
108 | | #define LIB_HC_MATCHFINDER_H |
109 | | |
110 | | #include "matchfinder_common.h" |
111 | | |
112 | 0 | #define HC_MATCHFINDER_HASH3_ORDER 15 |
113 | 0 | #define HC_MATCHFINDER_HASH4_ORDER 16 |
114 | | |
115 | | #define HC_MATCHFINDER_TOTAL_HASH_SIZE \ |
116 | 0 | (((1UL << HC_MATCHFINDER_HASH3_ORDER) + \ |
117 | 0 | (1UL << HC_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t)) |
118 | | |
119 | | struct MATCHFINDER_ALIGNED hc_matchfinder { |
120 | | |
121 | | /* The hash table for finding length 3 matches */ |
122 | | mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; |
123 | | |
124 | | /* The hash table which contains the first nodes of the linked lists for |
125 | | * finding length 4+ matches */ |
126 | | mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER]; |
127 | | |
128 | | /* The "next node" references for the linked lists. The "next node" of |
129 | | * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ |
130 | | mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE]; |
131 | | }; |
132 | | |
133 | | /* Prepare the matchfinder for a new input buffer. */ |
134 | | static forceinline void |
135 | | hc_matchfinder_init(struct hc_matchfinder *mf) |
136 | 0 | { |
137 | 0 | STATIC_ASSERT(HC_MATCHFINDER_TOTAL_HASH_SIZE % |
138 | 0 | MATCHFINDER_SIZE_ALIGNMENT == 0); |
139 | |
|
140 | 0 | matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_SIZE); |
141 | 0 | } |
142 | | |
143 | | static forceinline void |
144 | | hc_matchfinder_slide_window(struct hc_matchfinder *mf) |
145 | 0 | { |
146 | 0 | STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); |
147 | |
|
148 | 0 | matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); |
149 | 0 | } |
150 | | |
151 | | /* |
152 | | * Find the longest match longer than 'best_len' bytes. |
153 | | * |
154 | | * @mf |
155 | | * The matchfinder structure. |
156 | | * @in_base_p |
157 | | * Location of a pointer which points to the place in the input data the |
158 | | * matchfinder currently stores positions relative to. This may be updated |
159 | | * by this function. |
160 | | * @in_next |
161 | | * Pointer to the next position in the input buffer, i.e. the sequence |
162 | | * being matched against. |
163 | | * @best_len |
164 | | * Require a match longer than this length. |
165 | | * @max_len |
166 | | * The maximum permissible match length at this position. |
167 | | * @nice_len |
168 | | * Stop searching if a match of at least this length is found. |
169 | | * Must be <= @max_len. |
170 | | * @max_search_depth |
171 | | * Limit on the number of potential matches to consider. Must be >= 1. |
172 | | * @next_hashes |
173 | | * The precomputed hash codes for the sequence beginning at @in_next. |
174 | | * These will be used and then updated with the precomputed hashcodes for |
175 | | * the sequence beginning at @in_next + 1. |
176 | | * @offset_ret |
177 | | * If a match is found, its offset is returned in this location. |
178 | | * |
179 | | * Return the length of the match found, or 'best_len' if no match longer than |
180 | | * 'best_len' was found. |
181 | | */ |
182 | | static forceinline u32 |
183 | | hc_matchfinder_longest_match(struct hc_matchfinder * const mf, |
184 | | const u8 ** const in_base_p, |
185 | | const u8 * const in_next, |
186 | | u32 best_len, |
187 | | const u32 max_len, |
188 | | const u32 nice_len, |
189 | | const u32 max_search_depth, |
190 | | u32 * const next_hashes, |
191 | | u32 * const offset_ret) |
192 | 0 | { |
193 | 0 | u32 depth_remaining = max_search_depth; |
194 | 0 | const u8 *best_matchptr = in_next; |
195 | 0 | mf_pos_t cur_node3, cur_node4; |
196 | 0 | u32 hash3, hash4; |
197 | 0 | u32 next_hashseq; |
198 | 0 | u32 seq4; |
199 | 0 | const u8 *matchptr; |
200 | 0 | u32 len; |
201 | 0 | u32 cur_pos = in_next - *in_base_p; |
202 | 0 | const u8 *in_base; |
203 | 0 | mf_pos_t cutoff; |
204 | |
|
205 | 0 | if (cur_pos == MATCHFINDER_WINDOW_SIZE) { |
206 | 0 | hc_matchfinder_slide_window(mf); |
207 | 0 | *in_base_p += MATCHFINDER_WINDOW_SIZE; |
208 | 0 | cur_pos = 0; |
209 | 0 | } |
210 | |
|
211 | 0 | in_base = *in_base_p; |
212 | 0 | cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; |
213 | |
|
214 | 0 | if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */ |
215 | 0 | goto out; |
216 | | |
217 | | /* Get the precomputed hash codes. */ |
218 | 0 | hash3 = next_hashes[0]; |
219 | 0 | hash4 = next_hashes[1]; |
220 | | |
221 | | /* From the hash buckets, get the first node of each linked list. */ |
222 | 0 | cur_node3 = mf->hash3_tab[hash3]; |
223 | 0 | cur_node4 = mf->hash4_tab[hash4]; |
224 | | |
225 | | /* Update for length 3 matches. This replaces the singleton node in the |
226 | | * 'hash3' bucket with the node for the current sequence. */ |
227 | 0 | mf->hash3_tab[hash3] = cur_pos; |
228 | | |
229 | | /* Update for length 4 matches. This prepends the node for the current |
230 | | * sequence to the linked list in the 'hash4' bucket. */ |
231 | 0 | mf->hash4_tab[hash4] = cur_pos; |
232 | 0 | mf->next_tab[cur_pos] = cur_node4; |
233 | | |
234 | | /* Compute the next hash codes. */ |
235 | 0 | next_hashseq = get_unaligned_le32(in_next + 1); |
236 | 0 | next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); |
237 | 0 | next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); |
238 | 0 | prefetchw(&mf->hash3_tab[next_hashes[0]]); |
239 | 0 | prefetchw(&mf->hash4_tab[next_hashes[1]]); |
240 | |
|
241 | 0 | if (best_len < 4) { /* No match of length >= 4 found yet? */ |
242 | | |
243 | | /* Check for a length 3 match if needed. */ |
244 | |
|
245 | 0 | if (cur_node3 <= cutoff) |
246 | 0 | goto out; |
247 | | |
248 | 0 | seq4 = load_u32_unaligned(in_next); |
249 | |
|
250 | 0 | if (best_len < 3) { |
251 | 0 | matchptr = &in_base[cur_node3]; |
252 | 0 | if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) { |
253 | 0 | best_len = 3; |
254 | 0 | best_matchptr = matchptr; |
255 | 0 | } |
256 | 0 | } |
257 | | |
258 | | /* Check for a length 4 match. */ |
259 | |
|
260 | 0 | if (cur_node4 <= cutoff) |
261 | 0 | goto out; |
262 | | |
263 | 0 | for (;;) { |
264 | | /* No length 4 match found yet. Check the first 4 bytes. */ |
265 | 0 | matchptr = &in_base[cur_node4]; |
266 | |
|
267 | 0 | if (load_u32_unaligned(matchptr) == seq4) |
268 | 0 | break; |
269 | | |
270 | | /* The first 4 bytes did not match. Keep trying. */ |
271 | 0 | cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; |
272 | 0 | if (cur_node4 <= cutoff || !--depth_remaining) |
273 | 0 | goto out; |
274 | 0 | } |
275 | | |
276 | | /* Found a match of length >= 4. Extend it to its full length. */ |
277 | 0 | best_matchptr = matchptr; |
278 | 0 | best_len = lz_extend(in_next, best_matchptr, 4, max_len); |
279 | 0 | if (best_len >= nice_len) |
280 | 0 | goto out; |
281 | 0 | cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; |
282 | 0 | if (cur_node4 <= cutoff || !--depth_remaining) |
283 | 0 | goto out; |
284 | 0 | } else { |
285 | 0 | if (cur_node4 <= cutoff || best_len >= nice_len) |
286 | 0 | goto out; |
287 | 0 | } |
288 | | |
289 | | /* Check for matches of length >= 5. */ |
290 | | |
291 | 0 | for (;;) { |
292 | 0 | for (;;) { |
293 | 0 | matchptr = &in_base[cur_node4]; |
294 | | |
295 | | /* Already found a length 4 match. Try for a longer |
296 | | * match; start by checking either the last 4 bytes and |
297 | | * the first 4 bytes, or the last byte. (The last byte, |
298 | | * the one which would extend the match length by 1, is |
299 | | * the most important.) */ |
300 | 0 | #if UNALIGNED_ACCESS_IS_FAST |
301 | 0 | if ((load_u32_unaligned(matchptr + best_len - 3) == |
302 | 0 | load_u32_unaligned(in_next + best_len - 3)) && |
303 | 0 | (load_u32_unaligned(matchptr) == |
304 | 0 | load_u32_unaligned(in_next))) |
305 | | #else |
306 | | if (matchptr[best_len] == in_next[best_len]) |
307 | | #endif |
308 | 0 | break; |
309 | | |
310 | | /* Continue to the next node in the list. */ |
311 | 0 | cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; |
312 | 0 | if (cur_node4 <= cutoff || !--depth_remaining) |
313 | 0 | goto out; |
314 | 0 | } |
315 | | |
316 | 0 | #if UNALIGNED_ACCESS_IS_FAST |
317 | 0 | len = 4; |
318 | | #else |
319 | | len = 0; |
320 | | #endif |
321 | 0 | len = lz_extend(in_next, matchptr, len, max_len); |
322 | 0 | if (len > best_len) { |
323 | | /* This is the new longest match. */ |
324 | 0 | best_len = len; |
325 | 0 | best_matchptr = matchptr; |
326 | 0 | if (best_len >= nice_len) |
327 | 0 | goto out; |
328 | 0 | } |
329 | | |
330 | | /* Continue to the next node in the list. */ |
331 | 0 | cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; |
332 | 0 | if (cur_node4 <= cutoff || !--depth_remaining) |
333 | 0 | goto out; |
334 | 0 | } |
335 | 0 | out: |
336 | 0 | *offset_ret = in_next - best_matchptr; |
337 | 0 | return best_len; |
338 | 0 | } |
339 | | |
340 | | /* |
341 | | * Advance the matchfinder, but don't search for matches. |
342 | | * |
343 | | * @mf |
344 | | * The matchfinder structure. |
345 | | * @in_base_p |
346 | | * Location of a pointer which points to the place in the input data the |
347 | | * matchfinder currently stores positions relative to. This may be updated |
348 | | * by this function. |
349 | | * @in_next |
350 | | * Pointer to the next position in the input buffer. |
351 | | * @in_end |
352 | | * Pointer to the end of the input buffer. |
353 | | * @count |
354 | | * The number of bytes to advance. Must be > 0. |
355 | | * @next_hashes |
356 | | * The precomputed hash codes for the sequence beginning at @in_next. |
357 | | * These will be used and then updated with the precomputed hashcodes for |
358 | | * the sequence beginning at @in_next + @count. |
359 | | */ |
360 | | static forceinline void |
361 | | hc_matchfinder_skip_bytes(struct hc_matchfinder * const mf, |
362 | | const u8 ** const in_base_p, |
363 | | const u8 *in_next, |
364 | | const u8 * const in_end, |
365 | | const u32 count, |
366 | | u32 * const next_hashes) |
367 | 0 | { |
368 | 0 | u32 cur_pos; |
369 | 0 | u32 hash3, hash4; |
370 | 0 | u32 next_hashseq; |
371 | 0 | u32 remaining = count; |
372 | |
|
373 | 0 | if (unlikely(count + 5 > in_end - in_next)) |
374 | 0 | return; |
375 | | |
376 | 0 | cur_pos = in_next - *in_base_p; |
377 | 0 | hash3 = next_hashes[0]; |
378 | 0 | hash4 = next_hashes[1]; |
379 | 0 | do { |
380 | 0 | if (cur_pos == MATCHFINDER_WINDOW_SIZE) { |
381 | 0 | hc_matchfinder_slide_window(mf); |
382 | 0 | *in_base_p += MATCHFINDER_WINDOW_SIZE; |
383 | 0 | cur_pos = 0; |
384 | 0 | } |
385 | 0 | mf->hash3_tab[hash3] = cur_pos; |
386 | 0 | mf->next_tab[cur_pos] = mf->hash4_tab[hash4]; |
387 | 0 | mf->hash4_tab[hash4] = cur_pos; |
388 | |
|
389 | 0 | next_hashseq = get_unaligned_le32(++in_next); |
390 | 0 | hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); |
391 | 0 | hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); |
392 | 0 | cur_pos++; |
393 | 0 | } while (--remaining); |
394 | |
|
395 | 0 | prefetchw(&mf->hash3_tab[hash3]); |
396 | 0 | prefetchw(&mf->hash4_tab[hash4]); |
397 | 0 | next_hashes[0] = hash3; |
398 | 0 | next_hashes[1] = hash4; |
399 | 0 | } |
400 | | |
401 | | #endif /* LIB_HC_MATCHFINDER_H */ |