Line | Count | Source (jump to first uncovered line) |
1 | | /* match_tpl.h -- find longest match template for compare256 variants |
2 | | * |
3 | | * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler |
4 | | * For conditions of distribution and use, see copyright notice in zlib.h |
5 | | * |
6 | | * Portions copyright (C) 2014-2021 Konstantin Nosov |
7 | | * Fast-zlib optimized longest_match |
8 | | * https://github.com/gildor2/fast_zlib |
9 | | */ |
10 | | |
11 | | #ifndef MATCH_TPL_H |
12 | | #define MATCH_TPL_H |
13 | | |
14 | 0 | #define EARLY_EXIT_TRIGGER_LEVEL 5 |
15 | | |
16 | | #endif |
17 | | |
18 | | /* Set match_start to the longest match starting at the given string and |
19 | | * return its length. Matches shorter or equal to prev_length are discarded, |
20 | | * in which case the result is equal to prev_length and match_start is garbage. |
21 | | * |
22 | | * IN assertions: cur_match is the head of the hash chain for the current |
23 | | * string (strstart) and its distance is <= MAX_DIST, and prev_length >=1 |
24 | | * OUT assertion: the match length is not greater than s->lookahead |
25 | | * |
26 | | * The LONGEST_MATCH_SLOW variant spends more time to attempt to find longer |
27 | | * matches once a match has already been found. |
28 | | */ |
29 | 1.58M | Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { |
30 | 1.58M | unsigned int strstart = s->strstart; |
31 | 1.58M | const unsigned wmask = s->w_mask; |
32 | 1.58M | unsigned char *window = s->window; |
33 | 1.58M | unsigned char *scan = window + strstart; |
34 | 1.58M | Z_REGISTER unsigned char *mbase_start = window; |
35 | 1.58M | Z_REGISTER unsigned char *mbase_end; |
36 | 1.58M | const Pos *prev = s->prev; |
37 | 1.58M | Pos limit; |
38 | | #ifdef LONGEST_MATCH_SLOW |
39 | | Pos limit_base; |
40 | | #else |
41 | | int32_t early_exit; |
42 | | #endif |
43 | 1.58M | uint32_t chain_length, nice_match, best_len, offset; |
44 | 1.58M | uint32_t lookahead = s->lookahead; |
45 | 1.58M | Pos match_offset = 0; |
46 | 1.58M | uint64_t scan_start; |
47 | 1.58M | uint64_t scan_end; |
48 | | |
49 | 1.58M | #define GOTO_NEXT_CHAIN \ |
50 | 1.58M | if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \ |
51 | 49.6k | continue; \ |
52 | 0 | return best_len; |
53 | | |
54 | | /* The code is optimized for STD_MAX_MATCH-2 multiple of 16. */ |
55 | 1.58M | Assert(STD_MAX_MATCH == 258, "Code too clever"); |
56 | | |
57 | 1.58M | best_len = s->prev_length ? s->prev_length : STD_MIN_MATCH-1; |
58 | | |
59 | | /* Calculate read offset which should only extend an extra byte |
60 | | * to find the next best match length. |
61 | | */ |
62 | 1.58M | offset = best_len-1; |
63 | 1.58M | if (best_len >= sizeof(uint32_t)) { |
64 | 2.36k | offset -= 2; |
65 | 2.36k | if (best_len >= sizeof(uint64_t)) |
66 | 2.21k | offset -= 4; |
67 | 2.36k | } |
68 | | |
69 | 1.58M | scan_start = zng_memread_8(scan); |
70 | 1.58M | scan_end = zng_memread_8(scan+offset); |
71 | 1.58M | mbase_end = (mbase_start+offset); |
72 | | |
73 | | /* Do not waste too much time if we already have a good match */ |
74 | 1.58M | chain_length = s->max_chain_length; |
75 | 1.58M | if (best_len >= s->good_match) |
76 | 1.87k | chain_length >>= 2; |
77 | 1.58M | nice_match = (uint32_t)s->nice_match; |
78 | | |
79 | | /* Stop when cur_match becomes <= limit. To simplify the code, |
80 | | * we prevent matches with the string of window index 0 |
81 | | */ |
82 | 1.58M | limit = strstart > MAX_DIST(s) ? (Pos)(strstart - MAX_DIST(s)) : 0; |
83 | | #ifdef LONGEST_MATCH_SLOW |
84 | | limit_base = limit; |
85 | 1.58M | if (best_len >= STD_MIN_MATCH) { |
86 | | /* We're continuing search (lazy evaluation). */ |
87 | 2.36k | uint32_t i, hash; |
88 | 2.36k | Pos pos; |
89 | | |
90 | | /* Find a most distant chain starting from scan with index=1 (index=0 corresponds |
91 | | * to cur_match). We cannot use s->prev[strstart+1,...] immediately, because |
92 | | * these strings are not yet inserted into the hash table. |
93 | | */ |
94 | | hash = s->update_hash(0, scan[1]); |
95 | | hash = s->update_hash(hash, scan[2]); |
96 | | |
97 | 266k | for (i = 3; i <= best_len; i++) { |
98 | 264k | hash = s->update_hash(hash, scan[i]); |
99 | | |
100 | | /* If we're starting with best_len >= 3, we can use offset search. */ |
101 | 264k | pos = s->head[hash]; |
102 | 264k | if (pos < cur_match) { |
103 | 0 | match_offset = (Pos)(i - 2); |
104 | 0 | cur_match = pos; |
105 | 0 | } |
106 | 264k | } |
107 | | |
108 | | /* Update offset-dependent variables */ |
109 | 2.36k | limit = limit_base+match_offset; |
110 | 2.36k | if (cur_match <= limit) |
111 | 0 | goto break_matching; |
112 | 2.36k | mbase_start -= match_offset; |
113 | 2.36k | mbase_end -= match_offset; |
114 | 2.36k | } |
115 | | #else |
116 | 0 | early_exit = s->level < EARLY_EXIT_TRIGGER_LEVEL; |
117 | | #endif |
118 | 1.58M | Assert((unsigned long)strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead"); |
119 | 1.91M | for (;;) { |
120 | 1.91M | if (cur_match >= strstart) |
121 | 0 | break; |
122 | | |
123 | | /* Skip to next match if the match length cannot increase or if the match length is |
124 | | * less than 2. Note that the checks below for insufficient lookahead only occur |
125 | | * occasionally for performance reasons. |
126 | | * Therefore uninitialized memory will be accessed and conditional jumps will be made |
127 | | * that depend on those values. However the length of the match is limited to the |
128 | | * lookahead, so the output of deflate is not affected by the uninitialized values. |
129 | | */ |
130 | 1.91M | if (best_len < sizeof(uint32_t)) { |
131 | 1.58M | for (;;) { |
132 | 1.58M | if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 && |
133 | 1.58M | zng_memcmp_2(mbase_start+cur_match, &scan_start) == 0) |
134 | 1.58M | break; |
135 | 328 | GOTO_NEXT_CHAIN; |
136 | 328 | } |
137 | 1.58M | } else if (best_len >= sizeof(uint64_t)) { |
138 | 337k | for (;;) { |
139 | 337k | if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 && |
140 | 337k | zng_memcmp_8(mbase_start+cur_match, &scan_start) == 0) |
141 | 319k | break; |
142 | 17.8k | GOTO_NEXT_CHAIN; |
143 | 17.8k | } |
144 | 319k | } else { |
145 | 5.30k | for (;;) { |
146 | 5.30k | if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 && |
147 | 5.30k | zng_memcmp_4(mbase_start+cur_match, &scan_start) == 0) |
148 | 4.75k | break; |
149 | 558 | GOTO_NEXT_CHAIN; |
150 | 558 | } |
151 | 4.75k | } |
152 | 1.91M | uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2; |
153 | 1.91M | Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan"); |
154 | | |
155 | 1.91M | if (len > best_len) { |
156 | 1.88M | uint32_t match_start = cur_match - match_offset; |
157 | 1.88M | s->match_start = match_start; |
158 | | |
159 | | /* Do not look for matches beyond the end of the input. */ |
160 | 1.88M | if (len > lookahead) |
161 | 4.83k | return lookahead; |
162 | 1.87M | best_len = len; |
163 | 1.87M | if (best_len >= nice_match) |
164 | 1.58M | return best_len; |
165 | | |
166 | 292k | offset = best_len-1; |
167 | 292k | if (best_len >= sizeof(uint32_t)) { |
168 | 291k | offset -= 2; |
169 | 291k | if (best_len >= sizeof(uint64_t)) |
170 | 287k | offset -= 4; |
171 | 291k | } |
172 | | |
173 | 292k | scan_end = zng_memread_8(scan+offset); |
174 | | |
175 | | #ifdef LONGEST_MATCH_SLOW |
176 | | /* Look for a better string offset */ |
177 | 292k | if (UNLIKELY(len > STD_MIN_MATCH && match_start + len < strstart)) { |
178 | 291k | Pos pos, next_pos; |
179 | 291k | uint32_t i, hash; |
180 | 291k | unsigned char *scan_endstr; |
181 | | |
182 | | /* Go back to offset 0 */ |
183 | | cur_match -= match_offset; |
184 | | match_offset = 0; |
185 | | next_pos = cur_match; |
186 | 37.7M | for (i = 0; i <= len - STD_MIN_MATCH; i++) { |
187 | 37.5M | pos = prev[(cur_match + i) & wmask]; |
188 | 37.5M | if (pos < next_pos) { |
189 | | /* Hash chain is more distant, use it */ |
190 | 291k | if (pos <= limit_base + i) |
191 | 0 | goto break_matching; |
192 | 291k | next_pos = pos; |
193 | 291k | match_offset = (Pos)i; |
194 | 291k | } |
195 | 37.5M | } |
196 | | /* Switch cur_match to next_pos chain */ |
197 | 291k | cur_match = next_pos; |
198 | | |
199 | | /* Try hash head at len-(STD_MIN_MATCH-1) position to see if we could get |
200 | | * a better cur_match at the end of string. Using (STD_MIN_MATCH-1) lets |
201 | | * us include one more byte into hash - the byte which will be checked |
202 | | * in main loop now, and which allows to grow match by 1. |
203 | | */ |
204 | 291k | scan_endstr = scan + len - (STD_MIN_MATCH+1); |
205 | | |
206 | 291k | hash = s->update_hash(0, scan_endstr[0]); |
207 | 291k | hash = s->update_hash(hash, scan_endstr[1]); |
208 | 291k | hash = s->update_hash(hash, scan_endstr[2]); |
209 | | |
210 | 291k | pos = s->head[hash]; |
211 | 291k | if (pos < cur_match) { |
212 | 0 | match_offset = (Pos)(len - (STD_MIN_MATCH+1)); |
213 | 0 | if (pos <= limit_base + match_offset) |
214 | 0 | goto break_matching; |
215 | 0 | cur_match = pos; |
216 | 0 | } |
217 | | |
218 | | /* Update offset-dependent variables */ |
219 | 291k | limit = limit_base+match_offset; |
220 | 291k | mbase_start = window-match_offset; |
221 | 291k | mbase_end = (mbase_start+offset); |
222 | 291k | continue; |
223 | 291k | } |
224 | 1.14k | #endif |
225 | 1.14k | mbase_end = (mbase_start+offset); |
226 | 1.14k | } |
227 | | #ifndef LONGEST_MATCH_SLOW |
228 | 0 | else if (UNLIKELY(early_exit)) { |
229 | | /* The probability of finding a match later if we here is pretty low, so for |
230 | | * performance it's best to outright stop here for the lower compression levels |
231 | | */ |
232 | 0 | break; |
233 | 0 | } |
234 | 0 | #endif |
235 | 30.9k | GOTO_NEXT_CHAIN; |
236 | 30.9k | } |
237 | 0 | return best_len; |
238 | | |
239 | | #ifdef LONGEST_MATCH_SLOW |
240 | 0 | break_matching: |
241 | | |
242 | 0 | if (best_len < s->lookahead) |
243 | 0 | return best_len; |
244 | | |
245 | 0 | return s->lookahead; |
246 | | #endif |
247 | 0 | } Unexecuted instantiation: longest_match_sse2 Unexecuted instantiation: longest_match_slow_sse2 Unexecuted instantiation: longest_match_avx2 Line | Count | Source | 29 | 1.58M | Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { | 30 | 1.58M | unsigned int strstart = s->strstart; | 31 | 1.58M | const unsigned wmask = s->w_mask; | 32 | 1.58M | unsigned char *window = s->window; | 33 | 1.58M | unsigned char *scan = window + strstart; | 34 | 1.58M | Z_REGISTER unsigned char *mbase_start = window; | 35 | 1.58M | Z_REGISTER unsigned char *mbase_end; | 36 | 1.58M | const Pos *prev = s->prev; | 37 | 1.58M | Pos limit; | 38 | 1.58M | #ifdef LONGEST_MATCH_SLOW | 39 | 1.58M | Pos limit_base; | 40 | | #else | 41 | | int32_t early_exit; | 42 | | #endif | 43 | 1.58M | uint32_t chain_length, nice_match, best_len, offset; | 44 | 1.58M | uint32_t lookahead = s->lookahead; | 45 | 1.58M | Pos match_offset = 0; | 46 | 1.58M | uint64_t scan_start; | 47 | 1.58M | uint64_t scan_end; | 48 | | | 49 | 1.58M | #define GOTO_NEXT_CHAIN \ | 50 | 1.58M | if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \ | 51 | 1.58M | continue; \ | 52 | 1.58M | return best_len; | 53 | | | 54 | | /* The code is optimized for STD_MAX_MATCH-2 multiple of 16. */ | 55 | 1.58M | Assert(STD_MAX_MATCH == 258, "Code too clever"); | 56 | | | 57 | 1.58M | best_len = s->prev_length ? s->prev_length : STD_MIN_MATCH-1; | 58 | | | 59 | | /* Calculate read offset which should only extend an extra byte | 60 | | * to find the next best match length. | 61 | | */ | 62 | 1.58M | offset = best_len-1; | 63 | 1.58M | if (best_len >= sizeof(uint32_t)) { | 64 | 2.36k | offset -= 2; | 65 | 2.36k | if (best_len >= sizeof(uint64_t)) | 66 | 2.21k | offset -= 4; | 67 | 2.36k | } | 68 | | | 69 | 1.58M | scan_start = zng_memread_8(scan); | 70 | 1.58M | scan_end = zng_memread_8(scan+offset); | 71 | 1.58M | mbase_end = (mbase_start+offset); | 72 | | | 73 | | /* Do not waste too much time if we already have a good match */ | 74 | 1.58M | chain_length = s->max_chain_length; | 75 | 1.58M | if (best_len >= s->good_match) | 76 | 1.87k | chain_length >>= 2; | 77 | 1.58M | nice_match = (uint32_t)s->nice_match; | 78 | | | 79 | | /* Stop when cur_match becomes <= limit. To simplify the code, | 80 | | * we prevent matches with the string of window index 0 | 81 | | */ | 82 | 1.58M | limit = strstart > MAX_DIST(s) ? (Pos)(strstart - MAX_DIST(s)) : 0; | 83 | 1.58M | #ifdef LONGEST_MATCH_SLOW | 84 | 1.58M | limit_base = limit; | 85 | 1.58M | if (best_len >= STD_MIN_MATCH) { | 86 | | /* We're continuing search (lazy evaluation). */ | 87 | 2.36k | uint32_t i, hash; | 88 | 2.36k | Pos pos; | 89 | | | 90 | | /* Find a most distant chain starting from scan with index=1 (index=0 corresponds | 91 | | * to cur_match). We cannot use s->prev[strstart+1,...] immediately, because | 92 | | * these strings are not yet inserted into the hash table. | 93 | | */ | 94 | 2.36k | hash = s->update_hash(0, scan[1]); | 95 | 2.36k | hash = s->update_hash(hash, scan[2]); | 96 | | | 97 | 266k | for (i = 3; i <= best_len; i++) { | 98 | 264k | hash = s->update_hash(hash, scan[i]); | 99 | | | 100 | | /* If we're starting with best_len >= 3, we can use offset search. */ | 101 | 264k | pos = s->head[hash]; | 102 | 264k | if (pos < cur_match) { | 103 | 0 | match_offset = (Pos)(i - 2); | 104 | 0 | cur_match = pos; | 105 | 0 | } | 106 | 264k | } | 107 | | | 108 | | /* Update offset-dependent variables */ | 109 | 2.36k | limit = limit_base+match_offset; | 110 | 2.36k | if (cur_match <= limit) | 111 | 0 | goto break_matching; | 112 | 2.36k | mbase_start -= match_offset; | 113 | 2.36k | mbase_end -= match_offset; | 114 | 2.36k | } | 115 | | #else | 116 | | early_exit = s->level < EARLY_EXIT_TRIGGER_LEVEL; | 117 | | #endif | 118 | 1.58M | Assert((unsigned long)strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead"); | 119 | 1.91M | for (;;) { | 120 | 1.91M | if (cur_match >= strstart) | 121 | 0 | break; | 122 | | | 123 | | /* Skip to next match if the match length cannot increase or if the match length is | 124 | | * less than 2. Note that the checks below for insufficient lookahead only occur | 125 | | * occasionally for performance reasons. | 126 | | * Therefore uninitialized memory will be accessed and conditional jumps will be made | 127 | | * that depend on those values. However the length of the match is limited to the | 128 | | * lookahead, so the output of deflate is not affected by the uninitialized values. | 129 | | */ | 130 | 1.91M | if (best_len < sizeof(uint32_t)) { | 131 | 1.58M | for (;;) { | 132 | 1.58M | if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 && | 133 | 1.58M | zng_memcmp_2(mbase_start+cur_match, &scan_start) == 0) | 134 | 1.58M | break; | 135 | 328 | GOTO_NEXT_CHAIN; | 136 | 328 | } | 137 | 1.58M | } else if (best_len >= sizeof(uint64_t)) { | 138 | 337k | for (;;) { | 139 | 337k | if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 && | 140 | 337k | zng_memcmp_8(mbase_start+cur_match, &scan_start) == 0) | 141 | 319k | break; | 142 | 17.8k | GOTO_NEXT_CHAIN; | 143 | 17.8k | } | 144 | 319k | } else { | 145 | 5.30k | for (;;) { | 146 | 5.30k | if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 && | 147 | 5.30k | zng_memcmp_4(mbase_start+cur_match, &scan_start) == 0) | 148 | 4.75k | break; | 149 | 558 | GOTO_NEXT_CHAIN; | 150 | 558 | } | 151 | 4.75k | } | 152 | 1.91M | uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2; | 153 | 1.91M | Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan"); | 154 | | | 155 | 1.91M | if (len > best_len) { | 156 | 1.88M | uint32_t match_start = cur_match - match_offset; | 157 | 1.88M | s->match_start = match_start; | 158 | | | 159 | | /* Do not look for matches beyond the end of the input. */ | 160 | 1.88M | if (len > lookahead) | 161 | 4.83k | return lookahead; | 162 | 1.87M | best_len = len; | 163 | 1.87M | if (best_len >= nice_match) | 164 | 1.58M | return best_len; | 165 | | | 166 | 292k | offset = best_len-1; | 167 | 292k | if (best_len >= sizeof(uint32_t)) { | 168 | 291k | offset -= 2; | 169 | 291k | if (best_len >= sizeof(uint64_t)) | 170 | 287k | offset -= 4; | 171 | 291k | } | 172 | | | 173 | 292k | scan_end = zng_memread_8(scan+offset); | 174 | | | 175 | 292k | #ifdef LONGEST_MATCH_SLOW | 176 | | /* Look for a better string offset */ | 177 | 292k | if (UNLIKELY(len > STD_MIN_MATCH && match_start + len < strstart)) { | 178 | 291k | Pos pos, next_pos; | 179 | 291k | uint32_t i, hash; | 180 | 291k | unsigned char *scan_endstr; | 181 | | | 182 | | /* Go back to offset 0 */ | 183 | 291k | cur_match -= match_offset; | 184 | 291k | match_offset = 0; | 185 | 291k | next_pos = cur_match; | 186 | 37.7M | for (i = 0; i <= len - STD_MIN_MATCH; i++) { | 187 | 37.5M | pos = prev[(cur_match + i) & wmask]; | 188 | 37.5M | if (pos < next_pos) { | 189 | | /* Hash chain is more distant, use it */ | 190 | 291k | if (pos <= limit_base + i) | 191 | 0 | goto break_matching; | 192 | 291k | next_pos = pos; | 193 | 291k | match_offset = (Pos)i; | 194 | 291k | } | 195 | 37.5M | } | 196 | | /* Switch cur_match to next_pos chain */ | 197 | 291k | cur_match = next_pos; | 198 | | | 199 | | /* Try hash head at len-(STD_MIN_MATCH-1) position to see if we could get | 200 | | * a better cur_match at the end of string. Using (STD_MIN_MATCH-1) lets | 201 | | * us include one more byte into hash - the byte which will be checked | 202 | | * in main loop now, and which allows to grow match by 1. | 203 | | */ | 204 | 291k | scan_endstr = scan + len - (STD_MIN_MATCH+1); | 205 | | | 206 | 291k | hash = s->update_hash(0, scan_endstr[0]); | 207 | 291k | hash = s->update_hash(hash, scan_endstr[1]); | 208 | 291k | hash = s->update_hash(hash, scan_endstr[2]); | 209 | | | 210 | 291k | pos = s->head[hash]; | 211 | 291k | if (pos < cur_match) { | 212 | 0 | match_offset = (Pos)(len - (STD_MIN_MATCH+1)); | 213 | 0 | if (pos <= limit_base + match_offset) | 214 | 0 | goto break_matching; | 215 | 0 | cur_match = pos; | 216 | 0 | } | 217 | | | 218 | | /* Update offset-dependent variables */ | 219 | 291k | limit = limit_base+match_offset; | 220 | 291k | mbase_start = window-match_offset; | 221 | 291k | mbase_end = (mbase_start+offset); | 222 | 291k | continue; | 223 | 291k | } | 224 | 1.14k | #endif | 225 | 1.14k | mbase_end = (mbase_start+offset); | 226 | 1.14k | } | 227 | | #ifndef LONGEST_MATCH_SLOW | 228 | | else if (UNLIKELY(early_exit)) { | 229 | | /* The probability of finding a match later if we here is pretty low, so for | 230 | | * performance it's best to outright stop here for the lower compression levels | 231 | | */ | 232 | | break; | 233 | | } | 234 | | #endif | 235 | 30.9k | GOTO_NEXT_CHAIN; | 236 | 30.9k | } | 237 | 0 | return best_len; | 238 | | | 239 | 0 | #ifdef LONGEST_MATCH_SLOW | 240 | 0 | break_matching: | 241 | |
| 242 | 0 | if (best_len < s->lookahead) | 243 | 0 | return best_len; | 244 | | | 245 | 0 | return s->lookahead; | 246 | 0 | #endif | 247 | 0 | } |
Unexecuted instantiation: longest_match_avx512 Unexecuted instantiation: longest_match_slow_avx512 |
248 | | |
249 | | #undef LONGEST_MATCH_SLOW |
250 | | #undef LONGEST_MATCH |