Line | Count | Source |
1 | | /* match_tpl.h -- find longest match template for compare256 variants |
2 | | * |
3 | | * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler |
4 | | * For conditions of distribution and use, see copyright notice in zlib.h |
5 | | * |
6 | | * Portions copyright (C) 2014-2021 Konstantin Nosov |
7 | | * Fast-zlib optimized longest_match |
8 | | * https://github.com/gildor2/fast_zlib |
9 | | */ |
10 | | |
11 | | #include "insert_string_p.h" |
12 | | |
13 | 15.5M | #define EARLY_EXIT_TRIGGER_LEVEL 5 |
14 | | |
15 | | #define GOTO_NEXT_CHAIN \ |
16 | 227M | if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \ |
17 | 199M | continue; \ |
18 | 28.5M | return best_len; |
19 | | |
20 | | /* Set match_start to the longest match starting at the given string and |
21 | | * return its length. Matches shorter or equal to prev_length are discarded, |
22 | | * in which case the result is equal to prev_length and match_start is garbage. |
23 | | * |
24 | | * IN assertions: cur_match is the head of the hash chain for the current |
25 | | * string (strstart) and its distance is <= MAX_DIST, and prev_length >=1 |
26 | | * OUT assertion: the match length is not greater than s->lookahead |
27 | | * |
28 | | * The LONGEST_MATCH_SLOW variant spends more time to attempt to find longer |
29 | | * matches once a match has already been found. |
30 | | */ |
31 | 30.3M | Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, uint32_t cur_match) { |
32 | 30.3M | const unsigned wmask = W_MASK(s); |
33 | 30.3M | unsigned int strstart = s->strstart; |
34 | 30.3M | const unsigned char *window = s->window; |
35 | 30.3M | const Pos *prev = s->prev; |
36 | | #ifdef LONGEST_MATCH_SLOW |
37 | | const Pos *head = s->head; |
38 | | #endif |
39 | 30.3M | const unsigned char *scan; |
40 | 30.3M | const unsigned char *mbase_start = window; |
41 | 30.3M | const unsigned char *mbase_end; |
42 | 30.3M | uint32_t limit; |
43 | | #ifdef LONGEST_MATCH_SLOW |
44 | | uint32_t limit_base; |
45 | | #else |
46 | | int32_t early_exit; |
47 | | #endif |
48 | 30.3M | uint32_t chain_length = s->max_chain_length; |
49 | 30.3M | uint32_t nice_match = (uint32_t)s->nice_match; |
50 | 30.3M | uint32_t best_len, offset; |
51 | 30.3M | uint32_t lookahead = s->lookahead; |
52 | 30.3M | uint32_t match_offset = 0; |
53 | 30.3M | uint64_t scan_start; |
54 | 30.3M | uint64_t scan_end; |
55 | | |
56 | | /* The code is optimized for STD_MAX_MATCH-2 multiple of 16. */ |
57 | 30.3M | Assert(STD_MAX_MATCH == 258, "Code too clever"); |
58 | | |
59 | 30.3M | scan = window + strstart; |
60 | 30.3M | best_len = s->prev_length ? s->prev_length : STD_MIN_MATCH-1; |
61 | | |
62 | | /* Calculate read offset which should only extend an extra byte |
63 | | * to find the next best match length. |
64 | | */ |
65 | 30.3M | offset = best_len-1; |
66 | 30.3M | if (best_len >= sizeof(uint32_t)) { |
67 | 2.44M | offset -= 2; |
68 | 2.44M | if (best_len >= sizeof(uint64_t)) |
69 | 561k | offset -= 4; |
70 | 2.44M | } |
71 | | |
72 | 30.3M | scan_start = zng_memread_8(scan); |
73 | 30.3M | scan_end = zng_memread_8(scan+offset); |
74 | 30.3M | mbase_end = (mbase_start+offset); |
75 | | |
76 | | /* Do not waste too much time if we already have a good match */ |
77 | 30.3M | if (best_len >= s->good_match) |
78 | 386k | chain_length >>= 2; |
79 | | |
80 | | /* Stop when cur_match becomes <= limit. To simplify the code, |
81 | | * we prevent matches with the string of window index 0 |
82 | | */ |
83 | 30.3M | limit = strstart > MAX_DIST(s) ? (strstart - MAX_DIST(s)) : 0; |
84 | | #ifdef LONGEST_MATCH_SLOW |
85 | | limit_base = limit; |
86 | 14.7M | if (best_len >= STD_MIN_MATCH) { |
87 | | /* We're continuing search (lazy evaluation). */ |
88 | 2.74M | uint32_t hash; |
89 | 2.74M | uint32_t pos; |
90 | | |
91 | | /* Find a most distant chain starting from scan with index=1 (index=0 corresponds |
92 | | * to cur_match). We cannot use s->prev[strstart+1,...] immediately, because |
93 | | * these strings are not yet inserted into the hash table. |
94 | | */ |
95 | | // use update_hash_roll for deflate_slow |
96 | | hash = update_hash_roll(0, scan[1]); |
97 | | hash = update_hash_roll(hash, scan[2]); |
98 | | |
99 | 11.3M | for (uint32_t i = 3; i <= best_len; i++) { |
100 | | // use update_hash_roll for deflate_slow |
101 | 8.63M | hash = update_hash_roll(hash, scan[i]); |
102 | | /* If we're starting with best_len >= 3, we can use offset search. */ |
103 | 8.63M | pos = head[hash]; |
104 | 8.63M | if (pos < cur_match) { |
105 | 2.34M | match_offset = i - 2; |
106 | 2.34M | cur_match = pos; |
107 | 2.34M | } |
108 | 8.63M | } |
109 | | |
110 | | /* Update offset-dependent variables */ |
111 | 2.74M | limit = limit_base+match_offset; |
112 | 2.74M | if (cur_match <= limit) |
113 | 595k | goto break_matching; |
114 | 2.14M | mbase_start -= match_offset; |
115 | 2.14M | mbase_end -= match_offset; |
116 | 2.14M | } |
117 | | #else |
118 | 15.5M | early_exit = s->level < EARLY_EXIT_TRIGGER_LEVEL; |
119 | | #endif |
120 | 14.1M | Assert((unsigned long)strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead"); |
121 | 39.7M | for (;;) { |
122 | 39.7M | if (cur_match >= strstart) |
123 | 25 | break; |
124 | | |
125 | | /* Skip to next match if the match length cannot increase or if the match length is |
126 | | * less than 2. Note that the checks below for insufficient lookahead only occur |
127 | | * occasionally for performance reasons. |
128 | | * Therefore uninitialized memory will be accessed and conditional jumps will be made |
129 | | * that depend on those values. However the length of the match is limited to the |
130 | | * lookahead, so the output of deflate is not affected by the uninitialized values. |
131 | | */ |
132 | 39.7M | if (best_len < sizeof(uint32_t)) { |
133 | 69.9M | for (;;) { |
134 | 69.9M | if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 && |
135 | 14.0M | zng_memcmp_2(mbase_start+cur_match, &scan_start) == 0) |
136 | 8.80M | break; |
137 | 61.1M | GOTO_NEXT_CHAIN; |
138 | 61.1M | } |
139 | 30.2M | } else if (best_len >= sizeof(uint64_t)) { |
140 | 77.4M | for (;;) { |
141 | 77.4M | if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 && |
142 | 3.46M | zng_memcmp_8(mbase_start+cur_match, &scan_start) == 0) |
143 | 2.71M | break; |
144 | 74.7M | GOTO_NEXT_CHAIN; |
145 | 74.7M | } |
146 | 5.46M | } else { |
147 | 83.0M | for (;;) { |
148 | 83.0M | if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 && |
149 | 2.60M | zng_memcmp_4(mbase_start+cur_match, &scan_start) == 0) |
150 | 1.37M | break; |
151 | 81.7M | GOTO_NEXT_CHAIN; |
152 | 81.7M | } |
153 | 5.46M | } |
154 | 12.8M | uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2; |
155 | 12.8M | Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan"); |
156 | | |
157 | 12.8M | if (len > best_len) { |
158 | 11.8M | uint32_t match_start = cur_match - match_offset; |
159 | 11.8M | s->match_start = match_start; |
160 | | |
161 | | /* Do not look for matches beyond the end of the input. */ |
162 | 11.8M | if (len > lookahead) |
163 | 747 | return lookahead; |
164 | 11.8M | if (len >= nice_match) |
165 | 501k | return len; |
166 | | |
167 | 11.2M | best_len = len; |
168 | | |
169 | 11.2M | offset = best_len-1; |
170 | 11.2M | if (best_len >= sizeof(uint32_t)) { |
171 | 8.42M | offset -= 2; |
172 | 8.42M | if (best_len >= sizeof(uint64_t)) |
173 | 2.72M | offset -= 4; |
174 | 8.42M | } |
175 | | |
176 | 11.2M | scan_end = zng_memread_8(scan+offset); |
177 | | |
178 | | #ifdef LONGEST_MATCH_SLOW |
179 | | /* Look for a better string offset */ |
180 | 5.40M | if (UNLIKELY(len > STD_MIN_MATCH && match_start + len < strstart)) { |
181 | 2.45M | const unsigned char *scan_endstr; |
182 | 2.45M | uint32_t hash; |
183 | 2.45M | uint32_t pos, next_pos; |
184 | | |
185 | | /* Go back to offset 0 */ |
186 | | cur_match -= match_offset; |
187 | | match_offset = 0; |
188 | | next_pos = cur_match; |
189 | 32.4M | for (uint32_t i = 0; i <= len - STD_MIN_MATCH; i++) { |
190 | 30.7M | pos = prev[(cur_match + i) & wmask]; |
191 | 30.7M | if (pos < next_pos) { |
192 | | /* Hash chain is more distant, use it */ |
193 | 3.37M | if (pos <= limit_base + i) |
194 | 714k | goto break_matching; |
195 | 2.66M | next_pos = pos; |
196 | 2.66M | match_offset = i; |
197 | 2.66M | } |
198 | 30.7M | } |
199 | | /* Switch cur_match to next_pos chain */ |
200 | 1.74M | cur_match = next_pos; |
201 | | |
202 | | /* Try hash head at len-(STD_MIN_MATCH-1) position to see if we could get |
203 | | * a better cur_match at the end of string. Using (STD_MIN_MATCH-1) lets |
204 | | * us include one more byte into hash - the byte which will be checked |
205 | | * in main loop now, and which allows to grow match by 1. |
206 | | */ |
207 | 1.74M | scan_endstr = scan + len - (STD_MIN_MATCH+1); |
208 | | |
209 | | // use update_hash_roll for deflate_slow |
210 | 1.74M | hash = update_hash_roll(0, scan_endstr[0]); |
211 | 1.74M | hash = update_hash_roll(hash, scan_endstr[1]); |
212 | 1.74M | hash = update_hash_roll(hash, scan_endstr[2]); |
213 | | |
214 | 1.74M | pos = head[hash]; |
215 | 1.74M | if (pos < cur_match) { |
216 | 0 | match_offset = len - (STD_MIN_MATCH+1); |
217 | 0 | if (pos <= limit_base + match_offset) |
218 | 0 | goto break_matching; |
219 | 0 | cur_match = pos; |
220 | 0 | } |
221 | | |
222 | | /* Update offset-dependent variables */ |
223 | 1.74M | limit = limit_base+match_offset; |
224 | 1.74M | mbase_start = window-match_offset; |
225 | 1.74M | mbase_end = (mbase_start+offset); |
226 | 1.74M | continue; |
227 | 1.74M | } |
228 | 2.94M | #endif |
229 | 2.94M | mbase_end = (mbase_start+offset); |
230 | 2.94M | } |
231 | | #ifndef LONGEST_MATCH_SLOW |
232 | 250k | else if (UNLIKELY(early_exit)) { |
233 | | /* The probability of finding a match later if we here is pretty low, so for |
234 | | * performance it's best to outright stop here for the lower compression levels |
235 | | */ |
236 | 908 | break; |
237 | 908 | } |
238 | 6.14M | #endif |
239 | 9.93M | GOTO_NEXT_CHAIN; |
240 | 9.93M | } |
241 | 933 | return best_len; |
242 | | |
243 | | #ifdef LONGEST_MATCH_SLOW |
244 | 1.30M | break_matching: |
245 | | |
246 | 1.30M | if (best_len < lookahead) |
247 | 1.30M | return best_len; |
248 | | |
249 | 176 | return lookahead; |
250 | | #endif |
251 | 1.30M | } Unexecuted instantiation: longest_match_sse2 Unexecuted instantiation: longest_match_slow_sse2 Line | Count | Source | 31 | 15.5M | Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, uint32_t cur_match) { | 32 | 15.5M | const unsigned wmask = W_MASK(s); | 33 | 15.5M | unsigned int strstart = s->strstart; | 34 | 15.5M | const unsigned char *window = s->window; | 35 | 15.5M | const Pos *prev = s->prev; | 36 | | #ifdef LONGEST_MATCH_SLOW | 37 | | const Pos *head = s->head; | 38 | | #endif | 39 | 15.5M | const unsigned char *scan; | 40 | 15.5M | const unsigned char *mbase_start = window; | 41 | 15.5M | const unsigned char *mbase_end; | 42 | 15.5M | uint32_t limit; | 43 | | #ifdef LONGEST_MATCH_SLOW | 44 | | uint32_t limit_base; | 45 | | #else | 46 | 15.5M | int32_t early_exit; | 47 | 15.5M | #endif | 48 | 15.5M | uint32_t chain_length = s->max_chain_length; | 49 | 15.5M | uint32_t nice_match = (uint32_t)s->nice_match; | 50 | 15.5M | uint32_t best_len, offset; | 51 | 15.5M | uint32_t lookahead = s->lookahead; | 52 | 15.5M | uint32_t match_offset = 0; | 53 | 15.5M | uint64_t scan_start; | 54 | 15.5M | uint64_t scan_end; | 55 | | | 56 | | /* The code is optimized for STD_MAX_MATCH-2 multiple of 16. */ | 57 | 15.5M | Assert(STD_MAX_MATCH == 258, "Code too clever"); | 58 | | | 59 | 15.5M | scan = window + strstart; | 60 | 15.5M | best_len = s->prev_length ? s->prev_length : STD_MIN_MATCH-1; | 61 | | | 62 | | /* Calculate read offset which should only extend an extra byte | 63 | | * to find the next best match length. | 64 | | */ | 65 | 15.5M | offset = best_len-1; | 66 | 15.5M | if (best_len >= sizeof(uint32_t)) { | 67 | 868k | offset -= 2; | 68 | 868k | if (best_len >= sizeof(uint64_t)) | 69 | 387k | offset -= 4; | 70 | 868k | } | 71 | | | 72 | 15.5M | scan_start = zng_memread_8(scan); | 73 | 15.5M | scan_end = zng_memread_8(scan+offset); | 74 | 15.5M | mbase_end = (mbase_start+offset); | 75 | | | 76 | | /* Do not waste too much time if we already have a good match */ | 77 | 15.5M | if (best_len >= s->good_match) | 78 | 359k | chain_length >>= 2; | 79 | | | 80 | | /* Stop when cur_match becomes <= limit. To simplify the code, | 81 | | * we prevent matches with the string of window index 0 | 82 | | */ | 83 | 15.5M | limit = strstart > MAX_DIST(s) ? (strstart - MAX_DIST(s)) : 0; | 84 | | #ifdef LONGEST_MATCH_SLOW | 85 | | limit_base = limit; | 86 | | if (best_len >= STD_MIN_MATCH) { | 87 | | /* We're continuing search (lazy evaluation). */ | 88 | | uint32_t hash; | 89 | | uint32_t pos; | 90 | | | 91 | | /* Find a most distant chain starting from scan with index=1 (index=0 corresponds | 92 | | * to cur_match). We cannot use s->prev[strstart+1,...] immediately, because | 93 | | * these strings are not yet inserted into the hash table. | 94 | | */ | 95 | | // use update_hash_roll for deflate_slow | 96 | | hash = update_hash_roll(0, scan[1]); | 97 | | hash = update_hash_roll(hash, scan[2]); | 98 | | | 99 | | for (uint32_t i = 3; i <= best_len; i++) { | 100 | | // use update_hash_roll for deflate_slow | 101 | | hash = update_hash_roll(hash, scan[i]); | 102 | | /* If we're starting with best_len >= 3, we can use offset search. */ | 103 | | pos = head[hash]; | 104 | | if (pos < cur_match) { | 105 | | match_offset = i - 2; | 106 | | cur_match = pos; | 107 | | } | 108 | | } | 109 | | | 110 | | /* Update offset-dependent variables */ | 111 | | limit = limit_base+match_offset; | 112 | | if (cur_match <= limit) | 113 | | goto break_matching; | 114 | | mbase_start -= match_offset; | 115 | | mbase_end -= match_offset; | 116 | | } | 117 | | #else | 118 | 15.5M | early_exit = s->level < EARLY_EXIT_TRIGGER_LEVEL; | 119 | 15.5M | #endif | 120 | 15.5M | Assert((unsigned long)strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead"); | 121 | 20.3M | for (;;) { | 122 | 20.3M | if (cur_match >= strstart) | 123 | 25 | break; | 124 | | | 125 | | /* Skip to next match if the match length cannot increase or if the match length is | 126 | | * less than 2. Note that the checks below for insufficient lookahead only occur | 127 | | * occasionally for performance reasons. | 128 | | * Therefore uninitialized memory will be accessed and conditional jumps will be made | 129 | | * that depend on those values. However the length of the match is limited to the | 130 | | * lookahead, so the output of deflate is not affected by the uninitialized values. | 131 | | */ | 132 | 20.3M | if (best_len < sizeof(uint32_t)) { | 133 | 18.6M | for (;;) { | 134 | 18.6M | if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 && | 135 | 4.07M | zng_memcmp_2(mbase_start+cur_match, &scan_start) == 0) | 136 | 4.07M | break; | 137 | 14.6M | GOTO_NEXT_CHAIN; | 138 | 14.6M | } | 139 | 14.7M | } else if (best_len >= sizeof(uint64_t)) { | 140 | 53.4M | for (;;) { | 141 | 53.4M | if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 && | 142 | 1.70M | zng_memcmp_8(mbase_start+cur_match, &scan_start) == 0) | 143 | 1.50M | break; | 144 | 51.9M | GOTO_NEXT_CHAIN; | 145 | 51.9M | } | 146 | 3.06M | } else { | 147 | 57.7M | for (;;) { | 148 | 57.7M | if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 && | 149 | 1.04M | zng_memcmp_4(mbase_start+cur_match, &scan_start) == 0) | 150 | 1.04M | break; | 151 | 56.6M | GOTO_NEXT_CHAIN; | 152 | 56.6M | } | 153 | 3.06M | } | 154 | 6.62M | uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2; | 155 | 6.62M | Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan"); | 156 | | | 157 | 6.62M | if (len > best_len) { | 158 | 6.37M | uint32_t match_start = cur_match - match_offset; | 159 | 6.37M | s->match_start = match_start; | 160 | | | 161 | | /* Do not look for matches beyond the end of the input. */ | 162 | 6.37M | if (len > lookahead) | 163 | 421 | return lookahead; | 164 | 6.37M | if (len >= nice_match) | 165 | 476k | return len; | 166 | | | 167 | 5.89M | best_len = len; | 168 | | | 169 | 5.89M | offset = best_len-1; | 170 | 5.89M | if (best_len >= sizeof(uint32_t)) { | 171 | 5.89M | offset -= 2; | 172 | 5.89M | if (best_len >= sizeof(uint64_t)) | 173 | 2.20M | offset -= 4; | 174 | 5.89M | } | 175 | | | 176 | 5.89M | scan_end = zng_memread_8(scan+offset); | 177 | | | 178 | | #ifdef LONGEST_MATCH_SLOW | 179 | | /* Look for a better string offset */ | 180 | | if (UNLIKELY(len > STD_MIN_MATCH && match_start + len < strstart)) { | 181 | | const unsigned char *scan_endstr; | 182 | | uint32_t hash; | 183 | | uint32_t pos, next_pos; | 184 | | | 185 | | /* Go back to offset 0 */ | 186 | | cur_match -= match_offset; | 187 | | match_offset = 0; | 188 | | next_pos = cur_match; | 189 | | for (uint32_t i = 0; i <= len - STD_MIN_MATCH; i++) { | 190 | | pos = prev[(cur_match + i) & wmask]; | 191 | | if (pos < next_pos) { | 192 | | /* Hash chain is more distant, use it */ | 193 | | if (pos <= limit_base + i) | 194 | | goto break_matching; | 195 | | next_pos = pos; | 196 | | match_offset = i; | 197 | | } | 198 | | } | 199 | | /* Switch cur_match to next_pos chain */ | 200 | | cur_match = next_pos; | 201 | | | 202 | | /* Try hash head at len-(STD_MIN_MATCH-1) position to see if we could get | 203 | | * a better cur_match at the end of string. Using (STD_MIN_MATCH-1) lets | 204 | | * us include one more byte into hash - the byte which will be checked | 205 | | * in main loop now, and which allows to grow match by 1. | 206 | | */ | 207 | | scan_endstr = scan + len - (STD_MIN_MATCH+1); | 208 | | | 209 | | // use update_hash_roll for deflate_slow | 210 | | hash = update_hash_roll(0, scan_endstr[0]); | 211 | | hash = update_hash_roll(hash, scan_endstr[1]); | 212 | | hash = update_hash_roll(hash, scan_endstr[2]); | 213 | | | 214 | | pos = head[hash]; | 215 | | if (pos < cur_match) { | 216 | | match_offset = len - (STD_MIN_MATCH+1); | 217 | | if (pos <= limit_base + match_offset) | 218 | | goto break_matching; | 219 | | cur_match = pos; | 220 | | } | 221 | | | 222 | | /* Update offset-dependent variables */ | 223 | | limit = limit_base+match_offset; | 224 | | mbase_start = window-match_offset; | 225 | | mbase_end = (mbase_start+offset); | 226 | | continue; | 227 | | } | 228 | | #endif | 229 | 5.89M | mbase_end = (mbase_start+offset); | 230 | 5.89M | } | 231 | 250k | #ifndef LONGEST_MATCH_SLOW | 232 | 250k | else if (UNLIKELY(early_exit)) { | 233 | | /* The probability of finding a match later if we here is pretty low, so for | 234 | | * performance it's best to outright stop here for the lower compression levels | 235 | | */ | 236 | 908 | break; | 237 | 908 | } | 238 | 6.14M | #endif | 239 | 6.14M | GOTO_NEXT_CHAIN; | 240 | 6.14M | } | 241 | 933 | return best_len; | 242 | | | 243 | | #ifdef LONGEST_MATCH_SLOW | 244 | | break_matching: | 245 | | | 246 | | if (best_len < lookahead) | 247 | | return best_len; | 248 | | | 249 | | return lookahead; | 250 | | #endif | 251 | 15.5M | } |
Line | Count | Source | 31 | 14.7M | Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, uint32_t cur_match) { | 32 | 14.7M | const unsigned wmask = W_MASK(s); | 33 | 14.7M | unsigned int strstart = s->strstart; | 34 | 14.7M | const unsigned char *window = s->window; | 35 | 14.7M | const Pos *prev = s->prev; | 36 | 14.7M | #ifdef LONGEST_MATCH_SLOW | 37 | 14.7M | const Pos *head = s->head; | 38 | 14.7M | #endif | 39 | 14.7M | const unsigned char *scan; | 40 | 14.7M | const unsigned char *mbase_start = window; | 41 | 14.7M | const unsigned char *mbase_end; | 42 | 14.7M | uint32_t limit; | 43 | 14.7M | #ifdef LONGEST_MATCH_SLOW | 44 | 14.7M | uint32_t limit_base; | 45 | | #else | 46 | | int32_t early_exit; | 47 | | #endif | 48 | 14.7M | uint32_t chain_length = s->max_chain_length; | 49 | 14.7M | uint32_t nice_match = (uint32_t)s->nice_match; | 50 | 14.7M | uint32_t best_len, offset; | 51 | 14.7M | uint32_t lookahead = s->lookahead; | 52 | 14.7M | uint32_t match_offset = 0; | 53 | 14.7M | uint64_t scan_start; | 54 | 14.7M | uint64_t scan_end; | 55 | | | 56 | | /* The code is optimized for STD_MAX_MATCH-2 multiple of 16. */ | 57 | 14.7M | Assert(STD_MAX_MATCH == 258, "Code too clever"); | 58 | | | 59 | 14.7M | scan = window + strstart; | 60 | 14.7M | best_len = s->prev_length ? s->prev_length : STD_MIN_MATCH-1; | 61 | | | 62 | | /* Calculate read offset which should only extend an extra byte | 63 | | * to find the next best match length. | 64 | | */ | 65 | 14.7M | offset = best_len-1; | 66 | 14.7M | if (best_len >= sizeof(uint32_t)) { | 67 | 1.57M | offset -= 2; | 68 | 1.57M | if (best_len >= sizeof(uint64_t)) | 69 | 174k | offset -= 4; | 70 | 1.57M | } | 71 | | | 72 | 14.7M | scan_start = zng_memread_8(scan); | 73 | 14.7M | scan_end = zng_memread_8(scan+offset); | 74 | 14.7M | mbase_end = (mbase_start+offset); | 75 | | | 76 | | /* Do not waste too much time if we already have a good match */ | 77 | 14.7M | if (best_len >= s->good_match) | 78 | 26.7k | chain_length >>= 2; | 79 | | | 80 | | /* Stop when cur_match becomes <= limit. To simplify the code, | 81 | | * we prevent matches with the string of window index 0 | 82 | | */ | 83 | 14.7M | limit = strstart > MAX_DIST(s) ? (strstart - MAX_DIST(s)) : 0; | 84 | 14.7M | #ifdef LONGEST_MATCH_SLOW | 85 | 14.7M | limit_base = limit; | 86 | 14.7M | if (best_len >= STD_MIN_MATCH) { | 87 | | /* We're continuing search (lazy evaluation). */ | 88 | 2.74M | uint32_t hash; | 89 | 2.74M | uint32_t pos; | 90 | | | 91 | | /* Find a most distant chain starting from scan with index=1 (index=0 corresponds | 92 | | * to cur_match). We cannot use s->prev[strstart+1,...] immediately, because | 93 | | * these strings are not yet inserted into the hash table. | 94 | | */ | 95 | | // use update_hash_roll for deflate_slow | 96 | 2.74M | hash = update_hash_roll(0, scan[1]); | 97 | 2.74M | hash = update_hash_roll(hash, scan[2]); | 98 | | | 99 | 11.3M | for (uint32_t i = 3; i <= best_len; i++) { | 100 | | // use update_hash_roll for deflate_slow | 101 | 8.63M | hash = update_hash_roll(hash, scan[i]); | 102 | | /* If we're starting with best_len >= 3, we can use offset search. */ | 103 | 8.63M | pos = head[hash]; | 104 | 8.63M | if (pos < cur_match) { | 105 | 2.34M | match_offset = i - 2; | 106 | 2.34M | cur_match = pos; | 107 | 2.34M | } | 108 | 8.63M | } | 109 | | | 110 | | /* Update offset-dependent variables */ | 111 | 2.74M | limit = limit_base+match_offset; | 112 | 2.74M | if (cur_match <= limit) | 113 | 595k | goto break_matching; | 114 | 2.14M | mbase_start -= match_offset; | 115 | 2.14M | mbase_end -= match_offset; | 116 | 2.14M | } | 117 | | #else | 118 | | early_exit = s->level < EARLY_EXIT_TRIGGER_LEVEL; | 119 | | #endif | 120 | 14.1M | Assert((unsigned long)strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead"); | 121 | 19.3M | for (;;) { | 122 | 19.3M | if (cur_match >= strstart) | 123 | 0 | break; | 124 | | | 125 | | /* Skip to next match if the match length cannot increase or if the match length is | 126 | | * less than 2. Note that the checks below for insufficient lookahead only occur | 127 | | * occasionally for performance reasons. | 128 | | * Therefore uninitialized memory will be accessed and conditional jumps will be made | 129 | | * that depend on those values. However the length of the match is limited to the | 130 | | * lookahead, so the output of deflate is not affected by the uninitialized values. | 131 | | */ | 132 | 19.3M | if (best_len < sizeof(uint32_t)) { | 133 | 51.2M | for (;;) { | 134 | 51.2M | if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 && | 135 | 9.94M | zng_memcmp_2(mbase_start+cur_match, &scan_start) == 0) | 136 | 4.73M | break; | 137 | 46.5M | GOTO_NEXT_CHAIN; | 138 | 46.5M | } | 139 | 15.5M | } else if (best_len >= sizeof(uint64_t)) { | 140 | 23.9M | for (;;) { | 141 | 23.9M | if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 && | 142 | 1.76M | zng_memcmp_8(mbase_start+cur_match, &scan_start) == 0) | 143 | 1.20M | break; | 144 | 22.7M | GOTO_NEXT_CHAIN; | 145 | 22.7M | } | 146 | 2.39M | } else { | 147 | 25.3M | for (;;) { | 148 | 25.3M | if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 && | 149 | 1.55M | zng_memcmp_4(mbase_start+cur_match, &scan_start) == 0) | 150 | 327k | break; | 151 | 25.0M | GOTO_NEXT_CHAIN; | 152 | 25.0M | } | 153 | 2.39M | } | 154 | 6.26M | uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2; | 155 | 6.26M | Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan"); | 156 | | | 157 | 6.26M | if (len > best_len) { | 158 | 5.42M | uint32_t match_start = cur_match - match_offset; | 159 | 5.42M | s->match_start = match_start; | 160 | | | 161 | | /* Do not look for matches beyond the end of the input. */ | 162 | 5.42M | if (len > lookahead) | 163 | 326 | return lookahead; | 164 | 5.42M | if (len >= nice_match) | 165 | 25.1k | return len; | 166 | | | 167 | 5.40M | best_len = len; | 168 | | | 169 | 5.40M | offset = best_len-1; | 170 | 5.40M | if (best_len >= sizeof(uint32_t)) { | 171 | 2.52M | offset -= 2; | 172 | 2.52M | if (best_len >= sizeof(uint64_t)) | 173 | 524k | offset -= 4; | 174 | 2.52M | } | 175 | | | 176 | 5.40M | scan_end = zng_memread_8(scan+offset); | 177 | | | 178 | 5.40M | #ifdef LONGEST_MATCH_SLOW | 179 | | /* Look for a better string offset */ | 180 | 5.40M | if (UNLIKELY(len > STD_MIN_MATCH && match_start + len < strstart)) { | 181 | 2.45M | const unsigned char *scan_endstr; | 182 | 2.45M | uint32_t hash; | 183 | 2.45M | uint32_t pos, next_pos; | 184 | | | 185 | | /* Go back to offset 0 */ | 186 | 2.45M | cur_match -= match_offset; | 187 | 2.45M | match_offset = 0; | 188 | 2.45M | next_pos = cur_match; | 189 | 32.4M | for (uint32_t i = 0; i <= len - STD_MIN_MATCH; i++) { | 190 | 30.7M | pos = prev[(cur_match + i) & wmask]; | 191 | 30.7M | if (pos < next_pos) { | 192 | | /* Hash chain is more distant, use it */ | 193 | 3.37M | if (pos <= limit_base + i) | 194 | 714k | goto break_matching; | 195 | 2.66M | next_pos = pos; | 196 | 2.66M | match_offset = i; | 197 | 2.66M | } | 198 | 30.7M | } | 199 | | /* Switch cur_match to next_pos chain */ | 200 | 1.74M | cur_match = next_pos; | 201 | | | 202 | | /* Try hash head at len-(STD_MIN_MATCH-1) position to see if we could get | 203 | | * a better cur_match at the end of string. Using (STD_MIN_MATCH-1) lets | 204 | | * us include one more byte into hash - the byte which will be checked | 205 | | * in main loop now, and which allows to grow match by 1. | 206 | | */ | 207 | 1.74M | scan_endstr = scan + len - (STD_MIN_MATCH+1); | 208 | | | 209 | | // use update_hash_roll for deflate_slow | 210 | 1.74M | hash = update_hash_roll(0, scan_endstr[0]); | 211 | 1.74M | hash = update_hash_roll(hash, scan_endstr[1]); | 212 | 1.74M | hash = update_hash_roll(hash, scan_endstr[2]); | 213 | | | 214 | 1.74M | pos = head[hash]; | 215 | 1.74M | if (pos < cur_match) { | 216 | 0 | match_offset = len - (STD_MIN_MATCH+1); | 217 | 0 | if (pos <= limit_base + match_offset) | 218 | 0 | goto break_matching; | 219 | 0 | cur_match = pos; | 220 | 0 | } | 221 | | | 222 | | /* Update offset-dependent variables */ | 223 | 1.74M | limit = limit_base+match_offset; | 224 | 1.74M | mbase_start = window-match_offset; | 225 | 1.74M | mbase_end = (mbase_start+offset); | 226 | 1.74M | continue; | 227 | 1.74M | } | 228 | 2.94M | #endif | 229 | 2.94M | mbase_end = (mbase_start+offset); | 230 | 2.94M | } | 231 | | #ifndef LONGEST_MATCH_SLOW | 232 | | else if (UNLIKELY(early_exit)) { | 233 | | /* The probability of finding a match later if we here is pretty low, so for | 234 | | * performance it's best to outright stop here for the lower compression levels | 235 | | */ | 236 | | break; | 237 | | } | 238 | | #endif | 239 | 3.78M | GOTO_NEXT_CHAIN; | 240 | 3.78M | } | 241 | 0 | return best_len; | 242 | | | 243 | 0 | #ifdef LONGEST_MATCH_SLOW | 244 | 1.30M | break_matching: | 245 | | | 246 | 1.30M | if (best_len < lookahead) | 247 | 1.30M | return best_len; | 248 | | | 249 | 176 | return lookahead; | 250 | 1.30M | #endif | 251 | 1.30M | } |
Unexecuted instantiation: longest_match_avx512 Unexecuted instantiation: longest_match_slow_avx512 |
252 | | |
253 | | #undef LONGEST_MATCH_SLOW |
254 | | #undef LONGEST_MATCH |