/src/duckdb/third_party/brotli/enc/static_dict.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* Copyright 2013 Google Inc. All Rights Reserved. |
2 | | |
3 | | Distributed under MIT license. |
4 | | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
5 | | */ |
6 | | |
7 | | #include "static_dict.h" |
8 | | |
9 | | #include "../common/dictionary.h" |
10 | | #include "../common/brotli_platform.h" |
11 | | #include "../common/transform.h" |
12 | | #include "encoder_dict.h" |
13 | | #include "find_match_length.h" |
14 | | |
15 | | using namespace duckdb_brotli; |
16 | | |
17 | 0 | static BROTLI_INLINE uint32_t Hash(const uint8_t* data) { |
18 | 0 | uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kDictHashMul32; |
19 | | /* The higher bits contain more mixture from the multiplication, |
20 | | so we take our results from there. */ |
21 | 0 | return h >> (32 - kDictNumBits); |
22 | 0 | } |
23 | | |
24 | | static BROTLI_INLINE void AddMatch(size_t distance, size_t len, size_t len_code, |
25 | 0 | uint32_t* matches) { |
26 | 0 | uint32_t match = (uint32_t)((distance << 5) + len_code); |
27 | 0 | matches[len] = BROTLI_MIN(uint32_t, matches[len], match); |
28 | 0 | } |
29 | | |
30 | | static BROTLI_INLINE size_t DictMatchLength(const BrotliDictionary* dictionary, |
31 | | const uint8_t* data, |
32 | | size_t id, |
33 | | size_t len, |
34 | 0 | size_t maxlen) { |
35 | 0 | const size_t offset = dictionary->offsets_by_length[len] + len * id; |
36 | 0 | return FindMatchLengthWithLimit(&dictionary->data[offset], data, |
37 | 0 | BROTLI_MIN(size_t, len, maxlen)); |
38 | 0 | } |
39 | | |
40 | | static BROTLI_INLINE BROTLI_BOOL IsMatch(const BrotliDictionary* dictionary, |
41 | 0 | DictWord w, const uint8_t* data, size_t max_length) { |
42 | 0 | if (w.len > max_length) { |
43 | 0 | return BROTLI_FALSE; |
44 | 0 | } else { |
45 | 0 | const size_t offset = dictionary->offsets_by_length[w.len] + |
46 | 0 | (size_t)w.len * (size_t)w.idx; |
47 | 0 | const uint8_t* dict = &dictionary->data[offset]; |
48 | 0 | if (w.transform == 0) { |
49 | | /* Match against base dictionary word. */ |
50 | 0 | return |
51 | 0 | TO_BROTLI_BOOL(FindMatchLengthWithLimit(dict, data, w.len) == w.len); |
52 | 0 | } else if (w.transform == 10) { |
53 | | /* Match against uppercase first transform. |
54 | | Note that there are only ASCII uppercase words in the lookup table. */ |
55 | 0 | return TO_BROTLI_BOOL(dict[0] >= 'a' && dict[0] <= 'z' && |
56 | 0 | (dict[0] ^ 32) == data[0] && |
57 | 0 | FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1u) == |
58 | 0 | w.len - 1u); |
59 | 0 | } else { |
60 | | /* Match against uppercase all transform. |
61 | | Note that there are only ASCII uppercase words in the lookup table. */ |
62 | 0 | size_t i; |
63 | 0 | for (i = 0; i < w.len; ++i) { |
64 | 0 | if (dict[i] >= 'a' && dict[i] <= 'z') { |
65 | 0 | if ((dict[i] ^ 32) != data[i]) return BROTLI_FALSE; |
66 | 0 | } else { |
67 | 0 | if (dict[i] != data[i]) return BROTLI_FALSE; |
68 | 0 | } |
69 | 0 | } |
70 | 0 | return BROTLI_TRUE; |
71 | 0 | } |
72 | 0 | } |
73 | 0 | } |
74 | | |
75 | | /* Finds matches for a single static dictionary */ |
76 | | static BROTLI_BOOL BrotliFindAllStaticDictionaryMatchesFor( |
77 | | const BrotliEncoderDictionary* dictionary, const uint8_t* data, |
78 | 0 | size_t min_length, size_t max_length, uint32_t* matches) { |
79 | 0 | BROTLI_BOOL has_found_match = BROTLI_FALSE; |
80 | | #if defined(BROTLI_EXPERIMENTAL) |
81 | | if (dictionary->has_words_heavy) { |
82 | | const BrotliTrieNode* node = &dictionary->trie.root; |
83 | | size_t l = 0; |
84 | | while (node && l < max_length) { |
85 | | uint8_t c; |
86 | | if (l >= min_length && node->len_) { |
87 | | AddMatch(node->idx_, l, node->len_, matches); |
88 | | has_found_match = BROTLI_TRUE; |
89 | | } |
90 | | c = data[l++]; |
91 | | node = BrotliTrieSub(&dictionary->trie, node, c); |
92 | | } |
93 | | return has_found_match; |
94 | | } |
95 | | #endif /* BROTLI_EXPERIMENTAL */ |
96 | 0 | { |
97 | 0 | size_t offset = dictionary->buckets[Hash(data)]; |
98 | 0 | BROTLI_BOOL end = !offset; |
99 | 0 | while (!end) { |
100 | 0 | DictWord w = dictionary->dict_words[offset++]; |
101 | 0 | const size_t l = w.len & 0x1F; |
102 | 0 | const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l]; |
103 | 0 | const size_t id = w.idx; |
104 | 0 | end = !!(w.len & 0x80); |
105 | 0 | w.len = (uint8_t)l; |
106 | 0 | if (w.transform == 0) { |
107 | 0 | const size_t matchlen = |
108 | 0 | DictMatchLength(dictionary->words, data, id, l, max_length); |
109 | 0 | const uint8_t* s; |
110 | 0 | size_t minlen; |
111 | 0 | size_t maxlen; |
112 | 0 | size_t len; |
113 | | /* Transform "" + BROTLI_TRANSFORM_IDENTITY + "" */ |
114 | 0 | if (matchlen == l) { |
115 | 0 | AddMatch(id, l, l, matches); |
116 | 0 | has_found_match = BROTLI_TRUE; |
117 | 0 | } |
118 | | /* Transforms "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "" and |
119 | | "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "ing " */ |
120 | 0 | if (matchlen >= l - 1) { |
121 | 0 | AddMatch(id + 12 * n, l - 1, l, matches); |
122 | 0 | if (l + 2 < max_length && |
123 | 0 | data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' && |
124 | 0 | data[l + 2] == ' ') { |
125 | 0 | AddMatch(id + 49 * n, l + 3, l, matches); |
126 | 0 | } |
127 | 0 | has_found_match = BROTLI_TRUE; |
128 | 0 | } |
129 | | /* Transform "" + BROTLI_TRANSFORM_OMIT_LAST_# + "" (# = 2 .. 9) */ |
130 | 0 | minlen = min_length; |
131 | 0 | if (l > 9) minlen = BROTLI_MAX(size_t, minlen, l - 9); |
132 | 0 | maxlen = BROTLI_MIN(size_t, matchlen, l - 2); |
133 | 0 | for (len = minlen; len <= maxlen; ++len) { |
134 | 0 | size_t cut = l - len; |
135 | 0 | size_t transform_id = (cut << 2) + |
136 | 0 | (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F); |
137 | 0 | AddMatch(id + transform_id * n, len, l, matches); |
138 | 0 | has_found_match = BROTLI_TRUE; |
139 | 0 | } |
140 | 0 | if (matchlen < l || l + 6 >= max_length) { |
141 | 0 | continue; |
142 | 0 | } |
143 | 0 | s = &data[l]; |
144 | | /* Transforms "" + BROTLI_TRANSFORM_IDENTITY + <suffix> */ |
145 | 0 | if (s[0] == ' ') { |
146 | 0 | AddMatch(id + n, l + 1, l, matches); |
147 | 0 | if (s[1] == 'a') { |
148 | 0 | if (s[2] == ' ') { |
149 | 0 | AddMatch(id + 28 * n, l + 3, l, matches); |
150 | 0 | } else if (s[2] == 's') { |
151 | 0 | if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches); |
152 | 0 | } else if (s[2] == 't') { |
153 | 0 | if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches); |
154 | 0 | } else if (s[2] == 'n') { |
155 | 0 | if (s[3] == 'd' && s[4] == ' ') { |
156 | 0 | AddMatch(id + 10 * n, l + 5, l, matches); |
157 | 0 | } |
158 | 0 | } |
159 | 0 | } else if (s[1] == 'b') { |
160 | 0 | if (s[2] == 'y' && s[3] == ' ') { |
161 | 0 | AddMatch(id + 38 * n, l + 4, l, matches); |
162 | 0 | } |
163 | 0 | } else if (s[1] == 'i') { |
164 | 0 | if (s[2] == 'n') { |
165 | 0 | if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches); |
166 | 0 | } else if (s[2] == 's') { |
167 | 0 | if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches); |
168 | 0 | } |
169 | 0 | } else if (s[1] == 'f') { |
170 | 0 | if (s[2] == 'o') { |
171 | 0 | if (s[3] == 'r' && s[4] == ' ') { |
172 | 0 | AddMatch(id + 25 * n, l + 5, l, matches); |
173 | 0 | } |
174 | 0 | } else if (s[2] == 'r') { |
175 | 0 | if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') { |
176 | 0 | AddMatch(id + 37 * n, l + 6, l, matches); |
177 | 0 | } |
178 | 0 | } |
179 | 0 | } else if (s[1] == 'o') { |
180 | 0 | if (s[2] == 'f') { |
181 | 0 | if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches); |
182 | 0 | } else if (s[2] == 'n') { |
183 | 0 | if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches); |
184 | 0 | } |
185 | 0 | } else if (s[1] == 'n') { |
186 | 0 | if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') { |
187 | 0 | AddMatch(id + 80 * n, l + 5, l, matches); |
188 | 0 | } |
189 | 0 | } else if (s[1] == 't') { |
190 | 0 | if (s[2] == 'h') { |
191 | 0 | if (s[3] == 'e') { |
192 | 0 | if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches); |
193 | 0 | } else if (s[3] == 'a') { |
194 | 0 | if (s[4] == 't' && s[5] == ' ') { |
195 | 0 | AddMatch(id + 29 * n, l + 6, l, matches); |
196 | 0 | } |
197 | 0 | } |
198 | 0 | } else if (s[2] == 'o') { |
199 | 0 | if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches); |
200 | 0 | } |
201 | 0 | } else if (s[1] == 'w') { |
202 | 0 | if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') { |
203 | 0 | AddMatch(id + 35 * n, l + 6, l, matches); |
204 | 0 | } |
205 | 0 | } |
206 | 0 | } else if (s[0] == '"') { |
207 | 0 | AddMatch(id + 19 * n, l + 1, l, matches); |
208 | 0 | if (s[1] == '>') { |
209 | 0 | AddMatch(id + 21 * n, l + 2, l, matches); |
210 | 0 | } |
211 | 0 | } else if (s[0] == '.') { |
212 | 0 | AddMatch(id + 20 * n, l + 1, l, matches); |
213 | 0 | if (s[1] == ' ') { |
214 | 0 | AddMatch(id + 31 * n, l + 2, l, matches); |
215 | 0 | if (s[2] == 'T' && s[3] == 'h') { |
216 | 0 | if (s[4] == 'e') { |
217 | 0 | if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches); |
218 | 0 | } else if (s[4] == 'i') { |
219 | 0 | if (s[5] == 's' && s[6] == ' ') { |
220 | 0 | AddMatch(id + 75 * n, l + 7, l, matches); |
221 | 0 | } |
222 | 0 | } |
223 | 0 | } |
224 | 0 | } |
225 | 0 | } else if (s[0] == ',') { |
226 | 0 | AddMatch(id + 76 * n, l + 1, l, matches); |
227 | 0 | if (s[1] == ' ') { |
228 | 0 | AddMatch(id + 14 * n, l + 2, l, matches); |
229 | 0 | } |
230 | 0 | } else if (s[0] == '\n') { |
231 | 0 | AddMatch(id + 22 * n, l + 1, l, matches); |
232 | 0 | if (s[1] == '\t') { |
233 | 0 | AddMatch(id + 50 * n, l + 2, l, matches); |
234 | 0 | } |
235 | 0 | } else if (s[0] == ']') { |
236 | 0 | AddMatch(id + 24 * n, l + 1, l, matches); |
237 | 0 | } else if (s[0] == '\'') { |
238 | 0 | AddMatch(id + 36 * n, l + 1, l, matches); |
239 | 0 | } else if (s[0] == ':') { |
240 | 0 | AddMatch(id + 51 * n, l + 1, l, matches); |
241 | 0 | } else if (s[0] == '(') { |
242 | 0 | AddMatch(id + 57 * n, l + 1, l, matches); |
243 | 0 | } else if (s[0] == '=') { |
244 | 0 | if (s[1] == '"') { |
245 | 0 | AddMatch(id + 70 * n, l + 2, l, matches); |
246 | 0 | } else if (s[1] == '\'') { |
247 | 0 | AddMatch(id + 86 * n, l + 2, l, matches); |
248 | 0 | } |
249 | 0 | } else if (s[0] == 'a') { |
250 | 0 | if (s[1] == 'l' && s[2] == ' ') { |
251 | 0 | AddMatch(id + 84 * n, l + 3, l, matches); |
252 | 0 | } |
253 | 0 | } else if (s[0] == 'e') { |
254 | 0 | if (s[1] == 'd') { |
255 | 0 | if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches); |
256 | 0 | } else if (s[1] == 'r') { |
257 | 0 | if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches); |
258 | 0 | } else if (s[1] == 's') { |
259 | 0 | if (s[2] == 't' && s[3] == ' ') { |
260 | 0 | AddMatch(id + 95 * n, l + 4, l, matches); |
261 | 0 | } |
262 | 0 | } |
263 | 0 | } else if (s[0] == 'f') { |
264 | 0 | if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') { |
265 | 0 | AddMatch(id + 90 * n, l + 4, l, matches); |
266 | 0 | } |
267 | 0 | } else if (s[0] == 'i') { |
268 | 0 | if (s[1] == 'v') { |
269 | 0 | if (s[2] == 'e' && s[3] == ' ') { |
270 | 0 | AddMatch(id + 92 * n, l + 4, l, matches); |
271 | 0 | } |
272 | 0 | } else if (s[1] == 'z') { |
273 | 0 | if (s[2] == 'e' && s[3] == ' ') { |
274 | 0 | AddMatch(id + 100 * n, l + 4, l, matches); |
275 | 0 | } |
276 | 0 | } |
277 | 0 | } else if (s[0] == 'l') { |
278 | 0 | if (s[1] == 'e') { |
279 | 0 | if (s[2] == 's' && s[3] == 's' && s[4] == ' ') { |
280 | 0 | AddMatch(id + 93 * n, l + 5, l, matches); |
281 | 0 | } |
282 | 0 | } else if (s[1] == 'y') { |
283 | 0 | if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches); |
284 | 0 | } |
285 | 0 | } else if (s[0] == 'o') { |
286 | 0 | if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') { |
287 | 0 | AddMatch(id + 106 * n, l + 4, l, matches); |
288 | 0 | } |
289 | 0 | } |
290 | 0 | } else { |
291 | | /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and |
292 | | is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL) |
293 | | transform. */ |
294 | 0 | const BROTLI_BOOL is_all_caps = |
295 | 0 | TO_BROTLI_BOOL(w.transform != BROTLI_TRANSFORM_UPPERCASE_FIRST); |
296 | 0 | const uint8_t* s; |
297 | 0 | if (!IsMatch(dictionary->words, w, data, max_length)) { |
298 | 0 | continue; |
299 | 0 | } |
300 | | /* Transform "" + kUppercase{First,All} + "" */ |
301 | 0 | AddMatch(id + (is_all_caps ? 44 : 9) * n, l, l, matches); |
302 | 0 | has_found_match = BROTLI_TRUE; |
303 | 0 | if (l + 1 >= max_length) { |
304 | 0 | continue; |
305 | 0 | } |
306 | | /* Transforms "" + kUppercase{First,All} + <suffix> */ |
307 | 0 | s = &data[l]; |
308 | 0 | if (s[0] == ' ') { |
309 | 0 | AddMatch(id + (is_all_caps ? 68 : 4) * n, l + 1, l, matches); |
310 | 0 | } else if (s[0] == '"') { |
311 | 0 | AddMatch(id + (is_all_caps ? 87 : 66) * n, l + 1, l, matches); |
312 | 0 | if (s[1] == '>') { |
313 | 0 | AddMatch(id + (is_all_caps ? 97 : 69) * n, l + 2, l, matches); |
314 | 0 | } |
315 | 0 | } else if (s[0] == '.') { |
316 | 0 | AddMatch(id + (is_all_caps ? 101 : 79) * n, l + 1, l, matches); |
317 | 0 | if (s[1] == ' ') { |
318 | 0 | AddMatch(id + (is_all_caps ? 114 : 88) * n, l + 2, l, matches); |
319 | 0 | } |
320 | 0 | } else if (s[0] == ',') { |
321 | 0 | AddMatch(id + (is_all_caps ? 112 : 99) * n, l + 1, l, matches); |
322 | 0 | if (s[1] == ' ') { |
323 | 0 | AddMatch(id + (is_all_caps ? 107 : 58) * n, l + 2, l, matches); |
324 | 0 | } |
325 | 0 | } else if (s[0] == '\'') { |
326 | 0 | AddMatch(id + (is_all_caps ? 94 : 74) * n, l + 1, l, matches); |
327 | 0 | } else if (s[0] == '(') { |
328 | 0 | AddMatch(id + (is_all_caps ? 113 : 78) * n, l + 1, l, matches); |
329 | 0 | } else if (s[0] == '=') { |
330 | 0 | if (s[1] == '"') { |
331 | 0 | AddMatch(id + (is_all_caps ? 105 : 104) * n, l + 2, l, matches); |
332 | 0 | } else if (s[1] == '\'') { |
333 | 0 | AddMatch(id + (is_all_caps ? 116 : 108) * n, l + 2, l, matches); |
334 | 0 | } |
335 | 0 | } |
336 | 0 | } |
337 | 0 | } |
338 | 0 | } |
339 | | /* Transforms with prefixes " " and "." */ |
340 | 0 | if (max_length >= 5 && (data[0] == ' ' || data[0] == '.')) { |
341 | 0 | BROTLI_BOOL is_space = TO_BROTLI_BOOL(data[0] == ' '); |
342 | 0 | size_t offset = dictionary->buckets[Hash(&data[1])]; |
343 | 0 | BROTLI_BOOL end = !offset; |
344 | 0 | while (!end) { |
345 | 0 | DictWord w = dictionary->dict_words[offset++]; |
346 | 0 | const size_t l = w.len & 0x1F; |
347 | 0 | const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l]; |
348 | 0 | const size_t id = w.idx; |
349 | 0 | end = !!(w.len & 0x80); |
350 | 0 | w.len = (uint8_t)l; |
351 | 0 | if (w.transform == 0) { |
352 | 0 | const uint8_t* s; |
353 | 0 | if (!IsMatch(dictionary->words, w, &data[1], max_length - 1)) { |
354 | 0 | continue; |
355 | 0 | } |
356 | | /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + "" and |
357 | | "." + BROTLI_TRANSFORM_IDENTITY + "" */ |
358 | 0 | AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches); |
359 | 0 | has_found_match = BROTLI_TRUE; |
360 | 0 | if (l + 2 >= max_length) { |
361 | 0 | continue; |
362 | 0 | } |
363 | | /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + <suffix> and |
364 | | "." + BROTLI_TRANSFORM_IDENTITY + <suffix> |
365 | | */ |
366 | 0 | s = &data[l + 1]; |
367 | 0 | if (s[0] == ' ') { |
368 | 0 | AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches); |
369 | 0 | } else if (s[0] == '(') { |
370 | 0 | AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches); |
371 | 0 | } else if (is_space) { |
372 | 0 | if (s[0] == ',') { |
373 | 0 | AddMatch(id + 103 * n, l + 2, l, matches); |
374 | 0 | if (s[1] == ' ') { |
375 | 0 | AddMatch(id + 33 * n, l + 3, l, matches); |
376 | 0 | } |
377 | 0 | } else if (s[0] == '.') { |
378 | 0 | AddMatch(id + 71 * n, l + 2, l, matches); |
379 | 0 | if (s[1] == ' ') { |
380 | 0 | AddMatch(id + 52 * n, l + 3, l, matches); |
381 | 0 | } |
382 | 0 | } else if (s[0] == '=') { |
383 | 0 | if (s[1] == '"') { |
384 | 0 | AddMatch(id + 81 * n, l + 3, l, matches); |
385 | 0 | } else if (s[1] == '\'') { |
386 | 0 | AddMatch(id + 98 * n, l + 3, l, matches); |
387 | 0 | } |
388 | 0 | } |
389 | 0 | } |
390 | 0 | } else if (is_space) { |
391 | | /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and |
392 | | is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL) |
393 | | transform. */ |
394 | 0 | const BROTLI_BOOL is_all_caps = |
395 | 0 | TO_BROTLI_BOOL(w.transform != BROTLI_TRANSFORM_UPPERCASE_FIRST); |
396 | 0 | const uint8_t* s; |
397 | 0 | if (!IsMatch(dictionary->words, w, &data[1], max_length - 1)) { |
398 | 0 | continue; |
399 | 0 | } |
400 | | /* Transforms " " + kUppercase{First,All} + "" */ |
401 | 0 | AddMatch(id + (is_all_caps ? 85 : 30) * n, l + 1, l, matches); |
402 | 0 | has_found_match = BROTLI_TRUE; |
403 | 0 | if (l + 2 >= max_length) { |
404 | 0 | continue; |
405 | 0 | } |
406 | | /* Transforms " " + kUppercase{First,All} + <suffix> */ |
407 | 0 | s = &data[l + 1]; |
408 | 0 | if (s[0] == ' ') { |
409 | 0 | AddMatch(id + (is_all_caps ? 83 : 15) * n, l + 2, l, matches); |
410 | 0 | } else if (s[0] == ',') { |
411 | 0 | if (!is_all_caps) { |
412 | 0 | AddMatch(id + 109 * n, l + 2, l, matches); |
413 | 0 | } |
414 | 0 | if (s[1] == ' ') { |
415 | 0 | AddMatch(id + (is_all_caps ? 111 : 65) * n, l + 3, l, matches); |
416 | 0 | } |
417 | 0 | } else if (s[0] == '.') { |
418 | 0 | AddMatch(id + (is_all_caps ? 115 : 96) * n, l + 2, l, matches); |
419 | 0 | if (s[1] == ' ') { |
420 | 0 | AddMatch(id + (is_all_caps ? 117 : 91) * n, l + 3, l, matches); |
421 | 0 | } |
422 | 0 | } else if (s[0] == '=') { |
423 | 0 | if (s[1] == '"') { |
424 | 0 | AddMatch(id + (is_all_caps ? 110 : 118) * n, l + 3, l, matches); |
425 | 0 | } else if (s[1] == '\'') { |
426 | 0 | AddMatch(id + (is_all_caps ? 119 : 120) * n, l + 3, l, matches); |
427 | 0 | } |
428 | 0 | } |
429 | 0 | } |
430 | 0 | } |
431 | 0 | } |
432 | 0 | if (max_length >= 6) { |
433 | | /* Transforms with prefixes "e ", "s ", ", " and "\xC2\xA0" */ |
434 | 0 | if ((data[1] == ' ' && |
435 | 0 | (data[0] == 'e' || data[0] == 's' || data[0] == ',')) || |
436 | 0 | (data[0] == 0xC2 && data[1] == 0xA0)) { |
437 | 0 | size_t offset = dictionary->buckets[Hash(&data[2])]; |
438 | 0 | BROTLI_BOOL end = !offset; |
439 | 0 | while (!end) { |
440 | 0 | DictWord w = dictionary->dict_words[offset++]; |
441 | 0 | const size_t l = w.len & 0x1F; |
442 | 0 | const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l]; |
443 | 0 | const size_t id = w.idx; |
444 | 0 | end = !!(w.len & 0x80); |
445 | 0 | w.len = (uint8_t)l; |
446 | 0 | if (w.transform == 0 && |
447 | 0 | IsMatch(dictionary->words, w, &data[2], max_length - 2)) { |
448 | 0 | if (data[0] == 0xC2) { |
449 | 0 | AddMatch(id + 102 * n, l + 2, l, matches); |
450 | 0 | has_found_match = BROTLI_TRUE; |
451 | 0 | } else if (l + 2 < max_length && data[l + 2] == ' ') { |
452 | 0 | size_t t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13); |
453 | 0 | AddMatch(id + t * n, l + 3, l, matches); |
454 | 0 | has_found_match = BROTLI_TRUE; |
455 | 0 | } |
456 | 0 | } |
457 | 0 | } |
458 | 0 | } |
459 | 0 | } |
460 | 0 | if (max_length >= 9) { |
461 | | /* Transforms with prefixes " the " and ".com/" */ |
462 | 0 | if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' && |
463 | 0 | data[3] == 'e' && data[4] == ' ') || |
464 | 0 | (data[0] == '.' && data[1] == 'c' && data[2] == 'o' && |
465 | 0 | data[3] == 'm' && data[4] == '/')) { |
466 | 0 | size_t offset = dictionary->buckets[Hash(&data[5])]; |
467 | 0 | BROTLI_BOOL end = !offset; |
468 | 0 | while (!end) { |
469 | 0 | DictWord w = dictionary->dict_words[offset++]; |
470 | 0 | const size_t l = w.len & 0x1F; |
471 | 0 | const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l]; |
472 | 0 | const size_t id = w.idx; |
473 | 0 | end = !!(w.len & 0x80); |
474 | 0 | w.len = (uint8_t)l; |
475 | 0 | if (w.transform == 0 && |
476 | 0 | IsMatch(dictionary->words, w, &data[5], max_length - 5)) { |
477 | 0 | AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches); |
478 | 0 | has_found_match = BROTLI_TRUE; |
479 | 0 | if (l + 5 < max_length) { |
480 | 0 | const uint8_t* s = &data[l + 5]; |
481 | 0 | if (data[0] == ' ') { |
482 | 0 | if (l + 8 < max_length && |
483 | 0 | s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') { |
484 | 0 | AddMatch(id + 62 * n, l + 9, l, matches); |
485 | 0 | if (l + 12 < max_length && |
486 | 0 | s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') { |
487 | 0 | AddMatch(id + 73 * n, l + 13, l, matches); |
488 | 0 | } |
489 | 0 | } |
490 | 0 | } |
491 | 0 | } |
492 | 0 | } |
493 | 0 | } |
494 | 0 | } |
495 | 0 | } |
496 | 0 | return has_found_match; |
497 | 0 | } |
498 | | |
499 | | /* Finds matches for one or more dictionaries, if multiple are present |
500 | | in the contextual dictionary */ |
501 | | BROTLI_BOOL duckdb_brotli::BrotliFindAllStaticDictionaryMatches( |
502 | | const BrotliEncoderDictionary* dictionary, const uint8_t* data, |
503 | 0 | size_t min_length, size_t max_length, uint32_t* matches) { |
504 | 0 | BROTLI_BOOL has_found_match = |
505 | 0 | BrotliFindAllStaticDictionaryMatchesFor( |
506 | 0 | dictionary, data, min_length, max_length, matches); |
507 | |
|
508 | 0 | if (!!dictionary->parent && dictionary->parent->num_dictionaries > 1) { |
509 | 0 | uint32_t matches2[BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1]; |
510 | 0 | int l; |
511 | 0 | const BrotliEncoderDictionary* dictionary2 = dictionary->parent->dict[0]; |
512 | 0 | if (dictionary2 == dictionary) { |
513 | 0 | dictionary2 = dictionary->parent->dict[1]; |
514 | 0 | } |
515 | |
|
516 | 0 | for (l = 0; l < BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1; l++) { |
517 | 0 | matches2[l] = kInvalidMatch; |
518 | 0 | } |
519 | |
|
520 | 0 | has_found_match |= BrotliFindAllStaticDictionaryMatchesFor( |
521 | 0 | dictionary2, data, min_length, max_length, matches2); |
522 | |
|
523 | 0 | for (l = 0; l < BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1; l++) { |
524 | 0 | if (matches2[l] != kInvalidMatch) { |
525 | 0 | uint32_t dist = (uint32_t)(matches2[l] >> 5); |
526 | 0 | uint32_t len_code = matches2[l] & 31; |
527 | 0 | uint32_t skipdist = (uint32_t)((uint32_t)(1 << dictionary->words-> |
528 | 0 | size_bits_by_length[len_code]) & ~1u) * |
529 | 0 | (uint32_t)dictionary->num_transforms; |
530 | | /* TODO(lode): check for dist overflow */ |
531 | 0 | dist += skipdist; |
532 | 0 | AddMatch(dist, (size_t)l, len_code, matches); |
533 | 0 | } |
534 | 0 | } |
535 | 0 | } |
536 | 0 | return has_found_match; |
537 | 0 | } |
538 | | |