Coverage Report

Created: 2025-06-24 07:53

/src/duckdb/third_party/brotli/common/shared_dictionary.cpp
Line
Count
Source (jump to first uncovered line)
1
/* Copyright 2017 Google Inc. All Rights Reserved.
2
3
   Distributed under MIT license.
4
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5
*/
6
7
/* Shared Dictionary definition and utilities. */
8
9
#include <brotli/shared_dictionary.h>
10
11
#include <memory.h>
12
#include <stdlib.h>  /* malloc, free */
13
#include <stdio.h>
14
15
#include "dictionary.h"
16
#include "brotli_platform.h"
17
#include "shared_dictionary_internal.h"
18
19
using namespace duckdb_brotli;
20
21
#if defined(BROTLI_EXPERIMENTAL)
22
23
#define BROTLI_NUM_ENCODED_LENGTHS (SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH \
24
    - SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH + 1)
25
26
/* Max allowed by spec */
27
#define BROTLI_MAX_SIZE_BITS 15u
28
29
/* Returns BROTLI_TRUE on success, BROTLI_FALSE on failure. */
30
static BROTLI_BOOL ReadBool(const uint8_t* encoded, size_t size, size_t* pos,
31
    BROTLI_BOOL* result) {
32
  uint8_t value;
33
  size_t position = *pos;
34
  if (position >= size) return BROTLI_FALSE;  /* past file end */
35
  value = encoded[position++];
36
  if (value > 1) return BROTLI_FALSE;  /* invalid bool */
37
  *result = TO_BROTLI_BOOL(value);
38
  *pos = position;
39
  return BROTLI_TRUE;  /* success */
40
}
41
42
/* Returns BROTLI_TRUE on success, BROTLI_FALSE on failure. */
43
static BROTLI_BOOL ReadUint8(const uint8_t* encoded, size_t size, size_t* pos,
44
    uint8_t* result) {
45
  size_t position = *pos;
46
  if (position + sizeof(uint8_t) > size) return BROTLI_FALSE;
47
  *result = encoded[position++];
48
  *pos = position;
49
  return BROTLI_TRUE;
50
}
51
52
/* Returns BROTLI_TRUE on success, BROTLI_FALSE on failure. */
53
static BROTLI_BOOL ReadUint16(const uint8_t* encoded, size_t size, size_t* pos,
54
    uint16_t* result) {
55
  size_t position = *pos;
56
  if (position + sizeof(uint16_t) > size) return BROTLI_FALSE;
57
  *result = BROTLI_UNALIGNED_LOAD16LE(&encoded[position]);
58
  position += 2;
59
  *pos = position;
60
  return BROTLI_TRUE;
61
}
62
63
/* Reads a varint into a uint32_t, and returns error if it's too large */
64
/* Returns BROTLI_TRUE on success, BROTLI_FALSE on failure. */
65
static BROTLI_BOOL ReadVarint32(const uint8_t* encoded, size_t size,
66
    size_t* pos, uint32_t* result) {
67
  int num = 0;
68
  uint8_t byte;
69
  *result = 0;
70
  for (;;) {
71
    if (*pos >= size) return BROTLI_FALSE;
72
    byte = encoded[(*pos)++];
73
    if (num == 4 && byte > 15) return BROTLI_FALSE;
74
    *result |= (uint32_t)(byte & 127) << (num * 7);
75
    if (byte < 128) return BROTLI_TRUE;
76
    num++;
77
  }
78
}
79
80
/* Returns the total length of word list. */
81
static size_t BrotliSizeBitsToOffsets(const uint8_t* size_bits_by_length,
82
    uint32_t* offsets_by_length) {
83
  uint32_t pos = 0;
84
  uint32_t i;
85
  for (i = 0; i <= SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH; i++) {
86
    offsets_by_length[i] = pos;
87
    if (size_bits_by_length[i] != 0) {
88
      pos += i << size_bits_by_length[i];
89
    }
90
  }
91
  return pos;
92
}
93
94
static BROTLI_BOOL ParseWordList(size_t size, const uint8_t* encoded,
95
    size_t* pos, BrotliDictionary* out) {
96
  size_t offset;
97
  size_t i;
98
  size_t position = *pos;
99
  if (position + BROTLI_NUM_ENCODED_LENGTHS > size) {
100
    return BROTLI_FALSE;
101
  }
102
103
  memset(out->size_bits_by_length, 0, SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH);
104
  memcpy(out->size_bits_by_length + SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH,
105
      &encoded[position], BROTLI_NUM_ENCODED_LENGTHS);
106
  for (i = SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH;
107
      i <= SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH; i++) {
108
    if (out->size_bits_by_length[i] > BROTLI_MAX_SIZE_BITS) {
109
      return BROTLI_FALSE;
110
    }
111
  }
112
  position += BROTLI_NUM_ENCODED_LENGTHS;
113
  offset = BrotliSizeBitsToOffsets(
114
      out->size_bits_by_length, out->offsets_by_length);
115
116
  out->data = &encoded[position];
117
  out->data_size = offset;
118
  position += offset;
119
  if (position > size) return BROTLI_FALSE;
120
  *pos = position;
121
  return BROTLI_TRUE;
122
}
123
124
/* Computes the cutOffTransforms of a BrotliTransforms which already has the
125
   transforms data correctly filled in. */
126
static void ComputeCutoffTransforms(BrotliTransforms* transforms) {
127
  uint32_t i;
128
  for (i = 0; i < BROTLI_TRANSFORMS_MAX_CUT_OFF + 1; i++) {
129
    transforms->cutOffTransforms[i] = -1;
130
  }
131
  for (i = 0; i < transforms->num_transforms; i++) {
132
    const uint8_t* prefix = BROTLI_TRANSFORM_PREFIX(transforms, i);
133
    uint8_t type = BROTLI_TRANSFORM_TYPE(transforms, i);
134
    const uint8_t* suffix = BROTLI_TRANSFORM_SUFFIX(transforms, i);
135
    if (type <= BROTLI_TRANSFORM_OMIT_LAST_9 && *prefix == 0 && *suffix == 0 &&
136
        transforms->cutOffTransforms[type] == -1) {
137
      transforms->cutOffTransforms[type] = (int16_t)i;
138
    }
139
  }
140
}
141
142
static BROTLI_BOOL ParsePrefixSuffixTable(size_t size, const uint8_t* encoded,
143
    size_t* pos, BrotliTransforms* out, uint16_t* out_table,
144
    size_t* out_table_size) {
145
  size_t position = *pos;
146
  size_t offset = 0;
147
  size_t stringlet_count = 0;  /* NUM_PREFIX_SUFFIX */
148
  size_t data_length = 0;
149
150
  /* PREFIX_SUFFIX_LENGTH */
151
  if (!ReadUint16(encoded, size, &position, &out->prefix_suffix_size)) {
152
    return BROTLI_FALSE;
153
  }
154
  data_length = out->prefix_suffix_size;
155
156
  /* Must at least have space for null terminator. */
157
  if (data_length < 1) return BROTLI_FALSE;
158
  out->prefix_suffix = &encoded[position];
159
  if (position + data_length >= size) return BROTLI_FALSE;
160
  while (BROTLI_TRUE) {
161
    /* STRING_LENGTH */
162
    size_t stringlet_len = encoded[position + offset];
163
    out_table[stringlet_count] = (uint16_t)offset;
164
    stringlet_count++;
165
    offset++;
166
    if (stringlet_len == 0) {
167
      if (offset == data_length) {
168
        break;
169
      } else {
170
        return BROTLI_FALSE;
171
      }
172
    }
173
    if (stringlet_count > 255) return BROTLI_FALSE;
174
    offset += stringlet_len;
175
    if (offset >= data_length) return BROTLI_FALSE;
176
  }
177
178
  position += data_length;
179
  *pos = position;
180
  *out_table_size = (uint16_t)stringlet_count;
181
  return BROTLI_TRUE;
182
}
183
184
static BROTLI_BOOL ParseTransformsList(size_t size, const uint8_t* encoded,
185
    size_t* pos, BrotliTransforms* out, uint16_t* prefix_suffix_table,
186
    size_t* prefix_suffix_count) {
187
  uint32_t i;
188
  BROTLI_BOOL has_params = BROTLI_FALSE;
189
  BROTLI_BOOL prefix_suffix_ok = BROTLI_FALSE;
190
  size_t position = *pos;
191
  size_t stringlet_cnt = 0;
192
  if (position >= size) return BROTLI_FALSE;
193
194
  prefix_suffix_ok = ParsePrefixSuffixTable(
195
      size, encoded, &position, out, prefix_suffix_table, &stringlet_cnt);
196
  if (!prefix_suffix_ok) return BROTLI_FALSE;
197
  out->prefix_suffix_map = prefix_suffix_table;
198
  *prefix_suffix_count = stringlet_cnt;
199
200
  out->num_transforms = encoded[position++];
201
  out->transforms = &encoded[position];
202
  position += (size_t)out->num_transforms * 3;
203
  if (position > size) return BROTLI_FALSE;
204
  /* Check for errors and read extra parameters. */
205
  for (i = 0; i < out->num_transforms; i++) {
206
    uint8_t prefix_id = BROTLI_TRANSFORM_PREFIX_ID(out, i);
207
    uint8_t type = BROTLI_TRANSFORM_TYPE(out, i);
208
    uint8_t suffix_id = BROTLI_TRANSFORM_SUFFIX_ID(out, i);
209
    if (prefix_id >= stringlet_cnt) return BROTLI_FALSE;
210
    if (type >= BROTLI_NUM_TRANSFORM_TYPES) return BROTLI_FALSE;
211
    if (suffix_id >= stringlet_cnt) return BROTLI_FALSE;
212
    if (type == BROTLI_TRANSFORM_SHIFT_FIRST ||
213
        type == BROTLI_TRANSFORM_SHIFT_ALL) {
214
      has_params = BROTLI_TRUE;
215
    }
216
  }
217
  if (has_params) {
218
    out->params = &encoded[position];
219
    position += (size_t)out->num_transforms * 2;
220
    if (position > size) return BROTLI_FALSE;
221
    for (i = 0; i < out->num_transforms; i++) {
222
      uint8_t type = BROTLI_TRANSFORM_TYPE(out, i);
223
      if (type != BROTLI_TRANSFORM_SHIFT_FIRST &&
224
          type != BROTLI_TRANSFORM_SHIFT_ALL) {
225
        if (out->params[i * 2] != 0 || out->params[i * 2 + 1] != 0) {
226
          return BROTLI_FALSE;
227
        }
228
      }
229
    }
230
  } else {
231
    out->params = NULL;
232
  }
233
  ComputeCutoffTransforms(out);
234
  *pos = position;
235
  return BROTLI_TRUE;
236
}
237
238
static BROTLI_BOOL DryParseDictionary(const uint8_t* encoded,
239
    size_t size, uint32_t* num_prefix, BROTLI_BOOL* is_custom_static_dict) {
240
  size_t pos = 0;
241
  uint32_t chunk_size = 0;
242
  uint8_t num_word_lists;
243
  uint8_t num_transform_lists;
244
  *is_custom_static_dict = BROTLI_FALSE;
245
  *num_prefix = 0;
246
247
  /* Skip magic header bytes. */
248
  pos += 2;
249
250
  /* LZ77_DICTIONARY_LENGTH */
251
  if (!ReadVarint32(encoded, size, &pos, &chunk_size)) return BROTLI_FALSE;
252
  if (chunk_size != 0) {
253
    /* This limitation is not specified but the 32-bit Brotli decoder for now */
254
    if (chunk_size > 1073741823) return BROTLI_FALSE;
255
    *num_prefix = 1;
256
    if (pos + chunk_size > size) return BROTLI_FALSE;
257
    pos += chunk_size;
258
  }
259
260
  if (!ReadUint8(encoded, size, &pos, &num_word_lists)) {
261
    return BROTLI_FALSE;
262
  }
263
  if (!ReadUint8(encoded, size, &pos, &num_transform_lists)) {
264
    return BROTLI_FALSE;
265
  }
266
267
  if (num_word_lists > 0 || num_transform_lists > 0) {
268
    *is_custom_static_dict = BROTLI_TRUE;
269
  }
270
271
  return BROTLI_TRUE;
272
}
273
274
static BROTLI_BOOL ParseDictionary(const uint8_t* encoded, size_t size,
275
    BrotliSharedDictionary* dict) {
276
  uint32_t i;
277
  size_t pos = 0;
278
  uint32_t chunk_size = 0;
279
  size_t total_prefix_suffix_count = 0;
280
  size_t trasform_list_start[SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS];
281
  uint16_t temporary_prefix_suffix_table[256];
282
283
  /* Skip magic header bytes. */
284
  pos += 2;
285
286
  /* LZ77_DICTIONARY_LENGTH */
287
  if (!ReadVarint32(encoded, size, &pos, &chunk_size)) return BROTLI_FALSE;
288
  if (chunk_size != 0) {
289
    if (pos + chunk_size > size) return BROTLI_FALSE;
290
    dict->prefix_size[dict->num_prefix] = chunk_size;
291
    dict->prefix[dict->num_prefix] = &encoded[pos];
292
    dict->num_prefix++;
293
    /* LZ77_DICTIONARY_LENGTH bytes. */
294
    pos += chunk_size;
295
  }
296
297
  /* NUM_WORD_LISTS */
298
  if (!ReadUint8(encoded, size, &pos, &dict->num_word_lists)) {
299
    return BROTLI_FALSE;
300
  }
301
  if (dict->num_word_lists > SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS) {
302
    return BROTLI_FALSE;
303
  }
304
305
  if (dict->num_word_lists != 0) {
306
    dict->words_instances = (BrotliDictionary*)dict->alloc_func(
307
        dict->memory_manager_opaque,
308
        dict->num_word_lists * sizeof(*dict->words_instances));
309
    if (!dict->words_instances) return BROTLI_FALSE;  /* OOM */
310
  }
311
  for (i = 0; i < dict->num_word_lists; i++) {
312
    if (!ParseWordList(size, encoded, &pos, &dict->words_instances[i])) {
313
      return BROTLI_FALSE;
314
    }
315
  }
316
317
  /* NUM_TRANSFORM_LISTS */
318
  if (!ReadUint8(encoded, size, &pos, &dict->num_transform_lists)) {
319
    return BROTLI_FALSE;
320
  }
321
  if (dict->num_transform_lists > SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS) {
322
    return BROTLI_FALSE;
323
  }
324
325
  if (dict->num_transform_lists != 0) {
326
    dict->transforms_instances = (BrotliTransforms*)dict->alloc_func(
327
        dict->memory_manager_opaque,
328
        dict->num_transform_lists * sizeof(*dict->transforms_instances));
329
    if (!dict->transforms_instances) return BROTLI_FALSE;  /* OOM */
330
  }
331
  for (i = 0; i < dict->num_transform_lists; i++) {
332
    BROTLI_BOOL ok = BROTLI_FALSE;
333
    size_t prefix_suffix_count = 0;
334
    trasform_list_start[i] = pos;
335
    dict->transforms_instances[i].prefix_suffix_map =
336
        temporary_prefix_suffix_table;
337
    ok = ParseTransformsList(
338
        size, encoded, &pos, &dict->transforms_instances[i],
339
        temporary_prefix_suffix_table, &prefix_suffix_count);
340
    if (!ok) return BROTLI_FALSE;
341
    total_prefix_suffix_count += prefix_suffix_count;
342
  }
343
  if (total_prefix_suffix_count != 0) {
344
    dict->prefix_suffix_maps = (uint16_t*)dict->alloc_func(
345
        dict->memory_manager_opaque,
346
        total_prefix_suffix_count * sizeof(*dict->prefix_suffix_maps));
347
    if (!dict->prefix_suffix_maps) return BROTLI_FALSE;  /* OOM */
348
  }
349
  total_prefix_suffix_count = 0;
350
  for (i = 0; i < dict->num_transform_lists; i++) {
351
    size_t prefix_suffix_count = 0;
352
    size_t position = trasform_list_start[i];
353
    uint16_t* prefix_suffix_map =
354
      &dict->prefix_suffix_maps[total_prefix_suffix_count];
355
    BROTLI_BOOL ok = ParsePrefixSuffixTable(
356
        size, encoded, &position, &dict->transforms_instances[i],
357
        prefix_suffix_map, &prefix_suffix_count);
358
    if (!ok) return BROTLI_FALSE;
359
    dict->transforms_instances[i].prefix_suffix_map = prefix_suffix_map;
360
    total_prefix_suffix_count += prefix_suffix_count;
361
  }
362
363
  if (dict->num_word_lists != 0 || dict->num_transform_lists != 0) {
364
    if (!ReadUint8(encoded, size, &pos, &dict->num_dictionaries)) {
365
      return BROTLI_FALSE;
366
    }
367
    if (dict->num_dictionaries == 0 ||
368
        dict->num_dictionaries > SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS) {
369
      return BROTLI_FALSE;
370
    }
371
    for (i = 0; i < dict->num_dictionaries; i++) {
372
      uint8_t words_index;
373
      uint8_t transforms_index;
374
      if (!ReadUint8(encoded, size, &pos, &words_index)) {
375
        return BROTLI_FALSE;
376
      }
377
      if (words_index > dict->num_word_lists) return BROTLI_FALSE;
378
      if (!ReadUint8(encoded, size, &pos, &transforms_index)) {
379
        return BROTLI_FALSE;
380
      }
381
      if (transforms_index > dict->num_transform_lists) return BROTLI_FALSE;
382
      dict->words[i] = words_index == dict->num_word_lists ?
383
          BrotliGetDictionary() : &dict->words_instances[words_index];
384
      dict->transforms[i] = transforms_index == dict->num_transform_lists ?
385
          BrotliGetTransforms(): &dict->transforms_instances[transforms_index];
386
    }
387
    /* CONTEXT_ENABLED */
388
    if (!ReadBool(encoded, size, &pos, &dict->context_based)) {
389
      return BROTLI_FALSE;
390
    }
391
392
    /* CONTEXT_MAP */
393
    if (dict->context_based) {
394
      for (i = 0; i < SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS; i++) {
395
        if (!ReadUint8(encoded, size, &pos, &dict->context_map[i])) {
396
          return BROTLI_FALSE;
397
        }
398
        if (dict->context_map[i] >= dict->num_dictionaries) {
399
          return BROTLI_FALSE;
400
        }
401
      }
402
    }
403
  } else {
404
    dict->context_based = BROTLI_FALSE;
405
    dict->num_dictionaries = 1;
406
    dict->words[0] = BrotliGetDictionary();
407
    dict->transforms[0] = BrotliGetTransforms();
408
  }
409
410
  return BROTLI_TRUE;
411
}
412
413
/* Decodes shared dictionary and verifies correctness.
414
   Returns BROTLI_TRUE if dictionary is valid, BROTLI_FALSE otherwise.
415
   The BrotliSharedDictionary must already have been initialized. If the
416
   BrotliSharedDictionary already contains data, compound dictionaries
417
   will be appended, but an error will be returned if it already has
418
   custom words or transforms.
419
   TODO(lode): link to RFC for shared brotli once published. */
420
static BROTLI_BOOL DecodeSharedDictionary(
421
    const uint8_t* encoded, size_t size, BrotliSharedDictionary* dict) {
422
  uint32_t num_prefix = 0;
423
  BROTLI_BOOL is_custom_static_dict = BROTLI_FALSE;
424
  BROTLI_BOOL has_custom_static_dict =
425
      dict->num_word_lists > 0 || dict->num_transform_lists > 0;
426
427
  /* Check magic header bytes. */
428
  if (size < 2) return BROTLI_FALSE;
429
  if (encoded[0] != 0x91 || encoded[1] != 0) return BROTLI_FALSE;
430
431
  if (!DryParseDictionary(encoded, size, &num_prefix, &is_custom_static_dict)) {
432
    return BROTLI_FALSE;
433
  }
434
435
  if (num_prefix + dict->num_prefix > SHARED_BROTLI_MAX_COMPOUND_DICTS) {
436
    return BROTLI_FALSE;
437
  }
438
439
  /* Cannot combine different static dictionaries, only prefix dictionaries */
440
  if (has_custom_static_dict && is_custom_static_dict) return BROTLI_FALSE;
441
442
  return ParseDictionary(encoded, size, dict);
443
}
444
445
#endif  /* BROTLI_EXPERIMENTAL */
446
447
void duckdb_brotli::BrotliSharedDictionaryDestroyInstance(
448
0
    BrotliSharedDictionary* dict) {
449
0
  if (!dict) {
450
0
    return;
451
0
  } else {
452
0
    brotli_free_func free_func = dict->free_func;
453
0
    void* opaque = dict->memory_manager_opaque;
454
    /* Cleanup. */
455
0
    free_func(opaque, dict->words_instances);
456
0
    free_func(opaque, dict->transforms_instances);
457
0
    free_func(opaque, dict->prefix_suffix_maps);
458
    /* Self-destruction. */
459
0
    free_func(opaque, dict);
460
0
  }
461
0
}
462
463
BROTLI_BOOL duckdb_brotli::BrotliSharedDictionaryAttach(
464
    BrotliSharedDictionary* dict, BrotliSharedDictionaryType type,
465
0
    size_t data_size, const uint8_t data[BROTLI_ARRAY_PARAM(data_size)]) {
466
0
  if (!dict) {
467
0
    return BROTLI_FALSE;
468
0
  }
469
#if defined(BROTLI_EXPERIMENTAL)
470
  if (type == BROTLI_SHARED_DICTIONARY_SERIALIZED) {
471
    return DecodeSharedDictionary(data, data_size, dict);
472
  }
473
#endif  /* BROTLI_EXPERIMENTAL */
474
0
  if (type == BROTLI_SHARED_DICTIONARY_RAW) {
475
0
    if (dict->num_prefix >= SHARED_BROTLI_MAX_COMPOUND_DICTS) {
476
0
      return BROTLI_FALSE;
477
0
    }
478
0
    dict->prefix_size[dict->num_prefix] = data_size;
479
0
    dict->prefix[dict->num_prefix] = data;
480
0
    dict->num_prefix++;
481
0
    return BROTLI_TRUE;
482
0
  }
483
0
  return BROTLI_FALSE;
484
0
}
485
486
BrotliSharedDictionary* duckdb_brotli::BrotliSharedDictionaryCreateInstance(
487
0
    brotli_alloc_func alloc_func, brotli_free_func free_func, void* opaque) {
488
0
  BrotliSharedDictionary* dict = 0;
489
0
  if (!alloc_func && !free_func) {
490
0
    dict = (BrotliSharedDictionary*)malloc(sizeof(BrotliSharedDictionary));
491
0
  } else if (alloc_func && free_func) {
492
0
    dict = (BrotliSharedDictionary*)alloc_func(
493
0
        opaque, sizeof(BrotliSharedDictionary));
494
0
  }
495
0
  if (dict == 0) {
496
0
    return 0;
497
0
  }
498
499
  /* TODO(eustas): explicitly initialize all the fields? */
500
0
  memset(dict, 0, sizeof(BrotliSharedDictionary));
501
502
0
  dict->context_based = BROTLI_FALSE;
503
0
  dict->num_dictionaries = 1;
504
0
  dict->num_word_lists = 0;
505
0
  dict->num_transform_lists = 0;
506
507
0
  dict->words[0] = BrotliGetDictionary();
508
0
  dict->transforms[0] = BrotliGetTransforms();
509
510
0
  dict->alloc_func = alloc_func ? alloc_func :  duckdb_brotli::BrotliDefaultAllocFunc;
511
0
  dict->free_func = free_func ? free_func :  duckdb_brotli::BrotliDefaultFreeFunc;
512
0
  dict->memory_manager_opaque = opaque;
513
514
0
  return dict;
515
0
}
516
517