Coverage Report

Created: 2025-08-26 06:14

/src/zstd/lib/dictBuilder/cover.c
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

/* *****************************************************************************
 * Constructs a dictionary using a heuristic based on the following paper:
 *
 * Liao, Petri, Moffat, Wirth
 * Effective Construction of Relative Lempel-Ziv Dictionaries
 * Published in WWW 2016.
 *
 * Adapted from code originally written by @ot (Giuseppe Ottaviano).
 ******************************************************************************/

/*-*************************************
*  Dependencies
***************************************/
/* qsort_r is an extension.
 *
 * Android NDK does not ship qsort_r().
 */
#if (defined(__linux__) && !defined(__ANDROID__)) || defined(__CYGWIN__) || defined(__MSYS__)
# ifndef _GNU_SOURCE
#   define _GNU_SOURCE
# endif
#endif

#define __STDC_WANT_LIB_EXT1__ 1 /* request C11 Annex K, which includes qsort_s() */

#include <stdio.h>  /* fprintf */
#include <stdlib.h> /* malloc, free, qsort_r */

#include <string.h> /* memset */
#include <time.h>   /* clock */

#ifndef ZDICT_STATIC_LINKING_ONLY
#  define ZDICT_STATIC_LINKING_ONLY
#endif

#include "../common/debug.h" /* DEBUG_STATIC_ASSERT */
#include "../common/mem.h" /* read */
#include "../common/pool.h" /* POOL_ctx */
#include "../common/threading.h" /* ZSTD_pthread_mutex_t */
#include "../common/zstd_internal.h" /* includes zstd.h */
#include "../common/bits.h" /* ZSTD_highbit32 */
#include "../zdict.h"
#include "cover.h"

/*-*************************************
*  Constants
***************************************/
/**
 * 32-bit indexes are used to reference samples, so the total samples size is
 * limited to 4 GB on 64-bit builds.
 * For 32-bit builds we choose 1 GB.
 * Most 32-bit platforms have 2 GB of user-mode addressable space, and we
 * allocate a large contiguous buffer, so 1 GB is already a high limit.
 */
#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
#define COVER_DEFAULT_SPLITPOINT 1.0
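
/* A minimal sketch of how the limit above evaluates, assuming the KB/MB/GB
 * helper macros from zstd_internal.h (where `1 GB` expands to `1 *(1U<<30)`):
 *   64-bit build: sizeof(size_t) == 8, so the limit is (unsigned)-1, i.e. ~4 GB
 *   32-bit build: the limit is (unsigned)(1U << 30), i.e. 1 GB
 */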

/**
 * Select the qsort() variant used by cover
 */
#define ZDICT_QSORT_MIN 0
#define ZDICT_QSORT_C90 ZDICT_QSORT_MIN
#define ZDICT_QSORT_GNU 1
#define ZDICT_QSORT_APPLE 2
#define ZDICT_QSORT_MSVC 3
#define ZDICT_QSORT_C11 ZDICT_QSORT_MAX
#define ZDICT_QSORT_MAX 4

#ifndef ZDICT_QSORT
# if defined(__APPLE__)
#   define ZDICT_QSORT ZDICT_QSORT_APPLE /* uses qsort_r() with a different order for parameters */
# elif (defined(__linux__) && !defined(__ANDROID__)) || defined(__CYGWIN__) || defined(__MSYS__)
#   define ZDICT_QSORT ZDICT_QSORT_GNU /* uses qsort_r() */
# elif defined(_WIN32) && defined(_MSC_VER)
#   define ZDICT_QSORT ZDICT_QSORT_MSVC /* uses qsort_s() with a different order for parameters */
# elif defined(__STDC_LIB_EXT1__) && (__STDC_LIB_EXT1__ > 0) /* C11 Annex K */
#   define ZDICT_QSORT ZDICT_QSORT_C11 /* uses qsort_s() */
# else
#   define ZDICT_QSORT ZDICT_QSORT_C90 /* uses standard qsort() which is not re-entrant (requires a global variable) */
# endif
#endif
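
/* For reference, a sketch of the comparator signature each variant expects
 * (mirroring the COVER_strict_cmp* definitions below):
 *   ZDICT_QSORT_GNU / _C11  : int cmp(const void* a, const void* b, void* ctx);
 *   ZDICT_QSORT_APPLE/_MSVC : int cmp(void* ctx, const void* a, const void* b);
 *   ZDICT_QSORT_C90         : int cmp(const void* a, const void* b);  (ctx via a global)
 */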


/*-*************************************
*  Console display
*
* Captures the `displayLevel` variable in the local scope.
***************************************/
#undef  DISPLAY
#define DISPLAY(...)                                                           \
  {                                                                            \
    fprintf(stderr, __VA_ARGS__);                                              \
    fflush(stderr);                                                            \
  }
#undef  DISPLAYLEVEL
#define DISPLAYLEVEL(l, ...)                                                   \
  if (displayLevel >= l) {                                                     \
    DISPLAY(__VA_ARGS__);                                                      \
  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */

#undef  DISPLAYUPDATE
#define DISPLAYUPDATE(lastUpdateTime, l, ...)                                  \
  if (displayLevel >= l) {                                                     \
    const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;                     \
    if ((clock() - lastUpdateTime > refreshRate) || (displayLevel >= 4)) {     \
      lastUpdateTime = clock();                                                \
      DISPLAY(__VA_ARGS__);                                                    \
    }                                                                          \
  }

/*-*************************************
* Hash table
***************************************
* A small specialized hash map for storing activeDmers.
* The map does not resize, so if it becomes full it will loop forever.
* Thus, the map must be large enough to store every value.
* The map implements linear probing and keeps its load factor below 0.5.
*/

#define MAP_EMPTY_VALUE ((U32)-1)
typedef struct COVER_map_pair_t_s {
  U32 key;
  U32 value;
} COVER_map_pair_t;

typedef struct COVER_map_s {
  COVER_map_pair_t *data;
  U32 sizeLog;
  U32 size;
  U32 sizeMask;
} COVER_map_t;

/**
 * Clear the map.
 */
static void COVER_map_clear(COVER_map_t *map) {
  memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
}

/**
 * Initializes a map of the given size.
 * Returns 1 on success and 0 on failure.
 * The map must be destroyed with COVER_map_destroy().
 * The map is only guaranteed to be large enough to hold size elements.
 */
static int COVER_map_init(COVER_map_t *map, U32 size) {
  map->sizeLog = ZSTD_highbit32(size) + 2;
  map->size = (U32)1 << map->sizeLog;
  map->sizeMask = map->size - 1;
  map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
  if (!map->data) {
    map->sizeLog = 0;
    map->size = 0;
    return 0;
  }
  COVER_map_clear(map);
  return 1;
}

/**
 * Internal hash function
 */
static const U32 COVER_prime4bytes = 2654435761U;
static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
  return (key * COVER_prime4bytes) >> (32 - map->sizeLog);
}

/**
 * Helper function that returns the index that a key should be placed into.
 */
static U32 COVER_map_index(COVER_map_t *map, U32 key) {
  const U32 hash = COVER_map_hash(map, key);
  U32 i;
  for (i = hash;; i = (i + 1) & map->sizeMask) {
    COVER_map_pair_t *pos = &map->data[i];
    if (pos->value == MAP_EMPTY_VALUE) {
      return i;
    }
    if (pos->key == key) {
      return i;
    }
  }
}

/**
 * Returns the pointer to the value for key.
 * If key is not in the map, it is inserted and the value is set to 0.
 * The map must not be full.
 */
static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
  COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
  if (pos->value == MAP_EMPTY_VALUE) {
    pos->key = key;
    pos->value = 0;
  }
  return &pos->value;
}

/**
 * Deletes key from the map if present.
 */
static void COVER_map_remove(COVER_map_t *map, U32 key) {
  U32 i = COVER_map_index(map, key);
  COVER_map_pair_t* del = &map->data[i];
  U32 shift = 1;
  if (del->value == MAP_EMPTY_VALUE) {
    return;
  }
  for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
    COVER_map_pair_t *const pos = &map->data[i];
    /* If the position is empty we are done */
    if (pos->value == MAP_EMPTY_VALUE) {
      del->value = MAP_EMPTY_VALUE;
      return;
    }
    /* If pos can be moved to del do so */
    if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
      del->key = pos->key;
      del->value = pos->value;
      del = pos;
      shift = 1;
    } else {
      ++shift;
    }
  }
}

/**
 * Destroys a map that was initialized with COVER_map_init().
 */
static void COVER_map_destroy(COVER_map_t *map) {
  if (map->data) {
    free(map->data);
  }
  map->data = NULL;
  map->size = 0;
}
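
/* A minimal usage sketch of the map API above (illustrative only; `dmerId` is
 * a hypothetical key). Note the deletion loop implements backward-shift
 * deletion, so probe chains stay intact after a remove:
 *
 *   COVER_map_t map;
 *   if (COVER_map_init(&map, 1024)) {            // room for 1024 entries
 *     U32 *count = COVER_map_at(&map, dmerId);   // inserts {dmerId, 0} if absent
 *     *count += 1;                               // update the value in place
 *     COVER_map_remove(&map, dmerId);            // delete, shifting followers back
 *     COVER_map_destroy(&map);
 *   }
 */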

/*-*************************************
* Context
***************************************/

typedef struct {
  const BYTE *samples;
  size_t *offsets;
  const size_t *samplesSizes;
  size_t nbSamples;
  size_t nbTrainSamples;
  size_t nbTestSamples;
  U32 *suffix;
  size_t suffixSize;
  U32 *freqs;
  U32 *dmerAt;
  unsigned d;
  int displayLevel;
} COVER_ctx_t;

#if ZDICT_QSORT == ZDICT_QSORT_C90
/* Use a global context for non-reentrant sort functions */
static COVER_ctx_t *g_coverCtx = NULL;
#endif

/*-*************************************
*  Helper functions
***************************************/

/**
 * Returns the sum of the sample sizes.
 */
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
  size_t sum = 0;
  unsigned i;
  for (i = 0; i < nbSamples; ++i) {
    sum += samplesSizes[i];
  }
  return sum;
}

/**
 * Returns -1 if the dmer at lp is less than the dmer at rp.
 * Returns 0 if the dmers at lp and rp are equal.
 * Returns 1 if the dmer at lp is greater than the dmer at rp.
 */
static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
  U32 const lhs = *(U32 const *)lp;
  U32 const rhs = *(U32 const *)rp;
  return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
}
/**
 * Faster version for d <= 8.
 */
static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
  U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
  U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
  U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
  if (lhs < rhs) {
    return -1;
  }
  return (lhs > rhs);
}

/**
 * Same as COVER_cmp() except ties are broken by pointer value
 */
#if (ZDICT_QSORT == ZDICT_QSORT_MSVC) || (ZDICT_QSORT == ZDICT_QSORT_APPLE)
static int WIN_CDECL COVER_strict_cmp(void* g_coverCtx, const void* lp, const void* rp) {
#elif (ZDICT_QSORT == ZDICT_QSORT_GNU) || (ZDICT_QSORT == ZDICT_QSORT_C11)
static int COVER_strict_cmp(const void *lp, const void *rp, void *g_coverCtx) {
#else /* C90 fallback. */
static int COVER_strict_cmp(const void *lp, const void *rp) {
#endif
  int result = COVER_cmp((COVER_ctx_t*)g_coverCtx, lp, rp);
  if (result == 0) {
    result = lp < rp ? -1 : 1;
  }
  return result;
}
/**
 * Faster version for d <= 8.
 */
#if (ZDICT_QSORT == ZDICT_QSORT_MSVC) || (ZDICT_QSORT == ZDICT_QSORT_APPLE)
static int WIN_CDECL COVER_strict_cmp8(void* g_coverCtx, const void* lp, const void* rp) {
#elif (ZDICT_QSORT == ZDICT_QSORT_GNU) || (ZDICT_QSORT == ZDICT_QSORT_C11)
static int COVER_strict_cmp8(const void *lp, const void *rp, void *g_coverCtx) {
#else /* C90 fallback. */
static int COVER_strict_cmp8(const void *lp, const void *rp) {
#endif
  int result = COVER_cmp8((COVER_ctx_t*)g_coverCtx, lp, rp);
  if (result == 0) {
    result = lp < rp ? -1 : 1;
  }
  return result;
}

/**
 * Abstracts away the divergence in qsort_r() parameters.
 * Hopefully when C11 becomes the norm, we will be able
 * to clean this up.
 */
static void stableSort(COVER_ctx_t *ctx)
{
    DEBUG_STATIC_ASSERT(ZDICT_QSORT_MIN <= ZDICT_QSORT && ZDICT_QSORT <= ZDICT_QSORT_MAX);
#if (ZDICT_QSORT == ZDICT_QSORT_APPLE)
    qsort_r(ctx->suffix, ctx->suffixSize, sizeof(U32),
            ctx,
            (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
#elif (ZDICT_QSORT == ZDICT_QSORT_GNU)
    qsort_r(ctx->suffix, ctx->suffixSize, sizeof(U32),
            (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp),
            ctx);
#elif (ZDICT_QSORT == ZDICT_QSORT_MSVC)
    qsort_s(ctx->suffix, ctx->suffixSize, sizeof(U32),
            (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp),
            ctx);
#elif (ZDICT_QSORT == ZDICT_QSORT_C11)
    qsort_s(ctx->suffix, ctx->suffixSize, sizeof(U32),
            (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp),
            ctx);
#else /* C90 fallback. */
    g_coverCtx = ctx;
    /* TODO(cavalcanti): implement a reentrant qsort() when _r is not available. */
    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
#endif
}

/**
 * Returns the first pointer in [first, last) whose element does not compare
 * less than value.  If no such element exists it returns last.
 */
static const size_t *COVER_lower_bound(const size_t* first, const size_t* last,
                                       size_t value) {
  size_t count = (size_t)(last - first);
  assert(last >= first);
  while (count != 0) {
    size_t step = count / 2;
    const size_t *ptr = first;
    ptr += step;
    if (*ptr < value) {
      first = ++ptr;
      count -= step + 1;
    } else {
      count = step;
    }
  }
  return first;
}
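
/* Example: over the array {10, 20, 30} with value 15, COVER_lower_bound()
 * returns the pointer to 20, the first element not less than 15. COVER_group()
 * below uses this over the sample-offset array to find the end of the sample
 * containing a given position.
 */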

/**
 * Generic groupBy function.
 * Groups an array sorted by cmp into groups with equivalent values.
 * Calls grp for each group.
 */
static void
COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
              int (*cmp)(COVER_ctx_t *, const void *, const void *),
              void (*grp)(COVER_ctx_t *, const void *, const void *)) {
  const BYTE *ptr = (const BYTE *)data;
  size_t num = 0;
  while (num < count) {
    const BYTE *grpEnd = ptr + size;
    ++num;
    while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
      grpEnd += size;
      ++num;
    }
    grp(ctx, ptr, grpEnd);
    ptr = grpEnd;
  }
}
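
/* Illustrative sketch: given sorted data {1, 1, 2, 3, 3, 3}, COVER_groupBy()
 * invokes grp() three times, once per run of cmp-equal elements: [1, 1], [2],
 * and [3, 3, 3]. Here it walks runs of suffixes sharing the same first d bytes.
 */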

/*-*************************************
*  Cover functions
***************************************/

/**
 * Called on each group of positions with the same dmer.
 * Counts the frequency of each dmer and saves it in the suffix array.
 * Fills `ctx->dmerAt`.
 */
static void COVER_group(COVER_ctx_t *ctx, const void *group,
                        const void *groupEnd) {
  /* The group consists of all the positions with the same first d bytes. */
  const U32 *grpPtr = (const U32 *)group;
  const U32 *grpEnd = (const U32 *)groupEnd;
  /* The dmerId is how we will reference this dmer.
   * This allows us to map the whole dmer space to a much smaller space, the
   * size of the suffix array.
   */
  const U32 dmerId = (U32)(grpPtr - ctx->suffix);
  /* Count the number of samples this dmer shows up in */
  U32 freq = 0;
  /* Details */
  const size_t *curOffsetPtr = ctx->offsets;
  const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
  /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
   * different sample than the last.
   */
  size_t curSampleEnd = ctx->offsets[0];
  for (; grpPtr != grpEnd; ++grpPtr) {
    /* Save the dmerId for this position so we can get back to it. */
    ctx->dmerAt[*grpPtr] = dmerId;
    /* Dictionaries only help for the first reference to the dmer.
     * After that zstd can reference the match from the previous reference.
     * So only count each dmer once for each sample it is in.
     */
    if (*grpPtr < curSampleEnd) {
      continue;
    }
    freq += 1;
    /* Binary search to find the end of the sample *grpPtr is in.
     * In the common case that grpPtr + 1 == grpEnd we can skip the binary
     * search because the loop is over.
     */
    if (grpPtr + 1 != grpEnd) {
      const size_t *sampleEndPtr =
          COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
      curSampleEnd = *sampleEndPtr;
      curOffsetPtr = sampleEndPtr + 1;
    }
  }
  /* At this point we are never going to look at this segment of the suffix
   * array again.  We take advantage of this fact to save memory.
   * We store the frequency of the dmer in the first position of the group,
   * which is dmerId.
   */
  ctx->suffix[dmerId] = freq;
}


/**
 * Selects the best segment in an epoch.
 * Segments are scored according to the function:
 *
 * Let F(d) be the frequency of dmer d.
 * Let S_i be the dmer at position i of segment S which has length k.
 *
 *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
 *
 * Once the dmer d is in the dictionary we set F(d) = 0.
 */
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                           COVER_map_t *activeDmers, U32 begin,
                                           U32 end,
                                           ZDICT_cover_params_t parameters) {
  /* Constants */
  const U32 k = parameters.k;
  const U32 d = parameters.d;
  const U32 dmersInK = k - d + 1;
  /* Try each segment (activeSegment) and save the best (bestSegment) */
  COVER_segment_t bestSegment = {0, 0, 0};
  COVER_segment_t activeSegment;
  /* Reset the activeDmers in the segment */
  COVER_map_clear(activeDmers);
  /* The activeSegment starts at the beginning of the epoch. */
  activeSegment.begin = begin;
  activeSegment.end = begin;
  activeSegment.score = 0;
  /* Slide the activeSegment through the whole epoch.
   * Save the best segment in bestSegment.
   */
  while (activeSegment.end < end) {
    /* The dmerId for the dmer at the next position */
    U32 newDmer = ctx->dmerAt[activeSegment.end];
    /* The entry in activeDmers for this dmerId */
    U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
    /* If the dmer isn't already present in the segment add its score. */
    if (*newDmerOcc == 0) {
      /* The paper suggests using the L-0.5 norm, but experiments show that it
       * doesn't help.
       */
      activeSegment.score += freqs[newDmer];
    }
    /* Add the dmer to the segment */
    activeSegment.end += 1;
    *newDmerOcc += 1;

    /* If the window is now too large, drop the first position */
    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
      U32 delDmer = ctx->dmerAt[activeSegment.begin];
      U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
      activeSegment.begin += 1;
      *delDmerOcc -= 1;
      /* If this is the last occurrence of the dmer, subtract its score */
      if (*delDmerOcc == 0) {
        COVER_map_remove(activeDmers, delDmer);
        activeSegment.score -= freqs[delDmer];
      }
    }

    /* If this segment is the best so far save it */
    if (activeSegment.score > bestSegment.score) {
      bestSegment = activeSegment;
    }
  }
  {
    /* Trim off the zero frequency head and tail from the segment. */
    U32 newBegin = bestSegment.end;
    U32 newEnd = bestSegment.begin;
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      U32 freq = freqs[ctx->dmerAt[pos]];
      if (freq != 0) {
        newBegin = MIN(newBegin, pos);
        newEnd = pos + 1;
      }
    }
    bestSegment.begin = newBegin;
    bestSegment.end = newEnd;
  }
  {
    /* Zero out the frequency of each dmer covered by the chosen segment. */
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      freqs[ctx->dmerAt[pos]] = 0;
    }
  }
  return bestSegment;
}
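
/* A worked sketch of the sliding-window scoring above (hypothetical numbers):
 * with k = 4 and d = 2, each window holds dmersInK = k - d + 1 = 3 dmers.
 * If a window covers three distinct dmers with freqs {5, 0, 7}, its score is
 * 5 + 0 + 7 = 12. A dmer repeated inside the window is only counted once,
 * since the freqs[] addition is skipped whenever *newDmerOcc > 0.
 */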

/**
 * Check the validity of the parameters.
 * Returns non-zero if the parameters are valid and 0 otherwise.
 */
static int COVER_checkParameters(ZDICT_cover_params_t parameters,
                                 size_t maxDictSize) {
  /* k and d are required parameters */
  if (parameters.d == 0 || parameters.k == 0) {
    return 0;
  }
  /* k <= maxDictSize */
  if (parameters.k > maxDictSize) {
    return 0;
  }
  /* d <= k */
  if (parameters.d > parameters.k) {
    return 0;
  }
  /* 0 < splitPoint <= 1 */
  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
    return 0;
  }
  return 1;
}

/**
 * Clean up a context initialized with `COVER_ctx_init()`.
 */
static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
  if (!ctx) {
    return;
  }
  if (ctx->suffix) {
    free(ctx->suffix);
    ctx->suffix = NULL;
  }
  if (ctx->freqs) {
    free(ctx->freqs);
    ctx->freqs = NULL;
  }
  if (ctx->dmerAt) {
    free(ctx->dmerAt);
    ctx->dmerAt = NULL;
  }
  if (ctx->offsets) {
    free(ctx->offsets);
    ctx->offsets = NULL;
  }
}

/**
 * Prepare a context for dictionary building.
 * The context is only dependent on the parameter `d` and can be used multiple
 * times.
 * Returns 0 on success or an error code on error.
 * The context must be destroyed with `COVER_ctx_destroy()`.
 */
static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                          const size_t *samplesSizes, unsigned nbSamples,
                          unsigned d, double splitPoint, int displayLevel)
{
  const BYTE *const samples = (const BYTE *)samplesBuffer;
  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
  /* Split samples into testing and training sets */
  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
  ctx->displayLevel = displayLevel;
  /* Checks */
  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
                 (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
    return ERROR(srcSize_wrong);
  }
  /* Check if there are at least 5 training samples */
  if (nbTrainSamples < 5) {
    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
    return ERROR(srcSize_wrong);
  }
  /* Check if there's a testing sample */
  if (nbTestSamples < 1) {
    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
    return ERROR(srcSize_wrong);
  }
  /* Zero the context */
  memset(ctx, 0, sizeof(*ctx));
  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
               (unsigned)trainingSamplesSize);
  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
               (unsigned)testSamplesSize);
  ctx->samples = samples;
  ctx->samplesSizes = samplesSizes;
  ctx->nbSamples = nbSamples;
  ctx->nbTrainSamples = nbTrainSamples;
  ctx->nbTestSamples = nbTestSamples;
  /* Partial suffix array */
  ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* Maps index to the dmerID */
  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* The offsets of each file */
  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
    COVER_ctx_destroy(ctx);
    return ERROR(memory_allocation);
  }
  ctx->freqs = NULL;
  ctx->d = d;

  /* Fill offsets from the samplesSizes */
  {
    U32 i;
    ctx->offsets[0] = 0;
    for (i = 1; i <= nbSamples; ++i) {
      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
    }
  }
  DISPLAYLEVEL(2, "Constructing partial suffix array\n");
  {
    /* suffix is a partial suffix array.
     * It only sorts suffixes by their first parameters.d bytes.
     * The sort is stable, so each dmer group is sorted by position in input.
     */
    U32 i;
    for (i = 0; i < ctx->suffixSize; ++i) {
      ctx->suffix[i] = i;
    }
    stableSort(ctx);
  }
  DISPLAYLEVEL(2, "Computing frequencies\n");
  /* For each dmer group (group of positions with the same first d bytes):
   * 1. For each position we set dmerAt[position] = dmerID.  The dmerID is
   *    (groupBeginPtr - suffix).  This allows us to go from position to
   *    dmerID so we can look up values in freq.
   * 2. We calculate how many samples the dmer occurs in and save it in
   *    freqs[dmerId].
   */
  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
                (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
  ctx->freqs = ctx->suffix;
  ctx->suffix = NULL;
  return 0;
}
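
/* Example of the split computation above (hypothetical sizes): with
 * nbSamples = 100 and splitPoint = 0.75, nbTrainSamples = 75 and
 * nbTestSamples = 25. With splitPoint = 1.0 all 100 samples are used for
 * both training and testing.
 */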

void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
{
  const double ratio = (double)nbDmers / (double)maxDictSize;
  if (ratio >= 10) {
      return;
  }
  DISPLAYLEVEL(1,
               "WARNING: The maximum dictionary size %u is too large "
               "compared to the source size %u! "
               "size(source)/size(dictionary) = %f, but it should be >= "
               "10! This may lead to a subpar dictionary! We recommend "
               "training on sources at least 10x, and preferably 100x "
               "the size of the dictionary! \n", (U32)maxDictSize,
               (U32)nbDmers, ratio);
}

COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
                                       U32 nbDmers, U32 k, U32 passes)
{
  const U32 minEpochSize = k * 10;
  COVER_epoch_info_t epochs;
  epochs.num = MAX(1, maxDictSize / k / passes);
  epochs.size = nbDmers / epochs.num;
  if (epochs.size >= minEpochSize) {
      assert(epochs.size * epochs.num <= nbDmers);
      return epochs;
  }
  epochs.size = MIN(minEpochSize, nbDmers);
  epochs.num = nbDmers / epochs.size;
  assert(epochs.size * epochs.num <= nbDmers);
  return epochs;
}
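
/* A worked example (hypothetical sizes): maxDictSize = 100 KB (102400),
 * k = 1024, passes = 4, nbDmers = 1 MB (1048576):
 *   epochs.num  = MAX(1, 102400 / 1024 / 4) = 25
 *   epochs.size = 1048576 / 25 = 41943
 * Since 41943 >= minEpochSize (k * 10 = 10240), these values are kept.
 */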

/**
 * Given the prepared context build the dictionary.
 */
static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                    COVER_map_t *activeDmers, void *dictBuffer,
                                    size_t dictBufferCapacity,
                                    ZDICT_cover_params_t parameters) {
  BYTE *const dict = (BYTE *)dictBuffer;
  size_t tail = dictBufferCapacity;
  /* Divide the data into epochs. We will select one segment from each epoch. */
  const COVER_epoch_info_t epochs = COVER_computeEpochs(
      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
  size_t zeroScoreRun = 0;
  size_t epoch;
  clock_t lastUpdateTime = 0;
  const int displayLevel = ctx->displayLevel;
  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
                (U32)epochs.num, (U32)epochs.size);
  /* Loop through the epochs until there are no more segments or the dictionary
   * is full.
   */
  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
    const U32 epochBegin = (U32)(epoch * epochs.size);
    const U32 epochEnd = epochBegin + epochs.size;
    size_t segmentSize;
    /* Select a segment */
    COVER_segment_t segment = COVER_selectSegment(
        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
    /* If the segment covers no dmers, then we are out of content.
     * There may be new content in other epochs, so continue for some time.
     */
    if (segment.score == 0) {
      if (++zeroScoreRun >= maxZeroScoreRun) {
          break;
      }
      continue;
    }
    zeroScoreRun = 0;
    /* Trim the segment if necessary and if it is too small then we are done */
    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
    if (segmentSize < parameters.d) {
      break;
    }
    /* We fill the dictionary from the back to allow the best segments to be
     * referenced with the smallest offsets.
     */
    tail -= segmentSize;
    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
    DISPLAYUPDATE(
        lastUpdateTime,
        2, "\r%u%%       ",
        (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
  }
  DISPLAYLEVEL(2, "\r%79s\r", "");
  return tail;
}

ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover(
    void *dictBuffer, size_t dictBufferCapacity,
    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
    ZDICT_cover_params_t parameters)
{
  BYTE* const dict = (BYTE*)dictBuffer;
  COVER_ctx_t ctx;
  COVER_map_t activeDmers;
  const int displayLevel = (int)parameters.zParams.notificationLevel;
  parameters.splitPoint = 1.0;
  /* Checks */
  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
    return ERROR(parameter_outOfBound);
  }
  if (nbSamples == 0) {
    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
    return ERROR(srcSize_wrong);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  /* Initialize context and activeDmers */
  {
    size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                      parameters.d, parameters.splitPoint, displayLevel);
    if (ZSTD_isError(initVal)) {
      return initVal;
    }
  }
  COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    COVER_ctx_destroy(&ctx);
    return ERROR(memory_allocation);
  }

  DISPLAYLEVEL(2, "Building dictionary\n");
  {
    const size_t tail =
        COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
                              dictBufferCapacity, parameters);
    const size_t dictionarySize = ZDICT_finalizeDictionary(
        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
        samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
    if (!ZSTD_isError(dictionarySize)) {
      DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                   (unsigned)dictionarySize);
    }
    COVER_ctx_destroy(&ctx);
    COVER_map_destroy(&activeDmers);
    return dictionarySize;
  }
}
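
/* A minimal calling sketch (a hedged example, not part of this file; the
 * buffer names are hypothetical, error handling is elided, and the caller
 * must define ZDICT_STATIC_LINKING_ONLY before including zdict.h):
 *
 *   ZDICT_cover_params_t params;
 *   memset(&params, 0, sizeof(params));
 *   params.k = 1024;   // segment size
 *   params.d = 8;      // dmer size, d <= k
 *   {   size_t const dictSize = ZDICT_trainFromBuffer_cover(
 *           dictBuffer, dictCapacity,
 *           samplesBuffer, samplesSizes, nbSamples, params);
 *       if (ZDICT_isError(dictSize)) { ... }
 *   }
 */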


size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
                                    const size_t *samplesSizes, const BYTE *samples,
                                    size_t *offsets,
                                    size_t nbTrainSamples, size_t nbSamples,
                                    BYTE *const dict, size_t dictBufferCapacity) {
  size_t totalCompressedSize = ERROR(GENERIC);
  /* Pointers */
  ZSTD_CCtx *cctx;
  ZSTD_CDict *cdict;
  void *dst;
  /* Local variables */
  size_t dstCapacity;
  size_t i;
  /* Allocate dst with enough space to compress the maximum sized sample */
  {
    size_t maxSampleSize = 0;
    i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
    for (; i < nbSamples; ++i) {
      maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
    }
    dstCapacity = ZSTD_compressBound(maxSampleSize);
    dst = malloc(dstCapacity);
  }
  /* Create the cctx and cdict */
  cctx = ZSTD_createCCtx();
  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
                           parameters.zParams.compressionLevel);
  if (!dst || !cctx || !cdict) {
    goto _compressCleanup;
  }
  /* Compress each sample and sum their sizes (or error) */
  totalCompressedSize = dictBufferCapacity;
  i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
  for (; i < nbSamples; ++i) {
    const size_t size = ZSTD_compress_usingCDict(
        cctx, dst, dstCapacity, samples + offsets[i],
        samplesSizes[i], cdict);
    if (ZSTD_isError(size)) {
      totalCompressedSize = size;
      goto _compressCleanup;
    }
    totalCompressedSize += size;
  }
_compressCleanup:
  ZSTD_freeCCtx(cctx);
  ZSTD_freeCDict(cdict);
  if (dst) {
    free(dst);
  }
  return totalCompressedSize;
}

/**
 * Initialize the `COVER_best_t`.
 */
void COVER_best_init(COVER_best_t *best) {
  if (best==NULL) return; /* compatible with init on NULL */
  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
  best->liveJobs = 0;
  best->dict = NULL;
  best->dictSize = 0;
  best->compressedSize = (size_t)-1;
  memset(&best->parameters, 0, sizeof(best->parameters));
}

/**
 * Wait until liveJobs == 0.
 */
void COVER_best_wait(COVER_best_t *best) {
  if (!best) {
    return;
  }
  ZSTD_pthread_mutex_lock(&best->mutex);
  while (best->liveJobs != 0) {
    ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
  }
  ZSTD_pthread_mutex_unlock(&best->mutex);
}

/**
 * Call COVER_best_wait() and then destroy the COVER_best_t.
 */
void COVER_best_destroy(COVER_best_t *best) {
  if (!best) {
    return;
  }
  COVER_best_wait(best);
  if (best->dict) {
    free(best->dict);
  }
  ZSTD_pthread_mutex_destroy(&best->mutex);
  ZSTD_pthread_cond_destroy(&best->cond);
}

/**
 * Called when a thread is about to be launched.
 * Increments liveJobs.
 */
void COVER_best_start(COVER_best_t *best) {
  if (!best) {
    return;
  }
  ZSTD_pthread_mutex_lock(&best->mutex);
  ++best->liveJobs;
  ZSTD_pthread_mutex_unlock(&best->mutex);
}

/**
 * Called when a thread finishes executing, on both error and success.
 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
 * If this dictionary is the best so far save it and its parameters.
 */
void COVER_best_finish(COVER_best_t* best,
                      ZDICT_cover_params_t parameters,
                      COVER_dictSelection_t selection)
{
  void* dict = selection.dictContent;
  size_t compressedSize = selection.totalCompressedSize;
  size_t dictSize = selection.dictSize;
  if (!best) {
    return;
  }
  {
    size_t liveJobs;
    ZSTD_pthread_mutex_lock(&best->mutex);
    --best->liveJobs;
    liveJobs = best->liveJobs;
    /* If the new dictionary is better */
    if (compressedSize < best->compressedSize) {
      /* Allocate space if necessary */
      if (!best->dict || best->dictSize < dictSize) {
        if (best->dict) {
          free(best->dict);
        }
        best->dict = malloc(dictSize);
        if (!best->dict) {
          best->compressedSize = ERROR(GENERIC);
          best->dictSize = 0;
          ZSTD_pthread_cond_signal(&best->cond);
          ZSTD_pthread_mutex_unlock(&best->mutex);
          return;
        }
      }
      /* Save the dictionary, parameters, and size */
      if (dict) {
        memcpy(best->dict, dict, dictSize);
        best->dictSize = dictSize;
        best->parameters = parameters;
        best->compressedSize = compressedSize;
      }
    }
    if (liveJobs == 0) {
      ZSTD_pthread_cond_broadcast(&best->cond);
    }
    ZSTD_pthread_mutex_unlock(&best->mutex);
  }
}
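
/* Sketch of the intended lifecycle of a COVER_best_t, as used by
 * ZDICT_optimizeTrainFromBuffer_cover() below:
 *
 *   COVER_best_init(&best);
 *   for each parameter candidate:
 *     COVER_best_start(&best);            // ++liveJobs under the mutex
 *     launch COVER_tryParameters(data);   // which calls COVER_best_finish()
 *   COVER_best_wait(&best);               // blocks until liveJobs == 0
 *   COVER_best_destroy(&best);
 */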

static COVER_dictSelection_t setDictSelection(BYTE* buf, size_t s, size_t csz)
{
    COVER_dictSelection_t ds;
    ds.dictContent = buf;
    ds.dictSize = s;
    ds.totalCompressedSize = csz;
    return ds;
}

COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
    return setDictSelection(NULL, 0, error);
}

unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
  return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
}

void COVER_dictSelectionFree(COVER_dictSelection_t selection){
  free(selection.dictContent);
}

COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {

  size_t largestDict = 0;
  size_t largestCompressed = 0;
  BYTE* customDictContentEnd = customDictContent + dictContentSize;

  BYTE* largestDictbuffer = (BYTE*)malloc(dictBufferCapacity);
  BYTE* candidateDictBuffer = (BYTE*)malloc(dictBufferCapacity);
  double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;

  if (!largestDictbuffer || !candidateDictBuffer) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(dictContentSize);
  }

  /* Initial dictionary size and compressed size */
  memcpy(largestDictbuffer, customDictContent, dictContentSize);
  dictContentSize = ZDICT_finalizeDictionary(
    largestDictbuffer, dictBufferCapacity, customDictContent, dictContentSize,
    samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

  if (ZDICT_isError(dictContentSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(dictContentSize);
  }

  totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                       samplesBuffer, offsets,
                                                       nbCheckSamples, nbSamples,
                                                       largestDictbuffer, dictContentSize);

  if (ZSTD_isError(totalCompressedSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(totalCompressedSize);
  }

  if (params.shrinkDict == 0) {
    free(candidateDictBuffer);
    return setDictSelection(largestDictbuffer, dictContentSize, totalCompressedSize);
  }

  largestDict = dictContentSize;
  largestCompressed = totalCompressedSize;
  dictContentSize = ZDICT_DICTSIZE_MIN;

  /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
  while (dictContentSize < largestDict) {
    memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
    dictContentSize = ZDICT_finalizeDictionary(
      candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize,
      samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

    if (ZDICT_isError(dictContentSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(dictContentSize);
    }

    totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                         samplesBuffer, offsets,
                                                         nbCheckSamples, nbSamples,
                                                         candidateDictBuffer, dictContentSize);

    if (ZSTD_isError(totalCompressedSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(totalCompressedSize);
    }

    if ((double)totalCompressedSize <= (double)largestCompressed * regressionTolerance) {
      free(largestDictbuffer);
      return setDictSelection(candidateDictBuffer, dictContentSize, totalCompressedSize);
    }
    dictContentSize *= 2;
  }
  dictContentSize = largestDict;
  totalCompressedSize = largestCompressed;
  free(candidateDictBuffer);
  return setDictSelection(largestDictbuffer, dictContentSize, totalCompressedSize);
}
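
/* Example of the shrinking tolerance above (hypothetical value): with
 * params.shrinkDictMaxRegression = 5, regressionTolerance = 5/100 + 1.00
 * = 1.05, so a smaller candidate dictionary is accepted as soon as its total
 * compressed size is within 5% of the result of the largest dictionary.
 */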

/**
 * Parameters for COVER_tryParameters().
 */
typedef struct COVER_tryParameters_data_s {
  const COVER_ctx_t *ctx;
  COVER_best_t *best;
  size_t dictBufferCapacity;
  ZDICT_cover_params_t parameters;
} COVER_tryParameters_data_t;

/**
 * Tries a set of parameters and updates the COVER_best_t with the results.
 * This function is thread safe if zstd is compiled with multithreaded support.
 * It takes its parameters as an *OWNING* opaque pointer to support threading.
 */
static void COVER_tryParameters(void *opaque)
{
  /* Save parameters as local variables */
  COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t*)opaque;
  const COVER_ctx_t *const ctx = data->ctx;
  const ZDICT_cover_params_t parameters = data->parameters;
  size_t dictBufferCapacity = data->dictBufferCapacity;
  size_t totalCompressedSize = ERROR(GENERIC);
  /* Allocate space for hash table, dict, and freqs */
  COVER_map_t activeDmers;
  BYTE* const dict = (BYTE*)malloc(dictBufferCapacity);
  COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
  U32* const freqs = (U32*)malloc(ctx->suffixSize * sizeof(U32));
  const int displayLevel = ctx->displayLevel;
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    goto _cleanup;
  }
  if (!dict || !freqs) {
    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
    goto _cleanup;
  }
  /* Copy the frequencies because we need to modify them */
  memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
  /* Build the dictionary */
  {
    const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
                                              dictBufferCapacity, parameters);
    selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
        totalCompressedSize);

    if (COVER_dictSelectionIsError(selection)) {
      DISPLAYLEVEL(1, "Failed to select dictionary\n");
      goto _cleanup;
    }
  }
_cleanup:
  free(dict);
  COVER_best_finish(data->best, parameters, selection);
  free(data);
  COVER_map_destroy(&activeDmers);
  COVER_dictSelectionFree(selection);
  free(freqs);
}

ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover(
    void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer,
    const size_t* samplesSizes, unsigned nbSamples,
    ZDICT_cover_params_t* parameters)
{
  /* constants */
  const unsigned nbThreads = parameters->nbThreads;
  const double splitPoint =
      parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
  const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
  const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
  const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
  const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
  const unsigned kIterations =
      (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
  const unsigned shrinkDict = 0;
  /* Local variables */
  int displayLevel = (int)parameters->zParams.notificationLevel;
  unsigned iteration = 1;
  unsigned d;
  unsigned k;
  COVER_best_t best;
  POOL_ctx *pool = NULL;
  int warned = 0;
  clock_t lastUpdateTime = 0;

  /* Checks */
  if (splitPoint <= 0 || splitPoint > 1) {
    DISPLAYLEVEL(1, "Incorrect parameters\n");
    return ERROR(parameter_outOfBound);
  }
  if (kMinK < kMaxD || kMaxK < kMinK) {
    DISPLAYLEVEL(1, "Incorrect parameters\n");
    return ERROR(parameter_outOfBound);
  }
  if (nbSamples == 0) {
    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
    return ERROR(srcSize_wrong);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  if (nbThreads > 1) {
    pool = POOL_create(nbThreads, 1);
    if (!pool) {
      return ERROR(memory_allocation);
    }
  }
  /* Initialization */
  COVER_best_init(&best);
  /* Loop through d first because each new value needs a new context */
  DISPLAYLEVEL(2, "Trying %u different sets of parameters\n",
                    kIterations);
  for (d = kMinD; d <= kMaxD; d += 2) {
    /* Initialize the context for this value of d */
    COVER_ctx_t ctx;
    DISPLAYLEVEL(3, "d=%u\n", d);
    {
      /* Turn down the global display level to clean up the display at level 2 and below */
      const int childDisplayLevel = (displayLevel == 0) ? 0 : displayLevel - 1;
      const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, childDisplayLevel);
      if (ZSTD_isError(initVal)) {
        DISPLAYLEVEL(1, "Failed to initialize context\n");
        COVER_best_destroy(&best);
        POOL_free(pool);
        return initVal;
      }
    }
    if (!warned) {
      COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
      warned = 1;
    }
    /* Loop through k reusing the same context */
    for (k = kMinK; k <= kMaxK; k += kStepSize) {
      /* Prepare the arguments */
      COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
          sizeof(COVER_tryParameters_data_t));
      DISPLAYLEVEL(3, "k=%u\n", k);
      if (!data) {
        DISPLAYLEVEL(1, "Failed to allocate parameters\n");
        COVER_best_destroy(&best);
        COVER_ctx_destroy(&ctx);
        POOL_free(pool);
        return ERROR(memory_allocation);
      }
      data->ctx = &ctx;
      data->best = &best;
      data->dictBufferCapacity = dictBufferCapacity;
      data->parameters = *parameters;
      data->parameters.k = k;
      data->parameters.d = d;
      data->parameters.splitPoint = splitPoint;
      data->parameters.steps = kSteps;
      data->parameters.shrinkDict = shrinkDict;
      data->parameters.zParams.notificationLevel = (unsigned)ctx.displayLevel;
      /* Check the parameters */
      if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
        DISPLAYLEVEL(1, "Cover parameters incorrect\n");
        free(data);
        continue;
      }
      /* Call the function and pass ownership of data to it */
      COVER_best_start(&best);
      if (pool) {
        POOL_add(pool, &COVER_tryParameters, data);
      } else {
        COVER_tryParameters(data);
      }
      /* Print status */
      DISPLAYUPDATE(lastUpdateTime, 2, "\r%u%%       ",
                    (unsigned)((iteration * 100) / kIterations));
      ++iteration;
    }
    COVER_best_wait(&best);
    COVER_ctx_destroy(&ctx);
  }
  DISPLAYLEVEL(2, "\r%79s\r", "");
  /* Fill the output buffer and parameters with the output of the best parameters */
  {
    const size_t dictSize = best.dictSize;
    if (ZSTD_isError(best.compressedSize)) {
      const size_t compressedSize = best.compressedSize;
      COVER_best_destroy(&best);
      POOL_free(pool);
      return compressedSize;
    }
    *parameters = best.parameters;
    memcpy(dictBuffer, best.dict, dictSize);
    COVER_best_destroy(&best);
    POOL_free(pool);
    return dictSize;
  }
}
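
/* A worked example of the default parameter grid above (all-zero inputs,
 * i.e. d = 0, k = 0, steps = 0): kMinD = 6, kMaxD = 8, kMinK = 50,
 * kMaxK = 2000, kSteps = 40, so kStepSize = MAX((2000 - 50) / 40, 1) = 48 and
 * kIterations = (1 + (8 - 6) / 2) * (1 + (2000 - 50) / 48) = 2 * 41 = 82
 * parameter sets are tried.
 */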