Coverage Report

Created: 2024-04-24 06:23

/src/tarantool/src/box/vy_regulator.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
3
 *
4
 * Redistribution and use in source and binary forms, with or
5
 * without modification, are permitted provided that the following
6
 * conditions are met:
7
 *
8
 * 1. Redistributions of source code must retain the above
9
 *    copyright notice, this list of conditions and the
10
 *    following disclaimer.
11
 *
12
 * 2. Redistributions in binary form must reproduce the above
13
 *    copyright notice, this list of conditions and the following
14
 *    disclaimer in the documentation and/or other materials
15
 *    provided with the distribution.
16
 *
17
 * THIS SOFTWARE IS PROVIDED BY AUTHORS ``AS IS'' AND
18
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
21
 * AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
22
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
25
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
28
 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29
 * SUCH DAMAGE.
30
 */
31
#include "vy_regulator.h"
32
33
#include <math.h>
34
#include <stdbool.h>
35
#include <stddef.h>
36
#include <stdint.h>
37
#include <string.h>
38
#include <tarantool_ev.h>
39
40
#include "fiber.h"
41
#include "histogram.h"
42
#include "say.h"
43
#include "trivia/util.h"
44
45
#include "vy_quota.h"
46
#include "vy_stat.h"
47
48
/**
49
 * Regulator timer period, in seconds.
50
 */
51
static const double VY_REGULATOR_TIMER_PERIOD = 1;
52
53
/**
54
 * Time window over which the write rate is averaged,
55
 * in seconds.
56
 */
57
static const double VY_WRITE_RATE_AVG_WIN = 5;
58
59
/**
60
 * Histogram percentile used for estimating dump bandwidth.
61
 * For details see the comment to vy_regulator::dump_bandwidth_hist.
62
 */
63
static const int VY_DUMP_BANDWIDTH_PCT = 10;
64
65
/*
66
 * Until we dump anything, assume bandwidth to be 10 MB/s,
67
 * which should be fine for initial guess.
68
 */
69
static const size_t VY_DUMP_BANDWIDTH_DEFAULT = 10 * 1024 * 1024;
70
71
/**
72
 * Do not take into account small dumps when estimating dump
73
 * bandwidth, because they have too high overhead associated
74
 * with file creation.
75
 */
76
static const size_t VY_DUMP_SIZE_ACCT_MIN = 1024 * 1024;
77
78
/**
79
 * Number of dumps to take into account for rate limit calculation.
80
 * Shouldn't be too small to avoid uneven RPS. Shouldn't be too big
81
 * either - otherwise the rate limit will adapt too slowly to workload
82
 * changes. 100 feels like a good choice.
83
 */
84
static const int VY_RECENT_DUMP_COUNT = 100;
85
86
static void
87
vy_regulator_trigger_dump(struct vy_regulator *regulator)
88
0
{
89
0
  if (regulator->dump_in_progress)
90
0
    return;
91
92
0
  if (regulator->trigger_dump_cb(regulator) != 0)
93
0
    return;
94
95
0
  regulator->dump_in_progress = true;
96
97
  /*
98
   * To avoid unpredictably long stalls, we must limit
99
   * the write rate when a dump is in progress so that
100
   * we don't hit the hard limit before the dump has
101
   * completed, i.e.
102
   *
103
   *    mem_left        mem_used
104
   *   ---------- >= --------------
105
   *   write_rate    dump_bandwidth
106
   */
107
0
  struct vy_quota *quota = regulator->quota;
108
0
  size_t mem_left = (quota->used < quota->limit ?
109
0
         quota->limit - quota->used : 0);
110
0
  size_t mem_used = quota->used;
111
0
  size_t max_write_rate = (double)mem_left / (mem_used + 1) *
112
0
          regulator->dump_bandwidth;
113
0
  max_write_rate = MIN(max_write_rate, regulator->dump_bandwidth);
114
0
  vy_quota_set_rate_limit(quota, VY_QUOTA_RESOURCE_MEMORY,
115
0
        max_write_rate);
116
117
0
  say_info("dumping %zu bytes, expected rate %.1f MB/s, "
118
0
     "ETA %.1f s, write rate (avg/max) %.1f/%.1f MB/s",
119
0
     quota->used, (double)regulator->dump_bandwidth / 1024 / 1024,
120
0
     (double)quota->used / (regulator->dump_bandwidth + 1),
121
0
     (double)regulator->write_rate / 1024 / 1024,
122
0
     (double)regulator->write_rate_max / 1024 / 1024);
123
124
0
  regulator->write_rate_max = regulator->write_rate;
125
0
}
126
127
static void
128
vy_regulator_update_write_rate(struct vy_regulator *regulator)
129
0
{
130
0
  size_t used_curr = regulator->quota->used;
131
0
  size_t used_last = regulator->quota_used_last;
132
133
  /*
134
   * Memory can be dumped between two subsequent timer
135
   * callback invocations, in which case memory usage
136
   * will decrease. Ignore such observations - it's not
137
   * a big deal, because dump is a rare event.
138
   */
139
0
  if (used_curr < used_last) {
140
0
    regulator->quota_used_last = used_curr;
141
0
    return;
142
0
  }
143
144
0
  size_t rate_avg = regulator->write_rate;
145
0
  size_t rate_curr = (used_curr - used_last) / VY_REGULATOR_TIMER_PERIOD;
146
147
0
  double weight = 1 - exp(-VY_REGULATOR_TIMER_PERIOD /
148
0
        VY_WRITE_RATE_AVG_WIN);
149
0
  rate_avg = (1 - weight) * rate_avg + weight * rate_curr;
150
151
0
  regulator->write_rate = rate_avg;
152
0
  if (regulator->write_rate_max < rate_curr)
153
0
    regulator->write_rate_max = rate_curr;
154
0
  regulator->quota_used_last = used_curr;
155
0
}
156
157
static void
158
vy_regulator_update_dump_watermark(struct vy_regulator *regulator)
159
0
{
160
0
  struct vy_quota *quota = regulator->quota;
161
162
  /*
163
   * Due to log structured nature of the lsregion allocator,
164
   * which is used for allocating statements, we cannot free
165
   * memory in chunks, only all at once. Therefore we should
166
   * configure the watermark so that by the time we hit the
167
   * limit, all memory have been dumped, i.e.
168
   *
169
   *   limit - watermark      watermark
170
   *   ----------------- = --------------
171
   *       write_rate      dump_bandwidth
172
   *
173
   * Be pessimistic when predicting the write rate - use the
174
   * max observed write rate multiplied by 1.5 - because it's
175
   * better to start memory dump early than delay it as long
176
   * as possible at the risk of experiencing unpredictably
177
   * long stalls.
178
   */
179
0
  size_t write_rate = regulator->write_rate_max * 3 / 2;
180
0
  regulator->dump_watermark =
181
0
      (double)quota->limit * regulator->dump_bandwidth /
182
0
      (regulator->dump_bandwidth + write_rate + 1);
183
  /*
184
   * It doesn't make sense to set the watermark below 50%
185
   * of the memory limit because the write rate can exceed
186
   * the dump bandwidth under no circumstances.
187
   */
188
0
  regulator->dump_watermark = MAX(regulator->dump_watermark,
189
0
          quota->limit / 2);
190
0
}
191
192
static void
193
vy_regulator_timer_cb(ev_loop *loop, ev_timer *timer, int events)
194
0
{
195
0
  (void)loop;
196
0
  (void)events;
197
198
0
  struct vy_regulator *regulator = timer->data;
199
200
0
  vy_regulator_update_write_rate(regulator);
201
0
  vy_regulator_update_dump_watermark(regulator);
202
0
  vy_regulator_check_dump_watermark(regulator);
203
0
}
204
205
void
206
vy_regulator_create(struct vy_regulator *regulator, struct vy_quota *quota,
207
        vy_trigger_dump_f trigger_dump_cb)
208
0
{
209
0
  enum { KB = 1024, MB = KB * KB };
210
0
  static int64_t dump_bandwidth_buckets[] = {
211
0
    100 * KB, 200 * KB, 300 * KB, 400 * KB, 500 * KB, 600 * KB,
212
0
    700 * KB, 800 * KB, 900 * KB,   1 * MB,   2 * MB,   3 * MB,
213
0
      4 * MB,   5 * MB,   6 * MB,   7 * MB,   8 * MB,   9 * MB,
214
0
     10 * MB,  15 * MB,  20 * MB,  25 * MB,  30 * MB,  35 * MB,
215
0
     40 * MB,  45 * MB,  50 * MB,  55 * MB,  60 * MB,  65 * MB,
216
0
     70 * MB,  75 * MB,  80 * MB,  85 * MB,  90 * MB,  95 * MB,
217
0
    100 * MB, 200 * MB, 300 * MB, 400 * MB, 500 * MB, 600 * MB,
218
0
    700 * MB, 800 * MB, 900 * MB,
219
0
  };
220
0
  memset(regulator, 0, sizeof(*regulator));
221
0
  regulator->dump_bandwidth_hist = histogram_new(dump_bandwidth_buckets,
222
0
          lengthof(dump_bandwidth_buckets));
223
0
  if (regulator->dump_bandwidth_hist == NULL)
224
0
    panic("failed to allocate dump bandwidth histogram");
225
226
0
  regulator->quota = quota;
227
0
  regulator->trigger_dump_cb = trigger_dump_cb;
228
0
  ev_timer_init(&regulator->timer, vy_regulator_timer_cb, 0,
229
0
          VY_REGULATOR_TIMER_PERIOD);
230
0
  regulator->timer.data = regulator;
231
0
  regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
232
0
  regulator->dump_watermark = SIZE_MAX;
233
0
}
234
235
void
236
vy_regulator_start(struct vy_regulator *regulator)
237
0
{
238
0
  regulator->quota_used_last = regulator->quota->used;
239
0
  vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
240
0
        regulator->dump_bandwidth);
241
0
  ev_timer_start(loop(), &regulator->timer);
242
0
}
243
244
void
245
vy_regulator_destroy(struct vy_regulator *regulator)
246
0
{
247
0
  ev_timer_stop(loop(), &regulator->timer);
248
0
  histogram_delete(regulator->dump_bandwidth_hist);
249
0
}
250
251
void
252
vy_regulator_quota_exceeded(struct vy_regulator *regulator)
253
0
{
254
0
  vy_regulator_trigger_dump(regulator);
255
0
}
256
257
void
258
vy_regulator_check_dump_watermark(struct vy_regulator *regulator)
259
0
{
260
0
  if (regulator->quota->used >= regulator->dump_watermark)
261
0
    vy_regulator_trigger_dump(regulator);
262
0
}
263
264
void
265
vy_regulator_dump_complete(struct vy_regulator *regulator,
266
         size_t mem_dumped, double dump_duration)
267
0
{
268
0
  regulator->dump_in_progress = false;
269
270
0
  if (mem_dumped >= VY_DUMP_SIZE_ACCT_MIN && dump_duration > 0) {
271
0
    histogram_collect(regulator->dump_bandwidth_hist,
272
0
          mem_dumped / dump_duration);
273
    /*
274
     * To avoid unpredictably long stalls caused by
275
     * mispredicting dump time duration, we need to
276
     * know the worst (smallest) dump bandwidth so
277
     * use a lower-bound percentile estimate.
278
     */
279
0
    regulator->dump_bandwidth = histogram_percentile_lower(
280
0
      regulator->dump_bandwidth_hist, VY_DUMP_BANDWIDTH_PCT);
281
0
  }
282
283
  /*
284
   * Reset the rate limit.
285
   *
286
   * It doesn't make sense to allow to consume memory at
287
   * a higher rate than it can be dumped so we set the rate
288
   * limit to the dump bandwidth rather than disabling it
289
   * completely.
290
   */
291
0
  vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
292
0
        regulator->dump_bandwidth);
293
294
0
  if (dump_duration > 0) {
295
0
    say_info("dumped %zu bytes in %.1f s, rate %.1f MB/s",
296
0
       mem_dumped, dump_duration,
297
0
       mem_dumped / dump_duration / 1024 / 1024);
298
0
  }
299
0
}
300
301
void
302
vy_regulator_set_memory_limit(struct vy_regulator *regulator, size_t limit)
303
0
{
304
0
  vy_quota_set_limit(regulator->quota, limit);
305
0
  vy_regulator_update_dump_watermark(regulator);
306
0
}
307
308
void
309
vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max)
310
0
{
311
0
  histogram_reset(regulator->dump_bandwidth_hist);
312
0
  regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
313
0
  if (max > 0 && regulator->dump_bandwidth > max)
314
0
    regulator->dump_bandwidth = max;
315
0
  vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
316
0
        regulator->dump_bandwidth);
317
0
}
318
319
void
320
vy_regulator_reset_stat(struct vy_regulator *regulator)
321
0
{
322
0
  memset(&regulator->sched_stat_last, 0,
323
0
         sizeof(regulator->sched_stat_last));
324
0
}
325
326
/*
327
 * The goal of rate limiting is to ensure LSM trees stay close to
328
 * their perfect shape, as defined by run_size_ratio. When dump rate
329
 * is too high, we have to throttle database writes to ensure
330
 * compaction can keep up with dumps. We can't deduce optimal dump
331
 * bandwidth from LSM configuration, such as run_size_ratio or
332
 * run_count_per_level, since different spaces or different indexes
333
 * within a space can have different configuration settings. The
334
 * workload can also vary significantly from space to space. So,
335
 * when setting the limit, we have to consider dump and compaction
336
 * activities of the database as a whole.
337
 *
338
 * To this end, we keep track of compaction bandwidth and write
339
 * amplification of the entire database, across all LSM trees.
340
 * The idea is simple: observe the current write amplification
341
 * and compaction bandwidth, and set maximal write rate to a value
342
 * somewhat below the implied limit, so as to make room for
343
 * compaction to do more work if necessary.
344
 *
345
 * We use the following metrics to calculate the limit:
346
 *  - dump_output - number of bytes dumped to disk over the last
347
 *    observation period. The period itself is measured in dumps,
348
 *    not seconds, and is defined by constant VY_RECENT_DUMP_COUNT.
349
 *  - compaction_output - number of bytes produced by compaction
350
 *    over the same period.
351
 *  - compaction_rate - total compaction output, in bytes, divided
352
 *    by total time spent on doing compaction by compaction threads,
353
 *    both measured over the same observation period. This gives an
354
 *    estimate of the speed at which compaction can write output.
355
 *    In the real world this speed is dependent on the disk write
356
 *    throughput, number of dump threads, and actual dump rate, but
357
 *    given the goal of rate limiting is providing compaction with
358
 *    extra bandwidth, this metric is considered an accurate enough
359
 *    approximation of the disk bandwidth available to compaction.
360
 *
361
 * We calculate the compaction rate with the following formula:
362
 *
363
 *                                            compaction_output
364
 *     compaction_rate = compaction_threads * -----------------
365
 *                                             compaction_time
366
 *
367
 * where compaction_threads represents the total number of available
368
 * compaction threads and compaction_time is the total time, in
369
 * seconds, spent by all threads doing compaction. You can look at
370
 * the formula this way: compaction_ouptut / compaction_time gives
371
 * the average write speed of a single compaction thread, and by
372
 * multiplying it by the number of compaction threads we get the
373
 * compaction rate of the entire database.
374
 *
375
 * In an optimal system dump rate must be proportional to compaction
376
 * rate and inverse to write amplification:
377
 *
378
 *     dump_rate = compaction_rate / (write_amplification - 1)
379
 *
380
 * The latter can be obtained by dividing total output of compaction
381
 * by total output of dumps over the observation period:
382
 *
383
 *                           dump_output + compaction_output
384
 *     write_amplification = ------------------------------- =
385
 *                                    dump_output
386
 *
387
 *                         = 1 + compaction_output / dump_output
388
 *
389
 * Putting this all together and taking into account data compaction
390
 * during memory dump, we get for the max transaction rate:
391
 *
392
 *                           dump_input
393
 *     tx_rate = dump_rate * ----------- =
394
 *                           dump_output
395
 *
396
 *                                    compaction_output
397
 *             = compaction_threads * ----------------- *
398
 *                                     compaction_time
399
 *
400
 *                              dump_output      dump_input
401
 *                         * ----------------- * ----------- =
402
 *                           compaction_output   dump_output
403
 *
404
 *             = compaction_threads * dump_input / compaction_time
405
 *
406
 * We set the rate limit to 0.75 of the approximated optimal to
407
 * leave the database engine enough room needed to use more disk
408
 * bandwidth for compaction if necessary. As soon as compaction gets
409
 * enough disk bandwidth to keep LSM trees in optimal shape
410
 * compaction speed becomes stable, as does write amplification.
411
 */
412
void
413
vy_regulator_update_rate_limit(struct vy_regulator *regulator,
414
             const struct vy_scheduler_stat *stat,
415
             int compaction_threads)
416
0
{
417
0
  struct vy_scheduler_stat *last = &regulator->sched_stat_last;
418
0
  struct vy_scheduler_stat *recent = &regulator->sched_stat_recent;
419
420
0
  int32_t dump_count = stat->dump_count - last->dump_count;
421
0
  int64_t dump_input = stat->dump_input - last->dump_input;
422
0
  double compaction_time = stat->compaction_time - last->compaction_time;
423
0
  *last = *stat;
424
425
0
  if (dump_input < (ssize_t)VY_DUMP_SIZE_ACCT_MIN || compaction_time == 0)
426
0
    return;
427
428
0
  recent->dump_count += dump_count;
429
0
  recent->dump_input += dump_input;
430
0
  recent->compaction_time += compaction_time;
431
432
0
  double rate = 0.75 * compaction_threads * recent->dump_input /
433
0
              recent->compaction_time;
434
  /*
435
   * We can't simply use (size_t)MIN(rate, SIZE_MAX) to cast
436
   * the rate from double to size_t here, because on a 64-bit
437
   * system SIZE_MAX equals 2^64-1, which can't be represented
438
   * as double without loss of precision and hence is rounded
439
   * up to 2^64, which in turn can't be converted back to size_t.
440
   * So we first convert the rate to uint64_t using exp2(64) to
441
   * check if it fits and only then cast the uint64_t to size_t.
442
   */
443
0
  uint64_t rate64;
444
0
  if (rate < exp2(64))
445
0
    rate64 = rate;
446
0
  else
447
0
    rate64 = UINT64_MAX;
448
0
  vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_DISK,
449
0
        (size_t)MIN(rate64, SIZE_MAX));
450
451
  /*
452
   * Periodically rotate statistics for quicker adaptation
453
   * to workload changes.
454
   */
455
0
  if (recent->dump_count > VY_RECENT_DUMP_COUNT) {
456
0
    recent->dump_count /= 2;
457
0
    recent->dump_input /= 2;
458
0
    recent->compaction_time /= 2;
459
0
  }
460
0
}