/src/tarantool/src/box/vy_regulator.c

Source (jump to first uncovered line)
/*
 * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * 1. Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the
 *    following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "vy_regulator.h"

#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <tarantool_ev.h>

#include "fiber.h"
#include "histogram.h"
#include "say.h"
#include "trivia/util.h"

#include "vy_quota.h"
#include "vy_stat.h"

/**
 * Regulator timer period, in seconds.
 */
static const double VY_REGULATOR_TIMER_PERIOD = 1;

/**
 * Time window over which the write rate is averaged,
 * in seconds.
 */
static const double VY_WRITE_RATE_AVG_WIN = 5;

/**
 * Histogram percentile used for estimating dump bandwidth.
 * For details see the comment to vy_regulator::dump_bandwidth_hist.
 */
static const int VY_DUMP_BANDWIDTH_PCT = 10;

/*
 * Until we dump anything, assume bandwidth to be 10 MB/s,
 * which should be fine for initial guess.
 */
static const size_t VY_DUMP_BANDWIDTH_DEFAULT = 10 * 1024 * 1024;

/**
 * Do not take into account small dumps when estimating dump
 * bandwidth, because they have too high overhead associated
 * with file creation.
 */
static const size_t VY_DUMP_SIZE_ACCT_MIN = 1024 * 1024;

/**
 * Number of dumps to take into account for rate limit calculation.
 * Shouldn't be too small to avoid uneven RPS. Shouldn't be too big
 * either - otherwise the rate limit will adapt too slowly to workload
 * changes. 100 feels like a good choice.
 */
static const int VY_RECENT_DUMP_COUNT = 100;

static void
vy_regulator_trigger_dump(struct vy_regulator *regulator)
{
  if (regulator->dump_in_progress)
    return;

  if (regulator->trigger_dump_cb(regulator) != 0)
    return;

  regulator->dump_in_progress = true;

  /*
   * To avoid unpredictably long stalls, we must limit
   * the write rate when a dump is in progress so that
   * we don't hit the hard limit before the dump has
   * completed, i.e.
   *
   *    mem_left        mem_used
   *   ---------- >= --------------
   *   write_rate    dump_bandwidth
   */
  struct vy_quota *quota = regulator->quota;
  size_t mem_left = (quota->used < quota->limit ?
         quota->limit - quota->used : 0);
  size_t mem_used = quota->used;
  size_t max_write_rate = (double)mem_left / (mem_used + 1) *
          regulator->dump_bandwidth;
  max_write_rate = MIN(max_write_rate, regulator->dump_bandwidth);
  vy_quota_set_rate_limit(quota, VY_QUOTA_RESOURCE_MEMORY,
        max_write_rate);

  say_info("dumping %zu bytes, expected rate %.1f MB/s, "
     "ETA %.1f s, write rate (avg/max) %.1f/%.1f MB/s",
     quota->used, (double)regulator->dump_bandwidth / 1024 / 1024,
     (double)quota->used / (regulator->dump_bandwidth + 1),
     (double)regulator->write_rate / 1024 / 1024,
     (double)regulator->write_rate_max / 1024 / 1024);

  regulator->write_rate_max = regulator->write_rate;
}

static void
vy_regulator_update_write_rate(struct vy_regulator *regulator)
{
  size_t used_curr = regulator->quota->used;
  size_t used_last = regulator->quota_used_last;

  /*
   * Memory can be dumped between two subsequent timer
   * callback invocations, in which case memory usage
   * will decrease. Ignore such observations - it's not
   * a big deal, because dump is a rare event.
   */
  if (used_curr < used_last) {
    regulator->quota_used_last = used_curr;
    return;
  }

  size_t rate_avg = regulator->write_rate;
  size_t rate_curr = (used_curr - used_last) / VY_REGULATOR_TIMER_PERIOD;

  double weight = 1 - exp(-VY_REGULATOR_TIMER_PERIOD /
        VY_WRITE_RATE_AVG_WIN);
  rate_avg = (1 - weight) * rate_avg + weight * rate_curr;

  regulator->write_rate = rate_avg;
  if (regulator->write_rate_max < rate_curr)
    regulator->write_rate_max = rate_curr;
  regulator->quota_used_last = used_curr;
}

static void
vy_regulator_update_dump_watermark(struct vy_regulator *regulator)
{
  struct vy_quota *quota = regulator->quota;

  /*
   * Due to log structured nature of the lsregion allocator,
   * which is used for allocating statements, we cannot free
   * memory in chunks, only all at once. Therefore we should
   * configure the watermark so that by the time we hit the
   * limit, all memory have been dumped, i.e.
   *
   *   limit - watermark      watermark
   *   ----------------- = --------------
   *       write_rate      dump_bandwidth
   *
   * Be pessimistic when predicting the write rate - use the
   * max observed write rate multiplied by 1.5 - because it's
   * better to start memory dump early than delay it as long
   * as possible at the risk of experiencing unpredictably
   * long stalls.
   */
  size_t write_rate = regulator->write_rate_max * 3 / 2;
  regulator->dump_watermark =
      (double)quota->limit * regulator->dump_bandwidth /
      (regulator->dump_bandwidth + write_rate + 1);
  /*
   * It doesn't make sense to set the watermark below 50%
   * of the memory limit because the write rate can exceed
   * the dump bandwidth under no circumstances.
   */
  regulator->dump_watermark = MAX(regulator->dump_watermark,
          quota->limit / 2);
}

static void
vy_regulator_timer_cb(ev_loop *loop, ev_timer *timer, int events)
{
  (void)loop;
  (void)events;

  struct vy_regulator *regulator = timer->data;

  vy_regulator_update_write_rate(regulator);
  vy_regulator_update_dump_watermark(regulator);
  vy_regulator_check_dump_watermark(regulator);
}

void
vy_regulator_create(struct vy_regulator *regulator, struct vy_quota *quota,
        vy_trigger_dump_f trigger_dump_cb)
{
  enum { KB = 1024, MB = KB * KB };
  static int64_t dump_bandwidth_buckets[] = {
    100 * KB, 200 * KB, 300 * KB, 400 * KB, 500 * KB, 600 * KB,
    700 * KB, 800 * KB, 900 * KB,   1 * MB,   2 * MB,   3 * MB,
      4 * MB,   5 * MB,   6 * MB,   7 * MB,   8 * MB,   9 * MB,
     10 * MB,  15 * MB,  20 * MB,  25 * MB,  30 * MB,  35 * MB,
     40 * MB,  45 * MB,  50 * MB,  55 * MB,  60 * MB,  65 * MB,
     70 * MB,  75 * MB,  80 * MB,  85 * MB,  90 * MB,  95 * MB,
    100 * MB, 200 * MB, 300 * MB, 400 * MB, 500 * MB, 600 * MB,
    700 * MB, 800 * MB, 900 * MB,
  };
  memset(regulator, 0, sizeof(*regulator));
  regulator->dump_bandwidth_hist = histogram_new(dump_bandwidth_buckets,
          lengthof(dump_bandwidth_buckets));
  if (regulator->dump_bandwidth_hist == NULL)
    panic("failed to allocate dump bandwidth histogram");

  regulator->quota = quota;
  regulator->trigger_dump_cb = trigger_dump_cb;
  ev_timer_init(&regulator->timer, vy_regulator_timer_cb, 0,
          VY_REGULATOR_TIMER_PERIOD);
  regulator->timer.data = regulator;
  regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
  regulator->dump_watermark = SIZE_MAX;
}

void
vy_regulator_start(struct vy_regulator *regulator)
{
  regulator->quota_used_last = regulator->quota->used;
  vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
        regulator->dump_bandwidth);
  ev_timer_start(loop(), &regulator->timer);
}

void
vy_regulator_destroy(struct vy_regulator *regulator)
{
  ev_timer_stop(loop(), &regulator->timer);
  histogram_delete(regulator->dump_bandwidth_hist);
}

void
vy_regulator_quota_exceeded(struct vy_regulator *regulator)
{
  vy_regulator_trigger_dump(regulator);
}

void
vy_regulator_check_dump_watermark(struct vy_regulator *regulator)
{
  if (regulator->quota->used >= regulator->dump_watermark)
    vy_regulator_trigger_dump(regulator);
}

void
vy_regulator_dump_complete(struct vy_regulator *regulator,
         size_t mem_dumped, double dump_duration)
{
  regulator->dump_in_progress = false;

  if (mem_dumped >= VY_DUMP_SIZE_ACCT_MIN && dump_duration > 0) {
    histogram_collect(regulator->dump_bandwidth_hist,
          mem_dumped / dump_duration);
    /*
     * To avoid unpredictably long stalls caused by
     * mispredicting dump time duration, we need to
     * know the worst (smallest) dump bandwidth so
     * use a lower-bound percentile estimate.
     */
    regulator->dump_bandwidth = histogram_percentile_lower(
      regulator->dump_bandwidth_hist, VY_DUMP_BANDWIDTH_PCT);
  }

  /*
   * Reset the rate limit.
   *
   * It doesn't make sense to allow to consume memory at
   * a higher rate than it can be dumped so we set the rate
   * limit to the dump bandwidth rather than disabling it
   * completely.
   */
  vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
        regulator->dump_bandwidth);

  if (dump_duration > 0) {
    say_info("dumped %zu bytes in %.1f s, rate %.1f MB/s",
       mem_dumped, dump_duration,
       mem_dumped / dump_duration / 1024 / 1024);
  }
}

void
vy_regulator_set_memory_limit(struct vy_regulator *regulator, size_t limit)
{
  vy_quota_set_limit(regulator->quota, limit);
  vy_regulator_update_dump_watermark(regulator);
}

void
vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max)
{
  histogram_reset(regulator->dump_bandwidth_hist);
  regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
  if (max > 0 && regulator->dump_bandwidth > max)
    regulator->dump_bandwidth = max;
  vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
        regulator->dump_bandwidth);
}

void
vy_regulator_reset_stat(struct vy_regulator *regulator)
{
  memset(&regulator->sched_stat_last, 0,
         sizeof(regulator->sched_stat_last));
}

/*
 * The goal of rate limiting is to ensure LSM trees stay close to
 * their perfect shape, as defined by run_size_ratio. When dump rate
 * is too high, we have to throttle database writes to ensure
 * compaction can keep up with dumps. We can't deduce optimal dump
 * bandwidth from LSM configuration, such as run_size_ratio or
 * run_count_per_level, since different spaces or different indexes
 * within a space can have different configuration settings. The
 * workload can also vary significantly from space to space. So,
 * when setting the limit, we have to consider dump and compaction
 * activities of the database as a whole.
 *
 * To this end, we keep track of compaction bandwidth and write
 * amplification of the entire database, across all LSM trees.
 * The idea is simple: observe the current write amplification
 * and compaction bandwidth, and set maximal write rate to a value
 * somewhat below the implied limit, so as to make room for
 * compaction to do more work if necessary.
 *
 * We use the following metrics to calculate the limit:
 *  - dump_output - number of bytes dumped to disk over the last
 *    observation period. The period itself is measured in dumps,
 *    not seconds, and is defined by constant VY_RECENT_DUMP_COUNT.
 *  - compaction_output - number of bytes produced by compaction
 *    over the same period.
 *  - compaction_rate - total compaction output, in bytes, divided
 *    by total time spent on doing compaction by compaction threads,
 *    both measured over the same observation period. This gives an
 *    estimate of the speed at which compaction can write output.
 *    In the real world this speed is dependent on the disk write
 *    throughput, number of dump threads, and actual dump rate, but
 *    given the goal of rate limiting is providing compaction with
 *    extra bandwidth, this metric is considered an accurate enough
 *    approximation of the disk bandwidth available to compaction.
 *
 * We calculate the compaction rate with the following formula:
 *
 *                                            compaction_output
 *     compaction_rate = compaction_threads * -----------------
 *                                             compaction_time
 *
 * where compaction_threads represents the total number of available
 * compaction threads and compaction_time is the total time, in
 * seconds, spent by all threads doing compaction. You can look at
 * the formula this way: compaction_ouptut / compaction_time gives
 * the average write speed of a single compaction thread, and by
 * multiplying it by the number of compaction threads we get the
 * compaction rate of the entire database.
 *
 * In an optimal system dump rate must be proportional to compaction
 * rate and inverse to write amplification:
 *
 *     dump_rate = compaction_rate / (write_amplification - 1)
 *
 * The latter can be obtained by dividing total output of compaction
 * by total output of dumps over the observation period:
 *
 *                           dump_output + compaction_output
 *     write_amplification = ------------------------------- =
 *                                    dump_output
 *
 *                         = 1 + compaction_output / dump_output
 *
 * Putting this all together and taking into account data compaction
 * during memory dump, we get for the max transaction rate:
 *
 *                           dump_input
 *     tx_rate = dump_rate * ----------- =
 *                           dump_output
 *
 *                                    compaction_output
 *             = compaction_threads * ----------------- *
 *                                     compaction_time
 *
 *                              dump_output      dump_input
 *                         * ----------------- * ----------- =
 *                           compaction_output   dump_output
 *
 *             = compaction_threads * dump_input / compaction_time
 *
 * We set the rate limit to 0.75 of the approximated optimal to
 * leave the database engine enough room needed to use more disk
 * bandwidth for compaction if necessary. As soon as compaction gets
 * enough disk bandwidth to keep LSM trees in optimal shape
 * compaction speed becomes stable, as does write amplification.
 */
void
vy_regulator_update_rate_limit(struct vy_regulator *regulator,
             const struct vy_scheduler_stat *stat,
             int compaction_threads)
{
  struct vy_scheduler_stat *last = &regulator->sched_stat_last;
  struct vy_scheduler_stat *recent = &regulator->sched_stat_recent;

  int32_t dump_count = stat->dump_count - last->dump_count;
  int64_t dump_input = stat->dump_input - last->dump_input;
  double compaction_time = stat->compaction_time - last->compaction_time;
  *last = *stat;

  if (dump_input < (ssize_t)VY_DUMP_SIZE_ACCT_MIN || compaction_time == 0)
    return;

  recent->dump_count += dump_count;
  recent->dump_input += dump_input;
  recent->compaction_time += compaction_time;

  double rate = 0.75 * compaction_threads * recent->dump_input /
              recent->compaction_time;
  /*
   * We can't simply use (size_t)MIN(rate, SIZE_MAX) to cast
   * the rate from double to size_t here, because on a 64-bit
   * system SIZE_MAX equals 2^64-1, which can't be represented
   * as double without loss of precision and hence is rounded
   * up to 2^64, which in turn can't be converted back to size_t.
   * So we first convert the rate to uint64_t using exp2(64) to
   * check if it fits and only then cast the uint64_t to size_t.
   */
  uint64_t rate64;
  if (rate < exp2(64))
    rate64 = rate;
  else
    rate64 = UINT64_MAX;
  vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_DISK,
        (size_t)MIN(rate64, SIZE_MAX));

  /*
   * Periodically rotate statistics for quicker adaptation
   * to workload changes.
   */
  if (recent->dump_count > VY_RECENT_DUMP_COUNT) {
    recent->dump_count /= 2;
    recent->dump_input /= 2;
    recent->compaction_time /= 2;
  }
}

Coverage Report

Created: 2024-04-24 06:23

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
3		*
4		* Redistribution and use in source and binary forms, with or
5		* without modification, are permitted provided that the following
6		* conditions are met:
7		*
8		* 1. Redistributions of source code must retain the above
9		* copyright notice, this list of conditions and the
10		* following disclaimer.
11		*
12		* 2. Redistributions in binary form must reproduce the above
13		* copyright notice, this list of conditions and the following
14		* disclaimer in the documentation and/or other materials
15		* provided with the distribution.
16		*
17		* THIS SOFTWARE IS PROVIDED BY AUTHORS ``AS IS'' AND
18		* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19		* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20		* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
21		* AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
22		* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23		* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24		* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
25		* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26		* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27		* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
28		* THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29		* SUCH DAMAGE.
30		*/
31		#include "vy_regulator.h"
32
33		#include <math.h>
34		#include <stdbool.h>
35		#include <stddef.h>
36		#include <stdint.h>
37		#include <string.h>
38		#include <tarantool_ev.h>
39
40		#include "fiber.h"
41		#include "histogram.h"
42		#include "say.h"
43		#include "trivia/util.h"
44
45		#include "vy_quota.h"
46		#include "vy_stat.h"
47
48		/**
49		* Regulator timer period, in seconds.
50		*/
51		static const double VY_REGULATOR_TIMER_PERIOD = 1;
52
53		/**
54		* Time window over which the write rate is averaged,
55		* in seconds.
56		*/
57		static const double VY_WRITE_RATE_AVG_WIN = 5;
58
59		/**
60		* Histogram percentile used for estimating dump bandwidth.
61		* For details see the comment to vy_regulator::dump_bandwidth_hist.
62		*/
63		static const int VY_DUMP_BANDWIDTH_PCT = 10;
64
65		/*
66		* Until we dump anything, assume bandwidth to be 10 MB/s,
67		* which should be fine for initial guess.
68		*/
69		static const size_t VY_DUMP_BANDWIDTH_DEFAULT = 10 * 1024 * 1024;
70
71		/**
72		* Do not take into account small dumps when estimating dump
73		* bandwidth, because they have too high overhead associated
74		* with file creation.
75		*/
76		static const size_t VY_DUMP_SIZE_ACCT_MIN = 1024 * 1024;
77
78		/**
79		* Number of dumps to take into account for rate limit calculation.
80		* Shouldn't be too small to avoid uneven RPS. Shouldn't be too big
81		* either - otherwise the rate limit will adapt too slowly to workload
82		* changes. 100 feels like a good choice.
83		*/
84		static const int VY_RECENT_DUMP_COUNT = 100;
85
86		static void
87		vy_regulator_trigger_dump(struct vy_regulator *regulator)
88	0	{
89	0	if (regulator->dump_in_progress)
90	0	return;
91
92	0	if (regulator->trigger_dump_cb(regulator) != 0)
93	0	return;
94
95	0	regulator->dump_in_progress = true;
96
97		/*
98		* To avoid unpredictably long stalls, we must limit
99		* the write rate when a dump is in progress so that
100		* we don't hit the hard limit before the dump has
101		* completed, i.e.
102		*
103		* mem_left mem_used
104		* ---------- >= --------------
105		* write_rate dump_bandwidth
106		*/
107	0	struct vy_quota *quota = regulator->quota;
108	0	size_t mem_left = (quota->used < quota->limit ?
109	0	quota->limit - quota->used : 0);
110	0	size_t mem_used = quota->used;
111	0	size_t max_write_rate = (double)mem_left / (mem_used + 1) *
112	0	regulator->dump_bandwidth;
113	0	max_write_rate = MIN(max_write_rate, regulator->dump_bandwidth);
114	0	vy_quota_set_rate_limit(quota, VY_QUOTA_RESOURCE_MEMORY,
115	0	max_write_rate);
116
117	0	say_info("dumping %zu bytes, expected rate %.1f MB/s, "
118	0	"ETA %.1f s, write rate (avg/max) %.1f/%.1f MB/s",
119	0	quota->used, (double)regulator->dump_bandwidth / 1024 / 1024,
120	0	(double)quota->used / (regulator->dump_bandwidth + 1),
121	0	(double)regulator->write_rate / 1024 / 1024,
122	0	(double)regulator->write_rate_max / 1024 / 1024);
123
124	0	regulator->write_rate_max = regulator->write_rate;
125	0	}
126
127		static void
128		vy_regulator_update_write_rate(struct vy_regulator *regulator)
129	0	{
130	0	size_t used_curr = regulator->quota->used;
131	0	size_t used_last = regulator->quota_used_last;
132
133		/*
134		* Memory can be dumped between two subsequent timer
135		* callback invocations, in which case memory usage
136		* will decrease. Ignore such observations - it's not
137		* a big deal, because dump is a rare event.
138		*/
139	0	if (used_curr < used_last) {
140	0	regulator->quota_used_last = used_curr;
141	0	return;
142	0	}
143
144	0	size_t rate_avg = regulator->write_rate;
145	0	size_t rate_curr = (used_curr - used_last) / VY_REGULATOR_TIMER_PERIOD;
146
147	0	double weight = 1 - exp(-VY_REGULATOR_TIMER_PERIOD /
148	0	VY_WRITE_RATE_AVG_WIN);
149	0	rate_avg = (1 - weight) * rate_avg + weight * rate_curr;
150
151	0	regulator->write_rate = rate_avg;
152	0	if (regulator->write_rate_max < rate_curr)
153	0	regulator->write_rate_max = rate_curr;
154	0	regulator->quota_used_last = used_curr;
155	0	}
156
157		static void
158		vy_regulator_update_dump_watermark(struct vy_regulator *regulator)
159	0	{
160	0	struct vy_quota *quota = regulator->quota;
161
162		/*
163		* Due to log structured nature of the lsregion allocator,
164		* which is used for allocating statements, we cannot free
165		* memory in chunks, only all at once. Therefore we should
166		* configure the watermark so that by the time we hit the
167		* limit, all memory have been dumped, i.e.
168		*
169		* limit - watermark watermark
170		* ----------------- = --------------
171		* write_rate dump_bandwidth
172		*
173		* Be pessimistic when predicting the write rate - use the
174		* max observed write rate multiplied by 1.5 - because it's
175		* better to start memory dump early than delay it as long
176		* as possible at the risk of experiencing unpredictably
177		* long stalls.
178		*/
179	0	size_t write_rate = regulator->write_rate_max * 3 / 2;
180	0	regulator->dump_watermark =
181	0	(double)quota->limit * regulator->dump_bandwidth /
182	0	(regulator->dump_bandwidth + write_rate + 1);
183		/*
184		* It doesn't make sense to set the watermark below 50%
185		* of the memory limit because the write rate can exceed
186		* the dump bandwidth under no circumstances.
187		*/
188	0	regulator->dump_watermark = MAX(regulator->dump_watermark,
189	0	quota->limit / 2);
190	0	}
191
192		static void
193		vy_regulator_timer_cb(ev_loop loop, ev_timer timer, int events)
194	0	{
195	0	(void)loop;
196	0	(void)events;
197
198	0	struct vy_regulator *regulator = timer->data;
199
200	0	vy_regulator_update_write_rate(regulator);
201	0	vy_regulator_update_dump_watermark(regulator);
202	0	vy_regulator_check_dump_watermark(regulator);
203	0	}
204
205		void
206		vy_regulator_create(struct vy_regulator regulator, struct vy_quota quota,
207		vy_trigger_dump_f trigger_dump_cb)
208	0	{
209	0	enum { KB = 1024, MB = KB * KB };
210	0	static int64_t dump_bandwidth_buckets[] = {
211	0	100 * KB, 200 * KB, 300 * KB, 400 * KB, 500 * KB, 600 * KB,
212	0	700 * KB, 800 * KB, 900 * KB, 1 * MB, 2 * MB, 3 * MB,
213	0	4 * MB, 5 * MB, 6 * MB, 7 * MB, 8 * MB, 9 * MB,
214	0	10 * MB, 15 * MB, 20 * MB, 25 * MB, 30 * MB, 35 * MB,
215	0	40 * MB, 45 * MB, 50 * MB, 55 * MB, 60 * MB, 65 * MB,
216	0	70 * MB, 75 * MB, 80 * MB, 85 * MB, 90 * MB, 95 * MB,
217	0	100 * MB, 200 * MB, 300 * MB, 400 * MB, 500 * MB, 600 * MB,
218	0	700 * MB, 800 * MB, 900 * MB,
219	0	};
220	0	memset(regulator, 0, sizeof(*regulator));
221	0	regulator->dump_bandwidth_hist = histogram_new(dump_bandwidth_buckets,
222	0	lengthof(dump_bandwidth_buckets));
223	0	if (regulator->dump_bandwidth_hist == NULL)
224	0	panic("failed to allocate dump bandwidth histogram");
225
226	0	regulator->quota = quota;
227	0	regulator->trigger_dump_cb = trigger_dump_cb;
228	0	ev_timer_init(&regulator->timer, vy_regulator_timer_cb, 0,
229	0	VY_REGULATOR_TIMER_PERIOD);
230	0	regulator->timer.data = regulator;
231	0	regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
232	0	regulator->dump_watermark = SIZE_MAX;
233	0	}
234
235		void
236		vy_regulator_start(struct vy_regulator *regulator)
237	0	{
238	0	regulator->quota_used_last = regulator->quota->used;
239	0	vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
240	0	regulator->dump_bandwidth);
241	0	ev_timer_start(loop(), &regulator->timer);
242	0	}
243
244		void
245		vy_regulator_destroy(struct vy_regulator *regulator)
246	0	{
247	0	ev_timer_stop(loop(), &regulator->timer);
248	0	histogram_delete(regulator->dump_bandwidth_hist);
249	0	}
250
251		void
252		vy_regulator_quota_exceeded(struct vy_regulator *regulator)
253	0	{
254	0	vy_regulator_trigger_dump(regulator);
255	0	}
256
257		void
258		vy_regulator_check_dump_watermark(struct vy_regulator *regulator)
259	0	{
260	0	if (regulator->quota->used >= regulator->dump_watermark)
261	0	vy_regulator_trigger_dump(regulator);
262	0	}
263
264		void
265		vy_regulator_dump_complete(struct vy_regulator *regulator,
266		size_t mem_dumped, double dump_duration)
267	0	{
268	0	regulator->dump_in_progress = false;
269
270	0	if (mem_dumped >= VY_DUMP_SIZE_ACCT_MIN && dump_duration > 0) {
271	0	histogram_collect(regulator->dump_bandwidth_hist,
272	0	mem_dumped / dump_duration);
273		/*
274		* To avoid unpredictably long stalls caused by
275		* mispredicting dump time duration, we need to
276		* know the worst (smallest) dump bandwidth so
277		* use a lower-bound percentile estimate.
278		*/
279	0	regulator->dump_bandwidth = histogram_percentile_lower(
280	0	regulator->dump_bandwidth_hist, VY_DUMP_BANDWIDTH_PCT);
281	0	}
282
283		/*
284		* Reset the rate limit.
285		*
286		* It doesn't make sense to allow to consume memory at
287		* a higher rate than it can be dumped so we set the rate
288		* limit to the dump bandwidth rather than disabling it
289		* completely.
290		*/
291	0	vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
292	0	regulator->dump_bandwidth);
293
294	0	if (dump_duration > 0) {
295	0	say_info("dumped %zu bytes in %.1f s, rate %.1f MB/s",
296	0	mem_dumped, dump_duration,
297	0	mem_dumped / dump_duration / 1024 / 1024);
298	0	}
299	0	}
300
301		void
302		vy_regulator_set_memory_limit(struct vy_regulator *regulator, size_t limit)
303	0	{
304	0	vy_quota_set_limit(regulator->quota, limit);
305	0	vy_regulator_update_dump_watermark(regulator);
306	0	}
307
308		void
309		vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max)
310	0	{
311	0	histogram_reset(regulator->dump_bandwidth_hist);
312	0	regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
313	0	if (max > 0 && regulator->dump_bandwidth > max)
314	0	regulator->dump_bandwidth = max;
315	0	vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
316	0	regulator->dump_bandwidth);
317	0	}
318
319		void
320		vy_regulator_reset_stat(struct vy_regulator *regulator)
321	0	{
322	0	memset(&regulator->sched_stat_last, 0,
323	0	sizeof(regulator->sched_stat_last));
324	0	}
325
326		/*
327		* The goal of rate limiting is to ensure LSM trees stay close to
328		* their perfect shape, as defined by run_size_ratio. When dump rate
329		* is too high, we have to throttle database writes to ensure
330		* compaction can keep up with dumps. We can't deduce optimal dump
331		* bandwidth from LSM configuration, such as run_size_ratio or
332		* run_count_per_level, since different spaces or different indexes
333		* within a space can have different configuration settings. The
334		* workload can also vary significantly from space to space. So,
335		* when setting the limit, we have to consider dump and compaction
336		* activities of the database as a whole.
337		*
338		* To this end, we keep track of compaction bandwidth and write
339		* amplification of the entire database, across all LSM trees.
340		* The idea is simple: observe the current write amplification
341		* and compaction bandwidth, and set maximal write rate to a value
342		* somewhat below the implied limit, so as to make room for
343		* compaction to do more work if necessary.
344		*
345		* We use the following metrics to calculate the limit:
346		* - dump_output - number of bytes dumped to disk over the last
347		* observation period. The period itself is measured in dumps,
348		* not seconds, and is defined by constant VY_RECENT_DUMP_COUNT.
349		* - compaction_output - number of bytes produced by compaction
350		* over the same period.
351		* - compaction_rate - total compaction output, in bytes, divided
352		* by total time spent on doing compaction by compaction threads,
353		* both measured over the same observation period. This gives an
354		* estimate of the speed at which compaction can write output.
355		* In the real world this speed is dependent on the disk write
356		* throughput, number of dump threads, and actual dump rate, but
357		* given the goal of rate limiting is providing compaction with
358		* extra bandwidth, this metric is considered an accurate enough
359		* approximation of the disk bandwidth available to compaction.
360		*
361		* We calculate the compaction rate with the following formula:
362		*
363		* compaction_output
364		* compaction_rate = compaction_threads * -----------------
365		* compaction_time
366		*
367		* where compaction_threads represents the total number of available
368		* compaction threads and compaction_time is the total time, in
369		* seconds, spent by all threads doing compaction. You can look at
370		* the formula this way: compaction_ouptut / compaction_time gives
371		* the average write speed of a single compaction thread, and by
372		* multiplying it by the number of compaction threads we get the
373		* compaction rate of the entire database.
374		*
375		* In an optimal system dump rate must be proportional to compaction
376		* rate and inverse to write amplification:
377		*
378		* dump_rate = compaction_rate / (write_amplification - 1)
379		*
380		* The latter can be obtained by dividing total output of compaction
381		* by total output of dumps over the observation period:
382		*
383		* dump_output + compaction_output
384		* write_amplification = ------------------------------- =
385		* dump_output
386		*
387		* = 1 + compaction_output / dump_output
388		*
389		* Putting this all together and taking into account data compaction
390		* during memory dump, we get for the max transaction rate:
391		*
392		* dump_input
393		* tx_rate = dump_rate * ----------- =
394		* dump_output
395		*
396		* compaction_output
397		* = compaction_threads * ----------------- *
398		* compaction_time
399		*
400		* dump_output dump_input
401		* * ----------------- * ----------- =
402		* compaction_output dump_output
403		*
404		* = compaction_threads * dump_input / compaction_time
405		*
406		* We set the rate limit to 0.75 of the approximated optimal to
407		* leave the database engine enough room needed to use more disk
408		* bandwidth for compaction if necessary. As soon as compaction gets
409		* enough disk bandwidth to keep LSM trees in optimal shape
410		* compaction speed becomes stable, as does write amplification.
411		*/
412		void
413		vy_regulator_update_rate_limit(struct vy_regulator *regulator,
414		const struct vy_scheduler_stat *stat,
415		int compaction_threads)
416	0	{
417	0	struct vy_scheduler_stat *last = &regulator->sched_stat_last;
418	0	struct vy_scheduler_stat *recent = &regulator->sched_stat_recent;
419
420	0	int32_t dump_count = stat->dump_count - last->dump_count;
421	0	int64_t dump_input = stat->dump_input - last->dump_input;
422	0	double compaction_time = stat->compaction_time - last->compaction_time;
423	0	last = stat;
424
425	0	if (dump_input < (ssize_t)VY_DUMP_SIZE_ACCT_MIN \|\| compaction_time == 0)
426	0	return;
427
428	0	recent->dump_count += dump_count;
429	0	recent->dump_input += dump_input;
430	0	recent->compaction_time += compaction_time;
431
432	0	double rate = 0.75 * compaction_threads * recent->dump_input /
433	0	recent->compaction_time;
434		/*
435		* We can't simply use (size_t)MIN(rate, SIZE_MAX) to cast
436		* the rate from double to size_t here, because on a 64-bit
437		* system SIZE_MAX equals 2^64-1, which can't be represented
438		* as double without loss of precision and hence is rounded
439		* up to 2^64, which in turn can't be converted back to size_t.
440		* So we first convert the rate to uint64_t using exp2(64) to
441		* check if it fits and only then cast the uint64_t to size_t.
442		*/
443	0	uint64_t rate64;
444	0	if (rate < exp2(64))
445	0	rate64 = rate;
446	0	else
447	0	rate64 = UINT64_MAX;
448	0	vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_DISK,
449	0	(size_t)MIN(rate64, SIZE_MAX));
450
451		/*
452		* Periodically rotate statistics for quicker adaptation
453		* to workload changes.
454		*/
455	0	if (recent->dump_count > VY_RECENT_DUMP_COUNT) {
456	0	recent->dump_count /= 2;
457	0	recent->dump_input /= 2;
458	0	recent->compaction_time /= 2;
459	0	}
460	0	}