/src/tarantool/src/box/vy_regulator.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file. |
3 | | * |
4 | | * Redistribution and use in source and binary forms, with or |
5 | | * without modification, are permitted provided that the following |
6 | | * conditions are met: |
7 | | * |
8 | | * 1. Redistributions of source code must retain the above |
9 | | * copyright notice, this list of conditions and the |
10 | | * following disclaimer. |
11 | | * |
12 | | * 2. Redistributions in binary form must reproduce the above |
13 | | * copyright notice, this list of conditions and the following |
14 | | * disclaimer in the documentation and/or other materials |
15 | | * provided with the distribution. |
16 | | * |
17 | | * THIS SOFTWARE IS PROVIDED BY AUTHORS ``AS IS'' AND |
18 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
19 | | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
20 | | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL |
21 | | * AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, |
22 | | * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
23 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
24 | | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
25 | | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
26 | | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF |
28 | | * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
29 | | * SUCH DAMAGE. |
30 | | */ |
31 | | #include "vy_regulator.h" |
32 | | |
33 | | #include <math.h> |
34 | | #include <stdbool.h> |
35 | | #include <stddef.h> |
36 | | #include <stdint.h> |
37 | | #include <string.h> |
38 | | #include <tarantool_ev.h> |
39 | | |
40 | | #include "fiber.h" |
41 | | #include "histogram.h" |
42 | | #include "say.h" |
43 | | #include "trivia/util.h" |
44 | | |
45 | | #include "vy_quota.h" |
46 | | #include "vy_stat.h" |
47 | | |
48 | | /** |
49 | | * Regulator timer period, in seconds. |
50 | | */ |
51 | | static const double VY_REGULATOR_TIMER_PERIOD = 1; |
52 | | |
53 | | /** |
54 | | * Time window over which the write rate is averaged, |
55 | | * in seconds. |
56 | | */ |
57 | | static const double VY_WRITE_RATE_AVG_WIN = 5; |
58 | | |
59 | | /** |
60 | | * Histogram percentile used for estimating dump bandwidth. |
61 | | * For details see the comment to vy_regulator::dump_bandwidth_hist. |
62 | | */ |
63 | | static const int VY_DUMP_BANDWIDTH_PCT = 10; |
64 | | |
65 | | /* |
66 | | * Until we dump anything, assume bandwidth to be 10 MB/s, |
67 | | * which should be fine for initial guess. |
68 | | */ |
69 | | static const size_t VY_DUMP_BANDWIDTH_DEFAULT = 10 * 1024 * 1024; |
70 | | |
71 | | /** |
72 | | * Do not take into account small dumps when estimating dump |
73 | | * bandwidth, because they have too high overhead associated |
74 | | * with file creation. |
75 | | */ |
76 | | static const size_t VY_DUMP_SIZE_ACCT_MIN = 1024 * 1024; |
77 | | |
78 | | /** |
79 | | * Number of dumps to take into account for rate limit calculation. |
80 | | * Shouldn't be too small to avoid uneven RPS. Shouldn't be too big |
81 | | * either - otherwise the rate limit will adapt too slowly to workload |
82 | | * changes. 100 feels like a good choice. |
83 | | */ |
84 | | static const int VY_RECENT_DUMP_COUNT = 100; |
85 | | |
86 | | static void |
87 | | vy_regulator_trigger_dump(struct vy_regulator *regulator) |
88 | 0 | { |
89 | 0 | if (regulator->dump_in_progress) |
90 | 0 | return; |
91 | | |
92 | 0 | if (regulator->trigger_dump_cb(regulator) != 0) |
93 | 0 | return; |
94 | | |
95 | 0 | regulator->dump_in_progress = true; |
96 | | |
97 | | /* |
98 | | * To avoid unpredictably long stalls, we must limit |
99 | | * the write rate when a dump is in progress so that |
100 | | * we don't hit the hard limit before the dump has |
101 | | * completed, i.e. |
102 | | * |
103 | | * mem_left mem_used |
104 | | * ---------- >= -------------- |
105 | | * write_rate dump_bandwidth |
106 | | */ |
107 | 0 | struct vy_quota *quota = regulator->quota; |
108 | 0 | size_t mem_left = (quota->used < quota->limit ? |
109 | 0 | quota->limit - quota->used : 0); |
110 | 0 | size_t mem_used = quota->used; |
111 | 0 | size_t max_write_rate = (double)mem_left / (mem_used + 1) * |
112 | 0 | regulator->dump_bandwidth; |
113 | 0 | max_write_rate = MIN(max_write_rate, regulator->dump_bandwidth); |
114 | 0 | vy_quota_set_rate_limit(quota, VY_QUOTA_RESOURCE_MEMORY, |
115 | 0 | max_write_rate); |
116 | |
|
117 | 0 | say_info("dumping %zu bytes, expected rate %.1f MB/s, " |
118 | 0 | "ETA %.1f s, write rate (avg/max) %.1f/%.1f MB/s", |
119 | 0 | quota->used, (double)regulator->dump_bandwidth / 1024 / 1024, |
120 | 0 | (double)quota->used / (regulator->dump_bandwidth + 1), |
121 | 0 | (double)regulator->write_rate / 1024 / 1024, |
122 | 0 | (double)regulator->write_rate_max / 1024 / 1024); |
123 | |
|
124 | 0 | regulator->write_rate_max = regulator->write_rate; |
125 | 0 | } |
126 | | |
127 | | static void |
128 | | vy_regulator_update_write_rate(struct vy_regulator *regulator) |
129 | 0 | { |
130 | 0 | size_t used_curr = regulator->quota->used; |
131 | 0 | size_t used_last = regulator->quota_used_last; |
132 | | |
133 | | /* |
134 | | * Memory can be dumped between two subsequent timer |
135 | | * callback invocations, in which case memory usage |
136 | | * will decrease. Ignore such observations - it's not |
137 | | * a big deal, because dump is a rare event. |
138 | | */ |
139 | 0 | if (used_curr < used_last) { |
140 | 0 | regulator->quota_used_last = used_curr; |
141 | 0 | return; |
142 | 0 | } |
143 | | |
144 | 0 | size_t rate_avg = regulator->write_rate; |
145 | 0 | size_t rate_curr = (used_curr - used_last) / VY_REGULATOR_TIMER_PERIOD; |
146 | |
|
147 | 0 | double weight = 1 - exp(-VY_REGULATOR_TIMER_PERIOD / |
148 | 0 | VY_WRITE_RATE_AVG_WIN); |
149 | 0 | rate_avg = (1 - weight) * rate_avg + weight * rate_curr; |
150 | |
|
151 | 0 | regulator->write_rate = rate_avg; |
152 | 0 | if (regulator->write_rate_max < rate_curr) |
153 | 0 | regulator->write_rate_max = rate_curr; |
154 | 0 | regulator->quota_used_last = used_curr; |
155 | 0 | } |
156 | | |
157 | | static void |
158 | | vy_regulator_update_dump_watermark(struct vy_regulator *regulator) |
159 | 0 | { |
160 | 0 | struct vy_quota *quota = regulator->quota; |
161 | | |
162 | | /* |
163 | | * Due to log structured nature of the lsregion allocator, |
164 | | * which is used for allocating statements, we cannot free |
165 | | * memory in chunks, only all at once. Therefore we should |
166 | | * configure the watermark so that by the time we hit the |
167 | | * limit, all memory have been dumped, i.e. |
168 | | * |
169 | | * limit - watermark watermark |
170 | | * ----------------- = -------------- |
171 | | * write_rate dump_bandwidth |
172 | | * |
173 | | * Be pessimistic when predicting the write rate - use the |
174 | | * max observed write rate multiplied by 1.5 - because it's |
175 | | * better to start memory dump early than delay it as long |
176 | | * as possible at the risk of experiencing unpredictably |
177 | | * long stalls. |
178 | | */ |
179 | 0 | size_t write_rate = regulator->write_rate_max * 3 / 2; |
180 | 0 | regulator->dump_watermark = |
181 | 0 | (double)quota->limit * regulator->dump_bandwidth / |
182 | 0 | (regulator->dump_bandwidth + write_rate + 1); |
183 | | /* |
184 | | * It doesn't make sense to set the watermark below 50% |
185 | | * of the memory limit because the write rate can exceed |
186 | | * the dump bandwidth under no circumstances. |
187 | | */ |
188 | 0 | regulator->dump_watermark = MAX(regulator->dump_watermark, |
189 | 0 | quota->limit / 2); |
190 | 0 | } |
191 | | |
192 | | static void |
193 | | vy_regulator_timer_cb(ev_loop *loop, ev_timer *timer, int events) |
194 | 0 | { |
195 | 0 | (void)loop; |
196 | 0 | (void)events; |
197 | |
|
198 | 0 | struct vy_regulator *regulator = timer->data; |
199 | |
|
200 | 0 | vy_regulator_update_write_rate(regulator); |
201 | 0 | vy_regulator_update_dump_watermark(regulator); |
202 | 0 | vy_regulator_check_dump_watermark(regulator); |
203 | 0 | } |
204 | | |
205 | | void |
206 | | vy_regulator_create(struct vy_regulator *regulator, struct vy_quota *quota, |
207 | | vy_trigger_dump_f trigger_dump_cb) |
208 | 0 | { |
209 | 0 | enum { KB = 1024, MB = KB * KB }; |
210 | 0 | static int64_t dump_bandwidth_buckets[] = { |
211 | 0 | 100 * KB, 200 * KB, 300 * KB, 400 * KB, 500 * KB, 600 * KB, |
212 | 0 | 700 * KB, 800 * KB, 900 * KB, 1 * MB, 2 * MB, 3 * MB, |
213 | 0 | 4 * MB, 5 * MB, 6 * MB, 7 * MB, 8 * MB, 9 * MB, |
214 | 0 | 10 * MB, 15 * MB, 20 * MB, 25 * MB, 30 * MB, 35 * MB, |
215 | 0 | 40 * MB, 45 * MB, 50 * MB, 55 * MB, 60 * MB, 65 * MB, |
216 | 0 | 70 * MB, 75 * MB, 80 * MB, 85 * MB, 90 * MB, 95 * MB, |
217 | 0 | 100 * MB, 200 * MB, 300 * MB, 400 * MB, 500 * MB, 600 * MB, |
218 | 0 | 700 * MB, 800 * MB, 900 * MB, |
219 | 0 | }; |
220 | 0 | memset(regulator, 0, sizeof(*regulator)); |
221 | 0 | regulator->dump_bandwidth_hist = histogram_new(dump_bandwidth_buckets, |
222 | 0 | lengthof(dump_bandwidth_buckets)); |
223 | 0 | if (regulator->dump_bandwidth_hist == NULL) |
224 | 0 | panic("failed to allocate dump bandwidth histogram"); |
225 | | |
226 | 0 | regulator->quota = quota; |
227 | 0 | regulator->trigger_dump_cb = trigger_dump_cb; |
228 | 0 | ev_timer_init(®ulator->timer, vy_regulator_timer_cb, 0, |
229 | 0 | VY_REGULATOR_TIMER_PERIOD); |
230 | 0 | regulator->timer.data = regulator; |
231 | 0 | regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT; |
232 | 0 | regulator->dump_watermark = SIZE_MAX; |
233 | 0 | } |
234 | | |
235 | | void |
236 | | vy_regulator_start(struct vy_regulator *regulator) |
237 | 0 | { |
238 | 0 | regulator->quota_used_last = regulator->quota->used; |
239 | 0 | vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY, |
240 | 0 | regulator->dump_bandwidth); |
241 | 0 | ev_timer_start(loop(), ®ulator->timer); |
242 | 0 | } |
243 | | |
244 | | void |
245 | | vy_regulator_destroy(struct vy_regulator *regulator) |
246 | 0 | { |
247 | 0 | ev_timer_stop(loop(), ®ulator->timer); |
248 | 0 | histogram_delete(regulator->dump_bandwidth_hist); |
249 | 0 | } |
250 | | |
251 | | void |
252 | | vy_regulator_quota_exceeded(struct vy_regulator *regulator) |
253 | 0 | { |
254 | 0 | vy_regulator_trigger_dump(regulator); |
255 | 0 | } |
256 | | |
257 | | void |
258 | | vy_regulator_check_dump_watermark(struct vy_regulator *regulator) |
259 | 0 | { |
260 | 0 | if (regulator->quota->used >= regulator->dump_watermark) |
261 | 0 | vy_regulator_trigger_dump(regulator); |
262 | 0 | } |
263 | | |
264 | | void |
265 | | vy_regulator_dump_complete(struct vy_regulator *regulator, |
266 | | size_t mem_dumped, double dump_duration) |
267 | 0 | { |
268 | 0 | regulator->dump_in_progress = false; |
269 | |
|
270 | 0 | if (mem_dumped >= VY_DUMP_SIZE_ACCT_MIN && dump_duration > 0) { |
271 | 0 | histogram_collect(regulator->dump_bandwidth_hist, |
272 | 0 | mem_dumped / dump_duration); |
273 | | /* |
274 | | * To avoid unpredictably long stalls caused by |
275 | | * mispredicting dump time duration, we need to |
276 | | * know the worst (smallest) dump bandwidth so |
277 | | * use a lower-bound percentile estimate. |
278 | | */ |
279 | 0 | regulator->dump_bandwidth = histogram_percentile_lower( |
280 | 0 | regulator->dump_bandwidth_hist, VY_DUMP_BANDWIDTH_PCT); |
281 | 0 | } |
282 | | |
283 | | /* |
284 | | * Reset the rate limit. |
285 | | * |
286 | | * It doesn't make sense to allow to consume memory at |
287 | | * a higher rate than it can be dumped so we set the rate |
288 | | * limit to the dump bandwidth rather than disabling it |
289 | | * completely. |
290 | | */ |
291 | 0 | vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY, |
292 | 0 | regulator->dump_bandwidth); |
293 | |
|
294 | 0 | if (dump_duration > 0) { |
295 | 0 | say_info("dumped %zu bytes in %.1f s, rate %.1f MB/s", |
296 | 0 | mem_dumped, dump_duration, |
297 | 0 | mem_dumped / dump_duration / 1024 / 1024); |
298 | 0 | } |
299 | 0 | } |
300 | | |
301 | | void |
302 | | vy_regulator_set_memory_limit(struct vy_regulator *regulator, size_t limit) |
303 | 0 | { |
304 | 0 | vy_quota_set_limit(regulator->quota, limit); |
305 | 0 | vy_regulator_update_dump_watermark(regulator); |
306 | 0 | } |
307 | | |
308 | | void |
309 | | vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max) |
310 | 0 | { |
311 | 0 | histogram_reset(regulator->dump_bandwidth_hist); |
312 | 0 | regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT; |
313 | 0 | if (max > 0 && regulator->dump_bandwidth > max) |
314 | 0 | regulator->dump_bandwidth = max; |
315 | 0 | vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY, |
316 | 0 | regulator->dump_bandwidth); |
317 | 0 | } |
318 | | |
319 | | void |
320 | | vy_regulator_reset_stat(struct vy_regulator *regulator) |
321 | 0 | { |
322 | 0 | memset(®ulator->sched_stat_last, 0, |
323 | 0 | sizeof(regulator->sched_stat_last)); |
324 | 0 | } |
325 | | |
326 | | /* |
327 | | * The goal of rate limiting is to ensure LSM trees stay close to |
328 | | * their perfect shape, as defined by run_size_ratio. When dump rate |
329 | | * is too high, we have to throttle database writes to ensure |
330 | | * compaction can keep up with dumps. We can't deduce optimal dump |
331 | | * bandwidth from LSM configuration, such as run_size_ratio or |
332 | | * run_count_per_level, since different spaces or different indexes |
333 | | * within a space can have different configuration settings. The |
334 | | * workload can also vary significantly from space to space. So, |
335 | | * when setting the limit, we have to consider dump and compaction |
336 | | * activities of the database as a whole. |
337 | | * |
338 | | * To this end, we keep track of compaction bandwidth and write |
339 | | * amplification of the entire database, across all LSM trees. |
340 | | * The idea is simple: observe the current write amplification |
341 | | * and compaction bandwidth, and set maximal write rate to a value |
342 | | * somewhat below the implied limit, so as to make room for |
343 | | * compaction to do more work if necessary. |
344 | | * |
345 | | * We use the following metrics to calculate the limit: |
346 | | * - dump_output - number of bytes dumped to disk over the last |
347 | | * observation period. The period itself is measured in dumps, |
348 | | * not seconds, and is defined by constant VY_RECENT_DUMP_COUNT. |
349 | | * - compaction_output - number of bytes produced by compaction |
350 | | * over the same period. |
351 | | * - compaction_rate - total compaction output, in bytes, divided |
352 | | * by total time spent on doing compaction by compaction threads, |
353 | | * both measured over the same observation period. This gives an |
354 | | * estimate of the speed at which compaction can write output. |
355 | | * In the real world this speed is dependent on the disk write |
356 | | * throughput, number of dump threads, and actual dump rate, but |
357 | | * given the goal of rate limiting is providing compaction with |
358 | | * extra bandwidth, this metric is considered an accurate enough |
359 | | * approximation of the disk bandwidth available to compaction. |
360 | | * |
361 | | * We calculate the compaction rate with the following formula: |
362 | | * |
363 | | * compaction_output |
364 | | * compaction_rate = compaction_threads * ----------------- |
365 | | * compaction_time |
366 | | * |
367 | | * where compaction_threads represents the total number of available |
368 | | * compaction threads and compaction_time is the total time, in |
369 | | * seconds, spent by all threads doing compaction. You can look at |
370 | | * the formula this way: compaction_ouptut / compaction_time gives |
371 | | * the average write speed of a single compaction thread, and by |
372 | | * multiplying it by the number of compaction threads we get the |
373 | | * compaction rate of the entire database. |
374 | | * |
375 | | * In an optimal system dump rate must be proportional to compaction |
376 | | * rate and inverse to write amplification: |
377 | | * |
378 | | * dump_rate = compaction_rate / (write_amplification - 1) |
379 | | * |
380 | | * The latter can be obtained by dividing total output of compaction |
381 | | * by total output of dumps over the observation period: |
382 | | * |
383 | | * dump_output + compaction_output |
384 | | * write_amplification = ------------------------------- = |
385 | | * dump_output |
386 | | * |
387 | | * = 1 + compaction_output / dump_output |
388 | | * |
389 | | * Putting this all together and taking into account data compaction |
390 | | * during memory dump, we get for the max transaction rate: |
391 | | * |
392 | | * dump_input |
393 | | * tx_rate = dump_rate * ----------- = |
394 | | * dump_output |
395 | | * |
396 | | * compaction_output |
397 | | * = compaction_threads * ----------------- * |
398 | | * compaction_time |
399 | | * |
400 | | * dump_output dump_input |
401 | | * * ----------------- * ----------- = |
402 | | * compaction_output dump_output |
403 | | * |
404 | | * = compaction_threads * dump_input / compaction_time |
405 | | * |
406 | | * We set the rate limit to 0.75 of the approximated optimal to |
407 | | * leave the database engine enough room needed to use more disk |
408 | | * bandwidth for compaction if necessary. As soon as compaction gets |
409 | | * enough disk bandwidth to keep LSM trees in optimal shape |
410 | | * compaction speed becomes stable, as does write amplification. |
411 | | */ |
412 | | void |
413 | | vy_regulator_update_rate_limit(struct vy_regulator *regulator, |
414 | | const struct vy_scheduler_stat *stat, |
415 | | int compaction_threads) |
416 | 0 | { |
417 | 0 | struct vy_scheduler_stat *last = ®ulator->sched_stat_last; |
418 | 0 | struct vy_scheduler_stat *recent = ®ulator->sched_stat_recent; |
419 | |
|
420 | 0 | int32_t dump_count = stat->dump_count - last->dump_count; |
421 | 0 | int64_t dump_input = stat->dump_input - last->dump_input; |
422 | 0 | double compaction_time = stat->compaction_time - last->compaction_time; |
423 | 0 | *last = *stat; |
424 | |
|
425 | 0 | if (dump_input < (ssize_t)VY_DUMP_SIZE_ACCT_MIN || compaction_time == 0) |
426 | 0 | return; |
427 | | |
428 | 0 | recent->dump_count += dump_count; |
429 | 0 | recent->dump_input += dump_input; |
430 | 0 | recent->compaction_time += compaction_time; |
431 | |
|
432 | 0 | double rate = 0.75 * compaction_threads * recent->dump_input / |
433 | 0 | recent->compaction_time; |
434 | | /* |
435 | | * We can't simply use (size_t)MIN(rate, SIZE_MAX) to cast |
436 | | * the rate from double to size_t here, because on a 64-bit |
437 | | * system SIZE_MAX equals 2^64-1, which can't be represented |
438 | | * as double without loss of precision and hence is rounded |
439 | | * up to 2^64, which in turn can't be converted back to size_t. |
440 | | * So we first convert the rate to uint64_t using exp2(64) to |
441 | | * check if it fits and only then cast the uint64_t to size_t. |
442 | | */ |
443 | 0 | uint64_t rate64; |
444 | 0 | if (rate < exp2(64)) |
445 | 0 | rate64 = rate; |
446 | 0 | else |
447 | 0 | rate64 = UINT64_MAX; |
448 | 0 | vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_DISK, |
449 | 0 | (size_t)MIN(rate64, SIZE_MAX)); |
450 | | |
451 | | /* |
452 | | * Periodically rotate statistics for quicker adaptation |
453 | | * to workload changes. |
454 | | */ |
455 | 0 | if (recent->dump_count > VY_RECENT_DUMP_COUNT) { |
456 | 0 | recent->dump_count /= 2; |
457 | 0 | recent->dump_input /= 2; |
458 | 0 | recent->compaction_time /= 2; |
459 | 0 | } |
460 | 0 | } |