Coverage Report

Created: 2026-06-22 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openvswitch/lib/dpif-netdev-perf.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017 Ericsson AB.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
#include <stdint.h>
19
20
#include "dpdk.h"
21
#include "dpif-netdev-perf.h"
22
#include "openvswitch/dynamic-string.h"
23
#include "openvswitch/vlog.h"
24
#include "ovs-numa.h"
25
#include "ovs-thread.h"
26
#include "timeval.h"
27
28
VLOG_DEFINE_THIS_MODULE(pmd_perf);
29
30
#define ITER_US_THRESHOLD 250   /* Warning threshold for iteration duration
31
                                   in microseconds. */
32
#define VHOST_QUEUE_FULL 128    /* Size of the virtio TX queue. */
33
0
#define LOG_IT_BEFORE 5         /* Number of iterations to log before
34
                                   suspicious iteration. */
35
0
#define LOG_IT_AFTER 5          /* Number of iterations to log after
36
                                   suspicious iteration. */
37
38
bool log_enabled = false;
39
bool log_extend = false;
40
static uint32_t log_it_before = LOG_IT_BEFORE;
41
static uint32_t log_it_after = LOG_IT_AFTER;
42
static uint32_t log_us_thr = ITER_US_THRESHOLD;
43
uint32_t log_q_thr = VHOST_QUEUE_FULL;
44
uint64_t iter_cycle_threshold;
45
46
static struct vlog_rate_limit latency_rl = VLOG_RATE_LIMIT_INIT(600, 600);
47
48
static uint64_t tsc_hz = 1;
49
50
void
51
pmd_perf_estimate_tsc_frequency(void)
52
0
{
53
#ifdef DPDK_NETDEV
54
    if (dpdk_available()) {
55
        tsc_hz = rte_get_tsc_hz();
56
    }
57
    if (tsc_hz > 1) {
58
        VLOG_INFO("DPDK provided TSC frequency: %"PRIu64" KHz", tsc_hz / 1000);
59
        return;
60
    }
61
#endif
62
0
    struct ovs_numa_dump *affinity;
63
0
    struct pmd_perf_stats s;
64
0
    uint64_t start, stop;
65
66
    /* DPDK is not available or returned unreliable value.
67
     * Trying to estimate. */
68
0
    affinity = ovs_numa_thread_getaffinity_dump();
69
0
    if (affinity) {
70
0
        const struct ovs_numa_info_core *core;
71
72
0
        FOR_EACH_CORE_ON_DUMP (core, affinity) {
73
            /* Setting affinity to a single core from the affinity mask to
74
             * avoid re-scheduling to another core while sleeping. */
75
0
            ovs_numa_thread_setaffinity_core(core->core_id);
76
0
            break;
77
0
        }
78
0
    }
79
80
0
    memset(&s, 0, sizeof s);
81
0
    start = cycles_counter_update(&s);
82
    /* Using xnanosleep as it's interrupt resistant.
83
     * Sleeping only 100 ms to avoid holding the main thread for too long. */
84
0
    xnanosleep(1E8);
85
0
    stop = cycles_counter_update(&s);
86
87
0
    if (affinity) {
88
        /* Restoring previous affinity. */
89
0
        ovs_numa_thread_setaffinity_dump(affinity);
90
0
        ovs_numa_dump_destroy(affinity);
91
0
    }
92
93
0
    if (stop <= start) {
94
0
        VLOG_WARN("TSC source is unreliable.");
95
0
        tsc_hz = 1;
96
0
    } else {
97
0
        tsc_hz = (stop - start) * 10;
98
0
    }
99
100
0
    VLOG_INFO("Estimated TSC frequency: %"PRIu64" KHz", tsc_hz / 1000);
101
0
}
102
103
/* Histogram functions. */
104
105
static void
106
histogram_walls_set_lin(struct histogram *hist, uint32_t min, uint32_t max)
107
0
{
108
0
    int i;
109
110
0
    ovs_assert(min < max);
111
0
    for (i = 0; i < NUM_BINS-1; i++) {
112
0
        hist->wall[i] = min + (i * (max - min)) / (NUM_BINS - 2);
113
0
    }
114
0
    hist->wall[NUM_BINS-1] = UINT32_MAX;
115
0
}
116
117
static void
118
histogram_walls_set_log(struct histogram *hist, uint32_t min, uint32_t max)
119
0
{
120
0
    int i, start, bins, wall;
121
0
    double log_min, log_max;
122
123
0
    ovs_assert(min < max);
124
0
    if (min > 0) {
125
0
        log_min = log(min);
126
0
        log_max = log(max);
127
0
        start = 0;
128
0
        bins = NUM_BINS - 1;
129
0
    } else {
130
0
        hist->wall[0] = 0;
131
0
        log_min = log(1);
132
0
        log_max = log(max);
133
0
        start = 1;
134
0
        bins = NUM_BINS - 2;
135
0
    }
136
0
    wall = start;
137
0
    for (i = 0; i < bins; i++) {
138
        /* Make sure each wall is monotonically increasing. */
139
0
        wall = MAX(wall, exp(log_min + (i * (log_max - log_min)) / (bins-1)));
140
0
        hist->wall[start + i] = wall++;
141
0
    }
142
0
    if (hist->wall[NUM_BINS-2] < max) {
143
0
        hist->wall[NUM_BINS-2] = max;
144
0
    }
145
0
    hist->wall[NUM_BINS-1] = UINT32_MAX;
146
0
}
147
148
uint64_t
149
histogram_samples(const struct histogram *hist)
150
0
{
151
0
    uint64_t samples = 0;
152
153
0
    for (int i = 0; i < NUM_BINS; i++) {
154
0
        samples += hist->bin[i];
155
0
    }
156
0
    return samples;
157
0
}
158
159
static void
160
histogram_clear(struct histogram *hist)
161
0
{
162
0
    int i;
163
164
0
    for (i = 0; i < NUM_BINS; i++) {
165
0
        hist->bin[i] = 0;
166
0
    }
167
0
}
168
169
static void
170
history_init(struct history *h)
171
0
{
172
0
    memset(h, 0, sizeof(*h));
173
0
}
174
175
void
176
pmd_perf_stats_init(struct pmd_perf_stats *s)
177
0
{
178
0
    memset(s, 0, sizeof(*s));
179
0
    ovs_mutex_init(&s->stats_mutex);
180
0
    ovs_mutex_init(&s->clear_mutex);
181
    /* Logarithmic histogram for cycles/it ranging from 500 to 24M
182
     * (corresponding to 200 ns to 9.6 ms at 2.5 GHz TSC clock). */
183
0
    histogram_walls_set_log(&s->cycles, 500, 24000000);
184
    /* Logarithmic histogram for pkts/it ranging from 0 to 1000. */
185
0
    histogram_walls_set_log(&s->pkts, 0, 1000);
186
    /* Linear histogram for cycles/pkt ranging from 100 to 30K. */
187
0
    histogram_walls_set_lin(&s->cycles_per_pkt, 100, 30000);
188
    /* Linear histogram for pkts/rx batch ranging from 0 to 32,
189
     * the maximum rx batch size in OVS. */
190
0
    histogram_walls_set_lin(&s->pkts_per_batch, 0, 32);
191
    /* Linear histogram for upcalls/it ranging from 0 to 30. */
192
0
    histogram_walls_set_lin(&s->upcalls, 0, 30);
193
    /* Logarithmic histogram for cycles/upcall ranging from 1000 to 1M
194
     * (corresponding to 400 ns to 400 us at 2.5 GHz TSC clock). */
195
0
    histogram_walls_set_log(&s->cycles_per_upcall, 1000, 1000000);
196
    /* Log. histogram for max vhostuser queue fill level from 0 to 512.
197
     * 512 is the maximum fill level for a virtio queue with 1024
198
     * descriptors (maximum configurable length in Qemu), with the
199
     * DPDK 17.11 virtio PMD in the guest. */
200
0
    histogram_walls_set_log(&s->max_vhost_qfill, 0, 512);
201
0
    s->iteration_cnt = 0;
202
0
    s->start_ms = time_msec();
203
0
    s->log_susp_it = UINT32_MAX;
204
0
    s->log_begin_it = UINT32_MAX;
205
0
    s->log_end_it = UINT32_MAX;
206
0
    s->log_reason = NULL;
207
0
}
208
209
void
210
pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
211
                              double duration, bool format_iterations)
212
0
{
213
0
    uint64_t stats[PMD_N_STATS];
214
0
    double us_per_cycle = 1000000.0 / tsc_hz;
215
216
0
    if (duration == 0) {
217
0
        return;
218
0
    }
219
220
0
    pmd_perf_read_counters(s, stats);
221
0
    uint64_t tot_cycles = stats[PMD_CYCLES_ITER_IDLE] +
222
0
                          stats[PMD_CYCLES_ITER_BUSY];
223
0
    uint64_t rx_packets = stats[PMD_STAT_RECV];
224
0
    uint64_t tx_packets = stats[PMD_STAT_SENT_PKTS];
225
0
    uint64_t tx_batches = stats[PMD_STAT_SENT_BATCHES];
226
0
    uint64_t passes = stats[PMD_STAT_RECV] +
227
0
                      stats[PMD_STAT_RECIRC];
228
0
    uint64_t upcalls = stats[PMD_STAT_MISS];
229
0
    uint64_t upcall_cycles = stats[PMD_CYCLES_UPCALL];
230
0
    uint64_t tot_iter = histogram_samples(&s->pkts);
231
0
    uint64_t idle_iter = s->pkts.bin[0];
232
0
    uint64_t busy_iter = tot_iter >= idle_iter ? tot_iter - idle_iter : 0;
233
0
    uint64_t sleep_iter = stats[PMD_SLEEP_ITER];
234
0
    uint64_t tot_sleep_cycles = stats[PMD_CYCLES_SLEEP];
235
236
0
    if (format_iterations) {
237
0
        ds_put_format(str,
238
0
            "  Iterations:         %12"PRIu64"  (%.2f us/it)\n"
239
0
            "  - Used TSC cycles:  %12"PRIu64"  (%5.1f %% of total cycles)\n"
240
0
            "  - idle iterations:  %12"PRIu64"  (%5.1f %% of used cycles)\n"
241
0
            "  - busy iterations:  %12"PRIu64"  (%5.1f %% of used cycles)\n"
242
0
            "  - sleep iterations: %12"PRIu64"  (%5.1f %% of iterations)\n"
243
0
            "  Sleep time (us):    %12.0f  (%3.0f us/iteration avg.)\n",
244
0
            tot_iter,
245
0
            tot_iter
246
0
                ? (tot_cycles + tot_sleep_cycles) * us_per_cycle / tot_iter
247
0
                : 0,
248
0
            tot_cycles, 100.0 * (tot_cycles / duration) / tsc_hz,
249
0
            idle_iter,
250
0
            tot_cycles ? 100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles : 0,
251
0
            busy_iter,
252
0
            tot_cycles ? 100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles : 0,
253
0
            sleep_iter, tot_iter ? 100.0 * sleep_iter / tot_iter : 0,
254
0
            tot_sleep_cycles * us_per_cycle,
255
0
            sleep_iter ? (tot_sleep_cycles * us_per_cycle) / sleep_iter : 0);
256
0
    }
257
0
    if (rx_packets > 0) {
258
0
        ds_put_format(str,
259
0
            "  Rx packets:         %12"PRIu64"  (%.0f Kpps",
260
0
            rx_packets, (rx_packets / duration) / 1000);
261
0
        if (tot_iter) {
262
0
            ds_put_format(str, ", %.0f cycles/pkt",
263
0
                1.0 * stats[PMD_CYCLES_ITER_BUSY] / rx_packets);
264
0
        }
265
0
        ds_put_cstr(str, ")\n");
266
267
0
        ds_put_format(str,
268
0
            "  Datapath passes:    %12"PRIu64"  (%.2f passes/pkt)\n"
269
0
            "  - PHWOL hits:       %12"PRIu64"  (%5.1f %%)\n"
270
0
            "  - Simple Match hits:%12"PRIu64"  (%5.1f %%)\n"
271
0
            "  - EMC hits:         %12"PRIu64"  (%5.1f %%)\n"
272
0
            "  - SMC hits:         %12"PRIu64"  (%5.1f %%)\n"
273
0
            "  - Megaflow hits:    %12"PRIu64"  (%5.1f %%, %.2f "
274
0
                                                 "subtbl lookups/hit)\n",
275
0
            passes, 1.0 * passes / rx_packets,
276
0
            stats[PMD_STAT_PHWOL_HIT],
277
0
            100.0 * stats[PMD_STAT_PHWOL_HIT] / passes,
278
0
            stats[PMD_STAT_SIMPLE_HIT],
279
0
            100.0 * stats[PMD_STAT_SIMPLE_HIT] / passes,
280
0
            stats[PMD_STAT_EXACT_HIT],
281
0
            100.0 * stats[PMD_STAT_EXACT_HIT] / passes,
282
0
            stats[PMD_STAT_SMC_HIT],
283
0
            100.0 * stats[PMD_STAT_SMC_HIT] / passes,
284
0
            stats[PMD_STAT_MASKED_HIT],
285
0
            100.0 * stats[PMD_STAT_MASKED_HIT] / passes,
286
0
            stats[PMD_STAT_MASKED_HIT]
287
0
            ? 1.0 * stats[PMD_STAT_MASKED_LOOKUP] / stats[PMD_STAT_MASKED_HIT]
288
0
            : 0);
289
290
0
        ds_put_format(str,
291
0
            "  - Upcalls:          %12"PRIu64"  (%5.1f %%",
292
0
            upcalls, 100.0 * upcalls / passes);
293
0
        if (tot_iter) {
294
0
            ds_put_format(str, ", %.1f us/upcall",
295
0
                upcalls ? (upcall_cycles * us_per_cycle) / upcalls : 0);
296
0
        }
297
0
        ds_put_cstr(str, ")\n");
298
299
0
        ds_put_format(str,
300
0
            "  - Lost upcalls:     %12"PRIu64"  (%5.1f %%)\n",
301
0
            stats[PMD_STAT_LOST], 100.0 * stats[PMD_STAT_LOST] / passes);
302
0
    } else {
303
0
        ds_put_format(str,
304
0
            "  Rx packets:         %12d\n", 0);
305
0
    }
306
0
    if (tx_packets > 0) {
307
0
        ds_put_format(str,
308
0
            "  Tx packets:         %12"PRIu64"  (%.0f Kpps)\n"
309
0
            "  Tx batches:         %12"PRIu64"  (%.2f pkts/batch)\n",
310
0
            tx_packets, (tx_packets / duration) / 1000,
311
0
            tx_batches, 1.0 * tx_packets / tx_batches);
312
0
    } else {
313
0
        ds_put_format(str,
314
0
            "  Tx packets:         %12d\n\n", 0);
315
0
    }
316
0
}
317
318
void
319
pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s)
320
0
{
321
0
    int i;
322
323
0
    ds_put_cstr(str, "Histograms\n");
324
0
    ds_put_format(str,
325
0
                  "   %-21s  %-21s  %-21s  %-21s  %-21s  %-21s  %-21s\n",
326
0
                  "cycles/it", "packets/it", "cycles/pkt", "pkts/batch",
327
0
                  "max vhost qlen", "upcalls/it", "cycles/upcall");
328
0
    for (i = 0; i < NUM_BINS-1; i++) {
329
0
        ds_put_format(str,
330
0
            "   %-9d %-11"PRIu64"  %-9d %-11"PRIu64"  %-9d %-11"PRIu64
331
0
            "  %-9d %-11"PRIu64"  %-9d %-11"PRIu64"  %-9d %-11"PRIu64
332
0
            "  %-9d %-11"PRIu64"\n",
333
0
            s->cycles.wall[i], s->cycles.bin[i],
334
0
            s->pkts.wall[i],s->pkts.bin[i],
335
0
            s->cycles_per_pkt.wall[i], s->cycles_per_pkt.bin[i],
336
0
            s->pkts_per_batch.wall[i], s->pkts_per_batch.bin[i],
337
0
            s->max_vhost_qfill.wall[i], s->max_vhost_qfill.bin[i],
338
0
            s->upcalls.wall[i], s->upcalls.bin[i],
339
0
            s->cycles_per_upcall.wall[i], s->cycles_per_upcall.bin[i]);
340
0
    }
341
0
    ds_put_format(str,
342
0
                  "   %-9s %-11"PRIu64"  %-9s %-11"PRIu64"  %-9s %-11"PRIu64
343
0
                  "  %-9s %-11"PRIu64"  %-9s %-11"PRIu64"  %-9s %-11"PRIu64
344
0
                  "  %-9s %-11"PRIu64"\n",
345
0
                  ">", s->cycles.bin[i],
346
0
                  ">", s->pkts.bin[i],
347
0
                  ">", s->cycles_per_pkt.bin[i],
348
0
                  ">", s->pkts_per_batch.bin[i],
349
0
                  ">", s->max_vhost_qfill.bin[i],
350
0
                  ">", s->upcalls.bin[i],
351
0
                  ">", s->cycles_per_upcall.bin[i]);
352
0
    ds_put_cstr(str,
353
0
                "-----------------------------------------------------"
354
0
                "-----------------------------------------------------"
355
0
                "------------------------------------------------\n");
356
0
    ds_put_format(str,
357
0
                  "   %-21s  %-21s  %-21s  %-21s  %-21s  %-21s  %-21s\n",
358
0
                  "cycles/it", "packets/it", "cycles/pkt", "pkts/batch",
359
0
                  "vhost qlen", "upcalls/it", "cycles/upcall");
360
0
    ds_put_format(str,
361
0
                  "   %-21"PRIu64"  %-21.5f  %-21"PRIu64
362
0
                  "  %-21.5f  %-21.5f  %-21.5f  %-21"PRIu32"\n",
363
0
                  s->totals.iterations
364
0
                      ? s->totals.cycles / s->totals.iterations : 0,
365
0
                  s->totals.iterations
366
0
                      ? 1.0 * s->totals.pkts / s->totals.iterations : 0,
367
0
                  s->totals.pkts
368
0
                      ? s->totals.busy_cycles / s->totals.pkts : 0,
369
0
                  s->totals.batches
370
0
                      ? 1.0 * s->totals.pkts / s->totals.batches : 0,
371
0
                  s->totals.iterations
372
0
                      ? 1.0 * s->totals.max_vhost_qfill / s->totals.iterations
373
0
                      : 0,
374
0
                  s->totals.iterations
375
0
                      ? 1.0 * s->totals.upcalls / s->totals.iterations : 0,
376
0
                  s->totals.upcalls
377
0
                      ? s->totals.upcall_cycles / s->totals.upcalls : 0);
378
0
}
379
380
void
381
pmd_perf_format_iteration_history(struct ds *str, struct pmd_perf_stats *s,
382
                                  int n_iter)
383
0
{
384
0
    struct iter_stats *is;
385
0
    size_t index;
386
0
    int i;
387
388
0
    if (n_iter == 0) {
389
0
        return;
390
0
    }
391
0
    ds_put_format(str, "   %-17s   %-10s   %-10s   %-10s   %-10s   "
392
0
                  "%-10s   %-10s   %-10s\n",
393
0
                  "iter", "cycles", "packets", "cycles/pkt", "pkts/batch",
394
0
                  "vhost qlen", "upcalls", "cycles/upcall");
395
0
    for (i = 1; i <= n_iter; i++) {
396
0
        index = history_sub(s->iterations.idx, i);
397
0
        is = &s->iterations.sample[index];
398
0
        ds_put_format(str,
399
0
                      "   %-17"PRIu64"   %-11"PRIu64"  %-11"PRIu32
400
0
                      "  %-11"PRIu64"  %-11"PRIu32"  %-11"PRIu32
401
0
                      "  %-11"PRIu32"  %-11"PRIu32"\n",
402
0
                      is->timestamp,
403
0
                      is->cycles,
404
0
                      is->pkts,
405
0
                      is->pkts ? is->cycles / is->pkts : 0,
406
0
                      is->batches ? is->pkts / is->batches : 0,
407
0
                      is->max_vhost_qfill,
408
0
                      is->upcalls,
409
0
                      is->upcalls ? is->upcall_cycles / is->upcalls : 0);
410
0
    }
411
0
}
412
413
void
414
pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s, int n_ms)
415
0
{
416
0
    struct iter_stats *is;
417
0
    size_t index;
418
0
    int i;
419
420
0
    if (n_ms == 0) {
421
0
        return;
422
0
    }
423
0
    ds_put_format(str,
424
0
                  "   %-12s   %-10s   %-10s   %-10s   %-10s"
425
0
                  "   %-10s   %-10s   %-10s   %-10s\n",
426
0
                  "ms", "iterations", "cycles/it", "Kpps", "cycles/pkt",
427
0
                  "pkts/batch", "vhost qlen", "upcalls", "cycles/upcall");
428
0
    for (i = 1; i <= n_ms; i++) {
429
0
        index = history_sub(s->milliseconds.idx, i);
430
0
        is = &s->milliseconds.sample[index];
431
0
        ds_put_format(str,
432
0
                      "   %-12"PRIu64"   %-11"PRIu32"  %-11"PRIu64
433
0
                      "  %-11"PRIu32"  %-11"PRIu64"  %-11"PRIu32
434
0
                      "  %-11"PRIu32"  %-11"PRIu32"  %-11"PRIu32"\n",
435
0
                      is->timestamp,
436
0
                      is->iterations,
437
0
                      is->iterations ? is->cycles / is->iterations : 0,
438
0
                      is->pkts,
439
0
                      is->pkts ? is->busy_cycles / is->pkts : 0,
440
0
                      is->batches ? is->pkts / is->batches : 0,
441
0
                      is->iterations
442
0
                          ? is->max_vhost_qfill / is->iterations : 0,
443
0
                      is->upcalls,
444
0
                      is->upcalls ? is->upcall_cycles / is->upcalls : 0);
445
0
    }
446
0
}
447
448
void
449
pmd_perf_read_counters(struct pmd_perf_stats *s,
450
                       uint64_t stats[PMD_N_STATS])
451
0
{
452
0
    uint64_t val;
453
454
    /* These loops subtracts reference values (.zero[*]) from the counters.
455
     * Since loads and stores are relaxed, it might be possible for a .zero[*]
456
     * value to be more recent than the current value we're reading from the
457
     * counter.  This is not a big problem, since these numbers are not
458
     * supposed to be 100% accurate, but we should at least make sure that
459
     * the result is not negative. */
460
0
    for (int i = 0; i < PMD_N_STATS; i++) {
461
0
        atomic_read_relaxed(&s->counters.n[i], &val);
462
0
        if (val > s->counters.zero[i]) {
463
0
            stats[i] = val - s->counters.zero[i];
464
0
        } else {
465
0
            stats[i] = 0;
466
0
        }
467
0
    }
468
0
}
469
470
/* This function clears the PMD performance counters from within the PMD
471
 * thread or from another thread when the PMD thread is not executing its
472
 * poll loop. */
473
void
474
pmd_perf_stats_clear_lock(struct pmd_perf_stats *s)
475
    OVS_REQUIRES(s->stats_mutex)
476
0
{
477
0
    ovs_mutex_lock(&s->clear_mutex);
478
0
    for (int i = 0; i < PMD_N_STATS; i++) {
479
0
        atomic_read_relaxed(&s->counters.n[i], &s->counters.zero[i]);
480
0
    }
481
    /* The following stats are only applicable in PMD thread and */
482
0
    memset(&s->current, 0 , sizeof(struct iter_stats));
483
0
    memset(&s->totals, 0 , sizeof(struct iter_stats));
484
0
    histogram_clear(&s->cycles);
485
0
    histogram_clear(&s->pkts);
486
0
    histogram_clear(&s->cycles_per_pkt);
487
0
    histogram_clear(&s->upcalls);
488
0
    histogram_clear(&s->cycles_per_upcall);
489
0
    histogram_clear(&s->pkts_per_batch);
490
0
    histogram_clear(&s->max_vhost_qfill);
491
0
    history_init(&s->iterations);
492
0
    history_init(&s->milliseconds);
493
0
    s->start_ms = time_msec();
494
0
    s->milliseconds.sample[0].timestamp = s->start_ms;
495
0
    s->log_susp_it = UINT32_MAX;
496
0
    s->log_begin_it = UINT32_MAX;
497
0
    s->log_end_it = UINT32_MAX;
498
0
    s->log_reason = NULL;
499
    /* Clearing finished. */
500
0
    s->clear = false;
501
0
    ovs_mutex_unlock(&s->clear_mutex);
502
0
}
503
504
/* This function can be called from the anywhere to clear the stats
505
 * of PMD and non-PMD threads. */
506
void
507
pmd_perf_stats_clear(struct pmd_perf_stats *s)
508
0
{
509
0
    if (ovs_mutex_trylock(&s->stats_mutex) == 0) {
510
        /* Locking successful. PMD not polling. */
511
0
        pmd_perf_stats_clear_lock(s);
512
0
        ovs_mutex_unlock(&s->stats_mutex);
513
0
    } else {
514
        /* Request the polling PMD to clear the stats. There is no need to
515
         * block here as stats retrieval is prevented during clearing. */
516
0
        s->clear = true;
517
0
    }
518
0
}
519
520
/* Functions recording PMD metrics per iteration. */
521
522
void
523
pmd_perf_start_iteration(struct pmd_perf_stats *s)
524
OVS_REQUIRES(s->stats_mutex)
525
0
{
526
0
    if (s->clear) {
527
        /* Clear the PMD stats before starting next iteration. */
528
0
        pmd_perf_stats_clear_lock(s);
529
0
    }
530
0
    s->iteration_cnt++;
531
    /* Initialize the current interval stats. */
532
0
    memset(&s->current, 0, sizeof(struct iter_stats));
533
0
    if (OVS_LIKELY(s->last_tsc)) {
534
        /* We assume here that last_tsc was updated immediately prior at
535
         * the end of the previous iteration, or just before the first
536
         * iteration. */
537
0
        s->start_tsc = s->last_tsc;
538
0
    } else {
539
        /* In case last_tsc has never been set before. */
540
0
        s->start_tsc = cycles_counter_update(s);
541
0
    }
542
0
}
543
544
void
545
pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
546
                       int tx_packets, uint64_t sleep_cycles,
547
                       bool full_metrics)
548
0
{
549
0
    uint64_t now_tsc = cycles_counter_update(s);
550
0
    struct iter_stats *cum_ms;
551
0
    uint64_t cycles, cycles_per_pkt = 0;
552
0
    char *reason = NULL;
553
554
0
    cycles = now_tsc - s->start_tsc - sleep_cycles;
555
0
    s->current.timestamp = s->iteration_cnt;
556
0
    s->current.cycles = cycles;
557
0
    s->current.pkts = rx_packets;
558
559
0
    if (rx_packets + tx_packets > 0) {
560
0
        pmd_perf_update_counter(s, PMD_CYCLES_ITER_BUSY, cycles);
561
0
    } else {
562
0
        pmd_perf_update_counter(s, PMD_CYCLES_ITER_IDLE, cycles);
563
0
    }
564
    /* Add iteration samples to histograms. */
565
0
    histogram_add_sample(&s->cycles, cycles);
566
0
    histogram_add_sample(&s->pkts, rx_packets);
567
568
0
    if (sleep_cycles) {
569
0
        pmd_perf_update_counter(s, PMD_SLEEP_ITER, 1);
570
0
        pmd_perf_update_counter(s, PMD_CYCLES_SLEEP, sleep_cycles);
571
0
    }
572
573
0
    if (!full_metrics) {
574
0
        return;
575
0
    }
576
577
0
    s->counters.n[PMD_CYCLES_UPCALL] += s->current.upcall_cycles;
578
579
0
    if (rx_packets > 0) {
580
0
        cycles_per_pkt = cycles / rx_packets;
581
0
        histogram_add_sample(&s->cycles_per_pkt, cycles_per_pkt);
582
0
    }
583
0
    histogram_add_sample(&s->upcalls, s->current.upcalls);
584
0
    histogram_add_sample(&s->max_vhost_qfill, s->current.max_vhost_qfill);
585
586
    /* Add iteration samples to millisecond stats. */
587
0
    cum_ms = history_current(&s->milliseconds);
588
0
    cum_ms->iterations++;
589
0
    cum_ms->cycles += cycles;
590
0
    if (rx_packets > 0) {
591
0
        cum_ms->busy_cycles += cycles;
592
0
    }
593
0
    cum_ms->pkts += s->current.pkts;
594
0
    cum_ms->upcalls += s->current.upcalls;
595
0
    cum_ms->upcall_cycles += s->current.upcall_cycles;
596
0
    cum_ms->batches += s->current.batches;
597
0
    cum_ms->max_vhost_qfill += s->current.max_vhost_qfill;
598
599
0
    if (log_enabled) {
600
        /* Log suspicious iterations. */
601
0
        if (cycles > iter_cycle_threshold) {
602
0
            reason = "Excessive total cycles";
603
0
        } else if (s->current.max_vhost_qfill >= log_q_thr) {
604
0
            reason = "Vhost RX queue full";
605
0
        }
606
0
        if (OVS_UNLIKELY(reason)) {
607
0
            pmd_perf_set_log_susp_iteration(s, reason);
608
0
            cycles_counter_update(s);
609
0
        }
610
611
        /* Log iteration interval around suspicious iteration when reaching
612
         * the end of the range to be logged. */
613
0
        if (OVS_UNLIKELY(s->log_end_it == s->iterations.idx)) {
614
0
            pmd_perf_log_susp_iteration_neighborhood(s);
615
0
            cycles_counter_update(s);
616
0
        }
617
0
    }
618
619
    /* Store in iteration history. This advances the iteration idx and
620
     * clears the next slot in the iteration history. */
621
0
    history_store(&s->iterations, &s->current);
622
623
0
    if (now_tsc > s->next_check_tsc) {
624
        /* Check if ms is completed and store in milliseconds history. */
625
0
        uint64_t now = time_msec();
626
0
        if (now != cum_ms->timestamp) {
627
            /* Add ms stats to totals. */
628
0
            s->totals.iterations += cum_ms->iterations;
629
0
            s->totals.cycles += cum_ms->cycles;
630
0
            s->totals.busy_cycles += cum_ms->busy_cycles;
631
0
            s->totals.pkts += cum_ms->pkts;
632
0
            s->totals.upcalls += cum_ms->upcalls;
633
0
            s->totals.upcall_cycles += cum_ms->upcall_cycles;
634
0
            s->totals.batches += cum_ms->batches;
635
0
            s->totals.max_vhost_qfill += cum_ms->max_vhost_qfill;
636
0
            cum_ms = history_next(&s->milliseconds);
637
0
            cum_ms->timestamp = now;
638
0
        }
639
        /* Do the next check after 4 us (10K cycles at 2.5 GHz TSC clock). */
640
0
        s->next_check_tsc = cycles_counter_update(s) + tsc_hz / 250000;
641
0
    }
642
0
}
643
644
/* Delay logging of the suspicious iteration and the range of iterations
645
 * around it until after the last iteration in the range to be logged.
646
 * This avoids any distortion of the measurements through the cost of
647
 * logging itself. */
648
649
void
650
pmd_perf_set_log_susp_iteration(struct pmd_perf_stats *s,
651
                                char *reason)
652
0
{
653
0
    if (s->log_susp_it == UINT32_MAX) {
654
        /* No logging scheduled yet. */
655
0
        s->log_susp_it = s->iterations.idx;
656
0
        s->log_reason = reason;
657
0
        s->log_begin_it = history_sub(s->iterations.idx, log_it_before);
658
0
        s->log_end_it = history_add(s->iterations.idx, log_it_after + 1);
659
0
    } else if (log_extend) {
660
        /* Logging was initiated earlier, we log the previous suspicious
661
         * iteration now and extend the logging interval, if possible. */
662
0
        struct iter_stats *susp = &s->iterations.sample[s->log_susp_it];
663
0
        uint32_t new_end_it, old_range, new_range;
664
665
0
        VLOG_WARN_RL(&latency_rl,
666
0
                "Suspicious iteration (%s): iter=%"PRIu64
667
0
                " duration=%"PRIu64" us\n",
668
0
                s->log_reason,
669
0
                susp->timestamp,
670
0
                (1000000L * susp->cycles) / tsc_hz);
671
672
0
        new_end_it = history_add(s->iterations.idx, log_it_after + 1);
673
0
        new_range = history_sub(new_end_it, s->log_begin_it);
674
0
        old_range = history_sub(s->log_end_it, s->log_begin_it);
675
0
        if (new_range < old_range) {
676
            /* Extended range exceeds history length. */
677
0
            new_end_it = s->log_begin_it;
678
0
        }
679
0
        s->log_susp_it = s->iterations.idx;
680
0
        s->log_reason = reason;
681
0
        s->log_end_it = new_end_it;
682
0
    }
683
0
}
684
685
void
686
pmd_perf_log_susp_iteration_neighborhood(struct pmd_perf_stats *s)
687
0
{
688
0
    ovs_assert(s->log_reason != NULL);
689
0
    ovs_assert(s->log_susp_it != UINT32_MAX);
690
691
0
    struct ds log = DS_EMPTY_INITIALIZER;
692
0
    struct iter_stats *susp = &s->iterations.sample[s->log_susp_it];
693
0
    uint32_t range = history_sub(s->log_end_it, s->log_begin_it);
694
695
0
    VLOG_WARN_RL(&latency_rl,
696
0
                 "Suspicious iteration (%s): iter=%"PRIu64
697
0
                 " duration=%"PRIu64" us\n",
698
0
                 s->log_reason,
699
0
                 susp->timestamp,
700
0
                 (1000000L * susp->cycles) / tsc_hz);
701
702
0
    pmd_perf_format_iteration_history(&log, s, range);
703
0
    VLOG_WARN_RL(&latency_rl,
704
0
                 "Neighborhood of suspicious iteration:\n"
705
0
                 "%s", ds_cstr(&log));
706
0
    ds_destroy(&log);
707
0
    s->log_susp_it = s->log_end_it = s->log_begin_it = UINT32_MAX;
708
0
    s->log_reason = NULL;
709
710
0
    if (range > 100) {
711
        /* Reset to safe default values to avoid disturbance. */
712
0
        log_it_before = LOG_IT_BEFORE;
713
0
        log_it_after = LOG_IT_AFTER;
714
0
        log_extend = false;
715
0
    }
716
0
}
717
718
void
719
pmd_perf_log_set_cmd(struct unixctl_conn *conn,
720
                 int argc, const char *argv[],
721
                 void *aux OVS_UNUSED)
722
0
{
723
0
    unsigned int it_before, it_after, us_thr, q_thr;
724
0
    bool on, extend;
725
0
    bool usage = false;
726
727
0
    on = log_enabled;
728
0
    extend = log_extend;
729
0
    it_before = log_it_before;
730
0
    it_after = log_it_after;
731
0
    q_thr = log_q_thr;
732
0
    us_thr = log_us_thr;
733
734
0
    while (argc > 1) {
735
0
        if (!strcmp(argv[1], "on")) {
736
0
            on = true;
737
0
            argc--;
738
0
            argv++;
739
0
        } else if (!strcmp(argv[1], "off")) {
740
0
            on = false;
741
0
            argc--;
742
0
            argv++;
743
0
        } else if (!strcmp(argv[1], "-e")) {
744
0
            extend = true;
745
0
            argc--;
746
0
            argv++;
747
0
        } else if (!strcmp(argv[1], "-ne")) {
748
0
            extend = false;
749
0
            argc--;
750
0
            argv++;
751
0
        } else if (!strcmp(argv[1], "-a") && argc > 2) {
752
0
            if (str_to_uint(argv[2], 10, &it_after)) {
753
0
                if (it_after > HISTORY_LEN - 2) {
754
0
                    it_after = HISTORY_LEN - 2;
755
0
                }
756
0
            } else {
757
0
                usage = true;
758
0
                break;
759
0
            }
760
0
            argc -= 2;
761
0
            argv += 2;
762
0
        } else if (!strcmp(argv[1], "-b") && argc > 2) {
763
0
            if (str_to_uint(argv[2], 10, &it_before)) {
764
0
                if (it_before > HISTORY_LEN - 2) {
765
0
                    it_before = HISTORY_LEN - 2;
766
0
                }
767
0
            } else {
768
0
                usage = true;
769
0
                break;
770
0
            }
771
0
            argc -= 2;
772
0
            argv += 2;
773
0
        } else if (!strcmp(argv[1], "-q") && argc > 2) {
774
0
            if (!str_to_uint(argv[2], 10, &q_thr)) {
775
0
                usage = true;
776
0
                break;
777
0
            }
778
0
            argc -= 2;
779
0
            argv += 2;
780
0
        } else if (!strcmp(argv[1], "-us") && argc > 2) {
781
0
            if (!str_to_uint(argv[2], 10, &us_thr)) {
782
0
                usage = true;
783
0
                break;
784
0
            }
785
0
            argc -= 2;
786
0
            argv += 2;
787
0
        } else {
788
0
            usage = true;
789
0
            break;
790
0
        }
791
0
    }
792
0
    if (it_before + it_after > HISTORY_LEN - 2) {
793
0
        it_after = HISTORY_LEN - 2 - it_before;
794
0
    }
795
796
0
    if (usage) {
797
0
        unixctl_command_reply_error(conn,
798
0
                "Usage: ovs-appctl dpif-netdev/pmd-perf-log-set "
799
0
                "[on|off] [-b before] [-a after] [-e|-ne] "
800
0
                "[-us usec] [-q qlen]");
801
0
        return;
802
0
    }
803
804
0
    VLOG_INFO("pmd-perf-log-set: %s, before=%d, after=%d, extend=%s, "
805
0
              "us_thr=%d, q_thr=%d\n",
806
0
              on ? "on" : "off", it_before, it_after,
807
0
              extend ? "true" : "false", us_thr, q_thr);
808
0
    log_enabled = on;
809
0
    log_extend = extend;
810
0
    log_it_before = it_before;
811
0
    log_it_after = it_after;
812
0
    log_q_thr = q_thr;
813
0
    log_us_thr = us_thr;
814
0
    iter_cycle_threshold = (log_us_thr * tsc_hz) / 1000000L;
815
816
0
    unixctl_command_reply(conn, "");
817
0
}