/src/openvswitch/lib/dpif-netdev-perf.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017 Ericsson AB. |
3 | | * |
4 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | * you may not use this file except in compliance with the License. |
6 | | * You may obtain a copy of the License at: |
7 | | * |
8 | | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | | * |
10 | | * Unless required by applicable law or agreed to in writing, software |
11 | | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | * See the License for the specific language governing permissions and |
14 | | * limitations under the License. |
15 | | */ |
16 | | |
17 | | #include <config.h> |
18 | | #include <stdint.h> |
19 | | |
20 | | #include "dpdk.h" |
21 | | #include "dpif-netdev-perf.h" |
22 | | #include "openvswitch/dynamic-string.h" |
23 | | #include "openvswitch/vlog.h" |
24 | | #include "ovs-numa.h" |
25 | | #include "ovs-thread.h" |
26 | | #include "timeval.h" |
27 | | |
28 | | VLOG_DEFINE_THIS_MODULE(pmd_perf); |
29 | | |
30 | | #define ITER_US_THRESHOLD 250 /* Warning threshold for iteration duration |
31 | | in microseconds. */ |
32 | | #define VHOST_QUEUE_FULL 128 /* Size of the virtio TX queue. */ |
33 | 0 | #define LOG_IT_BEFORE 5 /* Number of iterations to log before |
34 | | suspicious iteration. */ |
35 | 0 | #define LOG_IT_AFTER 5 /* Number of iterations to log after |
36 | | suspicious iteration. */ |
37 | | |
38 | | bool log_enabled = false; |
39 | | bool log_extend = false; |
40 | | static uint32_t log_it_before = LOG_IT_BEFORE; |
41 | | static uint32_t log_it_after = LOG_IT_AFTER; |
42 | | static uint32_t log_us_thr = ITER_US_THRESHOLD; |
43 | | uint32_t log_q_thr = VHOST_QUEUE_FULL; |
44 | | uint64_t iter_cycle_threshold; |
45 | | |
46 | | static struct vlog_rate_limit latency_rl = VLOG_RATE_LIMIT_INIT(600, 600); |
47 | | |
48 | | static uint64_t tsc_hz = 1; |
49 | | |
/* Determine the TSC frequency (in Hz) used to convert cycle counts into
 * wall-clock time.  Prefers the frequency reported by DPDK; otherwise
 * estimates it by measuring the TSC delta across a 100 ms sleep.  Stores
 * the result in the file-global 'tsc_hz' (left at 1 if the TSC source is
 * unreliable). */
void
pmd_perf_estimate_tsc_frequency(void)
{
#ifdef DPDK_NETDEV
    if (dpdk_available()) {
        tsc_hz = rte_get_tsc_hz();
    }
    if (tsc_hz > 1) {
        VLOG_INFO("DPDK provided TSC frequency: %"PRIu64" KHz", tsc_hz / 1000);
        return;
    }
#endif
    struct ovs_numa_dump *affinity;
    struct pmd_perf_stats s;    /* Dummy stats, only needed to be able to
                                 * call cycles_counter_update(). */
    uint64_t start, stop;

    /* DPDK is not available or returned unreliable value.
     * Trying to estimate. */
    affinity = ovs_numa_thread_getaffinity_dump();
    if (affinity) {
        const struct ovs_numa_info_core *core;

        FOR_EACH_CORE_ON_DUMP (core, affinity) {
            /* Setting affinity to a single core from the affinity mask to
             * avoid re-scheduling to another core while sleeping. */
            ovs_numa_thread_setaffinity_core(core->core_id);
            break;
        }
    }

    memset(&s, 0, sizeof s);
    start = cycles_counter_update(&s);
    /* Using xnanosleep as it's interrupt resistant.
     * Sleeping only 100 ms to avoid holding the main thread for too long. */
    xnanosleep(1E8);
    stop = cycles_counter_update(&s);

    if (affinity) {
        /* Restoring previous affinity. */
        ovs_numa_thread_setaffinity_dump(affinity);
        ovs_numa_dump_destroy(affinity);
    }

    if (stop <= start) {
        VLOG_WARN("TSC source is unreliable.");
        tsc_hz = 1;
    } else {
        /* Slept for 1/10th of a second, so scale the measured cycle count
         * by 10 to obtain cycles per second. */
        tsc_hz = (stop - start) * 10;
    }

    VLOG_INFO("Estimated TSC frequency: %"PRIu64" KHz", tsc_hz / 1000);
}
102 | | |
103 | | /* Histogram functions. */ |
104 | | |
105 | | static void |
106 | | histogram_walls_set_lin(struct histogram *hist, uint32_t min, uint32_t max) |
107 | 0 | { |
108 | 0 | int i; |
109 | |
|
110 | 0 | ovs_assert(min < max); |
111 | 0 | for (i = 0; i < NUM_BINS-1; i++) { |
112 | 0 | hist->wall[i] = min + (i * (max - min)) / (NUM_BINS - 2); |
113 | 0 | } |
114 | 0 | hist->wall[NUM_BINS-1] = UINT32_MAX; |
115 | 0 | } |
116 | | |
/* Configure 'hist' with bin walls spaced logarithmically between 'min' and
 * 'max'.  If 'min' is zero, the first bin is dedicated to zero samples and
 * the logarithmic scale starts at 1 (log(0) is undefined).  The final wall
 * is UINT32_MAX so the last bin catches all samples above 'max'. */
static void
histogram_walls_set_log(struct histogram *hist, uint32_t min, uint32_t max)
{
    int i, start, bins, wall;
    double log_min, log_max;

    ovs_assert(min < max);
    if (min > 0) {
        log_min = log(min);
        log_max = log(max);
        start = 0;
        bins = NUM_BINS - 1;
    } else {
        /* Reserve bin 0 for zero-valued samples and lay out the
         * logarithmic walls from 1 upwards. */
        hist->wall[0] = 0;
        log_min = log(1);
        log_max = log(max);
        start = 1;
        bins = NUM_BINS - 2;
    }
    wall = start;
    for (i = 0; i < bins; i++) {
        /* Make sure each wall is monotonically increasing. */
        wall = MAX(wall, exp(log_min + (i * (log_max - log_min)) / (bins-1)));
        hist->wall[start + i] = wall++;
    }
    /* Rounding in the loop may leave the last regular wall below 'max';
     * force it so the intended range is fully covered. */
    if (hist->wall[NUM_BINS-2] < max) {
        hist->wall[NUM_BINS-2] = max;
    }
    /* Overflow bin. */
    hist->wall[NUM_BINS-1] = UINT32_MAX;
}
147 | | |
148 | | uint64_t |
149 | | histogram_samples(const struct histogram *hist) |
150 | 0 | { |
151 | 0 | uint64_t samples = 0; |
152 | |
|
153 | 0 | for (int i = 0; i < NUM_BINS; i++) { |
154 | 0 | samples += hist->bin[i]; |
155 | 0 | } |
156 | 0 | return samples; |
157 | 0 | } |
158 | | |
159 | | static void |
160 | | histogram_clear(struct histogram *hist) |
161 | 0 | { |
162 | 0 | int i; |
163 | |
|
164 | 0 | for (i = 0; i < NUM_BINS; i++) { |
165 | 0 | hist->bin[i] = 0; |
166 | 0 | } |
167 | 0 | } |
168 | | |
169 | | static void |
170 | | history_init(struct history *h) |
171 | 0 | { |
172 | 0 | memset(h, 0, sizeof(*h)); |
173 | 0 | } |
174 | | |
/* Initialize the performance statistics 's' of one PMD thread: zero all
 * state, initialize the mutexes, set up the bin walls of all histograms
 * and reset the suspicious-iteration logging markers. */
void
pmd_perf_stats_init(struct pmd_perf_stats *s)
{
    memset(s, 0, sizeof(*s));
    ovs_mutex_init(&s->stats_mutex);
    ovs_mutex_init(&s->clear_mutex);
    /* Logarithmic histogram for cycles/it ranging from 500 to 24M
     * (corresponding to 200 ns to 9.6 ms at 2.5 GHz TSC clock). */
    histogram_walls_set_log(&s->cycles, 500, 24000000);
    /* Logarithmic histogram for pkts/it ranging from 0 to 1000. */
    histogram_walls_set_log(&s->pkts, 0, 1000);
    /* Linear histogram for cycles/pkt ranging from 100 to 30K. */
    histogram_walls_set_lin(&s->cycles_per_pkt, 100, 30000);
    /* Linear histogram for pkts/rx batch ranging from 0 to 32,
     * the maximum rx batch size in OVS. */
    histogram_walls_set_lin(&s->pkts_per_batch, 0, 32);
    /* Linear histogram for upcalls/it ranging from 0 to 30. */
    histogram_walls_set_lin(&s->upcalls, 0, 30);
    /* Logarithmic histogram for cycles/upcall ranging from 1000 to 1M
     * (corresponding to 400 ns to 400 us at 2.5 GHz TSC clock). */
    histogram_walls_set_log(&s->cycles_per_upcall, 1000, 1000000);
    /* Log. histogram for max vhostuser queue fill level from 0 to 512.
     * 512 is the maximum fill level for a virtio queue with 1024
     * descriptors (maximum configurable length in Qemu), with the
     * DPDK 17.11 virtio PMD in the guest. */
    histogram_walls_set_log(&s->max_vhost_qfill, 0, 512);
    s->iteration_cnt = 0;
    s->start_ms = time_msec();
    /* UINT32_MAX marks "no suspicious-iteration logging scheduled". */
    s->log_susp_it = UINT32_MAX;
    s->log_begin_it = UINT32_MAX;
    s->log_end_it = UINT32_MAX;
    s->log_reason = NULL;
}
208 | | |
/* Format the overall performance statistics collected in 's' into 'str'.
 * 'duration' is the length of the measurement period in seconds (a zero
 * duration prints nothing).  Counters are reported relative to the last
 * clearing of the stats (see pmd_perf_read_counters()). */
void
pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
                              double duration)
{
    uint64_t stats[PMD_N_STATS];
    /* Conversion factor from TSC cycles to microseconds. */
    double us_per_cycle = 1000000.0 / tsc_hz;

    if (duration == 0) {
        return;
    }

    pmd_perf_read_counters(s, stats);
    uint64_t tot_cycles = stats[PMD_CYCLES_ITER_IDLE] +
                          stats[PMD_CYCLES_ITER_BUSY];
    uint64_t rx_packets = stats[PMD_STAT_RECV];
    uint64_t tx_packets = stats[PMD_STAT_SENT_PKTS];
    uint64_t tx_batches = stats[PMD_STAT_SENT_BATCHES];
    /* Each received or recirculated packet makes one pass through the
     * datapath classification stages. */
    uint64_t passes = stats[PMD_STAT_RECV] +
                      stats[PMD_STAT_RECIRC];
    uint64_t upcalls = stats[PMD_STAT_MISS];
    uint64_t upcall_cycles = stats[PMD_CYCLES_UPCALL];
    /* Every iteration contributes exactly one sample to the pkts
     * histogram, so its sample count equals the iteration count; bin 0
     * holds the iterations that received no packets. */
    uint64_t tot_iter = histogram_samples(&s->pkts);
    uint64_t idle_iter = s->pkts.bin[0];
    uint64_t busy_iter = tot_iter >= idle_iter ? tot_iter - idle_iter : 0;
    uint64_t sleep_iter = stats[PMD_SLEEP_ITER];
    uint64_t tot_sleep_cycles = stats[PMD_CYCLES_SLEEP];

    ds_put_format(str,
            "  Iterations:        %12"PRIu64"  (%.2f us/it)\n"
            "  - Used TSC cycles: %12"PRIu64"  (%5.1f %% of total cycles)\n"
            "  - idle iterations: %12"PRIu64"  (%5.1f %% of used cycles)\n"
            "  - busy iterations: %12"PRIu64"  (%5.1f %% of used cycles)\n"
            "  - sleep iterations:%12"PRIu64"  (%5.1f %% of iterations)\n"
            "  Sleep time (us):   %12.0f  (%3.0f us/iteration avg.)\n",
            tot_iter,
            tot_iter
            ? (tot_cycles + tot_sleep_cycles) * us_per_cycle / tot_iter
            : 0,
            tot_cycles, 100.0 * (tot_cycles / duration) / tsc_hz,
            idle_iter,
            tot_cycles ? 100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles : 0,
            busy_iter,
            tot_cycles ? 100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles : 0,
            sleep_iter, tot_iter ? 100.0 * sleep_iter / tot_iter : 0,
            tot_sleep_cycles * us_per_cycle,
            sleep_iter ? (tot_sleep_cycles * us_per_cycle) / sleep_iter : 0);
    if (rx_packets > 0) {
        ds_put_format(str,
            "  Rx packets:        %12"PRIu64"  (%.0f Kpps, %.0f cycles/pkt)\n"
            "  Datapath passes:   %12"PRIu64"  (%.2f passes/pkt)\n"
            "  - PHWOL hits:      %12"PRIu64"  (%5.1f %%)\n"
            "  - MFEX Opt hits:   %12"PRIu64"  (%5.1f %%)\n"
            "  - Simple Match hits:%12"PRIu64" (%5.1f %%)\n"
            "  - EMC hits:        %12"PRIu64"  (%5.1f %%)\n"
            "  - SMC hits:        %12"PRIu64"  (%5.1f %%)\n"
            "  - Megaflow hits:   %12"PRIu64"  (%5.1f %%, %.2f "
                "subtbl lookups/hit)\n"
            "  - Upcalls:         %12"PRIu64"  (%5.1f %%, %.1f us/upcall)\n"
            "  - Lost upcalls:    %12"PRIu64"  (%5.1f %%)\n",
            rx_packets, (rx_packets / duration) / 1000,
            1.0 * stats[PMD_CYCLES_ITER_BUSY] / rx_packets,
            passes, 1.0 * passes / rx_packets,
            stats[PMD_STAT_PHWOL_HIT],
            100.0 * stats[PMD_STAT_PHWOL_HIT] / passes,
            stats[PMD_STAT_MFEX_OPT_HIT],
            100.0 * stats[PMD_STAT_MFEX_OPT_HIT] / passes,
            stats[PMD_STAT_SIMPLE_HIT],
            100.0 * stats[PMD_STAT_SIMPLE_HIT] / passes,
            stats[PMD_STAT_EXACT_HIT],
            100.0 * stats[PMD_STAT_EXACT_HIT] / passes,
            stats[PMD_STAT_SMC_HIT],
            100.0 * stats[PMD_STAT_SMC_HIT] / passes,
            stats[PMD_STAT_MASKED_HIT],
            100.0 * stats[PMD_STAT_MASKED_HIT] / passes,
            stats[PMD_STAT_MASKED_HIT]
            ? 1.0 * stats[PMD_STAT_MASKED_LOOKUP] / stats[PMD_STAT_MASKED_HIT]
            : 0,
            upcalls, 100.0 * upcalls / passes,
            upcalls ? (upcall_cycles * us_per_cycle) / upcalls : 0,
            stats[PMD_STAT_LOST],
            100.0 * stats[PMD_STAT_LOST] / passes);
    } else {
        ds_put_format(str,
                "  Rx packets:        %12d\n", 0);
    }
    if (tx_packets > 0) {
        ds_put_format(str,
                "  Tx packets:        %12"PRIu64"  (%.0f Kpps)\n"
                "  Tx batches:        %12"PRIu64"  (%.2f pkts/batch)\n",
                tx_packets, (tx_packets / duration) / 1000,
                tx_batches, 1.0 * tx_packets / tx_batches);
    } else {
        ds_put_format(str,
                "  Tx packets:        %12d\n\n", 0);
    }
}
305 | | |
/* Format all histograms in 's' into 'str': one column per histogram with
 * "wall count" pairs per bin, a final row for the overflow bin, and a
 * summary line of mean values computed from the accumulated millisecond
 * totals ('s->totals'). */
void
pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s)
{
    int i;

    ds_put_cstr(str, "Histograms\n");
    ds_put_format(str,
                  "   %-21s  %-21s  %-21s  %-21s  %-21s  %-21s  %-21s\n",
                  "cycles/it", "packets/it", "cycles/pkt", "pkts/batch",
                  "max vhost qlen", "upcalls/it", "cycles/upcall");
    for (i = 0; i < NUM_BINS-1; i++) {
        ds_put_format(str,
            "   %-9d %-11"PRIu64"  %-9d %-11"PRIu64"  %-9d %-11"PRIu64
            "  %-9d %-11"PRIu64"  %-9d %-11"PRIu64"  %-9d %-11"PRIu64
            "  %-9d %-11"PRIu64"\n",
            s->cycles.wall[i], s->cycles.bin[i],
            s->pkts.wall[i], s->pkts.bin[i],
            s->cycles_per_pkt.wall[i], s->cycles_per_pkt.bin[i],
            s->pkts_per_batch.wall[i], s->pkts_per_batch.bin[i],
            s->max_vhost_qfill.wall[i], s->max_vhost_qfill.bin[i],
            s->upcalls.wall[i], s->upcalls.bin[i],
            s->cycles_per_upcall.wall[i], s->cycles_per_upcall.bin[i]);
    }
    /* After the loop i == NUM_BINS-1, so this prints the overflow bins. */
    ds_put_format(str,
                  "   %-9s %-11"PRIu64"  %-9s %-11"PRIu64"  %-9s %-11"PRIu64
                  "  %-9s %-11"PRIu64"  %-9s %-11"PRIu64"  %-9s %-11"PRIu64
                  "  %-9s %-11"PRIu64"\n",
                  ">", s->cycles.bin[i],
                  ">", s->pkts.bin[i],
                  ">", s->cycles_per_pkt.bin[i],
                  ">", s->pkts_per_batch.bin[i],
                  ">", s->max_vhost_qfill.bin[i],
                  ">", s->upcalls.bin[i],
                  ">", s->cycles_per_upcall.bin[i]);
    ds_put_cstr(str,
                "-----------------------------------------------------"
                "-----------------------------------------------------"
                "------------------------------------------------\n");
    ds_put_format(str,
                  "   %-21s  %-21s  %-21s  %-21s  %-21s  %-21s  %-21s\n",
                  "cycles/it", "packets/it", "cycles/pkt", "pkts/batch",
                  "vhost qlen", "upcalls/it", "cycles/upcall");
    /* Mean values over the measurement period, guarded against division
     * by zero when no samples were collected. */
    ds_put_format(str,
                  "   %-21"PRIu64"  %-21.5f  %-21"PRIu64
                  "  %-21.5f  %-21.5f  %-21.5f  %-21"PRIu32"\n",
                  s->totals.iterations
                  ? s->totals.cycles / s->totals.iterations : 0,
                  s->totals.iterations
                  ? 1.0 * s->totals.pkts / s->totals.iterations : 0,
                  s->totals.pkts
                  ? s->totals.busy_cycles / s->totals.pkts : 0,
                  s->totals.batches
                  ? 1.0 * s->totals.pkts / s->totals.batches : 0,
                  s->totals.iterations
                  ? 1.0 * s->totals.max_vhost_qfill / s->totals.iterations
                  : 0,
                  s->totals.iterations
                  ? 1.0 * s->totals.upcalls / s->totals.iterations : 0,
                  s->totals.upcalls
                  ? s->totals.upcall_cycles / s->totals.upcalls : 0);
}
367 | | |
/* Format the stats of the last 'n_iter' iterations from the iteration
 * history of 's' into 'str', most recent iteration first. */
void
pmd_perf_format_iteration_history(struct ds *str, struct pmd_perf_stats *s,
                                  int n_iter)
{
    struct iter_stats *is;
    size_t index;
    int i;

    if (n_iter == 0) {
        return;
    }
    ds_put_format(str, "   %-17s   %-10s  %-10s  %-10s  %-10s  "
                  "%-10s  %-10s  %-10s\n",
                  "iter", "cycles", "packets", "cycles/pkt", "pkts/batch",
                  "vhost qlen", "upcalls", "cycles/upcall");
    /* Walk backwards from the current history index. */
    for (i = 1; i <= n_iter; i++) {
        index = history_sub(s->iterations.idx, i);
        is = &s->iterations.sample[index];
        ds_put_format(str,
                      "   %-17"PRIu64"   %-11"PRIu64"  %-11"PRIu32
                      "  %-11"PRIu64"  %-11"PRIu32"  %-11"PRIu32
                      "  %-11"PRIu32"  %-11"PRIu32"\n",
                      is->timestamp,
                      is->cycles,
                      is->pkts,
                      is->pkts ? is->cycles / is->pkts : 0,
                      is->batches ? is->pkts / is->batches : 0,
                      is->max_vhost_qfill,
                      is->upcalls,
                      is->upcalls ? is->upcall_cycles / is->upcalls : 0);
    }
}
400 | | |
/* Format the aggregated per-millisecond stats of the last 'n_ms'
 * milliseconds from the millisecond history of 's' into 'str', most
 * recent millisecond first. */
void
pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s, int n_ms)
{
    struct iter_stats *is;
    size_t index;
    int i;

    if (n_ms == 0) {
        return;
    }
    ds_put_format(str,
                  "   %-12s   %-10s  %-10s  %-10s  %-10s"
                  "  %-10s  %-10s  %-10s  %-10s\n",
                  "ms", "iterations", "cycles/it", "Kpps", "cycles/pkt",
                  "pkts/batch", "vhost qlen", "upcalls", "cycles/upcall");
    /* Walk backwards from the current history index. */
    for (i = 1; i <= n_ms; i++) {
        index = history_sub(s->milliseconds.idx, i);
        is = &s->milliseconds.sample[index];
        ds_put_format(str,
                      "   %-12"PRIu64"   %-11"PRIu32"  %-11"PRIu64
                      "  %-11"PRIu32"  %-11"PRIu64"  %-11"PRIu32
                      "  %-11"PRIu32"  %-11"PRIu32"  %-11"PRIu32"\n",
                      is->timestamp,
                      is->iterations,
                      is->iterations ? is->cycles / is->iterations : 0,
                      is->pkts,
                      is->pkts ? is->busy_cycles / is->pkts : 0,
                      is->batches ? is->pkts / is->batches : 0,
                      is->iterations
                      ? is->max_vhost_qfill / is->iterations : 0,
                      is->upcalls,
                      is->upcalls ? is->upcall_cycles / is->upcalls : 0);
    }
}
435 | | |
436 | | void |
437 | | pmd_perf_read_counters(struct pmd_perf_stats *s, |
438 | | uint64_t stats[PMD_N_STATS]) |
439 | 0 | { |
440 | 0 | uint64_t val; |
441 | | |
442 | | /* These loops subtracts reference values (.zero[*]) from the counters. |
443 | | * Since loads and stores are relaxed, it might be possible for a .zero[*] |
444 | | * value to be more recent than the current value we're reading from the |
445 | | * counter. This is not a big problem, since these numbers are not |
446 | | * supposed to be 100% accurate, but we should at least make sure that |
447 | | * the result is not negative. */ |
448 | 0 | for (int i = 0; i < PMD_N_STATS; i++) { |
449 | 0 | atomic_read_relaxed(&s->counters.n[i], &val); |
450 | 0 | if (val > s->counters.zero[i]) { |
451 | 0 | stats[i] = val - s->counters.zero[i]; |
452 | 0 | } else { |
453 | 0 | stats[i] = 0; |
454 | 0 | } |
455 | 0 | } |
456 | 0 | } |
457 | | |
/* This function clears the PMD performance counters from within the PMD
 * thread or from another thread when the PMD thread is not executing its
 * poll loop.  Callers must hold s->stats_mutex. */
void
pmd_perf_stats_clear_lock(struct pmd_perf_stats *s)
    OVS_REQUIRES(s->stats_mutex)
{
    /* Hold clear_mutex for the duration of the reset so that the clearing
     * appears atomic to any code synchronizing on it. */
    ovs_mutex_lock(&s->clear_mutex);
    /* Snapshot the live counters as new reference ("zero") values;
     * pmd_perf_read_counters() reports counters relative to these. */
    for (int i = 0; i < PMD_N_STATS; i++) {
        atomic_read_relaxed(&s->counters.n[i], &s->counters.zero[i]);
    }
    /* The following stats are maintained locally by the PMD thread and
     * can simply be reset in place. */
    memset(&s->current, 0 , sizeof(struct iter_stats));
    memset(&s->totals, 0 , sizeof(struct iter_stats));
    histogram_clear(&s->cycles);
    histogram_clear(&s->pkts);
    histogram_clear(&s->cycles_per_pkt);
    histogram_clear(&s->upcalls);
    histogram_clear(&s->cycles_per_upcall);
    histogram_clear(&s->pkts_per_batch);
    histogram_clear(&s->max_vhost_qfill);
    history_init(&s->iterations);
    history_init(&s->milliseconds);
    s->start_ms = time_msec();
    s->milliseconds.sample[0].timestamp = s->start_ms;
    /* Cancel any scheduled suspicious-iteration logging. */
    s->log_susp_it = UINT32_MAX;
    s->log_begin_it = UINT32_MAX;
    s->log_end_it = UINT32_MAX;
    s->log_reason = NULL;
    /* Clearing finished. */
    s->clear = false;
    ovs_mutex_unlock(&s->clear_mutex);
}
491 | | |
492 | | /* This function can be called from the anywhere to clear the stats |
493 | | * of PMD and non-PMD threads. */ |
494 | | void |
495 | | pmd_perf_stats_clear(struct pmd_perf_stats *s) |
496 | 0 | { |
497 | 0 | if (ovs_mutex_trylock(&s->stats_mutex) == 0) { |
498 | | /* Locking successful. PMD not polling. */ |
499 | 0 | pmd_perf_stats_clear_lock(s); |
500 | 0 | ovs_mutex_unlock(&s->stats_mutex); |
501 | 0 | } else { |
502 | | /* Request the polling PMD to clear the stats. There is no need to |
503 | | * block here as stats retrieval is prevented during clearing. */ |
504 | 0 | s->clear = true; |
505 | 0 | } |
506 | 0 | } |
507 | | |
508 | | /* Functions recording PMD metrics per iteration. */ |
509 | | |
510 | | void |
511 | | pmd_perf_start_iteration(struct pmd_perf_stats *s) |
512 | | OVS_REQUIRES(s->stats_mutex) |
513 | 0 | { |
514 | 0 | if (s->clear) { |
515 | | /* Clear the PMD stats before starting next iteration. */ |
516 | 0 | pmd_perf_stats_clear_lock(s); |
517 | 0 | } |
518 | 0 | s->iteration_cnt++; |
519 | | /* Initialize the current interval stats. */ |
520 | 0 | memset(&s->current, 0, sizeof(struct iter_stats)); |
521 | 0 | if (OVS_LIKELY(s->last_tsc)) { |
522 | | /* We assume here that last_tsc was updated immediately prior at |
523 | | * the end of the previous iteration, or just before the first |
524 | | * iteration. */ |
525 | 0 | s->start_tsc = s->last_tsc; |
526 | 0 | } else { |
527 | | /* In case last_tsc has never been set before. */ |
528 | 0 | s->start_tsc = cycles_counter_update(s); |
529 | 0 | } |
530 | 0 | } |
531 | | |
/* Close the current PMD iteration: account the cycles spent (excluding
 * 'sleep_cycles'), update the busy/idle/sleep counters and, when
 * 'full_metrics' is set, the histograms, the per-millisecond aggregation,
 * the totals and the suspicious-iteration logging.  'rx_packets' and
 * 'tx_packets' are the packets received resp. sent in this iteration. */
void
pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
                       int tx_packets, uint64_t sleep_cycles,
                       bool full_metrics)
{
    uint64_t now_tsc = cycles_counter_update(s);
    struct iter_stats *cum_ms;
    uint64_t cycles, cycles_per_pkt = 0;
    char *reason = NULL;

    /* Time spent sleeping is accounted separately, not as iteration
     * cycles. */
    cycles = now_tsc - s->start_tsc - sleep_cycles;
    s->current.timestamp = s->iteration_cnt;
    s->current.cycles = cycles;
    s->current.pkts = rx_packets;

    /* An iteration counts as busy if it handled any rx or tx packets. */
    if (rx_packets + tx_packets > 0) {
        pmd_perf_update_counter(s, PMD_CYCLES_ITER_BUSY, cycles);
    } else {
        pmd_perf_update_counter(s, PMD_CYCLES_ITER_IDLE, cycles);
    }
    /* Add iteration samples to histograms. */
    histogram_add_sample(&s->cycles, cycles);
    histogram_add_sample(&s->pkts, rx_packets);

    if (sleep_cycles) {
        pmd_perf_update_counter(s, PMD_SLEEP_ITER, 1);
        pmd_perf_update_counter(s, PMD_CYCLES_SLEEP, sleep_cycles);
    }

    if (!full_metrics) {
        return;
    }

    /* NOTE(review): plain (non-atomic) add; assumes only the PMD thread
     * itself writes this counter. */
    s->counters.n[PMD_CYCLES_UPCALL] += s->current.upcall_cycles;

    if (rx_packets > 0) {
        cycles_per_pkt = cycles / rx_packets;
        histogram_add_sample(&s->cycles_per_pkt, cycles_per_pkt);
    }
    histogram_add_sample(&s->upcalls, s->current.upcalls);
    histogram_add_sample(&s->max_vhost_qfill, s->current.max_vhost_qfill);

    /* Add iteration samples to millisecond stats. */
    cum_ms = history_current(&s->milliseconds);
    cum_ms->iterations++;
    cum_ms->cycles += cycles;
    if (rx_packets > 0) {
        cum_ms->busy_cycles += cycles;
    }
    cum_ms->pkts += s->current.pkts;
    cum_ms->upcalls += s->current.upcalls;
    cum_ms->upcall_cycles += s->current.upcall_cycles;
    cum_ms->batches += s->current.batches;
    cum_ms->max_vhost_qfill += s->current.max_vhost_qfill;

    if (log_enabled) {
        /* Log suspicious iterations. */
        if (cycles > iter_cycle_threshold) {
            reason = "Excessive total cycles";
        } else if (s->current.max_vhost_qfill >= log_q_thr) {
            reason = "Vhost RX queue full";
        }
        if (OVS_UNLIKELY(reason)) {
            pmd_perf_set_log_susp_iteration(s, reason);
            /* Refresh last_tsc so the cost of the logging calls is not
             * charged to the next iteration. */
            cycles_counter_update(s);
        }

        /* Log iteration interval around suspicious iteration when reaching
         * the end of the range to be logged. */
        if (OVS_UNLIKELY(s->log_end_it == s->iterations.idx)) {
            pmd_perf_log_susp_iteration_neighborhood(s);
            cycles_counter_update(s);
        }
    }

    /* Store in iteration history. This advances the iteration idx and
     * clears the next slot in the iteration history. */
    history_store(&s->iterations, &s->current);

    if (now_tsc > s->next_check_tsc) {
        /* Check if ms is completed and store in milliseconds history. */
        uint64_t now = time_msec();
        if (now != cum_ms->timestamp) {
            /* Add ms stats to totals. */
            s->totals.iterations += cum_ms->iterations;
            s->totals.cycles += cum_ms->cycles;
            s->totals.busy_cycles += cum_ms->busy_cycles;
            s->totals.pkts += cum_ms->pkts;
            s->totals.upcalls += cum_ms->upcalls;
            s->totals.upcall_cycles += cum_ms->upcall_cycles;
            s->totals.batches += cum_ms->batches;
            s->totals.max_vhost_qfill += cum_ms->max_vhost_qfill;
            cum_ms = history_next(&s->milliseconds);
            cum_ms->timestamp = now;
        }
        /* Do the next check after 4 us (10K cycles at 2.5 GHz TSC clock). */
        s->next_check_tsc = cycles_counter_update(s) + tsc_hz / 250000;
    }
}
631 | | |
/* Delay logging of the suspicious iteration and the range of iterations
 * around it until after the last iteration in the range to be logged.
 * This avoids any distortion of the measurements through the cost of
 * logging itself. */

/* Schedule delayed logging of the current iteration as suspicious for
 * 'reason'.  If logging is already scheduled and log_extend is enabled,
 * the previously flagged iteration is logged immediately and the logging
 * interval is extended around the new suspicious iteration. */
void
pmd_perf_set_log_susp_iteration(struct pmd_perf_stats *s,
                                char *reason)
{
    if (s->log_susp_it == UINT32_MAX) {
        /* No logging scheduled yet. */
        s->log_susp_it = s->iterations.idx;
        s->log_reason = reason;
        s->log_begin_it = history_sub(s->iterations.idx, log_it_before);
        s->log_end_it = history_add(s->iterations.idx, log_it_after + 1);
    } else if (log_extend) {
        /* Logging was initiated earlier, we log the previous suspicious
         * iteration now and extend the logging interval, if possible. */
        struct iter_stats *susp = &s->iterations.sample[s->log_susp_it];
        uint32_t new_end_it, old_range, new_range;

        VLOG_WARN_RL(&latency_rl,
                     "Suspicious iteration (%s): iter=%"PRIu64
                     "  duration=%"PRIu64" us\n",
                     s->log_reason,
                     susp->timestamp,
                     (1000000L * susp->cycles) / tsc_hz);

        new_end_it = history_add(s->iterations.idx, log_it_after + 1);
        new_range = history_sub(new_end_it, s->log_begin_it);
        old_range = history_sub(s->log_end_it, s->log_begin_it);
        /* Ranges are computed modulo the circular history length, so a
         * "shorter" new range means the extension wrapped past the
         * history capacity. */
        if (new_range < old_range) {
            /* Extended range exceeds history length. */
            new_end_it = s->log_begin_it;
        }
        s->log_susp_it = s->iterations.idx;
        s->log_reason = reason;
        s->log_end_it = new_end_it;
    }
}
672 | | |
/* Emit the delayed log for a previously flagged suspicious iteration
 * together with the surrounding iterations from the history, then reset
 * the logging state. */
void
pmd_perf_log_susp_iteration_neighborhood(struct pmd_perf_stats *s)
{
    ovs_assert(s->log_reason != NULL);
    ovs_assert(s->log_susp_it != UINT32_MAX);

    struct ds log = DS_EMPTY_INITIALIZER;
    struct iter_stats *susp = &s->iterations.sample[s->log_susp_it];
    uint32_t range = history_sub(s->log_end_it, s->log_begin_it);

    VLOG_WARN_RL(&latency_rl,
                 "Suspicious iteration (%s): iter=%"PRIu64
                 "  duration=%"PRIu64" us\n",
                 s->log_reason,
                 susp->timestamp,
                 (1000000L * susp->cycles) / tsc_hz);

    pmd_perf_format_iteration_history(&log, s, range);
    VLOG_WARN_RL(&latency_rl,
                 "Neighborhood of suspicious iteration:\n"
                 "%s", ds_cstr(&log));
    ds_destroy(&log);
    /* Mark logging as no longer scheduled. */
    s->log_susp_it = s->log_end_it = s->log_begin_it = UINT32_MAX;
    s->log_reason = NULL;

    if (range > 100) {
        /* Reset to safe default values to avoid disturbance. */
        log_it_before = LOG_IT_BEFORE;
        log_it_after = LOG_IT_AFTER;
        log_extend = false;
    }
}
705 | | |
/* unixctl handler for "dpif-netdev/pmd-perf-log-set": parses the options
 * ([on|off] [-b before] [-a after] [-e|-ne] [-us usec] [-q qlen]) and
 * updates the global suspicious-iteration logging configuration.  Any
 * unknown option or unparsable value produces a usage error reply and
 * leaves the configuration unchanged. */
void
pmd_perf_log_set_cmd(struct unixctl_conn *conn,
                     int argc, const char *argv[],
                     void *aux OVS_UNUSED)
{
    unsigned int it_before, it_after, us_thr, q_thr;
    bool on, extend;
    bool usage = false;

    /* Start from the current settings so unspecified options keep their
     * values. */
    on = log_enabled;
    extend = log_extend;
    it_before = log_it_before;
    it_after = log_it_after;
    q_thr = log_q_thr;
    us_thr = log_us_thr;

    while (argc > 1) {
        if (!strcmp(argv[1], "on")) {
            on = true;
            argc--;
            argv++;
        } else if (!strcmp(argv[1], "off")) {
            on = false;
            argc--;
            argv++;
        } else if (!strcmp(argv[1], "-e")) {
            extend = true;
            argc--;
            argv++;
        } else if (!strcmp(argv[1], "-ne")) {
            extend = false;
            argc--;
            argv++;
        } else if (!strcmp(argv[1], "-a") && argc > 2) {
            /* Cap so the logged range fits into the iteration history. */
            if (str_to_uint(argv[2], 10, &it_after)) {
                if (it_after > HISTORY_LEN - 2) {
                    it_after = HISTORY_LEN - 2;
                }
            } else {
                usage = true;
                break;
            }
            argc -= 2;
            argv += 2;
        } else if (!strcmp(argv[1], "-b") && argc > 2) {
            if (str_to_uint(argv[2], 10, &it_before)) {
                if (it_before > HISTORY_LEN - 2) {
                    it_before = HISTORY_LEN - 2;
                }
            } else {
                usage = true;
                break;
            }
            argc -= 2;
            argv += 2;
        } else if (!strcmp(argv[1], "-q") && argc > 2) {
            if (!str_to_uint(argv[2], 10, &q_thr)) {
                usage = true;
                break;
            }
            argc -= 2;
            argv += 2;
        } else if (!strcmp(argv[1], "-us") && argc > 2) {
            if (!str_to_uint(argv[2], 10, &us_thr)) {
                usage = true;
                break;
            }
            argc -= 2;
            argv += 2;
        } else {
            usage = true;
            break;
        }
    }
    /* The combined before+after range must also fit into the history. */
    if (it_before + it_after > HISTORY_LEN - 2) {
        it_after = HISTORY_LEN - 2 - it_before;
    }

    if (usage) {
        unixctl_command_reply_error(conn,
                "Usage: ovs-appctl dpif-netdev/pmd-perf-log-set "
                "[on|off] [-b before] [-a after] [-e|-ne] "
                "[-us usec] [-q qlen]");
        return;
    }

    VLOG_INFO("pmd-perf-log-set: %s, before=%d, after=%d, extend=%s, "
              "us_thr=%d, q_thr=%d\n",
              on ? "on" : "off", it_before, it_after,
              extend ? "true" : "false", us_thr, q_thr);
    /* Commit the validated settings to the globals used by the PMD
     * threads, converting the microsecond threshold into TSC cycles. */
    log_enabled = on;
    log_extend = extend;
    log_it_before = it_before;
    log_it_after = it_after;
    log_q_thr = q_thr;
    log_us_thr = us_thr;
    iter_cycle_threshold = (log_us_thr * tsc_hz) / 1000000L;

    unixctl_command_reply(conn, "");
}