/src/openvswitch/lib/dpif-netdev.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc. |
3 | | * |
4 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | * you may not use this file except in compliance with the License. |
6 | | * You may obtain a copy of the License at: |
7 | | * |
8 | | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | | * |
10 | | * Unless required by applicable law or agreed to in writing, software |
11 | | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | * See the License for the specific language governing permissions and |
14 | | * limitations under the License. |
15 | | */ |
16 | | |
17 | | #include <config.h> |
18 | | #include "dpif-netdev.h" |
19 | | #include "dpif-netdev-private.h" |
20 | | #include "dpif-netdev-private-dfc.h" |
21 | | |
22 | | #include <ctype.h> |
23 | | #include <errno.h> |
24 | | #include <fcntl.h> |
25 | | #include <inttypes.h> |
26 | | #include <net/if.h> |
27 | | #include <sys/types.h> |
28 | | #include <netinet/in.h> |
29 | | #include <stdint.h> |
30 | | #include <stdlib.h> |
31 | | #include <string.h> |
32 | | #include <sys/ioctl.h> |
33 | | #include <sys/socket.h> |
34 | | #include <sys/stat.h> |
35 | | #include <unistd.h> |
36 | | |
37 | | #include "bitmap.h" |
38 | | #include "ccmap.h" |
39 | | #include "cmap.h" |
40 | | #include "conntrack.h" |
41 | | #include "conntrack-tp.h" |
42 | | #include "coverage.h" |
43 | | #include "ct-dpif.h" |
44 | | #include "csum.h" |
45 | | #include "dp-packet.h" |
46 | | #include "dpif.h" |
47 | | #include "dpif-netdev-lookup.h" |
48 | | #include "dpif-netdev-perf.h" |
49 | | #include "dpif-netdev-private-extract.h" |
50 | | #include "dpif-provider.h" |
51 | | #include "dummy.h" |
52 | | #include "fat-rwlock.h" |
53 | | #include "flow.h" |
54 | | #include "hmapx.h" |
55 | | #include "id-fpool.h" |
56 | | #include "id-pool.h" |
57 | | #include "ipf.h" |
58 | | #include "mov-avg.h" |
59 | | #include "mpsc-queue.h" |
60 | | #include "netdev.h" |
61 | | #include "netdev-offload.h" |
62 | | #include "netdev-provider.h" |
63 | | #include "netdev-vport.h" |
64 | | #include "netlink.h" |
65 | | #include "odp-execute.h" |
66 | | #include "odp-util.h" |
67 | | #include "openvswitch/dynamic-string.h" |
68 | | #include "openvswitch/list.h" |
69 | | #include "openvswitch/match.h" |
70 | | #include "openvswitch/ofp-parse.h" |
71 | | #include "openvswitch/ofp-print.h" |
72 | | #include "openvswitch/ofpbuf.h" |
73 | | #include "openvswitch/shash.h" |
74 | | #include "openvswitch/vlog.h" |
75 | | #include "ovs-numa.h" |
76 | | #include "ovs-rcu.h" |
77 | | #include "packets.h" |
78 | | #include "openvswitch/poll-loop.h" |
79 | | #include "pvector.h" |
80 | | #include "random.h" |
81 | | #include "seq.h" |
82 | | #include "smap.h" |
83 | | #include "sset.h" |
84 | | #include "timeval.h" |
85 | | #include "tnl-neigh-cache.h" |
86 | | #include "tnl-ports.h" |
87 | | #include "unixctl.h" |
88 | | #include "util.h" |
89 | | #include "uuid.h" |
90 | | |
91 | | VLOG_DEFINE_THIS_MODULE(dpif_netdev); |
92 | | |
93 | | /* Auto Load Balancing Defaults */ |
94 | 0 | #define ALB_IMPROVEMENT_THRESHOLD 25 |
95 | 0 | #define ALB_LOAD_THRESHOLD 95 |
96 | 0 | #define ALB_REBALANCE_INTERVAL 1 /* 1 Min */ |
97 | 0 | #define MAX_ALB_REBALANCE_INTERVAL 20000 /* 20000 Min */ |
98 | 0 | #define MIN_TO_MSEC 60000 |
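Taken together, these defaults mean auto load balancing only acts when a PMD is above 95 % busy and a dry run predicts at least a 25 % variance improvement, re-checked once per minute. A minimal sketch of how a requested interval in minutes maps onto these constants (the real validation lives in the auto-load-balance configuration code later in this file; the helper name below is illustrative only):

    /* Illustrative only: convert a rebalance interval in minutes to the
     * millisecond value used internally, falling back to the default when
     * the request is zero or above the supported maximum. */
    static uint64_t
    alb_rebalance_interval_msec(uint64_t interval_min)
    {
        if (!interval_min || interval_min > MAX_ALB_REBALANCE_INTERVAL) {
            interval_min = ALB_REBALANCE_INTERVAL;   /* 1 minute. */
        }
        return interval_min * MIN_TO_MSEC;           /* Minutes -> msec. */
    }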
99 | | |
100 | | #define FLOW_DUMP_MAX_BATCH 50 |
101 | | /* Use per thread recirc_depth to prevent recirculation loop. */ |
102 | 0 | #define MAX_RECIRC_DEPTH 6 |
103 | | DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0) |
104 | | |
105 | | /* Use instant packet send by default. */ |
106 | 0 | #define DEFAULT_TX_FLUSH_INTERVAL 0 |
107 | | |
108 | | /* Configuration parameters. */ |
109 | | enum { MAX_METERS = 1 << 18 }; /* Maximum number of meters. */ |
110 | | enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */ |
111 | | |
112 | | COVERAGE_DEFINE(datapath_drop_meter); |
113 | | COVERAGE_DEFINE(datapath_drop_upcall_error); |
114 | | COVERAGE_DEFINE(datapath_drop_lock_error); |
115 | | COVERAGE_DEFINE(datapath_drop_userspace_action_error); |
116 | | COVERAGE_DEFINE(datapath_drop_tunnel_push_error); |
117 | | COVERAGE_DEFINE(datapath_drop_tunnel_pop_error); |
118 | | COVERAGE_DEFINE(datapath_drop_recirc_error); |
119 | | COVERAGE_DEFINE(datapath_drop_invalid_port); |
120 | | COVERAGE_DEFINE(datapath_drop_invalid_bond); |
121 | | COVERAGE_DEFINE(datapath_drop_invalid_tnl_port); |
122 | | COVERAGE_DEFINE(datapath_drop_rx_invalid_packet); |
123 | | #ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */ |
124 | | COVERAGE_DEFINE(datapath_drop_hw_miss_recover); |
125 | | #endif |
126 | | |
127 | | /* Protects against changes to 'dp_netdevs'. */ |
128 | | struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER; |
129 | | |
130 | | /* Contains all 'struct dp_netdev's. */ |
131 | | static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex) |
132 | | = SHASH_INITIALIZER(&dp_netdevs); |
133 | | |
134 | | static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600); |
135 | | |
136 | 0 | #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \ |
137 | 0 | | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \ |
138 | 0 | | CS_SRC_NAT | CS_DST_NAT) |
139 | 0 | #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK) |
140 | | |
141 | | static struct odp_support dp_netdev_support = { |
142 | | .max_vlan_headers = SIZE_MAX, |
143 | | .max_mpls_depth = SIZE_MAX, |
144 | | .recirc = true, |
145 | | .ct_state = true, |
146 | | .ct_zone = true, |
147 | | .ct_mark = true, |
148 | | .ct_label = true, |
149 | | .ct_state_nat = true, |
150 | | .ct_orig_tuple = true, |
151 | | .ct_orig_tuple6 = true, |
152 | | }; |
153 | | |
154 | | |
155 | | /* Simple non-wildcarding single-priority classifier. */ |
156 | | |
157 | | /* Time in microseconds between successive optimizations of the dpcls |
158 | | * subtable vector. */ |
159 | 0 | #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL |
160 | | |
161 | | /* Time in microseconds of the interval in which rxq processing cycles used |
162 | | * in rxq to pmd assignments are measured and stored. */ |
163 | 0 | #define PMD_INTERVAL_LEN 5000000LL |
164 | | /* For converting PMD_INTERVAL_LEN to secs. */ |
165 | 0 | #define INTERVAL_USEC_TO_SEC 1000000LL |
166 | | |
167 | | /* Number of intervals for which cycles are stored |
168 | | * and used during rxq to pmd assignment. */ |
169 | 0 | #define PMD_INTERVAL_MAX 12 |
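(Worked out: with PMD_INTERVAL_LEN at 5,000,000 us and PMD_INTERVAL_MAX at 12, each rxq keeps 12 x 5 s = 60 s of measured cycles, which is also the upper bound (max_secs) applied to the "-secs" argument in dpif_netdev_pmd_info() below.)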
170 | | |
171 | | /* Time in microseconds to try RCU quiescing. */ |
172 | 0 | #define PMD_RCU_QUIESCE_INTERVAL 10000LL |
173 | | |
174 | | /* Timer resolution for PMD threads in nanoseconds. */ |
175 | 0 | #define PMD_TIMER_RES_NS 1000 |
176 | | |
177 | | /* Number of packets received on an interface that will stop pmd thread sleeping. */ |
178 | 0 | #define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2) |
179 | | /* Time in microseconds to increment a pmd thread sleep time. */ |
180 | 0 | #define PMD_SLEEP_INC_US 1 |
181 | | |
182 | | struct dpcls { |
183 | | struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */ |
184 | | odp_port_t in_port; |
185 | | struct cmap subtables_map; |
186 | | struct pvector subtables; |
187 | | }; |
188 | | |
189 | | /* Data structure to keep packet order until fast-path processing. */ |
190 | | struct dp_packet_flow_map { |
191 | | struct dp_packet *packet; |
192 | | struct dp_netdev_flow *flow; |
193 | | uint16_t tcp_flags; |
194 | | }; |
195 | | |
196 | | static void dpcls_init(struct dpcls *); |
197 | | static void dpcls_destroy(struct dpcls *); |
198 | | static void dpcls_sort_subtable_vector(struct dpcls *); |
199 | | static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls); |
200 | | static void dpcls_insert(struct dpcls *, struct dpcls_rule *, |
201 | | const struct netdev_flow_key *mask); |
202 | | static void dpcls_remove(struct dpcls *, struct dpcls_rule *); |
203 | | |
204 | | /* Set of supported meter flags */ |
205 | | #define DP_SUPPORTED_METER_FLAGS_MASK \ |
206 | 0 | (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST) |
207 | | |
208 | | /* Set of supported meter band types */ |
209 | | #define DP_SUPPORTED_METER_BAND_TYPES \ |
210 | 0 | ( 1 << OFPMBT13_DROP ) |
211 | | |
212 | | struct dp_meter_band { |
213 | | uint32_t rate; |
214 | | uint32_t burst_size; |
215 | | uint64_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */ |
216 | | uint64_t packet_count; |
217 | | uint64_t byte_count; |
218 | | }; |
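(The bucket units follow from refilling with elapsed milliseconds times rate: ms x kbit/s yields bits for a KBPS band, and ms x packets/s yields 1/1000 packets for a PKTPS band, assuming the meter run code measures its time delta in milliseconds.)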
219 | | |
220 | | struct dp_meter { |
221 | | struct cmap_node node; |
222 | | struct ovs_mutex lock; |
223 | | uint32_t id; |
224 | | uint16_t flags; |
225 | | uint16_t n_bands; |
226 | | uint32_t max_delta_t; |
227 | | uint64_t used; |
228 | | uint64_t packet_count; |
229 | | uint64_t byte_count; |
230 | | struct dp_meter_band bands[]; |
231 | | }; |
232 | | |
233 | | struct pmd_auto_lb { |
234 | | bool do_dry_run; |
235 | | bool recheck_config; |
236 | | bool is_enabled; /* Current status of Auto load balancing. */ |
237 | | uint64_t rebalance_intvl; |
238 | | uint64_t rebalance_poll_timer; |
239 | | uint8_t rebalance_improve_thresh; |
240 | | atomic_uint8_t rebalance_load_thresh; |
241 | | }; |
242 | | |
243 | | enum sched_assignment_type { |
244 | | SCHED_ROUNDROBIN, |
245 | | SCHED_CYCLES, /* Default. */ |
246 | | SCHED_GROUP |
247 | | }; |
248 | | |
249 | | /* Datapath based on the network device interface from netdev.h. |
250 | | * |
251 | | * |
252 | | * Thread-safety |
253 | | * ============= |
254 | | * |
255 | | * Some members, marked 'const', are immutable. Accessing other members |
256 | | * requires synchronization, as noted in more detail below. |
257 | | * |
258 | | * Acquisition order is, from outermost to innermost: |
259 | | * |
260 | | * dp_netdev_mutex (global) |
261 | | * port_rwlock |
262 | | * bond_mutex |
263 | | * non_pmd_mutex |
264 | | */ |
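The unixctl handlers later in this file follow this order whenever they need both the global datapath map and a datapath's ports; a minimal sketch of the pattern, with error handling omitted:

    struct shash_node *node;

    /* Outermost lock first: the global dp_netdev_mutex, then the
     * per-datapath port_rwlock, released in reverse order. */
    ovs_mutex_lock(&dp_netdev_mutex);
    SHASH_FOR_EACH (node, &dp_netdevs) {
        struct dp_netdev *dp = node->data;

        ovs_rwlock_rdlock(&dp->port_rwlock);
        /* ... walk dp->ports ... */
        ovs_rwlock_unlock(&dp->port_rwlock);
    }
    ovs_mutex_unlock(&dp_netdev_mutex);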
265 | | struct dp_netdev { |
266 | | const struct dpif_class *const class; |
267 | | const char *const name; |
268 | | struct ovs_refcount ref_cnt; |
269 | | atomic_flag destroyed; |
270 | | |
271 | | /* Ports. |
272 | | * |
273 | | * Any lookup into 'ports' or any access to the dp_netdev_ports found |
274 | | * through 'ports' requires taking 'port_rwlock'. */ |
275 | | struct ovs_rwlock port_rwlock; |
276 | | struct hmap ports; |
277 | | struct seq *port_seq; /* Incremented whenever a port changes. */ |
278 | | |
279 | | /* The time that a packet can wait in output batch for sending. */ |
280 | | atomic_uint32_t tx_flush_interval; |
281 | | |
282 | | /* Meters. */ |
283 | | struct ovs_mutex meters_lock; |
284 | | struct cmap meters OVS_GUARDED; |
285 | | |
286 | | /* Probability of EMC insertions is a function of 'emc_insert_min'. */ |
287 | | atomic_uint32_t emc_insert_min; |
288 | | /* Enable collection of PMD performance metrics. */ |
289 | | atomic_bool pmd_perf_metrics; |
290 | | /* Max load based sleep request. */ |
291 | | atomic_uint64_t pmd_max_sleep; |
292 | | /* Enable the SMC cache from ovsdb config */ |
293 | | atomic_bool smc_enable_db; |
294 | | |
295 | | /* Protects access to ofproto-dpif-upcall interface during revalidator |
296 | | * thread synchronization. */ |
297 | | struct fat_rwlock upcall_rwlock; |
298 | | upcall_callback *upcall_cb; /* Callback function for executing upcalls. */ |
299 | | void *upcall_aux; |
300 | | |
301 | | /* Callback function for notifying the purging of dp flows (during |
302 | | * pmd thread reset or deletion). */ |
303 | | dp_purge_callback *dp_purge_cb; |
304 | | void *dp_purge_aux; |
305 | | |
306 | | /* Stores all 'struct dp_netdev_pmd_thread's. */ |
307 | | struct cmap poll_threads; |
308 | | /* id pool for per thread static_tx_qid. */ |
309 | | struct id_pool *tx_qid_pool; |
310 | | struct ovs_mutex tx_qid_pool_mutex; |
311 | | /* Rxq to pmd assignment type. */ |
312 | | enum sched_assignment_type pmd_rxq_assign_type; |
313 | | bool pmd_iso; |
314 | | |
315 | | /* Protects the access of the 'struct dp_netdev_pmd_thread' |
316 | | * instance for non-pmd thread. */ |
317 | | struct ovs_mutex non_pmd_mutex; |
318 | | |
319 | | /* Each pmd thread will store its pointer to |
320 | | * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */ |
321 | | ovsthread_key_t per_pmd_key; |
322 | | |
323 | | struct seq *reconfigure_seq; |
324 | | uint64_t last_reconfigure_seq; |
325 | | |
326 | | /* CPU mask for pinning pmd threads. */ |
327 | | char *pmd_cmask; |
328 | | |
329 | | uint64_t last_tnl_conf_seq; |
330 | | |
331 | | struct conntrack *conntrack; |
332 | | struct pmd_auto_lb pmd_alb; |
333 | | |
334 | | /* Bonds. */ |
335 | | struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */ |
336 | | struct cmap tx_bonds; /* Contains 'struct tx_bond'. */ |
337 | | }; |
338 | | |
339 | | static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp, |
340 | | odp_port_t) |
341 | | OVS_REQ_RDLOCK(dp->port_rwlock); |
342 | | |
343 | | enum rxq_cycles_counter_type { |
344 | | RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and |
345 | | processing packets during the current |
346 | | interval. */ |
347 | | RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used |
348 | | during rxq to pmd assignment. */ |
349 | | RXQ_N_CYCLES |
350 | | }; |
351 | | |
352 | | enum dp_offload_type { |
353 | | DP_OFFLOAD_FLOW, |
354 | | DP_OFFLOAD_FLUSH, |
355 | | }; |
356 | | |
357 | | enum { |
358 | | DP_NETDEV_FLOW_OFFLOAD_OP_ADD, |
359 | | DP_NETDEV_FLOW_OFFLOAD_OP_MOD, |
360 | | DP_NETDEV_FLOW_OFFLOAD_OP_DEL, |
361 | | }; |
362 | | |
363 | | struct dp_offload_flow_item { |
364 | | struct dp_netdev_flow *flow; |
365 | | int op; |
366 | | struct match match; |
367 | | struct nlattr *actions; |
368 | | size_t actions_len; |
369 | | odp_port_t orig_in_port; /* Originating in_port for tnl flows. */ |
370 | | }; |
371 | | |
372 | | struct dp_offload_flush_item { |
373 | | struct netdev *netdev; |
374 | | struct ovs_barrier *barrier; |
375 | | }; |
376 | | |
377 | | union dp_offload_thread_data { |
378 | | struct dp_offload_flow_item flow; |
379 | | struct dp_offload_flush_item flush; |
380 | | }; |
381 | | |
382 | | struct dp_offload_thread_item { |
383 | | struct mpsc_queue_node node; |
384 | | enum dp_offload_type type; |
385 | | long long int timestamp; |
386 | | struct dp_netdev *dp; |
387 | | union dp_offload_thread_data data[0]; |
388 | | }; |
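The zero-length data[] member means an item and its payload are allocated in one block; a hedged sketch of what such an allocation looks like (the real enqueue helpers appear later in this file):

    struct dp_offload_thread_item *item;

    /* One allocation covers the header plus one payload union. */
    item = xzalloc(sizeof *item + sizeof item->data[0]);
    item->type = DP_OFFLOAD_FLOW;
    item->timestamp = time_usec();
    /* item->data[0].flow can now be filled in as a dp_offload_flow_item. */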
389 | | |
390 | | struct dp_offload_thread { |
391 | | PADDED_MEMBERS(CACHE_LINE_SIZE, |
392 | | struct mpsc_queue queue; |
393 | | atomic_uint64_t enqueued_item; |
394 | | struct cmap megaflow_to_mark; |
395 | | struct cmap mark_to_flow; |
396 | | struct mov_avg_cma cma; |
397 | | struct mov_avg_ema ema; |
398 | | ); |
399 | | }; |
400 | | static struct dp_offload_thread *dp_offload_threads; |
401 | | static void *dp_netdev_flow_offload_main(void *arg); |
402 | | |
403 | | static void |
404 | | dp_netdev_offload_init(void) |
405 | 0 | { |
406 | 0 | static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; |
407 | 0 | unsigned int nb_offload_thread = netdev_offload_thread_nb(); |
408 | 0 | unsigned int tid; |
409 | |
|
410 | 0 | if (!ovsthread_once_start(&once)) { |
411 | 0 | return; |
412 | 0 | } |
413 | | |
414 | 0 | dp_offload_threads = xcalloc(nb_offload_thread, |
415 | 0 | sizeof *dp_offload_threads); |
416 | |
|
417 | 0 | for (tid = 0; tid < nb_offload_thread; tid++) { |
418 | 0 | struct dp_offload_thread *thread; |
419 | |
|
420 | 0 | thread = &dp_offload_threads[tid]; |
421 | 0 | mpsc_queue_init(&thread->queue); |
422 | 0 | cmap_init(&thread->megaflow_to_mark); |
423 | 0 | cmap_init(&thread->mark_to_flow); |
424 | 0 | atomic_init(&thread->enqueued_item, 0); |
425 | 0 | mov_avg_cma_init(&thread->cma); |
426 | 0 | mov_avg_ema_init(&thread->ema, 100); |
427 | 0 | ovs_thread_create("hw_offload", dp_netdev_flow_offload_main, thread); |
428 | 0 | } |
429 | |
|
430 | 0 | ovsthread_once_done(&once); |
431 | 0 | } |
432 | | |
433 | 0 | #define XPS_TIMEOUT 500000LL /* In microseconds. */ |
434 | | |
435 | | /* Contained by struct dp_netdev_port's 'rxqs' member. */ |
436 | | struct dp_netdev_rxq { |
437 | | struct dp_netdev_port *port; |
438 | | struct netdev_rxq *rx; |
439 | | unsigned core_id; /* Core to which this queue should be |
440 | | pinned. OVS_CORE_UNSPEC if the |
441 | | queue doesn't need to be pinned to a |
442 | | particular core. */ |
443 | | atomic_count intrvl_idx; /* Write index for 'cycles_intrvl'. */ |
444 | | struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */ |
445 | | bool is_vhost; /* Is rxq of a vhost port. */ |
446 | | |
447 | | /* Counters of cycles spent successfully polling and processing pkts. */ |
448 | | atomic_ullong cycles[RXQ_N_CYCLES]; |
449 | | /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then |
450 | | sum them to yield the cycles used for an rxq. */ |
451 | | atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX]; |
452 | | }; |
453 | | |
454 | | enum txq_req_mode { |
455 | | TXQ_REQ_MODE_THREAD, |
456 | | TXQ_REQ_MODE_HASH, |
457 | | }; |
458 | | |
459 | | enum txq_mode { |
460 | | TXQ_MODE_STATIC, |
461 | | TXQ_MODE_XPS, |
462 | | TXQ_MODE_XPS_HASH, |
463 | | }; |
464 | | |
465 | | /* A port in a netdev-based datapath. */ |
466 | | struct dp_netdev_port { |
467 | | odp_port_t port_no; |
468 | | enum txq_mode txq_mode; /* static, XPS, XPS_HASH. */ |
469 | | bool need_reconfigure; /* True if we should reconfigure netdev. */ |
470 | | struct netdev *netdev; |
471 | | struct hmap_node node; /* Node in dp_netdev's 'ports'. */ |
472 | | struct netdev_saved_flags *sf; |
473 | | struct dp_netdev_rxq *rxqs; |
474 | | unsigned n_rxq; /* Number of elements in 'rxqs' */ |
475 | | unsigned *txq_used; /* Number of threads that use each tx queue. */ |
476 | | struct ovs_mutex txq_used_mutex; |
477 | | bool emc_enabled; /* If true EMC will be used. */ |
478 | | char *type; /* Port type as requested by user. */ |
479 | | char *rxq_affinity_list; /* Requested affinity of rx queues. */ |
480 | | enum txq_req_mode txq_requested_mode; |
481 | | }; |
482 | | |
483 | | static bool dp_netdev_flow_ref(struct dp_netdev_flow *); |
484 | | static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t, |
485 | | struct flow *, bool); |
486 | | |
487 | | struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *, |
488 | | size_t); |
489 | | struct dp_netdev_actions *dp_netdev_flow_get_actions( |
490 | | const struct dp_netdev_flow *); |
491 | | static void dp_netdev_actions_free(struct dp_netdev_actions *); |
492 | | |
493 | | struct polled_queue { |
494 | | struct dp_netdev_rxq *rxq; |
495 | | odp_port_t port_no; |
496 | | bool emc_enabled; |
497 | | bool rxq_enabled; |
498 | | uint64_t change_seq; |
499 | | }; |
500 | | |
501 | | /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */ |
502 | | struct rxq_poll { |
503 | | struct dp_netdev_rxq *rxq; |
504 | | struct hmap_node node; |
505 | | }; |
506 | | |
507 | | /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache', |
508 | | * 'tnl_port_cache' or 'tx_ports'. */ |
509 | | struct tx_port { |
510 | | struct dp_netdev_port *port; |
511 | | int qid; |
512 | | long long last_used; |
513 | | struct hmap_node node; |
514 | | long long flush_time; |
515 | | struct dp_packet_batch output_pkts; |
516 | | struct dp_packet_batch *txq_pkts; /* Only for hash mode. */ |
517 | | struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST]; |
518 | | }; |
519 | | |
520 | | /* Contained by struct tx_bond 'member_buckets'. */ |
521 | | struct member_entry { |
522 | | odp_port_t member_id; |
523 | | atomic_ullong n_packets; |
524 | | atomic_ullong n_bytes; |
525 | | }; |
526 | | |
527 | | /* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */ |
528 | | struct tx_bond { |
529 | | struct cmap_node node; |
530 | | uint32_t bond_id; |
531 | | struct member_entry member_buckets[BOND_BUCKETS]; |
532 | | }; |
533 | | |
534 | | /* Interface to netdev-based datapath. */ |
535 | | struct dpif_netdev { |
536 | | struct dpif dpif; |
537 | | struct dp_netdev *dp; |
538 | | uint64_t last_port_seq; |
539 | | }; |
540 | | |
541 | | static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no, |
542 | | struct dp_netdev_port **portp) |
543 | | OVS_REQ_RDLOCK(dp->port_rwlock); |
544 | | static int get_port_by_name(struct dp_netdev *dp, const char *devname, |
545 | | struct dp_netdev_port **portp) |
546 | | OVS_REQ_RDLOCK(dp->port_rwlock); |
547 | | static void dp_netdev_free(struct dp_netdev *) |
548 | | OVS_REQUIRES(dp_netdev_mutex); |
549 | | static int do_add_port(struct dp_netdev *dp, const char *devname, |
550 | | const char *type, odp_port_t port_no) |
551 | | OVS_REQ_WRLOCK(dp->port_rwlock); |
552 | | static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *) |
553 | | OVS_REQ_WRLOCK(dp->port_rwlock); |
554 | | static int dpif_netdev_open(const struct dpif_class *, const char *name, |
555 | | bool create, struct dpif **); |
556 | | static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd, |
557 | | struct dp_packet_batch *, |
558 | | bool should_steal, |
559 | | const struct flow *flow, |
560 | | const struct nlattr *actions, |
561 | | size_t actions_len); |
562 | | static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *, |
563 | | struct dp_packet_batch *); |
564 | | |
565 | | static void dp_netdev_disable_upcall(struct dp_netdev *); |
566 | | static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd); |
567 | | static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, |
568 | | struct dp_netdev *dp, unsigned core_id, |
569 | | int numa_id); |
570 | | static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd); |
571 | | static void dp_netdev_set_nonpmd(struct dp_netdev *dp) |
572 | | OVS_REQ_WRLOCK(dp->port_rwlock); |
573 | | |
574 | | static void *pmd_thread_main(void *); |
575 | | static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp, |
576 | | unsigned core_id); |
577 | | static struct dp_netdev_pmd_thread * |
578 | | dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos); |
579 | | static void dp_netdev_del_pmd(struct dp_netdev *dp, |
580 | | struct dp_netdev_pmd_thread *pmd); |
581 | | static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd); |
582 | | static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd); |
583 | | static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, |
584 | | struct dp_netdev_port *port) |
585 | | OVS_REQUIRES(pmd->port_mutex); |
586 | | static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, |
587 | | struct tx_port *tx) |
588 | | OVS_REQUIRES(pmd->port_mutex); |
589 | | static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd, |
590 | | struct dp_netdev_rxq *rxq) |
591 | | OVS_REQUIRES(pmd->port_mutex); |
592 | | static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd, |
593 | | struct rxq_poll *poll) |
594 | | OVS_REQUIRES(pmd->port_mutex); |
595 | | static int |
596 | | dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd, |
597 | | bool force); |
598 | | static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, |
599 | | struct tx_bond *bond, bool update) |
600 | | OVS_EXCLUDED(pmd->bond_mutex); |
601 | | static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, |
602 | | uint32_t bond_id) |
603 | | OVS_EXCLUDED(pmd->bond_mutex); |
604 | | |
605 | | static void dp_netdev_offload_flush(struct dp_netdev *dp, |
606 | | struct dp_netdev_port *port); |
607 | | |
608 | | static void reconfigure_datapath(struct dp_netdev *dp) |
609 | | OVS_REQ_RDLOCK(dp->port_rwlock); |
610 | | static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd); |
611 | | static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd); |
612 | | static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd); |
613 | | static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd) |
614 | | OVS_REQUIRES(pmd->port_mutex); |
615 | | static inline void |
616 | | dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, |
617 | | struct polled_queue *poll_list, int poll_cnt); |
618 | | static void |
619 | | dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx, |
620 | | enum rxq_cycles_counter_type type, |
621 | | unsigned long long cycles); |
622 | | static uint64_t |
623 | | dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx, |
624 | | enum rxq_cycles_counter_type type); |
625 | | static void |
626 | | dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx, |
627 | | unsigned long long cycles); |
628 | | static uint64_t |
629 | | dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx); |
630 | | static uint64_t |
631 | | get_interval_values(atomic_ullong *source, atomic_count *cur_idx, |
632 | | int num_to_read); |
633 | | static void |
634 | | dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd, |
635 | | bool purge); |
636 | | static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd, |
637 | | struct tx_port *tx); |
638 | | inline struct dpcls * |
639 | | dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd, |
640 | | odp_port_t in_port); |
641 | | |
642 | | static void dp_netdev_request_reconfigure(struct dp_netdev *dp); |
643 | | static inline bool |
644 | | pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd); |
645 | | static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd, |
646 | | struct dp_netdev_flow *flow); |
647 | | |
648 | | static void dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd, |
649 | | struct dp_netdev_flow *flow) |
650 | | OVS_REQUIRES(pmd->flow_mutex); |
651 | | static void dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd, |
652 | | struct dp_netdev_flow *flow) |
653 | | OVS_REQUIRES(pmd->flow_mutex); |
654 | | |
655 | | static bool dp_netdev_flow_is_simple_match(const struct match *); |
656 | | |
657 | | /* Updates the time in the PMD thread's context and should be called in three cases: |
658 | | * |
659 | | * 1. PMD structure initialization: |
660 | | * - dp_netdev_configure_pmd() |
661 | | * |
662 | | * 2. Before processing of the new packet batch: |
663 | | * - dpif_netdev_execute() |
664 | | * - dp_netdev_process_rxq_port() |
665 | | * |
666 | | * 3. At least once per polling iteration in main polling threads if no |
667 | | * packets received on current iteration: |
668 | | * - dpif_netdev_run() |
669 | | * - pmd_thread_main() |
670 | | * |
671 | | * 'pmd->ctx.now' should be used without update in all other cases if possible. |
672 | | */ |
673 | | static inline void |
674 | | pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd) |
675 | 0 | { |
676 | 0 | pmd->ctx.now = time_usec(); |
677 | 0 | } |
678 | | |
679 | | /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */ |
680 | | bool |
681 | | dpif_is_netdev(const struct dpif *dpif) |
682 | 0 | { |
683 | 0 | return dpif->dpif_class->open == dpif_netdev_open; |
684 | 0 | } |
685 | | |
686 | | static struct dpif_netdev * |
687 | | dpif_netdev_cast(const struct dpif *dpif) |
688 | 0 | { |
689 | 0 | ovs_assert(dpif_is_netdev(dpif)); |
690 | 0 | return CONTAINER_OF(dpif, struct dpif_netdev, dpif); |
691 | 0 | } |
692 | | |
693 | | static struct dp_netdev * |
694 | | get_dp_netdev(const struct dpif *dpif) |
695 | 0 | { |
696 | 0 | return dpif_netdev_cast(dpif)->dp; |
697 | 0 | } |
698 | | |
699 | | enum pmd_info_type { |
700 | | PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */ |
701 | | PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */ |
702 | | PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */ |
703 | | PMD_INFO_PERF_SHOW, /* Show pmd performance details. */ |
704 | | }; |
705 | | |
706 | | static void |
707 | | format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd) |
708 | 0 | { |
709 | 0 | ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID) |
710 | 0 | ? "main thread" : "pmd thread"); |
711 | 0 | if (pmd->numa_id != OVS_NUMA_UNSPEC) { |
712 | 0 | ds_put_format(reply, " numa_id %d", pmd->numa_id); |
713 | 0 | } |
714 | 0 | if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) { |
715 | 0 | ds_put_format(reply, " core_id %u", pmd->core_id); |
716 | 0 | } |
717 | 0 | ds_put_cstr(reply, ":\n"); |
718 | 0 | } |
719 | | |
720 | | static void |
721 | | pmd_info_show_stats(struct ds *reply, |
722 | | struct dp_netdev_pmd_thread *pmd) |
723 | 0 | { |
724 | 0 | uint64_t stats[PMD_N_STATS]; |
725 | 0 | uint64_t total_cycles, total_packets; |
726 | 0 | double passes_per_pkt = 0; |
727 | 0 | double lookups_per_hit = 0; |
728 | 0 | double packets_per_batch = 0; |
729 | |
|
730 | 0 | pmd_perf_read_counters(&pmd->perf_stats, stats); |
731 | 0 | total_cycles = stats[PMD_CYCLES_ITER_IDLE] |
732 | 0 | + stats[PMD_CYCLES_ITER_BUSY]; |
733 | 0 | total_packets = stats[PMD_STAT_RECV]; |
734 | |
|
735 | 0 | format_pmd_thread(reply, pmd); |
736 | |
|
737 | 0 | if (total_packets > 0) { |
738 | 0 | passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC]) |
739 | 0 | / (double) total_packets; |
740 | 0 | } |
741 | 0 | if (stats[PMD_STAT_MASKED_HIT] > 0) { |
742 | 0 | lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP] |
743 | 0 | / (double) stats[PMD_STAT_MASKED_HIT]; |
744 | 0 | } |
745 | 0 | if (stats[PMD_STAT_SENT_BATCHES] > 0) { |
746 | 0 | packets_per_batch = stats[PMD_STAT_SENT_PKTS] |
747 | 0 | / (double) stats[PMD_STAT_SENT_BATCHES]; |
748 | 0 | } |
749 | |
|
750 | 0 | ds_put_format(reply, |
751 | 0 | " packets received: %"PRIu64"\n" |
752 | 0 | " packet recirculations: %"PRIu64"\n" |
753 | 0 | " avg. datapath passes per packet: %.02f\n" |
754 | 0 | " phwol hits: %"PRIu64"\n" |
755 | 0 | " mfex opt hits: %"PRIu64"\n" |
756 | 0 | " simple match hits: %"PRIu64"\n" |
757 | 0 | " emc hits: %"PRIu64"\n" |
758 | 0 | " smc hits: %"PRIu64"\n" |
759 | 0 | " megaflow hits: %"PRIu64"\n" |
760 | 0 | " avg. subtable lookups per megaflow hit: %.02f\n" |
761 | 0 | " miss with success upcall: %"PRIu64"\n" |
762 | 0 | " miss with failed upcall: %"PRIu64"\n" |
763 | 0 | " avg. packets per output batch: %.02f\n", |
764 | 0 | total_packets, stats[PMD_STAT_RECIRC], |
765 | 0 | passes_per_pkt, stats[PMD_STAT_PHWOL_HIT], |
766 | 0 | stats[PMD_STAT_MFEX_OPT_HIT], |
767 | 0 | stats[PMD_STAT_SIMPLE_HIT], |
768 | 0 | stats[PMD_STAT_EXACT_HIT], |
769 | 0 | stats[PMD_STAT_SMC_HIT], |
770 | 0 | stats[PMD_STAT_MASKED_HIT], |
771 | 0 | lookups_per_hit, stats[PMD_STAT_MISS], stats[PMD_STAT_LOST], |
772 | 0 | packets_per_batch); |
773 | |
|
774 | 0 | if (total_cycles == 0) { |
775 | 0 | return; |
776 | 0 | } |
777 | | |
778 | 0 | ds_put_format(reply, |
779 | 0 | " idle cycles: %"PRIu64" (%.02f%%)\n" |
780 | 0 | " processing cycles: %"PRIu64" (%.02f%%)\n", |
781 | 0 | stats[PMD_CYCLES_ITER_IDLE], |
782 | 0 | stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100, |
783 | 0 | stats[PMD_CYCLES_ITER_BUSY], |
784 | 0 | stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100); |
785 | |
|
786 | 0 | if (total_packets == 0) { |
787 | 0 | return; |
788 | 0 | } |
789 | | |
790 | 0 | ds_put_format(reply, |
791 | 0 | " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n", |
792 | 0 | total_cycles / (double) total_packets, |
793 | 0 | total_cycles, total_packets); |
794 | |
|
795 | 0 | ds_put_format(reply, |
796 | 0 | " avg processing cycles per packet: " |
797 | 0 | "%.02f (%"PRIu64"/%"PRIu64")\n", |
798 | 0 | stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets, |
799 | 0 | stats[PMD_CYCLES_ITER_BUSY], total_packets); |
800 | 0 | } |
801 | | |
802 | | static void |
803 | | pmd_info_show_perf(struct ds *reply, |
804 | | struct dp_netdev_pmd_thread *pmd, |
805 | | struct pmd_perf_params *par) |
806 | 0 | { |
807 | 0 | if (pmd->core_id != NON_PMD_CORE_ID) { |
808 | 0 | char *time_str = |
809 | 0 | xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true); |
810 | 0 | long long now = time_msec(); |
811 | 0 | double duration = (now - pmd->perf_stats.start_ms) / 1000.0; |
812 | |
|
813 | 0 | ds_put_cstr(reply, "\n"); |
814 | 0 | ds_put_format(reply, "Time: %s\n", time_str); |
815 | 0 | ds_put_format(reply, "Measurement duration: %.3f s\n", duration); |
816 | 0 | ds_put_cstr(reply, "\n"); |
817 | 0 | format_pmd_thread(reply, pmd); |
818 | 0 | ds_put_cstr(reply, "\n"); |
819 | 0 | pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration); |
820 | 0 | if (pmd_perf_metrics_enabled(pmd)) { |
821 | | /* Prevent parallel clearing of perf metrics. */ |
822 | 0 | ovs_mutex_lock(&pmd->perf_stats.clear_mutex); |
823 | 0 | if (par->histograms) { |
824 | 0 | ds_put_cstr(reply, "\n"); |
825 | 0 | pmd_perf_format_histograms(reply, &pmd->perf_stats); |
826 | 0 | } |
827 | 0 | if (par->iter_hist_len > 0) { |
828 | 0 | ds_put_cstr(reply, "\n"); |
829 | 0 | pmd_perf_format_iteration_history(reply, &pmd->perf_stats, |
830 | 0 | par->iter_hist_len); |
831 | 0 | } |
832 | 0 | if (par->ms_hist_len > 0) { |
833 | 0 | ds_put_cstr(reply, "\n"); |
834 | 0 | pmd_perf_format_ms_history(reply, &pmd->perf_stats, |
835 | 0 | par->ms_hist_len); |
836 | 0 | } |
837 | 0 | ovs_mutex_unlock(&pmd->perf_stats.clear_mutex); |
838 | 0 | } |
839 | 0 | free(time_str); |
840 | 0 | } |
841 | 0 | } |
842 | | |
843 | | static int |
844 | | compare_poll_list(const void *a_, const void *b_) |
845 | 0 | { |
846 | 0 | const struct rxq_poll *a = a_; |
847 | 0 | const struct rxq_poll *b = b_; |
848 | |
|
849 | 0 | const char *namea = netdev_rxq_get_name(a->rxq->rx); |
850 | 0 | const char *nameb = netdev_rxq_get_name(b->rxq->rx); |
851 | |
|
852 | 0 | int cmp = strcmp(namea, nameb); |
853 | 0 | if (!cmp) { |
854 | 0 | return netdev_rxq_get_queue_id(a->rxq->rx) |
855 | 0 | - netdev_rxq_get_queue_id(b->rxq->rx); |
856 | 0 | } else { |
857 | 0 | return cmp; |
858 | 0 | } |
859 | 0 | } |
860 | | |
861 | | static void |
862 | | sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list, |
863 | | size_t *n) |
864 | | OVS_REQUIRES(pmd->port_mutex) |
865 | 0 | { |
866 | 0 | struct rxq_poll *ret, *poll; |
867 | 0 | size_t i; |
868 | |
|
869 | 0 | *n = hmap_count(&pmd->poll_list); |
870 | 0 | if (!*n) { |
871 | 0 | ret = NULL; |
872 | 0 | } else { |
873 | 0 | ret = xcalloc(*n, sizeof *ret); |
874 | 0 | i = 0; |
875 | 0 | HMAP_FOR_EACH (poll, node, &pmd->poll_list) { |
876 | 0 | ret[i] = *poll; |
877 | 0 | i++; |
878 | 0 | } |
879 | 0 | ovs_assert(i == *n); |
880 | 0 | qsort(ret, *n, sizeof *ret, compare_poll_list); |
881 | 0 | } |
882 | |
|
883 | 0 | *list = ret; |
884 | 0 | } |
885 | | |
886 | | static void |
887 | | pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd, |
888 | | int secs) |
889 | 0 | { |
890 | 0 | if (pmd->core_id != NON_PMD_CORE_ID) { |
891 | 0 | struct rxq_poll *list; |
892 | 0 | size_t n_rxq; |
893 | 0 | uint64_t total_pmd_cycles = 0; |
894 | 0 | uint64_t busy_pmd_cycles = 0; |
895 | 0 | uint64_t total_rxq_proc_cycles = 0; |
896 | 0 | unsigned int intervals; |
897 | |
|
898 | 0 | ds_put_format(reply, |
899 | 0 | "pmd thread numa_id %d core_id %u:\n isolated : %s\n", |
900 | 0 | pmd->numa_id, pmd->core_id, (pmd->isolated) |
901 | 0 | ? "true" : "false"); |
902 | |
|
903 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
904 | 0 | sorted_poll_list(pmd, &list, &n_rxq); |
905 | | |
906 | | /* Get the total pmd cycles for an interval. */ |
907 | 0 | atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles); |
908 | | /* Calculate how many intervals are to be used. */ |
909 | 0 | intervals = DIV_ROUND_UP(secs, |
910 | 0 | PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); |
911 | | /* Estimate the cycles to cover all intervals. */ |
912 | 0 | total_pmd_cycles *= intervals; |
913 | 0 | busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl, |
914 | 0 | &pmd->intrvl_idx, |
915 | 0 | intervals); |
916 | 0 | if (busy_pmd_cycles > total_pmd_cycles) { |
917 | 0 | busy_pmd_cycles = total_pmd_cycles; |
918 | 0 | } |
919 | |
|
920 | 0 | for (int i = 0; i < n_rxq; i++) { |
921 | 0 | struct dp_netdev_rxq *rxq = list[i].rxq; |
922 | 0 | const char *name = netdev_rxq_get_name(rxq->rx); |
923 | 0 | uint64_t rxq_proc_cycles = 0; |
924 | |
|
925 | 0 | rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl, |
926 | 0 | &rxq->intrvl_idx, |
927 | 0 | intervals); |
928 | 0 | total_rxq_proc_cycles += rxq_proc_cycles; |
929 | 0 | ds_put_format(reply, " port: %-16s queue-id: %2d", name, |
930 | 0 | netdev_rxq_get_queue_id(list[i].rxq->rx)); |
931 | 0 | ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx) |
932 | 0 | ? "(enabled) " : "(disabled)"); |
933 | 0 | ds_put_format(reply, " pmd usage: "); |
934 | 0 | if (total_pmd_cycles) { |
935 | 0 | ds_put_format(reply, "%2"PRIu64"", |
936 | 0 | rxq_proc_cycles * 100 / total_pmd_cycles); |
937 | 0 | ds_put_cstr(reply, " %"); |
938 | 0 | } else { |
939 | 0 | ds_put_format(reply, "%s", "NOT AVAIL"); |
940 | 0 | } |
941 | 0 | ds_put_cstr(reply, "\n"); |
942 | 0 | } |
943 | |
|
944 | 0 | if (n_rxq > 0) { |
945 | 0 | ds_put_cstr(reply, " overhead: "); |
946 | 0 | if (total_pmd_cycles) { |
947 | 0 | uint64_t overhead_cycles = 0; |
948 | |
|
949 | 0 | if (total_rxq_proc_cycles < busy_pmd_cycles) { |
950 | 0 | overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles; |
951 | 0 | } |
952 | 0 | ds_put_format(reply, "%2"PRIu64" %%", |
953 | 0 | overhead_cycles * 100 / total_pmd_cycles); |
954 | 0 | } else { |
955 | 0 | ds_put_cstr(reply, "NOT AVAIL"); |
956 | 0 | } |
957 | 0 | ds_put_cstr(reply, "\n"); |
958 | 0 | } |
959 | |
|
960 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
961 | 0 | free(list); |
962 | 0 | } |
963 | 0 | } |
964 | | |
965 | | static int |
966 | | compare_poll_thread_list(const void *a_, const void *b_) |
967 | 0 | { |
968 | 0 | const struct dp_netdev_pmd_thread *a, *b; |
969 | |
|
970 | 0 | a = *(struct dp_netdev_pmd_thread **)a_; |
971 | 0 | b = *(struct dp_netdev_pmd_thread **)b_; |
972 | |
|
973 | 0 | if (a->core_id < b->core_id) { |
974 | 0 | return -1; |
975 | 0 | } |
976 | 0 | if (a->core_id > b->core_id) { |
977 | 0 | return 1; |
978 | 0 | } |
979 | 0 | return 0; |
980 | 0 | } |
981 | | |
982 | | /* Create a sorted list of pmds from the dp->poll_threads cmap. We can use |
983 | | * this list as long as we do not enter an RCU quiescent state. */ |
984 | | static void |
985 | | sorted_poll_thread_list(struct dp_netdev *dp, |
986 | | struct dp_netdev_pmd_thread ***list, |
987 | | size_t *n) |
988 | 0 | { |
989 | 0 | struct dp_netdev_pmd_thread *pmd; |
990 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
991 | 0 | size_t k = 0, n_pmds; |
992 | |
|
993 | 0 | n_pmds = cmap_count(&dp->poll_threads); |
994 | 0 | pmd_list = xcalloc(n_pmds, sizeof *pmd_list); |
995 | |
|
996 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
997 | 0 | if (k >= n_pmds) { |
998 | 0 | break; |
999 | 0 | } |
1000 | 0 | pmd_list[k++] = pmd; |
1001 | 0 | } |
1002 | |
|
1003 | 0 | qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list); |
1004 | |
|
1005 | 0 | *list = pmd_list; |
1006 | 0 | *n = k; |
1007 | 0 | } |
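The unixctl handlers below all use the returned snapshot the same way: build it, walk it, and free it before doing anything that could enter an RCU quiescent state. A condensed version of that pattern:

    struct dp_netdev_pmd_thread **pmd_list;
    size_t n;

    sorted_poll_thread_list(dp, &pmd_list, &n);
    for (size_t i = 0; i < n; i++) {
        struct dp_netdev_pmd_thread *pmd = pmd_list[i];

        if (pmd->core_id == NON_PMD_CORE_ID) {
            continue;     /* Skip the non-pmd (main) thread entry. */
        }
        /* ... read-only inspection of 'pmd' ... */
    }
    free(pmd_list);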
1008 | | |
1009 | | static void |
1010 | | dpif_netdev_subtable_lookup_get(struct unixctl_conn *conn, int argc OVS_UNUSED, |
1011 | | const char *argv[] OVS_UNUSED, |
1012 | | void *aux OVS_UNUSED) |
1013 | 0 | { |
1014 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1015 | |
|
1016 | 0 | dpcls_impl_print_stats(&reply); |
1017 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1018 | 0 | ds_destroy(&reply); |
1019 | 0 | } |
1020 | | |
1021 | | static void |
1022 | | dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc OVS_UNUSED, |
1023 | | const char *argv[], void *aux OVS_UNUSED) |
1024 | 0 | { |
1025 | | /* This function requires 2 parameters (argv[1] and argv[2]) to execute. |
1026 | | * argv[1] is the subtable lookup function name |
1027 | | * argv[2] is priority |
1028 | | */ |
1029 | 0 | const char *func_name = argv[1]; |
1030 | |
|
1031 | 0 | errno = 0; |
1032 | 0 | char *err_char; |
1033 | 0 | uint32_t new_prio = strtoul(argv[2], &err_char, 10); |
1034 | 0 | uint32_t lookup_dpcls_changed = 0; |
1035 | 0 | uint32_t lookup_subtable_changed = 0; |
1036 | 0 | struct shash_node *node; |
1037 | 0 | if (errno != 0 || new_prio > UINT8_MAX) { |
1038 | 0 | unixctl_command_reply_error(conn, |
1039 | 0 | "error converting priority, use integer in range 0-255\n"); |
1040 | 0 | return; |
1041 | 0 | } |
1042 | | |
1043 | 0 | int32_t err = dpcls_subtable_set_prio(func_name, new_prio); |
1044 | 0 | if (err) { |
1045 | 0 | unixctl_command_reply_error(conn, |
1046 | 0 | "error, subtable lookup function not found\n"); |
1047 | 0 | return; |
1048 | 0 | } |
1049 | | |
1050 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1051 | 0 | SHASH_FOR_EACH (node, &dp_netdevs) { |
1052 | 0 | struct dp_netdev *dp = node->data; |
1053 | | |
1054 | | /* Get PMD threads list, required to get DPCLS instances. */ |
1055 | 0 | size_t n; |
1056 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
1057 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
1058 | | |
1059 | | /* Take the port rwlock, as the ports HMAP is iterated below. */ |
1060 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
1061 | |
|
1062 | 0 | for (size_t i = 0; i < n; i++) { |
1063 | 0 | struct dp_netdev_pmd_thread *pmd = pmd_list[i]; |
1064 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
1065 | 0 | continue; |
1066 | 0 | } |
1067 | | |
1068 | 0 | struct dp_netdev_port *port = NULL; |
1069 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
1070 | 0 | odp_port_t in_port = port->port_no; |
1071 | 0 | struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
1072 | 0 | if (!cls) { |
1073 | 0 | continue; |
1074 | 0 | } |
1075 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
1076 | 0 | uint32_t subtbl_changes = dpcls_subtable_lookup_reprobe(cls); |
1077 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
1078 | 0 | if (subtbl_changes) { |
1079 | 0 | lookup_dpcls_changed++; |
1080 | 0 | lookup_subtable_changed += subtbl_changes; |
1081 | 0 | } |
1082 | 0 | } |
1083 | 0 | } |
1084 | | |
1085 | | /* Release the port rwlock before the global dp_netdev mutex. */ |
1086 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1087 | 0 | free(pmd_list); |
1088 | 0 | } |
1089 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1090 | |
|
1091 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1092 | 0 | ds_put_format(&reply, |
1093 | 0 | "Lookup priority change affected %d dpcls ports and %d subtables.\n", |
1094 | 0 | lookup_dpcls_changed, lookup_subtable_changed); |
1095 | 0 | const char *reply_str = ds_cstr(&reply); |
1096 | 0 | unixctl_command_reply(conn, reply_str); |
1097 | 0 | VLOG_INFO("%s", reply_str); |
1098 | 0 | ds_destroy(&reply); |
1099 | 0 | } |
1100 | | |
1101 | | static void |
1102 | | dpif_netdev_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED, |
1103 | | const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED) |
1104 | 0 | { |
1105 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1106 | 0 | struct shash_node *node; |
1107 | |
|
1108 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1109 | 0 | SHASH_FOR_EACH (node, &dp_netdevs) { |
1110 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
1111 | 0 | struct dp_netdev *dp = node->data; |
1112 | 0 | size_t n; |
1113 | | |
1114 | | /* Get PMD threads list, required to get the DPIF impl used by each PMD |
1115 | | * thread. */ |
1116 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
1117 | 0 | dp_netdev_impl_get(&reply, pmd_list, n); |
1118 | 0 | free(pmd_list); |
1119 | 0 | } |
1120 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1121 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1122 | 0 | ds_destroy(&reply); |
1123 | 0 | } |
1124 | | |
1125 | | static void |
1126 | | dpif_netdev_impl_set(struct unixctl_conn *conn, int argc OVS_UNUSED, |
1127 | | const char *argv[], void *aux OVS_UNUSED) |
1128 | 0 | { |
1129 | | /* This function requires just one parameter, the DPIF name. */ |
1130 | 0 | const char *dpif_name = argv[1]; |
1131 | 0 | struct shash_node *node; |
1132 | |
|
1133 | 0 | static const char *error_description[2] = { |
1134 | 0 | "Unknown DPIF implementation", |
1135 | 0 | "CPU doesn't support the required instruction for", |
1136 | 0 | }; |
1137 | |
|
1138 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1139 | 0 | int32_t err = dp_netdev_impl_set_default_by_name(dpif_name); |
1140 | |
|
1141 | 0 | if (err) { |
1142 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1143 | 0 | ds_put_format(&reply, "DPIF implementation not available: %s %s.\n", |
1144 | 0 | error_description[ (err == -ENOTSUP) ], dpif_name); |
1145 | 0 | const char *reply_str = ds_cstr(&reply); |
1146 | 0 | unixctl_command_reply_error(conn, reply_str); |
1147 | 0 | VLOG_ERR("%s", reply_str); |
1148 | 0 | ds_destroy(&reply); |
1149 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1150 | 0 | return; |
1151 | 0 | } |
1152 | | |
1153 | 0 | SHASH_FOR_EACH (node, &dp_netdevs) { |
1154 | 0 | struct dp_netdev *dp = node->data; |
1155 | | |
1156 | | /* Get PMD threads list, required to get DPCLS instances. */ |
1157 | 0 | size_t n; |
1158 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
1159 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
1160 | |
|
1161 | 0 | for (size_t i = 0; i < n; i++) { |
1162 | 0 | struct dp_netdev_pmd_thread *pmd = pmd_list[i]; |
1163 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
1164 | 0 | continue; |
1165 | 0 | } |
1166 | | |
1167 | | /* Initialize DPIF function pointer to the newly configured |
1168 | | * default. */ |
1169 | 0 | atomic_store_relaxed(&pmd->netdev_input_func, |
1170 | 0 | dp_netdev_impl_get_default()); |
1171 | 0 | }; |
1172 | |
|
1173 | 0 | free(pmd_list); |
1174 | 0 | } |
1175 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1176 | | |
1177 | | /* Reply with success to command. */ |
1178 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1179 | 0 | ds_put_format(&reply, "DPIF implementation set to %s.\n", dpif_name); |
1180 | 0 | const char *reply_str = ds_cstr(&reply); |
1181 | 0 | unixctl_command_reply(conn, reply_str); |
1182 | 0 | VLOG_INFO("%s", reply_str); |
1183 | 0 | ds_destroy(&reply); |
1184 | 0 | } |
1185 | | |
1186 | | static void |
1187 | | dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED, |
1188 | | const char *argv[] OVS_UNUSED, |
1189 | | void *aux OVS_UNUSED) |
1190 | 0 | { |
1191 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1192 | 0 | struct shash_node *node; |
1193 | |
|
1194 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1195 | 0 | SHASH_FOR_EACH (node, &dp_netdevs) { |
1196 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
1197 | 0 | struct dp_netdev *dp = node->data; |
1198 | 0 | size_t n; |
1199 | | |
1200 | | /* Get PMD threads list, required to get the DPIF impl used by each PMD |
1201 | | * thread. */ |
1202 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
1203 | 0 | dp_mfex_impl_get(&reply, pmd_list, n); |
1204 | 0 | free(pmd_list); |
1205 | 0 | } |
1206 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1207 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1208 | 0 | ds_destroy(&reply); |
1209 | 0 | } |
1210 | | |
1211 | | static void |
1212 | | dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, int argc, |
1213 | | const char *argv[], void *aux OVS_UNUSED) |
1214 | 0 | { |
1215 | | /* This command takes some optional and mandatory arguments. The function |
1216 | | * here first parses all of the options, saving results in local variables. |
1217 | | * Then the parsed values are acted on. |
1218 | | */ |
1219 | 0 | unsigned int pmd_thread_to_change = NON_PMD_CORE_ID; |
1220 | 0 | unsigned int study_count = MFEX_MAX_PKT_COUNT; |
1221 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1222 | 0 | bool pmd_thread_update_done = false; |
1223 | 0 | bool mfex_name_is_study = false; |
1224 | 0 | const char *mfex_name = NULL; |
1225 | 0 | const char *reply_str = NULL; |
1226 | 0 | struct shash_node *node; |
1227 | 0 | int err; |
1228 | |
|
1229 | 0 | while (argc > 1) { |
1230 | | /* Optional argument "-pmd" limits the command's actions to just this |
1231 | | * PMD thread. |
1232 | | */ |
1233 | 0 | if ((!strcmp(argv[1], "-pmd") && !mfex_name)) { |
1234 | 0 | if (argc < 3) { |
1235 | 0 | ds_put_format(&reply, |
1236 | 0 | "Error: -pmd option requires a thread id" |
1237 | 0 | " argument.\n"); |
1238 | 0 | goto error; |
1239 | 0 | } |
1240 | | |
1241 | | /* Ensure argument can be parsed to an integer. */ |
1242 | 0 | if (!str_to_uint(argv[2], 10, &pmd_thread_to_change) || |
1243 | 0 | (pmd_thread_to_change == NON_PMD_CORE_ID)) { |
1244 | 0 | ds_put_format(&reply, |
1245 | 0 | "Error: miniflow extract parser not changed," |
1246 | 0 | " PMD thread passed is not valid: '%s'." |
1247 | 0 | " Pass a valid pmd thread ID.\n", |
1248 | 0 | argv[2]); |
1249 | 0 | goto error; |
1250 | 0 | } |
1251 | | |
1252 | 0 | argc -= 2; |
1253 | 0 | argv += 2; |
1254 | |
|
1255 | 0 | } else if (!mfex_name) { |
1256 | | /* Name of MFEX impl requested by user. */ |
1257 | 0 | mfex_name = argv[1]; |
1258 | 0 | mfex_name_is_study = strcmp("study", mfex_name) == 0; |
1259 | 0 | argc -= 1; |
1260 | 0 | argv += 1; |
1261 | | |
1262 | | /* If name is study and more args exist, parse study_count value. */ |
1263 | 0 | } else if (mfex_name && mfex_name_is_study) { |
1264 | 0 | if (!str_to_uint(argv[1], 10, &study_count) || |
1265 | 0 | (study_count == 0)) { |
1266 | 0 | ds_put_format(&reply, |
1267 | 0 | "Error: invalid study_pkt_cnt value: %s.\n", |
1268 | 0 | argv[1]); |
1269 | 0 | goto error; |
1270 | 0 | } |
1271 | | |
1272 | 0 | argc -= 1; |
1273 | 0 | argv += 1; |
1274 | 0 | } else { |
1275 | 0 | ds_put_format(&reply, "Error: unknown argument %s.\n", argv[1]); |
1276 | 0 | goto error; |
1277 | 0 | } |
1278 | 0 | } |
1279 | | |
1280 | | /* Ensure user passed an MFEX name. */ |
1281 | 0 | if (!mfex_name) { |
1282 | 0 | ds_put_format(&reply, "Error: no miniflow extract name provided." |
1283 | 0 | " Output of miniflow-parser-get shows implementation" |
1284 | 0 | " list.\n"); |
1285 | 0 | goto error; |
1286 | 0 | } |
1287 | | |
1288 | | /* If the MFEX name is "study", set the study packet count. */ |
1289 | 0 | if (mfex_name_is_study) { |
1290 | 0 | err = mfex_set_study_pkt_cnt(study_count, mfex_name); |
1291 | 0 | if (err) { |
1292 | 0 | ds_put_format(&reply, "Error: failed to set study count %d for" |
1293 | 0 | " miniflow extract implementation %s.\n", |
1294 | 0 | study_count, mfex_name); |
1295 | 0 | goto error; |
1296 | 0 | } |
1297 | 0 | } |
1298 | | |
1299 | | /* Set the default MFEX impl only if the command was applied to all PMD |
1300 | | * threads. If a PMD thread was selected, do NOT update the default. |
1301 | | */ |
1302 | 0 | if (pmd_thread_to_change == NON_PMD_CORE_ID) { |
1303 | 0 | err = dp_mfex_impl_set_default_by_name(mfex_name); |
1304 | 0 | if (err == -ENODEV) { |
1305 | 0 | ds_put_format(&reply, |
1306 | 0 | "Error: miniflow extract not available due to CPU" |
1307 | 0 | " ISA requirements: %s", |
1308 | 0 | mfex_name); |
1309 | 0 | goto error; |
1310 | 0 | } else if (err) { |
1311 | 0 | ds_put_format(&reply, |
1312 | 0 | "Error: unknown miniflow extract implementation %s.", |
1313 | 0 | mfex_name); |
1314 | 0 | goto error; |
1315 | 0 | } |
1316 | 0 | } |
1317 | | |
1318 | | /* Get the desired MFEX function pointer and error check its usage. */ |
1319 | 0 | miniflow_extract_func mfex_func = NULL; |
1320 | 0 | err = dp_mfex_impl_get_by_name(mfex_name, &mfex_func); |
1321 | 0 | if (err) { |
1322 | 0 | if (err == -ENODEV) { |
1323 | 0 | ds_put_format(&reply, |
1324 | 0 | "Error: miniflow extract not available due to CPU" |
1325 | 0 | " ISA requirements: %s", mfex_name); |
1326 | 0 | } else { |
1327 | 0 | ds_put_format(&reply, |
1328 | 0 | "Error: unknown miniflow extract implementation %s.", |
1329 | 0 | mfex_name); |
1330 | 0 | } |
1331 | 0 | goto error; |
1332 | 0 | } |
1333 | | |
1334 | | /* Apply the MFEX pointer to each pmd thread in each netdev, filtering |
1335 | | * by the user's "-pmd" argument if required. |
1336 | | */ |
1337 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1338 | |
|
1339 | 0 | SHASH_FOR_EACH (node, &dp_netdevs) { |
1340 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
1341 | 0 | struct dp_netdev *dp = node->data; |
1342 | 0 | size_t n; |
1343 | |
|
1344 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
1345 | |
|
1346 | 0 | for (size_t i = 0; i < n; i++) { |
1347 | 0 | struct dp_netdev_pmd_thread *pmd = pmd_list[i]; |
1348 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
1349 | 0 | continue; |
1350 | 0 | } |
1351 | | |
1352 | | /* If -pmd specified, skip all other pmd threads. */ |
1353 | 0 | if ((pmd_thread_to_change != NON_PMD_CORE_ID) && |
1354 | 0 | (pmd->core_id != pmd_thread_to_change)) { |
1355 | 0 | continue; |
1356 | 0 | } |
1357 | | |
1358 | 0 | pmd_thread_update_done = true; |
1359 | 0 | atomic_store_relaxed(&pmd->miniflow_extract_opt, mfex_func); |
1360 | 0 | }; |
1361 | |
|
1362 | 0 | free(pmd_list); |
1363 | 0 | } |
1364 | |
|
1365 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1366 | | |
1367 | | /* If PMD thread was specified, but it wasn't found, return error. */ |
1368 | 0 | if (pmd_thread_to_change != NON_PMD_CORE_ID && !pmd_thread_update_done) { |
1369 | 0 | ds_put_format(&reply, |
1370 | 0 | "Error: miniflow extract parser not changed, " |
1371 | 0 | "PMD thread %d not in use, pass a valid pmd" |
1372 | 0 | " thread ID.\n", pmd_thread_to_change); |
1373 | 0 | goto error; |
1374 | 0 | } |
1375 | | |
1376 | | /* Reply with success to command. */ |
1377 | 0 | ds_put_format(&reply, "Miniflow extract implementation set to %s", |
1378 | 0 | mfex_name); |
1379 | 0 | if (pmd_thread_to_change != NON_PMD_CORE_ID) { |
1380 | 0 | ds_put_format(&reply, ", on pmd thread %d", pmd_thread_to_change); |
1381 | 0 | } |
1382 | 0 | if (mfex_name_is_study) { |
1383 | 0 | ds_put_format(&reply, ", studying %d packets", study_count); |
1384 | 0 | } |
1385 | 0 | ds_put_format(&reply, ".\n"); |
1386 | |
|
1387 | 0 | reply_str = ds_cstr(&reply); |
1388 | 0 | VLOG_INFO("%s", reply_str); |
1389 | 0 | unixctl_command_reply(conn, reply_str); |
1390 | 0 | ds_destroy(&reply); |
1391 | 0 | return; |
1392 | | |
1393 | 0 | error: |
1394 | 0 | reply_str = ds_cstr(&reply); |
1395 | 0 | VLOG_ERR("%s", reply_str); |
1396 | 0 | unixctl_command_reply_error(conn, reply_str); |
1397 | 0 | ds_destroy(&reply); |
1398 | 0 | } |
1399 | | |
1400 | | static void |
1401 | | dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc, |
1402 | | const char *argv[], void *aux OVS_UNUSED) |
1403 | 0 | { |
1404 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1405 | 0 | struct dp_netdev *dp = NULL; |
1406 | |
|
1407 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1408 | |
|
1409 | 0 | if (argc == 2) { |
1410 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
1411 | 0 | } else if (shash_count(&dp_netdevs) == 1) { |
1412 | | /* There's only one datapath */ |
1413 | 0 | dp = shash_first(&dp_netdevs)->data; |
1414 | 0 | } |
1415 | |
|
1416 | 0 | if (!dp) { |
1417 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1418 | 0 | unixctl_command_reply_error(conn, |
1419 | 0 | "please specify an existing datapath"); |
1420 | 0 | return; |
1421 | 0 | } |
1422 | | |
1423 | 0 | dp_netdev_request_reconfigure(dp); |
1424 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1425 | 0 | ds_put_cstr(&reply, "pmd rxq rebalance requested.\n"); |
1426 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1427 | 0 | ds_destroy(&reply); |
1428 | 0 | } |
1429 | | |
1430 | | static void |
1431 | | dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], |
1432 | | void *aux) |
1433 | 0 | { |
1434 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1435 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
1436 | 0 | struct dp_netdev *dp = NULL; |
1437 | 0 | enum pmd_info_type type = *(enum pmd_info_type *) aux; |
1438 | 0 | unsigned int core_id; |
1439 | 0 | bool filter_on_pmd = false; |
1440 | 0 | size_t n; |
1441 | 0 | unsigned int secs = 0; |
1442 | 0 | unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX) |
1443 | 0 | / INTERVAL_USEC_TO_SEC; |
1444 | 0 | bool first_show_rxq = true; |
1445 | |
|
1446 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1447 | |
|
1448 | 0 | while (argc > 1) { |
1449 | 0 | if (!strcmp(argv[1], "-pmd") && argc > 2) { |
1450 | 0 | if (str_to_uint(argv[2], 10, &core_id)) { |
1451 | 0 | filter_on_pmd = true; |
1452 | 0 | } |
1453 | 0 | argc -= 2; |
1454 | 0 | argv += 2; |
1455 | 0 | } else if (type == PMD_INFO_SHOW_RXQ && |
1456 | 0 | !strcmp(argv[1], "-secs") && |
1457 | 0 | argc > 2) { |
1458 | 0 | if (!str_to_uint(argv[2], 10, &secs)) { |
1459 | 0 | secs = max_secs; |
1460 | 0 | } |
1461 | 0 | argc -= 2; |
1462 | 0 | argv += 2; |
1463 | 0 | } else { |
1464 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
1465 | 0 | argc -= 1; |
1466 | 0 | argv += 1; |
1467 | 0 | } |
1468 | 0 | } |
1469 | |
|
1470 | 0 | if (!dp) { |
1471 | 0 | if (shash_count(&dp_netdevs) == 1) { |
1472 | | /* There's only one datapath */ |
1473 | 0 | dp = shash_first(&dp_netdevs)->data; |
1474 | 0 | } else { |
1475 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1476 | 0 | unixctl_command_reply_error(conn, |
1477 | 0 | "please specify an existing datapath"); |
1478 | 0 | return; |
1479 | 0 | } |
1480 | 0 | } |
1481 | | |
1482 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
1483 | 0 | for (size_t i = 0; i < n; i++) { |
1484 | 0 | struct dp_netdev_pmd_thread *pmd = pmd_list[i]; |
1485 | 0 | if (!pmd) { |
1486 | 0 | break; |
1487 | 0 | } |
1488 | 0 | if (filter_on_pmd && pmd->core_id != core_id) { |
1489 | 0 | continue; |
1490 | 0 | } |
1491 | 0 | if (type == PMD_INFO_SHOW_RXQ) { |
1492 | 0 | if (first_show_rxq) { |
1493 | 0 | if (!secs || secs > max_secs) { |
1494 | 0 | secs = max_secs; |
1495 | 0 | } else { |
1496 | 0 | secs = ROUND_UP(secs, |
1497 | 0 | PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); |
1498 | 0 | } |
1499 | 0 | ds_put_format(&reply, "Displaying last %u seconds " |
1500 | 0 | "pmd usage %%\n", secs); |
1501 | 0 | first_show_rxq = false; |
1502 | 0 | } |
1503 | 0 | pmd_info_show_rxq(&reply, pmd, secs); |
1504 | 0 | } else if (type == PMD_INFO_CLEAR_STATS) { |
1505 | 0 | pmd_perf_stats_clear(&pmd->perf_stats); |
1506 | 0 | } else if (type == PMD_INFO_SHOW_STATS) { |
1507 | 0 | pmd_info_show_stats(&reply, pmd); |
1508 | 0 | } else if (type == PMD_INFO_PERF_SHOW) { |
1509 | 0 | pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux); |
1510 | 0 | } |
1511 | 0 | } |
1512 | 0 | free(pmd_list); |
1513 | |
|
1514 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1515 | |
|
1516 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1517 | 0 | ds_destroy(&reply); |
1518 | 0 | } |
1519 | | |
1520 | | static void |
1521 | | pmd_perf_show_cmd(struct unixctl_conn *conn, int argc, |
1522 | | const char *argv[], |
1523 | | void *aux OVS_UNUSED) |
1524 | 0 | { |
1525 | 0 | struct pmd_perf_params par; |
1526 | 0 | long int it_hist = 0, ms_hist = 0; |
1527 | 0 | par.histograms = true; |
1528 | |
|
1529 | 0 | while (argc > 1) { |
1530 | 0 | if (!strcmp(argv[1], "-nh")) { |
1531 | 0 | par.histograms = false; |
1532 | 0 | argc -= 1; |
1533 | 0 | argv += 1; |
1534 | 0 | } else if (!strcmp(argv[1], "-it") && argc > 2) { |
1535 | 0 | it_hist = strtol(argv[2], NULL, 10); |
1536 | 0 | if (it_hist < 0) { |
1537 | 0 | it_hist = 0; |
1538 | 0 | } else if (it_hist > HISTORY_LEN) { |
1539 | 0 | it_hist = HISTORY_LEN; |
1540 | 0 | } |
1541 | 0 | argc -= 2; |
1542 | 0 | argv += 2; |
1543 | 0 | } else if (!strcmp(argv[1], "-ms") && argc > 2) { |
1544 | 0 | ms_hist = strtol(argv[2], NULL, 10); |
1545 | 0 | if (ms_hist < 0) { |
1546 | 0 | ms_hist = 0; |
1547 | 0 | } else if (ms_hist > HISTORY_LEN) { |
1548 | 0 | ms_hist = HISTORY_LEN; |
1549 | 0 | } |
1550 | 0 | argc -= 2; |
1551 | 0 | argv += 2; |
1552 | 0 | } else { |
1553 | 0 | break; |
1554 | 0 | } |
1555 | 0 | } |
1556 | 0 | par.iter_hist_len = it_hist; |
1557 | 0 | par.ms_hist_len = ms_hist; |
1558 | 0 | par.command_type = PMD_INFO_PERF_SHOW; |
1559 | 0 | dpif_netdev_pmd_info(conn, argc, argv, &par); |
1560 | 0 | } |
1561 | | |
1562 | | static void |
1563 | | dpif_netdev_bond_show(struct unixctl_conn *conn, int argc, |
1564 | | const char *argv[], void *aux OVS_UNUSED) |
1565 | 0 | { |
1566 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1567 | 0 | struct dp_netdev *dp = NULL; |
1568 | |
|
1569 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1570 | 0 | if (argc == 2) { |
1571 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
1572 | 0 | } else if (shash_count(&dp_netdevs) == 1) { |
1573 | | /* There's only one datapath. */ |
1574 | 0 | dp = shash_first(&dp_netdevs)->data; |
1575 | 0 | } |
1576 | 0 | if (!dp) { |
1577 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1578 | 0 | unixctl_command_reply_error(conn, |
1579 | 0 | "please specify an existing datapath"); |
1580 | 0 | return; |
1581 | 0 | } |
1582 | | |
1583 | 0 | if (cmap_count(&dp->tx_bonds) > 0) { |
1584 | 0 | struct tx_bond *dp_bond_entry; |
1585 | |
|
1586 | 0 | ds_put_cstr(&reply, "Bonds:\n"); |
1587 | 0 | CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) { |
1588 | 0 | ds_put_format(&reply, " bond-id %"PRIu32":\n", |
1589 | 0 | dp_bond_entry->bond_id); |
1590 | 0 | for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) { |
1591 | 0 | uint32_t member_id = odp_to_u32( |
1592 | 0 | dp_bond_entry->member_buckets[bucket].member_id); |
1593 | 0 | ds_put_format(&reply, |
1594 | 0 | " bucket %d - member %"PRIu32"\n", |
1595 | 0 | bucket, member_id); |
1596 | 0 | } |
1597 | 0 | } |
1598 | 0 | } |
1599 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1600 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1601 | 0 | ds_destroy(&reply); |
1602 | 0 | } |
1603 | | |
1604 | | |
1605 | | static int |
1606 | | dpif_netdev_init(void) |
1607 | 0 | { |
1608 | 0 | static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS, |
1609 | 0 | clear_aux = PMD_INFO_CLEAR_STATS, |
1610 | 0 | poll_aux = PMD_INFO_SHOW_RXQ; |
1611 | |
|
1612 | 0 | unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]", |
1613 | 0 | 0, 3, dpif_netdev_pmd_info, |
1614 | 0 | (void *)&show_aux); |
1615 | 0 | unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]", |
1616 | 0 | 0, 3, dpif_netdev_pmd_info, |
1617 | 0 | (void *)&clear_aux); |
1618 | 0 | unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] " |
1619 | 0 | "[-secs secs] [dp]", |
1620 | 0 | 0, 5, dpif_netdev_pmd_info, |
1621 | 0 | (void *)&poll_aux); |
1622 | 0 | unixctl_command_register("dpif-netdev/pmd-perf-show", |
1623 | 0 | "[-nh] [-it iter-history-len]" |
1624 | 0 | " [-ms ms-history-len]" |
1625 | 0 | " [-pmd core] [dp]", |
1626 | 0 | 0, 8, pmd_perf_show_cmd, |
1627 | 0 | NULL); |
1628 | 0 | unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]", |
1629 | 0 | 0, 1, dpif_netdev_pmd_rebalance, |
1630 | 0 | NULL); |
1631 | 0 | unixctl_command_register("dpif-netdev/pmd-perf-log-set", |
1632 | 0 | "on|off [-b before] [-a after] [-e|-ne] " |
1633 | 0 | "[-us usec] [-q qlen]", |
1634 | 0 | 0, 10, pmd_perf_log_set_cmd, |
1635 | 0 | NULL); |
1636 | 0 | unixctl_command_register("dpif-netdev/bond-show", "[dp]", |
1637 | 0 | 0, 1, dpif_netdev_bond_show, |
1638 | 0 | NULL); |
1639 | 0 | unixctl_command_register("dpif-netdev/subtable-lookup-prio-set", |
1640 | 0 | "[lookup_func] [prio]", |
1641 | 0 | 2, 2, dpif_netdev_subtable_lookup_set, |
1642 | 0 | NULL); |
1643 | 0 | unixctl_command_register("dpif-netdev/subtable-lookup-info-get", "", |
1644 | 0 | 0, 0, dpif_netdev_subtable_lookup_get, |
1645 | 0 | NULL); |
1646 | 0 | unixctl_command_register("dpif-netdev/subtable-lookup-prio-get", NULL, |
1647 | 0 | 0, 0, dpif_netdev_subtable_lookup_get, |
1648 | 0 | NULL); |
1649 | 0 | unixctl_command_register("dpif-netdev/dpif-impl-set", |
1650 | 0 | "dpif_implementation_name", |
1651 | 0 | 1, 1, dpif_netdev_impl_set, |
1652 | 0 | NULL); |
1653 | 0 | unixctl_command_register("dpif-netdev/dpif-impl-get", "", |
1654 | 0 | 0, 0, dpif_netdev_impl_get, |
1655 | 0 | NULL); |
1656 | 0 | unixctl_command_register("dpif-netdev/miniflow-parser-set", |
1657 | 0 | "[-pmd core] miniflow_implementation_name" |
1658 | 0 | " [study_pkt_cnt]", |
1659 | 0 | 1, 5, dpif_miniflow_extract_impl_set, |
1660 | 0 | NULL); |
1661 | 0 | unixctl_command_register("dpif-netdev/miniflow-parser-get", "", |
1662 | 0 | 0, 0, dpif_miniflow_extract_impl_get, |
1663 | 0 | NULL); |
1664 | 0 | return 0; |
1665 | 0 | } |
1666 | | |
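/*
 * Illustrative sketch (not part of the OVS sources): the pmd-stats-show,
 * pmd-stats-clear and pmd-rxq-show commands registered above all reuse the
 * same handler, dpif_netdev_pmd_info(), and tell it what to do through the
 * 'aux' pointer, which points at a static enum value.  A minimal standalone
 * version of that dispatch pattern, using hypothetical names, could look
 * like this:
 */
#include <stdio.h>

enum cmd_kind { CMD_SHOW, CMD_CLEAR };

/* One callback serves several commands; 'aux' selects the behavior. */
static void
handle_cmd(const char *cmd_name, void *aux)
{
    enum cmd_kind kind = *(enum cmd_kind *) aux;

    if (kind == CMD_SHOW) {
        printf("%s: showing stats\n", cmd_name);
    } else {
        printf("%s: clearing stats\n", cmd_name);
    }
}

int
main(void)
{
    static enum cmd_kind show_aux = CMD_SHOW, clear_aux = CMD_CLEAR;

    /* A registration step would normally pair each command name with its
     * aux pointer; here the handler is simply called directly. */
    handle_cmd("stats-show", &show_aux);
    handle_cmd("stats-clear", &clear_aux);
    return 0;
}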
1667 | | static int |
1668 | | dpif_netdev_enumerate(struct sset *all_dps, |
1669 | | const struct dpif_class *dpif_class) |
1670 | 0 | { |
1671 | 0 | struct shash_node *node; |
1672 | |
|
1673 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1674 | 0 | SHASH_FOR_EACH(node, &dp_netdevs) { |
1675 | 0 | struct dp_netdev *dp = node->data; |
1676 | 0 | if (dpif_class != dp->class) { |
1677 | | /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs. |
1678 | | * If the class doesn't match, skip this dpif. */ |
1679 | 0 | continue; |
1680 | 0 | } |
1681 | 0 | sset_add(all_dps, node->name); |
1682 | 0 | } |
1683 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1684 | |
|
1685 | 0 | return 0; |
1686 | 0 | } |
1687 | | |
1688 | | static bool |
1689 | | dpif_netdev_class_is_dummy(const struct dpif_class *class) |
1690 | 0 | { |
1691 | 0 | return class != &dpif_netdev_class; |
1692 | 0 | } |
1693 | | |
1694 | | static const char * |
1695 | | dpif_netdev_port_open_type(const struct dpif_class *class, const char *type) |
1696 | 0 | { |
1697 | 0 | return strcmp(type, "internal") ? type |
1698 | 0 | : dpif_netdev_class_is_dummy(class) ? "dummy-internal" |
1699 | 0 | : "tap"; |
1700 | 0 | } |
1701 | | |
1702 | | static struct dpif * |
1703 | | create_dpif_netdev(struct dp_netdev *dp) |
1704 | 0 | { |
1705 | 0 | uint16_t netflow_id = hash_string(dp->name, 0); |
1706 | 0 | struct dpif_netdev *dpif; |
1707 | |
|
1708 | 0 | ovs_refcount_ref(&dp->ref_cnt); |
1709 | |
|
1710 | 0 | dpif = xmalloc(sizeof *dpif); |
1711 | 0 | dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id); |
1712 | 0 | dpif->dp = dp; |
1713 | 0 | dpif->last_port_seq = seq_read(dp->port_seq); |
1714 | |
|
1715 | 0 | return &dpif->dpif; |
1716 | 0 | } |
1717 | | |
1718 | | /* Choose an unused, non-zero port number and return it on success. |
1719 | | * Return ODPP_NONE on failure. */ |
1720 | | static odp_port_t |
1721 | | choose_port(struct dp_netdev *dp, const char *name) |
1722 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
1723 | 0 | { |
1724 | 0 | uint32_t port_no; |
1725 | |
|
1726 | 0 | if (dp->class != &dpif_netdev_class) { |
1727 | 0 | const char *p; |
1728 | 0 | int start_no = 0; |
1729 | | |
1730 | | /* If the port name begins with "br", start the number search at |
1731 | | * 100 to make writing tests easier. */ |
1732 | 0 | if (!strncmp(name, "br", 2)) { |
1733 | 0 | start_no = 100; |
1734 | 0 | } |
1735 | | |
1736 | | /* If the port name contains a number, try to assign that port number. |
1737 | | * This can make writing unit tests easier because port numbers are |
1738 | | * predictable. */ |
1739 | 0 | for (p = name; *p != '\0'; p++) { |
1740 | 0 | if (isdigit((unsigned char) *p)) { |
1741 | 0 | port_no = start_no + strtol(p, NULL, 10); |
1742 | 0 | if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE) |
1743 | 0 | && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) { |
1744 | 0 | return u32_to_odp(port_no); |
1745 | 0 | } |
1746 | 0 | break; |
1747 | 0 | } |
1748 | 0 | } |
1749 | 0 | } |
1750 | | |
1751 | 0 | for (port_no = 1; port_no <= UINT16_MAX; port_no++) { |
1752 | 0 | if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) { |
1753 | 0 | return u32_to_odp(port_no); |
1754 | 0 | } |
1755 | 0 | } |
1756 | | |
1757 | 0 | return ODPP_NONE; |
1758 | 0 | } |
1759 | | |
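/*
 * Illustrative sketch (not part of the OVS sources): a standalone version of
 * the test-friendly numbering heuristic used by choose_port() above, without
 * the datapath collision checks.  Names starting with "br" begin counting at
 * 100, and the first run of digits in the name supplies the offset.
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long
guess_port_no(const char *name)
{
    long start_no = !strncmp(name, "br", 2) ? 100 : 0;

    for (const char *p = name; *p != '\0'; p++) {
        if (isdigit((unsigned char) *p)) {
            return start_no + strtol(p, NULL, 10);
        }
    }
    return -1;   /* No digits: choose_port() falls back to a linear scan. */
}

int
main(void)
{
    printf("br1  -> %ld\n", guess_port_no("br1"));   /* 101 */
    printf("eth2 -> %ld\n", guess_port_no("eth2"));  /*   2 */
    printf("tap  -> %ld\n", guess_port_no("tap"));   /*  -1 */
    return 0;
}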
1760 | | static uint32_t |
1761 | | dp_meter_hash(uint32_t meter_id) |
1762 | 0 | { |
1763 | | /* In the ofproto-dpif layer, the id-pool allocates meter ids in
1764 | | * order (e.g. 1, 2, ... N), which provides a good hash
1765 | | * distribution. Use them directly instead of a hash_xxx() function
1766 | | * to achieve high performance. */
1767 | 0 | return meter_id; |
1768 | 0 | } |
1769 | | |
1770 | | static void |
1771 | | dp_netdev_meter_destroy(struct dp_netdev *dp) |
1772 | 0 | { |
1773 | 0 | struct dp_meter *m; |
1774 | |
|
1775 | 0 | ovs_mutex_lock(&dp->meters_lock); |
1776 | 0 | CMAP_FOR_EACH (m, node, &dp->meters) { |
1777 | 0 | cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id)); |
1778 | 0 | ovsrcu_postpone(free, m); |
1779 | 0 | } |
1780 | |
|
1781 | 0 | cmap_destroy(&dp->meters); |
1782 | 0 | ovs_mutex_unlock(&dp->meters_lock); |
1783 | 0 | ovs_mutex_destroy(&dp->meters_lock); |
1784 | 0 | } |
1785 | | |
1786 | | static struct dp_meter * |
1787 | | dp_meter_lookup(struct cmap *meters, uint32_t meter_id) |
1788 | 0 | { |
1789 | 0 | uint32_t hash = dp_meter_hash(meter_id); |
1790 | 0 | struct dp_meter *m; |
1791 | |
|
1792 | 0 | CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) { |
1793 | 0 | if (m->id == meter_id) { |
1794 | 0 | return m; |
1795 | 0 | } |
1796 | 0 | } |
1797 | | |
1798 | 0 | return NULL; |
1799 | 0 | } |
1800 | | |
1801 | | static void |
1802 | | dp_meter_detach_free(struct cmap *meters, uint32_t meter_id) |
1803 | 0 | { |
1804 | 0 | struct dp_meter *m = dp_meter_lookup(meters, meter_id); |
1805 | |
|
1806 | 0 | if (m) { |
1807 | 0 | cmap_remove(meters, &m->node, dp_meter_hash(meter_id)); |
1808 | 0 | ovsrcu_postpone(free, m); |
1809 | 0 | } |
1810 | 0 | } |
1811 | | |
1812 | | static void |
1813 | | dp_meter_attach(struct cmap *meters, struct dp_meter *meter) |
1814 | 0 | { |
1815 | 0 | cmap_insert(meters, &meter->node, dp_meter_hash(meter->id)); |
1816 | 0 | } |
1817 | | |
1818 | | static int |
1819 | | create_dp_netdev(const char *name, const struct dpif_class *class, |
1820 | | struct dp_netdev **dpp) |
1821 | | OVS_REQUIRES(dp_netdev_mutex) |
1822 | 0 | { |
1823 | 0 | static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER; |
1824 | 0 | struct dp_netdev *dp; |
1825 | 0 | int error; |
1826 | | |
1827 | | /* Avoid estimating TSC frequency for dummy datapath to not slow down |
1828 | | * unit tests. */ |
1829 | 0 | if (!dpif_netdev_class_is_dummy(class) |
1830 | 0 | && ovsthread_once_start(&tsc_freq_check)) { |
1831 | 0 | pmd_perf_estimate_tsc_frequency(); |
1832 | 0 | ovsthread_once_done(&tsc_freq_check); |
1833 | 0 | } |
1834 | |
|
1835 | 0 | dp = xzalloc(sizeof *dp); |
1836 | 0 | shash_add(&dp_netdevs, name, dp); |
1837 | |
|
1838 | 0 | *CONST_CAST(const struct dpif_class **, &dp->class) = class; |
1839 | 0 | *CONST_CAST(const char **, &dp->name) = xstrdup(name); |
1840 | 0 | ovs_refcount_init(&dp->ref_cnt); |
1841 | 0 | atomic_flag_clear(&dp->destroyed); |
1842 | |
|
1843 | 0 | ovs_rwlock_init(&dp->port_rwlock); |
1844 | 0 | hmap_init(&dp->ports); |
1845 | 0 | dp->port_seq = seq_create(); |
1846 | 0 | ovs_mutex_init(&dp->bond_mutex); |
1847 | 0 | cmap_init(&dp->tx_bonds); |
1848 | |
|
1849 | 0 | fat_rwlock_init(&dp->upcall_rwlock); |
1850 | |
|
1851 | 0 | dp->reconfigure_seq = seq_create(); |
1852 | 0 | dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq); |
1853 | | |
1854 | | /* Init meter resources. */ |
1855 | 0 | cmap_init(&dp->meters); |
1856 | 0 | ovs_mutex_init(&dp->meters_lock); |
1857 | | |
1858 | | /* Disable upcalls by default. */ |
1859 | 0 | dp_netdev_disable_upcall(dp); |
1860 | 0 | dp->upcall_aux = NULL; |
1861 | 0 | dp->upcall_cb = NULL; |
1862 | |
|
1863 | 0 | dp->conntrack = conntrack_init(); |
1864 | |
|
1865 | 0 | dpif_miniflow_extract_init(); |
1866 | |
|
1867 | 0 | atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN); |
1868 | 0 | atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL); |
1869 | |
|
1870 | 0 | cmap_init(&dp->poll_threads); |
1871 | 0 | dp->pmd_rxq_assign_type = SCHED_CYCLES; |
1872 | |
|
1873 | 0 | ovs_mutex_init(&dp->tx_qid_pool_mutex); |
1874 | | /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */ |
1875 | 0 | dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1); |
1876 | |
|
1877 | 0 | ovs_mutex_init_recursive(&dp->non_pmd_mutex); |
1878 | 0 | ovsthread_key_create(&dp->per_pmd_key, NULL); |
1879 | |
|
1880 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
1881 | | /* The non-PMD thread will be created before all other threads and
1882 | | * will allocate static_tx_qid = 0. */
1883 | 0 | dp_netdev_set_nonpmd(dp); |
1884 | |
|
1885 | 0 | error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class, |
1886 | 0 | "internal"), |
1887 | 0 | ODPP_LOCAL); |
1888 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1889 | 0 | if (error) { |
1890 | 0 | dp_netdev_free(dp); |
1891 | 0 | return error; |
1892 | 0 | } |
1893 | | |
1894 | 0 | dp->last_tnl_conf_seq = seq_read(tnl_conf_seq); |
1895 | 0 | *dpp = dp; |
1896 | 0 | return 0; |
1897 | 0 | } |
1898 | | |
1899 | | static void |
1900 | | dp_netdev_request_reconfigure(struct dp_netdev *dp) |
1901 | 0 | { |
1902 | 0 | seq_change(dp->reconfigure_seq); |
1903 | 0 | } |
1904 | | |
1905 | | static bool |
1906 | | dp_netdev_is_reconf_required(struct dp_netdev *dp) |
1907 | 0 | { |
1908 | 0 | return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq; |
1909 | 0 | } |
1910 | | |
1911 | | static int |
1912 | | dpif_netdev_open(const struct dpif_class *class, const char *name, |
1913 | | bool create, struct dpif **dpifp) |
1914 | 0 | { |
1915 | 0 | struct dp_netdev *dp; |
1916 | 0 | int error; |
1917 | |
|
1918 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1919 | 0 | dp = shash_find_data(&dp_netdevs, name); |
1920 | 0 | if (!dp) { |
1921 | 0 | error = create ? create_dp_netdev(name, class, &dp) : ENODEV; |
1922 | 0 | } else { |
1923 | 0 | error = (dp->class != class ? EINVAL |
1924 | 0 | : create ? EEXIST |
1925 | 0 | : 0); |
1926 | 0 | } |
1927 | 0 | if (!error) { |
1928 | 0 | *dpifp = create_dpif_netdev(dp); |
1929 | 0 | } |
1930 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1931 | |
|
1932 | 0 | return error; |
1933 | 0 | } |
1934 | | |
1935 | | static void |
1936 | | dp_netdev_destroy_upcall_lock(struct dp_netdev *dp) |
1937 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
1938 | 0 | { |
1939 | | /* Check that upcalls are disabled, i.e. that the rwlock is taken */ |
1940 | 0 | ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock)); |
1941 | | |
1942 | | /* Before freeing a lock we should release it */ |
1943 | 0 | fat_rwlock_unlock(&dp->upcall_rwlock); |
1944 | 0 | fat_rwlock_destroy(&dp->upcall_rwlock); |
1945 | 0 | } |
1946 | | |
1947 | | static uint32_t |
1948 | | hash_bond_id(uint32_t bond_id) |
1949 | 0 | { |
1950 | 0 | return hash_int(bond_id, 0); |
1951 | 0 | } |
1952 | | |
1953 | | /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp' |
1954 | | * through the 'dp_netdevs' shash while freeing 'dp'. */ |
1955 | | static void |
1956 | | dp_netdev_free(struct dp_netdev *dp) |
1957 | | OVS_REQUIRES(dp_netdev_mutex) |
1958 | 0 | { |
1959 | 0 | struct dp_netdev_port *port; |
1960 | 0 | struct tx_bond *bond; |
1961 | |
|
1962 | 0 | shash_find_and_delete(&dp_netdevs, dp->name); |
1963 | |
|
1964 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
1965 | 0 | HMAP_FOR_EACH_SAFE (port, node, &dp->ports) { |
1966 | 0 | do_del_port(dp, port); |
1967 | 0 | } |
1968 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1969 | |
|
1970 | 0 | ovs_mutex_lock(&dp->bond_mutex); |
1971 | 0 | CMAP_FOR_EACH (bond, node, &dp->tx_bonds) { |
1972 | 0 | cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id)); |
1973 | 0 | ovsrcu_postpone(free, bond); |
1974 | 0 | } |
1975 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
1976 | |
|
1977 | 0 | dp_netdev_destroy_all_pmds(dp, true); |
1978 | 0 | cmap_destroy(&dp->poll_threads); |
1979 | |
|
1980 | 0 | ovs_mutex_destroy(&dp->tx_qid_pool_mutex); |
1981 | 0 | id_pool_destroy(dp->tx_qid_pool); |
1982 | |
|
1983 | 0 | ovs_mutex_destroy(&dp->non_pmd_mutex); |
1984 | 0 | ovsthread_key_delete(dp->per_pmd_key); |
1985 | |
|
1986 | 0 | conntrack_destroy(dp->conntrack); |
1987 | | |
1988 | |
|
1989 | 0 | seq_destroy(dp->reconfigure_seq); |
1990 | |
|
1991 | 0 | seq_destroy(dp->port_seq); |
1992 | 0 | hmap_destroy(&dp->ports); |
1993 | 0 | ovs_rwlock_destroy(&dp->port_rwlock); |
1994 | |
|
1995 | 0 | cmap_destroy(&dp->tx_bonds); |
1996 | 0 | ovs_mutex_destroy(&dp->bond_mutex); |
1997 | | |
1998 | | /* Upcalls must be disabled at this point */ |
1999 | 0 | dp_netdev_destroy_upcall_lock(dp); |
2000 | |
|
2001 | 0 | dp_netdev_meter_destroy(dp); |
2002 | |
|
2003 | 0 | free(dp->pmd_cmask); |
2004 | 0 | free(CONST_CAST(char *, dp->name)); |
2005 | 0 | free(dp); |
2006 | 0 | } |
2007 | | |
2008 | | static void |
2009 | | dp_netdev_unref(struct dp_netdev *dp) |
2010 | 0 | { |
2011 | 0 | if (dp) { |
2012 | | /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't |
2013 | | * get a new reference to 'dp' through the 'dp_netdevs' shash. */ |
2014 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
2015 | 0 | if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) { |
2016 | 0 | dp_netdev_free(dp); |
2017 | 0 | } |
2018 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
2019 | 0 | } |
2020 | 0 | } |
2021 | | |
2022 | | static void |
2023 | | dpif_netdev_close(struct dpif *dpif) |
2024 | 0 | { |
2025 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2026 | |
|
2027 | 0 | dp_netdev_unref(dp); |
2028 | 0 | free(dpif); |
2029 | 0 | } |
2030 | | |
2031 | | static int |
2032 | | dpif_netdev_destroy(struct dpif *dpif) |
2033 | 0 | { |
2034 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2035 | |
|
2036 | 0 | if (!atomic_flag_test_and_set(&dp->destroyed)) { |
2037 | 0 | if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) { |
2038 | | /* Can't happen: 'dpif' still owns a reference to 'dp'. */ |
2039 | 0 | OVS_NOT_REACHED(); |
2040 | 0 | } |
2041 | 0 | } |
2042 | | |
2043 | 0 | return 0; |
2044 | 0 | } |
2045 | | |
2046 | | /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed |
2047 | | * load/store semantics. While the increment is not atomic, the load and |
2048 | | * store operations are, making it impossible to read inconsistent values. |
2049 | | * |
2050 | | * This is used to update thread local stats counters. */ |
2051 | | static void |
2052 | | non_atomic_ullong_add(atomic_ullong *var, unsigned long long n) |
2053 | 0 | { |
2054 | 0 | unsigned long long tmp; |
2055 | |
|
2056 | 0 | atomic_read_relaxed(var, &tmp); |
2057 | 0 | tmp += n; |
2058 | 0 | atomic_store_relaxed(var, tmp); |
2059 | 0 | } |
2060 | | |
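/*
 * Illustrative sketch (not part of the OVS sources): the same single-writer
 * counter idea as non_atomic_ullong_add() above, expressed with plain C11
 * atomics.  Because only the owning thread ever writes the counter, the
 * load+add+store does not need to be one atomic read-modify-write; relaxed
 * atomic loads and stores are enough to keep concurrent readers from ever
 * observing a torn value.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ullong counter;   /* Zero-initialized at program start. */

/* Called only from the thread that owns 'counter'. */
static void
counter_add(unsigned long long n)
{
    unsigned long long tmp = atomic_load_explicit(&counter,
                                                  memory_order_relaxed);
    atomic_store_explicit(&counter, tmp + n, memory_order_relaxed);
}

/* Safe from any thread; may lag behind the writer, but never reads garbage. */
static unsigned long long
counter_read(void)
{
    return atomic_load_explicit(&counter, memory_order_relaxed);
}

int
main(void)
{
    counter_add(3);
    counter_add(4);
    printf("counter = %llu\n", counter_read());   /* 7 */
    return 0;
}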
2061 | | static int |
2062 | | dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats) |
2063 | 0 | { |
2064 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2065 | 0 | struct dp_netdev_pmd_thread *pmd; |
2066 | 0 | uint64_t pmd_stats[PMD_N_STATS]; |
2067 | |
|
2068 | 0 | stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0; |
2069 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
2070 | 0 | stats->n_flows += cmap_count(&pmd->flow_table); |
2071 | 0 | pmd_perf_read_counters(&pmd->perf_stats, pmd_stats); |
2072 | 0 | stats->n_hit += pmd_stats[PMD_STAT_PHWOL_HIT]; |
2073 | 0 | stats->n_hit += pmd_stats[PMD_STAT_SIMPLE_HIT]; |
2074 | 0 | stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT]; |
2075 | 0 | stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT]; |
2076 | 0 | stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT]; |
2077 | 0 | stats->n_missed += pmd_stats[PMD_STAT_MISS]; |
2078 | 0 | stats->n_lost += pmd_stats[PMD_STAT_LOST]; |
2079 | 0 | } |
2080 | 0 | stats->n_masks = UINT32_MAX; |
2081 | 0 | stats->n_mask_hit = UINT64_MAX; |
2082 | 0 | stats->n_cache_hit = UINT64_MAX; |
2083 | |
|
2084 | 0 | return 0; |
2085 | 0 | } |
2086 | | |
2087 | | static void |
2088 | | dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd) |
2089 | 0 | { |
2090 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
2091 | 0 | ovs_mutex_lock(&pmd->dp->non_pmd_mutex); |
2092 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
2093 | 0 | pmd_load_cached_ports(pmd); |
2094 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
2095 | 0 | ovs_mutex_unlock(&pmd->dp->non_pmd_mutex); |
2096 | 0 | return; |
2097 | 0 | } |
2098 | | |
2099 | 0 | seq_change(pmd->reload_seq); |
2100 | 0 | atomic_store_explicit(&pmd->reload, true, memory_order_release); |
2101 | 0 | } |
2102 | | |
2103 | | static uint32_t |
2104 | | hash_port_no(odp_port_t port_no) |
2105 | 0 | { |
2106 | 0 | return hash_int(odp_to_u32(port_no), 0); |
2107 | 0 | } |
2108 | | |
2109 | | static int |
2110 | | port_create(const char *devname, const char *type, |
2111 | | odp_port_t port_no, struct dp_netdev_port **portp) |
2112 | 0 | { |
2113 | 0 | struct dp_netdev_port *port; |
2114 | 0 | enum netdev_flags flags; |
2115 | 0 | struct netdev *netdev; |
2116 | 0 | int error; |
2117 | |
|
2118 | 0 | *portp = NULL; |
2119 | | |
2120 | | /* Open and validate network device. */ |
2121 | 0 | error = netdev_open(devname, type, &netdev); |
2122 | 0 | if (error) { |
2123 | 0 | return error; |
2124 | 0 | } |
2125 | | /* XXX reject non-Ethernet devices */ |
2126 | | |
2127 | 0 | netdev_get_flags(netdev, &flags); |
2128 | 0 | if (flags & NETDEV_LOOPBACK) { |
2129 | 0 | VLOG_ERR("%s: cannot add a loopback device", devname); |
2130 | 0 | error = EINVAL; |
2131 | 0 | goto out; |
2132 | 0 | } |
2133 | | |
2134 | 0 | port = xzalloc(sizeof *port); |
2135 | 0 | port->port_no = port_no; |
2136 | 0 | port->netdev = netdev; |
2137 | 0 | port->type = xstrdup(type); |
2138 | 0 | port->sf = NULL; |
2139 | 0 | port->emc_enabled = true; |
2140 | 0 | port->need_reconfigure = true; |
2141 | 0 | ovs_mutex_init(&port->txq_used_mutex); |
2142 | |
|
2143 | 0 | *portp = port; |
2144 | |
|
2145 | 0 | return 0; |
2146 | | |
2147 | 0 | out: |
2148 | 0 | netdev_close(netdev); |
2149 | 0 | return error; |
2150 | 0 | } |
2151 | | |
2152 | | static int |
2153 | | do_add_port(struct dp_netdev *dp, const char *devname, const char *type, |
2154 | | odp_port_t port_no) |
2155 | | OVS_REQ_WRLOCK(dp->port_rwlock) |
2156 | 0 | { |
2157 | 0 | struct netdev_saved_flags *sf; |
2158 | 0 | struct dp_netdev_port *port; |
2159 | 0 | int error; |
2160 | | |
2161 | | /* Reject devices already in 'dp'. */ |
2162 | 0 | if (!get_port_by_name(dp, devname, &port)) { |
2163 | 0 | return EEXIST; |
2164 | 0 | } |
2165 | | |
2166 | 0 | error = port_create(devname, type, port_no, &port); |
2167 | 0 | if (error) { |
2168 | 0 | return error; |
2169 | 0 | } |
2170 | | |
2171 | 0 | hmap_insert(&dp->ports, &port->node, hash_port_no(port_no)); |
2172 | 0 | seq_change(dp->port_seq); |
2173 | |
|
2174 | 0 | reconfigure_datapath(dp); |
2175 | | |
2176 | | /* Check that port was successfully configured. */ |
2177 | 0 | if (!dp_netdev_lookup_port(dp, port_no)) { |
2178 | 0 | return EINVAL; |
2179 | 0 | } |
2180 | | |
2181 | | /* Updating device flags triggers an if_notifier, which triggers a bridge |
2182 | | * reconfiguration and another attempt to add this port, leading to an |
2183 | | * infinite loop if the device is configured incorrectly and cannot be |
2184 | | * added. Set the promisc mode only after a successful reconfiguration,
2185 | | * since by then we already know that the device is properly configured. */
2186 | 0 | error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf); |
2187 | 0 | if (error) { |
2188 | 0 | VLOG_ERR("%s: cannot set promisc flag", devname); |
2189 | 0 | do_del_port(dp, port); |
2190 | 0 | return error; |
2191 | 0 | } |
2192 | 0 | port->sf = sf; |
2193 | |
|
2194 | 0 | return 0; |
2195 | 0 | } |
2196 | | |
2197 | | static int |
2198 | | dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev, |
2199 | | odp_port_t *port_nop) |
2200 | 0 | { |
2201 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2202 | 0 | char namebuf[NETDEV_VPORT_NAME_BUFSIZE]; |
2203 | 0 | const char *dpif_port; |
2204 | 0 | odp_port_t port_no; |
2205 | 0 | int error; |
2206 | |
|
2207 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
2208 | 0 | dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf); |
2209 | 0 | if (*port_nop != ODPP_NONE) { |
2210 | 0 | port_no = *port_nop; |
2211 | 0 | error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0; |
2212 | 0 | } else { |
2213 | 0 | port_no = choose_port(dp, dpif_port); |
2214 | 0 | error = port_no == ODPP_NONE ? EFBIG : 0; |
2215 | 0 | } |
2216 | 0 | if (!error) { |
2217 | 0 | *port_nop = port_no; |
2218 | 0 | error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no); |
2219 | 0 | } |
2220 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2221 | |
|
2222 | 0 | return error; |
2223 | 0 | } |
2224 | | |
2225 | | static int |
2226 | | dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no) |
2227 | 0 | { |
2228 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2229 | 0 | int error; |
2230 | |
|
2231 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
2232 | 0 | if (port_no == ODPP_LOCAL) { |
2233 | 0 | error = EINVAL; |
2234 | 0 | } else { |
2235 | 0 | struct dp_netdev_port *port; |
2236 | |
|
2237 | 0 | error = get_port_by_number(dp, port_no, &port); |
2238 | 0 | if (!error) { |
2239 | 0 | do_del_port(dp, port); |
2240 | 0 | } |
2241 | 0 | } |
2242 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2243 | |
|
2244 | 0 | return error; |
2245 | 0 | } |
2246 | | |
2247 | | static bool |
2248 | | is_valid_port_number(odp_port_t port_no) |
2249 | 0 | { |
2250 | 0 | return port_no != ODPP_NONE; |
2251 | 0 | } |
2252 | | |
2253 | | static struct dp_netdev_port * |
2254 | | dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no) |
2255 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
2256 | 0 | { |
2257 | 0 | struct dp_netdev_port *port; |
2258 | |
|
2259 | 0 | HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) { |
2260 | 0 | if (port->port_no == port_no) { |
2261 | 0 | return port; |
2262 | 0 | } |
2263 | 0 | } |
2264 | 0 | return NULL; |
2265 | 0 | } |
2266 | | |
2267 | | static int |
2268 | | get_port_by_number(struct dp_netdev *dp, |
2269 | | odp_port_t port_no, struct dp_netdev_port **portp) |
2270 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
2271 | 0 | { |
2272 | 0 | if (!is_valid_port_number(port_no)) { |
2273 | 0 | *portp = NULL; |
2274 | 0 | return EINVAL; |
2275 | 0 | } else { |
2276 | 0 | *portp = dp_netdev_lookup_port(dp, port_no); |
2277 | 0 | return *portp ? 0 : ENODEV; |
2278 | 0 | } |
2279 | 0 | } |
2280 | | |
2281 | | static void |
2282 | | port_destroy(struct dp_netdev_port *port) |
2283 | 0 | { |
2284 | 0 | if (!port) { |
2285 | 0 | return; |
2286 | 0 | } |
2287 | | |
2288 | 0 | netdev_close(port->netdev); |
2289 | 0 | netdev_restore_flags(port->sf); |
2290 | |
|
2291 | 0 | for (unsigned i = 0; i < port->n_rxq; i++) { |
2292 | 0 | netdev_rxq_close(port->rxqs[i].rx); |
2293 | 0 | } |
2294 | 0 | ovs_mutex_destroy(&port->txq_used_mutex); |
2295 | 0 | free(port->rxq_affinity_list); |
2296 | 0 | free(port->txq_used); |
2297 | 0 | free(port->rxqs); |
2298 | 0 | free(port->type); |
2299 | 0 | free(port); |
2300 | 0 | } |
2301 | | |
2302 | | static int |
2303 | | get_port_by_name(struct dp_netdev *dp, |
2304 | | const char *devname, struct dp_netdev_port **portp) |
2305 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
2306 | 0 | { |
2307 | 0 | struct dp_netdev_port *port; |
2308 | |
|
2309 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
2310 | 0 | if (!strcmp(netdev_get_name(port->netdev), devname)) { |
2311 | 0 | *portp = port; |
2312 | 0 | return 0; |
2313 | 0 | } |
2314 | 0 | } |
2315 | | |
2316 | | /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2317 | | * non-existent port. */
2318 | 0 | return ENODEV; |
2319 | 0 | } |
2320 | | |
2321 | | /* Returns 'true' if there is a port with pmd netdev. */ |
2322 | | static bool |
2323 | | has_pmd_port(struct dp_netdev *dp) |
2324 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
2325 | 0 | { |
2326 | 0 | struct dp_netdev_port *port; |
2327 | |
|
2328 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
2329 | 0 | if (netdev_is_pmd(port->netdev)) { |
2330 | 0 | return true; |
2331 | 0 | } |
2332 | 0 | } |
2333 | | |
2334 | 0 | return false; |
2335 | 0 | } |
2336 | | |
2337 | | static void |
2338 | | do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port) |
2339 | | OVS_REQ_WRLOCK(dp->port_rwlock) |
2340 | 0 | { |
2341 | 0 | hmap_remove(&dp->ports, &port->node); |
2342 | 0 | seq_change(dp->port_seq); |
2343 | |
|
2344 | 0 | reconfigure_datapath(dp); |
2345 | | |
2346 | | /* Flush and disable offloads only after 'port' has been made |
2347 | | * inaccessible through datapath reconfiguration. |
2348 | | * This prevents PMDs from enqueuing offload requests after
2349 | | * the flush. |
2350 | | * When only this port is deleted instead of the whole datapath, |
2351 | | * revalidator threads are still active and can still enqueue |
2352 | | * offload modification or deletion. Managing those stray requests |
2353 | | * is done in the offload threads. */ |
2354 | 0 | dp_netdev_offload_flush(dp, port); |
2355 | 0 | netdev_uninit_flow_api(port->netdev); |
2356 | |
|
2357 | 0 | port_destroy(port); |
2358 | 0 | } |
2359 | | |
2360 | | static void |
2361 | | answer_port_query(const struct dp_netdev_port *port, |
2362 | | struct dpif_port *dpif_port) |
2363 | 0 | { |
2364 | 0 | dpif_port->name = xstrdup(netdev_get_name(port->netdev)); |
2365 | 0 | dpif_port->type = xstrdup(port->type); |
2366 | 0 | dpif_port->port_no = port->port_no; |
2367 | 0 | } |
2368 | | |
2369 | | static int |
2370 | | dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no, |
2371 | | struct dpif_port *dpif_port) |
2372 | 0 | { |
2373 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2374 | 0 | struct dp_netdev_port *port; |
2375 | 0 | int error; |
2376 | |
|
2377 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
2378 | 0 | error = get_port_by_number(dp, port_no, &port); |
2379 | 0 | if (!error && dpif_port) { |
2380 | 0 | answer_port_query(port, dpif_port); |
2381 | 0 | } |
2382 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2383 | |
|
2384 | 0 | return error; |
2385 | 0 | } |
2386 | | |
2387 | | static int |
2388 | | dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname, |
2389 | | struct dpif_port *dpif_port) |
2390 | 0 | { |
2391 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2392 | 0 | struct dp_netdev_port *port; |
2393 | 0 | int error; |
2394 | |
|
2395 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
2396 | 0 | error = get_port_by_name(dp, devname, &port); |
2397 | 0 | if (!error && dpif_port) { |
2398 | 0 | answer_port_query(port, dpif_port); |
2399 | 0 | } |
2400 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2401 | |
|
2402 | 0 | return error; |
2403 | 0 | } |
2404 | | |
2405 | | static void |
2406 | | dp_netdev_flow_free(struct dp_netdev_flow *flow) |
2407 | 0 | { |
2408 | 0 | dp_netdev_actions_free(dp_netdev_flow_get_actions(flow)); |
2409 | 0 | free(flow->dp_extra_info); |
2410 | 0 | free(flow); |
2411 | 0 | } |
2412 | | |
2413 | | void dp_netdev_flow_unref(struct dp_netdev_flow *flow) |
2414 | 0 | { |
2415 | 0 | if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) { |
2416 | 0 | ovsrcu_postpone(dp_netdev_flow_free, flow); |
2417 | 0 | } |
2418 | 0 | } |
2419 | | |
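/*
 * Illustrative sketch (not part of the OVS sources): the unref pattern used
 * by dp_netdev_flow_unref() above, in standalone form.  The holder that
 * drops the last reference triggers destruction; in OVS the actual free() is
 * additionally deferred through ovsrcu_postpone() so that lockless RCU
 * readers still traversing the flow table never touch freed memory.  That
 * deferral is omitted here.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_flow {
    atomic_uint ref_cnt;
    int id;
};

static struct toy_flow *
toy_flow_create(int id)
{
    struct toy_flow *flow = malloc(sizeof *flow);

    atomic_init(&flow->ref_cnt, 1);
    flow->id = id;
    return flow;
}

static void
toy_flow_ref(struct toy_flow *flow)
{
    atomic_fetch_add_explicit(&flow->ref_cnt, 1, memory_order_relaxed);
}

static void
toy_flow_unref(struct toy_flow *flow)
{
    /* fetch_sub returns the old value: 1 means this was the last reference. */
    if (atomic_fetch_sub_explicit(&flow->ref_cnt, 1,
                                  memory_order_acq_rel) == 1) {
        printf("flow %d destroyed\n", flow->id);
        free(flow);   /* OVS would ovsrcu_postpone(free, flow) instead. */
    }
}

int
main(void)
{
    struct toy_flow *flow = toy_flow_create(7);

    toy_flow_ref(flow);     /* Two holders now. */
    toy_flow_unref(flow);   /* One holder left; nothing freed yet. */
    toy_flow_unref(flow);   /* Last holder: prints and frees. */
    return 0;
}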
2420 | | inline struct dpcls * |
2421 | | dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd, |
2422 | | odp_port_t in_port) |
2423 | 0 | { |
2424 | 0 | struct dpcls *cls; |
2425 | 0 | uint32_t hash = hash_port_no(in_port); |
2426 | 0 | CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) { |
2427 | 0 | if (cls->in_port == in_port) { |
2428 | | /* Port classifier exists already */ |
2429 | 0 | return cls; |
2430 | 0 | } |
2431 | 0 | } |
2432 | 0 | return NULL; |
2433 | 0 | } |
2434 | | |
2435 | | static inline struct dpcls * |
2436 | | dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd, |
2437 | | odp_port_t in_port) |
2438 | | OVS_REQUIRES(pmd->flow_mutex) |
2439 | 0 | { |
2440 | 0 | struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
2441 | |
|
2442 | 0 | if (!cls) { |
2443 | 0 | uint32_t hash = hash_port_no(in_port); |
2444 | | |
2445 | | /* Create new classifier for in_port */ |
2446 | 0 | cls = xmalloc(sizeof(*cls)); |
2447 | 0 | dpcls_init(cls); |
2448 | 0 | cls->in_port = in_port; |
2449 | 0 | cmap_insert(&pmd->classifiers, &cls->node, hash); |
2450 | 0 | VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port); |
2451 | 0 | } |
2452 | 0 | return cls; |
2453 | 0 | } |
2454 | | |
2455 | 0 | #define MAX_FLOW_MARK (UINT32_MAX - 1) |
2456 | 0 | #define INVALID_FLOW_MARK 0 |
2457 | | /* A zero flow mark tells the HW to remove the mark. A packet marked
2458 | | * with a zero mark is received in SW without any mark at all, so zero
2459 | | * cannot be used as a valid mark.
2460 | | */ |
2461 | | |
2462 | | struct megaflow_to_mark_data { |
2463 | | const struct cmap_node node; |
2464 | | ovs_u128 mega_ufid; |
2465 | | uint32_t mark; |
2466 | | }; |
2467 | | |
2468 | | static struct id_fpool *flow_mark_pool; |
2469 | | |
2470 | | static uint32_t |
2471 | | flow_mark_alloc(void) |
2472 | 0 | { |
2473 | 0 | static struct ovsthread_once init_once = OVSTHREAD_ONCE_INITIALIZER; |
2474 | 0 | unsigned int tid = netdev_offload_thread_id(); |
2475 | 0 | uint32_t mark; |
2476 | |
|
2477 | 0 | if (ovsthread_once_start(&init_once)) { |
2478 | | /* Hasn't been initialized yet; do it here. */
2479 | 0 | flow_mark_pool = id_fpool_create(netdev_offload_thread_nb(), |
2480 | 0 | 1, MAX_FLOW_MARK); |
2481 | 0 | ovsthread_once_done(&init_once); |
2482 | 0 | } |
2483 | |
|
2484 | 0 | if (id_fpool_new_id(flow_mark_pool, tid, &mark)) { |
2485 | 0 | return mark; |
2486 | 0 | } |
2487 | | |
2488 | 0 | return INVALID_FLOW_MARK; |
2489 | 0 | } |
2490 | | |
2491 | | static void |
2492 | | flow_mark_free(uint32_t mark) |
2493 | 0 | { |
2494 | 0 | unsigned int tid = netdev_offload_thread_id(); |
2495 | |
|
2496 | 0 | id_fpool_free_id(flow_mark_pool, tid, mark); |
2497 | 0 | } |
2498 | | |
2499 | | /* Associate a megaflow with a mark, which is a 1:1 mapping. */
2500 | | static void |
2501 | | megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark) |
2502 | 0 | { |
2503 | 0 | size_t hash = dp_netdev_flow_hash(mega_ufid); |
2504 | 0 | struct megaflow_to_mark_data *data = xzalloc(sizeof(*data)); |
2505 | 0 | unsigned int tid = netdev_offload_thread_id(); |
2506 | |
|
2507 | 0 | data->mega_ufid = *mega_ufid; |
2508 | 0 | data->mark = mark; |
2509 | |
|
2510 | 0 | cmap_insert(&dp_offload_threads[tid].megaflow_to_mark, |
2511 | 0 | CONST_CAST(struct cmap_node *, &data->node), hash); |
2512 | 0 | } |
2513 | | |
2514 | | /* Disassociate a megaflow from its mark. */
2515 | | static void |
2516 | | megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid) |
2517 | 0 | { |
2518 | 0 | size_t hash = dp_netdev_flow_hash(mega_ufid); |
2519 | 0 | struct megaflow_to_mark_data *data; |
2520 | 0 | unsigned int tid = netdev_offload_thread_id(); |
2521 | |
|
2522 | 0 | CMAP_FOR_EACH_WITH_HASH (data, node, hash, |
2523 | 0 | &dp_offload_threads[tid].megaflow_to_mark) { |
2524 | 0 | if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) { |
2525 | 0 | cmap_remove(&dp_offload_threads[tid].megaflow_to_mark, |
2526 | 0 | CONST_CAST(struct cmap_node *, &data->node), hash); |
2527 | 0 | ovsrcu_postpone(free, data); |
2528 | 0 | return; |
2529 | 0 | } |
2530 | 0 | } |
2531 | | |
2532 | 0 | VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n", |
2533 | 0 | UUID_ARGS((struct uuid *)mega_ufid)); |
2534 | 0 | } |
2535 | | |
2536 | | static inline uint32_t |
2537 | | megaflow_to_mark_find(const ovs_u128 *mega_ufid) |
2538 | 0 | { |
2539 | 0 | size_t hash = dp_netdev_flow_hash(mega_ufid); |
2540 | 0 | struct megaflow_to_mark_data *data; |
2541 | 0 | unsigned int tid = netdev_offload_thread_id(); |
2542 | |
|
2543 | 0 | CMAP_FOR_EACH_WITH_HASH (data, node, hash, |
2544 | 0 | &dp_offload_threads[tid].megaflow_to_mark) { |
2545 | 0 | if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) { |
2546 | 0 | return data->mark; |
2547 | 0 | } |
2548 | 0 | } |
2549 | | |
2550 | 0 | VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n", |
2551 | 0 | UUID_ARGS((struct uuid *)mega_ufid)); |
2552 | 0 | return INVALID_FLOW_MARK; |
2553 | 0 | } |
2554 | | |
2555 | | /* Associate a mark with a flow, which is a 1:N mapping. */
2556 | | static void |
2557 | | mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow) |
2558 | 0 | { |
2559 | 0 | unsigned int tid = netdev_offload_thread_id(); |
2560 | 0 | dp_netdev_flow_ref(flow); |
2561 | |
|
2562 | 0 | cmap_insert(&dp_offload_threads[tid].mark_to_flow, |
2563 | 0 | CONST_CAST(struct cmap_node *, &flow->mark_node), |
2564 | 0 | hash_int(mark, 0)); |
2565 | 0 | flow->mark = mark; |
2566 | |
|
2567 | 0 | VLOG_DBG("Associated dp_netdev flow %p with mark %u mega_ufid "UUID_FMT, |
2568 | 0 | flow, mark, UUID_ARGS((struct uuid *) &flow->mega_ufid)); |
2569 | 0 | } |
2570 | | |
2571 | | static bool |
2572 | | flow_mark_has_no_ref(uint32_t mark) |
2573 | 0 | { |
2574 | 0 | unsigned int tid = netdev_offload_thread_id(); |
2575 | 0 | struct dp_netdev_flow *flow; |
2576 | |
|
2577 | 0 | CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0), |
2578 | 0 | &dp_offload_threads[tid].mark_to_flow) { |
2579 | 0 | if (flow->mark == mark) { |
2580 | 0 | return false; |
2581 | 0 | } |
2582 | 0 | } |
2583 | | |
2584 | 0 | return true; |
2585 | 0 | } |
2586 | | |
2587 | | static int |
2588 | | mark_to_flow_disassociate(struct dp_netdev *dp, |
2589 | | struct dp_netdev_flow *flow) |
2590 | 0 | { |
2591 | 0 | const char *dpif_type_str = dpif_normalize_type(dp->class->type); |
2592 | 0 | struct cmap_node *mark_node = CONST_CAST(struct cmap_node *, |
2593 | 0 | &flow->mark_node); |
2594 | 0 | unsigned int tid = netdev_offload_thread_id(); |
2595 | 0 | uint32_t mark = flow->mark; |
2596 | 0 | int ret = 0; |
2597 | | |
2598 | | /* INVALID_FLOW_MARK may mean that the flow has been disassociated or |
2599 | | * never associated. */ |
2600 | 0 | if (OVS_UNLIKELY(mark == INVALID_FLOW_MARK)) { |
2601 | 0 | return EINVAL; |
2602 | 0 | } |
2603 | | |
2604 | 0 | cmap_remove(&dp_offload_threads[tid].mark_to_flow, |
2605 | 0 | mark_node, hash_int(mark, 0)); |
2606 | 0 | flow->mark = INVALID_FLOW_MARK; |
2607 | | |
2608 | | /* |
2609 | | * If no flow is referencing the mark any more, remove the
2610 | | * flow from hardware and free the mark.
2611 | | */ |
2612 | 0 | if (flow_mark_has_no_ref(mark)) { |
2613 | 0 | struct netdev *port; |
2614 | 0 | odp_port_t in_port = flow->flow.in_port.odp_port; |
2615 | |
|
2616 | 0 | port = netdev_ports_get(in_port, dpif_type_str); |
2617 | 0 | if (port) { |
2618 | | /* Taking a global 'port_rwlock' to fulfill thread safety |
2619 | | * restrictions regarding netdev port mapping. */ |
2620 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
2621 | 0 | ret = netdev_flow_del(port, &flow->mega_ufid, NULL); |
2622 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2623 | 0 | netdev_close(port); |
2624 | 0 | } |
2625 | |
|
2626 | 0 | flow_mark_free(mark); |
2627 | 0 | VLOG_DBG("Freed flow mark %u mega_ufid "UUID_FMT, mark, |
2628 | 0 | UUID_ARGS((struct uuid *) &flow->mega_ufid)); |
2629 | |
|
2630 | 0 | megaflow_to_mark_disassociate(&flow->mega_ufid); |
2631 | 0 | } |
2632 | 0 | dp_netdev_flow_unref(flow); |
2633 | |
|
2634 | 0 | return ret; |
2635 | 0 | } |
2636 | | |
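/*
 * Illustrative sketch (not part of the OVS sources): the mark lifetime rule
 * implemented by mark_to_flow_associate(), flow_mark_has_no_ref() and
 * mark_to_flow_disassociate() above, reduced to a tiny standalone model.
 * Several flows may share one mark (1:N); the mark, and the hardware rule it
 * stands for, is released only when the last flow referencing it goes away.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_FLOWS 8

struct toy_flow {
    unsigned int mark;   /* 0 plays the role of INVALID_FLOW_MARK. */
    bool in_use;
};

static struct toy_flow flows[MAX_FLOWS];

static bool
mark_still_referenced(unsigned int mark)
{
    for (int i = 0; i < MAX_FLOWS; i++) {
        if (flows[i].in_use && flows[i].mark == mark) {
            return true;
        }
    }
    return false;
}

static void
toy_disassociate(struct toy_flow *flow)
{
    unsigned int mark = flow->mark;

    flow->mark = 0;
    flow->in_use = false;
    if (!mark_still_referenced(mark)) {
        /* Last reference gone: this is where the hardware rule and the
         * megaflow-to-mark entry would be removed. */
        printf("mark %u freed\n", mark);
    }
}

int
main(void)
{
    flows[0] = (struct toy_flow) { .mark = 42, .in_use = true };
    flows[1] = (struct toy_flow) { .mark = 42, .in_use = true };

    toy_disassociate(&flows[0]);   /* Mark 42 still referenced by flows[1]. */
    toy_disassociate(&flows[1]);   /* Prints "mark 42 freed". */
    return 0;
}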
2637 | | static struct dp_netdev_flow * |
2638 | | mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd, |
2639 | | const uint32_t mark) |
2640 | 0 | { |
2641 | 0 | struct dp_netdev_flow *flow; |
2642 | 0 | unsigned int tid; |
2643 | 0 | size_t hash; |
2644 | |
|
2645 | 0 | if (dp_offload_threads == NULL) { |
2646 | 0 | return NULL; |
2647 | 0 | } |
2648 | | |
2649 | 0 | hash = hash_int(mark, 0); |
2650 | 0 | for (tid = 0; tid < netdev_offload_thread_nb(); tid++) { |
2651 | 0 | CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash, |
2652 | 0 | &dp_offload_threads[tid].mark_to_flow) { |
2653 | 0 | if (flow->mark == mark && flow->pmd_id == pmd->core_id && |
2654 | 0 | flow->dead == false) { |
2655 | 0 | return flow; |
2656 | 0 | } |
2657 | 0 | } |
2658 | 0 | } |
2659 | | |
2660 | 0 | return NULL; |
2661 | 0 | } |
2662 | | |
2663 | | static struct dp_offload_thread_item * |
2664 | | dp_netdev_alloc_flow_offload(struct dp_netdev *dp, |
2665 | | struct dp_netdev_flow *flow, |
2666 | | int op) |
2667 | 0 | { |
2668 | 0 | struct dp_offload_thread_item *item; |
2669 | 0 | struct dp_offload_flow_item *flow_offload; |
2670 | |
|
2671 | 0 | item = xzalloc(sizeof *item + sizeof *flow_offload); |
2672 | 0 | flow_offload = &item->data->flow; |
2673 | |
|
2674 | 0 | item->type = DP_OFFLOAD_FLOW; |
2675 | 0 | item->dp = dp; |
2676 | |
|
2677 | 0 | flow_offload->flow = flow; |
2678 | 0 | flow_offload->op = op; |
2679 | |
|
2680 | 0 | dp_netdev_flow_ref(flow); |
2681 | |
|
2682 | 0 | return item; |
2683 | 0 | } |
2684 | | |
2685 | | static void |
2686 | | dp_netdev_free_flow_offload__(struct dp_offload_thread_item *offload) |
2687 | 0 | { |
2688 | 0 | struct dp_offload_flow_item *flow_offload = &offload->data->flow; |
2689 | |
|
2690 | 0 | free(flow_offload->actions); |
2691 | 0 | free(offload); |
2692 | 0 | } |
2693 | | |
2694 | | static void |
2695 | | dp_netdev_free_flow_offload(struct dp_offload_thread_item *offload) |
2696 | 0 | { |
2697 | 0 | struct dp_offload_flow_item *flow_offload = &offload->data->flow; |
2698 | |
|
2699 | 0 | dp_netdev_flow_unref(flow_offload->flow); |
2700 | 0 | ovsrcu_postpone(dp_netdev_free_flow_offload__, offload); |
2701 | 0 | } |
2702 | | |
2703 | | static void |
2704 | | dp_netdev_free_offload(struct dp_offload_thread_item *offload) |
2705 | 0 | { |
2706 | 0 | switch (offload->type) { |
2707 | 0 | case DP_OFFLOAD_FLOW: |
2708 | 0 | dp_netdev_free_flow_offload(offload); |
2709 | 0 | break; |
2710 | 0 | case DP_OFFLOAD_FLUSH: |
2711 | 0 | free(offload); |
2712 | 0 | break; |
2713 | 0 | default: |
2714 | 0 | OVS_NOT_REACHED(); |
2715 | 0 | }; |
2716 | 0 | } |
2717 | | |
2718 | | static void |
2719 | | dp_netdev_append_offload(struct dp_offload_thread_item *offload, |
2720 | | unsigned int tid) |
2721 | 0 | { |
2722 | 0 | dp_netdev_offload_init(); |
2723 | |
|
2724 | 0 | mpsc_queue_insert(&dp_offload_threads[tid].queue, &offload->node); |
2725 | 0 | atomic_count_inc64(&dp_offload_threads[tid].enqueued_item); |
2726 | 0 | } |
2727 | | |
2728 | | static void |
2729 | | dp_netdev_offload_flow_enqueue(struct dp_offload_thread_item *item) |
2730 | 0 | { |
2731 | 0 | struct dp_offload_flow_item *flow_offload = &item->data->flow; |
2732 | 0 | unsigned int tid; |
2733 | |
|
2734 | 0 | ovs_assert(item->type == DP_OFFLOAD_FLOW); |
2735 | |
|
2736 | 0 | tid = netdev_offload_ufid_to_thread_id(flow_offload->flow->mega_ufid); |
2737 | 0 | dp_netdev_append_offload(item, tid); |
2738 | 0 | } |
2739 | | |
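/*
 * Illustrative sketch (not part of the OVS sources): offload requests are
 * steered to a worker based on the flow's mega ufid, so every operation on a
 * given megaflow lands on the same offload thread and that thread's
 * megaflow_to_mark / mark_to_flow maps need no extra locking.  The real
 * mapping is done by netdev_offload_ufid_to_thread_id(); the hash below is
 * only a stand-in for the general sharding idea.
 */
#include <stdint.h>
#include <stdio.h>

#define N_OFFLOAD_THREADS 4

/* Pick a worker for a 128-bit id; the same id always maps to the same one. */
static unsigned int
ufid_to_thread(uint64_t ufid_lo, uint64_t ufid_hi)
{
    return (unsigned int) ((ufid_lo ^ ufid_hi) % N_OFFLOAD_THREADS);
}

int
main(void)
{
    printf("ufid A -> thread %u\n", ufid_to_thread(0x1234, 0xabcd));
    printf("ufid A -> thread %u\n", ufid_to_thread(0x1234, 0xabcd)); /* Same. */
    printf("ufid B -> thread %u\n", ufid_to_thread(0x5678, 0xef01));
    return 0;
}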
2740 | | static int |
2741 | | dp_netdev_flow_offload_del(struct dp_offload_thread_item *item) |
2742 | 0 | { |
2743 | 0 | return mark_to_flow_disassociate(item->dp, item->data->flow.flow); |
2744 | 0 | } |
2745 | | |
2746 | | /* |
2747 | | * There are two flow offload operations here: addition and modification. |
2748 | | * |
2749 | | * For flow addition, this function does: |
2750 | | * - allocate a new flow mark id |
2751 | | * - perform hardware flow offload |
2752 | | * - associate the flow mark with the flow and the megaflow
2753 | | *
2754 | | * For flow modification, both the flow mark and the associations are
2755 | | * still valid, thus only item 2 (the hardware offload) is needed.
2756 | | */ |
2757 | | static int |
2758 | | dp_netdev_flow_offload_put(struct dp_offload_thread_item *item) |
2759 | 0 | { |
2760 | 0 | struct dp_offload_flow_item *offload = &item->data->flow; |
2761 | 0 | struct dp_netdev *dp = item->dp; |
2762 | 0 | struct dp_netdev_flow *flow = offload->flow; |
2763 | 0 | odp_port_t in_port = flow->flow.in_port.odp_port; |
2764 | 0 | const char *dpif_type_str = dpif_normalize_type(dp->class->type); |
2765 | 0 | bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD |
2766 | 0 | && flow->mark != INVALID_FLOW_MARK; |
2767 | 0 | struct offload_info info; |
2768 | 0 | struct netdev *port; |
2769 | 0 | uint32_t mark; |
2770 | 0 | int ret; |
2771 | |
|
2772 | 0 | if (flow->dead) { |
2773 | 0 | return -1; |
2774 | 0 | } |
2775 | | |
2776 | 0 | if (modification) { |
2777 | 0 | mark = flow->mark; |
2778 | 0 | } else { |
2779 | | /* |
2780 | | * If a mega flow has already been offloaded (from other PMD |
2781 | | * instances), do not offload it again. |
2782 | | */ |
2783 | 0 | mark = megaflow_to_mark_find(&flow->mega_ufid); |
2784 | 0 | if (mark != INVALID_FLOW_MARK) { |
2785 | 0 | VLOG_DBG("Flow has already been offloaded with mark %u\n", mark); |
2786 | 0 | if (flow->mark != INVALID_FLOW_MARK) { |
2787 | 0 | ovs_assert(flow->mark == mark); |
2788 | 0 | } else { |
2789 | 0 | mark_to_flow_associate(mark, flow); |
2790 | 0 | } |
2791 | 0 | return 0; |
2792 | 0 | } |
2793 | | |
2794 | 0 | mark = flow_mark_alloc(); |
2795 | 0 | if (mark == INVALID_FLOW_MARK) { |
2796 | 0 | VLOG_ERR("Failed to allocate flow mark!\n"); |
2797 | 0 | return -1; |
2798 | 0 | } |
2799 | 0 | } |
2800 | 0 | info.flow_mark = mark; |
2801 | 0 | info.orig_in_port = offload->orig_in_port; |
2802 | |
|
2803 | 0 | port = netdev_ports_get(in_port, dpif_type_str); |
2804 | 0 | if (!port) { |
2805 | 0 | goto err_free; |
2806 | 0 | } |
2807 | | |
2808 | | /* Taking a global 'port_rwlock' to fulfill thread safety |
2809 | | * restrictions regarding the netdev port mapping. */ |
2810 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
2811 | 0 | ret = netdev_flow_put(port, &offload->match, |
2812 | 0 | CONST_CAST(struct nlattr *, offload->actions), |
2813 | 0 | offload->actions_len, &flow->mega_ufid, &info, |
2814 | 0 | NULL); |
2815 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2816 | 0 | netdev_close(port); |
2817 | |
|
2818 | 0 | if (ret) { |
2819 | 0 | goto err_free; |
2820 | 0 | } |
2821 | | |
2822 | 0 | if (!modification) { |
2823 | 0 | megaflow_to_mark_associate(&flow->mega_ufid, mark); |
2824 | 0 | mark_to_flow_associate(mark, flow); |
2825 | 0 | } |
2826 | 0 | return 0; |
2827 | | |
2828 | 0 | err_free: |
2829 | 0 | if (!modification) { |
2830 | 0 | flow_mark_free(mark); |
2831 | 0 | } else { |
2832 | 0 | mark_to_flow_disassociate(item->dp, flow); |
2833 | 0 | } |
2834 | 0 | return -1; |
2835 | 0 | } |
2836 | | |
2837 | | static void |
2838 | | dp_offload_flow(struct dp_offload_thread_item *item) |
2839 | 0 | { |
2840 | 0 | struct dp_offload_flow_item *flow_offload = &item->data->flow; |
2841 | 0 | const char *op; |
2842 | 0 | int ret; |
2843 | |
|
2844 | 0 | switch (flow_offload->op) { |
2845 | 0 | case DP_NETDEV_FLOW_OFFLOAD_OP_ADD: |
2846 | 0 | op = "add"; |
2847 | 0 | ret = dp_netdev_flow_offload_put(item); |
2848 | 0 | break; |
2849 | 0 | case DP_NETDEV_FLOW_OFFLOAD_OP_MOD: |
2850 | 0 | op = "modify"; |
2851 | 0 | ret = dp_netdev_flow_offload_put(item); |
2852 | 0 | break; |
2853 | 0 | case DP_NETDEV_FLOW_OFFLOAD_OP_DEL: |
2854 | 0 | op = "delete"; |
2855 | 0 | ret = dp_netdev_flow_offload_del(item); |
2856 | 0 | break; |
2857 | 0 | default: |
2858 | 0 | OVS_NOT_REACHED(); |
2859 | 0 | } |
2860 | | |
2861 | 0 | VLOG_DBG("%s to %s netdev flow "UUID_FMT, |
2862 | 0 | ret == 0 ? "succeed" : "failed", op, |
2863 | 0 | UUID_ARGS((struct uuid *) &flow_offload->flow->mega_ufid)); |
2864 | 0 | } |
2865 | | |
2866 | | static void |
2867 | | dp_offload_flush(struct dp_offload_thread_item *item) |
2868 | 0 | { |
2869 | 0 | struct dp_offload_flush_item *flush = &item->data->flush; |
2870 | |
|
2871 | 0 | ovs_rwlock_rdlock(&item->dp->port_rwlock); |
2872 | 0 | netdev_flow_flush(flush->netdev); |
2873 | 0 | ovs_rwlock_unlock(&item->dp->port_rwlock); |
2874 | |
|
2875 | 0 | ovs_barrier_block(flush->barrier); |
2876 | | |
2877 | | /* Allow the initiator thread to take the port lock again
2878 | | * before continuing offload operations in this thread.
2879 | | */ |
2880 | 0 | ovs_barrier_block(flush->barrier); |
2881 | 0 | } |
2882 | | |
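/*
 * Illustrative sketch (not part of the OVS sources): the double barrier used
 * by dp_offload_flush() above, shown standalone with POSIX threads and one
 * worker.  The first wait tells the initiator that the worker has finished
 * flushing; the second wait keeps the worker parked until the initiator has
 * re-taken its lock, so the worker cannot race ahead with new offload work
 * in between.
 */
#define _POSIX_C_SOURCE 200809L
#include <pthread.h>
#include <stdio.h>

static pthread_barrier_t barrier;    /* Count 2: initiator + one worker. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void *
worker(void *arg)
{
    (void) arg;

    pthread_mutex_lock(&lock);        /* The flush needs the released lock. */
    printf("worker: flushing\n");
    pthread_mutex_unlock(&lock);

    pthread_barrier_wait(&barrier);   /* 1: flush done. */
    pthread_barrier_wait(&barrier);   /* 2: initiator holds the lock again. */
    printf("worker: resuming normal offload work\n");
    return NULL;
}

int
main(void)
{
    pthread_t tid;

    pthread_barrier_init(&barrier, NULL, 2);
    pthread_mutex_lock(&lock);        /* Initiator starts with the lock. */
    pthread_create(&tid, NULL, worker, NULL);

    pthread_mutex_unlock(&lock);      /* Let the worker flush... */
    pthread_barrier_wait(&barrier);   /* 1: wait for the flush to finish. */
    pthread_mutex_lock(&lock);        /* ...re-take the lock... */
    pthread_barrier_wait(&barrier);   /* 2: ...then release the worker. */

    pthread_join(tid, NULL);
    pthread_mutex_unlock(&lock);
    pthread_barrier_destroy(&barrier);
    return 0;
}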
2883 | 0 | #define DP_NETDEV_OFFLOAD_BACKOFF_MIN 1 |
2884 | 0 | #define DP_NETDEV_OFFLOAD_BACKOFF_MAX 64 |
2885 | 0 | #define DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US (10 * 1000) /* 10 ms */ |
2886 | | |
2887 | | static void * |
2888 | | dp_netdev_flow_offload_main(void *arg) |
2889 | 0 | { |
2890 | 0 | struct dp_offload_thread *ofl_thread = arg; |
2891 | 0 | struct dp_offload_thread_item *offload; |
2892 | 0 | struct mpsc_queue_node *node; |
2893 | 0 | struct mpsc_queue *queue; |
2894 | 0 | long long int latency_us; |
2895 | 0 | long long int next_rcu; |
2896 | 0 | long long int now; |
2897 | 0 | uint64_t backoff; |
2898 | |
|
2899 | 0 | queue = &ofl_thread->queue; |
2900 | 0 | mpsc_queue_acquire(queue); |
2901 | |
|
2902 | 0 | while (true) { |
2903 | 0 | backoff = DP_NETDEV_OFFLOAD_BACKOFF_MIN; |
2904 | 0 | while (mpsc_queue_tail(queue) == NULL) { |
2905 | 0 | xnanosleep(backoff * 1E6); |
2906 | 0 | if (backoff < DP_NETDEV_OFFLOAD_BACKOFF_MAX) { |
2907 | 0 | backoff <<= 1; |
2908 | 0 | } |
2909 | 0 | } |
2910 | |
|
2911 | 0 | next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; |
2912 | 0 | MPSC_QUEUE_FOR_EACH_POP (node, queue) { |
2913 | 0 | offload = CONTAINER_OF(node, struct dp_offload_thread_item, node); |
2914 | 0 | atomic_count_dec64(&ofl_thread->enqueued_item); |
2915 | |
|
2916 | 0 | switch (offload->type) { |
2917 | 0 | case DP_OFFLOAD_FLOW: |
2918 | 0 | dp_offload_flow(offload); |
2919 | 0 | break; |
2920 | 0 | case DP_OFFLOAD_FLUSH: |
2921 | 0 | dp_offload_flush(offload); |
2922 | 0 | break; |
2923 | 0 | default: |
2924 | 0 | OVS_NOT_REACHED(); |
2925 | 0 | } |
2926 | | |
2927 | 0 | now = time_usec(); |
2928 | |
|
2929 | 0 | latency_us = now - offload->timestamp; |
2930 | 0 | mov_avg_cma_update(&ofl_thread->cma, latency_us); |
2931 | 0 | mov_avg_ema_update(&ofl_thread->ema, latency_us); |
2932 | |
|
2933 | 0 | dp_netdev_free_offload(offload); |
2934 | | |
2935 | | /* Do RCU synchronization at fixed interval. */ |
2936 | 0 | if (now > next_rcu) { |
2937 | 0 | ovsrcu_quiesce(); |
2938 | 0 | next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US; |
2939 | 0 | } |
2940 | 0 | } |
2941 | 0 | } |
2942 | | |
2943 | 0 | OVS_NOT_REACHED(); |
2944 | 0 | mpsc_queue_release(queue); |
2945 | |
|
2946 | 0 | return NULL; |
2947 | 0 | } |
2948 | | |
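/*
 * Illustrative sketch (not part of the OVS sources): the idle backoff used
 * by dp_netdev_flow_offload_main() above, shown standalone.  While the queue
 * is empty the sleep time doubles from 1 ms up to a 64 ms cap, which keeps
 * an idle offload thread cheap while staying reasonably responsive once work
 * arrives.  work_available() is a stand-in for checking the mpsc queue.
 */
#define _POSIX_C_SOURCE 200809L
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define BACKOFF_MIN_MS 1
#define BACKOFF_MAX_MS 64

static void
sleep_ms(long ms)
{
    struct timespec ts = { .tv_sec = ms / 1000,
                           .tv_nsec = (ms % 1000) * 1000000L };
    nanosleep(&ts, NULL);
}

/* Stand-in for "is there anything in the queue yet?". */
static bool
work_available(int poll_count)
{
    return poll_count >= 5;
}

int
main(void)
{
    long backoff = BACKOFF_MIN_MS;
    int polls = 0;

    while (!work_available(polls++)) {
        printf("idle, sleeping %ld ms\n", backoff);   /* 1, 2, 4, 8, 16 */
        sleep_ms(backoff);
        if (backoff < BACKOFF_MAX_MS) {
            backoff <<= 1;
        }
    }
    printf("work arrived after %d polls\n", polls);
    return 0;
}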
2949 | | static void |
2950 | | queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd, |
2951 | | struct dp_netdev_flow *flow) |
2952 | 0 | { |
2953 | 0 | struct dp_offload_thread_item *offload; |
2954 | |
|
2955 | 0 | if (!netdev_is_flow_api_enabled()) { |
2956 | 0 | return; |
2957 | 0 | } |
2958 | | |
2959 | 0 | offload = dp_netdev_alloc_flow_offload(pmd->dp, flow, |
2960 | 0 | DP_NETDEV_FLOW_OFFLOAD_OP_DEL); |
2961 | 0 | offload->timestamp = pmd->ctx.now; |
2962 | 0 | dp_netdev_offload_flow_enqueue(offload); |
2963 | 0 | } |
2964 | | |
2965 | | static void |
2966 | | log_netdev_flow_change(const struct dp_netdev_flow *flow, |
2967 | | const struct match *match, |
2968 | | const struct dp_netdev_actions *old_actions, |
2969 | | const struct nlattr *actions, |
2970 | | size_t actions_len) |
2971 | 0 | { |
2972 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
2973 | 0 | struct ofpbuf key_buf, mask_buf; |
2974 | 0 | struct odp_flow_key_parms odp_parms = { |
2975 | 0 | .flow = &match->flow, |
2976 | 0 | .mask = &match->wc.masks, |
2977 | 0 | .support = dp_netdev_support, |
2978 | 0 | }; |
2979 | |
|
2980 | 0 | if (OVS_LIKELY(VLOG_DROP_DBG((&upcall_rl)))) { |
2981 | 0 | return; |
2982 | 0 | } |
2983 | | |
2984 | 0 | ofpbuf_init(&key_buf, 0); |
2985 | 0 | ofpbuf_init(&mask_buf, 0); |
2986 | |
|
2987 | 0 | odp_flow_key_from_flow(&odp_parms, &key_buf); |
2988 | 0 | odp_parms.key_buf = &key_buf; |
2989 | 0 | odp_flow_key_from_mask(&odp_parms, &mask_buf); |
2990 | |
|
2991 | 0 | if (old_actions) { |
2992 | 0 | ds_put_cstr(&ds, "flow_mod: "); |
2993 | 0 | } else { |
2994 | 0 | ds_put_cstr(&ds, "flow_add: "); |
2995 | 0 | } |
2996 | 0 | odp_format_ufid(&flow->ufid, &ds); |
2997 | 0 | ds_put_cstr(&ds, " mega_"); |
2998 | 0 | odp_format_ufid(&flow->mega_ufid, &ds); |
2999 | 0 | ds_put_cstr(&ds, " "); |
3000 | 0 | odp_flow_format(key_buf.data, key_buf.size, |
3001 | 0 | mask_buf.data, mask_buf.size, |
3002 | 0 | NULL, &ds, false); |
3003 | 0 | if (old_actions) { |
3004 | 0 | ds_put_cstr(&ds, ", old_actions:"); |
3005 | 0 | format_odp_actions(&ds, old_actions->actions, old_actions->size, |
3006 | 0 | NULL); |
3007 | 0 | } |
3008 | 0 | ds_put_cstr(&ds, ", actions:"); |
3009 | 0 | format_odp_actions(&ds, actions, actions_len, NULL); |
3010 | |
|
3011 | 0 | VLOG_DBG("%s", ds_cstr(&ds)); |
3012 | |
|
3013 | 0 | ofpbuf_uninit(&key_buf); |
3014 | 0 | ofpbuf_uninit(&mask_buf); |
3015 | | |
3016 | | /* Add a printout of the actual match installed. */ |
3017 | 0 | struct match m; |
3018 | 0 | ds_clear(&ds); |
3019 | 0 | ds_put_cstr(&ds, "flow match: "); |
3020 | 0 | miniflow_expand(&flow->cr.flow.mf, &m.flow); |
3021 | 0 | miniflow_expand(&flow->cr.mask->mf, &m.wc.masks); |
3022 | 0 | memset(&m.tun_md, 0, sizeof m.tun_md); |
3023 | 0 | match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY); |
3024 | |
|
3025 | 0 | VLOG_DBG("%s", ds_cstr(&ds)); |
3026 | |
|
3027 | 0 | ds_destroy(&ds); |
3028 | 0 | } |
3029 | | |
3030 | | static void |
3031 | | queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd, |
3032 | | struct dp_netdev_flow *flow, struct match *match, |
3033 | | const struct nlattr *actions, size_t actions_len, |
3034 | | int op) |
3035 | 0 | { |
3036 | 0 | struct dp_offload_thread_item *item; |
3037 | 0 | struct dp_offload_flow_item *flow_offload; |
3038 | |
|
3039 | 0 | if (!netdev_is_flow_api_enabled()) { |
3040 | 0 | return; |
3041 | 0 | } |
3042 | | |
3043 | 0 | item = dp_netdev_alloc_flow_offload(pmd->dp, flow, op); |
3044 | 0 | flow_offload = &item->data->flow; |
3045 | 0 | flow_offload->match = *match; |
3046 | 0 | flow_offload->actions = xmalloc(actions_len); |
3047 | 0 | memcpy(flow_offload->actions, actions, actions_len); |
3048 | 0 | flow_offload->actions_len = actions_len; |
3049 | 0 | flow_offload->orig_in_port = flow->orig_in_port; |
3050 | |
|
3051 | 0 | item->timestamp = pmd->ctx.now; |
3052 | 0 | dp_netdev_offload_flow_enqueue(item); |
3053 | 0 | } |
3054 | | |
3055 | | static void |
3056 | | dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd, |
3057 | | struct dp_netdev_flow *flow) |
3058 | | OVS_REQUIRES(pmd->flow_mutex) |
3059 | 0 | { |
3060 | 0 | struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node); |
3061 | 0 | struct dpcls *cls; |
3062 | 0 | odp_port_t in_port = flow->flow.in_port.odp_port; |
3063 | |
|
3064 | 0 | cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
3065 | 0 | ovs_assert(cls != NULL); |
3066 | 0 | dpcls_remove(cls, &flow->cr); |
3067 | 0 | dp_netdev_simple_match_remove(pmd, flow); |
3068 | 0 | cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid)); |
3069 | 0 | ccmap_dec(&pmd->n_flows, odp_to_u32(in_port)); |
3070 | 0 | queue_netdev_flow_del(pmd, flow); |
3071 | 0 | flow->dead = true; |
3072 | |
|
3073 | 0 | dp_netdev_flow_unref(flow); |
3074 | 0 | } |
3075 | | |
3076 | | static void |
3077 | | dp_netdev_offload_flush_enqueue(struct dp_netdev *dp, |
3078 | | struct netdev *netdev, |
3079 | | struct ovs_barrier *barrier) |
3080 | 0 | { |
3081 | 0 | unsigned int tid; |
3082 | 0 | long long int now_us = time_usec(); |
3083 | |
|
3084 | 0 | for (tid = 0; tid < netdev_offload_thread_nb(); tid++) { |
3085 | 0 | struct dp_offload_thread_item *item; |
3086 | 0 | struct dp_offload_flush_item *flush; |
3087 | |
|
3088 | 0 | item = xmalloc(sizeof *item + sizeof *flush); |
3089 | 0 | item->type = DP_OFFLOAD_FLUSH; |
3090 | 0 | item->dp = dp; |
3091 | 0 | item->timestamp = now_us; |
3092 | |
|
3093 | 0 | flush = &item->data->flush; |
3094 | 0 | flush->netdev = netdev; |
3095 | 0 | flush->barrier = barrier; |
3096 | |
|
3097 | 0 | dp_netdev_append_offload(item, tid); |
3098 | 0 | } |
3099 | 0 | } |
3100 | | |
3101 | | /* Blocking call that will wait on the offload threads to
3102 | | * complete their work. As the flush order will only be
3103 | | * enqueued after existing offload requests, those previous |
3104 | | * offload requests must be processed, which requires being |
3105 | | * able to lock the 'port_rwlock' from the offload thread. |
3106 | | * |
3107 | | * Flow offload flush is done when a port is being deleted. |
3108 | | * Right after this call executes, the offload API is disabled |
3109 | | * for the port. This call must therefore block until the
3110 | | * offload provider has completed its job.
3111 | | */ |
3112 | | static void |
3113 | | dp_netdev_offload_flush(struct dp_netdev *dp, |
3114 | | struct dp_netdev_port *port) |
3115 | | OVS_REQ_WRLOCK(dp->port_rwlock) |
3116 | 0 | { |
3117 | | /* The flush mutex serializes access to the static barrier, and
3118 | | * prevents more than one flush order from being issued at a time.
3119 | | *
3120 | | * The barrier is static so that it outlives the function scope, as
3121 | | * the other threads can resume from blocking after this function
3122 | | * has already finished.
3123 | | * |
3124 | | * Additionally, because the flush operation is blocking, it would |
3125 | | * deadlock if multiple offload threads were blocking on several |
3126 | | * different barriers. Only allow a single flush order in the offload |
3127 | | * queue at a time. |
3128 | | */ |
3129 | 0 | static struct ovs_mutex flush_mutex = OVS_MUTEX_INITIALIZER; |
3130 | 0 | static struct ovs_barrier barrier OVS_GUARDED_BY(flush_mutex); |
3131 | 0 | struct netdev *netdev; |
3132 | |
|
3133 | 0 | if (!netdev_is_flow_api_enabled()) { |
3134 | 0 | return; |
3135 | 0 | } |
3136 | | |
3137 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
3138 | 0 | ovs_mutex_lock(&flush_mutex); |
3139 | | |
3140 | | /* This thread and the offload threads. */ |
3141 | 0 | ovs_barrier_init(&barrier, 1 + netdev_offload_thread_nb()); |
3142 | |
|
3143 | 0 | netdev = netdev_ref(port->netdev); |
3144 | 0 | dp_netdev_offload_flush_enqueue(dp, netdev, &barrier); |
3145 | 0 | ovs_barrier_block(&barrier); |
3146 | 0 | netdev_close(netdev); |
3147 | | |
3148 | | /* Take back the datapath port lock before allowing the offload |
3149 | | * threads to proceed further. The port deletion must complete first, |
3150 | | * to ensure no further offloads are inserted after the flush. |
3151 | | * |
3152 | | * Some offload providers (e.g. DPDK) keep a netdev reference with
3153 | | * the offload data. If this reference is not closed, the netdev is |
3154 | | * kept indefinitely. */ |
3155 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
3156 | |
|
3157 | 0 | ovs_barrier_block(&barrier); |
3158 | 0 | ovs_barrier_destroy(&barrier); |
3159 | |
|
3160 | 0 | ovs_mutex_unlock(&flush_mutex); |
3161 | 0 | } |
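/* Editorial sketch (not part of the upstream sources) of the rendezvous
 * implemented above, assuming each offload thread blocks on the same
 * barrier once when it handles its DP_OFFLOAD_FLUSH item and once more
 * before resuming normal processing:
 *
 *     main thread                          offload threads
 *     -----------                          ---------------
 *     ovs_barrier_init(1 + N)
 *     enqueue one flush item per thread    ...drain pending offloads...
 *     ovs_barrier_block()  <--- sync --->  ovs_barrier_block()  (flushed)
 *     re-take dp->port_rwlock
 *     ovs_barrier_block()  <--- sync --->  ovs_barrier_block()  (resume)
 *     ovs_barrier_destroy()
 *
 * The second rendezvous keeps the offload threads parked until the port
 * deletion path holds 'dp->port_rwlock' again, so no further offload work
 * can slip in between the flush and the port removal. */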
3162 | | |
3163 | | static void |
3164 | | dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd) |
3165 | 0 | { |
3166 | 0 | struct dp_netdev_flow *netdev_flow; |
3167 | |
|
3168 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
3169 | 0 | CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) { |
3170 | 0 | dp_netdev_pmd_remove_flow(pmd, netdev_flow); |
3171 | 0 | } |
3172 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
3173 | 0 | } |
3174 | | |
3175 | | static int |
3176 | | dpif_netdev_flow_flush(struct dpif *dpif) |
3177 | 0 | { |
3178 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
3179 | 0 | struct dp_netdev_pmd_thread *pmd; |
3180 | |
|
3181 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
3182 | 0 | dp_netdev_pmd_flow_flush(pmd); |
3183 | 0 | } |
3184 | |
|
3185 | 0 | return 0; |
3186 | 0 | } |
3187 | | |
3188 | | struct dp_netdev_port_state { |
3189 | | struct hmap_position position; |
3190 | | char *name; |
3191 | | }; |
3192 | | |
3193 | | static int |
3194 | | dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep) |
3195 | 0 | { |
3196 | 0 | *statep = xzalloc(sizeof(struct dp_netdev_port_state)); |
3197 | 0 | return 0; |
3198 | 0 | } |
3199 | | |
3200 | | static int |
3201 | | dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_, |
3202 | | struct dpif_port *dpif_port) |
3203 | 0 | { |
3204 | 0 | struct dp_netdev_port_state *state = state_; |
3205 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
3206 | 0 | struct hmap_node *node; |
3207 | 0 | int retval; |
3208 | |
|
3209 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
3210 | 0 | node = hmap_at_position(&dp->ports, &state->position); |
3211 | 0 | if (node) { |
3212 | 0 | struct dp_netdev_port *port; |
3213 | |
|
3214 | 0 | port = CONTAINER_OF(node, struct dp_netdev_port, node); |
3215 | |
|
3216 | 0 | free(state->name); |
3217 | 0 | state->name = xstrdup(netdev_get_name(port->netdev)); |
3218 | 0 | dpif_port->name = state->name; |
3219 | 0 | dpif_port->type = port->type; |
3220 | 0 | dpif_port->port_no = port->port_no; |
3221 | |
|
3222 | 0 | retval = 0; |
3223 | 0 | } else { |
3224 | 0 | retval = EOF; |
3225 | 0 | } |
3226 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
3227 | |
|
3228 | 0 | return retval; |
3229 | 0 | } |
3230 | | |
3231 | | static int |
3232 | | dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_) |
3233 | 0 | { |
3234 | 0 | struct dp_netdev_port_state *state = state_; |
3235 | 0 | free(state->name); |
3236 | 0 | free(state); |
3237 | 0 | return 0; |
3238 | 0 | } |
3239 | | |
3240 | | static int |
3241 | | dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED) |
3242 | 0 | { |
3243 | 0 | struct dpif_netdev *dpif = dpif_netdev_cast(dpif_); |
3244 | 0 | uint64_t new_port_seq; |
3245 | 0 | int error; |
3246 | |
|
3247 | 0 | new_port_seq = seq_read(dpif->dp->port_seq); |
3248 | 0 | if (dpif->last_port_seq != new_port_seq) { |
3249 | 0 | dpif->last_port_seq = new_port_seq; |
3250 | 0 | error = ENOBUFS; |
3251 | 0 | } else { |
3252 | 0 | error = EAGAIN; |
3253 | 0 | } |
3254 | |
|
3255 | 0 | return error; |
3256 | 0 | } |
3257 | | |
3258 | | static void |
3259 | | dpif_netdev_port_poll_wait(const struct dpif *dpif_) |
3260 | 0 | { |
3261 | 0 | struct dpif_netdev *dpif = dpif_netdev_cast(dpif_); |
3262 | |
|
3263 | 0 | seq_wait(dpif->dp->port_seq, dpif->last_port_seq); |
3264 | 0 | } |
3265 | | |
3266 | | static struct dp_netdev_flow * |
3267 | | dp_netdev_flow_cast(const struct dpcls_rule *cr) |
3268 | 0 | { |
3269 | 0 | return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL; |
3270 | 0 | } |
3271 | | |
3272 | | static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow) |
3273 | 0 | { |
3274 | 0 | return ovs_refcount_try_ref_rcu(&flow->ref_cnt); |
3275 | 0 | } |
3276 | | |
3277 | | /* netdev_flow_key utilities. |
3278 | | * |
3279 | | * netdev_flow_key is basically a miniflow. We use these functions |
3280 | | * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow |
3281 | | * functions (miniflow_clone_inline, miniflow_equal, ...), because: |
3282 | | * |
3283 | | * - Since we are dealing exclusively with miniflows created by |
3284 | | * miniflow_extract(), if the map is different the miniflow is different. |
3285 | | * Therefore we can be faster by comparing the map and the miniflow in a |
3286 | | * single memcmp(). |
3287 | | * - These functions can be inlined by the compiler. */ |
3288 | | |
3289 | | static inline bool |
3290 | | netdev_flow_key_equal(const struct netdev_flow_key *a, |
3291 | | const struct netdev_flow_key *b) |
3292 | 0 | { |
3293 | | /* 'b->len' may not be set yet. */
3294 | 0 | return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len); |
3295 | 0 | } |
3296 | | |
3297 | | static inline void |
3298 | | netdev_flow_key_clone(struct netdev_flow_key *dst, |
3299 | | const struct netdev_flow_key *src) |
3300 | 0 | { |
3301 | 0 | memcpy(dst, src, |
3302 | 0 | offsetof(struct netdev_flow_key, mf) + src->len); |
3303 | 0 | } |
3304 | | |
3305 | | /* Initialize a netdev_flow_key 'mask' from 'match'. */ |
3306 | | static inline void |
3307 | | netdev_flow_mask_init(struct netdev_flow_key *mask, |
3308 | | const struct match *match) |
3309 | 0 | { |
3310 | 0 | uint64_t *dst = miniflow_values(&mask->mf); |
3311 | 0 | struct flowmap fmap; |
3312 | 0 | uint32_t hash = 0; |
3313 | 0 | size_t idx; |
3314 | | |
3315 | | /* Only check masks that make sense for the flow. */ |
3316 | 0 | flow_wc_map(&match->flow, &fmap); |
3317 | 0 | flowmap_init(&mask->mf.map); |
3318 | |
|
3319 | 0 | FLOWMAP_FOR_EACH_INDEX(idx, fmap) { |
3320 | 0 | uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx); |
3321 | |
|
3322 | 0 | if (mask_u64) { |
3323 | 0 | flowmap_set(&mask->mf.map, idx, 1); |
3324 | 0 | *dst++ = mask_u64; |
3325 | 0 | hash = hash_add64(hash, mask_u64); |
3326 | 0 | } |
3327 | 0 | } |
3328 | |
|
3329 | 0 | map_t map; |
3330 | |
|
3331 | 0 | FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) { |
3332 | 0 | hash = hash_add64(hash, map); |
3333 | 0 | } |
3334 | |
|
3335 | 0 | size_t n = dst - miniflow_get_values(&mask->mf); |
3336 | |
|
3337 | 0 | mask->hash = hash_finish(hash, n * 8); |
3338 | 0 | mask->len = netdev_flow_key_size(n); |
3339 | 0 | } |
3340 | | |
3341 | | /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */ |
3342 | | static inline void |
3343 | | netdev_flow_key_init_masked(struct netdev_flow_key *dst, |
3344 | | const struct flow *flow, |
3345 | | const struct netdev_flow_key *mask) |
3346 | 0 | { |
3347 | 0 | uint64_t *dst_u64 = miniflow_values(&dst->mf); |
3348 | 0 | const uint64_t *mask_u64 = miniflow_get_values(&mask->mf); |
3349 | 0 | uint32_t hash = 0; |
3350 | 0 | uint64_t value; |
3351 | |
|
3352 | 0 | dst->len = mask->len; |
3353 | 0 | dst->mf = mask->mf; /* Copy maps. */ |
3354 | |
|
3355 | 0 | FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) { |
3356 | 0 | *dst_u64 = value & *mask_u64++; |
3357 | 0 | hash = hash_add64(hash, *dst_u64++); |
3358 | 0 | } |
3359 | 0 | dst->hash = hash_finish(hash, |
3360 | 0 | (dst_u64 - miniflow_get_values(&dst->mf)) * 8); |
3361 | 0 | } |
3362 | | |
3363 | | /* Initializes 'key' as a copy of 'flow'. */ |
3364 | | static inline void |
3365 | | netdev_flow_key_init(struct netdev_flow_key *key, |
3366 | | const struct flow *flow) |
3367 | 0 | { |
3368 | 0 | uint64_t *dst = miniflow_values(&key->mf); |
3369 | 0 | uint32_t hash = 0; |
3370 | 0 | uint64_t value; |
3371 | |
|
3372 | 0 | miniflow_map_init(&key->mf, flow); |
3373 | 0 | miniflow_init(&key->mf, flow); |
3374 | |
|
3375 | 0 | size_t n = dst - miniflow_get_values(&key->mf); |
3376 | |
|
3377 | 0 | FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) { |
3378 | 0 | hash = hash_add64(hash, value); |
3379 | 0 | } |
3380 | |
|
3381 | 0 | key->hash = hash_finish(hash, n * 8); |
3382 | 0 | key->len = netdev_flow_key_size(n); |
3383 | 0 | } |
3384 | | |
3385 | | static inline void |
3386 | | emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow, |
3387 | | const struct netdev_flow_key *key) |
3388 | 0 | { |
3389 | 0 | if (ce->flow != flow) { |
3390 | 0 | if (ce->flow) { |
3391 | 0 | dp_netdev_flow_unref(ce->flow); |
3392 | 0 | } |
3393 | |
|
3394 | 0 | if (dp_netdev_flow_ref(flow)) { |
3395 | 0 | ce->flow = flow; |
3396 | 0 | } else { |
3397 | 0 | ce->flow = NULL; |
3398 | 0 | } |
3399 | 0 | } |
3400 | 0 | if (key) { |
3401 | 0 | netdev_flow_key_clone(&ce->key, key); |
3402 | 0 | } |
3403 | 0 | } |
3404 | | |
3405 | | static inline void |
3406 | | emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key, |
3407 | | struct dp_netdev_flow *flow) |
3408 | 0 | { |
3409 | 0 | struct emc_entry *to_be_replaced = NULL; |
3410 | 0 | struct emc_entry *current_entry; |
3411 | |
|
3412 | 0 | EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) { |
3413 | 0 | if (netdev_flow_key_equal(¤t_entry->key, key)) { |
3414 | | /* We found the entry with the 'mf' miniflow */ |
3415 | 0 | emc_change_entry(current_entry, flow, NULL); |
3416 | 0 | return; |
3417 | 0 | } |
3418 | | |
3419 | | /* Replacement policy: put the flow in an empty (not alive) entry, or,
3420 | | * failing that, in the entry with the lowest key hash. */
3421 | 0 | if (!to_be_replaced |
3422 | 0 | || (emc_entry_alive(to_be_replaced) |
3423 | 0 | && !emc_entry_alive(current_entry)) |
3424 | 0 | || current_entry->key.hash < to_be_replaced->key.hash) { |
3425 | 0 | to_be_replaced = current_entry; |
3426 | 0 | } |
3427 | 0 | } |
3428 | | /* We didn't find the miniflow in the cache.
3429 | | * The 'to_be_replaced' entry is where the new flow will be stored. */
3430 | | |
3431 | 0 | emc_change_entry(to_be_replaced, flow, key); |
3432 | 0 | } |
3433 | | |
3434 | | static inline void |
3435 | | emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd, |
3436 | | const struct netdev_flow_key *key, |
3437 | | struct dp_netdev_flow *flow) |
3438 | 0 | { |
3439 | | /* Insert an entry into the EMC based on probability value 'min'. By
3440 | | * default the value is UINT32_MAX / 100 which yields an insertion
3441 | | * probability of 1/100, i.e. 1%. */
3442 | |
|
3443 | 0 | uint32_t min = pmd->ctx.emc_insert_min; |
3444 | |
|
3445 | 0 | if (min && random_uint32() <= min) { |
3446 | 0 | emc_insert(&(pmd->flow_cache).emc_cache, key, flow); |
3447 | 0 | } |
3448 | 0 | } |
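/* Editorial note (not part of the upstream sources): the EMC insertion
 * probability is a straight comparison of a random 32-bit value against
 * 'pmd->ctx.emc_insert_min'.  With the default of UINT32_MAX / 100 the
 * check above behaves roughly like:
 *
 *     uint32_t min = UINT32_MAX / 100;              // ~1% of the u32 range
 *     bool insert = min && random_uint32() <= min;  // true ~1% of the time
 *
 * A value of 0 disables EMC insertion entirely, while UINT32_MAX makes
 * every eligible flow hit insert an entry. */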
3449 | | |
3450 | | static inline const struct cmap_node * |
3451 | | smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash) |
3452 | 0 | { |
3453 | 0 | struct smc_cache *cache = &(pmd->flow_cache).smc_cache; |
3454 | 0 | struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK]; |
3455 | 0 | uint16_t sig = hash >> 16; |
3456 | 0 | uint16_t index = UINT16_MAX; |
3457 | |
|
3458 | 0 | for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { |
3459 | 0 | if (bucket->sig[i] == sig) { |
3460 | 0 | index = bucket->flow_idx[i]; |
3461 | 0 | break; |
3462 | 0 | } |
3463 | 0 | } |
3464 | 0 | if (index != UINT16_MAX) { |
3465 | 0 | return cmap_find_by_index(&pmd->flow_table, index); |
3466 | 0 | } |
3467 | 0 | return NULL; |
3468 | 0 | } |
3469 | | |
3470 | | /* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
3471 | | * turned off, or 2) the flow_table index is larger than a uint16_t can hold.
3472 | | * If there is already an SMC entry with the same signature, its index will be
3473 | | * updated. If there is no existing entry, but an empty entry is available,
3474 | | * the empty entry will be taken. If there is neither an empty entry nor one
3475 | | * with the same signature, a random entry from the hashed bucket is picked. */
3476 | | static inline void |
3477 | | smc_insert(struct dp_netdev_pmd_thread *pmd, |
3478 | | const struct netdev_flow_key *key, |
3479 | | uint32_t hash) |
3480 | 0 | { |
3481 | 0 | struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache; |
3482 | 0 | struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK]; |
3483 | 0 | uint16_t index; |
3484 | 0 | uint32_t cmap_index; |
3485 | 0 | int i; |
3486 | |
|
3487 | 0 | if (!pmd->ctx.smc_enable_db) { |
3488 | 0 | return; |
3489 | 0 | } |
3490 | | |
3491 | 0 | cmap_index = cmap_find_index(&pmd->flow_table, hash); |
3492 | 0 | index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index; |
3493 | | |
3494 | | /* If the index is larger than SMC can handle (uint16_t), we don't |
3495 | | * insert */ |
3496 | 0 | if (index == UINT16_MAX) { |
3497 | 0 | return; |
3498 | 0 | } |
3499 | | |
3500 | | /* If an entry with the same signature already exists, update the index. */
3501 | 0 | uint16_t sig = key->hash >> 16; |
3502 | 0 | for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { |
3503 | 0 | if (bucket->sig[i] == sig) { |
3504 | 0 | bucket->flow_idx[i] = index; |
3505 | 0 | return; |
3506 | 0 | } |
3507 | 0 | } |
3508 | | /* If there is an empty entry, occupy it. */ |
3509 | 0 | for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { |
3510 | 0 | if (bucket->flow_idx[i] == UINT16_MAX) { |
3511 | 0 | bucket->sig[i] = sig; |
3512 | 0 | bucket->flow_idx[i] = index; |
3513 | 0 | return; |
3514 | 0 | } |
3515 | 0 | } |
3516 | | /* Otherwise, pick a random entry. */ |
3517 | 0 | i = random_uint32() % SMC_ENTRY_PER_BUCKET; |
3518 | 0 | bucket->sig[i] = sig; |
3519 | 0 | bucket->flow_idx[i] = index; |
3520 | 0 | } |
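/* Editorial note (not part of the upstream sources): an SMC bucket is
 * addressed with the low bits of the key hash and tagged with its high
 * 16 bits, while the payload is only a 16-bit index into 'flow_table':
 *
 *     struct smc_bucket *b = &cache->buckets[key->hash & SMC_MASK];
 *     uint16_t sig = key->hash >> 16;   // signature compared on lookup
 *
 * This keeps the signature cache compact; flows whose cmap index does not
 * fit in a uint16_t are simply not cached, as handled above. */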
3521 | | |
3522 | | inline void |
3523 | | emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd, |
3524 | | const struct netdev_flow_key *keys, |
3525 | | struct dpcls_rule **rules, |
3526 | | uint32_t emc_insert_mask) |
3527 | 0 | { |
3528 | 0 | while (emc_insert_mask) { |
3529 | 0 | uint32_t i = raw_ctz(emc_insert_mask); |
3530 | 0 | emc_insert_mask &= emc_insert_mask - 1; |
3531 | | /* Get the required parameters for EMC/SMC from the rule. */
3532 | 0 | struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]); |
3533 | | /* Insert the key into EMC/SMC. */ |
3534 | 0 | emc_probabilistic_insert(pmd, &keys[i], flow); |
3535 | 0 | } |
3536 | 0 | } |
3537 | | |
3538 | | inline void |
3539 | | smc_insert_batch(struct dp_netdev_pmd_thread *pmd, |
3540 | | const struct netdev_flow_key *keys, |
3541 | | struct dpcls_rule **rules, |
3542 | | uint32_t smc_insert_mask) |
3543 | 0 | { |
3544 | 0 | while (smc_insert_mask) { |
3545 | 0 | uint32_t i = raw_ctz(smc_insert_mask); |
3546 | 0 | smc_insert_mask &= smc_insert_mask - 1; |
3547 | | /* Get the required parameters for EMC/SMC from the rule. */
3548 | 0 | struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]); |
3549 | 0 | uint32_t hash = dp_netdev_flow_hash(&flow->ufid); |
3550 | | /* Insert the key into EMC/SMC. */ |
3551 | 0 | smc_insert(pmd, &keys[i], hash); |
3552 | 0 | } |
3553 | 0 | } |
3554 | | |
3555 | | static struct dp_netdev_flow * |
3556 | | dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd, |
3557 | | const struct netdev_flow_key *key, |
3558 | | int *lookup_num_p) |
3559 | 0 | { |
3560 | 0 | struct dpcls *cls; |
3561 | 0 | struct dpcls_rule *rule = NULL; |
3562 | 0 | odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf, |
3563 | 0 | in_port.odp_port)); |
3564 | 0 | struct dp_netdev_flow *netdev_flow = NULL; |
3565 | |
|
3566 | 0 | cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
3567 | 0 | if (OVS_LIKELY(cls)) { |
3568 | 0 | dpcls_lookup(cls, &key, &rule, 1, lookup_num_p); |
3569 | 0 | netdev_flow = dp_netdev_flow_cast(rule); |
3570 | 0 | } |
3571 | 0 | return netdev_flow; |
3572 | 0 | } |
3573 | | |
3574 | | static struct dp_netdev_flow * |
3575 | | dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd, |
3576 | | const ovs_u128 *ufidp, const struct nlattr *key, |
3577 | | size_t key_len) |
3578 | 0 | { |
3579 | 0 | struct dp_netdev_flow *netdev_flow; |
3580 | 0 | struct flow flow; |
3581 | 0 | ovs_u128 ufid; |
3582 | | |
3583 | | /* If a UFID is not provided, determine one based on the key. */ |
3584 | 0 | if (!ufidp && key && key_len |
3585 | 0 | && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) { |
3586 | 0 | odp_flow_key_hash(&flow, sizeof flow, &ufid); |
3587 | 0 | ufidp = &ufid; |
3588 | 0 | } |
3589 | |
|
3590 | 0 | if (ufidp) { |
3591 | 0 | CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp), |
3592 | 0 | &pmd->flow_table) { |
3593 | 0 | if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) { |
3594 | 0 | return netdev_flow; |
3595 | 0 | } |
3596 | 0 | } |
3597 | 0 | } |
3598 | | |
3599 | 0 | return NULL; |
3600 | 0 | } |
3601 | | |
3602 | | static void |
3603 | | dp_netdev_flow_set_last_stats_attrs(struct dp_netdev_flow *netdev_flow, |
3604 | | const struct dpif_flow_stats *stats, |
3605 | | const struct dpif_flow_attrs *attrs, |
3606 | | int result) |
3607 | 0 | { |
3608 | 0 | struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats; |
3609 | 0 | struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs; |
3610 | |
|
3611 | 0 | atomic_store_relaxed(&netdev_flow->netdev_flow_get_result, result); |
3612 | 0 | if (result) { |
3613 | 0 | return; |
3614 | 0 | } |
3615 | | |
3616 | 0 | atomic_store_relaxed(&last_stats->used, stats->used); |
3617 | 0 | atomic_store_relaxed(&last_stats->packet_count, stats->n_packets); |
3618 | 0 | atomic_store_relaxed(&last_stats->byte_count, stats->n_bytes); |
3619 | 0 | atomic_store_relaxed(&last_stats->tcp_flags, stats->tcp_flags); |
3620 | |
|
3621 | 0 | atomic_store_relaxed(&last_attrs->offloaded, attrs->offloaded); |
3622 | 0 | atomic_store_relaxed(&last_attrs->dp_layer, attrs->dp_layer); |
3623 | |
|
3624 | 0 | } |
3625 | | |
3626 | | static void |
3627 | | dp_netdev_flow_get_last_stats_attrs(struct dp_netdev_flow *netdev_flow, |
3628 | | struct dpif_flow_stats *stats, |
3629 | | struct dpif_flow_attrs *attrs, |
3630 | | int *result) |
3631 | 0 | { |
3632 | 0 | struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats; |
3633 | 0 | struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs; |
3634 | |
|
3635 | 0 | atomic_read_relaxed(&netdev_flow->netdev_flow_get_result, result); |
3636 | 0 | if (*result) { |
3637 | 0 | return; |
3638 | 0 | } |
3639 | | |
3640 | 0 | atomic_read_relaxed(&last_stats->used, &stats->used); |
3641 | 0 | atomic_read_relaxed(&last_stats->packet_count, &stats->n_packets); |
3642 | 0 | atomic_read_relaxed(&last_stats->byte_count, &stats->n_bytes); |
3643 | 0 | atomic_read_relaxed(&last_stats->tcp_flags, &stats->tcp_flags); |
3644 | |
|
3645 | 0 | atomic_read_relaxed(&last_attrs->offloaded, &attrs->offloaded); |
3646 | 0 | atomic_read_relaxed(&last_attrs->dp_layer, &attrs->dp_layer); |
3647 | 0 | } |
3648 | | |
3649 | | static bool |
3650 | | dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp, |
3651 | | struct dp_netdev_flow *netdev_flow, |
3652 | | struct dpif_flow_stats *stats, |
3653 | | struct dpif_flow_attrs *attrs) |
3654 | 0 | { |
3655 | 0 | uint64_t act_buf[1024 / 8]; |
3656 | 0 | struct nlattr *actions; |
3657 | 0 | struct netdev *netdev; |
3658 | 0 | struct match match; |
3659 | 0 | struct ofpbuf buf; |
3660 | |
|
3661 | 0 | int ret = 0; |
3662 | |
|
3663 | 0 | if (!netdev_is_flow_api_enabled()) { |
3664 | 0 | return false; |
3665 | 0 | } |
3666 | | |
3667 | 0 | netdev = netdev_ports_get(netdev_flow->flow.in_port.odp_port, |
3668 | 0 | dpif_normalize_type(dp->class->type)); |
3669 | 0 | if (!netdev) { |
3670 | 0 | return false; |
3671 | 0 | } |
3672 | 0 | ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf); |
3673 | | /* Taking a global 'port_rwlock' to fulfill thread safety |
3674 | | * restrictions regarding netdev port mapping. |
3675 | | * |
3676 | | * XXX: Main thread will try to pause/stop all revalidators during datapath |
3677 | | * reconfiguration via datapath purge callback (dp_purge_cb) while |
3678 | | * rw-holding 'dp->port_rwlock'. So we're not waiting for the lock here.
3679 | | * Otherwise, deadlock is possible, because revalidators might sleep
3680 | | * waiting for the main thread to release the lock and the main thread
3681 | | * will wait for them to stop processing.
3682 | | * This workaround might make statistics less accurate, especially
3683 | | * for the flow deletion case, since there will be no other attempt. */
3684 | 0 | if (!ovs_rwlock_tryrdlock(&dp->port_rwlock)) { |
3685 | 0 | ret = netdev_flow_get(netdev, &match, &actions, |
3686 | 0 | &netdev_flow->mega_ufid, stats, attrs, &buf); |
3687 | | /* Storing statistics and attributes from the last request for |
3688 | | * later use on mutex contention. */ |
3689 | 0 | dp_netdev_flow_set_last_stats_attrs(netdev_flow, stats, attrs, ret); |
3690 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
3691 | 0 | } else { |
3692 | 0 | dp_netdev_flow_get_last_stats_attrs(netdev_flow, stats, attrs, &ret); |
3693 | 0 | if (!ret && !attrs->dp_layer) { |
3694 | | /* Flow was never reported as 'offloaded' so it's harmless |
3695 | | * to continue to think so. */ |
3696 | 0 | ret = EAGAIN; |
3697 | 0 | } |
3698 | 0 | } |
3699 | 0 | netdev_close(netdev); |
3700 | 0 | if (ret) { |
3701 | 0 | return false; |
3702 | 0 | } |
3703 | | |
3704 | 0 | return true; |
3705 | 0 | } |
3706 | | |
3707 | | static void |
3708 | | get_dpif_flow_status(const struct dp_netdev *dp, |
3709 | | const struct dp_netdev_flow *netdev_flow_, |
3710 | | struct dpif_flow_stats *stats, |
3711 | | struct dpif_flow_attrs *attrs) |
3712 | 0 | { |
3713 | 0 | struct dpif_flow_stats offload_stats; |
3714 | 0 | struct dpif_flow_attrs offload_attrs; |
3715 | 0 | struct dp_netdev_flow *netdev_flow; |
3716 | 0 | unsigned long long n; |
3717 | 0 | long long used; |
3718 | 0 | uint16_t flags; |
3719 | |
|
3720 | 0 | netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_); |
3721 | |
|
3722 | 0 | atomic_read_relaxed(&netdev_flow->stats.packet_count, &n); |
3723 | 0 | stats->n_packets = n; |
3724 | 0 | atomic_read_relaxed(&netdev_flow->stats.byte_count, &n); |
3725 | 0 | stats->n_bytes = n; |
3726 | 0 | atomic_read_relaxed(&netdev_flow->stats.used, &used); |
3727 | 0 | stats->used = used; |
3728 | 0 | atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags); |
3729 | 0 | stats->tcp_flags = flags; |
3730 | |
|
3731 | 0 | if (dpif_netdev_get_flow_offload_status(dp, netdev_flow, |
3732 | 0 | &offload_stats, &offload_attrs)) { |
3733 | 0 | stats->n_packets += offload_stats.n_packets; |
3734 | 0 | stats->n_bytes += offload_stats.n_bytes; |
3735 | 0 | stats->used = MAX(stats->used, offload_stats.used); |
3736 | 0 | stats->tcp_flags |= offload_stats.tcp_flags; |
3737 | 0 | if (attrs) { |
3738 | 0 | attrs->offloaded = offload_attrs.offloaded; |
3739 | 0 | attrs->dp_layer = offload_attrs.dp_layer; |
3740 | 0 | } |
3741 | 0 | } else if (attrs) { |
3742 | 0 | attrs->offloaded = false; |
3743 | 0 | attrs->dp_layer = "ovs"; |
3744 | 0 | } |
3745 | 0 | } |
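/* Editorial note (not part of the upstream sources): the statistics
 * reported for a flow are the union of the software (PMD) counters and,
 * when the flow is also offloaded, the counters returned by the offload
 * provider: packet and byte counts are summed, 'used' takes the most
 * recent timestamp and tcp_flags are OR'ed, as coded above. */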
3746 | | |
3747 | | /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for |
3748 | | * storing the netlink-formatted key/mask. 'key_buf' may be the same as |
3749 | | * 'mask_buf'. Actions will be returned without copying, by relying on RCU to |
3750 | | * protect them. */ |
3751 | | static void |
3752 | | dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp, |
3753 | | const struct dp_netdev_flow *netdev_flow, |
3754 | | struct ofpbuf *key_buf, struct ofpbuf *mask_buf, |
3755 | | struct dpif_flow *flow, bool terse) |
3756 | 0 | { |
3757 | 0 | if (terse) { |
3758 | 0 | memset(flow, 0, sizeof *flow); |
3759 | 0 | } else { |
3760 | 0 | struct flow_wildcards wc; |
3761 | 0 | struct dp_netdev_actions *actions; |
3762 | 0 | size_t offset; |
3763 | 0 | struct odp_flow_key_parms odp_parms = { |
3764 | 0 | .flow = &netdev_flow->flow, |
3765 | 0 | .mask = &wc.masks, |
3766 | 0 | .support = dp_netdev_support, |
3767 | 0 | }; |
3768 | |
|
3769 | 0 | miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks); |
3770 | | /* in_port is exact-matched, but we have left it out of the mask for
3771 | | * optimization reasons. Add in_port back to the mask. */
3772 | 0 | wc.masks.in_port.odp_port = ODPP_NONE; |
3773 | | |
3774 | | /* Key */ |
3775 | 0 | offset = key_buf->size; |
3776 | 0 | flow->key = ofpbuf_tail(key_buf); |
3777 | 0 | odp_flow_key_from_flow(&odp_parms, key_buf); |
3778 | 0 | flow->key_len = key_buf->size - offset; |
3779 | | |
3780 | | /* Mask */ |
3781 | 0 | offset = mask_buf->size; |
3782 | 0 | flow->mask = ofpbuf_tail(mask_buf); |
3783 | 0 | odp_parms.key_buf = key_buf; |
3784 | 0 | odp_flow_key_from_mask(&odp_parms, mask_buf); |
3785 | 0 | flow->mask_len = mask_buf->size - offset; |
3786 | | |
3787 | | /* Actions */ |
3788 | 0 | actions = dp_netdev_flow_get_actions(netdev_flow); |
3789 | 0 | flow->actions = actions->actions; |
3790 | 0 | flow->actions_len = actions->size; |
3791 | 0 | } |
3792 | |
|
3793 | 0 | flow->ufid = netdev_flow->ufid; |
3794 | 0 | flow->ufid_present = true; |
3795 | 0 | flow->pmd_id = netdev_flow->pmd_id; |
3796 | |
|
3797 | 0 | get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs); |
3798 | 0 | flow->attrs.dp_extra_info = netdev_flow->dp_extra_info; |
3799 | 0 | } |
3800 | | |
3801 | | static int |
3802 | | dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len, |
3803 | | const struct nlattr *mask_key, |
3804 | | uint32_t mask_key_len, const struct flow *flow, |
3805 | | struct flow_wildcards *wc, bool probe) |
3806 | 0 | { |
3807 | 0 | enum odp_key_fitness fitness; |
3808 | |
|
3809 | 0 | fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL); |
3810 | 0 | if (fitness) { |
3811 | 0 | if (!probe) { |
3812 | | /* This should not happen: it indicates that |
3813 | | * odp_flow_key_from_mask() and odp_flow_key_to_mask() |
3814 | | * disagree on the acceptable form of a mask. Log the problem |
3815 | | * as an error, with enough details to enable debugging. */ |
3816 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
3817 | |
|
3818 | 0 | if (!VLOG_DROP_ERR(&rl)) { |
3819 | 0 | struct ds s; |
3820 | |
|
3821 | 0 | ds_init(&s); |
3822 | 0 | odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s, |
3823 | 0 | true); |
3824 | 0 | VLOG_ERR("internal error parsing flow mask %s (%s)", |
3825 | 0 | ds_cstr(&s), odp_key_fitness_to_string(fitness)); |
3826 | 0 | ds_destroy(&s); |
3827 | 0 | } |
3828 | 0 | } |
3829 | |
|
3830 | 0 | return EINVAL; |
3831 | 0 | } |
3832 | | |
3833 | 0 | return 0; |
3834 | 0 | } |
3835 | | |
3836 | | static int |
3837 | | dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len, |
3838 | | struct flow *flow, bool probe) |
3839 | 0 | { |
3840 | 0 | if (odp_flow_key_to_flow(key, key_len, flow, NULL)) { |
3841 | 0 | if (!probe) { |
3842 | | /* This should not happen: it indicates that |
3843 | | * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on |
3844 | | * the acceptable form of a flow. Log the problem as an error, |
3845 | | * with enough details to enable debugging. */ |
3846 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
3847 | |
|
3848 | 0 | if (!VLOG_DROP_ERR(&rl)) { |
3849 | 0 | struct ds s; |
3850 | |
|
3851 | 0 | ds_init(&s); |
3852 | 0 | odp_flow_format(key, key_len, NULL, 0, NULL, &s, true); |
3853 | 0 | VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s)); |
3854 | 0 | ds_destroy(&s); |
3855 | 0 | } |
3856 | 0 | } |
3857 | |
|
3858 | 0 | return EINVAL; |
3859 | 0 | } |
3860 | | |
3861 | 0 | if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) { |
3862 | 0 | return EINVAL; |
3863 | 0 | } |
3864 | | |
3865 | 0 | return 0; |
3866 | 0 | } |
3867 | | |
3868 | | static int |
3869 | | dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get) |
3870 | 0 | { |
3871 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
3872 | 0 | struct dp_netdev_flow *netdev_flow; |
3873 | 0 | struct dp_netdev_pmd_thread *pmd; |
3874 | 0 | struct hmapx to_find = HMAPX_INITIALIZER(&to_find); |
3875 | 0 | struct hmapx_node *node; |
3876 | 0 | int error = EINVAL; |
3877 | |
|
3878 | 0 | if (get->pmd_id == PMD_ID_NULL) { |
3879 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
3880 | 0 | if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) { |
3881 | 0 | dp_netdev_pmd_unref(pmd); |
3882 | 0 | } |
3883 | 0 | } |
3884 | 0 | } else { |
3885 | 0 | pmd = dp_netdev_get_pmd(dp, get->pmd_id); |
3886 | 0 | if (!pmd) { |
3887 | 0 | goto out; |
3888 | 0 | } |
3889 | 0 | hmapx_add(&to_find, pmd); |
3890 | 0 | } |
3891 | | |
3892 | 0 | if (!hmapx_count(&to_find)) { |
3893 | 0 | goto out; |
3894 | 0 | } |
3895 | | |
3896 | 0 | HMAPX_FOR_EACH (node, &to_find) { |
3897 | 0 | pmd = (struct dp_netdev_pmd_thread *) node->data; |
3898 | 0 | netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key, |
3899 | 0 | get->key_len); |
3900 | 0 | if (netdev_flow) { |
3901 | 0 | dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer, |
3902 | 0 | get->buffer, get->flow, false); |
3903 | 0 | error = 0; |
3904 | 0 | break; |
3905 | 0 | } else { |
3906 | 0 | error = ENOENT; |
3907 | 0 | } |
3908 | 0 | } |
3909 | |
|
3910 | 0 | HMAPX_FOR_EACH (node, &to_find) { |
3911 | 0 | pmd = (struct dp_netdev_pmd_thread *) node->data; |
3912 | 0 | dp_netdev_pmd_unref(pmd); |
3913 | 0 | } |
3914 | 0 | out: |
3915 | 0 | hmapx_destroy(&to_find); |
3916 | 0 | return error; |
3917 | 0 | } |
3918 | | |
3919 | | static void |
3920 | | dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid) |
3921 | 0 | { |
3922 | 0 | struct flow masked_flow; |
3923 | 0 | size_t i; |
3924 | |
|
3925 | 0 | for (i = 0; i < sizeof(struct flow); i++) { |
3926 | 0 | ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] & |
3927 | 0 | ((uint8_t *)&match->wc)[i]; |
3928 | 0 | } |
3929 | 0 | odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid); |
3930 | 0 | } |
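/* Editorial note (not part of the upstream sources): the mega UFID is the
 * hash of the flow after a byte-wise AND with its wildcard mask, so every
 * datapath flow covered by the same megaflow yields the same 'mega_ufid'.
 * That identifier, rather than the per-flow 'ufid', is what the offload
 * status query above (netdev_flow_get()) keys on. */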
3931 | | |
3932 | | uint64_t |
3933 | | dp_netdev_simple_match_mark(odp_port_t in_port, ovs_be16 dl_type, |
3934 | | uint8_t nw_frag, ovs_be16 vlan_tci) |
3935 | 0 | { |
3936 | | /* Simple Match Mark: |
3937 | | * |
3938 | | * BE: |
3939 | | * +-----------------+-------------++---------+---+-----------+
3940 | | * |     in_port     |   dl_type   || nw_frag |CFI|  VID(12)  |
3941 | | * +-----------------+-------------++---------+---+-----------+
3942 | | * 0                  32         47  49       51  52        63
3943 | | *
3944 | | * LE:
3945 | | * +-----------------+-------------+------++-------+---+------+
3946 | | * |     in_port     |   dl_type   |VID(8)||nw_frag|CFI|VID(4)|
3947 | | * +-----------------+-------------+------++-------+---+------+
3948 | | * 0                  32         47 48  55  57   59 60  61  63
3949 | | *
3950 | | * Big Endian                      Little Endian
3951 | | * in_port : 32 bits [ 0..31]      in_port : 32 bits [ 0..31]
3952 | | * dl_type : 16 bits [32..47]      dl_type : 16 bits [32..47]
3953 | | * <empty> :  1 bit  [48..48]      vlan VID:  8 bits [48..55]
3954 | | * nw_frag :  2 bits [49..50]      <empty> :  1 bit  [56..56]
3955 | | * vlan CFI:  1 bit  [51..51]      nw_frag :  2 bits [57..59]
3956 | | * vlan VID: 12 bits [52..63]      vlan CFI:  1 bit  [60..60]
3957 | | *                                 vlan VID:  4 bits [61..63]
3958 | | * |
3959 | | * Layout is different for LE and BE in order to save a couple of |
3960 | | * network to host translations. |
3961 | | * */ |
3962 | 0 | return ((uint64_t) odp_to_u32(in_port) << 32) |
3963 | 0 | | ((OVS_FORCE uint32_t) dl_type << 16) |
3964 | | #if WORDS_BIGENDIAN |
3965 | | | (((uint16_t) nw_frag & FLOW_NW_FRAG_MASK) << VLAN_PCP_SHIFT) |
3966 | | #else |
3967 | 0 | | ((nw_frag & FLOW_NW_FRAG_MASK) << (VLAN_PCP_SHIFT - 8)) |
3968 | 0 | #endif |
3969 | 0 | | (OVS_FORCE uint16_t) (vlan_tci & htons(VLAN_VID_MASK | VLAN_CFI)); |
3970 | 0 | } |
3971 | | |
3972 | | struct dp_netdev_flow * |
3973 | | dp_netdev_simple_match_lookup(const struct dp_netdev_pmd_thread *pmd, |
3974 | | odp_port_t in_port, ovs_be16 dl_type, |
3975 | | uint8_t nw_frag, ovs_be16 vlan_tci) |
3976 | 0 | { |
3977 | 0 | uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, |
3978 | 0 | nw_frag, vlan_tci); |
3979 | 0 | uint32_t hash = hash_uint64(mark); |
3980 | 0 | struct dp_netdev_flow *flow; |
3981 | 0 | bool found = false; |
3982 | |
|
3983 | 0 | CMAP_FOR_EACH_WITH_HASH (flow, simple_match_node, |
3984 | 0 | hash, &pmd->simple_match_table) { |
3985 | 0 | if (flow->simple_match_mark == mark) { |
3986 | 0 | found = true; |
3987 | 0 | break; |
3988 | 0 | } |
3989 | 0 | } |
3990 | 0 | return found ? flow : NULL; |
3991 | 0 | } |
3992 | | |
3993 | | bool |
3994 | | dp_netdev_simple_match_enabled(const struct dp_netdev_pmd_thread *pmd, |
3995 | | odp_port_t in_port) |
3996 | 0 | { |
3997 | 0 | return ccmap_find(&pmd->n_flows, odp_to_u32(in_port)) |
3998 | 0 | == ccmap_find(&pmd->n_simple_flows, odp_to_u32(in_port)); |
3999 | 0 | } |
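/* Editorial note (not part of the upstream sources): the comparison above
 * enables the simple-match fast path for an in_port only while every flow
 * installed for that port also qualified for 'simple_match_table', i.e.
 * the per-port counters in 'n_flows' and 'n_simple_flows' are equal.  A
 * single non-simple flow on the port makes them diverge and disables the
 * fast path for that port. */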
4000 | | |
4001 | | static void |
4002 | | dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd, |
4003 | | struct dp_netdev_flow *dp_flow) |
4004 | | OVS_REQUIRES(pmd->flow_mutex) |
4005 | 0 | { |
4006 | 0 | odp_port_t in_port = dp_flow->flow.in_port.odp_port; |
4007 | 0 | ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci; |
4008 | 0 | ovs_be16 dl_type = dp_flow->flow.dl_type; |
4009 | 0 | uint8_t nw_frag = dp_flow->flow.nw_frag; |
4010 | |
|
4011 | 0 | if (!dp_netdev_flow_ref(dp_flow)) { |
4012 | 0 | return; |
4013 | 0 | } |
4014 | | |
4015 | | /* Avoid double insertion. Should not happen in practice. */ |
4016 | 0 | dp_netdev_simple_match_remove(pmd, dp_flow); |
4017 | |
|
4018 | 0 | uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, |
4019 | 0 | nw_frag, vlan_tci); |
4020 | 0 | uint32_t hash = hash_uint64(mark); |
4021 | |
|
4022 | 0 | dp_flow->simple_match_mark = mark; |
4023 | 0 | cmap_insert(&pmd->simple_match_table, |
4024 | 0 | CONST_CAST(struct cmap_node *, &dp_flow->simple_match_node), |
4025 | 0 | hash); |
4026 | 0 | ccmap_inc(&pmd->n_simple_flows, odp_to_u32(in_port)); |
4027 | |
|
4028 | 0 | VLOG_DBG("Simple match insert: " |
4029 | 0 | "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").", |
4030 | 0 | pmd->core_id, in_port, mark); |
4031 | 0 | } |
4032 | | |
4033 | | static void |
4034 | | dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd, |
4035 | | struct dp_netdev_flow *dp_flow) |
4036 | | OVS_REQUIRES(pmd->flow_mutex) |
4037 | 0 | { |
4038 | 0 | odp_port_t in_port = dp_flow->flow.in_port.odp_port; |
4039 | 0 | ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci; |
4040 | 0 | ovs_be16 dl_type = dp_flow->flow.dl_type; |
4041 | 0 | uint8_t nw_frag = dp_flow->flow.nw_frag; |
4042 | 0 | struct dp_netdev_flow *flow; |
4043 | 0 | uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, |
4044 | 0 | nw_frag, vlan_tci); |
4045 | 0 | uint32_t hash = hash_uint64(mark); |
4046 | |
|
4047 | 0 | flow = dp_netdev_simple_match_lookup(pmd, in_port, dl_type, |
4048 | 0 | nw_frag, vlan_tci); |
4049 | 0 | if (flow == dp_flow) { |
4050 | 0 | VLOG_DBG("Simple match remove: " |
4051 | 0 | "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").", |
4052 | 0 | pmd->core_id, in_port, mark); |
4053 | 0 | cmap_remove(&pmd->simple_match_table, |
4054 | 0 | CONST_CAST(struct cmap_node *, &flow->simple_match_node), |
4055 | 0 | hash); |
4056 | 0 | ccmap_dec(&pmd->n_simple_flows, odp_to_u32(in_port)); |
4057 | 0 | dp_netdev_flow_unref(flow); |
4058 | 0 | } |
4059 | 0 | } |
4060 | | |
4061 | | static bool |
4062 | | dp_netdev_flow_is_simple_match(const struct match *match) |
4063 | 0 | { |
4064 | 0 | const struct flow *flow = &match->flow; |
4065 | 0 | const struct flow_wildcards *wc = &match->wc; |
4066 | |
|
4067 | 0 | if (flow->recirc_id || flow->packet_type != htonl(PT_ETH)) { |
4068 | 0 | return false; |
4069 | 0 | } |
4070 | | |
4071 | | /* Check that the flow matches only the minimal set of fields that are
4072 | | * always set. Also check that VLAN VID+CFI is an exact match, because
4073 | | * these are not mandatory and could be masked. */
4074 | 0 | struct flow_wildcards *minimal = xmalloc(sizeof *minimal); |
4075 | 0 | ovs_be16 vlan_tci_mask = htons(VLAN_VID_MASK | VLAN_CFI); |
4076 | |
|
4077 | 0 | flow_wildcards_init_catchall(minimal); |
4078 | | /* 'dpif-netdev' always has the following in exact match:
4079 | | * - recirc_id <-- recirc_id == 0 checked on input. |
4080 | | * - in_port <-- Will be checked on input. |
4081 | | * - packet_type <-- Assuming all packets are PT_ETH. |
4082 | | * - dl_type <-- Need to match with. |
4083 | | * - vlan_tci <-- Need to match with. |
4084 | | * - and nw_frag for ip packets. <-- Need to match with. |
4085 | | */ |
4086 | 0 | WC_MASK_FIELD(minimal, recirc_id); |
4087 | 0 | WC_MASK_FIELD(minimal, in_port); |
4088 | 0 | WC_MASK_FIELD(minimal, packet_type); |
4089 | 0 | WC_MASK_FIELD(minimal, dl_type); |
4090 | 0 | WC_MASK_FIELD_MASK(minimal, vlans[0].tci, vlan_tci_mask); |
4091 | 0 | WC_MASK_FIELD_MASK(minimal, nw_frag, FLOW_NW_FRAG_MASK); |
4092 | |
|
4093 | 0 | if (flow_wildcards_has_extra(minimal, wc) |
4094 | 0 | || wc->masks.vlans[0].tci != vlan_tci_mask) { |
4095 | 0 | free(minimal); |
4096 | 0 | return false; |
4097 | 0 | } |
4098 | 0 | free(minimal); |
4099 | |
|
4100 | 0 | return true; |
4101 | 0 | } |
4102 | | |
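/* Editorial summary (not part of the upstream sources) of
 * dp_netdev_flow_add() below: under 'pmd->flow_mutex' the new flow is
 * (1) given a mask with in_port wildcarded (the dpcls is already selected
 *     per in_port),
 * (2) inserted into the per-port dpcls and into 'pmd->flow_table' keyed
 *     by the UFID hash,
 * (3) added to the simple-match table when the match qualifies, and
 * (4) queued for hardware offload via queue_netdev_flow_put(). */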
4103 | | static struct dp_netdev_flow * |
4104 | | dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd, |
4105 | | struct match *match, const ovs_u128 *ufid, |
4106 | | const struct nlattr *actions, size_t actions_len, |
4107 | | odp_port_t orig_in_port) |
4108 | | OVS_REQUIRES(pmd->flow_mutex) |
4109 | 0 | { |
4110 | 0 | struct ds extra_info = DS_EMPTY_INITIALIZER; |
4111 | 0 | struct dp_netdev_flow *flow; |
4112 | 0 | struct netdev_flow_key mask; |
4113 | 0 | struct dpcls *cls; |
4114 | 0 | size_t unit; |
4115 | | |
4116 | | /* Make sure in_port is exact-matched before we read it. */
4117 | 0 | ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE); |
4118 | 0 | odp_port_t in_port = match->flow.in_port.odp_port; |
4119 | | |
4120 | | /* As we select the dpcls based on the port number, each netdev flow |
4121 | | * belonging to the same dpcls will have the same odp_port value. |
4122 | | * For performance reasons we wildcard odp_port here in the mask. In the |
4123 | | * typical case dp_hash is also wildcarded, and the resulting 8-byte |
4124 | | * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and |
4125 | | * will not be part of the subtable mask. |
4126 | | * This will speed up the hash computation during dpcls_lookup() because |
4127 | | * there is one less call to hash_add64() in this case. */ |
4128 | 0 | match->wc.masks.in_port.odp_port = 0; |
4129 | 0 | netdev_flow_mask_init(&mask, match); |
4130 | 0 | match->wc.masks.in_port.odp_port = ODPP_NONE; |
4131 | | |
4132 | | /* Make sure wc does not have metadata. */ |
4133 | 0 | ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata) |
4134 | 0 | && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs)); |
4135 | | |
4136 | | /* Do not allocate extra space. */ |
4137 | 0 | flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len); |
4138 | 0 | memset(&flow->stats, 0, sizeof flow->stats); |
4139 | 0 | atomic_init(&flow->netdev_flow_get_result, 0); |
4140 | 0 | memset(&flow->last_stats, 0, sizeof flow->last_stats); |
4141 | 0 | memset(&flow->last_attrs, 0, sizeof flow->last_attrs); |
4142 | 0 | flow->dead = false; |
4143 | 0 | flow->batch = NULL; |
4144 | 0 | flow->mark = INVALID_FLOW_MARK; |
4145 | 0 | flow->orig_in_port = orig_in_port; |
4146 | 0 | *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id; |
4147 | 0 | *CONST_CAST(struct flow *, &flow->flow) = match->flow; |
4148 | 0 | *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid; |
4149 | 0 | ovs_refcount_init(&flow->ref_cnt); |
4150 | 0 | ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len)); |
4151 | |
|
4152 | 0 | dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid)); |
4153 | 0 | netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask); |
4154 | | |
4155 | | /* Select dpcls for in_port. Relies on in_port being an exact match. */
4156 | 0 | cls = dp_netdev_pmd_find_dpcls(pmd, in_port); |
4157 | 0 | dpcls_insert(cls, &flow->cr, &mask); |
4158 | |
|
4159 | 0 | ds_put_cstr(&extra_info, "miniflow_bits("); |
4160 | 0 | FLOWMAP_FOR_EACH_UNIT (unit) { |
4161 | 0 | if (unit) { |
4162 | 0 | ds_put_char(&extra_info, ','); |
4163 | 0 | } |
4164 | 0 | ds_put_format(&extra_info, "%d", |
4165 | 0 | count_1bits(flow->cr.mask->mf.map.bits[unit])); |
4166 | 0 | } |
4167 | 0 | ds_put_char(&extra_info, ')'); |
4168 | 0 | flow->dp_extra_info = ds_steal_cstr(&extra_info); |
4169 | 0 | ds_destroy(&extra_info); |
4170 | |
|
4171 | 0 | cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node), |
4172 | 0 | dp_netdev_flow_hash(&flow->ufid)); |
4173 | 0 | ccmap_inc(&pmd->n_flows, odp_to_u32(in_port)); |
4174 | |
|
4175 | 0 | if (dp_netdev_flow_is_simple_match(match)) { |
4176 | 0 | dp_netdev_simple_match_insert(pmd, flow); |
4177 | 0 | } |
4178 | |
|
4179 | 0 | queue_netdev_flow_put(pmd, flow, match, actions, actions_len, |
4180 | 0 | DP_NETDEV_FLOW_OFFLOAD_OP_ADD); |
4181 | 0 | log_netdev_flow_change(flow, match, NULL, actions, actions_len); |
4182 | |
|
4183 | 0 | return flow; |
4184 | 0 | } |
4185 | | |
4186 | | static int |
4187 | | flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd, |
4188 | | struct netdev_flow_key *key, |
4189 | | struct match *match, |
4190 | | ovs_u128 *ufid, |
4191 | | const struct dpif_flow_put *put, |
4192 | | struct dpif_flow_stats *stats) |
4193 | 0 | { |
4194 | 0 | struct dp_netdev_flow *netdev_flow; |
4195 | 0 | int error = 0; |
4196 | |
|
4197 | 0 | if (stats) { |
4198 | 0 | memset(stats, 0, sizeof *stats); |
4199 | 0 | } |
4200 | |
|
4201 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
4202 | 0 | netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); |
4203 | 0 | if (!netdev_flow) { |
4204 | 0 | if (put->flags & DPIF_FP_CREATE) { |
4205 | 0 | dp_netdev_flow_add(pmd, match, ufid, put->actions, |
4206 | 0 | put->actions_len, ODPP_NONE); |
4207 | 0 | } else { |
4208 | 0 | error = ENOENT; |
4209 | 0 | } |
4210 | 0 | } else { |
4211 | 0 | if (put->flags & DPIF_FP_MODIFY) { |
4212 | 0 | struct dp_netdev_actions *new_actions; |
4213 | 0 | struct dp_netdev_actions *old_actions; |
4214 | |
|
4215 | 0 | new_actions = dp_netdev_actions_create(put->actions, |
4216 | 0 | put->actions_len); |
4217 | |
|
4218 | 0 | old_actions = dp_netdev_flow_get_actions(netdev_flow); |
4219 | 0 | ovsrcu_set(&netdev_flow->actions, new_actions); |
4220 | |
|
4221 | 0 | queue_netdev_flow_put(pmd, netdev_flow, match, |
4222 | 0 | put->actions, put->actions_len, |
4223 | 0 | DP_NETDEV_FLOW_OFFLOAD_OP_MOD); |
4224 | 0 | log_netdev_flow_change(netdev_flow, match, old_actions, |
4225 | 0 | put->actions, put->actions_len); |
4226 | |
|
4227 | 0 | if (stats) { |
4228 | 0 | get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL); |
4229 | 0 | } |
4230 | 0 | if (put->flags & DPIF_FP_ZERO_STATS) { |
4231 | | /* XXX: The userspace datapath uses thread local statistics |
4232 | | * (for flows), which should be updated only by the owning |
4233 | | * thread. Since we cannot write to the stats memory here,
4234 | | * we choose not to support this flag. Please note: |
4235 | | * - This feature is currently used only by dpctl commands with |
4236 | | * option --clear. |
4237 | | * - Should the need arise, this operation can be implemented |
4238 | | * by keeping a base value (to be updated here) for each
4239 | | * counter, and subtracting it before outputting the stats. */
4240 | 0 | error = EOPNOTSUPP; |
4241 | 0 | } |
4242 | |
|
4243 | 0 | ovsrcu_postpone(dp_netdev_actions_free, old_actions); |
4244 | 0 | } else if (put->flags & DPIF_FP_CREATE) { |
4245 | 0 | error = EEXIST; |
4246 | 0 | } else { |
4247 | | /* Overlapping flow. */ |
4248 | 0 | error = EINVAL; |
4249 | 0 | } |
4250 | 0 | } |
4251 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
4252 | 0 | return error; |
4253 | 0 | } |
4254 | | |
4255 | | static int |
4256 | | dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put) |
4257 | 0 | { |
4258 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
4259 | 0 | struct netdev_flow_key key; |
4260 | 0 | struct dp_netdev_pmd_thread *pmd; |
4261 | 0 | struct match match; |
4262 | 0 | ovs_u128 ufid; |
4263 | 0 | int error; |
4264 | 0 | bool probe = put->flags & DPIF_FP_PROBE; |
4265 | |
|
4266 | 0 | if (put->stats) { |
4267 | 0 | memset(put->stats, 0, sizeof *put->stats); |
4268 | 0 | } |
4269 | 0 | error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow, |
4270 | 0 | probe); |
4271 | 0 | if (error) { |
4272 | 0 | return error; |
4273 | 0 | } |
4274 | 0 | error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len, |
4275 | 0 | put->mask, put->mask_len, |
4276 | 0 | &match.flow, &match.wc, probe); |
4277 | 0 | if (error) { |
4278 | 0 | return error; |
4279 | 0 | } |
4280 | | |
4281 | 0 | if (match.wc.masks.in_port.odp_port != ODPP_NONE) { |
4282 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
4283 | |
|
4284 | 0 | VLOG_ERR_RL(&rl, "failed to put%s flow: in_port is not an exact match", |
4285 | 0 | (put->flags & DPIF_FP_CREATE) ? "[create]" |
4286 | 0 | : (put->flags & DPIF_FP_MODIFY) ? "[modify]" : "[zero]"); |
4287 | 0 | return EINVAL; |
4288 | 0 | } |
4289 | | |
4290 | 0 | if (put->ufid) { |
4291 | 0 | ufid = *put->ufid; |
4292 | 0 | } else { |
4293 | 0 | odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid); |
4294 | 0 | } |
4295 | | |
4296 | | /* The Netlink encoding of datapath flow keys cannot express |
4297 | | * wildcarding the presence of a VLAN tag. Instead, a missing VLAN |
4298 | | * tag is interpreted as exact match on the fact that there is no |
4299 | | * VLAN. Unless we refactor a lot of code that translates between |
4300 | | * Netlink and struct flow representations, we have to do the same |
4301 | | * here. This must be in sync with 'match' in handle_packet_upcall(). */ |
4302 | 0 | if (!match.wc.masks.vlans[0].tci) { |
4303 | 0 | match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI); |
4304 | 0 | } |
4305 | | |
4306 | | /* Must produce a netdev_flow_key for lookup. |
4307 | | * Use the same method as employed to create the key when adding |
4308 | | * the flow to the dpcls to make sure they match.
4309 | | * We need to put in the unmasked key as flow_put_on_pmd() will first try |
4310 | | * to see if an entry exists doing a packet type lookup. As masked-out |
4311 | | * fields are interpreted as zeros, they could falsely match a wider IP |
4312 | | * address mask. Installation of the flow will use the match variable. */ |
4313 | 0 | netdev_flow_key_init(&key, &match.flow); |
4314 | |
|
4315 | 0 | if (put->pmd_id == PMD_ID_NULL) { |
4316 | 0 | if (cmap_count(&dp->poll_threads) == 0) { |
4317 | 0 | return EINVAL; |
4318 | 0 | } |
4319 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
4320 | 0 | struct dpif_flow_stats pmd_stats; |
4321 | 0 | int pmd_error; |
4322 | |
|
4323 | 0 | pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, |
4324 | 0 | &pmd_stats); |
4325 | 0 | if (pmd_error) { |
4326 | 0 | error = pmd_error; |
4327 | 0 | } else if (put->stats) { |
4328 | 0 | put->stats->n_packets += pmd_stats.n_packets; |
4329 | 0 | put->stats->n_bytes += pmd_stats.n_bytes; |
4330 | 0 | put->stats->used = MAX(put->stats->used, pmd_stats.used); |
4331 | 0 | put->stats->tcp_flags |= pmd_stats.tcp_flags; |
4332 | 0 | } |
4333 | 0 | } |
4334 | 0 | } else { |
4335 | 0 | pmd = dp_netdev_get_pmd(dp, put->pmd_id); |
4336 | 0 | if (!pmd) { |
4337 | 0 | return EINVAL; |
4338 | 0 | } |
4339 | 0 | error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats); |
4340 | 0 | dp_netdev_pmd_unref(pmd); |
4341 | 0 | } |
4342 | | |
4343 | 0 | return error; |
4344 | 0 | } |
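/* Editorial note (not part of the upstream sources): with PMD_ID_NULL the
 * flow_put above (and flow_del below) is applied to every PMD thread, and
 * the per-PMD statistics are folded together the same way as in
 * get_dpif_flow_status(): packet/byte counts are summed, tcp_flags OR'ed
 * and 'used' taken as the most recent timestamp. */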
4345 | | |
4346 | | static int |
4347 | | flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd, |
4348 | | struct dpif_flow_stats *stats, |
4349 | | const struct dpif_flow_del *del) |
4350 | 0 | { |
4351 | 0 | struct dp_netdev_flow *netdev_flow; |
4352 | 0 | int error = 0; |
4353 | |
|
4354 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
4355 | 0 | netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key, |
4356 | 0 | del->key_len); |
4357 | 0 | if (netdev_flow) { |
4358 | 0 | if (stats) { |
4359 | 0 | get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL); |
4360 | 0 | } |
4361 | 0 | dp_netdev_pmd_remove_flow(pmd, netdev_flow); |
4362 | 0 | } else { |
4363 | 0 | error = ENOENT; |
4364 | 0 | } |
4365 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
4366 | |
|
4367 | 0 | return error; |
4368 | 0 | } |
4369 | | |
4370 | | static int |
4371 | | dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del) |
4372 | 0 | { |
4373 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
4374 | 0 | struct dp_netdev_pmd_thread *pmd; |
4375 | 0 | int error = 0; |
4376 | |
|
4377 | 0 | if (del->stats) { |
4378 | 0 | memset(del->stats, 0, sizeof *del->stats); |
4379 | 0 | } |
4380 | |
|
4381 | 0 | if (del->pmd_id == PMD_ID_NULL) { |
4382 | 0 | if (cmap_count(&dp->poll_threads) == 0) { |
4383 | 0 | return EINVAL; |
4384 | 0 | } |
4385 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
4386 | 0 | struct dpif_flow_stats pmd_stats; |
4387 | 0 | int pmd_error; |
4388 | |
|
4389 | 0 | pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del); |
4390 | 0 | if (pmd_error) { |
4391 | 0 | error = pmd_error; |
4392 | 0 | } else if (del->stats) { |
4393 | 0 | del->stats->n_packets += pmd_stats.n_packets; |
4394 | 0 | del->stats->n_bytes += pmd_stats.n_bytes; |
4395 | 0 | del->stats->used = MAX(del->stats->used, pmd_stats.used); |
4396 | 0 | del->stats->tcp_flags |= pmd_stats.tcp_flags; |
4397 | 0 | } |
4398 | 0 | } |
4399 | 0 | } else { |
4400 | 0 | pmd = dp_netdev_get_pmd(dp, del->pmd_id); |
4401 | 0 | if (!pmd) { |
4402 | 0 | return EINVAL; |
4403 | 0 | } |
4404 | 0 | error = flow_del_on_pmd(pmd, del->stats, del); |
4405 | 0 | dp_netdev_pmd_unref(pmd); |
4406 | 0 | } |
4407 | | |
4408 | | |
4409 | 0 | return error; |
4410 | 0 | } |
4411 | | |
4412 | | struct dpif_netdev_flow_dump { |
4413 | | struct dpif_flow_dump up; |
4414 | | struct cmap_position poll_thread_pos; |
4415 | | struct cmap_position flow_pos; |
4416 | | struct dp_netdev_pmd_thread *cur_pmd; |
4417 | | int status; |
4418 | | struct ovs_mutex mutex; |
4419 | | }; |
4420 | | |
4421 | | static struct dpif_netdev_flow_dump * |
4422 | | dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump) |
4423 | 0 | { |
4424 | 0 | return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up); |
4425 | 0 | } |
4426 | | |
4427 | | static struct dpif_flow_dump * |
4428 | | dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse, |
4429 | | struct dpif_flow_dump_types *types OVS_UNUSED) |
4430 | 0 | { |
4431 | 0 | struct dpif_netdev_flow_dump *dump; |
4432 | |
|
4433 | 0 | dump = xzalloc(sizeof *dump); |
4434 | 0 | dpif_flow_dump_init(&dump->up, dpif_); |
4435 | 0 | dump->up.terse = terse; |
4436 | 0 | ovs_mutex_init(&dump->mutex); |
4437 | |
|
4438 | 0 | return &dump->up; |
4439 | 0 | } |
4440 | | |
4441 | | static int |
4442 | | dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_) |
4443 | 0 | { |
4444 | 0 | struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_); |
4445 | |
|
4446 | 0 | ovs_mutex_destroy(&dump->mutex); |
4447 | 0 | free(dump); |
4448 | 0 | return 0; |
4449 | 0 | } |
4450 | | |
4451 | | struct dpif_netdev_flow_dump_thread { |
4452 | | struct dpif_flow_dump_thread up; |
4453 | | struct dpif_netdev_flow_dump *dump; |
4454 | | struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH]; |
4455 | | struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH]; |
4456 | | }; |
4457 | | |
4458 | | static struct dpif_netdev_flow_dump_thread * |
4459 | | dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread) |
4460 | 0 | { |
4461 | 0 | return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up); |
4462 | 0 | } |
4463 | | |
4464 | | static struct dpif_flow_dump_thread * |
4465 | | dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_) |
4466 | 0 | { |
4467 | 0 | struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_); |
4468 | 0 | struct dpif_netdev_flow_dump_thread *thread; |
4469 | |
|
4470 | 0 | thread = xmalloc(sizeof *thread); |
4471 | 0 | dpif_flow_dump_thread_init(&thread->up, &dump->up); |
4472 | 0 | thread->dump = dump; |
4473 | 0 | return &thread->up; |
4474 | 0 | } |
4475 | | |
4476 | | static void |
4477 | | dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_) |
4478 | 0 | { |
4479 | 0 | struct dpif_netdev_flow_dump_thread *thread |
4480 | 0 | = dpif_netdev_flow_dump_thread_cast(thread_); |
4481 | |
|
4482 | 0 | free(thread); |
4483 | 0 | } |
4484 | | |
4485 | | static int |
4486 | | dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_, |
4487 | | struct dpif_flow *flows, int max_flows) |
4488 | 0 | { |
4489 | 0 | struct dpif_netdev_flow_dump_thread *thread |
4490 | 0 | = dpif_netdev_flow_dump_thread_cast(thread_); |
4491 | 0 | struct dpif_netdev_flow_dump *dump = thread->dump; |
4492 | 0 | struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH]; |
4493 | 0 | struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif); |
4494 | 0 | struct dp_netdev *dp = get_dp_netdev(&dpif->dpif); |
4495 | 0 | int n_flows = 0; |
4496 | 0 | int i; |
4497 | |
|
4498 | 0 | ovs_mutex_lock(&dump->mutex); |
4499 | 0 | if (!dump->status) { |
4500 | 0 | struct dp_netdev_pmd_thread *pmd = dump->cur_pmd; |
4501 | 0 | int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH); |
4502 | | |
4503 | | /* The first call to dump_next() extracts the first pmd thread. |
4504 | | * If there is no pmd thread, it returns immediately. */ |
4505 | 0 | if (!pmd) { |
4506 | 0 | pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos); |
4507 | 0 | if (!pmd) { |
4508 | 0 | ovs_mutex_unlock(&dump->mutex); |
4509 | 0 | return n_flows; |
4510 | |
|
4511 | 0 | } |
4512 | 0 | } |
4513 | | |
4514 | 0 | do { |
4515 | 0 | for (n_flows = 0; n_flows < flow_limit; n_flows++) { |
4516 | 0 | struct cmap_node *node; |
4517 | |
|
4518 | 0 | node = cmap_next_position(&pmd->flow_table, &dump->flow_pos); |
4519 | 0 | if (!node) { |
4520 | 0 | break; |
4521 | 0 | } |
4522 | 0 | netdev_flows[n_flows] = CONTAINER_OF(node, |
4523 | 0 | struct dp_netdev_flow, |
4524 | 0 | node); |
4525 | 0 | } |
4526 | | /* When it finishes dumping the current pmd thread, moves on to |
4527 | | * the next one. */ |
4528 | 0 | if (n_flows < flow_limit) { |
4529 | 0 | memset(&dump->flow_pos, 0, sizeof dump->flow_pos); |
4530 | 0 | dp_netdev_pmd_unref(pmd); |
4531 | 0 | pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos); |
4532 | 0 | if (!pmd) { |
4533 | 0 | dump->status = EOF; |
4534 | 0 | break; |
4535 | 0 | } |
4536 | 0 | } |
4537 | | /* Keeps the reference for the next caller. */ |
4538 | 0 | dump->cur_pmd = pmd; |
4539 | | |
4540 | | /* If the current dump is empty, do not exit the loop, since the |
4541 | | * remaining pmds could have flows to be dumped. Just dump again |
4542 | | * on the new 'pmd'. */ |
4543 | 0 | } while (!n_flows); |
4544 | 0 | } |
4545 | 0 | ovs_mutex_unlock(&dump->mutex); |
4546 | |
|
4547 | 0 | for (i = 0; i < n_flows; i++) { |
4548 | 0 | struct odputil_keybuf *maskbuf = &thread->maskbuf[i]; |
4549 | 0 | struct odputil_keybuf *keybuf = &thread->keybuf[i]; |
4550 | 0 | struct dp_netdev_flow *netdev_flow = netdev_flows[i]; |
4551 | 0 | struct dpif_flow *f = &flows[i]; |
4552 | 0 | struct ofpbuf key, mask; |
4553 | |
|
4554 | 0 | ofpbuf_use_stack(&key, keybuf, sizeof *keybuf); |
4555 | 0 | ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf); |
4556 | 0 | dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f, |
4557 | 0 | dump->up.terse); |
4558 | 0 | } |
4559 | |
|
4560 | 0 | return n_flows; |
4561 | 0 | } |
4562 | | |
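The dump callbacks above implement the generic dpif flow-dump interface declared in dpif.h. The sketch below shows how a client might drive them through the generic wrappers; it is a minimal illustration, not code from this file, and count_datapath_flows() is a hypothetical helper.

#include "dpif.h"
#include "util.h"

/* Hypothetical helper: walk every flow installed in 'dpif' and count them.
 * dpif_flow_dump_next() fills at most 'max_flows' entries per call and
 * returns 0 once the dump is exhausted.  The dump-types argument is left
 * NULL here; dpif_netdev_flow_dump_create() ignores it anyway. */
static int
count_datapath_flows(struct dpif *dpif)
{
    struct dpif_flow_dump *dump = dpif_flow_dump_create(dpif, false, NULL);
    struct dpif_flow_dump_thread *thread = dpif_flow_dump_thread_create(dump);
    struct dpif_flow flows[50];     /* Mirrors FLOW_DUMP_MAX_BATCH. */
    int n_flows, total = 0;

    while ((n_flows = dpif_flow_dump_next(thread, flows,
                                          ARRAY_SIZE(flows))) > 0) {
        total += n_flows;
    }
    dpif_flow_dump_thread_destroy(thread);
    dpif_flow_dump_destroy(dump);
    return total;
}

Several threads may each create their own dump thread on the same dump; the mutex in struct dpif_netdev_flow_dump serializes the shared cursor state.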
4563 | | static int |
4564 | | dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute) |
4565 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
4566 | 0 | { |
4567 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
4568 | 0 | struct dp_netdev_pmd_thread *pmd; |
4569 | 0 | struct dp_packet_batch pp; |
4570 | |
|
4571 | 0 | if (dp_packet_size(execute->packet) < ETH_HEADER_LEN || |
4572 | 0 | dp_packet_size(execute->packet) > UINT16_MAX) { |
4573 | 0 | return EINVAL; |
4574 | 0 | } |
4575 | | |
4576 | | /* Tries to find the 'pmd'. If NULL is returned, that means |
4577 | | * the current thread is a non-pmd thread and should use |
4578 | | * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */ |
4579 | 0 | pmd = ovsthread_getspecific(dp->per_pmd_key); |
4580 | 0 | if (!pmd) { |
4581 | 0 | pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID); |
4582 | 0 | if (!pmd) { |
4583 | 0 | return EBUSY; |
4584 | 0 | } |
4585 | 0 | } |
4586 | | |
4587 | 0 | if (execute->probe) { |
4588 | | /* If this is part of a probe, drop the packet, since executing |
4589 | | * the action may actually cause spurious packets to be sent into |
4590 | | * the network. */ |
4591 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
4592 | 0 | dp_netdev_pmd_unref(pmd); |
4593 | 0 | } |
4594 | 0 | return 0; |
4595 | 0 | } |
4596 | | |
4597 | | /* If the current thread is non-pmd thread, acquires |
4598 | | * the 'non_pmd_mutex'. */ |
4599 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
4600 | 0 | ovs_mutex_lock(&dp->non_pmd_mutex); |
4601 | 0 | } |
4602 | | |
4603 | | /* Update current time in PMD context. We don't care about EMC insertion |
4604 | | * probability, because we are on a slow path. */ |
4605 | 0 | pmd_thread_ctx_time_update(pmd); |
4606 | | |
4607 | | /* The action processing expects the RSS hash to be valid, because |
4608 | | * it's always initialized at the beginning of datapath processing. |
4609 | | * In this case, though, 'execute->packet' may not have gone through |
4610 | | * the datapath at all; it may have been generated by the upper layer |
4611 | | * (OpenFlow packet-out, BFD frame, ...). */ |
4612 | 0 | if (!dp_packet_rss_valid(execute->packet)) { |
4613 | 0 | dp_packet_set_rss_hash(execute->packet, |
4614 | 0 | flow_hash_5tuple(execute->flow, 0)); |
4615 | 0 | } |
4616 | | |
4617 | | /* Making a copy because the packet might be stolen during the execution |
4618 | | * and caller might still need it. */ |
4619 | 0 | struct dp_packet *packet_clone = dp_packet_clone(execute->packet); |
4620 | 0 | dp_packet_batch_init_packet(&pp, packet_clone); |
4621 | 0 | dp_netdev_execute_actions(pmd, &pp, false, execute->flow, |
4622 | 0 | execute->actions, execute->actions_len); |
4623 | 0 | dp_netdev_pmd_flush_output_packets(pmd, true); |
4624 | |
|
4625 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
4626 | 0 | ovs_mutex_unlock(&dp->non_pmd_mutex); |
4627 | 0 | dp_netdev_pmd_unref(pmd); |
4628 | 0 | } |
4629 | |
|
4630 | 0 | if (dp_packet_batch_size(&pp) == 1) { |
4631 | | /* Packet wasn't dropped during the execution. Swapping content with |
4632 | | * the original packet, because the caller might expect actions to |
4633 | | * modify it. Using the packet from a batch instead of 'packet_clone' |
4634 | | * because it may be stolen and replaced by another packet, e.g. by |
4635 | | * the fragmentation engine. */ |
4636 | 0 | dp_packet_swap(execute->packet, pp.packets[0]); |
4637 | 0 | dp_packet_delete_batch(&pp, true); |
4638 | 0 | } else if (dp_packet_batch_size(&pp)) { |
4639 | | /* FIXME: We have more packets than expected. Likely, we got IP |
4640 | | * fragments of the reassembled packet. Dropping them here as we have |
4641 | | * no way to get them to the caller. It might be that all the required |
4642 | | * actions with them are already executed, but it also might not be the |
4643 | | * case, e.g. if dpif_netdev_execute() was called to execute a single |
4644 | | * tunnel push. */ |
4645 | 0 | dp_packet_delete_batch(&pp, true); |
4646 | 0 | } |
4647 | |
|
4648 | 0 | return 0; |
4649 | 0 | } |
4650 | | |
4651 | | static void |
4652 | | dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops, |
4653 | | enum dpif_offload_type offload_type OVS_UNUSED) |
4654 | 0 | { |
4655 | 0 | size_t i; |
4656 | |
|
4657 | 0 | for (i = 0; i < n_ops; i++) { |
4658 | 0 | struct dpif_op *op = ops[i]; |
4659 | |
|
4660 | 0 | switch (op->type) { |
4661 | 0 | case DPIF_OP_FLOW_PUT: |
4662 | 0 | op->error = dpif_netdev_flow_put(dpif, &op->flow_put); |
4663 | 0 | break; |
4664 | | |
4665 | 0 | case DPIF_OP_FLOW_DEL: |
4666 | 0 | op->error = dpif_netdev_flow_del(dpif, &op->flow_del); |
4667 | 0 | break; |
4668 | | |
4669 | 0 | case DPIF_OP_EXECUTE: |
4670 | 0 | op->error = dpif_netdev_execute(dpif, &op->execute); |
4671 | 0 | break; |
4672 | | |
4673 | 0 | case DPIF_OP_FLOW_GET: |
4674 | 0 | op->error = dpif_netdev_flow_get(dpif, &op->flow_get); |
4675 | 0 | break; |
4676 | 0 | } |
4677 | 0 | } |
4678 | 0 | } |
4679 | | |
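dpif_netdev_operate() is the backend of the generic dpif_operate() wrapper: the caller hands over an array of struct dpif_op pointers and each op gets its own error code back. A minimal sketch, assuming the struct dpif_flow_del field names (key, key_len, ufid, pmd_id) from dpif.h and a hypothetical delete_one_flow() caller; PMD_ID_NULL requests the operation on every PMD thread, as handled by dpif_netdev_flow_del() above.

#include "dpif.h"

/* Hypothetical caller: delete one flow, identified by its netlink 'key'
 * and/or 'ufid', by batching a single DPIF_OP_FLOW_DEL through
 * dpif_operate(). */
static int
delete_one_flow(struct dpif *dpif, const struct nlattr *key, size_t key_len,
                const ovs_u128 *ufid)
{
    struct dpif_op op = {
        .type = DPIF_OP_FLOW_DEL,
        .flow_del = {
            .key = key,
            .key_len = key_len,
            .ufid = ufid,
            .pmd_id = PMD_ID_NULL,      /* Delete from all PMD threads. */
        },
    };
    struct dpif_op *ops[1] = { &op };

    dpif_operate(dpif, ops, 1, DPIF_OFFLOAD_AUTO);
    return op.error;
}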
4680 | | static int |
4681 | | dpif_netdev_offload_stats_get(struct dpif *dpif, |
4682 | | struct netdev_custom_stats *stats) |
4683 | 0 | { |
4684 | 0 | enum { |
4685 | 0 | DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED, |
4686 | 0 | DP_NETDEV_HW_OFFLOADS_STATS_INSERTED, |
4687 | 0 | DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN, |
4688 | 0 | DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV, |
4689 | 0 | DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN, |
4690 | 0 | DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV, |
4691 | 0 | }; |
4692 | 0 | struct { |
4693 | 0 | const char *name; |
4694 | 0 | uint64_t total; |
4695 | 0 | } hwol_stats[] = { |
4696 | 0 | [DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED] = |
4697 | 0 | { " Enqueued offloads", 0 }, |
4698 | 0 | [DP_NETDEV_HW_OFFLOADS_STATS_INSERTED] = |
4699 | 0 | { " Inserted offloads", 0 }, |
4700 | 0 | [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN] = |
4701 | 0 | { " Cumulative Average latency (us)", 0 }, |
4702 | 0 | [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV] = |
4703 | 0 | { " Cumulative Latency stddev (us)", 0 }, |
4704 | 0 | [DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN] = |
4705 | 0 | { " Exponential Average latency (us)", 0 }, |
4706 | 0 | [DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV] = |
4707 | 0 | { " Exponential Latency stddev (us)", 0 }, |
4708 | 0 | }; |
4709 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
4710 | 0 | struct dp_netdev_port *port; |
4711 | 0 | unsigned int nb_thread; |
4712 | 0 | uint64_t *port_nb_offloads; |
4713 | 0 | uint64_t *nb_offloads; |
4714 | 0 | unsigned int tid; |
4715 | 0 | size_t i; |
4716 | |
|
4717 | 0 | if (!netdev_is_flow_api_enabled()) { |
4718 | 0 | return EINVAL; |
4719 | 0 | } |
4720 | | |
4721 | 0 | nb_thread = netdev_offload_thread_nb(); |
4722 | | /* One set of counters per offload thread, plus one for the overall total. */ |
4723 | 0 | stats->size = ARRAY_SIZE(hwol_stats) * (nb_thread + 1); |
4724 | 0 | stats->counters = xcalloc(stats->size, sizeof *stats->counters); |
4725 | |
|
4726 | 0 | nb_offloads = xcalloc(nb_thread, sizeof *nb_offloads); |
4727 | 0 | port_nb_offloads = xcalloc(nb_thread, sizeof *port_nb_offloads); |
4728 | |
|
4729 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
4730 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
4731 | 0 | memset(port_nb_offloads, 0, nb_thread * sizeof *port_nb_offloads); |
4732 | | /* Do not abort on read error from a port, just report 0. */ |
4733 | 0 | if (!netdev_flow_get_n_flows(port->netdev, port_nb_offloads)) { |
4734 | 0 | for (i = 0; i < nb_thread; i++) { |
4735 | 0 | nb_offloads[i] += port_nb_offloads[i]; |
4736 | 0 | } |
4737 | 0 | } |
4738 | 0 | } |
4739 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
4740 | |
|
4741 | 0 | free(port_nb_offloads); |
4742 | |
|
4743 | 0 | for (tid = 0; tid < nb_thread; tid++) { |
4744 | 0 | uint64_t counts[ARRAY_SIZE(hwol_stats)]; |
4745 | 0 | size_t idx = ((tid + 1) * ARRAY_SIZE(hwol_stats)); |
4746 | |
|
4747 | 0 | memset(counts, 0, sizeof counts); |
4748 | 0 | counts[DP_NETDEV_HW_OFFLOADS_STATS_INSERTED] = nb_offloads[tid]; |
4749 | 0 | if (dp_offload_threads != NULL) { |
4750 | 0 | atomic_read_relaxed(&dp_offload_threads[tid].enqueued_item, |
4751 | 0 | &counts[DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED]); |
4752 | |
|
4753 | 0 | counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN] = |
4754 | 0 | mov_avg_cma(&dp_offload_threads[tid].cma); |
4755 | 0 | counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV] = |
4756 | 0 | mov_avg_cma_std_dev(&dp_offload_threads[tid].cma); |
4757 | |
|
4758 | 0 | counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN] = |
4759 | 0 | mov_avg_ema(&dp_offload_threads[tid].ema); |
4760 | 0 | counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV] = |
4761 | 0 | mov_avg_ema_std_dev(&dp_offload_threads[tid].ema); |
4762 | 0 | } |
4763 | |
|
4764 | 0 | for (i = 0; i < ARRAY_SIZE(hwol_stats); i++) { |
4765 | 0 | snprintf(stats->counters[idx + i].name, |
4766 | 0 | sizeof(stats->counters[idx + i].name), |
4767 | 0 | " [%3u] %s", tid, hwol_stats[i].name); |
4768 | 0 | stats->counters[idx + i].value = counts[i]; |
4769 | 0 | hwol_stats[i].total += counts[i]; |
4770 | 0 | } |
4771 | 0 | } |
4772 | |
|
4773 | 0 | free(nb_offloads); |
4774 | | |
4775 | | /* Averages the per-thread averages to produce the aggregate. */ |
4776 | 0 | hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN].total /= nb_thread; |
4777 | 0 | hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV].total /= nb_thread; |
4778 | 0 | hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN].total /= nb_thread; |
4779 | 0 | hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV].total /= nb_thread; |
4780 | |
|
4781 | 0 | for (i = 0; i < ARRAY_SIZE(hwol_stats); i++) { |
4782 | 0 | snprintf(stats->counters[i].name, sizeof(stats->counters[i].name), |
4783 | 0 | " Total %s", hwol_stats[i].name); |
4784 | 0 | stats->counters[i].value = hwol_stats[i].total; |
4785 | 0 | } |
4786 | |
|
4787 | 0 | return 0; |
4788 | 0 | } |
4789 | | |
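The counter layout assembled above is: with N = ARRAY_SIZE(hwol_stats) entries per group, indices [0, N) hold the aggregate totals (filled in by the final loop) and offload thread 'tid' occupies [(tid + 1) * N, (tid + 2) * N). As a worked example with hypothetical numbers, the six counters defined above and two offload threads give stats->size = 6 * 3 = 18: entries 0-5 are the totals, 6-11 belong to thread 0 and 12-17 to thread 1. A tiny helper expressing the same index math (hypothetical, not part of this file):

static size_t
hwol_counter_index(unsigned int tid, size_t counter_id, size_t n_counters)
{
    /* Group 0 holds the totals; per-thread groups start at group 1. */
    return (tid + 1) * n_counters + counter_id;
}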
4790 | | /* Enables or disables PMD auto load balancing. */ |
4791 | | static void |
4792 | | set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log) |
4793 | 0 | { |
4794 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
4795 | |
|
4796 | 0 | if (pmd_alb->is_enabled != state || always_log) { |
4797 | 0 | pmd_alb->is_enabled = state; |
4798 | 0 | if (pmd_alb->is_enabled) { |
4799 | 0 | uint8_t rebalance_load_thresh; |
4800 | |
|
4801 | 0 | atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, |
4802 | 0 | &rebalance_load_thresh); |
4803 | 0 | VLOG_INFO("PMD auto load balance is enabled, " |
4804 | 0 | "interval %"PRIu64" mins, " |
4805 | 0 | "pmd load threshold %"PRIu8"%%, " |
4806 | 0 | "improvement threshold %"PRIu8"%%.", |
4807 | 0 | pmd_alb->rebalance_intvl / MIN_TO_MSEC, |
4808 | 0 | rebalance_load_thresh, |
4809 | 0 | pmd_alb->rebalance_improve_thresh); |
4810 | 0 | } else { |
4811 | 0 | pmd_alb->rebalance_poll_timer = 0; |
4812 | 0 | VLOG_INFO("PMD auto load balance is disabled."); |
4813 | 0 | } |
4814 | 0 | } |
4815 | 0 | } |
4816 | | |
4817 | | /* Applies datapath configuration from the database. Some of the changes are |
4818 | | * actually applied in dpif_netdev_run(). */ |
4819 | | static int |
4820 | | dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) |
4821 | 0 | { |
4822 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
4823 | 0 | const char *cmask = smap_get(other_config, "pmd-cpu-mask"); |
4824 | 0 | const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign", |
4825 | 0 | "cycles"); |
4826 | 0 | unsigned long long insert_prob = |
4827 | 0 | smap_get_ullong(other_config, "emc-insert-inv-prob", |
4828 | 0 | DEFAULT_EM_FLOW_INSERT_INV_PROB); |
4829 | 0 | uint32_t insert_min, cur_min; |
4830 | 0 | uint32_t tx_flush_interval, cur_tx_flush_interval; |
4831 | 0 | uint64_t rebalance_intvl; |
4832 | 0 | uint8_t cur_rebalance_load; |
4833 | 0 | uint32_t rebalance_load, rebalance_improve; |
4834 | 0 | uint64_t pmd_max_sleep, cur_pmd_max_sleep; |
4835 | 0 | bool log_autolb = false; |
4836 | 0 | enum sched_assignment_type pmd_rxq_assign_type; |
4837 | 0 | static bool first_set_config = true; |
4838 | |
|
4839 | 0 | tx_flush_interval = smap_get_int(other_config, "tx-flush-interval", |
4840 | 0 | DEFAULT_TX_FLUSH_INTERVAL); |
4841 | 0 | atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval); |
4842 | 0 | if (tx_flush_interval != cur_tx_flush_interval) { |
4843 | 0 | atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval); |
4844 | 0 | VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us", |
4845 | 0 | tx_flush_interval); |
4846 | 0 | } |
4847 | |
|
4848 | 0 | if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) { |
4849 | 0 | free(dp->pmd_cmask); |
4850 | 0 | dp->pmd_cmask = nullable_xstrdup(cmask); |
4851 | 0 | dp_netdev_request_reconfigure(dp); |
4852 | 0 | } |
4853 | |
|
4854 | 0 | atomic_read_relaxed(&dp->emc_insert_min, &cur_min); |
4855 | 0 | if (insert_prob <= UINT32_MAX) { |
4856 | 0 | insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob; |
4857 | 0 | } else { |
4858 | 0 | insert_min = DEFAULT_EM_FLOW_INSERT_MIN; |
4859 | 0 | insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB; |
4860 | 0 | } |
4861 | |
|
4862 | 0 | if (insert_min != cur_min) { |
4863 | 0 | atomic_store_relaxed(&dp->emc_insert_min, insert_min); |
4864 | 0 | if (insert_min == 0) { |
4865 | 0 | VLOG_INFO("EMC insertion probability changed to zero"); |
4866 | 0 | } else { |
4867 | 0 | VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)", |
4868 | 0 | insert_prob, (100 / (float)insert_prob)); |
4869 | 0 | } |
4870 | 0 | } |
4871 | |
|
4872 | 0 | bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false); |
4873 | 0 | bool cur_perf_enabled; |
4874 | 0 | atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled); |
4875 | 0 | if (perf_enabled != cur_perf_enabled) { |
4876 | 0 | atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled); |
4877 | 0 | if (perf_enabled) { |
4878 | 0 | VLOG_INFO("PMD performance metrics collection enabled"); |
4879 | 0 | } else { |
4880 | 0 | VLOG_INFO("PMD performance metrics collection disabled"); |
4881 | 0 | } |
4882 | 0 | } |
4883 | |
|
4884 | 0 | bool smc_enable = smap_get_bool(other_config, "smc-enable", false); |
4885 | 0 | bool cur_smc; |
4886 | 0 | atomic_read_relaxed(&dp->smc_enable_db, &cur_smc); |
4887 | 0 | if (smc_enable != cur_smc) { |
4888 | 0 | atomic_store_relaxed(&dp->smc_enable_db, smc_enable); |
4889 | 0 | if (smc_enable) { |
4890 | 0 | VLOG_INFO("SMC cache is enabled"); |
4891 | 0 | } else { |
4892 | 0 | VLOG_INFO("SMC cache is disabled"); |
4893 | 0 | } |
4894 | 0 | } |
4895 | |
|
4896 | 0 | if (!strcmp(pmd_rxq_assign, "roundrobin")) { |
4897 | 0 | pmd_rxq_assign_type = SCHED_ROUNDROBIN; |
4898 | 0 | } else if (!strcmp(pmd_rxq_assign, "cycles")) { |
4899 | 0 | pmd_rxq_assign_type = SCHED_CYCLES; |
4900 | 0 | } else if (!strcmp(pmd_rxq_assign, "group")) { |
4901 | 0 | pmd_rxq_assign_type = SCHED_GROUP; |
4902 | 0 | } else { |
4903 | | /* Default. */ |
4904 | 0 | VLOG_WARN("Unsupported rx queue to PMD assignment mode in " |
4905 | 0 | "pmd-rxq-assign. Defaulting to 'cycles'."); |
4906 | 0 | pmd_rxq_assign_type = SCHED_CYCLES; |
4907 | 0 | pmd_rxq_assign = "cycles"; |
4908 | 0 | } |
4909 | 0 | if (dp->pmd_rxq_assign_type != pmd_rxq_assign_type) { |
4910 | 0 | dp->pmd_rxq_assign_type = pmd_rxq_assign_type; |
4911 | 0 | VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.", |
4912 | 0 | pmd_rxq_assign); |
4913 | 0 | dp_netdev_request_reconfigure(dp); |
4914 | 0 | } |
4915 | |
|
4916 | 0 | bool pmd_iso = smap_get_bool(other_config, "pmd-rxq-isolate", true); |
4917 | |
|
4918 | 0 | if (pmd_rxq_assign_type != SCHED_GROUP && pmd_iso == false) { |
4919 | | /* Invalid combination. */ |
4920 | 0 | VLOG_WARN("pmd-rxq-isolate can only be set false " |
4921 | 0 | "when using pmd-rxq-assign=group"); |
4922 | 0 | pmd_iso = true; |
4923 | 0 | } |
4924 | 0 | if (dp->pmd_iso != pmd_iso) { |
4925 | 0 | dp->pmd_iso = pmd_iso; |
4926 | 0 | if (pmd_iso) { |
4927 | 0 | VLOG_INFO("pmd-rxq-affinity isolates PMD core"); |
4928 | 0 | } else { |
4929 | 0 | VLOG_INFO("pmd-rxq-affinity does not isolate PMD core"); |
4930 | 0 | } |
4931 | 0 | dp_netdev_request_reconfigure(dp); |
4932 | 0 | } |
4933 | |
|
4934 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
4935 | |
|
4936 | 0 | rebalance_intvl = smap_get_ullong(other_config, |
4937 | 0 | "pmd-auto-lb-rebal-interval", |
4938 | 0 | ALB_REBALANCE_INTERVAL); |
4939 | 0 | if (rebalance_intvl > MAX_ALB_REBALANCE_INTERVAL) { |
4940 | 0 | rebalance_intvl = ALB_REBALANCE_INTERVAL; |
4941 | 0 | } |
4942 | | |
4943 | | /* Input is in minutes; convert it to msec. */ |
4944 | 0 | rebalance_intvl = |
4945 | 0 | rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC; |
4946 | |
|
4947 | 0 | if (pmd_alb->rebalance_intvl != rebalance_intvl) { |
4948 | 0 | pmd_alb->rebalance_intvl = rebalance_intvl; |
4949 | 0 | VLOG_INFO("PMD auto load balance interval set to " |
4950 | 0 | "%"PRIu64" mins\n", rebalance_intvl / MIN_TO_MSEC); |
4951 | 0 | log_autolb = true; |
4952 | 0 | } |
4953 | |
|
4954 | 0 | rebalance_improve = smap_get_uint(other_config, |
4955 | 0 | "pmd-auto-lb-improvement-threshold", |
4956 | 0 | ALB_IMPROVEMENT_THRESHOLD); |
4957 | 0 | if (rebalance_improve > 100) { |
4958 | 0 | rebalance_improve = ALB_IMPROVEMENT_THRESHOLD; |
4959 | 0 | } |
4960 | 0 | if (rebalance_improve != pmd_alb->rebalance_improve_thresh) { |
4961 | 0 | pmd_alb->rebalance_improve_thresh = rebalance_improve; |
4962 | 0 | VLOG_INFO("PMD auto load balance improvement threshold set to " |
4963 | 0 | "%"PRIu32"%%", rebalance_improve); |
4964 | 0 | log_autolb = true; |
4965 | 0 | } |
4966 | |
|
4967 | 0 | rebalance_load = smap_get_uint(other_config, "pmd-auto-lb-load-threshold", |
4968 | 0 | ALB_LOAD_THRESHOLD); |
4969 | 0 | if (rebalance_load > 100) { |
4970 | 0 | rebalance_load = ALB_LOAD_THRESHOLD; |
4971 | 0 | } |
4972 | 0 | atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, &cur_rebalance_load); |
4973 | 0 | if (rebalance_load != cur_rebalance_load) { |
4974 | 0 | atomic_store_relaxed(&pmd_alb->rebalance_load_thresh, |
4975 | 0 | rebalance_load); |
4976 | 0 | VLOG_INFO("PMD auto load balance load threshold set to %"PRIu32"%%", |
4977 | 0 | rebalance_load); |
4978 | 0 | log_autolb = true; |
4979 | 0 | } |
4980 | |
|
4981 | 0 | bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false); |
4982 | |
|
4983 | 0 | set_pmd_auto_lb(dp, autolb_state, log_autolb); |
4984 | |
|
4985 | 0 | pmd_max_sleep = smap_get_ullong(other_config, "pmd-maxsleep", 0); |
4986 | 0 | pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep); |
4987 | 0 | atomic_read_relaxed(&dp->pmd_max_sleep, &cur_pmd_max_sleep); |
4988 | 0 | if (first_set_config || pmd_max_sleep != cur_pmd_max_sleep) { |
4989 | 0 | atomic_store_relaxed(&dp->pmd_max_sleep, pmd_max_sleep); |
4990 | 0 | VLOG_INFO("PMD max sleep request is %"PRIu64" usecs.", pmd_max_sleep); |
4991 | 0 | VLOG_INFO("PMD load based sleeps are %s.", |
4992 | 0 | pmd_max_sleep ? "enabled" : "disabled" ); |
4993 | 0 | } |
4994 | |
|
4995 | 0 | first_set_config = false; |
4996 | 0 | return 0; |
4997 | 0 | } |
4998 | | |
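The emc-insert-inv-prob handling above converts an inverse probability into a 32-bit threshold: insert_min = UINT32_MAX / insert_prob, so an inverse probability of 100 makes roughly 1% of candidate flows eligible for EMC insertion, an inverse probability of 0 disables insertion, and values above UINT32_MAX fall back to the defaults. A minimal sketch of the decision implied by that threshold, assuming the fast path draws one random 32-bit value per candidate (hypothetical helper, not the datapath's actual code):

#include <stdbool.h>
#include <stdint.h>

static bool
emc_would_insert(uint32_t random_value, uint32_t insert_min)
{
    /* insert_min == 0 means "never insert"; insert_min == UINT32_MAX / 100
     * accepts roughly 1% of uniformly random values. */
    return insert_min && random_value <= insert_min;
}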
4999 | | /* Parses affinity list and returns result in 'core_ids'. */ |
5000 | | static int |
5001 | | parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq) |
5002 | 0 | { |
5003 | 0 | unsigned i; |
5004 | 0 | char *list, *copy, *key, *value; |
5005 | 0 | int error = 0; |
5006 | |
|
5007 | 0 | for (i = 0; i < n_rxq; i++) { |
5008 | 0 | core_ids[i] = OVS_CORE_UNSPEC; |
5009 | 0 | } |
5010 | |
|
5011 | 0 | if (!affinity_list) { |
5012 | 0 | return 0; |
5013 | 0 | } |
5014 | | |
5015 | 0 | list = copy = xstrdup(affinity_list); |
5016 | |
|
5017 | 0 | while (ofputil_parse_key_value(&list, &key, &value)) { |
5018 | 0 | int rxq_id, core_id; |
5019 | |
|
5020 | 0 | if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0 |
5021 | 0 | || !str_to_int(value, 0, &core_id) || core_id < 0) { |
5022 | 0 | error = EINVAL; |
5023 | 0 | break; |
5024 | 0 | } |
5025 | | |
5026 | 0 | if (rxq_id < n_rxq) { |
5027 | 0 | core_ids[rxq_id] = core_id; |
5028 | 0 | } |
5029 | 0 | } |
5030 | |
|
5031 | 0 | free(copy); |
5032 | 0 | return error; |
5033 | 0 | } |
5034 | | |
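The affinity string uses the same <rxq-id>:<core-id> pair syntax as the per-interface pmd-rxq-affinity option, with pairs separated by commas. A small illustration with hypothetical values:

/* "0:3,1:7" pins rx queue 0 to core 3 and rx queue 1 to core 7; queues
 * that are not mentioned keep OVS_CORE_UNSPEC. */
unsigned core_ids[4];

if (!parse_affinity_list("0:3,1:7", core_ids, 4)) {
    /* core_ids is now { 3, 7, OVS_CORE_UNSPEC, OVS_CORE_UNSPEC }. */
}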
5035 | | /* Parses 'affinity_list' and applies configuration if it is valid. */ |
5036 | | static int |
5037 | | dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port, |
5038 | | const char *affinity_list) |
5039 | 0 | { |
5040 | 0 | unsigned *core_ids, i; |
5041 | 0 | int error = 0; |
5042 | |
|
5043 | 0 | core_ids = xmalloc(port->n_rxq * sizeof *core_ids); |
5044 | 0 | if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) { |
5045 | 0 | error = EINVAL; |
5046 | 0 | goto exit; |
5047 | 0 | } |
5048 | | |
5049 | 0 | for (i = 0; i < port->n_rxq; i++) { |
5050 | 0 | port->rxqs[i].core_id = core_ids[i]; |
5051 | 0 | } |
5052 | |
|
5053 | 0 | exit: |
5054 | 0 | free(core_ids); |
5055 | 0 | return error; |
5056 | 0 | } |
5057 | | |
5058 | | /* Returns 'true' if one of the 'port's RX queues exists in 'poll_list' |
5059 | | * of given PMD thread. */ |
5060 | | static bool |
5061 | | dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd, |
5062 | | struct dp_netdev_port *port) |
5063 | | OVS_EXCLUDED(pmd->port_mutex) |
5064 | 0 | { |
5065 | 0 | struct rxq_poll *poll; |
5066 | 0 | bool found = false; |
5067 | |
|
5068 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
5069 | 0 | HMAP_FOR_EACH (poll, node, &pmd->poll_list) { |
5070 | 0 | if (port == poll->rxq->port) { |
5071 | 0 | found = true; |
5072 | 0 | break; |
5073 | 0 | } |
5074 | 0 | } |
5075 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
5076 | 0 | return found; |
5077 | 0 | } |
5078 | | |
5079 | | /* Updates port configuration from the database. The changes are actually |
5080 | | * applied in dpif_netdev_run(). */ |
5081 | | static int |
5082 | | dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no, |
5083 | | const struct smap *cfg) |
5084 | 0 | { |
5085 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
5086 | 0 | struct dp_netdev_port *port; |
5087 | 0 | int error = 0; |
5088 | 0 | const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity"); |
5089 | 0 | bool emc_enabled = smap_get_bool(cfg, "emc-enable", true); |
5090 | 0 | const char *tx_steering_mode = smap_get(cfg, "tx-steering"); |
5091 | 0 | enum txq_req_mode txq_mode; |
5092 | |
|
5093 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
5094 | 0 | error = get_port_by_number(dp, port_no, &port); |
5095 | 0 | if (error) { |
5096 | 0 | goto unlock; |
5097 | 0 | } |
5098 | | |
5099 | 0 | if (emc_enabled != port->emc_enabled) { |
5100 | 0 | struct dp_netdev_pmd_thread *pmd; |
5101 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
5102 | 0 | uint32_t cur_min, insert_prob; |
5103 | |
|
5104 | 0 | port->emc_enabled = emc_enabled; |
5105 | | /* Mark for reload all the threads that poll this port and request |
5106 | | * a reconfiguration for the actual reloading of the threads. */ |
5107 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5108 | 0 | if (dpif_netdev_pmd_polls_port(pmd, port)) { |
5109 | 0 | pmd->need_reload = true; |
5110 | 0 | } |
5111 | 0 | } |
5112 | 0 | dp_netdev_request_reconfigure(dp); |
5113 | |
|
5114 | 0 | ds_put_format(&ds, "%s: EMC has been %s.", |
5115 | 0 | netdev_get_name(port->netdev), |
5116 | 0 | (emc_enabled) ? "enabled" : "disabled"); |
5117 | 0 | if (emc_enabled) { |
5118 | 0 | ds_put_cstr(&ds, " Current insertion probability is "); |
5119 | 0 | atomic_read_relaxed(&dp->emc_insert_min, &cur_min); |
5120 | 0 | if (!cur_min) { |
5121 | 0 | ds_put_cstr(&ds, "zero."); |
5122 | 0 | } else { |
5123 | 0 | insert_prob = UINT32_MAX / cur_min; |
5124 | 0 | ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).", |
5125 | 0 | insert_prob, 100 / (float) insert_prob); |
5126 | 0 | } |
5127 | 0 | } |
5128 | 0 | VLOG_INFO("%s", ds_cstr(&ds)); |
5129 | 0 | ds_destroy(&ds); |
5130 | 0 | } |
5131 | | |
5132 | | /* Checking for RXq affinity changes. */ |
5133 | 0 | if (netdev_is_pmd(port->netdev) |
5134 | 0 | && !nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) { |
5135 | |
|
5136 | 0 | error = dpif_netdev_port_set_rxq_affinity(port, affinity_list); |
5137 | 0 | if (error) { |
5138 | 0 | goto unlock; |
5139 | 0 | } |
5140 | 0 | free(port->rxq_affinity_list); |
5141 | 0 | port->rxq_affinity_list = nullable_xstrdup(affinity_list); |
5142 | |
|
5143 | 0 | dp_netdev_request_reconfigure(dp); |
5144 | 0 | } |
5145 | | |
5146 | 0 | if (nullable_string_is_equal(tx_steering_mode, "hash")) { |
5147 | 0 | txq_mode = TXQ_REQ_MODE_HASH; |
5148 | 0 | } else { |
5149 | 0 | txq_mode = TXQ_REQ_MODE_THREAD; |
5150 | 0 | } |
5151 | |
|
5152 | 0 | if (txq_mode != port->txq_requested_mode) { |
5153 | 0 | port->txq_requested_mode = txq_mode; |
5154 | 0 | VLOG_INFO("%s: Tx packet steering mode has been set to '%s'.", |
5155 | 0 | netdev_get_name(port->netdev), |
5156 | 0 | (txq_mode == TXQ_REQ_MODE_THREAD) ? "thread" : "hash"); |
5157 | 0 | dp_netdev_request_reconfigure(dp); |
5158 | 0 | } |
5159 | |
|
5160 | 0 | unlock: |
5161 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
5162 | 0 | return error; |
5163 | 0 | } |
5164 | | |
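The per-port keys handled above (emc-enable, pmd-rxq-affinity, tx-steering) normally arrive from the Interface table's other_config column via the bridge layer. The sketch below shows how such a configuration could be pushed programmatically; it assumes the generic dpif_port_set_config() wrapper from dpif.h and uses hypothetical values.

#include "dpif.h"
#include "smap.h"

/* Hypothetical example: enable hash-based Tx steering and pin two rx
 * queues of datapath port 'port_no'. */
static int
configure_port(struct dpif *dpif, odp_port_t port_no)
{
    struct smap cfg = SMAP_INITIALIZER(&cfg);
    int error;

    smap_add(&cfg, "tx-steering", "hash");
    smap_add(&cfg, "pmd-rxq-affinity", "0:3,1:7");
    error = dpif_port_set_config(dpif, port_no, &cfg);
    smap_destroy(&cfg);
    return error;
}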
5165 | | static int |
5166 | | dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED, |
5167 | | uint32_t queue_id, uint32_t *priority) |
5168 | 0 | { |
5169 | 0 | *priority = queue_id; |
5170 | 0 | return 0; |
5171 | 0 | } |
5172 | | |
5173 | | |
5174 | | /* Creates and returns a new 'struct dp_netdev_actions', whose actions are |
5175 | | * a copy of the 'size' bytes of 'actions' input parameters. */ |
5176 | | struct dp_netdev_actions * |
5177 | | dp_netdev_actions_create(const struct nlattr *actions, size_t size) |
5178 | 0 | { |
5179 | 0 | struct dp_netdev_actions *netdev_actions; |
5180 | |
|
5181 | 0 | netdev_actions = xmalloc(sizeof *netdev_actions + size); |
5182 | 0 | netdev_actions->size = size; |
5183 | 0 | if (size) { |
5184 | 0 | memcpy(netdev_actions->actions, actions, size); |
5185 | 0 | } |
5186 | |
|
5187 | 0 | return netdev_actions; |
5188 | 0 | } |
5189 | | |
5190 | | struct dp_netdev_actions * |
5191 | | dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow) |
5192 | 0 | { |
5193 | 0 | return ovsrcu_get(struct dp_netdev_actions *, &flow->actions); |
5194 | 0 | } |
5195 | | |
5196 | | static void |
5197 | | dp_netdev_actions_free(struct dp_netdev_actions *actions) |
5198 | 0 | { |
5199 | 0 | free(actions); |
5200 | 0 | } |
5201 | | |
5202 | | static void |
5203 | | dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx, |
5204 | | enum rxq_cycles_counter_type type, |
5205 | | unsigned long long cycles) |
5206 | 0 | { |
5207 | 0 | atomic_store_relaxed(&rx->cycles[type], cycles); |
5208 | 0 | } |
5209 | | |
5210 | | static void |
5211 | | dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx, |
5212 | | enum rxq_cycles_counter_type type, |
5213 | | unsigned long long cycles) |
5214 | 0 | { |
5215 | 0 | non_atomic_ullong_add(&rx->cycles[type], cycles); |
5216 | 0 | } |
5217 | | |
5218 | | static uint64_t |
5219 | | dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx, |
5220 | | enum rxq_cycles_counter_type type) |
5221 | 0 | { |
5222 | 0 | unsigned long long processing_cycles; |
5223 | 0 | atomic_read_relaxed(&rx->cycles[type], &processing_cycles); |
5224 | 0 | return processing_cycles; |
5225 | 0 | } |
5226 | | |
5227 | | static void |
5228 | | dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx, |
5229 | | unsigned long long cycles) |
5230 | 0 | { |
5231 | 0 | unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX; |
5232 | 0 | atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles); |
5233 | 0 | } |
5234 | | |
5235 | | static uint64_t |
5236 | | dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx) |
5237 | 0 | { |
5238 | 0 | unsigned long long processing_cycles; |
5239 | 0 | atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles); |
5240 | 0 | return processing_cycles; |
5241 | 0 | } |
5242 | | |
5243 | | #if ATOMIC_ALWAYS_LOCK_FREE_8B |
5244 | | static inline bool |
5245 | | pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd) |
5246 | 0 | { |
5247 | 0 | bool pmd_perf_enabled; |
5248 | 0 | atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled); |
5249 | 0 | return pmd_perf_enabled; |
5250 | 0 | } |
5251 | | #else |
5252 | | /* If stores and reads of 64-bit integers are not atomic, the full PMD |
5253 | | * performance metrics are not available as locked access to 64 bit |
5254 | | * integers would be prohibitively expensive. */ |
5255 | | static inline bool |
5256 | | pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED) |
5257 | | { |
5258 | | return false; |
5259 | | } |
5260 | | #endif |
5261 | | |
5262 | | static int |
5263 | | dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd, |
5264 | | struct tx_port *p) |
5265 | 0 | { |
5266 | 0 | int i; |
5267 | 0 | int tx_qid; |
5268 | 0 | int output_cnt; |
5269 | 0 | bool concurrent_txqs; |
5270 | 0 | struct cycle_timer timer; |
5271 | 0 | uint64_t cycles; |
5272 | 0 | uint32_t tx_flush_interval; |
5273 | |
|
5274 | 0 | cycle_timer_start(&pmd->perf_stats, &timer); |
5275 | |
|
5276 | 0 | output_cnt = dp_packet_batch_size(&p->output_pkts); |
5277 | 0 | ovs_assert(output_cnt > 0); |
5278 | |
|
5279 | 0 | if (p->port->txq_mode == TXQ_MODE_XPS_HASH) { |
5280 | 0 | int n_txq = netdev_n_txq(p->port->netdev); |
5281 | | |
5282 | | /* Re-batch per txq based on packet hash. */ |
5283 | 0 | struct dp_packet *packet; |
5284 | 0 | DP_PACKET_BATCH_FOR_EACH (j, packet, &p->output_pkts) { |
5285 | 0 | uint32_t hash; |
5286 | |
|
5287 | 0 | if (OVS_LIKELY(dp_packet_rss_valid(packet))) { |
5288 | 0 | hash = dp_packet_get_rss_hash(packet); |
5289 | 0 | } else { |
5290 | 0 | struct flow flow; |
5291 | |
|
5292 | 0 | flow_extract(packet, &flow); |
5293 | 0 | hash = flow_hash_5tuple(&flow, 0); |
5294 | 0 | } |
5295 | 0 | dp_packet_batch_add(&p->txq_pkts[hash % n_txq], packet); |
5296 | 0 | } |
5297 | | |
5298 | | /* Flush batches of each Tx queues. */ |
5299 | 0 | for (i = 0; i < n_txq; i++) { |
5300 | 0 | if (dp_packet_batch_is_empty(&p->txq_pkts[i])) { |
5301 | 0 | continue; |
5302 | 0 | } |
5303 | 0 | netdev_send(p->port->netdev, i, &p->txq_pkts[i], true); |
5304 | 0 | dp_packet_batch_init(&p->txq_pkts[i]); |
5305 | 0 | } |
5306 | 0 | } else { |
5307 | 0 | if (p->port->txq_mode == TXQ_MODE_XPS) { |
5308 | 0 | tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p); |
5309 | 0 | concurrent_txqs = true; |
5310 | 0 | } else { |
5311 | 0 | tx_qid = pmd->static_tx_qid; |
5312 | 0 | concurrent_txqs = false; |
5313 | 0 | } |
5314 | 0 | netdev_send(p->port->netdev, tx_qid, &p->output_pkts, concurrent_txqs); |
5315 | 0 | } |
5316 | 0 | dp_packet_batch_init(&p->output_pkts); |
5317 | | |
5318 | | /* Update time of the next flush. */ |
5319 | 0 | atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval); |
5320 | 0 | p->flush_time = pmd->ctx.now + tx_flush_interval; |
5321 | |
|
5322 | 0 | ovs_assert(pmd->n_output_batches > 0); |
5323 | 0 | pmd->n_output_batches--; |
5324 | |
|
5325 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt); |
5326 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1); |
5327 | | |
5328 | | /* Distribute send cycles evenly among transmitted packets and assign to |
5329 | | * their respective rx queues. */ |
5330 | 0 | cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt; |
5331 | 0 | for (i = 0; i < output_cnt; i++) { |
5332 | 0 | if (p->output_pkts_rxqs[i]) { |
5333 | 0 | dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i], |
5334 | 0 | RXQ_CYCLES_PROC_CURR, cycles); |
5335 | 0 | } |
5336 | 0 | } |
5337 | |
|
5338 | 0 | return output_cnt; |
5339 | 0 | } |
5340 | | |
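In the TXQ_MODE_XPS_HASH branch above the transmit queue is derived purely from the packet's RSS (or computed 5-tuple) hash, so all packets of one flow keep landing on the same queue while different flows spread across the available queues. A trivial sketch of the selection (hypothetical helper name):

#include <stdint.h>

static inline int
pick_txq_by_hash(uint32_t hash, int n_txq)
{
    /* Same hash -> same queue; distinct hashes spread over 'n_txq'. */
    return hash % n_txq;
}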
5341 | | static int |
5342 | | dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd, |
5343 | | bool force) |
5344 | 0 | { |
5345 | 0 | struct tx_port *p; |
5346 | 0 | int output_cnt = 0; |
5347 | |
|
5348 | 0 | if (!pmd->n_output_batches) { |
5349 | 0 | return 0; |
5350 | 0 | } |
5351 | | |
5352 | 0 | HMAP_FOR_EACH (p, node, &pmd->send_port_cache) { |
5353 | 0 | if (!dp_packet_batch_is_empty(&p->output_pkts) |
5354 | 0 | && (force || pmd->ctx.now >= p->flush_time)) { |
5355 | 0 | output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p); |
5356 | 0 | } |
5357 | 0 | } |
5358 | 0 | return output_cnt; |
5359 | 0 | } |
5360 | | |
5361 | | static int |
5362 | | dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd, |
5363 | | struct dp_netdev_rxq *rxq, |
5364 | | odp_port_t port_no) |
5365 | 0 | { |
5366 | 0 | struct pmd_perf_stats *s = &pmd->perf_stats; |
5367 | 0 | struct dp_packet_batch batch; |
5368 | 0 | struct cycle_timer timer; |
5369 | 0 | int error; |
5370 | 0 | int batch_cnt = 0; |
5371 | 0 | int rem_qlen = 0, *qlen_p = NULL; |
5372 | 0 | uint64_t cycles; |
5373 | | |
5374 | | /* Measure duration for polling and processing rx burst. */ |
5375 | 0 | cycle_timer_start(&pmd->perf_stats, &timer); |
5376 | |
|
5377 | 0 | pmd->ctx.last_rxq = rxq; |
5378 | 0 | dp_packet_batch_init(&batch); |
5379 | | |
5380 | | /* Fetch the rx queue length only for vhostuser ports. */ |
5381 | 0 | if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) { |
5382 | 0 | qlen_p = &rem_qlen; |
5383 | 0 | } |
5384 | |
|
5385 | 0 | error = netdev_rxq_recv(rxq->rx, &batch, qlen_p); |
5386 | 0 | if (!error) { |
5387 | | /* At least one packet received. */ |
5388 | 0 | *recirc_depth_get() = 0; |
5389 | 0 | pmd_thread_ctx_time_update(pmd); |
5390 | 0 | batch_cnt = dp_packet_batch_size(&batch); |
5391 | 0 | if (pmd_perf_metrics_enabled(pmd)) { |
5392 | | /* Update batch histogram. */ |
5393 | 0 | s->current.batches++; |
5394 | 0 | histogram_add_sample(&s->pkts_per_batch, batch_cnt); |
5395 | | /* Update the maximum vhost rx queue fill level. */ |
5396 | 0 | if (rxq->is_vhost && rem_qlen >= 0) { |
5397 | 0 | uint32_t qfill = batch_cnt + rem_qlen; |
5398 | 0 | if (qfill > s->current.max_vhost_qfill) { |
5399 | 0 | s->current.max_vhost_qfill = qfill; |
5400 | 0 | } |
5401 | 0 | } |
5402 | 0 | } |
5403 | | |
5404 | | /* Process packet batch. */ |
5405 | 0 | int ret = pmd->netdev_input_func(pmd, &batch, port_no); |
5406 | 0 | if (ret) { |
5407 | 0 | dp_netdev_input(pmd, &batch, port_no); |
5408 | 0 | } |
5409 | | |
5410 | | /* Assign processing cycles to rx queue. */ |
5411 | 0 | cycles = cycle_timer_stop(&pmd->perf_stats, &timer); |
5412 | 0 | dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles); |
5413 | |
|
5414 | 0 | dp_netdev_pmd_flush_output_packets(pmd, false); |
5415 | 0 | } else { |
5416 | | /* Discard cycles. */ |
5417 | 0 | cycle_timer_stop(&pmd->perf_stats, &timer); |
5418 | 0 | if (error != EAGAIN && error != EOPNOTSUPP) { |
5419 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
5420 | |
|
5421 | 0 | VLOG_ERR_RL(&rl, "error receiving data from %s: %s", |
5422 | 0 | netdev_rxq_get_name(rxq->rx), ovs_strerror(error)); |
5423 | 0 | } |
5424 | 0 | } |
5425 | |
|
5426 | 0 | pmd->ctx.last_rxq = NULL; |
5427 | |
|
5428 | 0 | return batch_cnt; |
5429 | 0 | } |
5430 | | |
5431 | | static struct tx_port * |
5432 | | tx_port_lookup(const struct hmap *hmap, odp_port_t port_no) |
5433 | 0 | { |
5434 | 0 | struct tx_port *tx; |
5435 | |
|
5436 | 0 | HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) { |
5437 | 0 | if (tx->port->port_no == port_no) { |
5438 | 0 | return tx; |
5439 | 0 | } |
5440 | 0 | } |
5441 | | |
5442 | 0 | return NULL; |
5443 | 0 | } |
5444 | | |
5445 | | static struct tx_bond * |
5446 | | tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id) |
5447 | 0 | { |
5448 | 0 | uint32_t hash = hash_bond_id(bond_id); |
5449 | 0 | struct tx_bond *tx; |
5450 | |
|
5451 | 0 | CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) { |
5452 | 0 | if (tx->bond_id == bond_id) { |
5453 | 0 | return tx; |
5454 | 0 | } |
5455 | 0 | } |
5456 | 0 | return NULL; |
5457 | 0 | } |
5458 | | |
5459 | | static int |
5460 | | port_reconfigure(struct dp_netdev_port *port) |
5461 | 0 | { |
5462 | 0 | struct netdev *netdev = port->netdev; |
5463 | 0 | int i, err; |
5464 | | |
5465 | | /* Closes the existing 'rxq's. */ |
5466 | 0 | for (i = 0; i < port->n_rxq; i++) { |
5467 | 0 | netdev_rxq_close(port->rxqs[i].rx); |
5468 | 0 | port->rxqs[i].rx = NULL; |
5469 | 0 | } |
5470 | 0 | unsigned last_nrxq = port->n_rxq; |
5471 | 0 | port->n_rxq = 0; |
5472 | | |
5473 | | /* Allows 'netdev' to apply the pending configuration changes. */ |
5474 | 0 | if (netdev_is_reconf_required(netdev) || port->need_reconfigure) { |
5475 | 0 | err = netdev_reconfigure(netdev); |
5476 | 0 | if (err && (err != EOPNOTSUPP)) { |
5477 | 0 | VLOG_ERR("Failed to set interface %s new configuration", |
5478 | 0 | netdev_get_name(netdev)); |
5479 | 0 | return err; |
5480 | 0 | } |
5481 | 0 | } |
5482 | | /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */ |
5483 | 0 | port->rxqs = xrealloc(port->rxqs, |
5484 | 0 | sizeof *port->rxqs * netdev_n_rxq(netdev)); |
5485 | | /* Realloc 'used' counters for tx queues. */ |
5486 | 0 | free(port->txq_used); |
5487 | 0 | port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used); |
5488 | |
|
5489 | 0 | for (i = 0; i < netdev_n_rxq(netdev); i++) { |
5490 | 0 | bool new_queue = i >= last_nrxq; |
5491 | 0 | if (new_queue) { |
5492 | 0 | memset(&port->rxqs[i], 0, sizeof port->rxqs[i]); |
5493 | 0 | } |
5494 | |
|
5495 | 0 | port->rxqs[i].port = port; |
5496 | 0 | port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9); |
5497 | |
|
5498 | 0 | err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i); |
5499 | 0 | if (err) { |
5500 | 0 | return err; |
5501 | 0 | } |
5502 | 0 | port->n_rxq++; |
5503 | 0 | } |
5504 | | |
5505 | | /* Parse affinity list to apply configuration for new queues. */ |
5506 | 0 | dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list); |
5507 | | |
5508 | | /* If reconfiguration was successful mark it as such, so we can use it */ |
5509 | 0 | port->need_reconfigure = false; |
5510 | |
|
5511 | 0 | return 0; |
5512 | 0 | } |
5513 | | |
5514 | | struct sched_numa_list { |
5515 | | struct hmap numas; /* Contains 'struct sched_numa'. */ |
5516 | | }; |
5517 | | |
5518 | | /* Metadata for out-of-place pmd rxq assignments. */ |
5519 | | struct sched_pmd { |
5520 | | struct sched_numa *numa; |
5521 | | /* Associated PMD thread. */ |
5522 | | struct dp_netdev_pmd_thread *pmd; |
5523 | | uint64_t pmd_proc_cycles; |
5524 | | struct dp_netdev_rxq **rxqs; |
5525 | | unsigned n_rxq; |
5526 | | bool isolated; |
5527 | | }; |
5528 | | |
5529 | | struct sched_numa { |
5530 | | struct hmap_node node; |
5531 | | int numa_id; |
5532 | | /* PMDs on numa node. */ |
5533 | | struct sched_pmd *pmds; |
5534 | | /* Num of PMDs on numa node. */ |
5535 | | unsigned n_pmds; |
5536 | | /* Num of isolated PMDs on numa node. */ |
5537 | | unsigned n_isolated; |
5538 | | int rr_cur_index; |
5539 | | bool rr_idx_inc; |
5540 | | }; |
5541 | | |
5542 | | static size_t |
5543 | | sched_numa_list_count(struct sched_numa_list *numa_list) |
5544 | 0 | { |
5545 | 0 | return hmap_count(&numa_list->numas); |
5546 | 0 | } |
5547 | | |
5548 | | static struct sched_numa * |
5549 | | sched_numa_list_next(struct sched_numa_list *numa_list, |
5550 | | const struct sched_numa *numa) |
5551 | 0 | { |
5552 | 0 | struct hmap_node *node = NULL; |
5553 | |
|
5554 | 0 | if (numa) { |
5555 | 0 | node = hmap_next(&numa_list->numas, &numa->node); |
5556 | 0 | } |
5557 | 0 | if (!node) { |
5558 | 0 | node = hmap_first(&numa_list->numas); |
5559 | 0 | } |
5560 | |
|
5561 | 0 | return (node) ? CONTAINER_OF(node, struct sched_numa, node) : NULL; |
5562 | 0 | } |
5563 | | |
5564 | | static struct sched_numa * |
5565 | | sched_numa_list_lookup(struct sched_numa_list *numa_list, int numa_id) |
5566 | 0 | { |
5567 | 0 | struct sched_numa *numa; |
5568 | |
|
5569 | 0 | HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), |
5570 | 0 | &numa_list->numas) { |
5571 | 0 | if (numa->numa_id == numa_id) { |
5572 | 0 | return numa; |
5573 | 0 | } |
5574 | 0 | } |
5575 | 0 | return NULL; |
5576 | 0 | } |
5577 | | |
5578 | | static int |
5579 | | compare_sched_pmd_list(const void *a_, const void *b_) |
5580 | 0 | { |
5581 | 0 | struct sched_pmd *a, *b; |
5582 | |
|
5583 | 0 | a = (struct sched_pmd *) a_; |
5584 | 0 | b = (struct sched_pmd *) b_; |
5585 | |
|
5586 | 0 | return compare_poll_thread_list(&a->pmd, &b->pmd); |
5587 | 0 | } |
5588 | | |
5589 | | static void |
5590 | | sort_numa_list_pmds(struct sched_numa_list *numa_list) |
5591 | 0 | { |
5592 | 0 | struct sched_numa *numa; |
5593 | |
|
5594 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
5595 | 0 | if (numa->n_pmds > 1) { |
5596 | 0 | qsort(numa->pmds, numa->n_pmds, sizeof *numa->pmds, |
5597 | 0 | compare_sched_pmd_list); |
5598 | 0 | } |
5599 | 0 | } |
5600 | 0 | } |
5601 | | |
5602 | | /* Populate numas and pmds on those numas. */ |
5603 | | static void |
5604 | | sched_numa_list_populate(struct sched_numa_list *numa_list, |
5605 | | struct dp_netdev *dp) |
5606 | 0 | { |
5607 | 0 | struct dp_netdev_pmd_thread *pmd; |
5608 | |
|
5609 | 0 | hmap_init(&numa_list->numas); |
5610 | | |
5611 | | /* For each pmd on this datapath. */ |
5612 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5613 | 0 | struct sched_numa *numa; |
5614 | 0 | struct sched_pmd *sched_pmd; |
5615 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
5616 | 0 | continue; |
5617 | 0 | } |
5618 | | |
5619 | | /* Get the numa of the PMD. */ |
5620 | 0 | numa = sched_numa_list_lookup(numa_list, pmd->numa_id); |
5621 | | /* Create a new numa node for it if not already created. */ |
5622 | 0 | if (!numa) { |
5623 | 0 | numa = xzalloc(sizeof *numa); |
5624 | 0 | numa->numa_id = pmd->numa_id; |
5625 | 0 | hmap_insert(&numa_list->numas, &numa->node, |
5626 | 0 | hash_int(pmd->numa_id, 0)); |
5627 | 0 | } |
5628 | | |
5629 | | /* Create a sched_pmd on this numa for the pmd. */ |
5630 | 0 | numa->n_pmds++; |
5631 | 0 | numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds); |
5632 | 0 | sched_pmd = &numa->pmds[numa->n_pmds - 1]; |
5633 | 0 | memset(sched_pmd, 0, sizeof *sched_pmd); |
5634 | 0 | sched_pmd->numa = numa; |
5635 | 0 | sched_pmd->pmd = pmd; |
5636 | | /* At least one pmd is present so initialize curr_idx and idx_inc. */ |
5637 | 0 | numa->rr_cur_index = 0; |
5638 | 0 | numa->rr_idx_inc = true; |
5639 | 0 | } |
5640 | 0 | sort_numa_list_pmds(numa_list); |
5641 | 0 | } |
5642 | | |
5643 | | static void |
5644 | | sched_numa_list_free_entries(struct sched_numa_list *numa_list) |
5645 | 0 | { |
5646 | 0 | struct sched_numa *numa; |
5647 | |
|
5648 | 0 | HMAP_FOR_EACH_POP (numa, node, &numa_list->numas) { |
5649 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
5650 | 0 | struct sched_pmd *sched_pmd; |
5651 | |
|
5652 | 0 | sched_pmd = &numa->pmds[i]; |
5653 | 0 | sched_pmd->n_rxq = 0; |
5654 | 0 | free(sched_pmd->rxqs); |
5655 | 0 | } |
5656 | 0 | numa->n_pmds = 0; |
5657 | 0 | free(numa->pmds); |
5658 | 0 | free(numa); |
5659 | 0 | } |
5660 | 0 | hmap_destroy(&numa_list->numas); |
5661 | 0 | } |
5662 | | |
5663 | | static struct sched_pmd * |
5664 | | sched_pmd_find_by_pmd(struct sched_numa_list *numa_list, |
5665 | | struct dp_netdev_pmd_thread *pmd) |
5666 | 0 | { |
5667 | 0 | struct sched_numa *numa; |
5668 | |
|
5669 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
5670 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
5671 | 0 | struct sched_pmd *sched_pmd; |
5672 | |
|
5673 | 0 | sched_pmd = &numa->pmds[i]; |
5674 | 0 | if (pmd == sched_pmd->pmd) { |
5675 | 0 | return sched_pmd; |
5676 | 0 | } |
5677 | 0 | } |
5678 | 0 | } |
5679 | 0 | return NULL; |
5680 | 0 | } |
5681 | | |
5682 | | static void |
5683 | | sched_pmd_add_rxq(struct sched_pmd *sched_pmd, struct dp_netdev_rxq *rxq, |
5684 | | uint64_t cycles) |
5685 | 0 | { |
5686 | | /* As sched_pmd is allocated outside this function, it is better not |
5687 | | * to assume that 'rxqs' is initialized to NULL. */ |
5688 | 0 | if (sched_pmd->n_rxq == 0) { |
5689 | 0 | sched_pmd->rxqs = xmalloc(sizeof *sched_pmd->rxqs); |
5690 | 0 | } else { |
5691 | 0 | sched_pmd->rxqs = xrealloc(sched_pmd->rxqs, (sched_pmd->n_rxq + 1) * |
5692 | 0 | sizeof *sched_pmd->rxqs); |
5693 | 0 | } |
5694 | |
|
5695 | 0 | sched_pmd->rxqs[sched_pmd->n_rxq++] = rxq; |
5696 | 0 | sched_pmd->pmd_proc_cycles += cycles; |
5697 | 0 | } |
5698 | | |
5699 | | static void |
5700 | | sched_numa_list_assignments(struct sched_numa_list *numa_list, |
5701 | | struct dp_netdev *dp) |
5702 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5703 | 0 | { |
5704 | 0 | struct dp_netdev_port *port; |
5705 | | |
5706 | | /* For each port. */ |
5707 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5708 | 0 | if (!netdev_is_pmd(port->netdev)) { |
5709 | 0 | continue; |
5710 | 0 | } |
5711 | | /* For each rxq on the port. */ |
5712 | 0 | for (unsigned qid = 0; qid < port->n_rxq; qid++) { |
5713 | 0 | struct dp_netdev_rxq *rxq = &port->rxqs[qid]; |
5714 | 0 | struct sched_pmd *sched_pmd; |
5715 | 0 | uint64_t proc_cycles = 0; |
5716 | |
|
5717 | 0 | for (int i = 0; i < PMD_INTERVAL_MAX; i++) { |
5718 | 0 | proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, i); |
5719 | 0 | } |
5720 | |
|
5721 | 0 | sched_pmd = sched_pmd_find_by_pmd(numa_list, rxq->pmd); |
5722 | 0 | if (sched_pmd) { |
5723 | 0 | if (rxq->core_id != OVS_CORE_UNSPEC && dp->pmd_iso) { |
5724 | 0 | sched_pmd->isolated = true; |
5725 | 0 | } |
5726 | 0 | sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); |
5727 | 0 | } |
5728 | 0 | } |
5729 | 0 | } |
5730 | 0 | } |
5731 | | |
5732 | | static void |
5733 | | sched_numa_list_put_in_place(struct sched_numa_list *numa_list) |
5734 | 0 | { |
5735 | 0 | struct sched_numa *numa; |
5736 | | |
5737 | | /* For each numa. */ |
5738 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
5739 | | /* For each pmd. */ |
5740 | 0 | for (int i = 0; i < numa->n_pmds; i++) { |
5741 | 0 | struct sched_pmd *sched_pmd; |
5742 | |
|
5743 | 0 | sched_pmd = &numa->pmds[i]; |
5744 | 0 | sched_pmd->pmd->isolated = sched_pmd->isolated; |
5745 | | /* For each rxq. */ |
5746 | 0 | for (unsigned k = 0; k < sched_pmd->n_rxq; k++) { |
5747 | | /* Store the new pmd from the out-of-place sched_numa_list |
5748 | | * struct in the dp_netdev_rxq struct. */ |
5749 | 0 | sched_pmd->rxqs[k]->pmd = sched_pmd->pmd; |
5750 | 0 | } |
5751 | 0 | } |
5752 | 0 | } |
5753 | 0 | } |
5754 | | |
5755 | | /* Returns 'true' if OVS rxq scheduling algorithm assigned any unpinned rxq to |
5756 | | * a PMD thread core on a non-local numa node. */ |
5757 | | static bool |
5758 | | sched_numa_list_cross_numa_polling(struct sched_numa_list *numa_list) |
5759 | 0 | { |
5760 | 0 | struct sched_numa *numa; |
5761 | |
|
5762 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
5763 | 0 | for (int i = 0; i < numa->n_pmds; i++) { |
5764 | 0 | struct sched_pmd *sched_pmd; |
5765 | |
|
5766 | 0 | sched_pmd = &numa->pmds[i]; |
5767 | 0 | if (sched_pmd->isolated) { |
5768 | | /* All rxqs on this PMD thread core are pinned. */ |
5769 | 0 | continue; |
5770 | 0 | } |
5771 | 0 | for (unsigned k = 0; k < sched_pmd->n_rxq; k++) { |
5772 | 0 | struct dp_netdev_rxq *rxq = sched_pmd->rxqs[k]; |
5773 | | /* Check if the rxq is not pinned to a specific PMD thread core |
5774 | | * by the user AND the PMD thread core that OVS assigned is |
5775 | | * non-local to the rxq port. */ |
5776 | 0 | if (rxq->core_id == OVS_CORE_UNSPEC && |
5777 | 0 | rxq->pmd->numa_id != |
5778 | 0 | netdev_get_numa_id(rxq->port->netdev)) { |
5779 | 0 | return true; |
5780 | 0 | } |
5781 | 0 | } |
5782 | 0 | } |
5783 | 0 | } |
5784 | 0 | return false; |
5785 | 0 | } |
5786 | | |
5787 | | static unsigned |
5788 | | sched_numa_noniso_pmd_count(struct sched_numa *numa) |
5789 | 0 | { |
5790 | 0 | if (numa->n_pmds > numa->n_isolated) { |
5791 | 0 | return numa->n_pmds - numa->n_isolated; |
5792 | 0 | } |
5793 | 0 | return 0; |
5794 | 0 | } |
5795 | | |
5796 | | /* Sort Rx Queues by the processing cycles they are consuming. */ |
5797 | | static int |
5798 | | compare_rxq_cycles(const void *a, const void *b) |
5799 | 0 | { |
5800 | 0 | struct dp_netdev_rxq *qa; |
5801 | 0 | struct dp_netdev_rxq *qb; |
5802 | 0 | uint64_t cycles_qa, cycles_qb; |
5803 | |
|
5804 | 0 | qa = *(struct dp_netdev_rxq **) a; |
5805 | 0 | qb = *(struct dp_netdev_rxq **) b; |
5806 | |
|
5807 | 0 | cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST); |
5808 | 0 | cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST); |
5809 | |
|
5810 | 0 | if (cycles_qa != cycles_qb) { |
5811 | 0 | return (cycles_qa < cycles_qb) ? 1 : -1; |
5812 | 0 | } else { |
5813 | | /* Cycles are the same so tiebreak on port/queue id. |
5814 | | * Tiebreaking (as opposed to return 0) ensures consistent |
5815 | | * sort results across multiple OS's. */ |
5816 | 0 | uint32_t port_qa = odp_to_u32(qa->port->port_no); |
5817 | 0 | uint32_t port_qb = odp_to_u32(qb->port->port_no); |
5818 | 0 | if (port_qa != port_qb) { |
5819 | 0 | return port_qa > port_qb ? 1 : -1; |
5820 | 0 | } else { |
5821 | 0 | return netdev_rxq_get_queue_id(qa->rx) |
5822 | 0 | - netdev_rxq_get_queue_id(qb->rx); |
5823 | 0 | } |
5824 | 0 | } |
5825 | 0 | } |
5826 | | |
5827 | | static bool |
5828 | | sched_pmd_new_lowest(struct sched_pmd *current_lowest, struct sched_pmd *pmd, |
5829 | | bool has_proc) |
5830 | 0 | { |
5831 | 0 | uint64_t current_num, pmd_num; |
5832 | |
|
5833 | 0 | if (current_lowest == NULL) { |
5834 | 0 | return true; |
5835 | 0 | } |
5836 | | |
5837 | 0 | if (has_proc) { |
5838 | 0 | current_num = current_lowest->pmd_proc_cycles; |
5839 | 0 | pmd_num = pmd->pmd_proc_cycles; |
5840 | 0 | } else { |
5841 | 0 | current_num = current_lowest->n_rxq; |
5842 | 0 | pmd_num = pmd->n_rxq; |
5843 | 0 | } |
5844 | |
|
5845 | 0 | if (pmd_num < current_num) { |
5846 | 0 | return true; |
5847 | 0 | } |
5848 | 0 | return false; |
5849 | 0 | } |
5850 | | |
5851 | | static struct sched_pmd * |
5852 | | sched_pmd_get_lowest(struct sched_numa *numa, bool has_cyc) |
5853 | 0 | { |
5854 | 0 | struct sched_pmd *lowest_sched_pmd = NULL; |
5855 | |
|
5856 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
5857 | 0 | struct sched_pmd *sched_pmd; |
5858 | |
|
5859 | 0 | sched_pmd = &numa->pmds[i]; |
5860 | 0 | if (sched_pmd->isolated) { |
5861 | 0 | continue; |
5862 | 0 | } |
5863 | 0 | if (sched_pmd_new_lowest(lowest_sched_pmd, sched_pmd, has_cyc)) { |
5864 | 0 | lowest_sched_pmd = sched_pmd; |
5865 | 0 | } |
5866 | 0 | } |
5867 | 0 | return lowest_sched_pmd; |
5868 | 0 | } |
5869 | | |
5870 | | /* |
5871 | | * Returns the next pmd from the numa node. |
5872 | | * |
5873 | | * If 'updown' is 'true' it will alternate between selecting the next pmd in |
5874 | | * either an up or down walk, switching between up/down when the first or last |
5875 | | * core is reached. e.g. 1,2,3,3,2,1,1,2... |
5876 | | * |
5877 | | * If 'updown' is 'false' it will select the next pmd wrapping around when |
5878 | | * last core reached. e.g. 1,2,3,1,2,3,1,2... |
5879 | | */ |
5880 | | static struct sched_pmd * |
5881 | | sched_pmd_next_rr(struct sched_numa *numa, bool updown) |
5882 | 0 | { |
5883 | 0 | int numa_idx = numa->rr_cur_index; |
5884 | |
|
5885 | 0 | if (numa->rr_idx_inc == true) { |
5886 | | /* Incrementing through list of pmds. */ |
5887 | 0 | if (numa->rr_cur_index == numa->n_pmds - 1) { |
5888 | | /* Reached the last pmd. */ |
5889 | 0 | if (updown) { |
5890 | 0 | numa->rr_idx_inc = false; |
5891 | 0 | } else { |
5892 | 0 | numa->rr_cur_index = 0; |
5893 | 0 | } |
5894 | 0 | } else { |
5895 | 0 | numa->rr_cur_index++; |
5896 | 0 | } |
5897 | 0 | } else { |
5898 | | /* Decrementing through list of pmds. */ |
5899 | 0 | if (numa->rr_cur_index == 0) { |
5900 | | /* Reached the first pmd. */ |
5901 | 0 | numa->rr_idx_inc = true; |
5902 | 0 | } else { |
5903 | 0 | numa->rr_cur_index--; |
5904 | 0 | } |
5905 | 0 | } |
5906 | 0 | return &numa->pmds[numa_idx]; |
5907 | 0 | } |
5908 | | |
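A short illustration of the walk above, assuming a freshly populated numa with three pmds (rr_cur_index 0, rr_idx_inc true); the loop and the debug log are hypothetical:

/* With updown == true the visited pmd indices are 0,1,2,2,1,0,0,1,...;
 * with updown == false they wrap as 0,1,2,0,1,2,... (the comment above
 * numbers the cores from 1). */
for (int i = 0; i < 8; i++) {
    struct sched_pmd *sp = sched_pmd_next_rr(numa, true);

    VLOG_DBG("visit pmd index %d", (int) (sp - numa->pmds));
}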
5909 | | static struct sched_pmd * |
5910 | | sched_pmd_next_noniso_rr(struct sched_numa *numa, bool updown) |
5911 | 0 | { |
5912 | 0 | struct sched_pmd *sched_pmd = NULL; |
5913 | | |
5914 | | /* sched_pmd_next_rr() may return duplicate PMDs before all PMDs have been |
5915 | | * returned depending on updown. Call it more than n_pmds times to ensure all |
5916 | | * PMDs can be searched for the next non-isolated PMD. */ |
5917 | 0 | for (unsigned i = 0; i < numa->n_pmds * 2; i++) { |
5918 | 0 | sched_pmd = sched_pmd_next_rr(numa, updown); |
5919 | 0 | if (!sched_pmd->isolated) { |
5920 | 0 | break; |
5921 | 0 | } |
5922 | 0 | sched_pmd = NULL; |
5923 | 0 | } |
5924 | 0 | return sched_pmd; |
5925 | 0 | } |
5926 | | |
5927 | | static struct sched_pmd * |
5928 | | sched_pmd_next(struct sched_numa *numa, enum sched_assignment_type algo, |
5929 | | bool has_proc) |
5930 | 0 | { |
5931 | 0 | if (algo == SCHED_GROUP) { |
5932 | 0 | return sched_pmd_get_lowest(numa, has_proc); |
5933 | 0 | } |
5934 | | |
5935 | | /* By default RR the PMDs. */ |
5936 | 0 | return sched_pmd_next_noniso_rr(numa, algo == SCHED_CYCLES ? true : false); |
5937 | 0 | } |
5938 | | |
5939 | | static const char * |
5940 | | get_assignment_type_string(enum sched_assignment_type algo) |
5941 | 0 | { |
5942 | 0 | switch (algo) { |
5943 | 0 | case SCHED_ROUNDROBIN: return "roundrobin"; |
5944 | 0 | case SCHED_CYCLES: return "cycles"; |
5945 | 0 | case SCHED_GROUP: return "group"; |
5946 | 0 | default: return "Unknown"; |
5947 | 0 | } |
5948 | 0 | } |
5949 | | |
5950 | 0 | #define MAX_RXQ_CYC_TEXT 40 |
5951 | 0 | #define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + MAX_RXQ_CYC_TEXT) |
5952 | | |
5953 | | static char * |
5954 | | get_rxq_cyc_log(char *a, enum sched_assignment_type algo, uint64_t cycles) |
5955 | 0 | { |
5956 | 0 | int ret = 0; |
5957 | |
|
5958 | 0 | if (algo != SCHED_ROUNDROBIN) { |
5959 | 0 | ret = snprintf(a, MAX_RXQ_CYC_STRLEN, |
5960 | 0 | " (measured processing cycles %"PRIu64")", cycles); |
5961 | 0 | } |
5962 | |
|
5963 | 0 | if (algo == SCHED_ROUNDROBIN || ret <= 0) { |
5964 | 0 | a[0] = '\0'; |
5965 | 0 | } |
5966 | 0 | return a; |
5967 | 0 | } |
5968 | | |
5969 | | static void |
5970 | | sched_numa_list_schedule(struct sched_numa_list *numa_list, |
5971 | | struct dp_netdev *dp, |
5972 | | enum sched_assignment_type algo, |
5973 | | enum vlog_level level) |
5974 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5975 | 0 | { |
5976 | 0 | struct dp_netdev_port *port; |
5977 | 0 | struct dp_netdev_rxq **rxqs = NULL; |
5978 | 0 | struct sched_numa *last_cross_numa; |
5979 | 0 | unsigned n_rxqs = 0; |
5980 | 0 | bool start_logged = false; |
5981 | 0 | size_t n_numa; |
5982 | | |
5983 | | /* For each port. */ |
5984 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5985 | 0 | if (!netdev_is_pmd(port->netdev)) { |
5986 | 0 | continue; |
5987 | 0 | } |
5988 | | |
5989 | | /* For each rxq on the port. */ |
5990 | 0 | for (int qid = 0; qid < port->n_rxq; qid++) { |
5991 | 0 | struct dp_netdev_rxq *rxq = &port->rxqs[qid]; |
5992 | |
|
5993 | 0 | if (algo != SCHED_ROUNDROBIN) { |
5994 | 0 | uint64_t cycle_hist = 0; |
5995 | | |
5996 | | /* Sum the queue intervals and store the cycle history. */ |
5997 | 0 | for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) { |
5998 | 0 | cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i); |
5999 | 0 | } |
6000 | 0 | dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST, |
6001 | 0 | cycle_hist); |
6002 | 0 | } |
6003 | | |
6004 | | /* Check if this rxq is pinned. */ |
6005 | 0 | if (rxq->core_id != OVS_CORE_UNSPEC) { |
6006 | 0 | struct sched_pmd *sched_pmd; |
6007 | 0 | struct dp_netdev_pmd_thread *pmd; |
6008 | 0 | struct sched_numa *numa; |
6009 | 0 | bool iso = dp->pmd_iso; |
6010 | 0 | uint64_t proc_cycles; |
6011 | 0 | char rxq_cyc_log[MAX_RXQ_CYC_STRLEN]; |
6012 | | |
6013 | | /* This rxq should be pinned, pin it now. */ |
6014 | 0 | pmd = dp_netdev_get_pmd(dp, rxq->core_id); |
6015 | 0 | sched_pmd = sched_pmd_find_by_pmd(numa_list, pmd); |
6016 | 0 | dp_netdev_pmd_unref(pmd); |
6017 | 0 | if (!sched_pmd) { |
6018 | | /* Cannot find the PMD. Cannot pin this rxq. */ |
6019 | 0 | VLOG(level == VLL_DBG ? VLL_DBG : VLL_WARN, |
6020 | 0 | "Core %2u cannot be pinned with " |
6021 | 0 | "port \'%s\' rx queue %d. Use pmd-cpu-mask to " |
6022 | 0 | "enable a pmd on core %u. An alternative core " |
6023 | 0 | "will be assigned.", |
6024 | 0 | rxq->core_id, |
6025 | 0 | netdev_rxq_get_name(rxq->rx), |
6026 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
6027 | 0 | rxq->core_id); |
6028 | 0 | rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs); |
6029 | 0 | rxqs[n_rxqs++] = rxq; |
6030 | 0 | continue; |
6031 | 0 | } |
6032 | 0 | if (iso) { |
6033 | | /* Mark PMD as isolated if not done already. */ |
6034 | 0 | if (sched_pmd->isolated == false) { |
6035 | 0 | sched_pmd->isolated = true; |
6036 | 0 | numa = sched_pmd->numa; |
6037 | 0 | numa->n_isolated++; |
6038 | 0 | } |
6039 | 0 | } |
6040 | 0 | proc_cycles = dp_netdev_rxq_get_cycles(rxq, |
6041 | 0 | RXQ_CYCLES_PROC_HIST); |
6042 | 0 | VLOG(level, "Core %2u on numa node %d is pinned with " |
6043 | 0 | "port \'%s\' rx queue %d%s", |
6044 | 0 | sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id, |
6045 | 0 | netdev_rxq_get_name(rxq->rx), |
6046 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
6047 | 0 | get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); |
6048 | 0 | sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); |
6049 | 0 | } else { |
6050 | 0 | rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs); |
6051 | 0 | rxqs[n_rxqs++] = rxq; |
6052 | 0 | } |
6053 | 0 | } |
6054 | 0 | } |
6055 | |
|
6056 | 0 | if (n_rxqs > 1 && algo != SCHED_ROUNDROBIN) { |
6057 | | /* Sort the queues in order of the processing cycles |
6058 | | * they consumed during their last pmd interval. */ |
6059 | 0 | qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles); |
6060 | 0 | } |
6061 | |
|
6062 | 0 | last_cross_numa = NULL; |
6063 | 0 | n_numa = sched_numa_list_count(numa_list); |
6064 | 0 | for (unsigned i = 0; i < n_rxqs; i++) { |
6065 | 0 | struct dp_netdev_rxq *rxq = rxqs[i]; |
6066 | 0 | struct sched_pmd *sched_pmd = NULL; |
6067 | 0 | struct sched_numa *numa; |
6068 | 0 | int port_numa_id; |
6069 | 0 | uint64_t proc_cycles; |
6070 | 0 | char rxq_cyc_log[MAX_RXQ_CYC_STRLEN]; |
6071 | |
|
6072 | 0 | if (start_logged == false && level != VLL_DBG) { |
6073 | 0 | VLOG(level, "Performing pmd to rx queue assignment using %s " |
6074 | 0 | "algorithm.", get_assignment_type_string(algo)); |
6075 | 0 | start_logged = true; |
6076 | 0 | } |
6077 | | |
6078 | | /* Store the cycles for this rxq as we will log these later. */ |
6079 | 0 | proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST); |
6080 | |
|
6081 | 0 | port_numa_id = netdev_get_numa_id(rxq->port->netdev); |
6082 | | |
6083 | | /* Select numa. */ |
6084 | 0 | numa = sched_numa_list_lookup(numa_list, port_numa_id); |
6085 | | |
6086 | | /* Check if numa has no PMDs or no non-isolated PMDs. */ |
6087 | 0 | if (!numa || !sched_numa_noniso_pmd_count(numa)) { |
6088 | | /* Unable to use this numa to find a PMD. */ |
6089 | 0 | numa = NULL; |
6090 | | /* Find any numa with available PMDs. */ |
6091 | 0 | for (int j = 0; j < n_numa; j++) { |
6092 | 0 | numa = sched_numa_list_next(numa_list, last_cross_numa); |
6093 | 0 | last_cross_numa = numa; |
6094 | 0 | if (sched_numa_noniso_pmd_count(numa)) { |
6095 | 0 | break; |
6096 | 0 | } |
6097 | 0 | numa = NULL; |
6098 | 0 | } |
6099 | 0 | } |
6100 | |
|
6101 | 0 | if (numa) { |
6102 | | /* Select the PMD that should be used for this rxq. */ |
6103 | 0 | sched_pmd = sched_pmd_next(numa, algo, |
6104 | 0 | proc_cycles ? true : false); |
6105 | 0 | } |
6106 | | |
6107 | | /* Check that a pmd has been selected. */ |
6108 | 0 | if (sched_pmd) { |
6109 | 0 | int pmd_numa_id; |
6110 | |
|
6111 | 0 | pmd_numa_id = sched_pmd->numa->numa_id; |
6112 | | /* Check if selected pmd numa matches port numa. */ |
6113 | 0 | if (pmd_numa_id != port_numa_id) { |
6114 | 0 | VLOG(level, "There's no available (non-isolated) pmd thread " |
6115 | 0 | "on numa node %d. Port \'%s\' rx queue %d will " |
6116 | 0 | "be assigned to a pmd on numa node %d. " |
6117 | 0 | "This may lead to reduced performance.", |
6118 | 0 | port_numa_id, netdev_rxq_get_name(rxq->rx), |
6119 | 0 | netdev_rxq_get_queue_id(rxq->rx), pmd_numa_id); |
6120 | 0 | } |
6121 | 0 | VLOG(level, "Core %2u on numa node %d assigned port \'%s\' " |
6122 | 0 | "rx queue %d%s.", |
6123 | 0 | sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id, |
6124 | 0 | netdev_rxq_get_name(rxq->rx), |
6125 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
6126 | 0 | get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); |
6127 | 0 | sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); |
6128 | 0 | } else { |
6129 | 0 | VLOG(level == VLL_DBG ? level : VLL_WARN, |
6130 | 0 | "No non-isolated pmd on any numa available for " |
6131 | 0 | "port \'%s\' rx queue %d%s. " |
6132 | 0 | "This rx queue will not be polled.", |
6133 | 0 | netdev_rxq_get_name(rxq->rx), |
6134 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
6135 | 0 | get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); |
6136 | 0 | } |
6137 | 0 | } |
6138 | 0 | free(rxqs); |
6139 | 0 | } |
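
As a rough illustration of the 'group' path through the scheduler above: non-pinned queues are sorted by their measured cycles (heaviest first) and each is then handed to the currently least-loaded PMD. A minimal sketch with plain arrays standing in for the rxq and PMD structures; the names and numbers are invented for illustration.

    #include <inttypes.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Sort queue cycle counts in descending order. */
    static int
    cmp_desc(const void *a_, const void *b_)
    {
        uint64_t a = *(const uint64_t *) a_;
        uint64_t b = *(const uint64_t *) b_;

        return a < b ? 1 : a > b ? -1 : 0;
    }

    int
    main(void)
    {
        uint64_t rxq_cycles[] = { 40, 100, 10, 70, 30 };  /* Measured per-rxq load. */
        uint64_t pmd_load[2] = { 0, 0 };                  /* Two non-isolated PMDs. */
        size_t n_rxqs = sizeof rxq_cycles / sizeof rxq_cycles[0];

        qsort(rxq_cycles, n_rxqs, sizeof rxq_cycles[0], cmp_desc);

        for (size_t i = 0; i < n_rxqs; i++) {
            /* Pick the PMD with the least assigned load so far. */
            int lowest = pmd_load[0] <= pmd_load[1] ? 0 : 1;

            pmd_load[lowest] += rxq_cycles[i];
            printf("rxq with %3"PRIu64" cycles -> pmd %d\n", rxq_cycles[i], lowest);
        }
        /* Result: 100 -> pmd 0, 70 -> pmd 1, 40 -> pmd 1, 30 -> pmd 0, 10 -> pmd 1,
         * leaving the loads at 130 and 120 cycles respectively. */
        return 0;
    }
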
6140 | | |
6141 | | static void |
6142 | | rxq_scheduling(struct dp_netdev *dp) |
6143 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
6144 | 0 | { |
6145 | 0 | struct sched_numa_list numa_list; |
6146 | 0 | enum sched_assignment_type algo = dp->pmd_rxq_assign_type; |
6147 | |
|
6148 | 0 | sched_numa_list_populate(&numa_list, dp); |
6149 | 0 | sched_numa_list_schedule(&numa_list, dp, algo, VLL_INFO); |
6150 | 0 | sched_numa_list_put_in_place(&numa_list); |
6151 | |
|
6152 | 0 | sched_numa_list_free_entries(&numa_list); |
6153 | 0 | } |
6154 | | |
6155 | | static uint64_t variance(uint64_t a[], int n); |
6156 | | |
6157 | | static uint64_t |
6158 | | sched_numa_variance(struct sched_numa *numa) |
6159 | 0 | { |
6160 | 0 | uint64_t *percent_busy = NULL; |
6161 | 0 | int n_proc = 0; |
6162 | 0 | uint64_t var; |
6163 | |
|
6164 | 0 | percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy); |
6165 | |
|
6166 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
6167 | 0 | struct sched_pmd *sched_pmd; |
6168 | 0 | uint64_t total_cycles = 0; |
6169 | |
|
6170 | 0 | sched_pmd = &numa->pmds[i]; |
6171 | | /* Exclude isolated PMDs from variance calculations. */ |
6172 | 0 | if (sched_pmd->isolated == true) { |
6173 | 0 | continue; |
6174 | 0 | } |
6175 | | /* Get the total pmd cycles for an interval. */ |
6176 | 0 | atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles); |
6177 | |
|
6178 | 0 | if (total_cycles) { |
6179 | | /* Estimate the cycles to cover all intervals. */ |
6180 | 0 | total_cycles *= PMD_INTERVAL_MAX; |
6181 | 0 | percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100) |
6182 | 0 | / total_cycles; |
6183 | 0 | } else { |
6184 | 0 | percent_busy[n_proc++] = 0; |
6185 | 0 | } |
6186 | 0 | } |
6187 | 0 | var = variance(percent_busy, n_proc); |
6188 | 0 | free(percent_busy); |
6189 | 0 | return var; |
6190 | 0 | } |
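
A worked sketch of the per-PMD busy percentage that feeds the variance calculation above: the cycles seen in one measurement interval are scaled up to cover all intervals, and the queue processing cycles assigned to the PMD are expressed as a percentage of that estimate. The interval count and cycle values below are invented for illustration.

    #include <inttypes.h>
    #include <stdio.h>

    int
    main(void)
    {
        const uint64_t n_intervals = 12;        /* Illustrative stand-in for PMD_INTERVAL_MAX. */
        uint64_t intrvl_cycles = 1000000;       /* Cycles counted in one interval. */
        uint64_t pmd_proc_cycles = 9000000;     /* Rxq processing cycles assigned to the PMD. */

        /* Estimate the cycles available to cover all intervals. */
        uint64_t total_cycles = intrvl_cycles * n_intervals;    /* 12,000,000 */
        uint64_t percent_busy = total_cycles
                                ? pmd_proc_cycles * 100 / total_cycles
                                : 0;

        printf("estimated busy: %"PRIu64"%%\n", percent_busy);  /* Prints "estimated busy: 75%". */
        return 0;
    }
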
6191 | | |
6192 | | /* |
6193 | | * This function checks that some basic conditions needed for a rebalance to be 
6194 | | * effective are met, such as a compatible Rxq scheduling assignment type, at 
6195 | | * least two non-isolated PMDs, and a PMD polling more than one Rxq. If there 
6196 | | * was no reconfiguration change since the last check, it reuses the last result. 
6197 | | * |
6198 | | * It is not intended to be an inclusive check of every condition that may make |
6199 | | * a rebalance ineffective. It is done as a quick check so a full |
6200 | | * pmd_rebalance_dry_run() can be avoided when it is not needed. |
6201 | | */ |
6202 | | static bool |
6203 | | pmd_rebalance_dry_run_needed(struct dp_netdev *dp) |
6204 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
6205 | 0 | { |
6206 | 0 | struct dp_netdev_pmd_thread *pmd; |
6207 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
6208 | 0 | unsigned int cnt = 0; |
6209 | 0 | bool multi_rxq = false; |
6210 | | |
6211 | | /* Check if there was no reconfiguration since last check. */ |
6212 | 0 | if (!pmd_alb->recheck_config) { |
6213 | 0 | if (!pmd_alb->do_dry_run) { |
6214 | 0 | VLOG_DBG("PMD auto load balance nothing to do, " |
6215 | 0 | "no configuration changes since last check."); |
6216 | 0 | return false; |
6217 | 0 | } |
6218 | 0 | return true; |
6219 | 0 | } |
6220 | 0 | pmd_alb->recheck_config = false; |
6221 | | |
6222 | | /* Check for incompatible assignment type. */ |
6223 | 0 | if (dp->pmd_rxq_assign_type == SCHED_ROUNDROBIN) { |
6224 | 0 | VLOG_DBG("PMD auto load balance nothing to do, " |
6225 | 0 | "pmd-rxq-assign=roundrobin assignment type configured."); |
6226 | 0 | return pmd_alb->do_dry_run = false; |
6227 | 0 | } |
6228 | | |
6229 | | /* Check that there are at least 2 non-isolated PMDs and 
6230 | | * that one of them is polling more than one rxq. */ 
6231 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6232 | 0 | if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) { |
6233 | 0 | continue; |
6234 | 0 | } |
6235 | | |
6236 | 0 | if (hmap_count(&pmd->poll_list) > 1) { |
6237 | 0 | multi_rxq = true; |
6238 | 0 | } |
6239 | 0 | if (cnt && multi_rxq) { |
6240 | 0 | return pmd_alb->do_dry_run = true; |
6241 | 0 | } |
6242 | 0 | cnt++; |
6243 | 0 | } |
6244 | | |
6245 | 0 | VLOG_DBG("PMD auto load balance nothing to do, " |
6246 | 0 | "not enough non-isolated PMDs or RxQs."); |
6247 | 0 | return pmd_alb->do_dry_run = false; |
6248 | 0 | } |
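
A condensed sketch of the quick check above, over a plain array of per-PMD rxq counts rather than the real cmap of threads (the function and variable names are invented): a dry run is only worthwhile when at least two non-isolated PMDs exist and at least one of them polls more than one rxq.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* 'n_rxqs[i]' is the number of rxqs polled by non-isolated PMD 'i'. */
    static bool
    dry_run_worthwhile(const unsigned n_rxqs[], size_t n_pmds)
    {
        bool multi_rxq = false;
        size_t cnt = 0;

        for (size_t i = 0; i < n_pmds; i++) {
            if (n_rxqs[i] > 1) {
                multi_rxq = true;
            }
            if (cnt && multi_rxq) {
                return true;
            }
            cnt++;
        }
        return false;
    }

    int
    main(void)
    {
        unsigned one_pmd[] = { 3 };
        unsigned two_pmds[] = { 2, 1 };

        printf("%d %d\n", dry_run_worthwhile(one_pmd, 1),
               dry_run_worthwhile(two_pmds, 2));   /* Prints "0 1". */
        return 0;
    }
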
6249 | | |
6250 | | static bool |
6251 | | pmd_rebalance_dry_run(struct dp_netdev *dp) |
6252 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
6253 | 0 | { |
6254 | 0 | struct sched_numa_list numa_list_cur; |
6255 | 0 | struct sched_numa_list numa_list_est; |
6256 | 0 | bool thresh_met = false; |
6257 | 0 | uint64_t current_var, estimate_var; |
6258 | 0 | struct sched_numa *numa_cur, *numa_est; |
6259 | 0 | uint64_t improvement = 0; |
6260 | |
|
6261 | 0 | VLOG_DBG("PMD auto load balance performing dry run."); |
6262 | | |
6263 | | /* Populate current assignments. */ |
6264 | 0 | sched_numa_list_populate(&numa_list_cur, dp); |
6265 | 0 | sched_numa_list_assignments(&numa_list_cur, dp); |
6266 | | |
6267 | | /* Populate estimated assignments. */ |
6268 | 0 | sched_numa_list_populate(&numa_list_est, dp); |
6269 | 0 | sched_numa_list_schedule(&numa_list_est, dp, |
6270 | 0 | dp->pmd_rxq_assign_type, VLL_DBG); |
6271 | | |
6272 | | /* Check for no cross-numa polling, or only one numa node with PMDs. */ 
6273 | 0 | if (!sched_numa_list_cross_numa_polling(&numa_list_est) || |
6274 | 0 | sched_numa_list_count(&numa_list_est) == 1) { |
6275 | | |
6276 | | /* Calculate variances. */ |
6277 | 0 | HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) { |
6278 | 0 | numa_est = sched_numa_list_lookup(&numa_list_est, |
6279 | 0 | numa_cur->numa_id); |
6280 | 0 | if (!numa_est) { |
6281 | 0 | continue; |
6282 | 0 | } |
6283 | 0 | current_var = sched_numa_variance(numa_cur); |
6284 | 0 | estimate_var = sched_numa_variance(numa_est); |
6285 | 0 | if (estimate_var < current_var) { |
6286 | 0 | improvement = ((current_var - estimate_var) * 100) |
6287 | 0 | / current_var; |
6288 | 0 | } |
6289 | 0 | VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated " |
6290 | 0 | "variance %"PRIu64". Variance improvement %"PRIu64"%%.", |
6291 | 0 | numa_cur->numa_id, current_var, |
6292 | 0 | estimate_var, improvement); |
6293 | 0 | if (improvement >= dp->pmd_alb.rebalance_improve_thresh) { |
6294 | 0 | thresh_met = true; |
6295 | 0 | } |
6296 | 0 | } |
6297 | 0 | VLOG_DBG("PMD load variance improvement threshold %u%% is %s.", |
6298 | 0 | dp->pmd_alb.rebalance_improve_thresh, |
6299 | 0 | thresh_met ? "met" : "not met"); |
6300 | 0 | } else { |
6301 | 0 | VLOG_DBG("PMD auto load balance detected cross-numa polling with " |
6302 | 0 | "multiple numa nodes. Unable to accurately estimate."); |
6303 | 0 | } |
6304 | |
|
6305 | 0 | sched_numa_list_free_entries(&numa_list_cur); |
6306 | 0 | sched_numa_list_free_entries(&numa_list_est); |
6307 | |
|
6308 | 0 | return thresh_met; |
6309 | 0 | } |
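
The improvement figure computed above is simply the relative drop in variance: a current variance of 400 against an estimated variance of 100 gives ((400 - 100) * 100) / 400 = 75%, which would clear any configured improvement threshold up to 75. A short sketch of that arithmetic with invented values:

    #include <inttypes.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t current_var = 400;    /* Variance of the current assignments. */
        uint64_t estimate_var = 100;   /* Variance of the estimated assignments. */
        uint64_t improvement = 0;

        if (estimate_var < current_var) {
            improvement = (current_var - estimate_var) * 100 / current_var;
        }
        printf("improvement: %"PRIu64"%%\n", improvement);   /* Prints "improvement: 75%". */
        return 0;
    }
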
6310 | | |
6311 | | static void |
6312 | | reload_affected_pmds(struct dp_netdev *dp) |
6313 | 0 | { |
6314 | 0 | struct dp_netdev_pmd_thread *pmd; |
6315 | |
|
6316 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6317 | 0 | if (pmd->need_reload) { |
6318 | 0 | dp_netdev_reload_pmd__(pmd); |
6319 | 0 | } |
6320 | 0 | } |
6321 | |
|
6322 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6323 | 0 | if (pmd->need_reload) { |
6324 | 0 | if (pmd->core_id != NON_PMD_CORE_ID) { |
6325 | 0 | bool reload; |
6326 | |
|
6327 | 0 | do { |
6328 | 0 | atomic_read_explicit(&pmd->reload, &reload, |
6329 | 0 | memory_order_acquire); |
6330 | 0 | } while (reload); |
6331 | 0 | } |
6332 | 0 | pmd->need_reload = false; |
6333 | 0 | } |
6334 | 0 | } |
6335 | 0 | } |
6336 | | |
6337 | | static void |
6338 | | reconfigure_pmd_threads(struct dp_netdev *dp) |
6339 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
6340 | 0 | { |
6341 | 0 | struct dp_netdev_pmd_thread *pmd; |
6342 | 0 | struct ovs_numa_dump *pmd_cores; |
6343 | 0 | struct ovs_numa_info_core *core; |
6344 | 0 | struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete); |
6345 | 0 | struct hmapx_node *node; |
6346 | 0 | bool changed = false; |
6347 | 0 | bool need_to_adjust_static_tx_qids = false; |
6348 | | |
6349 | | /* The pmd threads should be started only if there's a pmd port in the |
6350 | | * datapath. If the user didn't provide any "pmd-cpu-mask", we start |
6351 | | * NR_PMD_THREADS per numa node. */ |
6352 | 0 | if (!has_pmd_port(dp)) { |
6353 | 0 | pmd_cores = ovs_numa_dump_n_cores_per_numa(0); |
6354 | 0 | } else if (dp->pmd_cmask && dp->pmd_cmask[0]) { |
6355 | 0 | pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask); |
6356 | 0 | } else { |
6357 | 0 | pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS); |
6358 | 0 | } |
6359 | | |
6360 | | /* We need to adjust 'static_tx_qid's only if we're reducing the number of 
6361 | | * PMD threads. Otherwise, new threads will allocate all the freed ids. */ 
6362 | 0 | if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) { |
6363 | | /* Adjustment is required to keep 'static_tx_qid's sequential and |
6364 | | * avoid possible issues, for example, imbalanced tx queue usage |
6365 | | * and unnecessary locking caused by remapping on netdev level. */ |
6366 | 0 | need_to_adjust_static_tx_qids = true; |
6367 | 0 | } |
6368 | | |
6369 | | /* Check for unwanted pmd threads */ |
6370 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6371 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
6372 | 0 | continue; |
6373 | 0 | } |
6374 | 0 | if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id, |
6375 | 0 | pmd->core_id)) { |
6376 | 0 | hmapx_add(&to_delete, pmd); |
6377 | 0 | } else if (need_to_adjust_static_tx_qids) { |
6378 | 0 | atomic_store_relaxed(&pmd->reload_tx_qid, true); |
6379 | 0 | pmd->need_reload = true; |
6380 | 0 | } |
6381 | 0 | } |
6382 | |
|
6383 | 0 | HMAPX_FOR_EACH (node, &to_delete) { |
6384 | 0 | pmd = (struct dp_netdev_pmd_thread *) node->data; |
6385 | 0 | VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.", |
6386 | 0 | pmd->numa_id, pmd->core_id); |
6387 | 0 | dp_netdev_del_pmd(dp, pmd); |
6388 | 0 | } |
6389 | 0 | changed = !hmapx_is_empty(&to_delete); |
6390 | 0 | hmapx_destroy(&to_delete); |
6391 | |
|
6392 | 0 | if (need_to_adjust_static_tx_qids) { |
6393 | | /* 'static_tx_qid's are not sequential now. |
6394 | | * Reload remaining threads to fix this. */ |
6395 | 0 | reload_affected_pmds(dp); |
6396 | 0 | } |
6397 | | |
6398 | | /* Check for required new pmd threads */ |
6399 | 0 | FOR_EACH_CORE_ON_DUMP(core, pmd_cores) { |
6400 | 0 | pmd = dp_netdev_get_pmd(dp, core->core_id); |
6401 | 0 | if (!pmd) { |
6402 | 0 | struct ds name = DS_EMPTY_INITIALIZER; |
6403 | |
|
6404 | 0 | pmd = xzalloc(sizeof *pmd); |
6405 | 0 | dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id); |
6406 | |
|
6407 | 0 | ds_put_format(&name, "pmd-c%02d/id:", core->core_id); |
6408 | 0 | pmd->thread = ovs_thread_create(ds_cstr(&name), |
6409 | 0 | pmd_thread_main, pmd); |
6410 | 0 | ds_destroy(&name); |
6411 | |
|
6412 | 0 | VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.", |
6413 | 0 | pmd->numa_id, pmd->core_id); |
6414 | 0 | changed = true; |
6415 | 0 | } else { |
6416 | 0 | dp_netdev_pmd_unref(pmd); |
6417 | 0 | } |
6418 | 0 | } |
6419 | |
|
6420 | 0 | if (changed) { |
6421 | 0 | struct ovs_numa_info_numa *numa; |
6422 | | |
6423 | | /* Log the number of pmd threads per numa node. */ |
6424 | 0 | FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) { |
6425 | 0 | VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d", |
6426 | 0 | numa->n_cores, numa->numa_id); |
6427 | 0 | } |
6428 | 0 | } |
6429 | |
|
6430 | 0 | ovs_numa_dump_destroy(pmd_cores); |
6431 | 0 | } |
6432 | | |
6433 | | static void |
6434 | | pmd_remove_stale_ports(struct dp_netdev *dp, |
6435 | | struct dp_netdev_pmd_thread *pmd) |
6436 | | OVS_EXCLUDED(pmd->port_mutex) |
6437 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
6438 | 0 | { |
6439 | 0 | struct rxq_poll *poll; |
6440 | 0 | struct tx_port *tx; |
6441 | |
|
6442 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
6443 | 0 | HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) { |
6444 | 0 | struct dp_netdev_port *port = poll->rxq->port; |
6445 | |
|
6446 | 0 | if (port->need_reconfigure |
6447 | 0 | || !hmap_contains(&dp->ports, &port->node)) { |
6448 | 0 | dp_netdev_del_rxq_from_pmd(pmd, poll); |
6449 | 0 | } |
6450 | 0 | } |
6451 | 0 | HMAP_FOR_EACH_SAFE (tx, node, &pmd->tx_ports) { |
6452 | 0 | struct dp_netdev_port *port = tx->port; |
6453 | |
|
6454 | 0 | if (port->need_reconfigure |
6455 | 0 | || !hmap_contains(&dp->ports, &port->node)) { |
6456 | 0 | dp_netdev_del_port_tx_from_pmd(pmd, tx); |
6457 | 0 | } |
6458 | 0 | } |
6459 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
6460 | 0 | } |
6461 | | |
6462 | | /* Must be called each time a port is added/removed or the cmask changes. |
6463 | | * This creates and destroys pmd threads, reconfigures ports, opens their |
6464 | | * rxqs and assigns all rxqs/txqs to pmd threads. */ |
6465 | | static void |
6466 | | reconfigure_datapath(struct dp_netdev *dp) |
6467 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
6468 | 0 | { |
6469 | 0 | struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads); |
6470 | 0 | struct dp_netdev_pmd_thread *pmd; |
6471 | 0 | struct dp_netdev_port *port; |
6472 | 0 | int wanted_txqs; |
6473 | |
|
6474 | 0 | dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq); |
6475 | | |
6476 | | /* Step 1: Adjust the pmd threads based on the datapath ports, the cores |
6477 | | * on the system and the user configuration. */ |
6478 | 0 | reconfigure_pmd_threads(dp); |
6479 | |
|
6480 | 0 | wanted_txqs = cmap_count(&dp->poll_threads); |
6481 | | |
6482 | | /* The number of pmd threads might have changed, or a port can be new: |
6483 | | * adjust the txqs. */ |
6484 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6485 | 0 | netdev_set_tx_multiq(port->netdev, wanted_txqs); |
6486 | 0 | } |
6487 | | |
6488 | | /* Step 2: Remove from the pmd threads ports that have been removed or |
6489 | | * need reconfiguration. */ |
6490 | | |
6491 | | /* Check for all the ports that need reconfiguration. We cache this in |
6492 | | * 'port->need_reconfigure', because netdev_is_reconf_required() can |
6493 | | * change at any time. |
6494 | | * Also mark for reconfiguration all ports which will likely change their |
6495 | | * 'txq_mode' parameter. It's required to stop using them before |
6496 | | * changing this setting and it's simpler to mark ports here and allow |
6497 | | * 'pmd_remove_stale_ports' to remove them from threads. There will be |
6498 | | * no actual reconfiguration in 'port_reconfigure' because it's |
6499 | | * unnecessary. */ |
6500 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6501 | 0 | if (netdev_is_reconf_required(port->netdev) |
6502 | 0 | || ((port->txq_mode == TXQ_MODE_XPS) |
6503 | 0 | != (netdev_n_txq(port->netdev) < wanted_txqs)) |
6504 | 0 | || ((port->txq_mode == TXQ_MODE_XPS_HASH) |
6505 | 0 | != (port->txq_requested_mode == TXQ_REQ_MODE_HASH |
6506 | 0 | && netdev_n_txq(port->netdev) > 1))) { |
6507 | 0 | port->need_reconfigure = true; |
6508 | 0 | } |
6509 | 0 | } |
6510 | | |
6511 | | /* Remove from the pmd threads all the ports that have been deleted or |
6512 | | * need reconfiguration. */ |
6513 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6514 | 0 | pmd_remove_stale_ports(dp, pmd); |
6515 | 0 | } |
6516 | | |
6517 | | /* Reload affected pmd threads. We must wait for the pmd threads before |
6518 | | * reconfiguring the ports, because a port cannot be reconfigured while |
6519 | | * it's being used. */ |
6520 | 0 | reload_affected_pmds(dp); |
6521 | | |
6522 | | /* Step 3: Reconfigure ports. */ |
6523 | | |
6524 | | /* We only reconfigure the ports that we determined above, because they're |
6525 | | * not being used by any pmd thread at the moment. If a port fails to |
6526 | | * reconfigure we remove it from the datapath. */ |
6527 | 0 | HMAP_FOR_EACH_SAFE (port, node, &dp->ports) { |
6528 | 0 | int err; |
6529 | |
|
6530 | 0 | if (!port->need_reconfigure) { |
6531 | 0 | continue; |
6532 | 0 | } |
6533 | | |
6534 | 0 | err = port_reconfigure(port); |
6535 | 0 | if (err) { |
6536 | 0 | hmap_remove(&dp->ports, &port->node); |
6537 | 0 | seq_change(dp->port_seq); |
6538 | 0 | port_destroy(port); |
6539 | 0 | } else { |
6540 | | /* With a single queue, there is no point in using hash mode. */ |
6541 | 0 | if (port->txq_requested_mode == TXQ_REQ_MODE_HASH && |
6542 | 0 | netdev_n_txq(port->netdev) > 1) { |
6543 | 0 | port->txq_mode = TXQ_MODE_XPS_HASH; |
6544 | 0 | } else if (netdev_n_txq(port->netdev) < wanted_txqs) { |
6545 | 0 | port->txq_mode = TXQ_MODE_XPS; |
6546 | 0 | } else { |
6547 | 0 | port->txq_mode = TXQ_MODE_STATIC; |
6548 | 0 | } |
6549 | 0 | } |
6550 | 0 | } |
6551 | | |
6552 | | /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads |
6553 | | * for now, we just update the 'pmd' pointer in each rxq to point to the |
6554 | | * wanted thread according to the scheduling policy. */ |
6555 | | |
6556 | | /* Reset all the pmd threads to non isolated. */ |
6557 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6558 | 0 | pmd->isolated = false; |
6559 | 0 | } |
6560 | | |
6561 | | /* Reset all the queues to unassigned */ |
6562 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6563 | 0 | for (int i = 0; i < port->n_rxq; i++) { |
6564 | 0 | port->rxqs[i].pmd = NULL; |
6565 | 0 | } |
6566 | 0 | } |
6567 | 0 | rxq_scheduling(dp); |
6568 | | |
6569 | | /* Step 5: Remove queues not compliant with new scheduling. */ |
6570 | | |
6571 | | /* Count all the threads that will have at least one queue to poll. */ |
6572 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6573 | 0 | for (int qid = 0; qid < port->n_rxq; qid++) { |
6574 | 0 | struct dp_netdev_rxq *q = &port->rxqs[qid]; |
6575 | |
|
6576 | 0 | if (q->pmd) { |
6577 | 0 | hmapx_add(&busy_threads, q->pmd); |
6578 | 0 | } |
6579 | 0 | } |
6580 | 0 | } |
6581 | |
|
6582 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6583 | 0 | struct rxq_poll *poll; |
6584 | |
|
6585 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
6586 | 0 | HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) { |
6587 | 0 | if (poll->rxq->pmd != pmd) { |
6588 | 0 | dp_netdev_del_rxq_from_pmd(pmd, poll); |
6589 | | |
6590 | | /* This pmd might sleep after this step if it has no rxq 
6591 | | * remaining. Tell it to busy-wait for its new assignment if it 
6592 | | * has at least one scheduled queue. */ 
6593 | 0 | if (hmap_count(&pmd->poll_list) == 0 && |
6594 | 0 | hmapx_contains(&busy_threads, pmd)) { |
6595 | 0 | atomic_store_relaxed(&pmd->wait_for_reload, true); |
6596 | 0 | } |
6597 | 0 | } |
6598 | 0 | } |
6599 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
6600 | 0 | } |
6601 | |
|
6602 | 0 | hmapx_destroy(&busy_threads); |
6603 | | |
6604 | | /* Reload affected pmd threads. We must wait for the pmd threads to remove |
6605 | | * the old queues before readding them, otherwise a queue can be polled by |
6606 | | * two threads at the same time. */ |
6607 | 0 | reload_affected_pmds(dp); |
6608 | | |
6609 | | /* Step 6: Add queues from scheduling, if they're not there already. */ |
6610 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6611 | 0 | if (!netdev_is_pmd(port->netdev)) { |
6612 | 0 | continue; |
6613 | 0 | } |
6614 | | |
6615 | 0 | for (int qid = 0; qid < port->n_rxq; qid++) { |
6616 | 0 | struct dp_netdev_rxq *q = &port->rxqs[qid]; |
6617 | |
|
6618 | 0 | if (q->pmd) { |
6619 | 0 | ovs_mutex_lock(&q->pmd->port_mutex); |
6620 | 0 | dp_netdev_add_rxq_to_pmd(q->pmd, q); |
6621 | 0 | ovs_mutex_unlock(&q->pmd->port_mutex); |
6622 | 0 | } |
6623 | 0 | } |
6624 | 0 | } |
6625 | | |
6626 | | /* Add every port and bond to the tx port and bond caches of |
6627 | | * every pmd thread, if it's not there already and if this pmd |
6628 | | * has at least one rxq to poll. |
6629 | | */ |
6630 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6631 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
6632 | 0 | if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) { |
6633 | 0 | struct tx_bond *bond; |
6634 | |
|
6635 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6636 | 0 | dp_netdev_add_port_tx_to_pmd(pmd, port); |
6637 | 0 | } |
6638 | |
|
6639 | 0 | CMAP_FOR_EACH (bond, node, &dp->tx_bonds) { |
6640 | 0 | dp_netdev_add_bond_tx_to_pmd(pmd, bond, false); |
6641 | 0 | } |
6642 | 0 | } |
6643 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
6644 | 0 | } |
6645 | | |
6646 | | /* Reload affected pmd threads. */ |
6647 | 0 | reload_affected_pmds(dp); |
6648 | | |
6649 | | /* PMD ALB will need to recheck if dry run needed. */ |
6650 | 0 | dp->pmd_alb.recheck_config = true; |
6651 | 0 | } |
6652 | | |
6653 | | /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */ |
6654 | | static bool |
6655 | | ports_require_restart(const struct dp_netdev *dp) |
6656 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
6657 | 0 | { |
6658 | 0 | struct dp_netdev_port *port; |
6659 | |
|
6660 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6661 | 0 | if (netdev_is_reconf_required(port->netdev)) { |
6662 | 0 | return true; |
6663 | 0 | } |
6664 | 0 | } |
6665 | | |
6666 | 0 | return false; |
6667 | 0 | } |
6668 | | |
6669 | | /* Calculates the variance of the values stored in array 'a'. 'n' is the 
6670 | | * number of elements in the array to be considered when calculating the variance. 
6671 | | * Usage example: data array 'a' contains the processing load of each pmd and 
6672 | | * 'n' is the number of PMDs. It returns the variance in the processing load 
6673 | | * of the PMDs. */ 
6674 | | static uint64_t |
6675 | | variance(uint64_t a[], int n) |
6676 | 0 | { |
6677 | | /* Compute mean (average of elements). */ |
6678 | 0 | uint64_t sum = 0; |
6679 | 0 | uint64_t mean = 0; |
6680 | 0 | uint64_t sqDiff = 0; |
6681 | |
|
6682 | 0 | if (!n) { |
6683 | 0 | return 0; |
6684 | 0 | } |
6685 | | |
6686 | 0 | for (int i = 0; i < n; i++) { |
6687 | 0 | sum += a[i]; |
6688 | 0 | } |
6689 | |
|
6690 | 0 | if (sum) { |
6691 | 0 | mean = sum / n; |
6692 | | |
6693 | | /* Compute sum squared differences with mean. */ |
6694 | 0 | for (int i = 0; i < n; i++) { |
6695 | 0 | sqDiff += (a[i] - mean)*(a[i] - mean); |
6696 | 0 | } |
6697 | 0 | } |
6698 | 0 | return (sqDiff ? (sqDiff / n) : 0); |
6699 | 0 | } |
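
A worked example of the integer arithmetic above, to make the truncation visible: for loads of 10, 20 and 30 the mean is 20, the sum of squared differences is 200, and the reported variance is 200 / 3 = 66 because both divisions discard fractions. The values are invented; the unsigned subtraction relies on the same wrap-around-then-square property as the code above.

    #include <inttypes.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t load[] = { 10, 20, 30 };
        int n = 3;
        uint64_t sum = 0, sq_diff = 0;

        for (int i = 0; i < n; i++) {
            sum += load[i];
        }
        uint64_t mean = sum / n;    /* 60 / 3 = 20. */

        for (int i = 0; i < n; i++) {
            /* (10 - 20) wraps around as unsigned, but squaring it modulo 2^64
             * still yields the correct 100. */
            sq_diff += (load[i] - mean) * (load[i] - mean);
        }
        printf("variance = %"PRIu64"\n", sq_diff / n);   /* 200 / 3 = 66. */
        return 0;
    }
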
6700 | | |
6701 | | /* Return true if needs to revalidate datapath flows. */ |
6702 | | static bool |
6703 | | dpif_netdev_run(struct dpif *dpif) |
6704 | 0 | { |
6705 | 0 | struct dp_netdev_port *port; |
6706 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
6707 | 0 | struct dp_netdev_pmd_thread *non_pmd; |
6708 | 0 | uint64_t new_tnl_seq; |
6709 | 0 | bool need_to_flush = true; |
6710 | 0 | bool pmd_rebalance = false; |
6711 | 0 | long long int now = time_msec(); |
6712 | 0 | struct dp_netdev_pmd_thread *pmd; |
6713 | |
|
6714 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
6715 | 0 | non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID); |
6716 | 0 | if (non_pmd) { |
6717 | 0 | ovs_mutex_lock(&dp->non_pmd_mutex); |
6718 | |
|
6719 | 0 | atomic_read_relaxed(&dp->smc_enable_db, &non_pmd->ctx.smc_enable_db); |
6720 | |
|
6721 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6722 | 0 | if (!netdev_is_pmd(port->netdev)) { |
6723 | 0 | int i; |
6724 | |
|
6725 | 0 | if (port->emc_enabled) { |
6726 | 0 | atomic_read_relaxed(&dp->emc_insert_min, |
6727 | 0 | &non_pmd->ctx.emc_insert_min); |
6728 | 0 | } else { |
6729 | 0 | non_pmd->ctx.emc_insert_min = 0; |
6730 | 0 | } |
6731 | |
|
6732 | 0 | for (i = 0; i < port->n_rxq; i++) { |
6733 | |
|
6734 | 0 | if (!netdev_rxq_enabled(port->rxqs[i].rx)) { |
6735 | 0 | continue; |
6736 | 0 | } |
6737 | | |
6738 | 0 | if (dp_netdev_process_rxq_port(non_pmd, |
6739 | 0 | &port->rxqs[i], |
6740 | 0 | port->port_no)) { |
6741 | 0 | need_to_flush = false; |
6742 | 0 | } |
6743 | 0 | } |
6744 | 0 | } |
6745 | 0 | } |
6746 | 0 | if (need_to_flush) { |
6747 | | /* We didn't receive anything in the process loop. |
6748 | | * Check if we need to send something. |
6749 | | * There were no time updates in the current iteration. */ 
6750 | 0 | pmd_thread_ctx_time_update(non_pmd); |
6751 | 0 | dp_netdev_pmd_flush_output_packets(non_pmd, false); |
6752 | 0 | } |
6753 | |
|
6754 | 0 | dpif_netdev_xps_revalidate_pmd(non_pmd, false); |
6755 | 0 | ovs_mutex_unlock(&dp->non_pmd_mutex); |
6756 | |
|
6757 | 0 | dp_netdev_pmd_unref(non_pmd); |
6758 | 0 | } |
6759 | |
|
6760 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
6761 | 0 | if (pmd_alb->is_enabled) { |
6762 | 0 | if (!pmd_alb->rebalance_poll_timer) { |
6763 | 0 | pmd_alb->rebalance_poll_timer = now; |
6764 | 0 | } else if ((pmd_alb->rebalance_poll_timer + |
6765 | 0 | pmd_alb->rebalance_intvl) < now) { |
6766 | 0 | pmd_alb->rebalance_poll_timer = now; |
6767 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6768 | 0 | if (atomic_count_get(&pmd->pmd_overloaded) >= |
6769 | 0 | PMD_INTERVAL_MAX) { |
6770 | 0 | pmd_rebalance = true; |
6771 | 0 | break; |
6772 | 0 | } |
6773 | 0 | } |
6774 | |
|
6775 | 0 | if (pmd_rebalance && |
6776 | 0 | !dp_netdev_is_reconf_required(dp) && |
6777 | 0 | !ports_require_restart(dp) && |
6778 | 0 | pmd_rebalance_dry_run_needed(dp) && |
6779 | 0 | pmd_rebalance_dry_run(dp)) { |
6780 | 0 | VLOG_INFO("PMD auto load balance dry run. " |
6781 | 0 | "Requesting datapath reconfigure."); |
6782 | 0 | dp_netdev_request_reconfigure(dp); |
6783 | 0 | } |
6784 | 0 | } |
6785 | 0 | } |
6786 | |
|
6787 | 0 | if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) { |
6788 | 0 | reconfigure_datapath(dp); |
6789 | 0 | } |
6790 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
6791 | |
|
6792 | 0 | tnl_neigh_cache_run(); |
6793 | 0 | tnl_port_map_run(); |
6794 | 0 | new_tnl_seq = seq_read(tnl_conf_seq); |
6795 | |
|
6796 | 0 | if (dp->last_tnl_conf_seq != new_tnl_seq) { |
6797 | 0 | dp->last_tnl_conf_seq = new_tnl_seq; |
6798 | 0 | return true; |
6799 | 0 | } |
6800 | 0 | return false; |
6801 | 0 | } |
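
A reduced sketch of the auto load balance trigger sequence in the run loop above, with the datapath state collapsed into a few booleans and millisecond counters. All names and values here are illustrative stand-ins for the OVS fields, not the real API.

    #include <stdbool.h>
    #include <stdio.h>

    struct alb_state {
        bool is_enabled;
        long long rebalance_intvl;        /* Msec between checks. */
        long long rebalance_poll_timer;   /* Time of the last check, msec. */
    };

    /* Returns true when a datapath reconfigure should be requested. */
    static bool
    alb_should_rebalance(struct alb_state *alb, long long now,
                         bool any_pmd_overloaded, bool reconf_pending,
                         bool dry_run_improves)
    {
        if (!alb->is_enabled) {
            return false;
        }
        if (!alb->rebalance_poll_timer) {
            alb->rebalance_poll_timer = now;     /* First pass: arm the timer. */
            return false;
        }
        if (alb->rebalance_poll_timer + alb->rebalance_intvl >= now) {
            return false;                        /* Interval has not elapsed yet. */
        }
        alb->rebalance_poll_timer = now;

        return any_pmd_overloaded && !reconf_pending && dry_run_improves;
    }

    int
    main(void)
    {
        struct alb_state alb = { true, 60000, 0 };

        printf("%d\n", alb_should_rebalance(&alb, 1000, true, false, true));   /* 0 */
        printf("%d\n", alb_should_rebalance(&alb, 70000, true, false, true));  /* 1 */
        return 0;
    }
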
6802 | | |
6803 | | static void |
6804 | | dpif_netdev_wait(struct dpif *dpif) |
6805 | 0 | { |
6806 | 0 | struct dp_netdev_port *port; |
6807 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
6808 | |
|
6809 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
6810 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
6811 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6812 | 0 | netdev_wait_reconf_required(port->netdev); |
6813 | 0 | if (!netdev_is_pmd(port->netdev)) { |
6814 | 0 | int i; |
6815 | |
|
6816 | 0 | for (i = 0; i < port->n_rxq; i++) { |
6817 | 0 | netdev_rxq_wait(port->rxqs[i].rx); |
6818 | 0 | } |
6819 | 0 | } |
6820 | 0 | } |
6821 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
6822 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
6823 | 0 | seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq); |
6824 | 0 | } |
6825 | | |
6826 | | static void |
6827 | | pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd) |
6828 | 0 | { |
6829 | 0 | struct tx_port *tx_port_cached; |
6830 | | |
6831 | | /* Flush all the queued packets. */ |
6832 | 0 | dp_netdev_pmd_flush_output_packets(pmd, true); |
6833 | | /* Free all used tx queue ids. */ |
6834 | 0 | dpif_netdev_xps_revalidate_pmd(pmd, true); |
6835 | |
|
6836 | 0 | HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) { |
6837 | 0 | free(tx_port_cached->txq_pkts); |
6838 | 0 | free(tx_port_cached); |
6839 | 0 | } |
6840 | 0 | HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) { |
6841 | 0 | free(tx_port_cached->txq_pkts); |
6842 | 0 | free(tx_port_cached); |
6843 | 0 | } |
6844 | 0 | } |
6845 | | |
6846 | | /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to |
6847 | | * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel |
6848 | | * device, otherwise to 'pmd->send_port_cache' if the port has at least |
6849 | | * one txq. */ |
6850 | | static void |
6851 | | pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd) |
6852 | | OVS_REQUIRES(pmd->port_mutex) |
6853 | 0 | { |
6854 | 0 | struct tx_port *tx_port, *tx_port_cached; |
6855 | |
|
6856 | 0 | pmd_free_cached_ports(pmd); |
6857 | 0 | hmap_shrink(&pmd->send_port_cache); |
6858 | 0 | hmap_shrink(&pmd->tnl_port_cache); |
6859 | |
|
6860 | 0 | HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) { |
6861 | 0 | int n_txq = netdev_n_txq(tx_port->port->netdev); |
6862 | 0 | struct dp_packet_batch *txq_pkts_cached; |
6863 | |
|
6864 | 0 | if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) { |
6865 | 0 | tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached); |
6866 | 0 | if (tx_port->txq_pkts) { |
6867 | 0 | txq_pkts_cached = xmemdup(tx_port->txq_pkts, |
6868 | 0 | n_txq * sizeof *tx_port->txq_pkts); |
6869 | 0 | tx_port_cached->txq_pkts = txq_pkts_cached; |
6870 | 0 | } |
6871 | 0 | hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node, |
6872 | 0 | hash_port_no(tx_port_cached->port->port_no)); |
6873 | 0 | } |
6874 | |
|
6875 | 0 | if (n_txq) { |
6876 | 0 | tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached); |
6877 | 0 | if (tx_port->txq_pkts) { |
6878 | 0 | txq_pkts_cached = xmemdup(tx_port->txq_pkts, |
6879 | 0 | n_txq * sizeof *tx_port->txq_pkts); |
6880 | 0 | tx_port_cached->txq_pkts = txq_pkts_cached; |
6881 | 0 | } |
6882 | 0 | hmap_insert(&pmd->send_port_cache, &tx_port_cached->node, |
6883 | 0 | hash_port_no(tx_port_cached->port->port_no)); |
6884 | 0 | } |
6885 | 0 | } |
6886 | 0 | } |
6887 | | |
6888 | | static void |
6889 | | pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd) |
6890 | 0 | { |
6891 | 0 | ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); |
6892 | 0 | if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) { |
6893 | 0 | VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d" |
6894 | 0 | ", numa_id %d.", pmd->core_id, pmd->numa_id); |
6895 | 0 | } |
6896 | 0 | ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); |
6897 | |
|
6898 | 0 | VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d" |
6899 | 0 | ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id); |
6900 | 0 | } |
6901 | | |
6902 | | static void |
6903 | | pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd) |
6904 | 0 | { |
6905 | 0 | ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); |
6906 | 0 | id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid); |
6907 | 0 | ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); |
6908 | 0 | } |
6909 | | |
6910 | | static int |
6911 | | pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd, |
6912 | | struct polled_queue **ppoll_list) |
6913 | 0 | { |
6914 | 0 | struct polled_queue *poll_list = *ppoll_list; |
6915 | 0 | struct rxq_poll *poll; |
6916 | 0 | int i; |
6917 | |
|
6918 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
6919 | 0 | poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list) |
6920 | 0 | * sizeof *poll_list); |
6921 | |
|
6922 | 0 | i = 0; |
6923 | 0 | HMAP_FOR_EACH (poll, node, &pmd->poll_list) { |
6924 | 0 | poll_list[i].rxq = poll->rxq; |
6925 | 0 | poll_list[i].port_no = poll->rxq->port->port_no; |
6926 | 0 | poll_list[i].emc_enabled = poll->rxq->port->emc_enabled; |
6927 | 0 | poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx); |
6928 | 0 | poll_list[i].change_seq = |
6929 | 0 | netdev_get_change_seq(poll->rxq->port->netdev); |
6930 | 0 | i++; |
6931 | 0 | } |
6932 | |
|
6933 | 0 | pmd_load_cached_ports(pmd); |
6934 | |
|
6935 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
6936 | |
|
6937 | 0 | *ppoll_list = poll_list; |
6938 | 0 | return i; |
6939 | 0 | } |
6940 | | |
6941 | | static void * |
6942 | | pmd_thread_main(void *f_) |
6943 | 0 | { |
6944 | 0 | struct dp_netdev_pmd_thread *pmd = f_; |
6945 | 0 | struct pmd_perf_stats *s = &pmd->perf_stats; |
6946 | 0 | unsigned int lc = 0; |
6947 | 0 | struct polled_queue *poll_list; |
6948 | 0 | bool wait_for_reload = false; |
6949 | 0 | bool dpdk_attached; |
6950 | 0 | bool reload_tx_qid; |
6951 | 0 | bool exiting; |
6952 | 0 | bool reload; |
6953 | 0 | int poll_cnt; |
6954 | 0 | int i; |
6955 | 0 | int process_packets = 0; |
6956 | 0 | uint64_t sleep_time = 0; |
6957 | |
|
6958 | 0 | poll_list = NULL; |
6959 | | |
6960 | | /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ |
6961 | 0 | ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); |
6962 | 0 | ovs_numa_thread_setaffinity_core(pmd->core_id); |
6963 | 0 | dpdk_attached = dpdk_attach_thread(pmd->core_id); |
6964 | 0 | poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); |
6965 | 0 | dfc_cache_init(&pmd->flow_cache); |
6966 | 0 | pmd_alloc_static_tx_qid(pmd); |
6967 | 0 | set_timer_resolution(PMD_TIMER_RES_NS); |
6968 | |
|
6969 | 0 | reload: |
6970 | 0 | atomic_count_init(&pmd->pmd_overloaded, 0); |
6971 | |
|
6972 | 0 | pmd->intrvl_tsc_prev = 0; |
6973 | 0 | atomic_store_relaxed(&pmd->intrvl_cycles, 0); |
6974 | |
|
6975 | 0 | if (!dpdk_attached) { |
6976 | 0 | dpdk_attached = dpdk_attach_thread(pmd->core_id); |
6977 | 0 | } |
6978 | | |
6979 | | /* List port/core affinity */ |
6980 | 0 | for (i = 0; i < poll_cnt; i++) { |
6981 | 0 | VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n", |
6982 | 0 | pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx), |
6983 | 0 | netdev_rxq_get_queue_id(poll_list[i].rxq->rx)); |
6984 | | /* Reset the rxq current cycles counter. */ |
6985 | 0 | dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0); |
6986 | 0 | for (int j = 0; j < PMD_INTERVAL_MAX; j++) { |
6987 | 0 | dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0); |
6988 | 0 | } |
6989 | 0 | } |
6990 | |
|
6991 | 0 | if (!poll_cnt) { |
6992 | 0 | if (wait_for_reload) { |
6993 | | /* Don't sleep, control thread will ask for a reload shortly. */ |
6994 | 0 | do { |
6995 | 0 | atomic_read_explicit(&pmd->reload, &reload, |
6996 | 0 | memory_order_acquire); |
6997 | 0 | } while (!reload); |
6998 | 0 | } else { |
6999 | 0 | while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) { |
7000 | 0 | seq_wait(pmd->reload_seq, pmd->last_reload_seq); |
7001 | 0 | poll_block(); |
7002 | 0 | } |
7003 | 0 | } |
7004 | 0 | } |
7005 | |
|
7006 | 0 | for (i = 0; i < PMD_INTERVAL_MAX; i++) { |
7007 | 0 | atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0); |
7008 | 0 | } |
7009 | 0 | atomic_count_set(&pmd->intrvl_idx, 0); |
7010 | 0 | cycles_counter_update(s); |
7011 | |
|
7012 | 0 | pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; |
7013 | | |
7014 | | /* Protect pmd stats from external clearing while polling. */ |
7015 | 0 | ovs_mutex_lock(&pmd->perf_stats.stats_mutex); |
7016 | 0 | for (;;) { |
7017 | 0 | uint64_t rx_packets = 0, tx_packets = 0; |
7018 | 0 | uint64_t time_slept = 0; |
7019 | 0 | uint64_t max_sleep; |
7020 | |
|
7021 | 0 | pmd_perf_start_iteration(s); |
7022 | |
|
7023 | 0 | atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db); |
7024 | 0 | atomic_read_relaxed(&pmd->dp->pmd_max_sleep, &max_sleep); |
7025 | |
|
7026 | 0 | for (i = 0; i < poll_cnt; i++) { |
7027 | |
|
7028 | 0 | if (!poll_list[i].rxq_enabled) { |
7029 | 0 | continue; |
7030 | 0 | } |
7031 | | |
7032 | 0 | if (poll_list[i].emc_enabled) { |
7033 | 0 | atomic_read_relaxed(&pmd->dp->emc_insert_min, |
7034 | 0 | &pmd->ctx.emc_insert_min); |
7035 | 0 | } else { |
7036 | 0 | pmd->ctx.emc_insert_min = 0; |
7037 | 0 | } |
7038 | |
|
7039 | 0 | process_packets = |
7040 | 0 | dp_netdev_process_rxq_port(pmd, poll_list[i].rxq, |
7041 | 0 | poll_list[i].port_no); |
7042 | 0 | rx_packets += process_packets; |
7043 | 0 | if (process_packets >= PMD_SLEEP_THRESH) { |
7044 | 0 | sleep_time = 0; |
7045 | 0 | } |
7046 | 0 | } |
7047 | |
|
7048 | 0 | if (!rx_packets) { |
7049 | | /* We didn't receive anything in the process loop. |
7050 | | * Check if we need to send something. |
7051 | | * There were no time updates in the current iteration. */ 
7052 | 0 | pmd_thread_ctx_time_update(pmd); |
7053 | 0 | tx_packets = dp_netdev_pmd_flush_output_packets(pmd, |
7054 | 0 | max_sleep && sleep_time |
7055 | 0 | ? true : false); |
7056 | 0 | } |
7057 | |
|
7058 | 0 | if (max_sleep) { |
7059 | | /* Check if a sleep should happen on this iteration. */ |
7060 | 0 | if (sleep_time) { |
7061 | 0 | struct cycle_timer sleep_timer; |
7062 | |
|
7063 | 0 | cycle_timer_start(&pmd->perf_stats, &sleep_timer); |
7064 | 0 | xnanosleep_no_quiesce(sleep_time * 1000); |
7065 | 0 | time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer); |
7066 | 0 | pmd_thread_ctx_time_update(pmd); |
7067 | 0 | } |
7068 | 0 | if (sleep_time < max_sleep) { |
7069 | | /* Increase sleep time for next iteration. */ |
7070 | 0 | sleep_time += PMD_SLEEP_INC_US; |
7071 | 0 | } else { |
7072 | 0 | sleep_time = max_sleep; |
7073 | 0 | } |
7074 | 0 | } else { |
7075 | | /* Reset sleep time as max sleep policy may have been changed. */ |
7076 | 0 | sleep_time = 0; |
7077 | 0 | } |
7078 | | |
7079 | | /* Do RCU synchronization at a fixed interval. This ensures that 
7080 | | * synchronization is not delayed for long even under a high packet 
7081 | | * processing load. */ 
7082 | 0 | if (pmd->ctx.now > pmd->next_rcu_quiesce) { |
7083 | 0 | if (!ovsrcu_try_quiesce()) { |
7084 | 0 | pmd->next_rcu_quiesce = |
7085 | 0 | pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; |
7086 | 0 | } |
7087 | 0 | } |
7088 | |
|
7089 | 0 | if (lc++ > 1024) { |
7090 | 0 | lc = 0; |
7091 | |
|
7092 | 0 | coverage_try_clear(); |
7093 | 0 | dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt); |
7094 | 0 | if (!ovsrcu_try_quiesce()) { |
7095 | 0 | emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache)); |
7096 | 0 | pmd->next_rcu_quiesce = |
7097 | 0 | pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; |
7098 | 0 | } |
7099 | |
|
7100 | 0 | for (i = 0; i < poll_cnt; i++) { |
7101 | 0 | uint64_t current_seq = |
7102 | 0 | netdev_get_change_seq(poll_list[i].rxq->port->netdev); |
7103 | 0 | if (poll_list[i].change_seq != current_seq) { |
7104 | 0 | poll_list[i].change_seq = current_seq; |
7105 | 0 | poll_list[i].rxq_enabled = |
7106 | 0 | netdev_rxq_enabled(poll_list[i].rxq->rx); |
7107 | 0 | } |
7108 | 0 | } |
7109 | 0 | } |
7110 | |
|
7111 | 0 | atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire); |
7112 | 0 | if (OVS_UNLIKELY(reload)) { |
7113 | 0 | break; |
7114 | 0 | } |
7115 | | |
7116 | 0 | pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept, |
7117 | 0 | pmd_perf_metrics_enabled(pmd)); |
7118 | 0 | } |
7119 | 0 | ovs_mutex_unlock(&pmd->perf_stats.stats_mutex); |
7120 | |
|
7121 | 0 | poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); |
7122 | 0 | atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload); |
7123 | 0 | atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid); |
7124 | 0 | atomic_read_relaxed(&pmd->exit, &exiting); |
7125 | | /* Signal here to make sure the pmd finishes |
7126 | | * reloading the updated configuration. */ |
7127 | 0 | dp_netdev_pmd_reload_done(pmd); |
7128 | |
|
7129 | 0 | if (reload_tx_qid) { |
7130 | 0 | pmd_free_static_tx_qid(pmd); |
7131 | 0 | pmd_alloc_static_tx_qid(pmd); |
7132 | 0 | } |
7133 | |
|
7134 | 0 | if (!exiting) { |
7135 | 0 | goto reload; |
7136 | 0 | } |
7137 | | |
7138 | 0 | pmd_free_static_tx_qid(pmd); |
7139 | 0 | dfc_cache_uninit(&pmd->flow_cache); |
7140 | 0 | free(poll_list); |
7141 | 0 | pmd_free_cached_ports(pmd); |
7142 | 0 | if (dpdk_attached) { |
7143 | 0 | dpdk_detach_thread(); |
7144 | 0 | } |
7145 | 0 | return NULL; |
7146 | 0 | } |
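
The main loop above grows the requested sleep time a little on each quiet iteration and clears it as soon as a burst of packets arrives; a stripped-down sketch of just that back-off policy. The increment, threshold and cap values are invented stand-ins for PMD_SLEEP_INC_US, PMD_SLEEP_THRESH and the configured max sleep.

    #include <inttypes.h>
    #include <stdio.h>

    #define SLEEP_INC_US 1    /* Illustrative stand-in for PMD_SLEEP_INC_US. */
    #define SLEEP_THRESH 16   /* Illustrative stand-in for PMD_SLEEP_THRESH. */

    int
    main(void)
    {
        uint64_t max_sleep = 5;    /* Configured cap, microseconds. */
        uint64_t sleep_time = 0;
        int rx_per_iter[] = { 0, 0, 0, 32, 0, 0 };   /* Packets seen per iteration. */

        for (int i = 0; i < 6; i++) {
            if (rx_per_iter[i] >= SLEEP_THRESH) {
                sleep_time = 0;                      /* Busy again: stop sleeping. */
            }
            /* The real loop sleeps for 'sleep_time' microseconds around here. */
            if (sleep_time < max_sleep) {
                sleep_time += SLEEP_INC_US;          /* Back off a little more. */
            } else {
                sleep_time = max_sleep;
            }
            printf("iter %d: next sleep %"PRIu64" us\n", i, sleep_time);
        }
        return 0;
    }
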
7147 | | |
7148 | | static void |
7149 | | dp_netdev_disable_upcall(struct dp_netdev *dp) |
7150 | | OVS_ACQUIRES(dp->upcall_rwlock) |
7151 | 0 | { |
7152 | 0 | fat_rwlock_wrlock(&dp->upcall_rwlock); |
7153 | 0 | } |
7154 | | |
7155 | | |
7156 | | /* Meters */ |
7157 | | static void |
7158 | | dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED, |
7159 | | struct ofputil_meter_features *features) |
7160 | 0 | { |
7161 | 0 | features->max_meters = MAX_METERS; |
7162 | 0 | features->band_types = DP_SUPPORTED_METER_BAND_TYPES; |
7163 | 0 | features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK; |
7164 | 0 | features->max_bands = MAX_BANDS; |
7165 | 0 | features->max_color = 0; |
7166 | 0 | } |
7167 | | |
7168 | | /* Applies the meter identified by 'meter_id' to 'packets_'. Packets |
7169 | | * that exceed a band are dropped in-place. */ |
7170 | | static void |
7171 | | dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_, |
7172 | | uint32_t meter_id, long long int now) |
7173 | 0 | { |
7174 | 0 | struct dp_meter *meter; |
7175 | 0 | struct dp_meter_band *band; |
7176 | 0 | struct dp_packet *packet; |
7177 | 0 | long long int long_delta_t; /* msec */ |
7178 | 0 | uint32_t delta_t; /* msec */ |
7179 | 0 | const size_t cnt = dp_packet_batch_size(packets_); |
7180 | 0 | uint32_t bytes, volume; |
7181 | 0 | int exceeded_band[NETDEV_MAX_BURST]; |
7182 | 0 | uint32_t exceeded_rate[NETDEV_MAX_BURST]; |
7183 | 0 | int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */ |
7184 | |
|
7185 | 0 | if (meter_id >= MAX_METERS) { |
7186 | 0 | return; |
7187 | 0 | } |
7188 | | |
7189 | 0 | meter = dp_meter_lookup(&dp->meters, meter_id); |
7190 | 0 | if (!meter) { |
7191 | 0 | return; |
7192 | 0 | } |
7193 | | |
7194 | | /* Initialize as negative values. */ |
7195 | 0 | memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band); |
7196 | | /* Initialize as zeroes. */ |
7197 | 0 | memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate); |
7198 | |
|
7199 | 0 | ovs_mutex_lock(&meter->lock); |
7200 | | /* All packets will hit the meter at the same time. */ |
7201 | 0 | long_delta_t = now / 1000 - meter->used / 1000; /* msec */ |
7202 | |
|
7203 | 0 | if (long_delta_t < 0) { |
7204 | | /* This condition means that several threads are fighting for the 
7205 | | meter lock, and the one that received its packets a bit later acquired 
7206 | | the lock first. Assume that all racing threads received their packets 
7207 | | at the same time to avoid overflow. */ 
7208 | 0 | long_delta_t = 0; |
7209 | 0 | } |
7210 | | |
7211 | | /* Make sure delta_t will not be too large, so that bucket will not |
7212 | | * wrap around below. */ |
7213 | 0 | delta_t = (long_delta_t > (long long int)meter->max_delta_t) |
7214 | 0 | ? meter->max_delta_t : (uint32_t)long_delta_t; |
7215 | | |
7216 | | /* Update meter stats. */ |
7217 | 0 | meter->used = now; |
7218 | 0 | meter->packet_count += cnt; |
7219 | 0 | bytes = 0; |
7220 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
7221 | 0 | bytes += dp_packet_size(packet); |
7222 | 0 | } |
7223 | 0 | meter->byte_count += bytes; |
7224 | | |
7225 | | /* Meters can operate in terms of packets per second or kilobits per |
7226 | | * second. */ |
7227 | 0 | if (meter->flags & OFPMF13_PKTPS) { |
7228 | | /* Rate in packets/second, bucket 1/1000 packets. */ |
7229 | | /* msec * packets/sec = 1/1000 packets. */ |
7230 | 0 | volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */ |
7231 | 0 | } else { |
7232 | | /* Rate in kbps, bucket in bits. */ |
7233 | | /* msec * kbps = bits */ |
7234 | 0 | volume = bytes * 8; |
7235 | 0 | } |
7236 | | |
7237 | | /* Update all bands and find the one hit with the highest rate for each |
7238 | | * packet (if any). */ |
7239 | 0 | for (int m = 0; m < meter->n_bands; ++m) { |
7240 | 0 | uint64_t max_bucket_size; |
7241 | |
|
7242 | 0 | band = &meter->bands[m]; |
7243 | 0 | max_bucket_size = band->burst_size * 1000ULL; |
7244 | | /* Update band's bucket. */ |
7245 | 0 | band->bucket += (uint64_t) delta_t * band->rate; |
7246 | 0 | if (band->bucket > max_bucket_size) { |
7247 | 0 | band->bucket = max_bucket_size; |
7248 | 0 | } |
7249 | | |
7250 | | /* Drain the bucket for all the packets, if possible. */ |
7251 | 0 | if (band->bucket >= volume) { |
7252 | 0 | band->bucket -= volume; |
7253 | 0 | } else { |
7254 | 0 | int band_exceeded_pkt; |
7255 | | |
7256 | | /* Band limit hit, must process packet-by-packet. */ |
7257 | 0 | if (meter->flags & OFPMF13_PKTPS) { |
7258 | 0 | band_exceeded_pkt = band->bucket / 1000; |
7259 | 0 | band->bucket %= 1000; /* Remainder stays in bucket. */ |
7260 | | |
7261 | | /* Update the exceeding band for each exceeding packet. |
7262 | | * (Only one band will be fired by a packet, and that |
7263 | | * can be different for each packet.) */ |
7264 | 0 | for (int i = band_exceeded_pkt; i < cnt; i++) { |
7265 | 0 | if (band->rate > exceeded_rate[i]) { |
7266 | 0 | exceeded_rate[i] = band->rate; |
7267 | 0 | exceeded_band[i] = m; |
7268 | 0 | } |
7269 | 0 | } |
7270 | 0 | } else { |
7271 | | /* Packet sizes differ, must process one-by-one. */ |
7272 | 0 | band_exceeded_pkt = cnt; |
7273 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
7274 | 0 | uint32_t bits = dp_packet_size(packet) * 8; |
7275 | |
|
7276 | 0 | if (band->bucket >= bits) { |
7277 | 0 | band->bucket -= bits; |
7278 | 0 | } else { |
7279 | 0 | if (i < band_exceeded_pkt) { |
7280 | 0 | band_exceeded_pkt = i; |
7281 | 0 | } |
7282 | | /* Update the exceeding band for the exceeding packet. |
7283 | | * (Only one band will be fired by a packet, and that |
7284 | | * can be different for each packet.) */ |
7285 | 0 | if (band->rate > exceeded_rate[i]) { |
7286 | 0 | exceeded_rate[i] = band->rate; |
7287 | 0 | exceeded_band[i] = m; |
7288 | 0 | } |
7289 | 0 | } |
7290 | 0 | } |
7291 | 0 | } |
7292 | | /* Remember the first exceeding packet. */ |
7293 | 0 | if (exceeded_pkt > band_exceeded_pkt) { |
7294 | 0 | exceeded_pkt = band_exceeded_pkt; |
7295 | 0 | } |
7296 | 0 | } |
7297 | 0 | } |
7298 | | |
7299 | | /* Fire the highest rate band exceeded by each packet, and drop |
7300 | | * packets if needed. */ |
7301 | 0 | size_t j; |
7302 | 0 | DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) { |
7303 | 0 | if (exceeded_band[j] >= 0) { |
7304 | | /* Meter drop packet. */ |
7305 | 0 | band = &meter->bands[exceeded_band[j]]; |
7306 | 0 | band->packet_count += 1; |
7307 | 0 | band->byte_count += dp_packet_size(packet); |
7308 | 0 | COVERAGE_INC(datapath_drop_meter); |
7309 | 0 | dp_packet_delete(packet); |
7310 | 0 | } else { |
7311 | | /* Meter accepts packet. */ |
7312 | 0 | dp_packet_batch_refill(packets_, packet, j); |
7313 | 0 | } |
7314 | 0 | } |
7315 | |
|
7316 | 0 | ovs_mutex_unlock(&meter->lock); |
7317 | 0 | } |
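
A self-contained sketch of the token bucket arithmetic used above, reduced to a single kbps DROP band handling one packet at a time; the rate, burst and packet sizes are invented. The bucket is topped up by rate * delta_t, capped at burst_size * 1000 bits, and each packet either drains its size in bits or is dropped.

    #include <inttypes.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct band {
        uint32_t rate;         /* kbps: bits added to the bucket per msec. */
        uint32_t burst_size;   /* kbits: the bucket is capped at burst_size * 1000 bits. */
        uint64_t bucket;       /* Bits currently available. */
    };

    /* Returns true if the packet passes the band, false if it should be dropped. */
    static bool
    band_admit(struct band *b, uint32_t delta_t_ms, uint32_t pkt_bytes)
    {
        uint64_t max_bucket = b->burst_size * 1000ULL;
        uint32_t bits = pkt_bytes * 8;

        b->bucket += (uint64_t) delta_t_ms * b->rate;
        if (b->bucket > max_bucket) {
            b->bucket = max_bucket;
        }
        if (b->bucket >= bits) {
            b->bucket -= bits;
            return true;
        }
        return false;
    }

    int
    main(void)
    {
        /* 1000 kbps band with a 100 kbit burst, starting with a full bucket. */
        struct band b = { 1000, 100, 100 * 1000ULL };

        printf("%d\n", band_admit(&b, 0, 1500));    /* Drains 12000 bits: passes (1). */
        printf("%d\n", band_admit(&b, 0, 64000));   /* Needs 512000 bits: dropped (0). */
        return 0;
    }
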
7318 | | |
7319 | | /* Meter set/get/del processing is still single-threaded. */ |
7320 | | static int |
7321 | | dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id, |
7322 | | struct ofputil_meter_config *config) |
7323 | 0 | { |
7324 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
7325 | 0 | uint32_t mid = meter_id.uint32; |
7326 | 0 | struct dp_meter *meter; |
7327 | 0 | int i; |
7328 | |
|
7329 | 0 | if (mid >= MAX_METERS) { |
7330 | 0 | return EFBIG; /* Meter_id out of range. */ |
7331 | 0 | } |
7332 | | |
7333 | 0 | if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) { |
7334 | 0 | return EBADF; /* Unsupported flags set */ |
7335 | 0 | } |
7336 | | |
7337 | 0 | if (config->n_bands > MAX_BANDS) { |
7338 | 0 | return EINVAL; |
7339 | 0 | } |
7340 | | |
7341 | 0 | for (i = 0; i < config->n_bands; ++i) { |
7342 | 0 | switch (config->bands[i].type) { |
7343 | 0 | case OFPMBT13_DROP: |
7344 | 0 | break; |
7345 | 0 | default: |
7346 | 0 | return ENODEV; /* Unsupported band type */ |
7347 | 0 | } |
7348 | 0 | } |
7349 | | |
7350 | | /* Allocate meter */ |
7351 | 0 | meter = xzalloc(sizeof *meter |
7352 | 0 | + config->n_bands * sizeof(struct dp_meter_band)); |
7353 | |
|
7354 | 0 | meter->flags = config->flags; |
7355 | 0 | meter->n_bands = config->n_bands; |
7356 | 0 | meter->max_delta_t = 0; |
7357 | 0 | meter->used = time_usec(); |
7358 | 0 | meter->id = mid; |
7359 | 0 | ovs_mutex_init_adaptive(&meter->lock); |
7360 | | |
7361 | | /* set up bands */ |
7362 | 0 | for (i = 0; i < config->n_bands; ++i) { |
7363 | 0 | uint32_t band_max_delta_t; |
7364 | | |
7365 | | /* Set burst size to a workable value if none specified. */ |
7366 | 0 | if (config->bands[i].burst_size == 0) { |
7367 | 0 | config->bands[i].burst_size = config->bands[i].rate; |
7368 | 0 | } |
7369 | |
|
7370 | 0 | meter->bands[i].rate = config->bands[i].rate; |
7371 | 0 | meter->bands[i].burst_size = config->bands[i].burst_size; |
7372 | | /* Start with a full bucket. */ |
7373 | 0 | meter->bands[i].bucket = meter->bands[i].burst_size * 1000ULL; |
7374 | | |
7375 | | /* Figure out max delta_t that is enough to fill any bucket. */ |
7376 | 0 | band_max_delta_t |
7377 | 0 | = meter->bands[i].bucket / meter->bands[i].rate; |
7378 | 0 | if (band_max_delta_t > meter->max_delta_t) { |
7379 | 0 | meter->max_delta_t = band_max_delta_t; |
7380 | 0 | } |
7381 | 0 | } |
7382 | |
|
7383 | 0 | ovs_mutex_lock(&dp->meters_lock); |
7384 | |
|
7385 | 0 | dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any. */ |
7386 | 0 | dp_meter_attach(&dp->meters, meter); |
7387 | |
|
7388 | 0 | ovs_mutex_unlock(&dp->meters_lock); |
7389 | |
|
7390 | 0 | return 0; |
7391 | 0 | } |
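
To make the bucket sizing above concrete: a band configured with a rate of 1000 kbps and a burst_size of 500 kbits starts with a full bucket of 500 * 1000 = 500000 bits and takes 500000 / 1000 = 500 ms to refill from empty, which is that band's contribution to the meter's max_delta_t. A short sketch of the arithmetic with invented values:

    #include <inttypes.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint32_t rate = 1000;        /* kbps. */
        uint32_t burst_size = 500;   /* kbits. */

        uint64_t bucket = burst_size * 1000ULL;      /* 500000 bits when full. */
        uint64_t band_max_delta_t = bucket / rate;   /* 500 msec to refill the bucket. */

        printf("bucket %"PRIu64" bits, max_delta_t %"PRIu64" ms\n",
               bucket, band_max_delta_t);
        return 0;
    }
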
7392 | | |
7393 | | static int |
7394 | | dpif_netdev_meter_get(const struct dpif *dpif, |
7395 | | ofproto_meter_id meter_id_, |
7396 | | struct ofputil_meter_stats *stats, uint16_t n_bands) |
7397 | 0 | { |
7398 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
7399 | 0 | uint32_t meter_id = meter_id_.uint32; |
7400 | 0 | const struct dp_meter *meter; |
7401 | |
|
7402 | 0 | if (meter_id >= MAX_METERS) { |
7403 | 0 | return EFBIG; |
7404 | 0 | } |
7405 | | |
7406 | 0 | meter = dp_meter_lookup(&dp->meters, meter_id); |
7407 | 0 | if (!meter) { |
7408 | 0 | return ENOENT; |
7409 | 0 | } |
7410 | | |
7411 | 0 | if (stats) { |
7412 | 0 | int i = 0; |
7413 | |
|
7414 | 0 | ovs_mutex_lock(&meter->lock); |
7415 | |
|
7416 | 0 | stats->packet_in_count = meter->packet_count; |
7417 | 0 | stats->byte_in_count = meter->byte_count; |
7418 | |
|
7419 | 0 | for (i = 0; i < n_bands && i < meter->n_bands; ++i) { |
7420 | 0 | stats->bands[i].packet_count = meter->bands[i].packet_count; |
7421 | 0 | stats->bands[i].byte_count = meter->bands[i].byte_count; |
7422 | 0 | } |
7423 | |
|
7424 | 0 | ovs_mutex_unlock(&meter->lock); |
7425 | 0 | stats->n_bands = i; |
7426 | 0 | } |
7427 | |
|
7428 | 0 | return 0; |
7429 | 0 | } |
7430 | | |
7431 | | static int |
7432 | | dpif_netdev_meter_del(struct dpif *dpif, |
7433 | | ofproto_meter_id meter_id_, |
7434 | | struct ofputil_meter_stats *stats, uint16_t n_bands) |
7435 | 0 | { |
7436 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
7437 | 0 | int error; |
7438 | |
|
7439 | 0 | error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands); |
7440 | 0 | if (!error) { |
7441 | 0 | uint32_t meter_id = meter_id_.uint32; |
7442 | |
|
7443 | 0 | ovs_mutex_lock(&dp->meters_lock); |
7444 | 0 | dp_meter_detach_free(&dp->meters, meter_id); |
7445 | 0 | ovs_mutex_unlock(&dp->meters_lock); |
7446 | 0 | } |
7447 | 0 | return error; |
7448 | 0 | } |
7449 | | |
7450 | | |
7451 | | static void |
7452 | | dpif_netdev_disable_upcall(struct dpif *dpif) |
7453 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
7454 | 0 | { |
7455 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
7456 | 0 | dp_netdev_disable_upcall(dp); |
7457 | 0 | } |
7458 | | |
7459 | | static void |
7460 | | dp_netdev_enable_upcall(struct dp_netdev *dp) |
7461 | | OVS_RELEASES(dp->upcall_rwlock) |
7462 | 0 | { |
7463 | 0 | fat_rwlock_unlock(&dp->upcall_rwlock); |
7464 | 0 | } |
7465 | | |
7466 | | static void |
7467 | | dpif_netdev_enable_upcall(struct dpif *dpif) |
7468 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
7469 | 0 | { |
7470 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
7471 | 0 | dp_netdev_enable_upcall(dp); |
7472 | 0 | } |
7473 | | |
7474 | | static void |
7475 | | dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd) |
7476 | 0 | { |
7477 | 0 | atomic_store_relaxed(&pmd->wait_for_reload, false); |
7478 | 0 | atomic_store_relaxed(&pmd->reload_tx_qid, false); |
7479 | 0 | pmd->last_reload_seq = seq_read(pmd->reload_seq); |
7480 | 0 | atomic_store_explicit(&pmd->reload, false, memory_order_release); |
7481 | 0 | } |
7482 | | |
7483 | | /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns |
7484 | | * the pointer if it succeeds, otherwise NULL (it can return NULL even if
7485 | | * 'core_id' is NON_PMD_CORE_ID). |
7486 | | * |
7487 | | * Caller must unref the returned reference. */
7488 | | static struct dp_netdev_pmd_thread * |
7489 | | dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id) |
7490 | 0 | { |
7491 | 0 | struct dp_netdev_pmd_thread *pmd; |
7492 | |
|
7493 | 0 | CMAP_FOR_EACH_WITH_HASH (pmd, node, hash_int(core_id, 0), |
7494 | 0 | &dp->poll_threads) { |
7495 | 0 | if (pmd->core_id == core_id) { |
7496 | 0 | return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL; |
7497 | 0 | } |
7498 | 0 | } |
7499 | | |
7500 | 0 | return NULL; |
7501 | 0 | } |
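The comment above states the ownership rule: a successful dp_netdev_get_pmd() takes a reference that the caller must drop. A hypothetical caller sketch ('example_use_pmd' is not part of the source):

/* Hypothetical caller sketch: every reference returned by
 * dp_netdev_get_pmd() must be released with dp_netdev_pmd_unref(). */
static void
example_use_pmd(struct dp_netdev *dp, unsigned core_id)
{
    struct dp_netdev_pmd_thread *pmd = dp_netdev_get_pmd(dp, core_id);

    if (pmd) {
        /* ... use 'pmd'; the reference keeps it from being destroyed ... */
        dp_netdev_pmd_unref(pmd);
    }
}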
7502 | | |
7503 | | /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */ |
7504 | | static void |
7505 | | dp_netdev_set_nonpmd(struct dp_netdev *dp) |
7506 | | OVS_REQ_WRLOCK(dp->port_rwlock) |
7507 | 0 | { |
7508 | 0 | struct dp_netdev_pmd_thread *non_pmd; |
7509 | |
|
7510 | 0 | non_pmd = xzalloc(sizeof *non_pmd); |
7511 | 0 | dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC); |
7512 | 0 | } |
7513 | | |
7514 | | /* Caller must have valid pointer to 'pmd'. */ |
7515 | | static bool |
7516 | | dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd) |
7517 | 0 | { |
7518 | 0 | return ovs_refcount_try_ref_rcu(&pmd->ref_cnt); |
7519 | 0 | } |
7520 | | |
7521 | | static void |
7522 | | dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd) |
7523 | 0 | { |
7524 | 0 | if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) { |
7525 | 0 | ovsrcu_postpone(dp_netdev_destroy_pmd, pmd); |
7526 | 0 | } |
7527 | 0 | } |
7528 | | |
7529 | | /* Given cmap position 'pos', tries to ref the next node. If try_ref() |
7530 | | * fails, keeps checking for next node until reaching the end of cmap. |
7531 | | * |
7532 | | * Caller must unref the returned reference. */
7533 | | static struct dp_netdev_pmd_thread * |
7534 | | dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos) |
7535 | 0 | { |
7536 | 0 | struct dp_netdev_pmd_thread *next; |
7537 | |
|
7538 | 0 | do { |
7539 | 0 | struct cmap_node *node; |
7540 | |
|
7541 | 0 | node = cmap_next_position(&dp->poll_threads, pos); |
7542 | 0 | next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node) |
7543 | 0 | : NULL; |
7544 | 0 | } while (next && !dp_netdev_pmd_try_ref(next)); |
7545 | |
|
7546 | 0 | return next; |
7547 | 0 | } |
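The same ownership rule applies when iterating: each reference handed out by dp_netdev_pmd_get_next() must be released. A hypothetical walk over all PMD threads ('example_walk_pmds' is not part of the source), assuming a zero-initialized 'struct cmap_position' starts the scan from the beginning:

/* Hypothetical iteration sketch: visit every PMD thread via
 * dp_netdev_pmd_get_next(), dropping each reference when done. */
static void
example_walk_pmds(struct dp_netdev *dp)
{
    struct cmap_position pos = { 0 };   /* Assumed: zero position = start. */
    struct dp_netdev_pmd_thread *pmd;

    while ((pmd = dp_netdev_pmd_get_next(dp, &pos)) != NULL) {
        /* ... inspect 'pmd' ... */
        dp_netdev_pmd_unref(pmd);
    }
}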
7548 | | |
7549 | | /* Configures the 'pmd' based on the input argument. */ |
7550 | | static void |
7551 | | dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp, |
7552 | | unsigned core_id, int numa_id) |
7553 | 0 | { |
7554 | 0 | pmd->dp = dp; |
7555 | 0 | pmd->core_id = core_id; |
7556 | 0 | pmd->numa_id = numa_id; |
7557 | 0 | pmd->need_reload = false; |
7558 | 0 | pmd->n_output_batches = 0; |
7559 | |
|
7560 | 0 | ovs_refcount_init(&pmd->ref_cnt); |
7561 | 0 | atomic_init(&pmd->exit, false); |
7562 | 0 | pmd->reload_seq = seq_create(); |
7563 | 0 | pmd->last_reload_seq = seq_read(pmd->reload_seq); |
7564 | 0 | atomic_init(&pmd->reload, false); |
7565 | 0 | ovs_mutex_init(&pmd->flow_mutex); |
7566 | 0 | ovs_mutex_init(&pmd->port_mutex); |
7567 | 0 | ovs_mutex_init(&pmd->bond_mutex); |
7568 | 0 | cmap_init(&pmd->flow_table); |
7569 | 0 | cmap_init(&pmd->classifiers); |
7570 | 0 | cmap_init(&pmd->simple_match_table); |
7571 | 0 | ccmap_init(&pmd->n_flows); |
7572 | 0 | ccmap_init(&pmd->n_simple_flows); |
7573 | 0 | pmd->ctx.last_rxq = NULL; |
7574 | 0 | pmd_thread_ctx_time_update(pmd); |
7575 | 0 | pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL; |
7576 | 0 | pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; |
7577 | 0 | pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN; |
7578 | 0 | pmd->busy_cycles_intrvl = xzalloc(PMD_INTERVAL_MAX * |
7579 | 0 | sizeof *pmd->busy_cycles_intrvl); |
7580 | 0 | hmap_init(&pmd->poll_list); |
7581 | 0 | hmap_init(&pmd->tx_ports); |
7582 | 0 | hmap_init(&pmd->tnl_port_cache); |
7583 | 0 | hmap_init(&pmd->send_port_cache); |
7584 | 0 | cmap_init(&pmd->tx_bonds); |
7585 | | |
7586 | | /* Initialize DPIF function pointer to the default configured version. */ |
7587 | 0 | atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default()); |
7588 | | |
7589 | | /* Init default miniflow_extract function */ |
7590 | 0 | atomic_init(&pmd->miniflow_extract_opt, dp_mfex_impl_get_default()); |
7591 | | |
7592 | | /* Init the 'flow_cache' since there is no
7593 | | * actual thread created for NON_PMD_CORE_ID. */ |
7594 | 0 | if (core_id == NON_PMD_CORE_ID) { |
7595 | 0 | dfc_cache_init(&pmd->flow_cache); |
7596 | 0 | pmd_alloc_static_tx_qid(pmd); |
7597 | 0 | } |
7598 | 0 | pmd_perf_stats_init(&pmd->perf_stats); |
7599 | 0 | cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node), |
7600 | 0 | hash_int(core_id, 0)); |
7601 | 0 | } |
7602 | | |
7603 | | static void |
7604 | | dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd) |
7605 | 0 | { |
7606 | 0 | struct dpcls *cls; |
7607 | |
|
7608 | 0 | dp_netdev_pmd_flow_flush(pmd); |
7609 | 0 | hmap_destroy(&pmd->send_port_cache); |
7610 | 0 | hmap_destroy(&pmd->tnl_port_cache); |
7611 | 0 | hmap_destroy(&pmd->tx_ports); |
7612 | 0 | cmap_destroy(&pmd->tx_bonds); |
7613 | 0 | hmap_destroy(&pmd->poll_list); |
7614 | 0 | free(pmd->busy_cycles_intrvl); |
7615 | | /* All flows (including their dpcls_rules) have been deleted already */ |
7616 | 0 | CMAP_FOR_EACH (cls, node, &pmd->classifiers) { |
7617 | 0 | dpcls_destroy(cls); |
7618 | 0 | ovsrcu_postpone(free, cls); |
7619 | 0 | } |
7620 | 0 | cmap_destroy(&pmd->classifiers); |
7621 | 0 | cmap_destroy(&pmd->flow_table); |
7622 | 0 | cmap_destroy(&pmd->simple_match_table); |
7623 | 0 | ccmap_destroy(&pmd->n_flows); |
7624 | 0 | ccmap_destroy(&pmd->n_simple_flows); |
7625 | 0 | ovs_mutex_destroy(&pmd->flow_mutex); |
7626 | 0 | seq_destroy(pmd->reload_seq); |
7627 | 0 | ovs_mutex_destroy(&pmd->port_mutex); |
7628 | 0 | ovs_mutex_destroy(&pmd->bond_mutex); |
7629 | 0 | free(pmd->netdev_input_func_userdata); |
7630 | 0 | free(pmd); |
7631 | 0 | } |
7632 | | |
7633 | | /* Stops the pmd thread, removes it from the 'dp->poll_threads', |
7634 | | * and unrefs the struct. */ |
7635 | | static void |
7636 | | dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd) |
7637 | 0 | { |
7638 | | /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize, |
7639 | | * but extra cleanup is necessary */ |
7640 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
7641 | 0 | ovs_mutex_lock(&dp->non_pmd_mutex); |
7642 | 0 | dfc_cache_uninit(&pmd->flow_cache); |
7643 | 0 | pmd_free_cached_ports(pmd); |
7644 | 0 | pmd_free_static_tx_qid(pmd); |
7645 | 0 | ovs_mutex_unlock(&dp->non_pmd_mutex); |
7646 | 0 | } else { |
7647 | 0 | atomic_store_relaxed(&pmd->exit, true); |
7648 | 0 | dp_netdev_reload_pmd__(pmd); |
7649 | 0 | xpthread_join(pmd->thread, NULL); |
7650 | 0 | } |
7651 | |
|
7652 | 0 | dp_netdev_pmd_clear_ports(pmd); |
7653 | | |
7654 | | /* Purges the 'pmd''s flows after stopping the thread, but before |
7655 | | * destroying the flows, so that the flow stats can be collected. */ |
7656 | 0 | if (dp->dp_purge_cb) { |
7657 | 0 | dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id); |
7658 | 0 | } |
7659 | 0 | cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0)); |
7660 | 0 | dp_netdev_pmd_unref(pmd); |
7661 | 0 | } |
7662 | | |
7663 | | /* Destroys all pmd threads. If 'non_pmd' is true, it also destroys the non-pmd
7664 | | * thread. */ |
7665 | | static void |
7666 | | dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd) |
7667 | 0 | { |
7668 | 0 | struct dp_netdev_pmd_thread *pmd; |
7669 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
7670 | 0 | size_t k = 0, n_pmds; |
7671 | |
|
7672 | 0 | n_pmds = cmap_count(&dp->poll_threads); |
7673 | 0 | pmd_list = xcalloc(n_pmds, sizeof *pmd_list); |
7674 | |
|
7675 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
7676 | 0 | if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) { |
7677 | 0 | continue; |
7678 | 0 | } |
7679 | | /* We cannot call dp_netdev_del_pmd(), since it alters |
7680 | | * 'dp->poll_threads' (while we're iterating it) and it |
7681 | | * might quiesce. */ |
7682 | 0 | ovs_assert(k < n_pmds); |
7683 | 0 | pmd_list[k++] = pmd; |
7684 | 0 | } |
7685 | |
|
7686 | 0 | for (size_t i = 0; i < k; i++) { |
7687 | 0 | dp_netdev_del_pmd(dp, pmd_list[i]); |
7688 | 0 | } |
7689 | 0 | free(pmd_list); |
7690 | 0 | } |
7691 | | |
7692 | | /* Deletes all rx queues from pmd->poll_list and all the ports from |
7693 | | * pmd->tx_ports. */ |
7694 | | static void |
7695 | | dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd) |
7696 | 0 | { |
7697 | 0 | struct rxq_poll *poll; |
7698 | 0 | struct tx_port *port; |
7699 | 0 | struct tx_bond *tx; |
7700 | |
|
7701 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
7702 | 0 | HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) { |
7703 | 0 | free(poll); |
7704 | 0 | } |
7705 | 0 | HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) { |
7706 | 0 | free(port->txq_pkts); |
7707 | 0 | free(port); |
7708 | 0 | } |
7709 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
7710 | |
|
7711 | 0 | ovs_mutex_lock(&pmd->bond_mutex); |
7712 | 0 | CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) { |
7713 | 0 | cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id)); |
7714 | 0 | ovsrcu_postpone(free, tx); |
7715 | 0 | } |
7716 | 0 | ovs_mutex_unlock(&pmd->bond_mutex); |
7717 | 0 | } |
7718 | | |
7719 | | /* Adds rx queue to poll_list of PMD thread, if it's not there already. */ |
7720 | | static void |
7721 | | dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd, |
7722 | | struct dp_netdev_rxq *rxq) |
7723 | | OVS_REQUIRES(pmd->port_mutex) |
7724 | 0 | { |
7725 | 0 | int qid = netdev_rxq_get_queue_id(rxq->rx); |
7726 | 0 | uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid); |
7727 | 0 | struct rxq_poll *poll; |
7728 | |
|
7729 | 0 | HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) { |
7730 | 0 | if (poll->rxq == rxq) { |
7731 | | /* 'rxq' is already polled by this thread. Do nothing. */ |
7732 | 0 | return; |
7733 | 0 | } |
7734 | 0 | } |
7735 | | |
7736 | 0 | poll = xmalloc(sizeof *poll); |
7737 | 0 | poll->rxq = rxq; |
7738 | 0 | hmap_insert(&pmd->poll_list, &poll->node, hash); |
7739 | |
|
7740 | 0 | pmd->need_reload = true; |
7741 | 0 | } |
7742 | | |
7743 | | /* Delete 'poll' from poll_list of PMD thread. */ |
7744 | | static void |
7745 | | dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd, |
7746 | | struct rxq_poll *poll) |
7747 | | OVS_REQUIRES(pmd->port_mutex) |
7748 | 0 | { |
7749 | 0 | hmap_remove(&pmd->poll_list, &poll->node); |
7750 | 0 | free(poll); |
7751 | |
|
7752 | 0 | pmd->need_reload = true; |
7753 | 0 | } |
7754 | | |
7755 | | /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the |
7756 | | * changes to take effect. */ |
7757 | | static void |
7758 | | dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, |
7759 | | struct dp_netdev_port *port) |
7760 | | OVS_REQUIRES(pmd->port_mutex) |
7761 | 0 | { |
7762 | 0 | struct tx_port *tx; |
7763 | |
|
7764 | 0 | tx = tx_port_lookup(&pmd->tx_ports, port->port_no); |
7765 | 0 | if (tx) { |
7766 | | /* 'port' is already on this thread tx cache. Do nothing. */ |
7767 | 0 | return; |
7768 | 0 | } |
7769 | | |
7770 | 0 | tx = xzalloc(sizeof *tx); |
7771 | |
|
7772 | 0 | tx->port = port; |
7773 | 0 | tx->qid = -1; |
7774 | 0 | tx->flush_time = 0LL; |
7775 | 0 | dp_packet_batch_init(&tx->output_pkts); |
7776 | |
|
7777 | 0 | if (tx->port->txq_mode == TXQ_MODE_XPS_HASH) { |
7778 | 0 | int i, n_txq = netdev_n_txq(tx->port->netdev); |
7779 | |
|
7780 | 0 | tx->txq_pkts = xzalloc(n_txq * sizeof *tx->txq_pkts); |
7781 | 0 | for (i = 0; i < n_txq; i++) { |
7782 | 0 | dp_packet_batch_init(&tx->txq_pkts[i]); |
7783 | 0 | } |
7784 | 0 | } |
7785 | |
|
7786 | 0 | hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no)); |
7787 | 0 | pmd->need_reload = true; |
7788 | 0 | } |
7789 | | |
7790 | | /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the |
7791 | | * changes to take effect. */ |
7792 | | static void |
7793 | | dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, |
7794 | | struct tx_port *tx) |
7795 | | OVS_REQUIRES(pmd->port_mutex) |
7796 | 0 | { |
7797 | 0 | hmap_remove(&pmd->tx_ports, &tx->node); |
7798 | 0 | free(tx->txq_pkts); |
7799 | 0 | free(tx); |
7800 | 0 | pmd->need_reload = true; |
7801 | 0 | } |
7802 | | |
7803 | | /* Add bond to the tx bond cmap of 'pmd'. */ |
7804 | | static void |
7805 | | dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, |
7806 | | struct tx_bond *bond, bool update) |
7807 | | OVS_EXCLUDED(pmd->bond_mutex) |
7808 | 0 | { |
7809 | 0 | struct tx_bond *tx; |
7810 | |
|
7811 | 0 | ovs_mutex_lock(&pmd->bond_mutex); |
7812 | 0 | tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id); |
7813 | |
|
7814 | 0 | if (tx && !update) { |
7815 | | /* It's not an update and the entry already exists. Do nothing. */ |
7816 | 0 | goto unlock; |
7817 | 0 | } |
7818 | | |
7819 | 0 | if (tx) { |
7820 | 0 | struct tx_bond *new_tx = xmemdup(bond, sizeof *bond); |
7821 | | |
7822 | | /* Copy the stats for each bucket. */ |
7823 | 0 | for (int i = 0; i < BOND_BUCKETS; i++) { |
7824 | 0 | uint64_t n_packets, n_bytes; |
7825 | |
|
7826 | 0 | atomic_read_relaxed(&tx->member_buckets[i].n_packets, &n_packets); |
7827 | 0 | atomic_read_relaxed(&tx->member_buckets[i].n_bytes, &n_bytes); |
7828 | 0 | atomic_init(&new_tx->member_buckets[i].n_packets, n_packets); |
7829 | 0 | atomic_init(&new_tx->member_buckets[i].n_bytes, n_bytes); |
7830 | 0 | } |
7831 | 0 | cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node, |
7832 | 0 | hash_bond_id(bond->bond_id)); |
7833 | 0 | ovsrcu_postpone(free, tx); |
7834 | 0 | } else { |
7835 | 0 | tx = xmemdup(bond, sizeof *bond); |
7836 | 0 | cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id)); |
7837 | 0 | } |
7838 | 0 | unlock: |
7839 | 0 | ovs_mutex_unlock(&pmd->bond_mutex); |
7840 | 0 | } |
7841 | | |
7842 | | /* Delete bond from the tx bond cmap of 'pmd'. */ |
7843 | | static void |
7844 | | dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, |
7845 | | uint32_t bond_id) |
7846 | | OVS_EXCLUDED(pmd->bond_mutex) |
7847 | 0 | { |
7848 | 0 | struct tx_bond *tx; |
7849 | |
|
7850 | 0 | ovs_mutex_lock(&pmd->bond_mutex); |
7851 | 0 | tx = tx_bond_lookup(&pmd->tx_bonds, bond_id); |
7852 | 0 | if (tx) { |
7853 | 0 | cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id)); |
7854 | 0 | ovsrcu_postpone(free, tx); |
7855 | 0 | } |
7856 | 0 | ovs_mutex_unlock(&pmd->bond_mutex); |
7857 | 0 | } |
7858 | | |
7859 | | static char * |
7860 | | dpif_netdev_get_datapath_version(void) |
7861 | 0 | { |
7862 | 0 | return xstrdup("<built-in>"); |
7863 | 0 | } |
7864 | | |
7865 | | static void |
7866 | | dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size, |
7867 | | uint16_t tcp_flags, long long now) |
7868 | 0 | { |
7869 | 0 | uint16_t flags; |
7870 | |
|
7871 | 0 | atomic_store_relaxed(&netdev_flow->stats.used, now); |
7872 | 0 | non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt); |
7873 | 0 | non_atomic_ullong_add(&netdev_flow->stats.byte_count, size); |
7874 | 0 | atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags); |
7875 | 0 | flags |= tcp_flags; |
7876 | 0 | atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags); |
7877 | 0 | } |
7878 | | |
7879 | | static int |
7880 | | dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, |
7881 | | struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid, |
7882 | | enum dpif_upcall_type type, const struct nlattr *userdata, |
7883 | | struct ofpbuf *actions, struct ofpbuf *put_actions) |
7884 | 0 | { |
7885 | 0 | struct dp_netdev *dp = pmd->dp; |
7886 | |
|
7887 | 0 | if (OVS_UNLIKELY(!dp->upcall_cb)) { |
7888 | 0 | return ENODEV; |
7889 | 0 | } |
7890 | | |
7891 | 0 | if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) { |
7892 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
7893 | 0 | char *packet_str; |
7894 | 0 | struct ofpbuf key; |
7895 | 0 | struct odp_flow_key_parms odp_parms = { |
7896 | 0 | .flow = flow, |
7897 | 0 | .mask = wc ? &wc->masks : NULL, |
7898 | 0 | .support = dp_netdev_support, |
7899 | 0 | }; |
7900 | |
|
7901 | 0 | ofpbuf_init(&key, 0); |
7902 | 0 | odp_flow_key_from_flow(&odp_parms, &key); |
7903 | 0 | packet_str = ofp_dp_packet_to_string(packet_); |
7904 | |
|
7905 | 0 | odp_flow_key_format(key.data, key.size, &ds); |
7906 | |
|
7907 | 0 | VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name, |
7908 | 0 | dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str); |
7909 | |
|
7910 | 0 | ofpbuf_uninit(&key); |
7911 | 0 | free(packet_str); |
7912 | |
|
7913 | 0 | ds_destroy(&ds); |
7914 | 0 | } |
7915 | |
|
7916 | 0 | return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata, |
7917 | 0 | actions, wc, put_actions, dp->upcall_aux); |
7918 | 0 | } |
7919 | | |
7920 | | static inline uint32_t |
7921 | | dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, |
7922 | | const struct miniflow *mf) |
7923 | 0 | { |
7924 | 0 | uint32_t hash, recirc_depth; |
7925 | |
|
7926 | 0 | if (OVS_LIKELY(dp_packet_rss_valid(packet))) { |
7927 | 0 | hash = dp_packet_get_rss_hash(packet); |
7928 | 0 | } else { |
7929 | 0 | hash = miniflow_hash_5tuple(mf, 0); |
7930 | 0 | dp_packet_set_rss_hash(packet, hash); |
7931 | 0 | } |
7932 | | |
7933 | | /* The RSS hash must account for the recirculation depth to avoid |
7934 | | * collisions in the exact match cache. */
7935 | 0 | recirc_depth = *recirc_depth_get_unsafe(); |
7936 | 0 | if (OVS_UNLIKELY(recirc_depth)) { |
7937 | 0 | hash = hash_finish(hash, recirc_depth); |
7938 | 0 | } |
7939 | 0 | return hash; |
7940 | 0 | } |
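As the comment notes, the recirculation depth is folded into the hash so that the same 5-tuple seen at different depths maps to different cache entries. A tiny illustrative helper ('example_hash_with_depth' is hypothetical) showing the mixing step in isolation:

/* Illustrative sketch: mixing the recirculation depth into the hash keeps
 * cache entries for the same 5-tuple at different recirculation depths
 * from colliding. */
static uint32_t
example_hash_with_depth(uint32_t rss_hash, uint32_t recirc_depth)
{
    return recirc_depth ? hash_finish(rss_hash, recirc_depth) : rss_hash;
}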
7941 | | |
7942 | | struct packet_batch_per_flow { |
7943 | | unsigned int byte_count; |
7944 | | uint16_t tcp_flags; |
7945 | | struct dp_netdev_flow *flow; |
7946 | | |
7947 | | struct dp_packet_batch array; |
7948 | | }; |
7949 | | |
7950 | | static inline void |
7951 | | packet_batch_per_flow_update(struct packet_batch_per_flow *batch, |
7952 | | struct dp_packet *packet, |
7953 | | uint16_t tcp_flags) |
7954 | 0 | { |
7955 | 0 | batch->byte_count += dp_packet_size(packet); |
7956 | 0 | batch->tcp_flags |= tcp_flags; |
7957 | 0 | dp_packet_batch_add(&batch->array, packet); |
7958 | 0 | } |
7959 | | |
7960 | | static inline void |
7961 | | packet_batch_per_flow_init(struct packet_batch_per_flow *batch, |
7962 | | struct dp_netdev_flow *flow) |
7963 | 0 | { |
7964 | 0 | flow->batch = batch; |
7965 | |
|
7966 | 0 | batch->flow = flow; |
7967 | 0 | dp_packet_batch_init(&batch->array); |
7968 | 0 | batch->byte_count = 0; |
7969 | 0 | batch->tcp_flags = 0; |
7970 | 0 | } |
7971 | | |
7972 | | static inline void |
7973 | | packet_batch_per_flow_execute(struct packet_batch_per_flow *batch, |
7974 | | struct dp_netdev_pmd_thread *pmd) |
7975 | 0 | { |
7976 | 0 | struct dp_netdev_actions *actions; |
7977 | 0 | struct dp_netdev_flow *flow = batch->flow; |
7978 | |
|
7979 | 0 | dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array), |
7980 | 0 | batch->byte_count, |
7981 | 0 | batch->tcp_flags, pmd->ctx.now / 1000); |
7982 | |
|
7983 | 0 | actions = dp_netdev_flow_get_actions(flow); |
7984 | |
|
7985 | 0 | dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow, |
7986 | 0 | actions->actions, actions->size); |
7987 | 0 | } |
7988 | | |
7989 | | void |
7990 | | dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd, |
7991 | | struct dp_packet_batch *packets, |
7992 | | struct dpcls_rule *rule, |
7993 | | uint32_t bytes, |
7994 | | uint16_t tcp_flags) |
7995 | 0 | { |
7996 | | /* Gets action* from the rule. */ |
7997 | 0 | struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule); |
7998 | 0 | struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow); |
7999 | |
|
8000 | 0 | dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes, |
8001 | 0 | tcp_flags, pmd->ctx.now / 1000); |
8002 | 0 | const uint32_t steal = 1; |
8003 | 0 | dp_netdev_execute_actions(pmd, packets, steal, &flow->flow, |
8004 | 0 | actions->actions, actions->size); |
8005 | 0 | } |
8006 | | |
8007 | | static inline void |
8008 | | dp_netdev_queue_batches(struct dp_packet *pkt, |
8009 | | struct dp_netdev_flow *flow, uint16_t tcp_flags, |
8010 | | struct packet_batch_per_flow *batches, |
8011 | | size_t *n_batches) |
8012 | 0 | { |
8013 | 0 | struct packet_batch_per_flow *batch = flow->batch; |
8014 | |
|
8015 | 0 | if (OVS_UNLIKELY(!batch)) { |
8016 | 0 | batch = &batches[(*n_batches)++]; |
8017 | 0 | packet_batch_per_flow_init(batch, flow); |
8018 | 0 | } |
8019 | |
|
8020 | 0 | packet_batch_per_flow_update(batch, pkt, tcp_flags); |
8021 | 0 | } |
8022 | | |
8023 | | static inline void |
8024 | | packet_enqueue_to_flow_map(struct dp_packet *packet, |
8025 | | struct dp_netdev_flow *flow, |
8026 | | uint16_t tcp_flags, |
8027 | | struct dp_packet_flow_map *flow_map, |
8028 | | size_t index) |
8029 | 0 | { |
8030 | 0 | struct dp_packet_flow_map *map = &flow_map[index]; |
8031 | 0 | map->flow = flow; |
8032 | 0 | map->packet = packet; |
8033 | 0 | map->tcp_flags = tcp_flags; |
8034 | 0 | } |
8035 | | |
8036 | | /* SMC lookup function for a batch of packets. |
8037 | | * By batching SMC lookups, we can use prefetching
8038 | | * to hide memory access latency. |
8039 | | */ |
8040 | | static inline void |
8041 | | smc_lookup_batch(struct dp_netdev_pmd_thread *pmd, |
8042 | | struct netdev_flow_key *keys, |
8043 | | struct netdev_flow_key **missed_keys, |
8044 | | struct dp_packet_batch *packets_, |
8045 | | const int cnt, |
8046 | | struct dp_packet_flow_map *flow_map, |
8047 | | uint8_t *index_map) |
8048 | 0 | { |
8049 | 0 | int i; |
8050 | 0 | struct dp_packet *packet; |
8051 | 0 | size_t n_smc_hit = 0, n_missed = 0; |
8052 | 0 | struct dfc_cache *cache = &pmd->flow_cache; |
8053 | 0 | struct smc_cache *smc_cache = &cache->smc_cache; |
8054 | 0 | const struct cmap_node *flow_node; |
8055 | 0 | int recv_idx; |
8056 | 0 | uint16_t tcp_flags; |
8057 | | |
8058 | | /* Prefetch buckets for all packets */ |
8059 | 0 | for (i = 0; i < cnt; i++) { |
8060 | 0 | OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]); |
8061 | 0 | } |
8062 | |
|
8063 | 0 | DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) { |
8064 | 0 | struct dp_netdev_flow *flow = NULL; |
8065 | 0 | flow_node = smc_entry_get(pmd, keys[i].hash); |
8066 | 0 | bool hit = false; |
8067 | | /* Get the original order of this packet in received batch. */ |
8068 | 0 | recv_idx = index_map[i]; |
8069 | |
|
8070 | 0 | if (OVS_LIKELY(flow_node != NULL)) { |
8071 | 0 | CMAP_NODE_FOR_EACH (flow, node, flow_node) { |
8072 | | /* Since we don't have per-port megaflow to check the port
8073 | | * number, we need to verify that the input ports match. */ |
8074 | 0 | if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) && |
8075 | 0 | flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) { |
8076 | 0 | tcp_flags = miniflow_get_tcp_flags(&keys[i].mf); |
8077 | | |
8078 | | /* SMC hit and EMC miss; insert into EMC. */
8079 | 0 | keys[i].len = |
8080 | 0 | netdev_flow_key_size(miniflow_n_values(&keys[i].mf)); |
8081 | 0 | emc_probabilistic_insert(pmd, &keys[i], flow); |
8082 | | /* Add these packets into the flow map in the same order |
8083 | | * as received. |
8084 | | */ |
8085 | 0 | packet_enqueue_to_flow_map(packet, flow, tcp_flags, |
8086 | 0 | flow_map, recv_idx); |
8087 | 0 | n_smc_hit++; |
8088 | 0 | hit = true; |
8089 | 0 | break; |
8090 | 0 | } |
8091 | 0 | } |
8092 | 0 | if (hit) { |
8093 | 0 | continue; |
8094 | 0 | } |
8095 | 0 | } |
8096 | | |
8097 | | /* SMC missed. Group missed packets together at |
8098 | | * the beginning of the 'packets' array. */ |
8099 | 0 | dp_packet_batch_refill(packets_, packet, i); |
8100 | | |
8101 | | /* Preserve the order of packet for flow batching. */ |
8102 | 0 | index_map[n_missed] = recv_idx; |
8103 | | |
8104 | | /* Put missed keys into the pointer array returned to the caller. */
8105 | 0 | missed_keys[n_missed++] = &keys[i]; |
8106 | 0 | } |
8107 | |
|
8108 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit); |
8109 | 0 | } |
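The function above relies on a prefetch-then-probe pattern: all bucket prefetches are issued before any lookup so the memory loads overlap. A standalone sketch of that pattern with a hypothetical table layout ('sketch_bucket' and 'sketch_batched_probe' are not part of the source):

/* Standalone sketch of the prefetch-then-probe pattern used above. */
struct sketch_bucket {
    uint32_t sig;
};

static int
sketch_batched_probe(const struct sketch_bucket table[], uint32_t mask,
                     const uint32_t hashes[], int cnt)
{
    int hits = 0;

    for (int i = 0; i < cnt; i++) {
        OVS_PREFETCH(&table[hashes[i] & mask]);   /* Phase 1: start loads. */
    }
    for (int i = 0; i < cnt; i++) {
        /* Phase 2: by now the cache lines are likely resident. */
        hits += table[hashes[i] & mask].sig == hashes[i];
    }
    return hits;
}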
8110 | | |
8111 | | struct dp_netdev_flow * |
8112 | | smc_lookup_single(struct dp_netdev_pmd_thread *pmd, |
8113 | | struct dp_packet *packet, |
8114 | | struct netdev_flow_key *key) |
8115 | 0 | { |
8116 | 0 | const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash); |
8117 | |
|
8118 | 0 | if (OVS_LIKELY(flow_node != NULL)) { |
8119 | 0 | struct dp_netdev_flow *flow = NULL; |
8120 | |
|
8121 | 0 | CMAP_NODE_FOR_EACH (flow, node, flow_node) { |
8122 | | /* Since we don't have per-port megaflow to check the port
8123 | | * number, we need to verify that the input ports match. */ |
8124 | 0 | if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) && |
8125 | 0 | flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) { |
8126 | |
|
8127 | 0 | return (void *) flow; |
8128 | 0 | } |
8129 | 0 | } |
8130 | 0 | } |
8131 | | |
8132 | 0 | return NULL; |
8133 | 0 | } |
8134 | | |
8135 | | inline int |
8136 | | dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd, |
8137 | | struct dp_packet *packet, |
8138 | | struct dp_netdev_flow **flow) |
8139 | 0 | { |
8140 | 0 | uint32_t mark; |
8141 | |
|
8142 | | #ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */ |
8143 | | /* Restore the packet if HW processing was terminated before completion. */ |
8144 | | struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq; |
8145 | | bool miss_api_supported; |
8146 | | |
8147 | | atomic_read_relaxed(&rxq->port->netdev->hw_info.miss_api_supported, |
8148 | | &miss_api_supported); |
8149 | | if (miss_api_supported) { |
8150 | | int err = netdev_hw_miss_packet_recover(rxq->port->netdev, packet); |
8151 | | if (err && err != EOPNOTSUPP) { |
8152 | | COVERAGE_INC(datapath_drop_hw_miss_recover); |
8153 | | return -1; |
8154 | | } |
8155 | | } |
8156 | | #endif |
8157 | | |
8158 | | /* If no mark, no flow to find. */ |
8159 | 0 | if (!dp_packet_has_flow_mark(packet, &mark)) { |
8160 | 0 | *flow = NULL; |
8161 | 0 | return 0; |
8162 | 0 | } |
8163 | | |
8164 | 0 | *flow = mark_to_flow_find(pmd, mark); |
8165 | 0 | return 0; |
8166 | 0 | } |
8167 | | |
8168 | | /* Enqueues an already classified packet into per-flow batches or the flow map,
8169 | | * depending on whether batching is enabled. */
8170 | | static inline void |
8171 | | dfc_processing_enqueue_classified_packet(struct dp_packet *packet, |
8172 | | struct dp_netdev_flow *flow, |
8173 | | uint16_t tcp_flags, |
8174 | | bool batch_enable, |
8175 | | struct packet_batch_per_flow *batches, |
8176 | | size_t *n_batches, |
8177 | | struct dp_packet_flow_map *flow_map, |
8178 | | size_t *map_cnt) |
8179 | | |
8180 | 0 | { |
8181 | 0 | if (OVS_LIKELY(batch_enable)) { |
8182 | 0 | dp_netdev_queue_batches(packet, flow, tcp_flags, batches, |
8183 | 0 | n_batches); |
8184 | 0 | } else { |
8185 | | /* Flow batching should be performed only after fast-path
8186 | | * processing has also completed for packets that missed the EMC;
8187 | | * otherwise packets belonging to the same datapath flow
8188 | | * could be reordered. */
8189 | 0 | packet_enqueue_to_flow_map(packet, flow, tcp_flags, |
8190 | 0 | flow_map, (*map_cnt)++); |
8191 | 0 | } |
8192 | |
|
8193 | 0 | } |
8194 | | |
8195 | | /* Try to process all ('cnt') the 'packets' using only the datapath flow cache |
8196 | | * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the |
8197 | | * miniflow is copied into 'keys' and the packet pointer is moved to the
8198 | | * beginning of the 'packets' array. The pointers of missed keys are put in the |
8199 | | * missed_keys pointer array for future processing. |
8200 | | * |
8201 | | * The function returns the number of packets that need to be processed in the
8202 | | * 'packets' array (they have been moved to the beginning of the vector). |
8203 | | * |
8204 | | * For performance reasons a caller may choose not to initialize the metadata |
8205 | | * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets' |
8206 | | * is not valid and must be initialized by this function using 'port_no'. |
8207 | | * If 'md_is_valid' is true, the metadata is already valid and 'port_no' |
8208 | | * will be ignored. |
8209 | | */ |
8210 | | static inline size_t |
8211 | | dfc_processing(struct dp_netdev_pmd_thread *pmd, |
8212 | | struct dp_packet_batch *packets_, |
8213 | | struct netdev_flow_key *keys, |
8214 | | struct netdev_flow_key **missed_keys, |
8215 | | struct packet_batch_per_flow batches[], size_t *n_batches, |
8216 | | struct dp_packet_flow_map *flow_map, |
8217 | | size_t *n_flows, uint8_t *index_map, |
8218 | | bool md_is_valid, odp_port_t port_no) |
8219 | 0 | { |
8220 | 0 | const bool netdev_flow_api = netdev_is_flow_api_enabled(); |
8221 | 0 | const uint32_t recirc_depth = *recirc_depth_get(); |
8222 | 0 | const size_t cnt = dp_packet_batch_size(packets_); |
8223 | 0 | size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0; |
8224 | 0 | size_t n_mfex_opt_hit = 0, n_simple_hit = 0; |
8225 | 0 | struct dfc_cache *cache = &pmd->flow_cache; |
8226 | 0 | struct netdev_flow_key *key = &keys[0]; |
8227 | 0 | struct dp_packet *packet; |
8228 | 0 | size_t map_cnt = 0; |
8229 | 0 | bool batch_enable = true; |
8230 | |
|
8231 | 0 | const bool simple_match_enabled = |
8232 | 0 | !md_is_valid && dp_netdev_simple_match_enabled(pmd, port_no); |
8233 | | /* 'simple_match_table' is a full flow table. If the flow is not there, |
8234 | | * upcall is required, and there is no chance to find a match in caches. */ |
8235 | 0 | const bool smc_enable_db = !simple_match_enabled && pmd->ctx.smc_enable_db; |
8236 | 0 | const uint32_t cur_min = simple_match_enabled |
8237 | 0 | ? 0 : pmd->ctx.emc_insert_min; |
8238 | |
|
8239 | 0 | pmd_perf_update_counter(&pmd->perf_stats, |
8240 | 0 | md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV, |
8241 | 0 | cnt); |
8242 | 0 | int i; |
8243 | 0 | DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) { |
8244 | 0 | struct dp_netdev_flow *flow = NULL; |
8245 | 0 | uint16_t tcp_flags; |
8246 | |
|
8247 | 0 | if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) { |
8248 | 0 | dp_packet_delete(packet); |
8249 | 0 | COVERAGE_INC(datapath_drop_rx_invalid_packet); |
8250 | 0 | continue; |
8251 | 0 | } |
8252 | | |
8253 | 0 | if (i != cnt - 1) { |
8254 | 0 | struct dp_packet **packets = packets_->packets; |
8255 | | /* Prefetch next packet data and metadata. */ |
8256 | 0 | OVS_PREFETCH(dp_packet_data(packets[i+1])); |
8257 | 0 | pkt_metadata_prefetch_init(&packets[i+1]->md); |
8258 | 0 | } |
8259 | |
|
8260 | 0 | if (!md_is_valid) { |
8261 | 0 | pkt_metadata_init(&packet->md, port_no); |
8262 | 0 | } |
8263 | |
|
8264 | 0 | if (netdev_flow_api && recirc_depth == 0) { |
8265 | 0 | if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &flow))) { |
8266 | | /* Packet restoration failed and it was dropped, do not |
8267 | | * continue processing. |
8268 | | */ |
8269 | 0 | continue; |
8270 | 0 | } |
8271 | 0 | if (OVS_LIKELY(flow)) { |
8272 | 0 | tcp_flags = parse_tcp_flags(packet, NULL, NULL, NULL); |
8273 | 0 | n_phwol_hit++; |
8274 | 0 | dfc_processing_enqueue_classified_packet( |
8275 | 0 | packet, flow, tcp_flags, batch_enable, |
8276 | 0 | batches, n_batches, flow_map, &map_cnt); |
8277 | 0 | continue; |
8278 | 0 | } |
8279 | 0 | } |
8280 | | |
8281 | 0 | if (!flow && simple_match_enabled) { |
8282 | 0 | ovs_be16 dl_type = 0, vlan_tci = 0; |
8283 | 0 | uint8_t nw_frag = 0; |
8284 | |
|
8285 | 0 | tcp_flags = parse_tcp_flags(packet, &dl_type, &nw_frag, &vlan_tci); |
8286 | 0 | flow = dp_netdev_simple_match_lookup(pmd, port_no, dl_type, |
8287 | 0 | nw_frag, vlan_tci); |
8288 | 0 | if (OVS_LIKELY(flow)) { |
8289 | 0 | n_simple_hit++; |
8290 | 0 | dfc_processing_enqueue_classified_packet( |
8291 | 0 | packet, flow, tcp_flags, batch_enable, |
8292 | 0 | batches, n_batches, flow_map, &map_cnt); |
8293 | 0 | continue; |
8294 | 0 | } |
8295 | 0 | } |
8296 | | |
8297 | 0 | miniflow_extract(packet, &key->mf); |
8298 | 0 | key->len = 0; /* Not computed yet. */ |
8299 | 0 | key->hash = |
8300 | 0 | (md_is_valid == false) |
8301 | 0 | ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf) |
8302 | 0 | : dpif_netdev_packet_get_rss_hash(packet, &key->mf); |
8303 | | |
8304 | | /* If EMC is disabled, skip emc_lookup. */
8305 | 0 | flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL; |
8306 | 0 | if (OVS_LIKELY(flow)) { |
8307 | 0 | tcp_flags = miniflow_get_tcp_flags(&key->mf); |
8308 | 0 | n_emc_hit++; |
8309 | 0 | dfc_processing_enqueue_classified_packet( |
8310 | 0 | packet, flow, tcp_flags, batch_enable, |
8311 | 0 | batches, n_batches, flow_map, &map_cnt); |
8312 | 0 | } else { |
8313 | | /* Exact match cache missed. Group missed packets together at |
8314 | | * the beginning of the 'packets' array. */ |
8315 | 0 | dp_packet_batch_refill(packets_, packet, i); |
8316 | | |
8317 | | /* Preserve the order of packet for flow batching. */ |
8318 | 0 | index_map[n_missed] = map_cnt; |
8319 | 0 | flow_map[map_cnt++].flow = NULL; |
8320 | | |
8321 | | /* 'key[n_missed]' contains the key of the current packet and it |
8322 | | * will be passed to SMC lookup. The next key should be extracted |
8323 | | * to 'keys[n_missed + 1]'. |
8324 | | * We also maintain a pointer array to keys that missed both SMC and EMC,
8325 | | * which will be returned to the caller for future processing. */ |
8326 | 0 | missed_keys[n_missed] = key; |
8327 | 0 | key = &keys[++n_missed]; |
8328 | | |
8329 | | /* Skip batching for subsequent packets to avoid reordering. */ |
8330 | 0 | batch_enable = false; |
8331 | 0 | } |
8332 | 0 | } |
8333 | | /* Count of packets which are not flow batched. */ |
8334 | 0 | *n_flows = map_cnt; |
8335 | |
|
8336 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, n_phwol_hit); |
8337 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MFEX_OPT_HIT, |
8338 | 0 | n_mfex_opt_hit); |
8339 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_HIT, |
8340 | 0 | n_simple_hit); |
8341 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit); |
8342 | |
|
8343 | 0 | if (!smc_enable_db) { |
8344 | 0 | return dp_packet_batch_size(packets_); |
8345 | 0 | } |
8346 | | |
8347 | | /* Packets that missed the EMC will do a batch lookup in the SMC if enabled. */
8348 | 0 | smc_lookup_batch(pmd, keys, missed_keys, packets_, |
8349 | 0 | n_missed, flow_map, index_map); |
8350 | |
|
8351 | 0 | return dp_packet_batch_size(packets_); |
8352 | 0 | } |
8353 | | |
8354 | | static inline int |
8355 | | handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, |
8356 | | struct dp_packet *packet, |
8357 | | const struct netdev_flow_key *key, |
8358 | | struct ofpbuf *actions, struct ofpbuf *put_actions) |
8359 | 0 | { |
8360 | 0 | struct ofpbuf *add_actions; |
8361 | 0 | struct dp_packet_batch b; |
8362 | 0 | struct match match; |
8363 | 0 | ovs_u128 ufid; |
8364 | 0 | int error; |
8365 | 0 | uint64_t cycles = cycles_counter_update(&pmd->perf_stats); |
8366 | 0 | odp_port_t orig_in_port = packet->md.orig_in_port; |
8367 | |
|
8368 | 0 | match.tun_md.valid = false; |
8369 | 0 | miniflow_expand(&key->mf, &match.flow); |
8370 | 0 | memset(&match.wc, 0, sizeof match.wc); |
8371 | |
|
8372 | 0 | ofpbuf_clear(actions); |
8373 | 0 | ofpbuf_clear(put_actions); |
8374 | |
|
8375 | 0 | odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid); |
8376 | 0 | error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc, |
8377 | 0 | &ufid, DPIF_UC_MISS, NULL, actions, |
8378 | 0 | put_actions); |
8379 | 0 | if (OVS_UNLIKELY(error && error != ENOSPC)) { |
8380 | 0 | dp_packet_delete(packet); |
8381 | 0 | COVERAGE_INC(datapath_drop_upcall_error); |
8382 | 0 | return error; |
8383 | 0 | } |
8384 | | |
8385 | | /* The Netlink encoding of datapath flow keys cannot express |
8386 | | * wildcarding the presence of a VLAN tag. Instead, a missing VLAN |
8387 | | * tag is interpreted as exact match on the fact that there is no |
8388 | | * VLAN. Unless we refactor a lot of code that translates between |
8389 | | * Netlink and struct flow representations, we have to do the same |
8390 | | * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */ |
8391 | 0 | if (!match.wc.masks.vlans[0].tci) { |
8392 | 0 | match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI); |
8393 | 0 | } |
8394 | | |
8395 | | /* We can't allow the packet batching in the next loop to execute |
8396 | | * the actions. Otherwise, if there are any slow path actions, |
8397 | | * we'll send the packet up twice. */ |
8398 | 0 | dp_packet_batch_init_packet(&b, packet); |
8399 | 0 | dp_netdev_execute_actions(pmd, &b, true, &match.flow, |
8400 | 0 | actions->data, actions->size); |
8401 | |
|
8402 | 0 | add_actions = put_actions->size ? put_actions : actions; |
8403 | 0 | if (OVS_LIKELY(error != ENOSPC)) { |
8404 | 0 | struct dp_netdev_flow *netdev_flow; |
8405 | | |
8406 | | /* XXX: There's a race window where a flow covering this packet |
8407 | | * could have already been installed since we last did the flow |
8408 | | * lookup before upcall. This could be solved by moving the |
8409 | | * mutex lock outside the loop, but that's an awful long time |
8410 | | * to be locking revalidators out of making flow modifications. */ |
8411 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
8412 | 0 | netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); |
8413 | 0 | if (OVS_LIKELY(!netdev_flow)) { |
8414 | 0 | netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid, |
8415 | 0 | add_actions->data, |
8416 | 0 | add_actions->size, orig_in_port); |
8417 | 0 | } |
8418 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
8419 | 0 | uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid); |
8420 | 0 | smc_insert(pmd, key, hash); |
8421 | 0 | emc_probabilistic_insert(pmd, key, netdev_flow); |
8422 | 0 | } |
8423 | 0 | if (pmd_perf_metrics_enabled(pmd)) { |
8424 | | /* Update upcall stats. */ |
8425 | 0 | cycles = cycles_counter_update(&pmd->perf_stats) - cycles; |
8426 | 0 | struct pmd_perf_stats *s = &pmd->perf_stats; |
8427 | 0 | s->current.upcalls++; |
8428 | 0 | s->current.upcall_cycles += cycles; |
8429 | 0 | histogram_add_sample(&s->cycles_per_upcall, cycles); |
8430 | 0 | } |
8431 | 0 | return error; |
8432 | 0 | } |
8433 | | |
8434 | | static inline void |
8435 | | fast_path_processing(struct dp_netdev_pmd_thread *pmd, |
8436 | | struct dp_packet_batch *packets_, |
8437 | | struct netdev_flow_key **keys, |
8438 | | struct dp_packet_flow_map *flow_map, |
8439 | | uint8_t *index_map, |
8440 | | odp_port_t in_port) |
8441 | 0 | { |
8442 | 0 | const size_t cnt = dp_packet_batch_size(packets_); |
8443 | 0 | #if !defined(__CHECKER__) && !defined(_WIN32) |
8444 | 0 | const size_t PKT_ARRAY_SIZE = cnt; |
8445 | | #else |
8446 | | /* Sparse or MSVC doesn't like variable length array. */ |
8447 | | enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST }; |
8448 | | #endif |
8449 | 0 | struct dp_packet *packet; |
8450 | 0 | struct dpcls *cls; |
8451 | 0 | struct dpcls_rule *rules[PKT_ARRAY_SIZE]; |
8452 | 0 | struct dp_netdev *dp = pmd->dp; |
8453 | 0 | int upcall_ok_cnt = 0, upcall_fail_cnt = 0; |
8454 | 0 | int lookup_cnt = 0, add_lookup_cnt; |
8455 | 0 | bool any_miss; |
8456 | |
|
8457 | 0 | for (size_t i = 0; i < cnt; i++) { |
8458 | | /* Key length is needed in all cases; the hash is computed on demand. */
8459 | 0 | keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf)); |
8460 | 0 | } |
8461 | | /* Get the classifier for the in_port */ |
8462 | 0 | cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
8463 | 0 | if (OVS_LIKELY(cls)) { |
8464 | 0 | any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys, |
8465 | 0 | rules, cnt, &lookup_cnt); |
8466 | 0 | } else { |
8467 | 0 | any_miss = true; |
8468 | 0 | memset(rules, 0, sizeof(rules)); |
8469 | 0 | } |
8470 | 0 | if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) { |
8471 | 0 | uint64_t actions_stub[512 / 8], slow_stub[512 / 8]; |
8472 | 0 | struct ofpbuf actions, put_actions; |
8473 | |
|
8474 | 0 | ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub); |
8475 | 0 | ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub); |
8476 | |
|
8477 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8478 | 0 | struct dp_netdev_flow *netdev_flow; |
8479 | |
|
8480 | 0 | if (OVS_LIKELY(rules[i])) { |
8481 | 0 | continue; |
8482 | 0 | } |
8483 | | |
8484 | | /* It's possible that an earlier slow path execution installed |
8485 | | * a rule covering this flow. In this case, it's a lot cheaper |
8486 | | * to catch it here than execute a miss. */ |
8487 | 0 | netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i], |
8488 | 0 | &add_lookup_cnt); |
8489 | 0 | if (netdev_flow) { |
8490 | 0 | lookup_cnt += add_lookup_cnt; |
8491 | 0 | rules[i] = &netdev_flow->cr; |
8492 | 0 | continue; |
8493 | 0 | } |
8494 | | |
8495 | 0 | int error = handle_packet_upcall(pmd, packet, keys[i], |
8496 | 0 | &actions, &put_actions); |
8497 | |
|
8498 | 0 | if (OVS_UNLIKELY(error)) { |
8499 | 0 | upcall_fail_cnt++; |
8500 | 0 | } else { |
8501 | 0 | upcall_ok_cnt++; |
8502 | 0 | } |
8503 | 0 | } |
8504 | |
|
8505 | 0 | ofpbuf_uninit(&actions); |
8506 | 0 | ofpbuf_uninit(&put_actions); |
8507 | 0 | fat_rwlock_unlock(&dp->upcall_rwlock); |
8508 | 0 | } else if (OVS_UNLIKELY(any_miss)) { |
8509 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8510 | 0 | if (OVS_UNLIKELY(!rules[i])) { |
8511 | 0 | dp_packet_delete(packet); |
8512 | 0 | COVERAGE_INC(datapath_drop_lock_error); |
8513 | 0 | upcall_fail_cnt++; |
8514 | 0 | } |
8515 | 0 | } |
8516 | 0 | } |
8517 | |
|
8518 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8519 | 0 | struct dp_netdev_flow *flow; |
8520 | | /* Get the original order of this packet in received batch. */ |
8521 | 0 | int recv_idx = index_map[i]; |
8522 | 0 | uint16_t tcp_flags; |
8523 | |
|
8524 | 0 | if (OVS_UNLIKELY(!rules[i])) { |
8525 | 0 | continue; |
8526 | 0 | } |
8527 | | |
8528 | 0 | flow = dp_netdev_flow_cast(rules[i]); |
8529 | 0 | uint32_t hash = dp_netdev_flow_hash(&flow->ufid); |
8530 | 0 | smc_insert(pmd, keys[i], hash); |
8531 | |
|
8532 | 0 | emc_probabilistic_insert(pmd, keys[i], flow); |
8533 | | /* Add these packets into the flow map in the same order |
8534 | | * as received. |
8535 | | */ |
8536 | 0 | tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf); |
8537 | 0 | packet_enqueue_to_flow_map(packet, flow, tcp_flags, |
8538 | 0 | flow_map, recv_idx); |
8539 | 0 | } |
8540 | |
|
8541 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT, |
8542 | 0 | cnt - upcall_ok_cnt - upcall_fail_cnt); |
8543 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP, |
8544 | 0 | lookup_cnt); |
8545 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS, |
8546 | 0 | upcall_ok_cnt); |
8547 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST, |
8548 | 0 | upcall_fail_cnt); |
8549 | 0 | } |
8550 | | |
8551 | | /* Packets enter the datapath from a port (or from recirculation) here. |
8552 | | * |
8553 | | * When 'md_is_valid' is true the metadata in 'packets' are already valid. |
8554 | | * When false the metadata in 'packets' need to be initialized. */ |
8555 | | static void |
8556 | | dp_netdev_input__(struct dp_netdev_pmd_thread *pmd, |
8557 | | struct dp_packet_batch *packets, |
8558 | | bool md_is_valid, odp_port_t port_no) |
8559 | 0 | { |
8560 | 0 | #if !defined(__CHECKER__) && !defined(_WIN32) |
8561 | 0 | const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets); |
8562 | | #else |
8563 | | /* Sparse or MSVC doesn't like variable length array. */ |
8564 | | enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST }; |
8565 | | #endif |
8566 | 0 | OVS_ALIGNED_VAR(CACHE_LINE_SIZE) |
8567 | 0 | struct netdev_flow_key keys[PKT_ARRAY_SIZE]; |
8568 | 0 | struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE]; |
8569 | 0 | struct packet_batch_per_flow batches[PKT_ARRAY_SIZE]; |
8570 | 0 | size_t n_batches; |
8571 | 0 | struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE]; |
8572 | 0 | uint8_t index_map[PKT_ARRAY_SIZE]; |
8573 | 0 | size_t n_flows, i; |
8574 | |
|
8575 | 0 | odp_port_t in_port; |
8576 | |
|
8577 | 0 | n_batches = 0; |
8578 | 0 | dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches, |
8579 | 0 | flow_map, &n_flows, index_map, md_is_valid, port_no); |
8580 | |
|
8581 | 0 | if (!dp_packet_batch_is_empty(packets)) { |
8582 | | /* Get ingress port from first packet's metadata. */ |
8583 | 0 | in_port = packets->packets[0]->md.in_port.odp_port; |
8584 | 0 | fast_path_processing(pmd, packets, missed_keys, |
8585 | 0 | flow_map, index_map, in_port); |
8586 | 0 | } |
8587 | | |
8588 | | /* Batch rest of packets which are in flow map. */ |
8589 | 0 | for (i = 0; i < n_flows; i++) { |
8590 | 0 | struct dp_packet_flow_map *map = &flow_map[i]; |
8591 | |
|
8592 | 0 | if (OVS_UNLIKELY(!map->flow)) { |
8593 | 0 | continue; |
8594 | 0 | } |
8595 | 0 | dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags, |
8596 | 0 | batches, &n_batches); |
8597 | 0 | } |
8598 | | |
8599 | | /* All the flow batches need to be reset before any call to |
8600 | | * packet_batch_per_flow_execute() as it could potentially trigger |
8601 | | * recirculation. When a packet matching flow 'j' happens to be |
8602 | | * recirculated, the nested call to dp_netdev_input__() could potentially |
8603 | | * classify the packet as matching another flow - say 'k'. It could happen |
8604 | | * that in the previous call to dp_netdev_input__() that same flow 'k' had |
8605 | | * already its own batches[k] still waiting to be served. So if its |
8606 | | * 'batch' member is not reset, the recirculated packet would be wrongly |
8607 | | * appended to batches[k] of the 1st call to dp_netdev_input__(). */ |
8608 | 0 | for (i = 0; i < n_batches; i++) { |
8609 | 0 | batches[i].flow->batch = NULL; |
8610 | 0 | } |
8611 | |
|
8612 | 0 | for (i = 0; i < n_batches; i++) { |
8613 | 0 | packet_batch_per_flow_execute(&batches[i], pmd); |
8614 | 0 | } |
8615 | 0 | } |
8616 | | |
8617 | | int32_t |
8618 | | dp_netdev_input(struct dp_netdev_pmd_thread *pmd, |
8619 | | struct dp_packet_batch *packets, |
8620 | | odp_port_t port_no) |
8621 | 0 | { |
8622 | 0 | dp_netdev_input__(pmd, packets, false, port_no); |
8623 | 0 | return 0; |
8624 | 0 | } |
8625 | | |
8626 | | static void |
8627 | | dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd, |
8628 | | struct dp_packet_batch *packets) |
8629 | 0 | { |
8630 | 0 | dp_netdev_input__(pmd, packets, true, 0); |
8631 | 0 | } |
8632 | | |
8633 | | struct dp_netdev_execute_aux { |
8634 | | struct dp_netdev_pmd_thread *pmd; |
8635 | | const struct flow *flow; |
8636 | | }; |
8637 | | |
8638 | | static void |
8639 | | dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb, |
8640 | | void *aux) |
8641 | 0 | { |
8642 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8643 | 0 | dp->dp_purge_aux = aux; |
8644 | 0 | dp->dp_purge_cb = cb; |
8645 | 0 | } |
8646 | | |
8647 | | static void |
8648 | | dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb, |
8649 | | void *aux) |
8650 | 0 | { |
8651 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8652 | 0 | dp->upcall_aux = aux; |
8653 | 0 | dp->upcall_cb = cb; |
8654 | 0 | } |
8655 | | |
8656 | | static void |
8657 | | dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd, |
8658 | | bool purge) |
8659 | 0 | { |
8660 | 0 | struct tx_port *tx; |
8661 | 0 | struct dp_netdev_port *port; |
8662 | 0 | long long interval; |
8663 | |
|
8664 | 0 | HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) { |
8665 | 0 | if (tx->port->txq_mode != TXQ_MODE_XPS) { |
8666 | 0 | continue; |
8667 | 0 | } |
8668 | 0 | interval = pmd->ctx.now - tx->last_used; |
8669 | 0 | if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) { |
8670 | 0 | port = tx->port; |
8671 | 0 | ovs_mutex_lock(&port->txq_used_mutex); |
8672 | 0 | port->txq_used[tx->qid]--; |
8673 | 0 | ovs_mutex_unlock(&port->txq_used_mutex); |
8674 | 0 | tx->qid = -1; |
8675 | 0 | } |
8676 | 0 | } |
8677 | 0 | } |
8678 | | |
8679 | | static int |
8680 | | dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd, |
8681 | | struct tx_port *tx) |
8682 | 0 | { |
8683 | 0 | struct dp_netdev_port *port; |
8684 | 0 | long long interval; |
8685 | 0 | int i, min_cnt, min_qid; |
8686 | |
|
8687 | 0 | interval = pmd->ctx.now - tx->last_used; |
8688 | 0 | tx->last_used = pmd->ctx.now; |
8689 | |
|
8690 | 0 | if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) { |
8691 | 0 | return tx->qid; |
8692 | 0 | } |
8693 | | |
8694 | 0 | port = tx->port; |
8695 | |
|
8696 | 0 | ovs_mutex_lock(&port->txq_used_mutex); |
8697 | 0 | if (tx->qid >= 0) { |
8698 | 0 | port->txq_used[tx->qid]--; |
8699 | 0 | tx->qid = -1; |
8700 | 0 | } |
8701 | |
|
8702 | 0 | min_cnt = -1; |
8703 | 0 | min_qid = 0; |
8704 | 0 | for (i = 0; i < netdev_n_txq(port->netdev); i++) { |
8705 | 0 | if (port->txq_used[i] < min_cnt || min_cnt == -1) { |
8706 | 0 | min_cnt = port->txq_used[i]; |
8707 | 0 | min_qid = i; |
8708 | 0 | } |
8709 | 0 | } |
8710 | |
|
8711 | 0 | port->txq_used[min_qid]++; |
8712 | 0 | tx->qid = min_qid; |
8713 | |
|
8714 | 0 | ovs_mutex_unlock(&port->txq_used_mutex); |
8715 | |
|
8716 | 0 | dpif_netdev_xps_revalidate_pmd(pmd, false); |
8717 | |
|
8718 | 0 | VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.", |
8719 | 0 | pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev)); |
8720 | 0 | return min_qid; |
8721 | 0 | } |
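The queue selection above scans 'txq_used' for the entry with the fewest users, preferring the lowest index on ties. A compact, hypothetical equivalent of just that selection step ('example_pick_min_used_txq' is not part of the source):

/* Hypothetical helper equivalent to the selection loop above: return the
 * index of the least-used tx queue, breaking ties toward the lowest index. */
static int
example_pick_min_used_txq(const int txq_used[], int n_txq)
{
    int min_qid = 0;

    for (int i = 1; i < n_txq; i++) {
        if (txq_used[i] < txq_used[min_qid]) {
            min_qid = i;
        }
    }
    return min_qid;
}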
8722 | | |
8723 | | static struct tx_port * |
8724 | | pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd, |
8725 | | odp_port_t port_no) |
8726 | 0 | { |
8727 | 0 | return tx_port_lookup(&pmd->tnl_port_cache, port_no); |
8728 | 0 | } |
8729 | | |
8730 | | static struct tx_port * |
8731 | | pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd, |
8732 | | odp_port_t port_no) |
8733 | 0 | { |
8734 | 0 | return tx_port_lookup(&pmd->send_port_cache, port_no); |
8735 | 0 | } |
8736 | | |
8737 | | static int |
8738 | | push_tnl_action(const struct dp_netdev_pmd_thread *pmd, |
8739 | | const struct nlattr *attr, |
8740 | | struct dp_packet_batch *batch) |
8741 | 0 | { |
8742 | 0 | struct tx_port *tun_port; |
8743 | 0 | const struct ovs_action_push_tnl *data; |
8744 | 0 | int err; |
8745 | |
|
8746 | 0 | data = nl_attr_get(attr); |
8747 | |
|
8748 | 0 | tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port); |
8749 | 0 | if (!tun_port) { |
8750 | 0 | err = -EINVAL; |
8751 | 0 | goto error; |
8752 | 0 | } |
8753 | 0 | err = netdev_push_header(tun_port->port->netdev, batch, data); |
8754 | 0 | if (!err) { |
8755 | 0 | return 0; |
8756 | 0 | } |
8757 | 0 | error: |
8758 | 0 | dp_packet_delete_batch(batch, true); |
8759 | 0 | return err; |
8760 | 0 | } |
8761 | | |
8762 | | static void |
8763 | | dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd, |
8764 | | struct dp_packet *packet, bool should_steal, |
8765 | | struct flow *flow, ovs_u128 *ufid, |
8766 | | struct ofpbuf *actions, |
8767 | | const struct nlattr *userdata) |
8768 | 0 | { |
8769 | 0 | struct dp_packet_batch b; |
8770 | 0 | int error; |
8771 | |
|
8772 | 0 | ofpbuf_clear(actions); |
8773 | |
|
8774 | 0 | error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid, |
8775 | 0 | DPIF_UC_ACTION, userdata, actions, |
8776 | 0 | NULL); |
8777 | 0 | if (!error || error == ENOSPC) { |
8778 | 0 | dp_packet_batch_init_packet(&b, packet); |
8779 | 0 | dp_netdev_execute_actions(pmd, &b, should_steal, flow, |
8780 | 0 | actions->data, actions->size); |
8781 | 0 | } else if (should_steal) { |
8782 | 0 | dp_packet_delete(packet); |
8783 | 0 | COVERAGE_INC(datapath_drop_userspace_action_error); |
8784 | 0 | } |
8785 | 0 | } |
8786 | | |
8787 | | static bool |
8788 | | dp_execute_output_action(struct dp_netdev_pmd_thread *pmd, |
8789 | | struct dp_packet_batch *packets_, |
8790 | | bool should_steal, odp_port_t port_no) |
8791 | 0 | { |
8792 | 0 | struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no); |
8793 | 0 | struct dp_packet_batch out; |
8794 | |
|
8795 | 0 | if (!OVS_LIKELY(p)) { |
8796 | 0 | COVERAGE_ADD(datapath_drop_invalid_port, |
8797 | 0 | dp_packet_batch_size(packets_)); |
8798 | 0 | dp_packet_delete_batch(packets_, should_steal); |
8799 | 0 | return false; |
8800 | 0 | } |
8801 | 0 | if (!should_steal) { |
8802 | 0 | dp_packet_batch_clone(&out, packets_); |
8803 | 0 | dp_packet_batch_reset_cutlen(packets_); |
8804 | 0 | packets_ = &out; |
8805 | 0 | } |
8806 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8807 | | #ifdef DPDK_NETDEV |
8808 | | if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts) |
8809 | | && packets_->packets[0]->source |
8810 | | != p->output_pkts.packets[0]->source)) { |
8811 | | /* XXX: netdev-dpdk assumes that all packets in a single |
8812 | | * output batch have the same source. Flush here to
8813 | | * avoid memory access issues. */ |
8814 | | dp_netdev_pmd_flush_output_on_port(pmd, p); |
8815 | | } |
8816 | | #endif |
8817 | 0 | if (dp_packet_batch_size(&p->output_pkts) |
8818 | 0 | + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) { |
8819 | | /* Flush here to avoid overflow. */ |
8820 | 0 | dp_netdev_pmd_flush_output_on_port(pmd, p); |
8821 | 0 | } |
8822 | 0 | if (dp_packet_batch_is_empty(&p->output_pkts)) { |
8823 | 0 | pmd->n_output_batches++; |
8824 | 0 | } |
8825 | |
|
8826 | 0 | struct dp_packet *packet; |
8827 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8828 | 0 | p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] = |
8829 | 0 | pmd->ctx.last_rxq; |
8830 | 0 | dp_packet_batch_add(&p->output_pkts, packet); |
8831 | 0 | } |
8832 | 0 | return true; |
8833 | 0 | } |
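/* Illustrative sketch (not part of dpif-netdev.c) of the "flush before
 * overflow" rule used above: if appending the incoming batch would push the
 * per-port output batch past the burst limit, the pending packets are
 * flushed first.  The burst limit of 32 and the sketch_* names are
 * assumptions for illustration, not OVS APIs. */
#include <stdio.h>

#define SKETCH_MAX_BURST 32

static int sketch_pending;          /* Packets already queued on the port. */

static void
sketch_flush(void)
{
    printf("flushing %d packets\n", sketch_pending);
    sketch_pending = 0;
}

static void
sketch_enqueue(int incoming)
{
    if (sketch_pending + incoming > SKETCH_MAX_BURST) {
        sketch_flush();             /* Avoid overflowing the burst. */
    }
    sketch_pending += incoming;
}

int
main(void)
{
    sketch_enqueue(20);
    sketch_enqueue(20);             /* 20 + 20 > 32: the first 20 flush. */
    sketch_flush();
    return 0;
}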
8834 | | |
8835 | | static void |
8836 | | dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd, |
8837 | | struct dp_packet_batch *packets_, |
8838 | | bool should_steal, uint32_t bond) |
8839 | 0 | { |
8840 | 0 | struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond); |
8841 | 0 | struct dp_packet_batch out; |
8842 | 0 | struct dp_packet *packet; |
8843 | |
|
8844 | 0 | if (!p_bond) { |
8845 | 0 | COVERAGE_ADD(datapath_drop_invalid_bond, |
8846 | 0 | dp_packet_batch_size(packets_)); |
8847 | 0 | dp_packet_delete_batch(packets_, should_steal); |
8848 | 0 | return; |
8849 | 0 | } |
8850 | 0 | if (!should_steal) { |
8851 | 0 | dp_packet_batch_clone(&out, packets_); |
8852 | 0 | dp_packet_batch_reset_cutlen(packets_); |
8853 | 0 | packets_ = &out; |
8854 | 0 | } |
8855 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8856 | |
|
8857 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8858 | | /* |
8859 | |          * Look up the bond-hash table using the packet hash to get the member.
8860 | | */ |
8861 | 0 | uint32_t hash = dp_packet_get_rss_hash(packet); |
8862 | 0 | struct member_entry *s_entry |
8863 | 0 | = &p_bond->member_buckets[hash & BOND_MASK]; |
8864 | 0 | odp_port_t bond_member = s_entry->member_id; |
8865 | 0 | uint32_t size = dp_packet_size(packet); |
8866 | 0 | struct dp_packet_batch output_pkt; |
8867 | |
|
8868 | 0 | dp_packet_batch_init_packet(&output_pkt, packet); |
8869 | 0 | if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true, |
8870 | 0 | bond_member))) { |
8871 | | /* Update member stats. */ |
8872 | 0 | non_atomic_ullong_add(&s_entry->n_packets, 1); |
8873 | 0 | non_atomic_ullong_add(&s_entry->n_bytes, size); |
8874 | 0 | } |
8875 | 0 | } |
8876 | 0 | } |
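/* Illustrative sketch (not part of dpif-netdev.c) of the bucket selection
 * used above: the packet's RSS hash is masked with BOND_MASK to pick one of
 * the member buckets, whose per-bucket counters are then updated.  The
 * bucket count of 256 and all sketch_* names are assumptions. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_BOND_BUCKETS 256
#define SKETCH_BOND_MASK (SKETCH_BOND_BUCKETS - 1)

struct sketch_member_entry {
    uint32_t member_id;             /* Port backing this bucket. */
    uint64_t n_packets;
    uint64_t n_bytes;
};

int
main(void)
{
    static struct sketch_member_entry buckets[SKETCH_BOND_BUCKETS];
    uint32_t hashes[] = { 0x1234abcd, 0x00000101 };   /* Pretend RSS hashes. */
    uint32_t sizes[] = { 64, 1500 };

    for (int i = 0; i < 2; i++) {
        uint32_t b = hashes[i] & SKETCH_BOND_MASK;    /* Pick the bucket. */

        buckets[b].n_packets += 1;
        buckets[b].n_bytes += sizes[i];
        printf("hash %#" PRIx32 " -> bucket %" PRIu32 "\n", hashes[i], b);
    }
    return 0;
}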
8877 | | |
8878 | | static void |
8879 | | dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, |
8880 | | const struct nlattr *a, bool should_steal) |
8881 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
8882 | 0 | { |
8883 | 0 | struct dp_netdev_execute_aux *aux = aux_; |
8884 | 0 | uint32_t *depth = recirc_depth_get(); |
8885 | 0 | struct dp_netdev_pmd_thread *pmd = aux->pmd; |
8886 | 0 | struct dp_netdev *dp = pmd->dp; |
8887 | 0 | int type = nl_attr_type(a); |
8888 | 0 | struct tx_port *p; |
8889 | 0 | uint32_t packet_count, packets_dropped; |
8890 | |
|
8891 | 0 | switch ((enum ovs_action_attr)type) { |
8892 | 0 | case OVS_ACTION_ATTR_OUTPUT: |
8893 | 0 | dp_execute_output_action(pmd, packets_, should_steal, |
8894 | 0 | nl_attr_get_odp_port(a)); |
8895 | 0 | return; |
8896 | | |
8897 | 0 | case OVS_ACTION_ATTR_LB_OUTPUT: |
8898 | 0 | dp_execute_lb_output_action(pmd, packets_, should_steal, |
8899 | 0 | nl_attr_get_u32(a)); |
8900 | 0 | return; |
8901 | | |
8902 | 0 | case OVS_ACTION_ATTR_TUNNEL_PUSH: |
8903 | 0 | if (should_steal) { |
8904 | |             /* We're requested to push a tunnel header, but we also need to
8905 | |              * take ownership of these packets.  Thus, we can avoid performing
8906 | | * the action, because the caller will not use the result anyway. |
8907 | | * Just break to free the batch. */ |
8908 | 0 | break; |
8909 | 0 | } |
8910 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8911 | 0 | packet_count = dp_packet_batch_size(packets_); |
8912 | 0 | if (push_tnl_action(pmd, a, packets_)) { |
8913 | 0 | COVERAGE_ADD(datapath_drop_tunnel_push_error, |
8914 | 0 | packet_count); |
8915 | 0 | } |
8916 | 0 | return; |
8917 | | |
8918 | 0 | case OVS_ACTION_ATTR_TUNNEL_POP: |
8919 | 0 | if (*depth < MAX_RECIRC_DEPTH) { |
8920 | 0 | struct dp_packet_batch *orig_packets_ = packets_; |
8921 | 0 | odp_port_t portno = nl_attr_get_odp_port(a); |
8922 | |
|
8923 | 0 | p = pmd_tnl_port_cache_lookup(pmd, portno); |
8924 | 0 | if (p) { |
8925 | 0 | struct dp_packet_batch tnl_pkt; |
8926 | |
|
8927 | 0 | if (!should_steal) { |
8928 | 0 | dp_packet_batch_clone(&tnl_pkt, packets_); |
8929 | 0 | packets_ = &tnl_pkt; |
8930 | 0 | dp_packet_batch_reset_cutlen(orig_packets_); |
8931 | 0 | } |
8932 | |
|
8933 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8934 | |
|
8935 | 0 | packet_count = dp_packet_batch_size(packets_); |
8936 | 0 | netdev_pop_header(p->port->netdev, packets_); |
8937 | 0 | packets_dropped = |
8938 | 0 | packet_count - dp_packet_batch_size(packets_); |
8939 | 0 | if (packets_dropped) { |
8940 | 0 | COVERAGE_ADD(datapath_drop_tunnel_pop_error, |
8941 | 0 | packets_dropped); |
8942 | 0 | } |
8943 | 0 | if (dp_packet_batch_is_empty(packets_)) { |
8944 | 0 | return; |
8945 | 0 | } |
8946 | | |
8947 | 0 | struct dp_packet *packet; |
8948 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8949 | 0 | packet->md.in_port.odp_port = portno; |
8950 | 0 | } |
8951 | |
|
8952 | 0 | (*depth)++; |
8953 | 0 | dp_netdev_recirculate(pmd, packets_); |
8954 | 0 | (*depth)--; |
8955 | 0 | return; |
8956 | 0 | } |
8957 | 0 | COVERAGE_ADD(datapath_drop_invalid_tnl_port, |
8958 | 0 | dp_packet_batch_size(packets_)); |
8959 | 0 | } else { |
8960 | 0 | COVERAGE_ADD(datapath_drop_recirc_error, |
8961 | 0 | dp_packet_batch_size(packets_)); |
8962 | 0 | } |
8963 | 0 | break; |
8964 | | |
8965 | 0 | case OVS_ACTION_ATTR_USERSPACE: |
8966 | 0 | if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) { |
8967 | 0 | struct dp_packet_batch *orig_packets_ = packets_; |
8968 | 0 | const struct nlattr *userdata; |
8969 | 0 | struct dp_packet_batch usr_pkt; |
8970 | 0 | struct ofpbuf actions; |
8971 | 0 | struct flow flow; |
8972 | 0 | ovs_u128 ufid; |
8973 | 0 | bool clone = false; |
8974 | |
|
8975 | 0 | userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA); |
8976 | 0 | ofpbuf_init(&actions, 0); |
8977 | |
|
8978 | 0 | if (packets_->trunc) { |
8979 | 0 | if (!should_steal) { |
8980 | 0 | dp_packet_batch_clone(&usr_pkt, packets_); |
8981 | 0 | packets_ = &usr_pkt; |
8982 | 0 | clone = true; |
8983 | 0 | dp_packet_batch_reset_cutlen(orig_packets_); |
8984 | 0 | } |
8985 | |
|
8986 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8987 | 0 | } |
8988 | |
|
8989 | 0 | struct dp_packet *packet; |
8990 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8991 | 0 | flow_extract(packet, &flow); |
8992 | 0 | odp_flow_key_hash(&flow, sizeof flow, &ufid); |
8993 | 0 | dp_execute_userspace_action(pmd, packet, should_steal, &flow, |
8994 | 0 | &ufid, &actions, userdata); |
8995 | 0 | } |
8996 | |
|
8997 | 0 | if (clone) { |
8998 | 0 | dp_packet_delete_batch(packets_, true); |
8999 | 0 | } |
9000 | |
|
9001 | 0 | ofpbuf_uninit(&actions); |
9002 | 0 | fat_rwlock_unlock(&dp->upcall_rwlock); |
9003 | |
|
9004 | 0 | return; |
9005 | 0 | } |
9006 | 0 | COVERAGE_ADD(datapath_drop_lock_error, |
9007 | 0 | dp_packet_batch_size(packets_)); |
9008 | 0 | break; |
9009 | | |
9010 | 0 | case OVS_ACTION_ATTR_RECIRC: |
9011 | 0 | if (*depth < MAX_RECIRC_DEPTH) { |
9012 | 0 | struct dp_packet_batch recirc_pkts; |
9013 | |
|
9014 | 0 | if (!should_steal) { |
9015 | 0 | dp_packet_batch_clone(&recirc_pkts, packets_); |
9016 | 0 | packets_ = &recirc_pkts; |
9017 | 0 | } |
9018 | |
|
9019 | 0 | struct dp_packet *packet; |
9020 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
9021 | 0 | packet->md.recirc_id = nl_attr_get_u32(a); |
9022 | 0 | } |
9023 | |
|
9024 | 0 | (*depth)++; |
9025 | 0 | dp_netdev_recirculate(pmd, packets_); |
9026 | 0 | (*depth)--; |
9027 | |
|
9028 | 0 | return; |
9029 | 0 | } |
9030 | | |
9031 | 0 | COVERAGE_ADD(datapath_drop_recirc_error, |
9032 | 0 | dp_packet_batch_size(packets_)); |
9033 | 0 | VLOG_WARN("Packet dropped. Max recirculation depth exceeded."); |
9034 | 0 | break; |
9035 | | |
9036 | 0 | case OVS_ACTION_ATTR_CT: { |
9037 | 0 | const struct nlattr *b; |
9038 | 0 | bool force = false; |
9039 | 0 | bool commit = false; |
9040 | 0 | unsigned int left; |
9041 | 0 | uint16_t zone = 0; |
9042 | 0 | uint32_t tp_id = 0; |
9043 | 0 | const char *helper = NULL; |
9044 | 0 | const uint32_t *setmark = NULL; |
9045 | 0 | const struct ovs_key_ct_labels *setlabel = NULL; |
9046 | 0 | struct nat_action_info_t nat_action_info; |
9047 | 0 | struct nat_action_info_t *nat_action_info_ref = NULL; |
9048 | 0 | bool nat_config = false; |
9049 | |
|
9050 | 0 | NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a), |
9051 | 0 | nl_attr_get_size(a)) { |
9052 | 0 | enum ovs_ct_attr sub_type = nl_attr_type(b); |
9053 | |
|
9054 | 0 | switch(sub_type) { |
9055 | 0 | case OVS_CT_ATTR_FORCE_COMMIT: |
9056 | 0 | force = true; |
9057 | | /* fall through. */ |
9058 | 0 | case OVS_CT_ATTR_COMMIT: |
9059 | 0 | commit = true; |
9060 | 0 | break; |
9061 | 0 | case OVS_CT_ATTR_ZONE: |
9062 | 0 | zone = nl_attr_get_u16(b); |
9063 | 0 | break; |
9064 | 0 | case OVS_CT_ATTR_HELPER: |
9065 | 0 | helper = nl_attr_get_string(b); |
9066 | 0 | break; |
9067 | 0 | case OVS_CT_ATTR_MARK: |
9068 | 0 | setmark = nl_attr_get(b); |
9069 | 0 | break; |
9070 | 0 | case OVS_CT_ATTR_LABELS: |
9071 | 0 | setlabel = nl_attr_get(b); |
9072 | 0 | break; |
9073 | 0 | case OVS_CT_ATTR_EVENTMASK: |
9074 | |             /* Silently ignored, as the userspace datapath does not generate
9075 | | * netlink events. */ |
9076 | 0 | break; |
9077 | 0 | case OVS_CT_ATTR_TIMEOUT: |
9078 | 0 | if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) { |
9079 | 0 | VLOG_WARN("Invalid Timeout Policy ID: %s.", |
9080 | 0 | nl_attr_get_string(b)); |
9081 | 0 | tp_id = DEFAULT_TP_ID; |
9082 | 0 | } |
9083 | 0 | break; |
9084 | 0 | case OVS_CT_ATTR_NAT: { |
9085 | 0 | const struct nlattr *b_nest; |
9086 | 0 | unsigned int left_nest; |
9087 | 0 | bool ip_min_specified = false; |
9088 | 0 | bool proto_num_min_specified = false; |
9089 | 0 | bool ip_max_specified = false; |
9090 | 0 | bool proto_num_max_specified = false; |
9091 | 0 | memset(&nat_action_info, 0, sizeof nat_action_info); |
9092 | 0 | nat_action_info_ref = &nat_action_info; |
9093 | |
|
9094 | 0 | NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) { |
9095 | 0 | enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest); |
9096 | |
|
9097 | 0 | switch (sub_type_nest) { |
9098 | 0 | case OVS_NAT_ATTR_SRC: |
9099 | 0 | case OVS_NAT_ATTR_DST: |
9100 | 0 | nat_config = true; |
9101 | 0 | nat_action_info.nat_action |= |
9102 | 0 | ((sub_type_nest == OVS_NAT_ATTR_SRC) |
9103 | 0 | ? NAT_ACTION_SRC : NAT_ACTION_DST); |
9104 | 0 | break; |
9105 | 0 | case OVS_NAT_ATTR_IP_MIN: |
9106 | 0 | memcpy(&nat_action_info.min_addr, |
9107 | 0 | nl_attr_get(b_nest), |
9108 | 0 | nl_attr_get_size(b_nest)); |
9109 | 0 | ip_min_specified = true; |
9110 | 0 | break; |
9111 | 0 | case OVS_NAT_ATTR_IP_MAX: |
9112 | 0 | memcpy(&nat_action_info.max_addr, |
9113 | 0 | nl_attr_get(b_nest), |
9114 | 0 | nl_attr_get_size(b_nest)); |
9115 | 0 | ip_max_specified = true; |
9116 | 0 | break; |
9117 | 0 | case OVS_NAT_ATTR_PROTO_MIN: |
9118 | 0 | nat_action_info.min_port = |
9119 | 0 | nl_attr_get_u16(b_nest); |
9120 | 0 | proto_num_min_specified = true; |
9121 | 0 | break; |
9122 | 0 | case OVS_NAT_ATTR_PROTO_MAX: |
9123 | 0 | nat_action_info.max_port = |
9124 | 0 | nl_attr_get_u16(b_nest); |
9125 | 0 | proto_num_max_specified = true; |
9126 | 0 | break; |
9127 | 0 | case OVS_NAT_ATTR_PERSISTENT: |
9128 | 0 | case OVS_NAT_ATTR_PROTO_HASH: |
9129 | 0 | case OVS_NAT_ATTR_PROTO_RANDOM: |
9130 | 0 | break; |
9131 | 0 | case OVS_NAT_ATTR_UNSPEC: |
9132 | 0 | case __OVS_NAT_ATTR_MAX: |
9133 | 0 | OVS_NOT_REACHED(); |
9134 | 0 | } |
9135 | 0 | } |
9136 | | |
9137 | 0 | if (ip_min_specified && !ip_max_specified) { |
9138 | 0 | nat_action_info.max_addr = nat_action_info.min_addr; |
9139 | 0 | } |
9140 | 0 | if (proto_num_min_specified && !proto_num_max_specified) { |
9141 | 0 | nat_action_info.max_port = nat_action_info.min_port; |
9142 | 0 | } |
9143 | 0 | if (proto_num_min_specified || proto_num_max_specified) { |
9144 | 0 | if (nat_action_info.nat_action & NAT_ACTION_SRC) { |
9145 | 0 | nat_action_info.nat_action |= NAT_ACTION_SRC_PORT; |
9146 | 0 | } else if (nat_action_info.nat_action & NAT_ACTION_DST) { |
9147 | 0 | nat_action_info.nat_action |= NAT_ACTION_DST_PORT; |
9148 | 0 | } |
9149 | 0 | } |
9150 | 0 | break; |
9151 | 0 | } |
9152 | 0 | case OVS_CT_ATTR_UNSPEC: |
9153 | 0 | case __OVS_CT_ATTR_MAX: |
9154 | 0 | OVS_NOT_REACHED(); |
9155 | 0 | } |
9156 | 0 | } |
9157 | | |
9158 | | /* We won't be able to function properly in this case, hence |
9159 | | * complain loudly. */ |
9160 | 0 | if (nat_config && !commit) { |
9161 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); |
9162 | 0 | VLOG_WARN_RL(&rl, "NAT specified without commit."); |
9163 | 0 | } |
9164 | |
|
9165 | 0 | conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force, |
9166 | 0 | commit, zone, setmark, setlabel, aux->flow->tp_src, |
9167 | 0 | aux->flow->tp_dst, helper, nat_action_info_ref, |
9168 | 0 | pmd->ctx.now / 1000, tp_id); |
9169 | 0 | break; |
9170 | 0 | } |
9171 | | |
9172 | 0 | case OVS_ACTION_ATTR_METER: |
9173 | 0 | dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a), |
9174 | 0 | pmd->ctx.now); |
9175 | 0 | break; |
9176 | | |
9177 | 0 | case OVS_ACTION_ATTR_PUSH_VLAN: |
9178 | 0 | case OVS_ACTION_ATTR_POP_VLAN: |
9179 | 0 | case OVS_ACTION_ATTR_PUSH_MPLS: |
9180 | 0 | case OVS_ACTION_ATTR_POP_MPLS: |
9181 | 0 | case OVS_ACTION_ATTR_SET: |
9182 | 0 | case OVS_ACTION_ATTR_SET_MASKED: |
9183 | 0 | case OVS_ACTION_ATTR_SAMPLE: |
9184 | 0 | case OVS_ACTION_ATTR_HASH: |
9185 | 0 | case OVS_ACTION_ATTR_UNSPEC: |
9186 | 0 | case OVS_ACTION_ATTR_TRUNC: |
9187 | 0 | case OVS_ACTION_ATTR_PUSH_ETH: |
9188 | 0 | case OVS_ACTION_ATTR_POP_ETH: |
9189 | 0 | case OVS_ACTION_ATTR_CLONE: |
9190 | 0 | case OVS_ACTION_ATTR_PUSH_NSH: |
9191 | 0 | case OVS_ACTION_ATTR_POP_NSH: |
9192 | 0 | case OVS_ACTION_ATTR_CT_CLEAR: |
9193 | 0 | case OVS_ACTION_ATTR_CHECK_PKT_LEN: |
9194 | 0 | case OVS_ACTION_ATTR_DROP: |
9195 | 0 | case OVS_ACTION_ATTR_ADD_MPLS: |
9196 | 0 | case __OVS_ACTION_ATTR_MAX: |
9197 | 0 | OVS_NOT_REACHED(); |
9198 | 0 | } |
9199 | | |
9200 | 0 | dp_packet_delete_batch(packets_, should_steal); |
9201 | 0 | } |
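/* Illustrative sketch (not part of dpif-netdev.c) of the recirculation-depth
 * guard used by the TUNNEL_POP and RECIRC cases above: a per-thread counter
 * is incremented around each nested pass through the datapath and packets
 * are dropped once the limit is reached.  The limit of 8 and the sketch_*
 * names are assumptions for illustration only. */
#include <stdio.h>

#define SKETCH_MAX_RECIRC_DEPTH 8

static unsigned int sketch_depth;   /* Thread-local in the real datapath. */

static void
sketch_recirculate(int packet_id)
{
    if (sketch_depth >= SKETCH_MAX_RECIRC_DEPTH) {
        printf("packet %d dropped: max recirculation depth exceeded\n",
               packet_id);
        return;
    }
    sketch_depth++;
    sketch_recirculate(packet_id);  /* Reprocessing may recirculate again. */
    sketch_depth--;
}

int
main(void)
{
    sketch_recirculate(1);
    return 0;
}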
9202 | | |
9203 | | static void |
9204 | | dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd, |
9205 | | struct dp_packet_batch *packets, |
9206 | | bool should_steal, const struct flow *flow, |
9207 | | const struct nlattr *actions, size_t actions_len) |
9208 | 0 | { |
9209 | 0 | struct dp_netdev_execute_aux aux = { pmd, flow }; |
9210 | |
|
9211 | 0 | odp_execute_actions(&aux, packets, should_steal, actions, |
9212 | 0 | actions_len, dp_execute_cb); |
9213 | 0 | } |
9214 | | |
9215 | | struct dp_netdev_ct_dump { |
9216 | | struct ct_dpif_dump_state up; |
9217 | | struct conntrack_dump dump; |
9218 | | struct conntrack *ct; |
9219 | | struct dp_netdev *dp; |
9220 | | }; |
9221 | | |
9222 | | static int |
9223 | | dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_, |
9224 | | const uint16_t *pzone, int *ptot_bkts) |
9225 | 0 | { |
9226 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9227 | 0 | struct dp_netdev_ct_dump *dump; |
9228 | |
|
9229 | 0 | dump = xzalloc(sizeof *dump); |
9230 | 0 | dump->dp = dp; |
9231 | 0 | dump->ct = dp->conntrack; |
9232 | |
|
9233 | 0 | conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts); |
9234 | |
|
9235 | 0 | *dump_ = &dump->up; |
9236 | |
|
9237 | 0 | return 0; |
9238 | 0 | } |
9239 | | |
9240 | | static int |
9241 | | dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED, |
9242 | | struct ct_dpif_dump_state *dump_, |
9243 | | struct ct_dpif_entry *entry) |
9244 | 0 | { |
9245 | 0 | struct dp_netdev_ct_dump *dump; |
9246 | |
|
9247 | 0 | INIT_CONTAINER(dump, dump_, up); |
9248 | |
|
9249 | 0 | return conntrack_dump_next(&dump->dump, entry); |
9250 | 0 | } |
9251 | | |
9252 | | static int |
9253 | | dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED, |
9254 | | struct ct_dpif_dump_state *dump_) |
9255 | 0 | { |
9256 | 0 | struct dp_netdev_ct_dump *dump; |
9257 | 0 | int err; |
9258 | |
|
9259 | 0 | INIT_CONTAINER(dump, dump_, up); |
9260 | |
|
9261 | 0 | err = conntrack_dump_done(&dump->dump); |
9262 | |
|
9263 | 0 | free(dump); |
9264 | |
|
9265 | 0 | return err; |
9266 | 0 | } |
9267 | | |
9268 | | static int |
9269 | | dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone, |
9270 | | const struct ct_dpif_tuple *tuple) |
9271 | 0 | { |
9272 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9273 | |
|
9274 | 0 | if (tuple) { |
9275 | 0 | return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0); |
9276 | 0 | } |
9277 | 0 | return conntrack_flush(dp->conntrack, zone); |
9278 | 0 | } |
9279 | | |
9280 | | static int |
9281 | | dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns) |
9282 | 0 | { |
9283 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9284 | |
|
9285 | 0 | return conntrack_set_maxconns(dp->conntrack, maxconns); |
9286 | 0 | } |
9287 | | |
9288 | | static int |
9289 | | dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns) |
9290 | 0 | { |
9291 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9292 | |
|
9293 | 0 | return conntrack_get_maxconns(dp->conntrack, maxconns); |
9294 | 0 | } |
9295 | | |
9296 | | static int |
9297 | | dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns) |
9298 | 0 | { |
9299 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9300 | |
|
9301 | 0 | return conntrack_get_nconns(dp->conntrack, nconns); |
9302 | 0 | } |
9303 | | |
9304 | | static int |
9305 | | dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled) |
9306 | 0 | { |
9307 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9308 | |
|
9309 | 0 | return conntrack_set_tcp_seq_chk(dp->conntrack, enabled); |
9310 | 0 | } |
9311 | | |
9312 | | static int |
9313 | | dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled) |
9314 | 0 | { |
9315 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9316 | 0 | *enabled = conntrack_get_tcp_seq_chk(dp->conntrack); |
9317 | 0 | return 0; |
9318 | 0 | } |
9319 | | |
9320 | | static int |
9321 | | dpif_netdev_ct_set_limits(struct dpif *dpif, |
9322 | | const uint32_t *default_limits, |
9323 | | const struct ovs_list *zone_limits) |
9324 | 0 | { |
9325 | 0 | int err = 0; |
9326 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9327 | 0 | if (default_limits) { |
9328 | 0 | err = zone_limit_update(dp->conntrack, DEFAULT_ZONE, *default_limits); |
9329 | 0 | if (err != 0) { |
9330 | 0 | return err; |
9331 | 0 | } |
9332 | 0 | } |
9333 | | |
9334 | 0 | struct ct_dpif_zone_limit *zone_limit; |
9335 | 0 | LIST_FOR_EACH (zone_limit, node, zone_limits) { |
9336 | 0 | err = zone_limit_update(dp->conntrack, zone_limit->zone, |
9337 | 0 | zone_limit->limit); |
9338 | 0 | if (err != 0) { |
9339 | 0 | break; |
9340 | 0 | } |
9341 | 0 | } |
9342 | 0 | return err; |
9343 | 0 | } |
9344 | | |
9345 | | static int |
9346 | | dpif_netdev_ct_get_limits(struct dpif *dpif, |
9347 | | uint32_t *default_limit, |
9348 | | const struct ovs_list *zone_limits_request, |
9349 | | struct ovs_list *zone_limits_reply) |
9350 | 0 | { |
9351 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9352 | 0 | struct conntrack_zone_limit czl; |
9353 | |
|
9354 | 0 | czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE); |
9355 | 0 | if (czl.zone == DEFAULT_ZONE) { |
9356 | 0 | *default_limit = czl.limit; |
9357 | 0 | } else { |
9358 | 0 | return EINVAL; |
9359 | 0 | } |
9360 | | |
9361 | 0 | if (!ovs_list_is_empty(zone_limits_request)) { |
9362 | 0 | struct ct_dpif_zone_limit *zone_limit; |
9363 | 0 | LIST_FOR_EACH (zone_limit, node, zone_limits_request) { |
9364 | 0 | czl = zone_limit_get(dp->conntrack, zone_limit->zone); |
9365 | 0 | if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) { |
9366 | 0 | ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone, |
9367 | 0 | czl.limit, |
9368 | 0 | atomic_count_get(&czl.count)); |
9369 | 0 | } else { |
9370 | 0 | return EINVAL; |
9371 | 0 | } |
9372 | 0 | } |
9373 | 0 | } else { |
9374 | 0 | for (int z = MIN_ZONE; z <= MAX_ZONE; z++) { |
9375 | 0 | czl = zone_limit_get(dp->conntrack, z); |
9376 | 0 | if (czl.zone == z) { |
9377 | 0 | ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit, |
9378 | 0 | atomic_count_get(&czl.count)); |
9379 | 0 | } |
9380 | 0 | } |
9381 | 0 | } |
9382 | | |
9383 | 0 | return 0; |
9384 | 0 | } |
9385 | | |
9386 | | static int |
9387 | | dpif_netdev_ct_del_limits(struct dpif *dpif, |
9388 | | const struct ovs_list *zone_limits) |
9389 | 0 | { |
9390 | 0 | int err = 0; |
9391 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9392 | 0 | struct ct_dpif_zone_limit *zone_limit; |
9393 | 0 | LIST_FOR_EACH (zone_limit, node, zone_limits) { |
9394 | 0 | err = zone_limit_delete(dp->conntrack, zone_limit->zone); |
9395 | 0 | if (err != 0) { |
9396 | 0 | break; |
9397 | 0 | } |
9398 | 0 | } |
9399 | |
|
9400 | 0 | return err; |
9401 | 0 | } |
9402 | | |
9403 | | static int |
9404 | | dpif_netdev_ct_get_features(struct dpif *dpif OVS_UNUSED, |
9405 | | enum ct_features *features) |
9406 | 0 | { |
9407 | 0 | if (features != NULL) { |
9408 | 0 | *features = CONNTRACK_F_ZERO_SNAT; |
9409 | 0 | } |
9410 | 0 | return 0; |
9411 | 0 | } |
9412 | | |
9413 | | static int |
9414 | | dpif_netdev_ct_set_timeout_policy(struct dpif *dpif, |
9415 | | const struct ct_dpif_timeout_policy *dpif_tp) |
9416 | 0 | { |
9417 | 0 | struct timeout_policy tp; |
9418 | 0 | struct dp_netdev *dp; |
9419 | |
|
9420 | 0 | dp = get_dp_netdev(dpif); |
9421 | 0 | memcpy(&tp.policy, dpif_tp, sizeof tp.policy); |
9422 | 0 | return timeout_policy_update(dp->conntrack, &tp); |
9423 | 0 | } |
9424 | | |
9425 | | static int |
9426 | | dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id, |
9427 | | struct ct_dpif_timeout_policy *dpif_tp) |
9428 | 0 | { |
9429 | 0 | struct timeout_policy *tp; |
9430 | 0 | struct dp_netdev *dp; |
9431 | 0 | int err = 0; |
9432 | |
|
9433 | 0 | dp = get_dp_netdev(dpif); |
9434 | 0 | tp = timeout_policy_get(dp->conntrack, tp_id); |
9435 | 0 | if (!tp) { |
9436 | 0 | return ENOENT; |
9437 | 0 | } |
9438 | 0 | memcpy(dpif_tp, &tp->policy, sizeof tp->policy); |
9439 | 0 | return err; |
9440 | 0 | } |
9441 | | |
9442 | | static int |
9443 | | dpif_netdev_ct_del_timeout_policy(struct dpif *dpif, |
9444 | | uint32_t tp_id) |
9445 | 0 | { |
9446 | 0 | struct dp_netdev *dp; |
9447 | 0 | int err = 0; |
9448 | |
|
9449 | 0 | dp = get_dp_netdev(dpif); |
9450 | 0 | err = timeout_policy_delete(dp->conntrack, tp_id); |
9451 | 0 | return err; |
9452 | 0 | } |
9453 | | |
9454 | | static int |
9455 | | dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED, |
9456 | | uint32_t tp_id, |
9457 | | uint16_t dl_type OVS_UNUSED, |
9458 | | uint8_t nw_proto OVS_UNUSED, |
9459 | | char **tp_name, bool *is_generic) |
9460 | 0 | { |
9461 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
9462 | |
|
9463 | 0 | ds_put_format(&ds, "%"PRIu32, tp_id); |
9464 | 0 | *tp_name = ds_steal_cstr(&ds); |
9465 | 0 | *is_generic = true; |
9466 | 0 | return 0; |
9467 | 0 | } |
9468 | | |
9469 | | static int |
9470 | | dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable) |
9471 | 0 | { |
9472 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9473 | 0 | return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable); |
9474 | 0 | } |
9475 | | |
9476 | | static int |
9477 | | dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag) |
9478 | 0 | { |
9479 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9480 | 0 | return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag); |
9481 | 0 | } |
9482 | | |
9483 | | static int |
9484 | | dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags) |
9485 | 0 | { |
9486 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9487 | 0 | return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags); |
9488 | 0 | } |
9489 | | |
9490 | | /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to |
9491 | | * diverge. */ |
9492 | | static int |
9493 | | dpif_netdev_ipf_get_status(struct dpif *dpif, |
9494 | | struct dpif_ipf_status *dpif_ipf_status) |
9495 | 0 | { |
9496 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9497 | 0 | ipf_get_status(conntrack_ipf_ctx(dp->conntrack), |
9498 | 0 | (struct ipf_status *) dpif_ipf_status); |
9499 | 0 | return 0; |
9500 | 0 | } |
9501 | | |
9502 | | static int |
9503 | | dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED, |
9504 | | struct ipf_dump_ctx **ipf_dump_ctx) |
9505 | 0 | { |
9506 | 0 | return ipf_dump_start(ipf_dump_ctx); |
9507 | 0 | } |
9508 | | |
9509 | | static int |
9510 | | dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump) |
9511 | 0 | { |
9512 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9513 | 0 | return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx, |
9514 | 0 | dump); |
9515 | 0 | } |
9516 | | |
9517 | | static int |
9518 | | dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx) |
9519 | 0 | { |
9520 | 0 | return ipf_dump_done(ipf_dump_ctx); |
9521 | |
|
9522 | 0 | } |
9523 | | |
9524 | | static int |
9525 | | dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id, |
9526 | | odp_port_t *member_map) |
9527 | 0 | { |
9528 | 0 | struct tx_bond *new_tx = xzalloc(sizeof *new_tx); |
9529 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9530 | 0 | struct dp_netdev_pmd_thread *pmd; |
9531 | | |
9532 | | /* Prepare new bond mapping. */ |
9533 | 0 | new_tx->bond_id = bond_id; |
9534 | 0 | for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) { |
9535 | 0 | new_tx->member_buckets[bucket].member_id = member_map[bucket]; |
9536 | 0 | } |
9537 | |
|
9538 | 0 | ovs_mutex_lock(&dp->bond_mutex); |
9539 | |     /* Check if the bond already exists. */
9540 | 0 | struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id); |
9541 | 0 | if (old_tx) { |
9542 | 0 | cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node, |
9543 | 0 | hash_bond_id(bond_id)); |
9544 | 0 | ovsrcu_postpone(free, old_tx); |
9545 | 0 | } else { |
9546 | 0 | cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id)); |
9547 | 0 | } |
9548 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
9549 | | |
9550 | | /* Update all PMDs with new bond mapping. */ |
9551 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
9552 | 0 | dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true); |
9553 | 0 | } |
9554 | 0 | return 0; |
9555 | 0 | } |
9556 | | |
9557 | | static int |
9558 | | dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id) |
9559 | 0 | { |
9560 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9561 | 0 | struct dp_netdev_pmd_thread *pmd; |
9562 | 0 | struct tx_bond *tx; |
9563 | |
|
9564 | 0 | ovs_mutex_lock(&dp->bond_mutex); |
9565 | |     /* Check if the bond exists. */
9566 | 0 | tx = tx_bond_lookup(&dp->tx_bonds, bond_id); |
9567 | 0 | if (tx) { |
9568 | 0 | cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id)); |
9569 | 0 | ovsrcu_postpone(free, tx); |
9570 | 0 | } else { |
9571 | | /* Bond is not present. */ |
9572 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
9573 | 0 | return ENOENT; |
9574 | 0 | } |
9575 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
9576 | | |
9577 | |     /* Remove the bond mapping from all PMDs. */
9578 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
9579 | 0 | dp_netdev_del_bond_tx_from_pmd(pmd, bond_id); |
9580 | 0 | } |
9581 | 0 | return 0; |
9582 | 0 | } |
9583 | | |
9584 | | static int |
9585 | | dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id, |
9586 | | uint64_t *n_bytes) |
9587 | 0 | { |
9588 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9589 | 0 | struct dp_netdev_pmd_thread *pmd; |
9590 | |
|
9591 | 0 | if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) { |
9592 | 0 | return ENOENT; |
9593 | 0 | } |
9594 | | |
9595 | |     /* Search for the bond in all PMDs. */
9596 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
9597 | 0 | struct tx_bond *pmd_bond_entry |
9598 | 0 | = tx_bond_lookup(&pmd->tx_bonds, bond_id); |
9599 | |
|
9600 | 0 | if (!pmd_bond_entry) { |
9601 | 0 | continue; |
9602 | 0 | } |
9603 | | |
9604 | | /* Read bond stats. */ |
9605 | 0 | for (int i = 0; i < BOND_BUCKETS; i++) { |
9606 | 0 | uint64_t pmd_n_bytes; |
9607 | |
|
9608 | 0 | atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes, |
9609 | 0 | &pmd_n_bytes); |
9610 | 0 | n_bytes[i] += pmd_n_bytes; |
9611 | 0 | } |
9612 | 0 | } |
9613 | 0 | return 0; |
9614 | 0 | } |
9615 | | |
9616 | | const struct dpif_class dpif_netdev_class = { |
9617 | | "netdev", |
9618 | | true, /* cleanup_required */ |
9619 | | true, /* synced_dp_layers */ |
9620 | | dpif_netdev_init, |
9621 | | dpif_netdev_enumerate, |
9622 | | dpif_netdev_port_open_type, |
9623 | | dpif_netdev_open, |
9624 | | dpif_netdev_close, |
9625 | | dpif_netdev_destroy, |
9626 | | dpif_netdev_run, |
9627 | | dpif_netdev_wait, |
9628 | | dpif_netdev_get_stats, |
9629 | | NULL, /* set_features */ |
9630 | | dpif_netdev_port_add, |
9631 | | dpif_netdev_port_del, |
9632 | | dpif_netdev_port_set_config, |
9633 | | dpif_netdev_port_query_by_number, |
9634 | | dpif_netdev_port_query_by_name, |
9635 | | NULL, /* port_get_pid */ |
9636 | | dpif_netdev_port_dump_start, |
9637 | | dpif_netdev_port_dump_next, |
9638 | | dpif_netdev_port_dump_done, |
9639 | | dpif_netdev_port_poll, |
9640 | | dpif_netdev_port_poll_wait, |
9641 | | dpif_netdev_flow_flush, |
9642 | | dpif_netdev_flow_dump_create, |
9643 | | dpif_netdev_flow_dump_destroy, |
9644 | | dpif_netdev_flow_dump_thread_create, |
9645 | | dpif_netdev_flow_dump_thread_destroy, |
9646 | | dpif_netdev_flow_dump_next, |
9647 | | dpif_netdev_operate, |
9648 | | dpif_netdev_offload_stats_get, |
9649 | | NULL, /* recv_set */ |
9650 | | NULL, /* handlers_set */ |
9651 | | NULL, /* number_handlers_required */ |
9652 | | dpif_netdev_set_config, |
9653 | | dpif_netdev_queue_to_priority, |
9654 | | NULL, /* recv */ |
9655 | | NULL, /* recv_wait */ |
9656 | | NULL, /* recv_purge */ |
9657 | | dpif_netdev_register_dp_purge_cb, |
9658 | | dpif_netdev_register_upcall_cb, |
9659 | | dpif_netdev_enable_upcall, |
9660 | | dpif_netdev_disable_upcall, |
9661 | | dpif_netdev_get_datapath_version, |
9662 | | dpif_netdev_ct_dump_start, |
9663 | | dpif_netdev_ct_dump_next, |
9664 | | dpif_netdev_ct_dump_done, |
9665 | | dpif_netdev_ct_flush, |
9666 | | dpif_netdev_ct_set_maxconns, |
9667 | | dpif_netdev_ct_get_maxconns, |
9668 | | dpif_netdev_ct_get_nconns, |
9669 | | dpif_netdev_ct_set_tcp_seq_chk, |
9670 | | dpif_netdev_ct_get_tcp_seq_chk, |
9671 | | dpif_netdev_ct_set_limits, |
9672 | | dpif_netdev_ct_get_limits, |
9673 | | dpif_netdev_ct_del_limits, |
9674 | | dpif_netdev_ct_set_timeout_policy, |
9675 | | dpif_netdev_ct_get_timeout_policy, |
9676 | | dpif_netdev_ct_del_timeout_policy, |
9677 | | NULL, /* ct_timeout_policy_dump_start */ |
9678 | | NULL, /* ct_timeout_policy_dump_next */ |
9679 | | NULL, /* ct_timeout_policy_dump_done */ |
9680 | | dpif_netdev_ct_get_timeout_policy_name, |
9681 | | dpif_netdev_ct_get_features, |
9682 | | dpif_netdev_ipf_set_enabled, |
9683 | | dpif_netdev_ipf_set_min_frag, |
9684 | | dpif_netdev_ipf_set_max_nfrags, |
9685 | | dpif_netdev_ipf_get_status, |
9686 | | dpif_netdev_ipf_dump_start, |
9687 | | dpif_netdev_ipf_dump_next, |
9688 | | dpif_netdev_ipf_dump_done, |
9689 | | dpif_netdev_meter_get_features, |
9690 | | dpif_netdev_meter_set, |
9691 | | dpif_netdev_meter_get, |
9692 | | dpif_netdev_meter_del, |
9693 | | dpif_netdev_bond_add, |
9694 | | dpif_netdev_bond_del, |
9695 | | dpif_netdev_bond_stats_get, |
9696 | | NULL, /* cache_get_supported_levels */ |
9697 | | NULL, /* cache_get_name */ |
9698 | | NULL, /* cache_get_size */ |
9699 | | NULL, /* cache_set_size */ |
9700 | | }; |
9701 | | |
9702 | | static void |
9703 | | dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED, |
9704 | | const char *argv[], void *aux OVS_UNUSED) |
9705 | 0 | { |
9706 | 0 | struct dp_netdev_port *port; |
9707 | 0 | struct dp_netdev *dp; |
9708 | 0 | odp_port_t port_no; |
9709 | |
|
9710 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
9711 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
9712 | 0 | if (!dp || !dpif_netdev_class_is_dummy(dp->class)) { |
9713 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
9714 | 0 | unixctl_command_reply_error(conn, "unknown datapath or not a dummy"); |
9715 | 0 | return; |
9716 | 0 | } |
9717 | 0 | ovs_refcount_ref(&dp->ref_cnt); |
9718 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
9719 | |
|
9720 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
9721 | 0 | if (get_port_by_name(dp, argv[2], &port)) { |
9722 | 0 | unixctl_command_reply_error(conn, "unknown port"); |
9723 | 0 | goto exit; |
9724 | 0 | } |
9725 | | |
9726 | 0 | port_no = u32_to_odp(atoi(argv[3])); |
9727 | 0 | if (!port_no || port_no == ODPP_NONE) { |
9728 | 0 | unixctl_command_reply_error(conn, "bad port number"); |
9729 | 0 | goto exit; |
9730 | 0 | } |
9731 | 0 | if (dp_netdev_lookup_port(dp, port_no)) { |
9732 | 0 | unixctl_command_reply_error(conn, "port number already in use"); |
9733 | 0 | goto exit; |
9734 | 0 | } |
9735 | | |
9736 | | /* Remove port. */ |
9737 | 0 | hmap_remove(&dp->ports, &port->node); |
9738 | 0 | reconfigure_datapath(dp); |
9739 | | |
9740 | | /* Reinsert with new port number. */ |
9741 | 0 | port->port_no = port_no; |
9742 | 0 | hmap_insert(&dp->ports, &port->node, hash_port_no(port_no)); |
9743 | 0 | reconfigure_datapath(dp); |
9744 | |
|
9745 | 0 | seq_change(dp->port_seq); |
9746 | 0 | unixctl_command_reply(conn, NULL); |
9747 | |
|
9748 | 0 | exit: |
9749 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
9750 | 0 | dp_netdev_unref(dp); |
9751 | 0 | } |
9752 | | |
9753 | | static void |
9754 | | dpif_dummy_register__(const char *type) |
9755 | 0 | { |
9756 | 0 | struct dpif_class *class; |
9757 | |
|
9758 | 0 | class = xmalloc(sizeof *class); |
9759 | 0 | *class = dpif_netdev_class; |
9760 | 0 | class->type = xstrdup(type); |
9761 | 0 | dp_register_provider(class); |
9762 | 0 | } |
9763 | | |
9764 | | static void |
9765 | | dpif_dummy_override(const char *type) |
9766 | 0 | { |
9767 | 0 | int error; |
9768 | | |
9769 | | /* |
9770 | | * Ignore EAFNOSUPPORT to allow --enable-dummy=system with |
9771 | |      * a userland-only build.  It's useful for the testsuite.
9772 | | */ |
9773 | 0 | error = dp_unregister_provider(type); |
9774 | 0 | if (error == 0 || error == EAFNOSUPPORT) { |
9775 | 0 | dpif_dummy_register__(type); |
9776 | 0 | } |
9777 | 0 | } |
9778 | | |
9779 | | void |
9780 | | dpif_dummy_register(enum dummy_level level) |
9781 | 0 | { |
9782 | 0 | if (level == DUMMY_OVERRIDE_ALL) { |
9783 | 0 | struct sset types; |
9784 | 0 | const char *type; |
9785 | |
|
9786 | 0 | sset_init(&types); |
9787 | 0 | dp_enumerate_types(&types); |
9788 | 0 | SSET_FOR_EACH (type, &types) { |
9789 | 0 | dpif_dummy_override(type); |
9790 | 0 | } |
9791 | 0 | sset_destroy(&types); |
9792 | 0 | } else if (level == DUMMY_OVERRIDE_SYSTEM) { |
9793 | 0 | dpif_dummy_override("system"); |
9794 | 0 | } |
9795 | |
|
9796 | 0 | dpif_dummy_register__("dummy"); |
9797 | |
|
9798 | 0 | unixctl_command_register("dpif-dummy/change-port-number", |
9799 | 0 | "dp port new-number", |
9800 | 0 | 3, 3, dpif_dummy_change_port_number, NULL); |
9801 | 0 | } |
9802 | | |
9803 | | /* Datapath Classifier. */ |
9804 | | |
9805 | | static void |
9806 | | dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable) |
9807 | 0 | { |
9808 | 0 | cmap_destroy(&subtable->rules); |
9809 | 0 | ovsrcu_postpone(free, subtable->mf_masks); |
9810 | 0 | ovsrcu_postpone(free, subtable); |
9811 | 0 | } |
9812 | | |
9813 | | /* Initializes 'cls' as a classifier that initially contains no classification |
9814 | | * rules. */ |
9815 | | static void |
9816 | | dpcls_init(struct dpcls *cls) |
9817 | 0 | { |
9818 | 0 | cmap_init(&cls->subtables_map); |
9819 | 0 | pvector_init(&cls->subtables); |
9820 | 0 | } |
9821 | | |
9822 | | static void |
9823 | | dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable) |
9824 | 0 | { |
9825 | 0 | VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port); |
9826 | 0 | pvector_remove(&cls->subtables, subtable); |
9827 | 0 | cmap_remove(&cls->subtables_map, &subtable->cmap_node, |
9828 | 0 | subtable->mask.hash); |
9829 | 0 | dpcls_info_dec_usage(subtable->lookup_func_info); |
9830 | 0 | ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable); |
9831 | 0 | } |
9832 | | |
9833 | | /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the |
9834 | | * caller's responsibility. |
9835 | | * May only be called after all the readers have been terminated. */ |
9836 | | static void |
9837 | | dpcls_destroy(struct dpcls *cls) |
9838 | 0 | { |
9839 | 0 | if (cls) { |
9840 | 0 | struct dpcls_subtable *subtable; |
9841 | |
|
9842 | 0 | CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) { |
9843 | 0 | ovs_assert(cmap_count(&subtable->rules) == 0); |
9844 | 0 | dpcls_destroy_subtable(cls, subtable); |
9845 | 0 | } |
9846 | 0 | cmap_destroy(&cls->subtables_map); |
9847 | 0 | pvector_destroy(&cls->subtables); |
9848 | 0 | } |
9849 | 0 | } |
9850 | | |
9851 | | static struct dpcls_subtable * |
9852 | | dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask) |
9853 | 0 | { |
9854 | 0 | struct dpcls_subtable *subtable; |
9855 | | |
9856 | | /* Need to add one. */ |
9857 | 0 | subtable = xmalloc(sizeof *subtable |
9858 | 0 | - sizeof subtable->mask.mf + mask->len); |
9859 | 0 | cmap_init(&subtable->rules); |
9860 | 0 | subtable->hit_cnt = 0; |
9861 | 0 | netdev_flow_key_clone(&subtable->mask, mask); |
9862 | | |
9863 | | /* The count of bits in the mask defines the space required for masks. |
9864 | | * Then call gen_masks() to create the appropriate masks, avoiding the cost |
9865 | | * of doing runtime calculations. */ |
9866 | 0 | uint32_t unit0 = count_1bits(mask->mf.map.bits[0]); |
9867 | 0 | uint32_t unit1 = count_1bits(mask->mf.map.bits[1]); |
9868 | 0 | subtable->mf_bits_set_unit0 = unit0; |
9869 | 0 | subtable->mf_bits_set_unit1 = unit1; |
9870 | 0 | subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1)); |
9871 | 0 | dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1); |
9872 | | |
9873 | | /* Get the preferred subtable search function for this (u0,u1) subtable. |
9874 | |      * The function is guaranteed to always return a valid implementation,
9875 | |      * possibly an ISA-optimized and/or specialized one.  Initialize
9876 | | * the subtable search function atomically to avoid garbage data being read |
9877 | | * by the PMD thread. |
9878 | | */ |
9879 | 0 | atomic_init(&subtable->lookup_func, |
9880 | 0 | dpcls_subtable_get_best_impl(unit0, unit1, |
9881 | 0 | &subtable->lookup_func_info)); |
9882 | 0 | dpcls_info_inc_usage(subtable->lookup_func_info); |
9883 | |
|
9884 | 0 | cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash); |
9885 | | /* Add the new subtable at the end of the pvector (with no hits yet) */ |
9886 | 0 | pvector_insert(&cls->subtables, subtable, 0); |
9887 | 0 | VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d", |
9888 | 0 | cmap_count(&cls->subtables_map), subtable, cls->in_port); |
9889 | 0 | pvector_publish(&cls->subtables); |
9890 | |
|
9891 | 0 | return subtable; |
9892 | 0 | } |
9893 | | |
9894 | | static inline struct dpcls_subtable * |
9895 | | dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask) |
9896 | 0 | { |
9897 | 0 | struct dpcls_subtable *subtable; |
9898 | |
|
9899 | 0 | CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash, |
9900 | 0 | &cls->subtables_map) { |
9901 | 0 | if (netdev_flow_key_equal(&subtable->mask, mask)) { |
9902 | 0 | return subtable; |
9903 | 0 | } |
9904 | 0 | } |
9905 | 0 | return dpcls_create_subtable(cls, mask); |
9906 | 0 | } |
9907 | | |
9908 | | /* Checks for the best available implementation for each subtable lookup |
9909 | | * function, and assigns it as the lookup function pointer for each subtable. |
9910 | | * Returns the number of subtables that have changed lookup implementation. |
9911 | | * This function requires holding a flow_mutex when called. This is to make |
9912 | | * sure modifications done by this function are not overwritten. This could |
9913 | | * happen if dpcls_sort_subtable_vector() is called at the same time as this |
9914 | | * function. |
9915 | | */ |
9916 | | static uint32_t |
9917 | | dpcls_subtable_lookup_reprobe(struct dpcls *cls) |
9918 | 0 | { |
9919 | 0 | struct pvector *pvec = &cls->subtables; |
9920 | 0 | uint32_t subtables_changed = 0; |
9921 | 0 | struct dpcls_subtable *subtable = NULL; |
9922 | |
|
9923 | 0 | PVECTOR_FOR_EACH (subtable, pvec) { |
9924 | 0 | uint32_t u0_bits = subtable->mf_bits_set_unit0; |
9925 | 0 | uint32_t u1_bits = subtable->mf_bits_set_unit1; |
9926 | 0 | void *old_func = subtable->lookup_func; |
9927 | 0 | struct dpcls_subtable_lookup_info_t *old_info; |
9928 | 0 | old_info = subtable->lookup_func_info; |
9929 | | /* Set the subtable lookup function atomically to avoid garbage data |
9930 | | * being read by the PMD thread. */ |
9931 | 0 | atomic_store_relaxed(&subtable->lookup_func, |
9932 | 0 | dpcls_subtable_get_best_impl(u0_bits, u1_bits, |
9933 | 0 | &subtable->lookup_func_info)); |
9934 | 0 | if (old_func != subtable->lookup_func) { |
9935 | 0 | subtables_changed += 1; |
9936 | 0 | } |
9937 | |
|
9938 | 0 | if (old_info != subtable->lookup_func_info) { |
9939 | | /* In theory, functions can be shared between implementations, so |
9940 | | * do an explicit check on the function info structures. */ |
9941 | 0 | dpcls_info_dec_usage(old_info); |
9942 | 0 | dpcls_info_inc_usage(subtable->lookup_func_info); |
9943 | 0 | } |
9944 | 0 | } |
9945 | |
|
9946 | 0 | return subtables_changed; |
9947 | 0 | } |
9948 | | |
9949 | | /* Periodically sort the dpcls subtable vectors according to hit counts */ |
9950 | | static void |
9951 | | dpcls_sort_subtable_vector(struct dpcls *cls) |
9952 | 0 | { |
9953 | 0 | struct pvector *pvec = &cls->subtables; |
9954 | 0 | struct dpcls_subtable *subtable; |
9955 | |
|
9956 | 0 | PVECTOR_FOR_EACH (subtable, pvec) { |
9957 | 0 | pvector_change_priority(pvec, subtable, subtable->hit_cnt); |
9958 | 0 | subtable->hit_cnt = 0; |
9959 | 0 | } |
9960 | 0 | pvector_publish(pvec); |
9961 | 0 | } |
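/* Illustrative sketch (not part of dpif-netdev.c) of the periodic re-ranking
 * above: subtables are ordered by the hits they took in the last interval and
 * the counters are reset so the next interval starts fresh.  qsort() stands
 * in for the pvector priority update; all sketch_* names are assumptions. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct sketch_subtable {
    const char *name;
    uint64_t hit_cnt;
};

static int
sketch_cmp_hits_desc(const void *a_, const void *b_)
{
    const struct sketch_subtable *a = a_;
    const struct sketch_subtable *b = b_;

    return (a->hit_cnt < b->hit_cnt) - (a->hit_cnt > b->hit_cnt);
}

int
main(void)
{
    struct sketch_subtable subtables[] = {
        { "eth+ipv4", 10 }, { "eth+ipv6", 250 }, { "eth only", 40 },
    };

    qsort(subtables, 3, sizeof subtables[0], sketch_cmp_hits_desc);
    for (int i = 0; i < 3; i++) {
        printf("%d: %s (%" PRIu64 " hits)\n",
               i, subtables[i].name, subtables[i].hit_cnt);
        subtables[i].hit_cnt = 0;   /* Reset for the next interval. */
    }
    return 0;
}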
9962 | | |
9963 | | static inline void |
9964 | | dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, |
9965 | | struct polled_queue *poll_list, int poll_cnt) |
9966 | 0 | { |
9967 | 0 | struct dpcls *cls; |
9968 | 0 | uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0; |
9969 | 0 | unsigned int pmd_load = 0; |
9970 | |
|
9971 | 0 | if (pmd->ctx.now > pmd->next_cycle_store) { |
9972 | 0 | uint64_t curr_tsc; |
9973 | 0 | uint8_t rebalance_load_trigger; |
9974 | 0 | struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb; |
9975 | 0 | unsigned int idx; |
9976 | |
|
9977 | 0 | if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >= |
9978 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_IDLE] && |
9979 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >= |
9980 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) { |
9981 | 0 | tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] - |
9982 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_IDLE]; |
9983 | 0 | tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] - |
9984 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_BUSY]; |
9985 | 0 | tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] - |
9986 | 0 | pmd->prev_stats[PMD_CYCLES_SLEEP]; |
9987 | |
|
9988 | 0 | if (pmd_alb->is_enabled && !pmd->isolated) { |
9989 | 0 | if (tot_proc) { |
9990 | 0 | pmd_load = ((tot_proc * 100) / |
9991 | 0 | (tot_idle + tot_proc + tot_sleep)); |
9992 | 0 | } |
9993 | |
|
9994 | 0 | atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, |
9995 | 0 | &rebalance_load_trigger); |
9996 | 0 | if (pmd_load >= rebalance_load_trigger) { |
9997 | 0 | atomic_count_inc(&pmd->pmd_overloaded); |
9998 | 0 | } else { |
9999 | 0 | atomic_count_set(&pmd->pmd_overloaded, 0); |
10000 | 0 | } |
10001 | 0 | } |
10002 | 0 | } |
10003 | |
|
10004 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_IDLE] = |
10005 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE]; |
10006 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_BUSY] = |
10007 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY]; |
10008 | 0 | pmd->prev_stats[PMD_CYCLES_SLEEP] = |
10009 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP]; |
10010 | | |
10011 | |         /* Get the cycles that were used to process each queue and store them. */
10012 | 0 | for (unsigned i = 0; i < poll_cnt; i++) { |
10013 | 0 | uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq, |
10014 | 0 | RXQ_CYCLES_PROC_CURR); |
10015 | 0 | dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr); |
10016 | 0 | dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, |
10017 | 0 | 0); |
10018 | 0 | } |
10019 | 0 | curr_tsc = cycles_counter_update(&pmd->perf_stats); |
10020 | 0 | if (pmd->intrvl_tsc_prev) { |
10021 | | /* There is a prev timestamp, store a new intrvl cycle count. */ |
10022 | 0 | atomic_store_relaxed(&pmd->intrvl_cycles, |
10023 | 0 | curr_tsc - pmd->intrvl_tsc_prev); |
10024 | 0 | } |
10025 | 0 | idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX; |
10026 | 0 | atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc); |
10027 | 0 | pmd->intrvl_tsc_prev = curr_tsc; |
10028 | | /* Start new measuring interval */ |
10029 | 0 | pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN; |
10030 | 0 | } |
10031 | |
|
10032 | 0 | if (pmd->ctx.now > pmd->next_optimization) { |
10033 | | /* Try to obtain the flow lock to block out revalidator threads. |
10034 | | * If not possible, just try next time. */ |
10035 | 0 | if (!ovs_mutex_trylock(&pmd->flow_mutex)) { |
10036 | | /* Optimize each classifier */ |
10037 | 0 | CMAP_FOR_EACH (cls, node, &pmd->classifiers) { |
10038 | 0 | dpcls_sort_subtable_vector(cls); |
10039 | 0 | } |
10040 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
10041 | | /* Start new measuring interval */ |
10042 | 0 | pmd->next_optimization = pmd->ctx.now |
10043 | 0 | + DPCLS_OPTIMIZATION_INTERVAL; |
10044 | 0 | } |
10045 | 0 | } |
10046 | 0 | } |
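/* Illustrative sketch (not part of dpif-netdev.c) of the load check above:
 * the PMD load is the busy share of all measured cycles, and the thread
 * counts as overloaded when that share reaches the rebalance threshold.
 * The 95% trigger and the sketch_* names are assumptions here. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
sketch_pmd_overloaded(uint64_t busy, uint64_t idle, uint64_t sleep,
                      unsigned int threshold_pct)
{
    uint64_t total = busy + idle + sleep;
    unsigned int load = total ? (unsigned int) (busy * 100 / total) : 0;

    printf("pmd load = %u%%\n", load);
    return load >= threshold_pct;
}

int
main(void)
{
    /* 9600 busy cycles out of 10000 measured -> 96%, above a 95% trigger. */
    printf("overloaded: %d\n", sketch_pmd_overloaded(9600, 300, 100, 95));
    return 0;
}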
10047 | | |
10048 | | /* Returns the sum of a specified number of interval values, from newest
10049 | |  * to oldest.  'cur_idx' is where the next write will occur, so wrap-around
10050 | |  * needs to be handled.
10051 | | */ |
10052 | | static uint64_t |
10053 | | get_interval_values(atomic_ullong *source, atomic_count *cur_idx, |
10054 | 0 | int num_to_read) { |
10055 | 0 | unsigned int i; |
10056 | 0 | uint64_t total = 0; |
10057 | |
|
10058 | 0 | i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX; |
10059 | 0 | for (int read = 0; read < num_to_read; read++) { |
10060 | 0 | uint64_t interval_value; |
10061 | |
|
10062 | 0 | i = i ? i - 1 : PMD_INTERVAL_MAX - 1; |
10063 | 0 | atomic_read_relaxed(&source[i], &interval_value); |
10064 | 0 | total += interval_value; |
10065 | 0 | } |
10066 | 0 | return total; |
10067 | 0 | } |
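/* Illustrative sketch (not part of dpif-netdev.c) of the ring-buffer read
 * above: starting just before the next write position, walk backwards
 * 'num_to_read' slots, wrapping at the ring size, and sum the values.  The
 * ring size of 12 and the sketch_* names are assumptions. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_INTERVAL_MAX 12

static uint64_t
sketch_sum_newest(const uint64_t ring[SKETCH_INTERVAL_MAX],
                  unsigned int next_write_idx, int num_to_read)
{
    unsigned int i = next_write_idx % SKETCH_INTERVAL_MAX;
    uint64_t total = 0;

    for (int read = 0; read < num_to_read; read++) {
        i = i ? i - 1 : SKETCH_INTERVAL_MAX - 1;   /* Step back, wrapping. */
        total += ring[i];
    }
    return total;
}

int
main(void)
{
    uint64_t ring[SKETCH_INTERVAL_MAX] = { 5, 7, 9 };   /* Slots 0..2 used. */

    /* The next write would land in slot 3, so the two newest values are
     * ring[2] == 9 and ring[1] == 7. */
    printf("%" PRIu64 "\n", sketch_sum_newest(ring, 3, 2));   /* Prints 16. */
    return 0;
}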
10068 | | |
10069 | | /* Insert 'rule' into 'cls'. */ |
10070 | | static void |
10071 | | dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule, |
10072 | | const struct netdev_flow_key *mask) |
10073 | 0 | { |
10074 | 0 | struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask); |
10075 | | |
10076 | | /* Refer to subtable's mask, also for later removal. */ |
10077 | 0 | rule->mask = &subtable->mask; |
10078 | 0 | cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash); |
10079 | 0 | } |
10080 | | |
10081 | | /* Removes 'rule' from 'cls', also destructing the 'rule'. */ |
10082 | | static void |
10083 | | dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule) |
10084 | 0 | { |
10085 | 0 | struct dpcls_subtable *subtable; |
10086 | |
|
10087 | 0 | ovs_assert(rule->mask); |
10088 | | |
10089 | | /* Get subtable from reference in rule->mask. */ |
10090 | 0 | INIT_CONTAINER(subtable, rule->mask, mask); |
10091 | 0 | if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash) |
10092 | 0 | == 0) { |
10093 | | /* Delete empty subtable. */ |
10094 | 0 | dpcls_destroy_subtable(cls, subtable); |
10095 | 0 | pvector_publish(&cls->subtables); |
10096 | 0 | } |
10097 | 0 | } |
10098 | | |
10099 | | /* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks. */ |
10100 | | static inline void |
10101 | | dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count, |
10102 | | uint64_t *mf_masks) |
10103 | 0 | { |
10104 | 0 | int i; |
10105 | 0 | for (i = 0; i < count; i++) { |
10106 | 0 | uint64_t lowest_bit = (iter & -iter); |
10107 | 0 | iter &= ~lowest_bit; |
10108 | 0 | mf_masks[i] = (lowest_bit - 1); |
10109 | 0 | } |
10110 | | /* Checks that count has covered all bits in the iter bitmap. */ |
10111 | 0 | ovs_assert(iter == 0); |
10112 | 0 | } |
10113 | | |
10114 | | /* Generate a mask for each block in the miniflow, based on the bits set. This |
10115 | | * allows easily masking packets with the generated array here, without |
10116 | | * calculations. This replaces runtime-calculating the masks. |
10117 | |  * @param tbl        The table to generate the mf_masks for
10118 | |  * @param mf_masks   Pointer to a u64 array of at least (mf_bits_u0 + mf_bits_u1) in size
10119 | |  * @param mf_bits_u0 Number of bits set in unit0 of the miniflow
10120 | |  * @param mf_bits_u1 Number of bits set in unit1 of the miniflow
10121 | | */ |
10122 | | void |
10123 | | dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl, |
10124 | | uint64_t *mf_masks, |
10125 | | const uint32_t mf_bits_u0, |
10126 | | const uint32_t mf_bits_u1) |
10127 | 0 | { |
10128 | 0 | uint64_t iter_u0 = tbl->mf.map.bits[0]; |
10129 | 0 | uint64_t iter_u1 = tbl->mf.map.bits[1]; |
10130 | |
|
10131 | 0 | dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]); |
10132 | 0 | dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]); |
10133 | 0 | } |
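/* Illustrative sketch (not part of dpif-netdev.c) of the mask generation
 * above for a single unit: for each set bit in the miniflow bitmap, the
 * generated mask is (lowest_bit - 1), i.e. all bits below that position, so
 * blocks can later be masked without recomputing positions at lookup time.
 * The sketch_* names are illustrative only. */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void
sketch_gen_mask_unit(uint64_t iter, int count, uint64_t *mf_masks)
{
    for (int i = 0; i < count; i++) {
        uint64_t lowest_bit = iter & -iter;   /* Isolate the lowest set bit. */

        iter &= ~lowest_bit;
        mf_masks[i] = lowest_bit - 1;         /* All bits below it. */
    }
    assert(iter == 0);                        /* 'count' covered every bit. */
}

int
main(void)
{
    /* Bits 1, 4 and 9 set: expect masks 0x1, 0xf and 0x1ff. */
    uint64_t map = (UINT64_C(1) << 1) | (UINT64_C(1) << 4) | (UINT64_C(1) << 9);
    uint64_t masks[3];

    sketch_gen_mask_unit(map, 3, masks);
    for (int i = 0; i < 3; i++) {
        printf("mask[%d] = %#" PRIx64 "\n", i, masks[i]);
    }
    return 0;
}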
10134 | | |
10135 | | /* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit in
10136 | |  * the rule's mask the values in the rule's flow and in 'target' are the same. */
10137 | | inline bool |
10138 | | dpcls_rule_matches_key(const struct dpcls_rule *rule, |
10139 | | const struct netdev_flow_key *target) |
10140 | 0 | { |
10141 | 0 | const uint64_t *keyp = miniflow_get_values(&rule->flow.mf); |
10142 | 0 | const uint64_t *maskp = miniflow_get_values(&rule->mask->mf); |
10143 | 0 | uint64_t value; |
10144 | |
|
10145 | 0 | NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) { |
10146 | 0 | if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) { |
10147 | 0 | return false; |
10148 | 0 | } |
10149 | 0 | } |
10150 | 0 | return true; |
10151 | 0 | } |
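/* Illustrative sketch (not part of dpif-netdev.c) of the per-block test
 * above: the rule's value is stored pre-masked, so a block matches when
 * (target & mask) equals the stored value.  The sketch_* names and the
 * example constants are assumptions. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
sketch_block_matches(uint64_t target, uint64_t rule_value, uint64_t rule_mask)
{
    return (target & rule_mask) == rule_value;
}

int
main(void)
{
    uint64_t mask = 0xff00;         /* Only bits 8..15 are significant. */
    uint64_t value = 0x1200;        /* Pre-masked rule value. */

    printf("%d\n", sketch_block_matches(0x12ab, value, mask));   /* 1: match */
    printf("%d\n", sketch_block_matches(0x34ab, value, mask));   /* 0: no match */
    return 0;
}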
10152 | | |
10153 | | /* For each miniflow in 'keys' performs a classifier lookup writing the result |
10154 | | * into the corresponding slot in 'rules'. If a particular entry in 'keys' is |
10155 | | * NULL it is skipped. |
10156 | | * |
10157 | | * This function is optimized for use in the userspace datapath and therefore |
10158 | | * does not implement a lot of features available in the standard |
10159 | | * classifier_lookup() function. Specifically, it does not implement |
10160 | | * priorities, instead returning any rule which matches the flow. |
10161 | | * |
10162 | | * Returns true if all miniflows found a corresponding rule. */ |
10163 | | bool |
10164 | | dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[], |
10165 | | struct dpcls_rule **rules, const size_t cnt, |
10166 | | int *num_lookups_p) |
10167 | 0 | { |
10168 | | /* The received 'cnt' miniflows are the search-keys that will be processed |
10169 | | * to find a matching entry into the available subtables. |
10170 | | * The number of bits in map_type is equal to NETDEV_MAX_BURST. */ |
10171 | 0 | #define MAP_BITS (sizeof(uint32_t) * CHAR_BIT) |
10172 | 0 | BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST); |
10173 | |
|
10174 | 0 | struct dpcls_subtable *subtable; |
10175 | 0 | uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */ |
10176 | |
|
10177 | 0 | if (cnt != MAP_BITS) { |
10178 | 0 | keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */ |
10179 | 0 | } |
10180 | 0 | memset(rules, 0, cnt * sizeof *rules); |
10181 | |
|
10182 | 0 | int lookups_match = 0, subtable_pos = 1; |
10183 | 0 | uint32_t found_map; |
10184 | | |
10185 | | /* The Datapath classifier - aka dpcls - is composed of subtables. |
10186 | | * Subtables are dynamically created as needed when new rules are inserted. |
10187 | | * Each subtable collects rules with matches on a specific subset of packet |
10188 | | * fields as defined by the subtable's mask. We proceed to process every |
10189 | | * search-key against each subtable, but when a match is found for a |
10190 | | * search-key, the search for that key can stop because the rules are |
10191 | | * non-overlapping. */ |
10192 | 0 | PVECTOR_FOR_EACH (subtable, &cls->subtables) { |
10193 | | /* Call the subtable specific lookup function. */ |
10194 | 0 | found_map = subtable->lookup_func(subtable, keys_map, keys, rules); |
10195 | | |
10196 | | /* Count the number of subtables searched for this packet match. This |
10197 | | * estimates the "spread" of subtables looked at per matched packet. */ |
10198 | 0 | uint32_t pkts_matched = count_1bits(found_map); |
10199 | 0 | lookups_match += pkts_matched * subtable_pos; |
10200 | | |
10201 | | /* Clear the found rules, and return early if all packets are found. */ |
10202 | 0 | keys_map &= ~found_map; |
10203 | 0 | if (!keys_map) { |
10204 | 0 | if (num_lookups_p) { |
10205 | 0 | *num_lookups_p = lookups_match; |
10206 | 0 | } |
10207 | 0 | return true; |
10208 | 0 | } |
10209 | 0 | subtable_pos++; |
10210 | 0 | } |
10211 | | |
10212 | 0 | if (num_lookups_p) { |
10213 | 0 | *num_lookups_p = lookups_match; |
10214 | 0 | } |
10215 | 0 | return false; |
10216 | 0 | } |
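/* Illustrative sketch (not part of dpif-netdev.c) of the bitmap bookkeeping
 * in the lookup above: one bit per search key, bits cleared as subtables
 * report matches, and an early exit once every key has been matched.  The
 * two stand-in "subtables" and all sketch_* names are assumptions. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t (*sketch_subtable_lookup_fn)(uint32_t keys_map);

static uint32_t
sketch_subtable_a(uint32_t keys_map)
{
    return keys_map & 0x5;          /* Pretend keys 0 and 2 match here. */
}

static uint32_t
sketch_subtable_b(uint32_t keys_map)
{
    return keys_map & 0xa;          /* Pretend keys 1 and 3 match here. */
}

int
main(void)
{
    sketch_subtable_lookup_fn subtables[] = { sketch_subtable_a,
                                              sketch_subtable_b };
    uint32_t keys_map = 0xf;        /* Four keys still need a match. */

    for (int i = 0; i < 2 && keys_map; i++) {
        uint32_t found_map = subtables[i](keys_map);

        keys_map &= ~found_map;     /* Stop searching matched keys. */
        printf("after subtable %d: keys_map = %#" PRIx32 "\n", i, keys_map);
    }
    printf("all matched: %s\n", keys_map ? "no" : "yes");
    return 0;
}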