/src/openvswitch/lib/dpif-netdev.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc. |
3 | | * |
4 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | * you may not use this file except in compliance with the License. |
6 | | * You may obtain a copy of the License at: |
7 | | * |
8 | | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | | * |
10 | | * Unless required by applicable law or agreed to in writing, software |
11 | | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | * See the License for the specific language governing permissions and |
14 | | * limitations under the License. |
15 | | */ |
16 | | |
17 | | #include <config.h> |
18 | | #include "dpif-netdev.h" |
19 | | |
20 | | #include <ctype.h> |
21 | | #include <errno.h> |
22 | | #include <fcntl.h> |
23 | | #include <inttypes.h> |
24 | | #include <net/if.h> |
25 | | #include <sys/types.h> |
26 | | #include <netinet/in.h> |
27 | | #include <stdint.h> |
28 | | #include <stdlib.h> |
29 | | #include <string.h> |
30 | | #include <sys/ioctl.h> |
31 | | #include <sys/socket.h> |
32 | | #include <sys/stat.h> |
33 | | #include <unistd.h> |
34 | | |
35 | | #include "bitmap.h" |
36 | | #include "ccmap.h" |
37 | | #include "cmap.h" |
38 | | #include "conntrack.h" |
39 | | #include "conntrack-tp.h" |
40 | | #include "coverage.h" |
41 | | #include "ct-dpif.h" |
42 | | #include "csum.h" |
43 | | #include "dp-packet.h" |
44 | | #include "dpif.h" |
45 | | #include "dpif-netdev-dfc.h" |
46 | | #include "dpif-netdev-dpcls.h" |
47 | | #include "dpif-netdev-flow.h" |
48 | | #include "dpif-netdev-perf.h" |
49 | | #include "dpif-netdev-thread.h" |
50 | | #include "dpif-offload.h" |
51 | | #include "dpif-provider.h" |
52 | | #include "dummy.h" |
53 | | #include "fat-rwlock.h" |
54 | | #include "flow.h" |
55 | | #include "hmapx.h" |
56 | | #include "id-fpool.h" |
57 | | #include "id-pool.h" |
58 | | #include "ipf.h" |
59 | | #include "mov-avg.h" |
60 | | #include "mpsc-queue.h" |
61 | | #include "netdev.h" |
62 | | #include "netdev-provider.h" |
63 | | #include "netdev-vport.h" |
64 | | #include "netlink.h" |
65 | | #include "odp-execute.h" |
66 | | #include "odp-util.h" |
67 | | #include "openvswitch/dynamic-string.h" |
68 | | #include "openvswitch/list.h" |
69 | | #include "openvswitch/match.h" |
70 | | #include "openvswitch/ofp-parse.h" |
71 | | #include "openvswitch/ofp-print.h" |
72 | | #include "openvswitch/ofpbuf.h" |
73 | | #include "openvswitch/shash.h" |
74 | | #include "openvswitch/vlog.h" |
75 | | #include "ovs-numa.h" |
76 | | #include "ovs-rcu.h" |
77 | | #include "packets.h" |
78 | | #include "openvswitch/poll-loop.h" |
79 | | #include "pvector.h" |
80 | | #include "random.h" |
81 | | #include "seq.h" |
82 | | #include "smap.h" |
83 | | #include "sset.h" |
84 | | #include "timeval.h" |
85 | | #include "tnl-neigh-cache.h" |
86 | | #include "tnl-ports.h" |
87 | | #include "unixctl.h" |
88 | | #include "util.h" |
89 | | #include "uuid.h" |
90 | | |
91 | | VLOG_DEFINE_THIS_MODULE(dpif_netdev); |
92 | | |
/* Auto Load Balancing Defaults */
/* NOTE(review): the two thresholds look like percentages; the code that
 * consumes them is not in view -- confirm. */
#define ALB_IMPROVEMENT_THRESHOLD 25
#define ALB_LOAD_THRESHOLD 95
#define ALB_REBALANCE_INTERVAL 1 /* 1 Min */
#define MAX_ALB_REBALANCE_INTERVAL 20000 /* 20000 Min */
#define MIN_TO_MSEC 60000 /* Milliseconds in one minute. */

/* NOTE(review): presumably the largest number of flows handled per dump
 * batch; the user of this constant is not in view -- confirm. */
#define FLOW_DUMP_MAX_BATCH 50
/* Use per thread recirc_depth to prevent recirculation loop. */
#define MAX_RECIRC_DEPTH 8
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)

/* Use instant packet send by default. */
#define DEFAULT_TX_FLUSH_INTERVAL 0

/* Configuration parameters. */
enum { MAX_METERS = 1 << 18 }; /* Maximum number of meters. */
enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
111 | | |
/* Coverage counters defined by this module.  Going by their names, each one
 * tracks a distinct reason for dropping a packet in the datapath.
 * NOTE(review): the increment sites are not in view -- confirm. */
COVERAGE_DEFINE(datapath_drop_meter);
COVERAGE_DEFINE(datapath_drop_upcall_error);
COVERAGE_DEFINE(datapath_drop_lock_error);
COVERAGE_DEFINE(datapath_drop_userspace_action_error);
COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
COVERAGE_DEFINE(datapath_drop_recirc_error);
COVERAGE_DEFINE(datapath_drop_invalid_port);
COVERAGE_DEFINE(datapath_drop_invalid_bond);
COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
COVERAGE_DEFINE(datapath_drop_hw_post_process);
COVERAGE_DEFINE(datapath_drop_hw_post_process_consumed);
125 | | |
/* Protects against changes to 'dp_netdevs'. */
static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dp_netdev's.  Guarded by 'dp_netdev_mutex'. */
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
    = SHASH_INITIALIZER(&dp_netdevs);

/* Rate limiter for log messages.  NOTE(review): presumably used for
 * upcall-related messages, judging by the name; users are not in view. */
static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
134 | | |
/* Connection tracking state bits that this datapath accepts. */
#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
                                     | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
                                     | CS_SRC_NAT | CS_DST_NAT)
/* Every 32-bit state bit that is not part of the supported mask. */
#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)

/* Feature set advertised by this datapath: no limit on VLAN header count or
 * MPLS depth (SIZE_MAX), with recirculation and the listed conntrack fields
 * enabled. */
static struct odp_support dp_netdev_support = {
    .max_vlan_headers = SIZE_MAX,
    .max_mpls_depth = SIZE_MAX,
    .recirc = true,
    .ct_state = true,
    .ct_zone = true,
    .ct_mark = true,
    .ct_label = true,
    .ct_state_nat = true,
    .ct_orig_tuple = true,
    .ct_orig_tuple6 = true,
};
152 | | |
153 | | |
/* Simple non-wildcarding single-priority classifier. */

/* Time in microseconds between successive optimizations of the dpcls
 * subtable vector. */
#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL

/* Time in microseconds of the interval in which rxq processing cycles used
 * in rxq to pmd assignments is measured and stored. */
#define PMD_INTERVAL_LEN 5000000LL
/* For converting PMD_INTERVAL_LEN to secs. */
#define INTERVAL_USEC_TO_SEC 1000000LL

/* Number of intervals for which cycles are stored
 * and used during rxq to pmd assignment. */
#define PMD_INTERVAL_MAX 12

/* Time in microseconds to try RCU quiescing. */
#define PMD_RCU_QUIESCE_INTERVAL 10000LL

/* Timer resolution for PMD threads in nanoseconds. */
#define PMD_TIMER_RES_NS 1000

/* Number of pkts Rx on an interface that will stop pmd thread sleeping. */
#define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2)
/* Time in microseconds by which a pmd thread sleep time is incremented. */
#define PMD_SLEEP_INC_US 1
180 | | |
/* Associates a core id with a maximum sleep value.
 * NOTE(review): the unit of 'max_sleep' is not stated here; PMD_SLEEP_INC_US
 * suggests microseconds -- confirm. */
struct pmd_sleep {
    unsigned core_id;
    uint64_t max_sleep;
};
185 | | |
/* A classifier instance.  NOTE(review): 'in_port' suggests one classifier
 * per input port, and dp_netdev_pmd_lookup_dpcls() takes an 'in_port';
 * confirm against its definition. */
struct dpcls {
    struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
    odp_port_t in_port;
    struct cmap subtables_map;
    struct pvector subtables;
};
192 | | |
/* Data structure to keep packet order till fastpath processing.  Each entry
 * ties one packet to a flow and a set of TCP flags. */
struct dp_packet_flow_map {
    struct dp_packet *packet;
    struct dp_netdev_flow *flow;
    uint16_t tcp_flags;
};
199 | | |
/* Forward declarations of the file-local 'struct dpcls' operations; their
 * definitions are not in this part of the file. */
static void dpcls_init(struct dpcls *);
static void dpcls_destroy(struct dpcls *);
static void dpcls_sort_subtable_vector(struct dpcls *);
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
                         const struct netdev_flow_key *mask);
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
206 | | |
/* Set of supported meter flags (a bitwise OR of OFPMF13_* flags). */
#define DP_SUPPORTED_METER_FLAGS_MASK \
    (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)

/* Set of supported meter band types, as a bitmap indexed by band type; only
 * the OFPMBT13_DROP bit is set. */
#define DP_SUPPORTED_METER_BAND_TYPES \
    ( 1 << OFPMBT13_DROP )
214 | | |
/* One band of a meter: a rate and burst size, plus a bucket and per-band
 * packet/byte counters held in atomics.
 * NOTE(review): the units of 'rate' and 'burst_size' are not stated here;
 * they presumably follow the meter's PKTPS/KBPS flag -- confirm. */
struct dp_meter_band {
    uint32_t rate;
    uint32_t burst_size;
    atomic_uint64_t bucket; /* In 1/1000 packets for PKTPS,
                             * or in bits for KBPS. */
    atomic_uint64_t packet_count;
    atomic_uint64_t byte_count;
};
223 | | |
/* A meter: a cmap node, an id, flags, and aggregate counters, followed by a
 * flexible array of 'n_bands' bands.
 * NOTE(review): 'node' is presumably the membership in dp_netdev's 'meters'
 * cmap -- confirm. */
struct dp_meter {
    struct cmap_node node;
    uint32_t id;
    uint16_t flags;
    uint16_t n_bands;
    uint32_t max_delta_t;
    atomic_uint64_t used; /* Time of a last use in milliseconds. */
    atomic_uint64_t packet_count;
    atomic_uint64_t byte_count;
    struct dp_meter_band bands[]; /* Flexible array member; must be last. */
};
235 | | |
/* Auto load balancing state and configuration for a datapath.
 * NOTE(review): the interval fields presumably hold milliseconds (see
 * MIN_TO_MSEC) and the thresholds percentages -- confirm against users. */
struct pmd_auto_lb {
    bool do_dry_run;
    bool recheck_config;
    bool is_enabled; /* Current status of Auto load balancing. */
    uint64_t rebalance_intvl;
    uint64_t rebalance_poll_timer;
    uint8_t rebalance_improve_thresh;
    atomic_uint8_t rebalance_load_thresh;
};
245 | | |
/* Rxq to pmd assignment types (see 'pmd_rxq_assign_type' in
 * struct dp_netdev). */
enum sched_assignment_type {
    SCHED_ROUNDROBIN,
    SCHED_CYCLES, /* Default.*/
    SCHED_GROUP
};
251 | | |
/* Datapath based on the network device interface from netdev.h.
 *
 *
 * Thread-safety
 * =============
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below.
 *
 * Acquisition order is, from outermost to innermost:
 *
 *    dp_netdev_mutex (global)
 *    port_rwlock
 *    bond_mutex
 *    non_pmd_mutex
 */
struct dp_netdev {
    const struct dpif_class *const class;
    const char *const name;
    const char *const full_name;
    struct ovs_refcount ref_cnt;
    atomic_flag destroyed;

    /* Ports.
     *
     * Any lookup into 'ports' or any access to the dp_netdev_ports found
     * through 'ports' requires taking 'port_rwlock'. */
    struct ovs_rwlock port_rwlock;
    struct hmap ports;
    struct seq *port_seq; /* Incremented whenever a port changes. */

    /* The time that a packet can wait in output batch for sending. */
    atomic_uint32_t tx_flush_interval;

    /* Meters. */
    struct ovs_mutex meters_lock;
    struct cmap meters;

    /* Probability of EMC insertions is a factor of 'emc_insert_min'. */
    atomic_uint32_t emc_insert_min;
    /* Enable collection of PMD performance metrics. */
    atomic_bool pmd_perf_metrics;
    /* Default max load based sleep request. */
    uint64_t pmd_max_sleep_default;
    /* Enable the SMC cache from ovsdb config. */
    atomic_bool smc_enable_db;

    /* Protects access to ofproto-dpif-upcall interface during revalidator
     * thread synchronization. */
    struct fat_rwlock upcall_rwlock;
    upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
    void *upcall_aux;

    /* Callback function for notifying the purging of dp flows (during
     * resetting pmd deletion). */
    dp_purge_callback *dp_purge_cb;
    void *dp_purge_aux;

    /* Stores all 'struct dp_netdev_pmd_thread's. */
    struct cmap poll_threads;
    /* id pool for per thread static_tx_qid. */
    struct id_pool *tx_qid_pool;
    struct ovs_mutex tx_qid_pool_mutex;
    /* Rxq to pmd assignment type. */
    enum sched_assignment_type pmd_rxq_assign_type;
    bool pmd_iso;

    /* Protects the access of the 'struct dp_netdev_pmd_thread'
     * instance for non-pmd thread. */
    struct ovs_mutex non_pmd_mutex;

    /* Each pmd thread will store its pointer to
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
    ovsthread_key_t per_pmd_key;

    struct seq *reconfigure_seq;
    uint64_t last_reconfigure_seq;
    struct ovsthread_once once_set_config;

    /* CPU mask used for pinning pmd threads. */
    char *pmd_cmask;

    /* PMD max load based sleep request user string. */
    char *max_sleep_list;

    uint64_t last_tnl_conf_seq;

    struct conntrack *conntrack;
    struct pmd_auto_lb pmd_alb;

    /* Bonds. */
    struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
    struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
};
346 | | |
/* Forward declaration.  Callers must hold 'dp->port_rwlock' at least for
 * reading, as the OVS_REQ_RDLOCK annotation states. */
static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
                                                    odp_port_t)
    OVS_REQ_RDLOCK(dp->port_rwlock);
350 | | |
/* Indexes into the 'cycles' array of struct dp_netdev_rxq. */
enum rxq_cycles_counter_type {
    RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
                             processing packets during the current
                             interval. */
    RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
                             during rxq to pmd assignment. */
    RXQ_N_CYCLES          /* Number of counter types; used as array size. */
};
359 | | |
/* NOTE(review): presumably how long a tx queue id stays assigned before XPS
 * revalidation may release it; the user is not in view -- confirm. */
#define XPS_TIMEOUT 500000LL /* In microseconds. */
361 | | |
/* Contained by struct dp_netdev_port's 'rxqs' member.  Describes one receive
 * queue of a port along with its cycle accounting. */
struct dp_netdev_rxq {
    struct dp_netdev_port *port;
    struct netdev_rxq *rx;
    unsigned core_id; /* Core to which this queue should be
                         pinned. OVS_CORE_UNSPEC if the
                         queue doesn't need to be pinned to a
                         particular core. */
    atomic_count intrvl_idx; /* Write index for 'cycles_intrvl'. */
    struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
    bool is_vhost; /* Is rxq of a vhost port. */

    /* Counters of cycles spent successfully polling and processing pkts.
     * Indexed by enum rxq_cycles_counter_type. */
    atomic_ullong cycles[RXQ_N_CYCLES];
    /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then
       sum them to yield the cycles used for an rxq. */
    atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX];
};
380 | | |
/* Tx queue mode as requested for a port (see 'txq_requested_mode' in
 * struct dp_netdev_port). */
enum txq_req_mode {
    TXQ_REQ_MODE_THREAD,
    TXQ_REQ_MODE_HASH,
};

/* Tx queue mode in effect for a port (see 'txq_mode' in
 * struct dp_netdev_port): static, XPS or XPS_HASH. */
enum txq_mode {
    TXQ_MODE_STATIC,
    TXQ_MODE_XPS,
    TXQ_MODE_XPS_HASH,
};
391 | | |
/* A port in a netdev-based datapath.  Per the comment in struct dp_netdev,
 * ports reached through its 'ports' map must only be accessed while holding
 * 'port_rwlock'. */
struct dp_netdev_port {
    odp_port_t port_no;
    enum txq_mode txq_mode; /* static, XPS, XPS_HASH. */
    bool need_reconfigure; /* True if we should reconfigure netdev. */
    struct netdev *netdev;
    struct hmap_node node; /* Node in dp_netdev's 'ports'. */
    struct netdev_saved_flags *sf;
    struct dp_netdev_rxq *rxqs;
    unsigned n_rxq; /* Number of elements in 'rxqs' */
    unsigned *txq_used; /* Number of threads that use each tx queue. */
    struct ovs_mutex txq_used_mutex;
    bool emc_enabled; /* If true EMC will be used. */
    char *type; /* Port type as requested by user. */
    char *rxq_affinity_list; /* Requested affinity of rx queues. */
    enum txq_req_mode txq_requested_mode;
};
409 | | |
/* Forward declarations for flow and action helpers; their definitions are
 * not in this part of the file. */
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
                                         struct flow *, bool);

/* These two have external linkage, unlike the 'static' helpers above. */
struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
                                                   size_t);
struct dp_netdev_actions *dp_netdev_flow_get_actions(
    const struct dp_netdev_flow *);
static void dp_netdev_actions_free(struct dp_netdev_actions *);
419 | | |
/* Per-rxq entry of a poll list (see dp_netdev_pmd_try_optimize(), which
 * takes an array of these).
 * NOTE(review): the fields look like a snapshot of port/rxq state taken for
 * the polling loop; the code that fills them is not in view -- confirm. */
struct polled_queue {
    struct dp_netdev_rxq *rxq;
    odp_port_t port_no;
    bool emc_enabled;
    bool rxq_enabled;
    uint64_t change_seq;
};
427 | | |
/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member.  Links one
 * rxq into that hmap through 'node'. */
struct rxq_poll {
    struct dp_netdev_rxq *rxq;
    struct hmap_node node;
};
433 | | |
/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
 * 'tnl_port_cache' or 'tx_ports'.
 * NOTE(review): 'last_used' and 'flush_time' are presumably timestamps in the
 * same unit as 'pmd->ctx.now' -- confirm against their users. */
struct tx_port {
    struct dp_netdev_port *port;
    int qid;
    long long last_used;
    struct hmap_node node;
    long long flush_time;
    struct dp_packet_batch output_pkts;
    struct dp_packet_batch *txq_pkts; /* Only for hash mode. */
    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
};
446 | | |
/* Contained by struct tx_bond 'member_buckets'.  One bucket: a member port
 * id plus atomic packet and byte counters. */
struct member_entry {
    odp_port_t member_id;
    atomic_ullong n_packets;
    atomic_ullong n_bytes;
};
453 | | |
/* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'.  Struct dp_netdev's
 * 'tx_bonds' cmap is also documented as containing 'struct tx_bond'. */
struct tx_bond {
    struct cmap_node node;
    uint32_t bond_id;
    struct member_entry member_buckets[BOND_BUCKETS];
};
460 | | |
/* Interface to netdev-based datapath.  Embeds the generic 'struct dpif' so
 * that dpif_netdev_cast() can recover this structure from a dpif pointer. */
struct dpif_netdev {
    struct dpif dpif;
    struct dp_netdev *dp;
    uint64_t last_port_seq;
};
467 | | |
/* Forward declarations: port lookup and lifecycle, datapath open/free, and
 * packet processing entry points.  The OVS_REQ_* / OVS_REQUIRES annotations
 * state which lock the caller must already hold. */
static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
                              struct dp_netdev_port **portp)
    OVS_REQ_RDLOCK(dp->port_rwlock);
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
                            struct dp_netdev_port **portp)
    OVS_REQ_RDLOCK(dp->port_rwlock);
static void dp_netdev_free(struct dp_netdev *)
    OVS_REQUIRES(dp_netdev_mutex);
static int do_add_port(struct dp_netdev *dp, const char *devname,
                       const char *type, odp_port_t port_no)
    OVS_REQ_WRLOCK(dp->port_rwlock);
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
    OVS_REQ_WRLOCK(dp->port_rwlock);
static int dpif_netdev_open(const struct dpif_class *, const char *name,
                            bool create, struct dpif **);
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                                      struct dp_packet_batch *,
                                      bool should_steal,
                                      const struct flow *flow,
                                      const struct nlattr *actions,
                                      size_t actions_len);
static void dp_netdev_input(struct dp_netdev_pmd_thread *,
                            struct dp_packet_batch *, odp_port_t port_no);
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
                                  struct dp_packet_batch *);

static void dp_netdev_disable_upcall(struct dp_netdev *);
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
                                    struct dp_netdev *dp, unsigned core_id,
                                    int numa_id);
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
    OVS_REQ_WRLOCK(dp->port_rwlock);
502 | | |
/* Forward declarations: pmd thread entry point, pmd lookup/iteration/removal,
 * and helpers that attach or detach tx ports, rxqs and bonds on a pmd.
 * Helpers annotated OVS_REQUIRES(pmd->port_mutex) must be called with that
 * mutex held; those annotated OVS_EXCLUDED(pmd->bond_mutex) must be called
 * without it. */
static void *pmd_thread_main(void *);
static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
                                                      unsigned core_id);
static struct dp_netdev_pmd_thread *
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
static void dp_netdev_del_pmd(struct dp_netdev *dp,
                              struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                         struct dp_netdev_port *port)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                           struct tx_port *tx)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                     struct dp_netdev_rxq *rxq)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                       struct rxq_poll *poll)
    OVS_REQUIRES(pmd->port_mutex);
static int
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
                                   bool force);
static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                         struct tx_bond *bond, bool update)
    OVS_EXCLUDED(pmd->bond_mutex);
static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                           uint32_t bond_id)
    OVS_EXCLUDED(pmd->bond_mutex);
533 | | |
/* Forward declarations: datapath reconfiguration, pmd reference counting,
 * rxq cycle accounting, XPS tx queue selection, classifier lookup and
 * simple-match flow handling.  Lock requirements are given by the OVS_REQ_* /
 * OVS_REQUIRES annotations on each prototype. */
static void reconfigure_datapath(struct dp_netdev *dp)
    OVS_REQ_RDLOCK(dp->port_rwlock);
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
    OVS_REQUIRES(pmd->port_mutex);
static inline void
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
                           struct polled_queue *poll_list, int poll_cnt);
static void
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
                         enum rxq_cycles_counter_type type,
                         unsigned long long cycles);
static uint64_t
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
                         enum rxq_cycles_counter_type type);
static void
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
                                unsigned long long cycles);
static uint64_t
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
static uint64_t
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
                    int num_to_read);
static void
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
                               bool purge);
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
                                      struct tx_port *tx);
static inline struct dpcls *dp_netdev_pmd_lookup_dpcls(
    struct dp_netdev_pmd_thread *pmd, odp_port_t in_port);

static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
static inline bool
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);

static void dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
                                          struct dp_netdev_flow *flow)
    OVS_REQUIRES(pmd->flow_mutex);
static void dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
                                          struct dp_netdev_flow *flow)
    OVS_REQUIRES(pmd->flow_mutex);

static bool dp_netdev_flow_is_simple_match(const struct match *);
579 | | |
/* Updates the time in PMD threads context and should be called in three cases:
 *
 *     1. PMD structure initialization:
 *         - dp_netdev_configure_pmd()
 *
 *     2. Before processing of the new packet batch:
 *         - dpif_netdev_execute()
 *         - dp_netdev_process_rxq_port()
 *
 *     3. At least once per polling iteration in main polling threads if no
 *        packets received on current iteration:
 *         - dpif_netdev_run()
 *         - pmd_thread_main()
 *
 * 'pmd->ctx.now' should be used without update in all other cases if possible.
 */
static inline void
pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
{
    /* Cache the current time_usec() value in the thread context. */
    pmd->ctx.now = time_usec();
}
601 | | |
602 | | /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */ |
603 | | bool |
604 | | dpif_is_netdev(const struct dpif *dpif) |
605 | 0 | { |
606 | 0 | return dpif->dpif_class->open == dpif_netdev_open; |
607 | 0 | } |
608 | | |
609 | | static struct dpif_netdev * |
610 | | dpif_netdev_cast(const struct dpif *dpif) |
611 | 0 | { |
612 | 0 | ovs_assert(dpif_is_netdev(dpif)); |
613 | 0 | return CONTAINER_OF(dpif, struct dpif_netdev, dpif); |
614 | 0 | } |
615 | | |
616 | | static struct dp_netdev * |
617 | | get_dp_netdev(const struct dpif *dpif) |
618 | 0 | { |
619 | 0 | return dpif_netdev_cast(dpif)->dp; |
620 | 0 | } |
621 | | |
/* Kinds of per-pmd information request.  NOTE(review): presumably selected
 * by the pmd-related unixctl commands; the dispatching code is not in view. */
enum pmd_info_type {
    PMD_INFO_CLEAR_STATS, /* Set the cycle and the packet counters to 0. */
    PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
    PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
    PMD_INFO_SLEEP_SHOW, /* Show max sleep configuration details. */
};
628 | | |
629 | | static void |
630 | | format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd) |
631 | 0 | { |
632 | 0 | ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID) |
633 | 0 | ? "main thread" : "pmd thread"); |
634 | 0 | if (pmd->numa_id != OVS_NUMA_UNSPEC) { |
635 | 0 | ds_put_format(reply, " numa_id %d", pmd->numa_id); |
636 | 0 | } |
637 | 0 | if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) { |
638 | 0 | ds_put_format(reply, " core_id %u", pmd->core_id); |
639 | 0 | } |
640 | 0 | ds_put_cstr(reply, ":\n"); |
641 | 0 | } |
642 | | |
643 | | static void |
644 | | pmd_info_show_perf(struct ds *reply, |
645 | | struct dp_netdev_pmd_thread *pmd, |
646 | | struct pmd_perf_params *par) |
647 | 0 | { |
648 | 0 | char *time_str = xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true); |
649 | 0 | long long now = time_msec(); |
650 | 0 | double duration = (now - pmd->perf_stats.start_ms) / 1000.0; |
651 | |
|
652 | 0 | ds_put_cstr(reply, "\n"); |
653 | 0 | ds_put_format(reply, "Time: %s\n", time_str); |
654 | 0 | ds_put_format(reply, "Measurement duration: %.3f s\n", duration); |
655 | 0 | ds_put_cstr(reply, "\n"); |
656 | 0 | format_pmd_thread(reply, pmd); |
657 | 0 | ds_put_cstr(reply, "\n"); |
658 | 0 | pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration, |
659 | 0 | pmd->core_id != NON_PMD_CORE_ID); |
660 | 0 | if (pmd_perf_metrics_enabled(pmd) && pmd->core_id != NON_PMD_CORE_ID) { |
661 | | /* Prevent parallel clearing of perf metrics. */ |
662 | 0 | ovs_mutex_lock(&pmd->perf_stats.clear_mutex); |
663 | 0 | if (par->histograms) { |
664 | 0 | ds_put_cstr(reply, "\n"); |
665 | 0 | pmd_perf_format_histograms(reply, &pmd->perf_stats); |
666 | 0 | } |
667 | 0 | if (par->iter_hist_len > 0) { |
668 | 0 | ds_put_cstr(reply, "\n"); |
669 | 0 | pmd_perf_format_iteration_history(reply, &pmd->perf_stats, |
670 | 0 | par->iter_hist_len); |
671 | 0 | } |
672 | 0 | if (par->ms_hist_len > 0) { |
673 | 0 | ds_put_cstr(reply, "\n"); |
674 | 0 | pmd_perf_format_ms_history(reply, &pmd->perf_stats, |
675 | 0 | par->ms_hist_len); |
676 | 0 | } |
677 | 0 | ovs_mutex_unlock(&pmd->perf_stats.clear_mutex); |
678 | 0 | } |
679 | 0 | free(time_str); |
680 | 0 | } |
681 | | |
682 | | static int |
683 | | compare_poll_list(const void *a_, const void *b_) |
684 | 0 | { |
685 | 0 | const struct rxq_poll *a = a_; |
686 | 0 | const struct rxq_poll *b = b_; |
687 | |
|
688 | 0 | const char *namea = netdev_rxq_get_name(a->rxq->rx); |
689 | 0 | const char *nameb = netdev_rxq_get_name(b->rxq->rx); |
690 | |
|
691 | 0 | int cmp = strcmp(namea, nameb); |
692 | 0 | if (!cmp) { |
693 | 0 | return netdev_rxq_get_queue_id(a->rxq->rx) |
694 | 0 | - netdev_rxq_get_queue_id(b->rxq->rx); |
695 | 0 | } else { |
696 | 0 | return cmp; |
697 | 0 | } |
698 | 0 | } |
699 | | |
700 | | static void |
701 | | sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list, |
702 | | size_t *n) |
703 | | OVS_REQUIRES(pmd->port_mutex) |
704 | 0 | { |
705 | 0 | struct rxq_poll *ret, *poll; |
706 | 0 | size_t i; |
707 | |
|
708 | 0 | *n = hmap_count(&pmd->poll_list); |
709 | 0 | if (!*n) { |
710 | 0 | ret = NULL; |
711 | 0 | } else { |
712 | 0 | ret = xcalloc(*n, sizeof *ret); |
713 | 0 | i = 0; |
714 | 0 | HMAP_FOR_EACH (poll, node, &pmd->poll_list) { |
715 | 0 | ret[i] = *poll; |
716 | 0 | i++; |
717 | 0 | } |
718 | 0 | ovs_assert(i == *n); |
719 | 0 | qsort(ret, *n, sizeof *ret, compare_poll_list); |
720 | 0 | } |
721 | |
|
722 | 0 | *list = ret; |
723 | 0 | } |
724 | | |
725 | | static void |
726 | | pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd, |
727 | | int secs) |
728 | 0 | { |
729 | 0 | if (pmd->core_id != NON_PMD_CORE_ID) { |
730 | 0 | struct rxq_poll *list; |
731 | 0 | size_t n_rxq; |
732 | 0 | uint64_t total_pmd_cycles = 0; |
733 | 0 | uint64_t busy_pmd_cycles = 0; |
734 | 0 | uint64_t total_rxq_proc_cycles = 0; |
735 | 0 | unsigned int intervals; |
736 | |
|
737 | 0 | ds_put_format(reply, |
738 | 0 | "pmd thread numa_id %d core_id %u:\n isolated : %s\n", |
739 | 0 | pmd->numa_id, pmd->core_id, (pmd->isolated) |
740 | 0 | ? "true" : "false"); |
741 | |
|
742 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
743 | 0 | sorted_poll_list(pmd, &list, &n_rxq); |
744 | | |
745 | | /* Get the total pmd cycles for an interval. */ |
746 | 0 | atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles); |
747 | | /* Calculate how many intervals are to be used. */ |
748 | 0 | intervals = DIV_ROUND_UP(secs, |
749 | 0 | PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); |
750 | | /* Estimate the cycles to cover all intervals. */ |
751 | 0 | total_pmd_cycles *= intervals; |
752 | 0 | busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl, |
753 | 0 | &pmd->intrvl_idx, |
754 | 0 | intervals); |
755 | 0 | if (busy_pmd_cycles > total_pmd_cycles) { |
756 | 0 | busy_pmd_cycles = total_pmd_cycles; |
757 | 0 | } |
758 | |
|
759 | 0 | for (int i = 0; i < n_rxq; i++) { |
760 | 0 | struct dp_netdev_rxq *rxq = list[i].rxq; |
761 | 0 | const char *name = netdev_rxq_get_name(rxq->rx); |
762 | 0 | uint64_t rxq_proc_cycles = 0; |
763 | |
|
764 | 0 | rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl, |
765 | 0 | &rxq->intrvl_idx, |
766 | 0 | intervals); |
767 | 0 | total_rxq_proc_cycles += rxq_proc_cycles; |
768 | 0 | ds_put_format(reply, " port: %-16s queue-id: %2d", name, |
769 | 0 | netdev_rxq_get_queue_id(list[i].rxq->rx)); |
770 | 0 | ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx) |
771 | 0 | ? "(enabled) " : "(disabled)"); |
772 | 0 | ds_put_format(reply, " pmd usage: "); |
773 | 0 | if (total_pmd_cycles) { |
774 | 0 | ds_put_format(reply, "%2.0f %%", |
775 | 0 | (double) (rxq_proc_cycles * 100) / |
776 | 0 | total_pmd_cycles); |
777 | 0 | } else { |
778 | 0 | ds_put_format(reply, "%s", "NOT AVAIL"); |
779 | 0 | } |
780 | 0 | ds_put_cstr(reply, "\n"); |
781 | 0 | } |
782 | |
|
783 | 0 | if (n_rxq > 0) { |
784 | 0 | ds_put_cstr(reply, " overhead: "); |
785 | 0 | if (total_pmd_cycles) { |
786 | 0 | uint64_t overhead_cycles = 0; |
787 | |
|
788 | 0 | if (total_rxq_proc_cycles < busy_pmd_cycles) { |
789 | 0 | overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles; |
790 | 0 | } |
791 | |
|
792 | 0 | ds_put_format(reply, "%2.0f %%", |
793 | 0 | (double) (overhead_cycles * 100) / |
794 | 0 | total_pmd_cycles); |
795 | 0 | } else { |
796 | 0 | ds_put_cstr(reply, "NOT AVAIL"); |
797 | 0 | } |
798 | 0 | ds_put_cstr(reply, "\n"); |
799 | 0 | } |
800 | |
|
801 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
802 | 0 | free(list); |
803 | 0 | } |
804 | 0 | } |
805 | | |
806 | | static int |
807 | | compare_poll_thread_list(const void *a_, const void *b_) |
808 | 0 | { |
809 | 0 | const struct dp_netdev_pmd_thread *a, *b; |
810 | |
|
811 | 0 | a = *(struct dp_netdev_pmd_thread **)a_; |
812 | 0 | b = *(struct dp_netdev_pmd_thread **)b_; |
813 | |
|
814 | 0 | if (a->core_id < b->core_id) { |
815 | 0 | return -1; |
816 | 0 | } |
817 | 0 | if (a->core_id > b->core_id) { |
818 | 0 | return 1; |
819 | 0 | } |
820 | 0 | return 0; |
821 | 0 | } |
822 | | |
823 | | /* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use |
824 | | * this list, as long as we do not go to quiescent state. */ |
825 | | static void |
826 | | sorted_poll_thread_list(struct dp_netdev *dp, |
827 | | struct dp_netdev_pmd_thread ***list, |
828 | | size_t *n) |
829 | 0 | { |
830 | 0 | struct dp_netdev_pmd_thread *pmd; |
831 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
832 | 0 | size_t k = 0, n_pmds; |
833 | |
|
834 | 0 | n_pmds = cmap_count(&dp->poll_threads); |
835 | 0 | pmd_list = xcalloc(n_pmds, sizeof *pmd_list); |
836 | |
|
837 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
838 | 0 | if (k >= n_pmds) { |
839 | 0 | break; |
840 | 0 | } |
841 | 0 | pmd_list[k++] = pmd; |
842 | 0 | } |
843 | |
|
844 | 0 | qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list); |
845 | |
|
846 | 0 | *list = pmd_list; |
847 | 0 | *n = k; |
848 | 0 | } |
849 | | |
850 | | static void |
851 | | dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc, |
852 | | const char *argv[], void *aux OVS_UNUSED) |
853 | 0 | { |
854 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
855 | 0 | struct dp_netdev *dp = NULL; |
856 | |
|
857 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
858 | |
|
859 | 0 | if (argc == 2) { |
860 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
861 | 0 | } else if (shash_count(&dp_netdevs) == 1) { |
862 | | /* There's only one datapath */ |
863 | 0 | dp = shash_first(&dp_netdevs)->data; |
864 | 0 | } |
865 | |
|
866 | 0 | if (!dp) { |
867 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
868 | 0 | unixctl_command_reply_error(conn, |
869 | 0 | "please specify an existing datapath"); |
870 | 0 | return; |
871 | 0 | } |
872 | | |
873 | 0 | dp_netdev_request_reconfigure(dp); |
874 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
875 | 0 | ds_put_cstr(&reply, "pmd rxq rebalance requested.\n"); |
876 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
877 | 0 | ds_destroy(&reply); |
878 | 0 | } |
879 | | |
880 | | static void |
881 | | pmd_info_show_sleep(struct ds *reply, unsigned core_id, int numa_id, |
882 | | uint64_t pmd_max_sleep) |
883 | 0 | { |
884 | 0 | if (core_id == NON_PMD_CORE_ID) { |
885 | 0 | return; |
886 | 0 | } |
887 | 0 | ds_put_format(reply, |
888 | 0 | "pmd thread numa_id %d core_id %d:\n" |
889 | 0 | " max sleep: %4"PRIu64" us\n", |
890 | 0 | numa_id, core_id, pmd_max_sleep); |
891 | 0 | } |
892 | | |
893 | | static void |
894 | | dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], |
895 | | void *aux) |
896 | 0 | { |
897 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
898 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
899 | 0 | struct dp_netdev *dp = NULL; |
900 | 0 | enum pmd_info_type type = *(enum pmd_info_type *) aux; |
901 | 0 | unsigned int core_id; |
902 | 0 | bool filter_on_pmd = false; |
903 | 0 | size_t n; |
904 | 0 | unsigned int secs = 0; |
905 | 0 | unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX) |
906 | 0 | / INTERVAL_USEC_TO_SEC; |
907 | 0 | bool show_header = true; |
908 | 0 | uint64_t max_sleep; |
909 | |
|
910 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
911 | |
|
912 | 0 | while (argc > 1) { |
913 | 0 | if (!strcmp(argv[1], "-pmd") && argc > 2) { |
914 | 0 | if (str_to_uint(argv[2], 10, &core_id)) { |
915 | 0 | filter_on_pmd = true; |
916 | 0 | } |
917 | 0 | argc -= 2; |
918 | 0 | argv += 2; |
919 | 0 | } else if (type == PMD_INFO_SHOW_RXQ && |
920 | 0 | !strcmp(argv[1], "-secs") && |
921 | 0 | argc > 2) { |
922 | 0 | if (!str_to_uint(argv[2], 10, &secs)) { |
923 | 0 | secs = max_secs; |
924 | 0 | } |
925 | 0 | argc -= 2; |
926 | 0 | argv += 2; |
927 | 0 | } else { |
928 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
929 | 0 | argc -= 1; |
930 | 0 | argv += 1; |
931 | 0 | } |
932 | 0 | } |
933 | |
|
934 | 0 | if (!dp) { |
935 | 0 | if (shash_count(&dp_netdevs) == 1) { |
936 | | /* There's only one datapath */ |
937 | 0 | dp = shash_first(&dp_netdevs)->data; |
938 | 0 | } else { |
939 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
940 | 0 | unixctl_command_reply_error(conn, |
941 | 0 | "please specify an existing datapath"); |
942 | 0 | return; |
943 | 0 | } |
944 | 0 | } |
945 | | |
946 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
947 | 0 | for (size_t i = 0; i < n; i++) { |
948 | 0 | struct dp_netdev_pmd_thread *pmd = pmd_list[i]; |
949 | 0 | if (!pmd) { |
950 | 0 | break; |
951 | 0 | } |
952 | 0 | if (filter_on_pmd && pmd->core_id != core_id) { |
953 | 0 | continue; |
954 | 0 | } |
955 | 0 | if (type == PMD_INFO_SHOW_RXQ) { |
956 | 0 | if (show_header) { |
957 | 0 | if (!secs || secs > max_secs) { |
958 | 0 | secs = max_secs; |
959 | 0 | } else { |
960 | 0 | secs = ROUND_UP(secs, |
961 | 0 | PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); |
962 | 0 | } |
963 | 0 | ds_put_format(&reply, "Displaying last %u seconds " |
964 | 0 | "pmd usage %%\n", secs); |
965 | 0 | show_header = false; |
966 | 0 | } |
967 | 0 | pmd_info_show_rxq(&reply, pmd, secs); |
968 | 0 | } else if (type == PMD_INFO_CLEAR_STATS) { |
969 | 0 | pmd_perf_stats_clear(&pmd->perf_stats); |
970 | 0 | } else if (type == PMD_INFO_PERF_SHOW) { |
971 | 0 | pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux); |
972 | 0 | } else if (type == PMD_INFO_SLEEP_SHOW) { |
973 | 0 | if (show_header) { |
974 | 0 | ds_put_format(&reply, "Default max sleep: %4"PRIu64" us\n", |
975 | 0 | dp->pmd_max_sleep_default); |
976 | 0 | show_header = false; |
977 | 0 | } |
978 | 0 | atomic_read_relaxed(&pmd->max_sleep, &max_sleep); |
979 | 0 | pmd_info_show_sleep(&reply, pmd->core_id, pmd->numa_id, |
980 | 0 | max_sleep); |
981 | 0 | } |
982 | 0 | } |
983 | 0 | free(pmd_list); |
984 | |
|
985 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
986 | |
|
987 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
988 | 0 | ds_destroy(&reply); |
989 | 0 | } |
990 | | |
991 | | static void |
992 | | pmd_perf_show_cmd(struct unixctl_conn *conn, int argc, |
993 | | const char *argv[], |
994 | | void *aux OVS_UNUSED) |
995 | 0 | { |
996 | 0 | struct pmd_perf_params par; |
997 | 0 | long int it_hist = 0, ms_hist = 0; |
998 | 0 | par.histograms = true; |
999 | |
|
1000 | 0 | while (argc > 1) { |
1001 | 0 | if (!strcmp(argv[1], "-nh")) { |
1002 | 0 | par.histograms = false; |
1003 | 0 | argc -= 1; |
1004 | 0 | argv += 1; |
1005 | 0 | } else if (!strcmp(argv[1], "-it") && argc > 2) { |
1006 | 0 | it_hist = strtol(argv[2], NULL, 10); |
1007 | 0 | if (it_hist < 0) { |
1008 | 0 | it_hist = 0; |
1009 | 0 | } else if (it_hist > HISTORY_LEN) { |
1010 | 0 | it_hist = HISTORY_LEN; |
1011 | 0 | } |
1012 | 0 | argc -= 2; |
1013 | 0 | argv += 2; |
1014 | 0 | } else if (!strcmp(argv[1], "-ms") && argc > 2) { |
1015 | 0 | ms_hist = strtol(argv[2], NULL, 10); |
1016 | 0 | if (ms_hist < 0) { |
1017 | 0 | ms_hist = 0; |
1018 | 0 | } else if (ms_hist > HISTORY_LEN) { |
1019 | 0 | ms_hist = HISTORY_LEN; |
1020 | 0 | } |
1021 | 0 | argc -= 2; |
1022 | 0 | argv += 2; |
1023 | 0 | } else { |
1024 | 0 | break; |
1025 | 0 | } |
1026 | 0 | } |
1027 | 0 | par.iter_hist_len = it_hist; |
1028 | 0 | par.ms_hist_len = ms_hist; |
1029 | 0 | par.command_type = PMD_INFO_PERF_SHOW; |
1030 | 0 | dpif_netdev_pmd_info(conn, argc, argv, &par); |
1031 | 0 | } |
1032 | | |
1033 | | static void |
1034 | | dpif_netdev_bond_show(struct unixctl_conn *conn, int argc, |
1035 | | const char *argv[], void *aux OVS_UNUSED) |
1036 | 0 | { |
1037 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1038 | 0 | struct dp_netdev *dp = NULL; |
1039 | |
|
1040 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1041 | 0 | if (argc == 2) { |
1042 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
1043 | 0 | } else if (shash_count(&dp_netdevs) == 1) { |
1044 | | /* There's only one datapath. */ |
1045 | 0 | dp = shash_first(&dp_netdevs)->data; |
1046 | 0 | } |
1047 | 0 | if (!dp) { |
1048 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1049 | 0 | unixctl_command_reply_error(conn, |
1050 | 0 | "please specify an existing datapath"); |
1051 | 0 | return; |
1052 | 0 | } |
1053 | | |
1054 | 0 | if (cmap_count(&dp->tx_bonds) > 0) { |
1055 | 0 | struct tx_bond *dp_bond_entry; |
1056 | |
|
1057 | 0 | ds_put_cstr(&reply, "Bonds:\n"); |
1058 | 0 | CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) { |
1059 | 0 | ds_put_format(&reply, " bond-id %"PRIu32":\n", |
1060 | 0 | dp_bond_entry->bond_id); |
1061 | 0 | for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) { |
1062 | 0 | uint32_t member_id = odp_to_u32( |
1063 | 0 | dp_bond_entry->member_buckets[bucket].member_id); |
1064 | 0 | ds_put_format(&reply, |
1065 | 0 | " bucket %d - member %"PRIu32"\n", |
1066 | 0 | bucket, member_id); |
1067 | 0 | } |
1068 | 0 | } |
1069 | 0 | } |
1070 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1071 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1072 | 0 | ds_destroy(&reply); |
1073 | 0 | } |
1074 | | |
1075 | | |
1076 | | static int |
1077 | | dpif_netdev_init(void) |
1078 | 0 | { |
1079 | 0 | static enum pmd_info_type clear_aux = PMD_INFO_CLEAR_STATS, |
1080 | 0 | poll_aux = PMD_INFO_SHOW_RXQ, |
1081 | 0 | sleep_aux = PMD_INFO_SLEEP_SHOW; |
1082 | |
|
1083 | 0 | unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]", |
1084 | 0 | 0, 3, dpif_netdev_pmd_info, |
1085 | 0 | (void *)&clear_aux); |
1086 | 0 | unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] " |
1087 | 0 | "[-secs secs] [dp]", |
1088 | 0 | 0, 5, dpif_netdev_pmd_info, |
1089 | 0 | (void *)&poll_aux); |
1090 | 0 | unixctl_command_register("dpif-netdev/pmd-sleep-show", "[dp]", |
1091 | 0 | 0, 1, dpif_netdev_pmd_info, |
1092 | 0 | (void *)&sleep_aux); |
1093 | 0 | unixctl_command_register("dpif-netdev/pmd-perf-show", |
1094 | 0 | "[-nh] [-it iter-history-len]" |
1095 | 0 | " [-ms ms-history-len]" |
1096 | 0 | " [-pmd core] [dp]", |
1097 | 0 | 0, 8, pmd_perf_show_cmd, |
1098 | 0 | NULL); |
1099 | | /* 'pmd-stats-show' is just an undocumented alias for 'pmd-perf-show', |
1100 | | * for compatibility with old muscle memory. */ |
1101 | 0 | unixctl_command_register("dpif-netdev/pmd-stats-show", NULL, |
1102 | 0 | 0, 8, pmd_perf_show_cmd, NULL); |
1103 | 0 | unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]", |
1104 | 0 | 0, 1, dpif_netdev_pmd_rebalance, |
1105 | 0 | NULL); |
1106 | 0 | unixctl_command_register("dpif-netdev/pmd-perf-log-set", |
1107 | 0 | "on|off [-b before] [-a after] [-e|-ne] " |
1108 | 0 | "[-us usec] [-q qlen]", |
1109 | 0 | 0, 10, pmd_perf_log_set_cmd, |
1110 | 0 | NULL); |
1111 | 0 | unixctl_command_register("dpif-netdev/bond-show", "[dp]", |
1112 | 0 | 0, 1, dpif_netdev_bond_show, |
1113 | 0 | NULL); |
1114 | 0 | return 0; |
1115 | 0 | } |
1116 | | |
1117 | | static int |
1118 | | dpif_netdev_enumerate(struct sset *all_dps, |
1119 | | const struct dpif_class *dpif_class) |
1120 | 0 | { |
1121 | 0 | struct shash_node *node; |
1122 | |
|
1123 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1124 | 0 | SHASH_FOR_EACH(node, &dp_netdevs) { |
1125 | 0 | struct dp_netdev *dp = node->data; |
1126 | 0 | if (dpif_class != dp->class) { |
1127 | | /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs. |
1128 | | * If the class doesn't match, skip this dpif. */ |
1129 | 0 | continue; |
1130 | 0 | } |
1131 | 0 | sset_add(all_dps, node->name); |
1132 | 0 | } |
1133 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1134 | |
|
1135 | 0 | return 0; |
1136 | 0 | } |
1137 | | |
1138 | | static bool |
1139 | | dpif_netdev_class_is_dummy(const struct dpif_class *class) |
1140 | 0 | { |
1141 | 0 | return class != &dpif_netdev_class; |
1142 | 0 | } |
1143 | | |
/* Maps the port type "internal" to the netdev type that is actually opened:
 * "dummy-internal" for dummy classes and "tap" otherwise.  Any other 'type'
 * is returned unchanged. */
static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
    if (strcmp(type, "internal")) {
        return type;
    }
    if (dpif_netdev_class_is_dummy(class)) {
        return "dummy-internal";
    }
    return "tap";
}
1151 | | |
1152 | | static struct dpif * |
1153 | | create_dpif_netdev(struct dp_netdev *dp) |
1154 | 0 | { |
1155 | 0 | uint16_t netflow_id = hash_string(dp->name, 0); |
1156 | 0 | struct dpif_netdev *dpif; |
1157 | |
|
1158 | 0 | ovs_refcount_ref(&dp->ref_cnt); |
1159 | |
|
1160 | 0 | dpif = xmalloc(sizeof *dpif); |
1161 | 0 | dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id); |
1162 | 0 | dpif->dp = dp; |
1163 | 0 | dpif->last_port_seq = seq_read(dp->port_seq); |
1164 | |
|
1165 | 0 | return &dpif->dpif; |
1166 | 0 | } |
1167 | | |
1168 | | /* Choose an unused, non-zero port number and return it on success. |
1169 | | * Return ODPP_NONE on failure. */ |
1170 | | static odp_port_t |
1171 | | choose_port(struct dp_netdev *dp, const char *name) |
1172 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
1173 | 0 | { |
1174 | 0 | uint32_t port_no; |
1175 | |
|
1176 | 0 | if (dp->class != &dpif_netdev_class) { |
1177 | 0 | const char *p; |
1178 | 0 | int start_no = 0; |
1179 | | |
1180 | | /* If the port name begins with "br", start the number search at |
1181 | | * 100 to make writing tests easier. */ |
1182 | 0 | if (!strncmp(name, "br", 2)) { |
1183 | 0 | start_no = 100; |
1184 | 0 | } |
1185 | | |
1186 | | /* If the port name contains a number, try to assign that port number. |
1187 | | * This can make writing unit tests easier because port numbers are |
1188 | | * predictable. */ |
1189 | 0 | for (p = name; *p != '\0'; p++) { |
1190 | 0 | if (isdigit((unsigned char) *p)) { |
1191 | 0 | port_no = start_no + strtol(p, NULL, 10); |
1192 | 0 | if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE) |
1193 | 0 | && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) { |
1194 | 0 | return u32_to_odp(port_no); |
1195 | 0 | } |
1196 | 0 | break; |
1197 | 0 | } |
1198 | 0 | } |
1199 | 0 | } |
1200 | | |
1201 | 0 | for (port_no = 1; port_no <= UINT16_MAX; port_no++) { |
1202 | 0 | if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) { |
1203 | 0 | return u32_to_odp(port_no); |
1204 | 0 | } |
1205 | 0 | } |
1206 | | |
1207 | 0 | return ODPP_NONE; |
1208 | 0 | } |
1209 | | |
/* Returns the hash used to store the meter with id 'meter_id' in a meter
 * cmap.  The hash is the id itself. */
static uint32_t
dp_meter_hash(uint32_t meter_id)
{
    /* In the ofproto-dpif layer, we use the id-pool to alloc meter id
     * orderly (e.g. 1, 2, ... N.), which provides a better hash
     * distribution.  Use them directly instead of hash_xxx function for
     * achieving high-performance. */
    uint32_t hash = meter_id;

    return hash;
}
1219 | | |
1220 | | static void |
1221 | | dp_netdev_meter_destroy(struct dp_netdev *dp) |
1222 | 0 | { |
1223 | 0 | struct dp_meter *m; |
1224 | |
|
1225 | 0 | ovs_mutex_lock(&dp->meters_lock); |
1226 | 0 | CMAP_FOR_EACH (m, node, &dp->meters) { |
1227 | 0 | cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id)); |
1228 | 0 | ovsrcu_postpone(free, m); |
1229 | 0 | } |
1230 | |
|
1231 | 0 | cmap_destroy(&dp->meters); |
1232 | 0 | ovs_mutex_unlock(&dp->meters_lock); |
1233 | 0 | ovs_mutex_destroy(&dp->meters_lock); |
1234 | 0 | } |
1235 | | |
1236 | | static struct dp_meter * |
1237 | | dp_meter_lookup(struct cmap *meters, uint32_t meter_id) |
1238 | 0 | { |
1239 | 0 | uint32_t hash = dp_meter_hash(meter_id); |
1240 | 0 | struct dp_meter *m; |
1241 | |
|
1242 | 0 | CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) { |
1243 | 0 | if (m->id == meter_id) { |
1244 | 0 | return m; |
1245 | 0 | } |
1246 | 0 | } |
1247 | | |
1248 | 0 | return NULL; |
1249 | 0 | } |
1250 | | |
1251 | | static void |
1252 | | dp_meter_detach_free(struct cmap *meters, uint32_t meter_id) |
1253 | 0 | { |
1254 | 0 | struct dp_meter *m = dp_meter_lookup(meters, meter_id); |
1255 | |
|
1256 | 0 | if (m) { |
1257 | 0 | cmap_remove(meters, &m->node, dp_meter_hash(meter_id)); |
1258 | 0 | ovsrcu_postpone(free, m); |
1259 | 0 | } |
1260 | 0 | } |
1261 | | |
1262 | | static void |
1263 | | dp_meter_attach(struct cmap *meters, struct dp_meter *meter) |
1264 | 0 | { |
1265 | 0 | cmap_insert(meters, &meter->node, dp_meter_hash(meter->id)); |
1266 | 0 | } |
1267 | | |
1268 | | static int |
1269 | | create_dp_netdev(const char *name, const struct dpif_class *class, |
1270 | | struct dp_netdev **dpp) |
1271 | | OVS_REQUIRES(dp_netdev_mutex) |
1272 | 0 | { |
1273 | 0 | static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER; |
1274 | 0 | struct dp_netdev *dp; |
1275 | 0 | int error; |
1276 | | |
1277 | | /* Avoid estimating TSC frequency for dummy datapath to not slow down |
1278 | | * unit tests. */ |
1279 | 0 | if (!dpif_netdev_class_is_dummy(class) |
1280 | 0 | && ovsthread_once_start(&tsc_freq_check)) { |
1281 | 0 | pmd_perf_estimate_tsc_frequency(); |
1282 | 0 | ovsthread_once_done(&tsc_freq_check); |
1283 | 0 | } |
1284 | |
|
1285 | 0 | dp = xzalloc(sizeof *dp); |
1286 | 0 | shash_add(&dp_netdevs, name, dp); |
1287 | |
|
1288 | 0 | *CONST_CAST(const struct dpif_class **, &dp->class) = class; |
1289 | 0 | *CONST_CAST(const char **, &dp->name) = xstrdup(name); |
1290 | 0 | *CONST_CAST(const char **, &dp->full_name) = xasprintf("%s@%s", |
1291 | 0 | class->type, name); |
1292 | 0 | ovs_refcount_init(&dp->ref_cnt); |
1293 | 0 | atomic_flag_clear(&dp->destroyed); |
1294 | |
|
1295 | 0 | ovs_rwlock_init(&dp->port_rwlock); |
1296 | 0 | hmap_init(&dp->ports); |
1297 | 0 | dp->port_seq = seq_create(); |
1298 | 0 | ovs_mutex_init(&dp->bond_mutex); |
1299 | 0 | cmap_init(&dp->tx_bonds); |
1300 | |
|
1301 | 0 | fat_rwlock_init(&dp->upcall_rwlock); |
1302 | |
|
1303 | 0 | dp->reconfigure_seq = seq_create(); |
1304 | 0 | dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq); |
1305 | 0 | dp->once_set_config = (struct ovsthread_once) OVSTHREAD_ONCE_INITIALIZER; |
1306 | | |
1307 | | /* Init meter resources. */ |
1308 | 0 | cmap_init(&dp->meters); |
1309 | 0 | ovs_mutex_init(&dp->meters_lock); |
1310 | | |
1311 | | /* Disable upcalls by default. */ |
1312 | 0 | dp_netdev_disable_upcall(dp); |
1313 | 0 | dp->upcall_aux = NULL; |
1314 | 0 | dp->upcall_cb = NULL; |
1315 | |
|
1316 | 0 | dp->conntrack = conntrack_init(); |
1317 | |
|
1318 | 0 | atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN); |
1319 | 0 | atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL); |
1320 | |
|
1321 | 0 | cmap_init(&dp->poll_threads); |
1322 | 0 | dp->pmd_rxq_assign_type = SCHED_CYCLES; |
1323 | |
|
1324 | 0 | ovs_mutex_init(&dp->tx_qid_pool_mutex); |
1325 | | /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */ |
1326 | 0 | dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1); |
1327 | |
|
1328 | 0 | ovs_mutex_init_recursive(&dp->non_pmd_mutex); |
1329 | 0 | ovsthread_key_create(&dp->per_pmd_key, NULL); |
1330 | |
|
1331 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
1332 | | /* non-PMD will be created before all other threads and will |
1333 | | * allocate static_tx_qid = 0. */ |
1334 | 0 | dp_netdev_set_nonpmd(dp); |
1335 | |
|
1336 | 0 | error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class, |
1337 | 0 | "internal"), |
1338 | 0 | ODPP_LOCAL); |
1339 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1340 | 0 | if (error) { |
1341 | 0 | dp_netdev_free(dp); |
1342 | 0 | return error; |
1343 | 0 | } |
1344 | | |
1345 | 0 | dp->max_sleep_list = NULL; |
1346 | |
|
1347 | 0 | dp->last_tnl_conf_seq = seq_read(tnl_conf_seq); |
1348 | 0 | *dpp = dp; |
1349 | 0 | return 0; |
1350 | 0 | } |
1351 | | |
1352 | | static void |
1353 | | dp_netdev_request_reconfigure(struct dp_netdev *dp) |
1354 | 0 | { |
1355 | 0 | seq_change(dp->reconfigure_seq); |
1356 | 0 | } |
1357 | | |
1358 | | static bool |
1359 | | dp_netdev_is_reconf_required(struct dp_netdev *dp) |
1360 | 0 | { |
1361 | 0 | return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq; |
1362 | 0 | } |
1363 | | |
1364 | | static int |
1365 | | dpif_netdev_open(const struct dpif_class *class, const char *name, |
1366 | | bool create, struct dpif **dpifp) |
1367 | 0 | { |
1368 | 0 | struct dp_netdev *dp; |
1369 | 0 | int error; |
1370 | |
|
1371 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1372 | 0 | dp = shash_find_data(&dp_netdevs, name); |
1373 | 0 | if (!dp) { |
1374 | 0 | error = create ? create_dp_netdev(name, class, &dp) : ENODEV; |
1375 | 0 | } else { |
1376 | 0 | error = (dp->class != class ? EINVAL |
1377 | 0 | : create ? EEXIST |
1378 | 0 | : 0); |
1379 | 0 | } |
1380 | 0 | if (!error) { |
1381 | 0 | *dpifp = create_dpif_netdev(dp); |
1382 | 0 | } |
1383 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1384 | |
|
1385 | 0 | return error; |
1386 | 0 | } |
1387 | | |
1388 | | static void |
1389 | | dp_netdev_destroy_upcall_lock(struct dp_netdev *dp) |
1390 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
1391 | 0 | { |
1392 | | /* Check that upcalls are disabled, i.e. that the rwlock is taken */ |
1393 | 0 | ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock)); |
1394 | | |
1395 | | /* Before freeing a lock we should release it */ |
1396 | 0 | fat_rwlock_unlock(&dp->upcall_rwlock); |
1397 | 0 | fat_rwlock_destroy(&dp->upcall_rwlock); |
1398 | 0 | } |
1399 | | |
/* Returns the hash of 'bond_id', computed by hash_int() with a basis of 0. */
static uint32_t
hash_bond_id(uint32_t bond_id)
{
    uint32_t hash = hash_int(bond_id, 0);

    return hash;
}
1405 | | |
1406 | | /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp' |
1407 | | * through the 'dp_netdevs' shash while freeing 'dp'. */ |
1408 | | static void |
1409 | | dp_netdev_free(struct dp_netdev *dp) |
1410 | | OVS_REQUIRES(dp_netdev_mutex) |
1411 | 0 | { |
1412 | 0 | struct dp_netdev_port *port; |
1413 | 0 | struct tx_bond *bond; |
1414 | |
|
1415 | 0 | shash_find_and_delete(&dp_netdevs, dp->name); |
1416 | |
|
1417 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
1418 | 0 | HMAP_FOR_EACH_SAFE (port, node, &dp->ports) { |
1419 | 0 | do_del_port(dp, port); |
1420 | 0 | } |
1421 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1422 | |
|
1423 | 0 | ovs_mutex_lock(&dp->bond_mutex); |
1424 | 0 | CMAP_FOR_EACH (bond, node, &dp->tx_bonds) { |
1425 | 0 | cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id)); |
1426 | 0 | ovsrcu_postpone(free, bond); |
1427 | 0 | } |
1428 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
1429 | |
|
1430 | 0 | dp_netdev_destroy_all_pmds(dp, true); |
1431 | 0 | cmap_destroy(&dp->poll_threads); |
1432 | |
|
1433 | 0 | ovs_mutex_destroy(&dp->tx_qid_pool_mutex); |
1434 | 0 | id_pool_destroy(dp->tx_qid_pool); |
1435 | |
|
1436 | 0 | ovs_mutex_destroy(&dp->non_pmd_mutex); |
1437 | 0 | ovsthread_key_delete(dp->per_pmd_key); |
1438 | |
|
1439 | 0 | conntrack_destroy(dp->conntrack); |
1440 | | |
1441 | |
|
1442 | 0 | seq_destroy(dp->reconfigure_seq); |
1443 | 0 | ovsthread_once_destroy(&dp->once_set_config); |
1444 | |
|
1445 | 0 | seq_destroy(dp->port_seq); |
1446 | 0 | hmap_destroy(&dp->ports); |
1447 | 0 | ovs_rwlock_destroy(&dp->port_rwlock); |
1448 | |
|
1449 | 0 | cmap_destroy(&dp->tx_bonds); |
1450 | 0 | ovs_mutex_destroy(&dp->bond_mutex); |
1451 | | |
1452 | | /* Upcalls must be disabled at this point */ |
1453 | 0 | dp_netdev_destroy_upcall_lock(dp); |
1454 | |
|
1455 | 0 | dp_netdev_meter_destroy(dp); |
1456 | |
|
1457 | 0 | free(dp->max_sleep_list); |
1458 | 0 | free(dp->pmd_cmask); |
1459 | 0 | free(CONST_CAST(char *, dp->name)); |
1460 | 0 | free(CONST_CAST(char *, dp->full_name)); |
1461 | 0 | free(dp); |
1462 | 0 | } |
1463 | | |
1464 | | static void |
1465 | | dp_netdev_unref(struct dp_netdev *dp) |
1466 | 0 | { |
1467 | 0 | if (dp) { |
1468 | | /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't |
1469 | | * get a new reference to 'dp' through the 'dp_netdevs' shash. */ |
1470 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1471 | 0 | if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) { |
1472 | 0 | dp_netdev_free(dp); |
1473 | 0 | } |
1474 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1475 | 0 | } |
1476 | 0 | } |
1477 | | |
/* Closes the handle 'dpif': drops its reference to the underlying datapath
 * and frees the handle. */
static void
dpif_netdev_close(struct dpif *dpif)
{
    dp_netdev_unref(get_dp_netdev(dpif));
    free(dpif);
}
1486 | | |
1487 | | static int |
1488 | | dpif_netdev_destroy(struct dpif *dpif) |
1489 | 0 | { |
1490 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
1491 | |
|
1492 | 0 | if (!atomic_flag_test_and_set(&dp->destroyed)) { |
1493 | 0 | if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) { |
1494 | | /* Can't happen: 'dpif' still owns a reference to 'dp'. */ |
1495 | 0 | OVS_NOT_REACHED(); |
1496 | 0 | } |
1497 | 0 | } |
1498 | | |
1499 | 0 | return 0; |
1500 | 0 | } |
1501 | | |
/* Adds 'n' to the atomic variable 'var' as a separate relaxed load followed
 * by a relaxed store.  The read-modify-write as a whole is therefore not
 * atomic, but the load and the store each are, making it impossible to read
 * inconsistent values.
 *
 * This is used to update thread local stats counters. */
static void
non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
{
    unsigned long long current;

    atomic_read_relaxed(var, &current);
    atomic_store_relaxed(var, current + n);
}
1516 | | |
1517 | | static int |
1518 | | dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats) |
1519 | 0 | { |
1520 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
1521 | 0 | struct dp_netdev_pmd_thread *pmd; |
1522 | 0 | uint64_t pmd_stats[PMD_N_STATS]; |
1523 | |
|
1524 | 0 | stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0; |
1525 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
1526 | 0 | stats->n_flows += cmap_count(&pmd->flow_table); |
1527 | 0 | pmd_perf_read_counters(&pmd->perf_stats, pmd_stats); |
1528 | 0 | stats->n_hit += pmd_stats[PMD_STAT_PHWOL_HIT]; |
1529 | 0 | stats->n_hit += pmd_stats[PMD_STAT_SIMPLE_HIT]; |
1530 | 0 | stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT]; |
1531 | 0 | stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT]; |
1532 | 0 | stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT]; |
1533 | 0 | stats->n_missed += pmd_stats[PMD_STAT_MISS]; |
1534 | 0 | stats->n_lost += pmd_stats[PMD_STAT_LOST]; |
1535 | 0 | } |
1536 | 0 | stats->n_masks = UINT32_MAX; |
1537 | 0 | stats->n_mask_hit = UINT64_MAX; |
1538 | 0 | stats->n_cache_hit = UINT64_MAX; |
1539 | |
|
1540 | 0 | return 0; |
1541 | 0 | } |
1542 | | |
1543 | | static void |
1544 | | dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd) |
1545 | 0 | { |
1546 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
1547 | 0 | ovs_mutex_lock(&pmd->dp->non_pmd_mutex); |
1548 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
1549 | 0 | pmd_load_cached_ports(pmd); |
1550 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
1551 | 0 | ovs_mutex_unlock(&pmd->dp->non_pmd_mutex); |
1552 | 0 | return; |
1553 | 0 | } |
1554 | | |
1555 | 0 | seq_change(pmd->reload_seq); |
1556 | 0 | atomic_store_explicit(&pmd->reload, true, memory_order_release); |
1557 | 0 | } |
1558 | | |
1559 | | static uint32_t |
1560 | | hash_port_no(odp_port_t port_no) |
1561 | 0 | { |
1562 | 0 | return hash_int(odp_to_u32(port_no), 0); |
1563 | 0 | } |
1564 | | |
1565 | | static int |
1566 | | port_create(const char *devname, const char *type, |
1567 | | odp_port_t port_no, struct dp_netdev_port **portp) |
1568 | 0 | { |
1569 | 0 | struct dp_netdev_port *port; |
1570 | 0 | enum netdev_flags flags; |
1571 | 0 | struct netdev *netdev; |
1572 | 0 | int error; |
1573 | |
|
1574 | 0 | *portp = NULL; |
1575 | | |
1576 | | /* Open and validate network device. */ |
1577 | 0 | error = netdev_open(devname, type, &netdev); |
1578 | 0 | if (error) { |
1579 | 0 | return error; |
1580 | 0 | } |
1581 | | /* XXX reject non-Ethernet devices */ |
1582 | | |
1583 | 0 | netdev_get_flags(netdev, &flags); |
1584 | 0 | if (flags & NETDEV_LOOPBACK) { |
1585 | 0 | VLOG_ERR("%s: cannot add a loopback device", devname); |
1586 | 0 | error = EINVAL; |
1587 | 0 | goto out; |
1588 | 0 | } |
1589 | | |
1590 | 0 | port = xzalloc(sizeof *port); |
1591 | 0 | port->port_no = port_no; |
1592 | 0 | port->netdev = netdev; |
1593 | 0 | port->type = xstrdup(type); |
1594 | 0 | port->sf = NULL; |
1595 | 0 | port->emc_enabled = true; |
1596 | 0 | port->need_reconfigure = true; |
1597 | 0 | ovs_mutex_init(&port->txq_used_mutex); |
1598 | |
|
1599 | 0 | *portp = port; |
1600 | |
|
1601 | 0 | return 0; |
1602 | | |
1603 | 0 | out: |
1604 | 0 | netdev_close(netdev); |
1605 | 0 | return error; |
1606 | 0 | } |
1607 | | |
1608 | | static int |
1609 | | do_add_port(struct dp_netdev *dp, const char *devname, const char *type, |
1610 | | odp_port_t port_no) |
1611 | | OVS_REQ_WRLOCK(dp->port_rwlock) |
1612 | 0 | { |
1613 | 0 | struct netdev_saved_flags *sf; |
1614 | 0 | struct dp_netdev_port *port; |
1615 | 0 | int error; |
1616 | | |
1617 | | /* Reject devices already in 'dp'. */ |
1618 | 0 | if (!get_port_by_name(dp, devname, &port)) { |
1619 | 0 | return EEXIST; |
1620 | 0 | } |
1621 | | |
1622 | 0 | error = port_create(devname, type, port_no, &port); |
1623 | 0 | if (error) { |
1624 | 0 | return error; |
1625 | 0 | } |
1626 | | |
1627 | 0 | hmap_insert(&dp->ports, &port->node, hash_port_no(port_no)); |
1628 | 0 | seq_change(dp->port_seq); |
1629 | |
|
1630 | 0 | reconfigure_datapath(dp); |
1631 | | |
1632 | | /* Check that port was successfully configured. */ |
1633 | 0 | if (!dp_netdev_lookup_port(dp, port_no)) { |
1634 | 0 | return EINVAL; |
1635 | 0 | } |
1636 | | |
1637 | | /* Updating device flags triggers an if_notifier, which triggers a bridge |
1638 | | * reconfiguration and another attempt to add this port, leading to an |
1639 | | * infinite loop if the device is configured incorrectly and cannot be |
1640 | | * added. Setting the promisc mode after a successful reconfiguration, |
1641 | | * since we already know that the device is somehow properly configured. */ |
1642 | 0 | error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf); |
1643 | 0 | if (error) { |
1644 | 0 | VLOG_ERR("%s: cannot set promisc flag", devname); |
1645 | 0 | do_del_port(dp, port); |
1646 | 0 | return error; |
1647 | 0 | } |
1648 | 0 | port->sf = sf; |
1649 | |
|
1650 | 0 | return 0; |
1651 | 0 | } |
1652 | | |
1653 | | static int |
1654 | | dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev, |
1655 | | odp_port_t *port_nop) |
1656 | 0 | { |
1657 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
1658 | 0 | char namebuf[NETDEV_VPORT_NAME_BUFSIZE]; |
1659 | 0 | const char *dpif_port; |
1660 | 0 | odp_port_t port_no; |
1661 | 0 | int error; |
1662 | |
|
1663 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
1664 | 0 | dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf); |
1665 | 0 | if (*port_nop != ODPP_NONE) { |
1666 | 0 | port_no = *port_nop; |
1667 | 0 | error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0; |
1668 | 0 | } else { |
1669 | 0 | port_no = choose_port(dp, dpif_port); |
1670 | 0 | error = port_no == ODPP_NONE ? EFBIG : 0; |
1671 | 0 | } |
1672 | 0 | if (!error) { |
1673 | 0 | *port_nop = port_no; |
1674 | 0 | error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no); |
1675 | 0 | } |
1676 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1677 | |
|
1678 | 0 | return error; |
1679 | 0 | } |
1680 | | |
1681 | | static int |
1682 | | dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no) |
1683 | 0 | { |
1684 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
1685 | 0 | int error; |
1686 | |
|
1687 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
1688 | 0 | if (port_no == ODPP_LOCAL) { |
1689 | 0 | error = EINVAL; |
1690 | 0 | } else { |
1691 | 0 | struct dp_netdev_port *port; |
1692 | |
|
1693 | 0 | error = get_port_by_number(dp, port_no, &port); |
1694 | 0 | if (!error) { |
1695 | 0 | do_del_port(dp, port); |
1696 | 0 | } |
1697 | 0 | } |
1698 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1699 | |
|
1700 | 0 | return error; |
1701 | 0 | } |
1702 | | |
1703 | | static bool |
1704 | | is_valid_port_number(odp_port_t port_no) |
1705 | 0 | { |
1706 | 0 | return port_no != ODPP_NONE; |
1707 | 0 | } |
1708 | | |
1709 | | static struct dp_netdev_port * |
1710 | | dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no) |
1711 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
1712 | 0 | { |
1713 | 0 | struct dp_netdev_port *port; |
1714 | |
|
1715 | 0 | HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) { |
1716 | 0 | if (port->port_no == port_no) { |
1717 | 0 | return port; |
1718 | 0 | } |
1719 | 0 | } |
1720 | 0 | return NULL; |
1721 | 0 | } |
1722 | | |
1723 | | static int |
1724 | | get_port_by_number(struct dp_netdev *dp, |
1725 | | odp_port_t port_no, struct dp_netdev_port **portp) |
1726 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
1727 | 0 | { |
1728 | 0 | if (!is_valid_port_number(port_no)) { |
1729 | 0 | *portp = NULL; |
1730 | 0 | return EINVAL; |
1731 | 0 | } else { |
1732 | 0 | *portp = dp_netdev_lookup_port(dp, port_no); |
1733 | 0 | return *portp ? 0 : ENODEV; |
1734 | 0 | } |
1735 | 0 | } |
1736 | | |
1737 | | static void |
1738 | | port_destroy(struct dp_netdev_port *port) |
1739 | 0 | { |
1740 | 0 | if (!port) { |
1741 | 0 | return; |
1742 | 0 | } |
1743 | | |
1744 | 0 | netdev_close(port->netdev); |
1745 | 0 | netdev_restore_flags(port->sf); |
1746 | |
|
1747 | 0 | for (unsigned i = 0; i < port->n_rxq; i++) { |
1748 | 0 | netdev_rxq_close(port->rxqs[i].rx); |
1749 | 0 | } |
1750 | 0 | ovs_mutex_destroy(&port->txq_used_mutex); |
1751 | 0 | free(port->rxq_affinity_list); |
1752 | 0 | free(port->txq_used); |
1753 | 0 | free(port->rxqs); |
1754 | 0 | free(port->type); |
1755 | 0 | free(port); |
1756 | 0 | } |
1757 | | |
1758 | | static int |
1759 | | get_port_by_name(struct dp_netdev *dp, |
1760 | | const char *devname, struct dp_netdev_port **portp) |
1761 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
1762 | 0 | { |
1763 | 0 | struct dp_netdev_port *port; |
1764 | |
|
1765 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
1766 | 0 | if (!strcmp(netdev_get_name(port->netdev), devname)) { |
1767 | 0 | *portp = port; |
1768 | 0 | return 0; |
1769 | 0 | } |
1770 | 0 | } |
1771 | | |
1772 | | /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a non |
1773 | | * existing port. */ |
1774 | 0 | return ENODEV; |
1775 | 0 | } |
1776 | | |
1777 | | /* Returns 'true' if there is a port with pmd netdev. */ |
1778 | | static bool |
1779 | | has_pmd_port(struct dp_netdev *dp) |
1780 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
1781 | 0 | { |
1782 | 0 | struct dp_netdev_port *port; |
1783 | |
|
1784 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
1785 | 0 | if (netdev_is_pmd(port->netdev)) { |
1786 | 0 | return true; |
1787 | 0 | } |
1788 | 0 | } |
1789 | | |
1790 | 0 | return false; |
1791 | 0 | } |
1792 | | |
1793 | | static void |
1794 | | do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port) |
1795 | | OVS_REQ_WRLOCK(dp->port_rwlock) |
1796 | 0 | { |
1797 | 0 | hmap_remove(&dp->ports, &port->node); |
1798 | 0 | seq_change(dp->port_seq); |
1799 | |
|
1800 | 0 | reconfigure_datapath(dp); |
1801 | 0 | port_destroy(port); |
1802 | 0 | } |
1803 | | |
1804 | | static void |
1805 | | answer_port_query(const struct dp_netdev_port *port, |
1806 | | struct dpif_port *dpif_port) |
1807 | 0 | { |
1808 | 0 | dpif_port->name = xstrdup(netdev_get_name(port->netdev)); |
1809 | 0 | dpif_port->type = xstrdup(port->type); |
1810 | 0 | dpif_port->port_no = port->port_no; |
1811 | 0 | } |
1812 | | |
1813 | | static int |
1814 | | dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no, |
1815 | | struct dpif_port *dpif_port) |
1816 | 0 | { |
1817 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
1818 | 0 | struct dp_netdev_port *port; |
1819 | 0 | int error; |
1820 | |
|
1821 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
1822 | 0 | error = get_port_by_number(dp, port_no, &port); |
1823 | 0 | if (!error && dpif_port) { |
1824 | 0 | answer_port_query(port, dpif_port); |
1825 | 0 | } |
1826 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1827 | |
|
1828 | 0 | return error; |
1829 | 0 | } |
1830 | | |
1831 | | static int |
1832 | | dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname, |
1833 | | struct dpif_port *dpif_port) |
1834 | 0 | { |
1835 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
1836 | 0 | struct dp_netdev_port *port; |
1837 | 0 | int error; |
1838 | |
|
1839 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
1840 | 0 | error = get_port_by_name(dp, devname, &port); |
1841 | 0 | if (!error && dpif_port) { |
1842 | 0 | answer_port_query(port, dpif_port); |
1843 | 0 | } |
1844 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1845 | |
|
1846 | 0 | return error; |
1847 | 0 | } |
1848 | | |
1849 | | static void |
1850 | | dp_netdev_flow_free(struct dp_netdev_flow *flow) |
1851 | 0 | { |
1852 | 0 | dp_netdev_actions_free(dp_netdev_flow_get_actions(flow)); |
1853 | 0 | free(flow->dp_extra_info); |
1854 | 0 | free(flow); |
1855 | 0 | } |
1856 | | |
1857 | | void dp_netdev_flow_unref(struct dp_netdev_flow *flow) |
1858 | 0 | { |
1859 | 0 | if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) { |
1860 | 0 | ovsrcu_postpone(dp_netdev_flow_free, flow); |
1861 | 0 | } |
1862 | 0 | } |
1863 | | |
1864 | | static inline struct dpcls * |
1865 | | dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd, |
1866 | | odp_port_t in_port) |
1867 | 0 | { |
1868 | 0 | struct dpcls *cls; |
1869 | 0 | uint32_t hash = hash_port_no(in_port); |
1870 | 0 | CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) { |
1871 | 0 | if (cls->in_port == in_port) { |
1872 | | /* Port classifier exists already */ |
1873 | 0 | return cls; |
1874 | 0 | } |
1875 | 0 | } |
1876 | 0 | return NULL; |
1877 | 0 | } |
1878 | | |
1879 | | static inline struct dpcls * |
1880 | | dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd, |
1881 | | odp_port_t in_port) |
1882 | | OVS_REQUIRES(pmd->flow_mutex) |
1883 | 0 | { |
1884 | 0 | struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
1885 | |
|
1886 | 0 | if (!cls) { |
1887 | 0 | uint32_t hash = hash_port_no(in_port); |
1888 | | |
1889 | | /* Create new classifier for in_port */ |
1890 | 0 | cls = xmalloc(sizeof(*cls)); |
1891 | 0 | dpcls_init(cls); |
1892 | 0 | cls->in_port = in_port; |
1893 | 0 | cmap_insert(&pmd->classifiers, &cls->node, hash); |
1894 | 0 | VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port); |
1895 | 0 | } |
1896 | 0 | return cls; |
1897 | 0 | } |
1898 | | |
1899 | | static void |
1900 | | log_netdev_flow_change(const struct dp_netdev_flow *flow, |
1901 | | const struct match *match, |
1902 | | const struct dp_netdev_actions *old_actions, |
1903 | | const struct nlattr *actions, |
1904 | | size_t actions_len) |
1905 | 0 | { |
1906 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
1907 | 0 | struct ofpbuf key_buf, mask_buf; |
1908 | 0 | struct odp_flow_key_parms odp_parms = { |
1909 | 0 | .flow = &match->flow, |
1910 | 0 | .mask = &match->wc.masks, |
1911 | 0 | .support = dp_netdev_support, |
1912 | 0 | }; |
1913 | |
|
1914 | 0 | if (OVS_LIKELY(VLOG_DROP_DBG((&upcall_rl)))) { |
1915 | 0 | return; |
1916 | 0 | } |
1917 | | |
1918 | 0 | ofpbuf_init(&key_buf, 0); |
1919 | 0 | ofpbuf_init(&mask_buf, 0); |
1920 | |
|
1921 | 0 | odp_flow_key_from_flow(&odp_parms, &key_buf); |
1922 | 0 | odp_parms.key_buf = &key_buf; |
1923 | 0 | odp_flow_key_from_mask(&odp_parms, &mask_buf); |
1924 | |
|
1925 | 0 | if (old_actions) { |
1926 | 0 | ds_put_cstr(&ds, "flow_mod: "); |
1927 | 0 | } else { |
1928 | 0 | ds_put_cstr(&ds, "flow_add: "); |
1929 | 0 | } |
1930 | 0 | odp_format_ufid(&flow->ufid, &ds); |
1931 | 0 | ds_put_cstr(&ds, " mega_"); |
1932 | 0 | odp_format_ufid(&flow->mega_ufid, &ds); |
1933 | 0 | ds_put_cstr(&ds, " "); |
1934 | 0 | odp_flow_format(key_buf.data, key_buf.size, |
1935 | 0 | mask_buf.data, mask_buf.size, |
1936 | 0 | NULL, &ds, false, true); |
1937 | 0 | if (old_actions) { |
1938 | 0 | ds_put_cstr(&ds, ", old_actions:"); |
1939 | 0 | format_odp_actions(&ds, old_actions->actions, old_actions->size, |
1940 | 0 | NULL); |
1941 | 0 | } |
1942 | 0 | ds_put_cstr(&ds, ", actions:"); |
1943 | 0 | format_odp_actions(&ds, actions, actions_len, NULL); |
1944 | |
|
1945 | 0 | VLOG_DBG("%s", ds_cstr(&ds)); |
1946 | |
|
1947 | 0 | ofpbuf_uninit(&key_buf); |
1948 | 0 | ofpbuf_uninit(&mask_buf); |
1949 | | |
1950 | | /* Add a printout of the actual match installed. */ |
1951 | 0 | struct match m; |
1952 | 0 | ds_clear(&ds); |
1953 | 0 | ds_put_cstr(&ds, "flow match: "); |
1954 | 0 | miniflow_expand(&flow->cr.flow.mf, &m.flow); |
1955 | 0 | miniflow_expand(&flow->cr.mask->mf, &m.wc.masks); |
1956 | 0 | memset(&m.tun_md, 0, sizeof m.tun_md); |
1957 | 0 | match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY); |
1958 | |
|
1959 | 0 | VLOG_DBG("%s", ds_cstr(&ds)); |
1960 | |
|
1961 | 0 | ds_destroy(&ds); |
1962 | 0 | } |
1963 | | |
1964 | | /* Offloaded flows can be handled asynchronously, so we do not always know |
1965 | | * whether a specific flow is offloaded or not. It might still be pending; |
1966 | | * in fact, multiple modifications can be pending, and the actual offload |
1967 | | * state depends on the completion of each modification. |
1968 | | * |
1969 | | * To correctly determine whether a flow is offloaded when it is being |
1970 | | * destroyed (and therefore requires cleanup), we must ensure that all |
1971 | | * operations have completed. To achieve this, we track the number of |
1972 | | * outstanding offloaded flow modifications. */ |
1973 | | static bool |
1974 | | offload_queue_inc(struct dp_netdev_flow *flow) |
1975 | 0 | { |
1976 | 0 | int current; |
1977 | |
|
1978 | 0 | while (true) { |
1979 | 0 | atomic_read(&flow->offload_queue_depth, ¤t); |
1980 | 0 | if (current < 0) { |
1981 | | /* We are cleaning up, so no longer enqueue operations. */ |
1982 | 0 | return false; |
1983 | 0 | } |
1984 | | |
1985 | | /* Here we try to atomically increase the value. If we do not succeed, |
1986 | | * someone else has modified it, and we need to check again for a |
1987 | | * current negative value. */ |
1988 | 0 | if (atomic_compare_exchange_strong(&flow->offload_queue_depth, |
1989 | 0 | ¤t, current + 1)) { |
1990 | 0 | return true; |
1991 | 0 | } |
1992 | 0 | } |
1993 | 0 | } |
1994 | | |
1995 | | static bool |
1996 | | offload_queue_dec(struct dp_netdev_flow *flow) |
1997 | 0 | { |
1998 | 0 | int old; |
1999 | |
|
2000 | 0 | atomic_sub(&flow->offload_queue_depth, 1, &old); |
2001 | 0 | ovs_assert(old >= 1); |
2002 | |
|
2003 | 0 | if (old == 1) { |
2004 | | /* Note that this only indicates that the queue might be empty. */ |
2005 | 0 | return true; |
2006 | 0 | } |
2007 | 0 | return false; |
2008 | 0 | } |
2009 | | |
2010 | | static bool |
2011 | | offload_queue_complete(struct dp_netdev_flow *flow) |
2012 | 0 | { |
2013 | | /* This function returns false if the queue is still in use. |
2014 | | * If the queue is empty, it will attempt to atomically mark it as |
2015 | | * 'not in use' by making the queue depth negative. This prevents |
2016 | | * other flow operations from being added. If successful, it returns |
2017 | | * true. */ |
2018 | 0 | int expected_val = 0; |
2019 | |
|
2020 | 0 | return atomic_compare_exchange_strong(&flow->offload_queue_depth, |
2021 | 0 | &expected_val, -1); |
2022 | 0 | } |
2023 | | |
2024 | | static void |
2025 | | offload_flow_reference_unreference_cb(unsigned pmd_id OVS_UNUSED, |
2026 | | void *flow_reference_) |
2027 | 0 | { |
2028 | 0 | struct dp_netdev_flow *flow_reference = flow_reference_; |
2029 | |
|
2030 | 0 | if (flow_reference) { |
2031 | 0 | flow_reference->offloaded = false; |
2032 | 0 | dp_netdev_flow_unref(flow_reference); |
2033 | 0 | } |
2034 | 0 | } |
2035 | | |
2036 | | static void |
2037 | | offload_flow_del_resume(struct dp_netdev_flow *flow_reference, |
2038 | | int error) |
2039 | 0 | { |
2040 | 0 | if (error == EINPROGRESS) { |
2041 | 0 | return; |
2042 | 0 | } |
2043 | | |
2044 | 0 | if (error) { |
2045 | 0 | odp_port_t in_port = flow_reference->flow.in_port.odp_port; |
2046 | |
|
2047 | 0 | VLOG_DBG( |
2048 | 0 | "Failed removing offload flow ufid " UUID_FMT " from port %d: %d", |
2049 | 0 | UUID_ARGS((struct uuid *)&flow_reference->mega_ufid), in_port, |
2050 | 0 | error); |
2051 | 0 | } else { |
2052 | | /* Release because we successfully removed the reference. */ |
2053 | 0 | dp_netdev_flow_unref(flow_reference); |
2054 | 0 | } |
2055 | | |
2056 | | /* Release as we took a reference in offload_flow_del(). */ |
2057 | 0 | dp_netdev_flow_unref(flow_reference); |
2058 | 0 | } |
2059 | | |
2060 | | static void |
2061 | | offload_flow_del_resume_cb(void *aux OVS_UNUSED, |
2062 | | struct dpif_flow_stats *stats OVS_UNUSED, |
2063 | | unsigned pmd_id OVS_UNUSED, |
2064 | | void *flow_reference, |
2065 | | void *previous_flow_reference OVS_UNUSED, int error) |
2066 | 0 | { |
2067 | 0 | offload_flow_del_resume(flow_reference, error); |
2068 | 0 | } |
2069 | | |
2070 | | static void |
2071 | | offload_flow_del(struct dp_netdev *dp, unsigned pmd_id, |
2072 | | struct dp_netdev_flow *flow) |
2073 | 0 | { |
2074 | 0 | odp_port_t in_port = flow->flow.in_port.odp_port; |
2075 | 0 | struct dpif_offload_flow_del del = { |
2076 | 0 | .in_port = in_port, |
2077 | 0 | .pmd_id = pmd_id, |
2078 | 0 | .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid), |
2079 | 0 | .flow_reference = flow, |
2080 | 0 | .stats = NULL, |
2081 | 0 | .cb_data = { .callback = offload_flow_del_resume_cb }, |
2082 | 0 | }; |
2083 | 0 | int error; |
2084 | |
|
2085 | 0 | if (!dpif_offload_enabled()) { |
2086 | 0 | return; |
2087 | 0 | } |
2088 | | |
2089 | | /* This offload flow delete is only called when the actual flow is |
2090 | | * destructed. However, we can only trust the state of flow->offloaded |
2091 | | * if no more flow_put operations are pending. Below, we check whether |
2092 | | * the queue can be marked as complete, and then determine if we need |
2093 | | * to schedule a removal. If not, the delete will be rescheduled later |
2094 | | * in the last offload_flow_put_resume_cb() callback. */ |
2095 | 0 | ovs_assert(flow->dead); |
2096 | 0 | if (!offload_queue_complete(flow) || !flow->offloaded) { |
2097 | 0 | return; |
2098 | 0 | } |
2099 | | |
2100 | 0 | flow->offloaded = false; |
2101 | 0 | dp_netdev_flow_ref(flow); |
2102 | | |
2103 | | /* It's the responsibility of the offload provider to remove the |
2104 | | * actual rule from hardware only if none of the other PMD threads |
2105 | | * have the rule installed in hardware. */ |
2106 | 0 | error = dpif_offload_datapath_flow_del(dp->full_name, &del); |
2107 | 0 | offload_flow_del_resume(flow, error); |
2108 | 0 | } |
2109 | | |
2110 | | static void |
2111 | | dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd, |
2112 | | struct dp_netdev_flow *flow) |
2113 | | OVS_REQUIRES(pmd->flow_mutex) |
2114 | 0 | { |
2115 | 0 | struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node); |
2116 | 0 | struct dpcls *cls; |
2117 | 0 | odp_port_t in_port = flow->flow.in_port.odp_port; |
2118 | |
|
2119 | 0 | cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
2120 | 0 | ovs_assert(cls != NULL); |
2121 | 0 | dpcls_remove(cls, &flow->cr); |
2122 | 0 | dp_netdev_simple_match_remove(pmd, flow); |
2123 | 0 | cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid)); |
2124 | 0 | ccmap_dec(&pmd->n_flows, odp_to_u32(in_port)); |
2125 | 0 | flow->dead = true; |
2126 | 0 | offload_flow_del(pmd->dp, pmd->core_id, flow); |
2127 | |
|
2128 | 0 | dp_netdev_flow_unref(flow); |
2129 | 0 | } |
2130 | | |
2131 | | static void |
2132 | | dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd) |
2133 | 0 | { |
2134 | 0 | struct dp_netdev_flow *netdev_flow; |
2135 | |
|
2136 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
2137 | 0 | CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) { |
2138 | 0 | dp_netdev_pmd_remove_flow(pmd, netdev_flow); |
2139 | 0 | } |
2140 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
2141 | 0 | } |
2142 | | |
2143 | | static int |
2144 | | dpif_netdev_flow_flush(struct dpif *dpif) |
2145 | 0 | { |
2146 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2147 | 0 | struct dp_netdev_pmd_thread *pmd; |
2148 | |
|
2149 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
2150 | 0 | dp_netdev_pmd_flow_flush(pmd); |
2151 | 0 | } |
2152 | |
|
2153 | 0 | return 0; |
2154 | 0 | } |
2155 | | |
2156 | | struct dp_netdev_port_state { |
2157 | | struct hmap_position position; |
2158 | | char *name; |
2159 | | }; |
2160 | | |
2161 | | static int |
2162 | | dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep) |
2163 | 0 | { |
2164 | 0 | *statep = xzalloc(sizeof(struct dp_netdev_port_state)); |
2165 | 0 | return 0; |
2166 | 0 | } |
2167 | | |
2168 | | static int |
2169 | | dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_, |
2170 | | struct dpif_port *dpif_port) |
2171 | 0 | { |
2172 | 0 | struct dp_netdev_port_state *state = state_; |
2173 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2174 | 0 | struct hmap_node *node; |
2175 | 0 | int retval; |
2176 | |
|
2177 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
2178 | 0 | node = hmap_at_position(&dp->ports, &state->position); |
2179 | 0 | if (node) { |
2180 | 0 | struct dp_netdev_port *port; |
2181 | |
|
2182 | 0 | port = CONTAINER_OF(node, struct dp_netdev_port, node); |
2183 | |
|
2184 | 0 | free(state->name); |
2185 | 0 | state->name = xstrdup(netdev_get_name(port->netdev)); |
2186 | 0 | dpif_port->name = state->name; |
2187 | 0 | dpif_port->type = port->type; |
2188 | 0 | dpif_port->port_no = port->port_no; |
2189 | |
|
2190 | 0 | retval = 0; |
2191 | 0 | } else { |
2192 | 0 | retval = EOF; |
2193 | 0 | } |
2194 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2195 | |
|
2196 | 0 | return retval; |
2197 | 0 | } |
2198 | | |
2199 | | static int |
2200 | | dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_) |
2201 | 0 | { |
2202 | 0 | struct dp_netdev_port_state *state = state_; |
2203 | 0 | free(state->name); |
2204 | 0 | free(state); |
2205 | 0 | return 0; |
2206 | 0 | } |
2207 | | |
2208 | | static int |
2209 | | dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED) |
2210 | 0 | { |
2211 | 0 | struct dpif_netdev *dpif = dpif_netdev_cast(dpif_); |
2212 | 0 | uint64_t new_port_seq; |
2213 | 0 | int error; |
2214 | |
|
2215 | 0 | new_port_seq = seq_read(dpif->dp->port_seq); |
2216 | 0 | if (dpif->last_port_seq != new_port_seq) { |
2217 | 0 | dpif->last_port_seq = new_port_seq; |
2218 | 0 | error = ENOBUFS; |
2219 | 0 | } else { |
2220 | 0 | error = EAGAIN; |
2221 | 0 | } |
2222 | |
|
2223 | 0 | return error; |
2224 | 0 | } |
2225 | | |
2226 | | static void |
2227 | | dpif_netdev_port_poll_wait(const struct dpif *dpif_) |
2228 | 0 | { |
2229 | 0 | struct dpif_netdev *dpif = dpif_netdev_cast(dpif_); |
2230 | |
|
2231 | 0 | seq_wait(dpif->dp->port_seq, dpif->last_port_seq); |
2232 | 0 | } |
2233 | | |
2234 | | static struct dp_netdev_flow * |
2235 | | dp_netdev_flow_cast(const struct dpcls_rule *cr) |
2236 | 0 | { |
2237 | 0 | return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL; |
2238 | 0 | } |
2239 | | |
2240 | | static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow) |
2241 | 0 | { |
2242 | 0 | return ovs_refcount_try_ref_rcu(&flow->ref_cnt); |
2243 | 0 | } |
2244 | | |
2245 | | /* netdev_flow_key utilities. |
2246 | | * |
2247 | | * netdev_flow_key is basically a miniflow. We use these functions |
2248 | | * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow |
2249 | | * functions (miniflow_clone_inline, miniflow_equal, ...), because: |
2250 | | * |
2251 | | * - Since we are dealing exclusively with miniflows created by |
2252 | | * miniflow_extract(), if the map is different the miniflow is different. |
2253 | | * Therefore we can be faster by comparing the map and the miniflow in a |
2254 | | * single memcmp(). |
2255 | | * - These functions can be inlined by the compiler. */ |
2256 | | |
2257 | | static inline bool |
2258 | | netdev_flow_key_equal(const struct netdev_flow_key *a, |
2259 | | const struct netdev_flow_key *b) |
2260 | 0 | { |
2261 | | /* 'b->len' may be not set yet. */ |
2262 | 0 | return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len); |
2263 | 0 | } |
2264 | | |
2265 | | static inline void |
2266 | | netdev_flow_key_clone(struct netdev_flow_key *dst, |
2267 | | const struct netdev_flow_key *src) |
2268 | 0 | { |
2269 | 0 | memcpy(dst, src, |
2270 | 0 | offsetof(struct netdev_flow_key, mf) + src->len); |
2271 | 0 | } |
2272 | | |
2273 | | /* Initialize a netdev_flow_key 'mask' from 'match'. */ |
2274 | | static inline void |
2275 | | netdev_flow_mask_init(struct netdev_flow_key *mask, |
2276 | | const struct match *match) |
2277 | 0 | { |
2278 | 0 | uint64_t *dst = miniflow_values(&mask->mf); |
2279 | 0 | struct flowmap fmap; |
2280 | 0 | uint32_t hash = 0; |
2281 | 0 | size_t idx; |
2282 | | |
2283 | | /* Only check masks that make sense for the flow. */ |
2284 | 0 | flow_wc_map(&match->flow, &fmap); |
2285 | 0 | flowmap_init(&mask->mf.map); |
2286 | |
|
2287 | 0 | FLOWMAP_FOR_EACH_INDEX(idx, fmap) { |
2288 | 0 | uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx); |
2289 | |
|
2290 | 0 | if (mask_u64) { |
2291 | 0 | flowmap_set(&mask->mf.map, idx, 1); |
2292 | 0 | *dst++ = mask_u64; |
2293 | 0 | hash = hash_add64(hash, mask_u64); |
2294 | 0 | } |
2295 | 0 | } |
2296 | |
|
2297 | 0 | map_t map; |
2298 | |
|
2299 | 0 | FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) { |
2300 | 0 | hash = hash_add64(hash, map); |
2301 | 0 | } |
2302 | |
|
2303 | 0 | size_t n = dst - miniflow_get_values(&mask->mf); |
2304 | |
|
2305 | 0 | mask->hash = hash_finish(hash, n * 8); |
2306 | 0 | mask->len = netdev_flow_key_size(n); |
2307 | 0 | } |
2308 | | |
2309 | | /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */ |
2310 | | static inline void |
2311 | | netdev_flow_key_init_masked(struct netdev_flow_key *dst, |
2312 | | const struct flow *flow, |
2313 | | const struct netdev_flow_key *mask) |
2314 | 0 | { |
2315 | 0 | uint64_t *dst_u64 = miniflow_values(&dst->mf); |
2316 | 0 | const uint64_t *mask_u64 = miniflow_get_values(&mask->mf); |
2317 | 0 | uint32_t hash = 0; |
2318 | 0 | uint64_t value; |
2319 | |
|
2320 | 0 | dst->len = mask->len; |
2321 | 0 | dst->mf = mask->mf; /* Copy maps. */ |
2322 | |
|
2323 | 0 | FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) { |
2324 | 0 | *dst_u64 = value & *mask_u64++; |
2325 | 0 | hash = hash_add64(hash, *dst_u64++); |
2326 | 0 | } |
2327 | 0 | dst->hash = hash_finish(hash, |
2328 | 0 | (dst_u64 - miniflow_get_values(&dst->mf)) * 8); |
2329 | 0 | } |
2330 | | |
2331 | | /* Initializes 'key' as a copy of 'flow'. */ |
2332 | | static inline void |
2333 | | netdev_flow_key_init(struct netdev_flow_key *key, |
2334 | | const struct flow *flow) |
2335 | 0 | { |
2336 | 0 | uint32_t hash = 0; |
2337 | 0 | uint64_t value; |
2338 | |
|
2339 | 0 | miniflow_map_init(&key->mf, flow); |
2340 | 0 | miniflow_init(&key->mf, flow); |
2341 | |
|
2342 | 0 | size_t n = miniflow_n_values(&key->mf); |
2343 | |
|
2344 | 0 | FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) { |
2345 | 0 | hash = hash_add64(hash, value); |
2346 | 0 | } |
2347 | |
|
2348 | 0 | key->hash = hash_finish(hash, n * 8); |
2349 | 0 | key->len = netdev_flow_key_size(n); |
2350 | 0 | } |
2351 | | |
2352 | | static inline void |
2353 | | emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow, |
2354 | | const struct netdev_flow_key *key) |
2355 | 0 | { |
2356 | 0 | if (ce->flow != flow) { |
2357 | 0 | if (ce->flow) { |
2358 | 0 | dp_netdev_flow_unref(ce->flow); |
2359 | 0 | } |
2360 | |
|
2361 | 0 | if (dp_netdev_flow_ref(flow)) { |
2362 | 0 | ce->flow = flow; |
2363 | 0 | } else { |
2364 | 0 | ce->flow = NULL; |
2365 | 0 | } |
2366 | 0 | } |
2367 | 0 | if (key) { |
2368 | 0 | netdev_flow_key_clone(&ce->key, key); |
2369 | 0 | } |
2370 | 0 | } |
2371 | | |
2372 | | static inline void |
2373 | | emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key, |
2374 | | struct dp_netdev_flow *flow) |
2375 | 0 | { |
2376 | 0 | struct emc_entry *to_be_replaced = NULL; |
2377 | 0 | struct emc_entry *current_entry; |
2378 | |
|
2379 | 0 | EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) { |
2380 | 0 | if (netdev_flow_key_equal(¤t_entry->key, key)) { |
2381 | | /* We found the entry with the 'mf' miniflow */ |
2382 | 0 | emc_change_entry(current_entry, flow, NULL); |
2383 | 0 | return; |
2384 | 0 | } |
2385 | | |
2386 | | /* Replacement policy: put the flow in an empty (not alive) entry, or |
2387 | | * in the first entry where it can be */ |
2388 | 0 | if (!to_be_replaced |
2389 | 0 | || (emc_entry_alive(to_be_replaced) |
2390 | 0 | && !emc_entry_alive(current_entry)) |
2391 | 0 | || current_entry->key.hash < to_be_replaced->key.hash) { |
2392 | 0 | to_be_replaced = current_entry; |
2393 | 0 | } |
2394 | 0 | } |
2395 | | /* We didn't find the miniflow in the cache. |
2396 | | * The 'to_be_replaced' entry is where the new flow will be stored */ |
2397 | | |
2398 | 0 | emc_change_entry(to_be_replaced, flow, key); |
2399 | 0 | } |
2400 | | |
2401 | | static inline void |
2402 | | emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd, |
2403 | | const struct netdev_flow_key *key, |
2404 | | struct dp_netdev_flow *flow) |
2405 | 0 | { |
2406 | | /* Insert an entry into the EMC based on probability value 'min'. By |
2407 | | * default the value is UINT32_MAX / 100 which yields an insertion |
2408 | | * probability of 1/100 ie. 1% */ |
2409 | |
|
2410 | 0 | uint32_t min = pmd->ctx.emc_insert_min; |
2411 | |
|
2412 | 0 | if (min && random_uint32() <= min) { |
2413 | 0 | emc_insert(&(pmd->flow_cache).emc_cache, key, flow); |
2414 | 0 | } |
2415 | 0 | } |
2416 | | |
2417 | | static inline const struct cmap_node * |
2418 | | smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash) |
2419 | 0 | { |
2420 | 0 | struct smc_cache *cache = &(pmd->flow_cache).smc_cache; |
2421 | 0 | struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK]; |
2422 | 0 | uint16_t sig = hash >> 16; |
2423 | 0 | uint16_t index = UINT16_MAX; |
2424 | |
|
2425 | 0 | for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { |
2426 | 0 | if (bucket->sig[i] == sig) { |
2427 | 0 | index = bucket->flow_idx[i]; |
2428 | 0 | break; |
2429 | 0 | } |
2430 | 0 | } |
2431 | 0 | if (index != UINT16_MAX) { |
2432 | 0 | return cmap_find_by_index(&pmd->flow_table, index); |
2433 | 0 | } |
2434 | 0 | return NULL; |
2435 | 0 | } |
2436 | | |
2437 | | /* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is |
2438 | | * turned off, 2) the flow_table index is larger than uint16_t can handle. |
2439 | | * If there is already an SMC entry having same signature, the index will be |
2440 | | * updated. If there is no existing entry, but an empty entry is available, |
2441 | | * the empty entry will be taken. If no empty entry or existing same signature, |
2442 | | * a random entry from the hashed bucket will be picked. */ |
2443 | | static inline void |
2444 | | smc_insert(struct dp_netdev_pmd_thread *pmd, |
2445 | | const struct netdev_flow_key *key, |
2446 | | uint32_t hash) |
2447 | 0 | { |
2448 | 0 | struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache; |
2449 | 0 | struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK]; |
2450 | 0 | uint16_t index; |
2451 | 0 | uint32_t cmap_index; |
2452 | 0 | int i; |
2453 | |
|
2454 | 0 | if (!pmd->ctx.smc_enable_db) { |
2455 | 0 | return; |
2456 | 0 | } |
2457 | | |
2458 | 0 | cmap_index = cmap_find_index(&pmd->flow_table, hash); |
2459 | 0 | index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index; |
2460 | | |
2461 | | /* If the index is larger than SMC can handle (uint16_t), we don't |
2462 | | * insert */ |
2463 | 0 | if (index == UINT16_MAX) { |
2464 | 0 | return; |
2465 | 0 | } |
2466 | | |
2467 | | /* If an entry with same signature already exists, update the index */ |
2468 | 0 | uint16_t sig = key->hash >> 16; |
2469 | 0 | for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { |
2470 | 0 | if (bucket->sig[i] == sig) { |
2471 | 0 | bucket->flow_idx[i] = index; |
2472 | 0 | return; |
2473 | 0 | } |
2474 | 0 | } |
2475 | | /* If there is an empty entry, occupy it. */ |
2476 | 0 | for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { |
2477 | 0 | if (bucket->flow_idx[i] == UINT16_MAX) { |
2478 | 0 | bucket->sig[i] = sig; |
2479 | 0 | bucket->flow_idx[i] = index; |
2480 | 0 | return; |
2481 | 0 | } |
2482 | 0 | } |
2483 | | /* Otherwise, pick a random entry. */ |
2484 | 0 | i = random_uint32() % SMC_ENTRY_PER_BUCKET; |
2485 | 0 | bucket->sig[i] = sig; |
2486 | 0 | bucket->flow_idx[i] = index; |
2487 | 0 | } |
2488 | | |
2489 | | inline void |
2490 | | emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd, |
2491 | | const struct netdev_flow_key *keys, |
2492 | | struct dpcls_rule **rules, |
2493 | | uint32_t emc_insert_mask) |
2494 | 0 | { |
2495 | 0 | while (emc_insert_mask) { |
2496 | 0 | uint32_t i = raw_ctz(emc_insert_mask); |
2497 | 0 | emc_insert_mask &= emc_insert_mask - 1; |
2498 | | /* Get the require parameters for EMC/SMC from the rule */ |
2499 | 0 | struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]); |
2500 | | /* Insert the key into EMC/SMC. */ |
2501 | 0 | emc_probabilistic_insert(pmd, &keys[i], flow); |
2502 | 0 | } |
2503 | 0 | } |
2504 | | |
2505 | | inline void |
2506 | | smc_insert_batch(struct dp_netdev_pmd_thread *pmd, |
2507 | | const struct netdev_flow_key *keys, |
2508 | | struct dpcls_rule **rules, |
2509 | | uint32_t smc_insert_mask) |
2510 | 0 | { |
2511 | 0 | while (smc_insert_mask) { |
2512 | 0 | uint32_t i = raw_ctz(smc_insert_mask); |
2513 | 0 | smc_insert_mask &= smc_insert_mask - 1; |
2514 | | /* Get the require parameters for EMC/SMC from the rule */ |
2515 | 0 | struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]); |
2516 | 0 | uint32_t hash = dp_netdev_flow_hash(&flow->ufid); |
2517 | | /* Insert the key into EMC/SMC. */ |
2518 | 0 | smc_insert(pmd, &keys[i], hash); |
2519 | 0 | } |
2520 | 0 | } |
2521 | | |
2522 | | static struct dp_netdev_flow * |
2523 | | dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd, |
2524 | | const struct netdev_flow_key *key, |
2525 | | int *lookup_num_p) |
2526 | 0 | { |
2527 | 0 | struct dpcls *cls; |
2528 | 0 | struct dpcls_rule *rule = NULL; |
2529 | 0 | odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf, |
2530 | 0 | in_port.odp_port)); |
2531 | 0 | struct dp_netdev_flow *netdev_flow = NULL; |
2532 | |
|
2533 | 0 | cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
2534 | 0 | if (OVS_LIKELY(cls)) { |
2535 | 0 | dpcls_lookup(cls, &key, &rule, 1, lookup_num_p); |
2536 | 0 | netdev_flow = dp_netdev_flow_cast(rule); |
2537 | 0 | } |
2538 | 0 | return netdev_flow; |
2539 | 0 | } |
2540 | | |
2541 | | static struct dp_netdev_flow * |
2542 | | dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd, |
2543 | | const ovs_u128 *ufidp, const struct nlattr *key, |
2544 | | size_t key_len) |
2545 | 0 | { |
2546 | 0 | struct dp_netdev_flow *netdev_flow; |
2547 | 0 | struct flow flow; |
2548 | 0 | ovs_u128 ufid; |
2549 | | |
2550 | | /* If a UFID is not provided, determine one based on the key. */ |
2551 | 0 | if (!ufidp && key && key_len |
2552 | 0 | && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) { |
2553 | 0 | odp_flow_key_hash(&flow, sizeof flow, &ufid); |
2554 | 0 | ufidp = &ufid; |
2555 | 0 | } |
2556 | |
|
2557 | 0 | if (ufidp) { |
2558 | 0 | CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp), |
2559 | 0 | &pmd->flow_table) { |
2560 | 0 | if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) { |
2561 | 0 | return netdev_flow; |
2562 | 0 | } |
2563 | 0 | } |
2564 | 0 | } |
2565 | | |
2566 | 0 | return NULL; |
2567 | 0 | } |
2568 | | |
2569 | | static void |
2570 | | get_dpif_flow_status(const struct dp_netdev *dp, |
2571 | | const struct dp_netdev_flow *netdev_flow_, |
2572 | | struct dpif_flow_stats *stats, |
2573 | | struct dpif_flow_attrs *attrs) |
2574 | 0 | { |
2575 | 0 | struct dpif_flow_stats offload_stats; |
2576 | 0 | struct dpif_flow_attrs offload_attrs; |
2577 | 0 | struct dp_netdev_flow *netdev_flow; |
2578 | 0 | unsigned long long n; |
2579 | 0 | long long used; |
2580 | 0 | uint16_t flags; |
2581 | |
|
2582 | 0 | netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_); |
2583 | |
|
2584 | 0 | atomic_read_relaxed(&netdev_flow->stats.packet_count, &n); |
2585 | 0 | stats->n_packets = n; |
2586 | 0 | atomic_read_relaxed(&netdev_flow->stats.byte_count, &n); |
2587 | 0 | stats->n_bytes = n; |
2588 | 0 | atomic_read_relaxed(&netdev_flow->stats.used, &used); |
2589 | 0 | stats->used = used; |
2590 | 0 | atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags); |
2591 | 0 | stats->tcp_flags = flags; |
2592 | |
|
2593 | 0 | if (dpif_offload_datapath_flow_stats(dp->full_name, |
2594 | 0 | netdev_flow->flow.in_port.odp_port, |
2595 | 0 | &netdev_flow->mega_ufid, |
2596 | 0 | &offload_stats, &offload_attrs)) { |
2597 | 0 | stats->n_packets += offload_stats.n_packets; |
2598 | 0 | stats->n_bytes += offload_stats.n_bytes; |
2599 | 0 | stats->used = MAX(stats->used, offload_stats.used); |
2600 | 0 | stats->tcp_flags |= offload_stats.tcp_flags; |
2601 | 0 | if (attrs) { |
2602 | 0 | attrs->offloaded = offload_attrs.offloaded; |
2603 | 0 | attrs->dp_layer = offload_attrs.dp_layer; |
2604 | 0 | } |
2605 | 0 | } else if (attrs) { |
2606 | 0 | attrs->offloaded = false; |
2607 | 0 | attrs->dp_layer = "ovs"; |
2608 | 0 | } |
2609 | 0 | } |
2610 | | |
2611 | | /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for |
2612 | | * storing the netlink-formatted key/mask. 'key_buf' may be the same as |
2613 | | * 'mask_buf'. Actions will be returned without copying, by relying on RCU to |
2614 | | * protect them. */ |
2615 | | static void |
2616 | | dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp, |
2617 | | const struct dp_netdev_flow *netdev_flow, |
2618 | | struct ofpbuf *key_buf, struct ofpbuf *mask_buf, |
2619 | | struct dpif_flow *flow, bool terse) |
2620 | 0 | { |
2621 | 0 | if (terse) { |
2622 | 0 | memset(flow, 0, sizeof *flow); |
2623 | 0 | } else { |
2624 | 0 | struct flow_wildcards wc; |
2625 | 0 | struct dp_netdev_actions *actions; |
2626 | 0 | size_t offset; |
2627 | 0 | struct odp_flow_key_parms odp_parms = { |
2628 | 0 | .flow = &netdev_flow->flow, |
2629 | 0 | .mask = &wc.masks, |
2630 | 0 | .support = dp_netdev_support, |
2631 | 0 | }; |
2632 | |
|
2633 | 0 | miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks); |
2634 | | /* in_port is exact matched, but we have left it out from the mask for |
2635 | | * optimnization reasons. Add in_port back to the mask. */ |
2636 | 0 | wc.masks.in_port.odp_port = ODPP_NONE; |
2637 | | |
2638 | | /* Key */ |
2639 | 0 | offset = key_buf->size; |
2640 | 0 | flow->key = ofpbuf_tail(key_buf); |
2641 | 0 | odp_flow_key_from_flow(&odp_parms, key_buf); |
2642 | 0 | flow->key_len = key_buf->size - offset; |
2643 | | |
2644 | | /* Mask */ |
2645 | 0 | offset = mask_buf->size; |
2646 | 0 | flow->mask = ofpbuf_tail(mask_buf); |
2647 | 0 | odp_parms.key_buf = key_buf; |
2648 | 0 | odp_flow_key_from_mask(&odp_parms, mask_buf); |
2649 | 0 | flow->mask_len = mask_buf->size - offset; |
2650 | | |
2651 | | /* Actions */ |
2652 | 0 | actions = dp_netdev_flow_get_actions(netdev_flow); |
2653 | 0 | flow->actions = actions->actions; |
2654 | 0 | flow->actions_len = actions->size; |
2655 | 0 | } |
2656 | |
|
2657 | 0 | flow->ufid = netdev_flow->ufid; |
2658 | 0 | flow->ufid_present = true; |
2659 | 0 | flow->pmd_id = netdev_flow->pmd_id; |
2660 | |
|
2661 | 0 | get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs); |
2662 | 0 | flow->attrs.dp_extra_info = netdev_flow->dp_extra_info; |
2663 | 0 | } |
2664 | | |
2665 | | static int |
2666 | | dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len, |
2667 | | const struct nlattr *mask_key, |
2668 | | uint32_t mask_key_len, const struct flow *flow, |
2669 | | struct flow_wildcards *wc, bool probe) |
2670 | 0 | { |
2671 | 0 | enum odp_key_fitness fitness; |
2672 | |
|
2673 | 0 | fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL); |
2674 | 0 | if (fitness) { |
2675 | 0 | if (!probe) { |
2676 | | /* This should not happen: it indicates that |
2677 | | * odp_flow_key_from_mask() and odp_flow_key_to_mask() |
2678 | | * disagree on the acceptable form of a mask. Log the problem |
2679 | | * as an error, with enough details to enable debugging. */ |
2680 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
2681 | |
|
2682 | 0 | if (!VLOG_DROP_ERR(&rl)) { |
2683 | 0 | struct ds s; |
2684 | |
|
2685 | 0 | ds_init(&s); |
2686 | 0 | odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s, |
2687 | 0 | true, true); |
2688 | 0 | VLOG_ERR("internal error parsing flow mask %s (%s)", |
2689 | 0 | ds_cstr(&s), odp_key_fitness_to_string(fitness)); |
2690 | 0 | ds_destroy(&s); |
2691 | 0 | } |
2692 | 0 | } |
2693 | |
|
2694 | 0 | return EINVAL; |
2695 | 0 | } |
2696 | | |
2697 | 0 | return 0; |
2698 | 0 | } |
2699 | | |
2700 | | static int |
2701 | | dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len, |
2702 | | struct flow *flow, bool probe) |
2703 | 0 | { |
2704 | 0 | if (odp_flow_key_to_flow(key, key_len, flow, NULL)) { |
2705 | 0 | if (!probe) { |
2706 | | /* This should not happen: it indicates that |
2707 | | * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on |
2708 | | * the acceptable form of a flow. Log the problem as an error, |
2709 | | * with enough details to enable debugging. */ |
2710 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
2711 | |
|
2712 | 0 | if (!VLOG_DROP_ERR(&rl)) { |
2713 | 0 | struct ds s; |
2714 | |
|
2715 | 0 | ds_init(&s); |
2716 | 0 | odp_flow_format(key, key_len, NULL, 0, NULL, &s, true, false); |
2717 | 0 | VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s)); |
2718 | 0 | ds_destroy(&s); |
2719 | 0 | } |
2720 | 0 | } |
2721 | |
|
2722 | 0 | return EINVAL; |
2723 | 0 | } |
2724 | | |
2725 | 0 | if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) { |
2726 | 0 | return EINVAL; |
2727 | 0 | } |
2728 | | |
2729 | 0 | return 0; |
2730 | 0 | } |
2731 | | |
2732 | | static int |
2733 | | dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get) |
2734 | 0 | { |
2735 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2736 | 0 | struct dp_netdev_flow *netdev_flow; |
2737 | 0 | struct dp_netdev_pmd_thread *pmd; |
2738 | 0 | struct hmapx to_find = HMAPX_INITIALIZER(&to_find); |
2739 | 0 | struct hmapx_node *node; |
2740 | 0 | int error = EINVAL; |
2741 | |
|
2742 | 0 | if (get->pmd_id == PMD_ID_NULL) { |
2743 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
2744 | 0 | if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) { |
2745 | 0 | dp_netdev_pmd_unref(pmd); |
2746 | 0 | } |
2747 | 0 | } |
2748 | 0 | } else { |
2749 | 0 | pmd = dp_netdev_get_pmd(dp, get->pmd_id); |
2750 | 0 | if (!pmd) { |
2751 | 0 | goto out; |
2752 | 0 | } |
2753 | 0 | hmapx_add(&to_find, pmd); |
2754 | 0 | } |
2755 | | |
2756 | 0 | if (!hmapx_count(&to_find)) { |
2757 | 0 | goto out; |
2758 | 0 | } |
2759 | | |
2760 | 0 | HMAPX_FOR_EACH (node, &to_find) { |
2761 | 0 | pmd = (struct dp_netdev_pmd_thread *) node->data; |
2762 | 0 | netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key, |
2763 | 0 | get->key_len); |
2764 | 0 | if (netdev_flow) { |
2765 | 0 | dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer, |
2766 | 0 | get->buffer, get->flow, false); |
2767 | 0 | error = 0; |
2768 | 0 | break; |
2769 | 0 | } else { |
2770 | 0 | error = ENOENT; |
2771 | 0 | } |
2772 | 0 | } |
2773 | |
|
2774 | 0 | HMAPX_FOR_EACH (node, &to_find) { |
2775 | 0 | pmd = (struct dp_netdev_pmd_thread *) node->data; |
2776 | 0 | dp_netdev_pmd_unref(pmd); |
2777 | 0 | } |
2778 | 0 | out: |
2779 | 0 | hmapx_destroy(&to_find); |
2780 | 0 | return error; |
2781 | 0 | } |
2782 | | |
2783 | | static void |
2784 | | dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid) |
2785 | 0 | { |
2786 | 0 | struct { |
2787 | 0 | struct flow masked_flow; |
2788 | 0 | struct flow wc; |
2789 | 0 | } key; |
2790 | 0 | size_t i; |
2791 | |
|
2792 | 0 | memset(&key, 0, sizeof key); |
2793 | 0 | for (i = 0; i < sizeof(struct flow); i++) { |
2794 | 0 | ((uint8_t *)&key.masked_flow)[i] = ((uint8_t *)&match->flow)[i] & |
2795 | 0 | ((uint8_t *)&match->wc)[i]; |
2796 | 0 | ((uint8_t *)&key.wc)[i] = ((uint8_t *)&match->wc)[i]; |
2797 | 0 | } |
2798 | |
|
2799 | 0 | odp_flow_key_hash(&key, sizeof key, mega_ufid); |
2800 | 0 | } |
2801 | | |
2802 | | static uint64_t |
2803 | | dp_netdev_simple_match_mark(odp_port_t in_port, ovs_be16 dl_type, |
2804 | | uint8_t nw_frag, ovs_be16 vlan_tci) |
2805 | 0 | { |
2806 | | /* Simple Match Mark: |
2807 | | * |
2808 | | * BE: |
2809 | | * +-----------------+-------------++---------+---+-----------+ |
2810 | | * | in_port | dl_type || nw_frag |CFI| VID(12) | |
2811 | | * +-----------------+-------------++---------+---+-----------+ |
2812 | | * 0 32 47 49 51 52 63 |
2813 | | * |
2814 | | * LE: |
2815 | | * +-----------------+-------------+------++-------+---+------+ |
2816 | | * | in_port | dl_type |VID(8)||nw_frag|CFI|VID(4)| |
2817 | | * +-----------------+-------------+------++-------+---+------+ |
2818 | | * 0 32 47 48 55 57 59 60 61 63 |
2819 | | * |
2820 | | * Big Endian Little Endian |
2821 | | * in_port : 32 bits [ 0..31] in_port : 32 bits [ 0..31] |
2822 | | * dl_type : 16 bits [32..47] dl_type : 16 bits [32..47] |
2823 | | * <empty> : 1 bit [48..48] vlan VID: 8 bits [48..55] |
2824 | | * nw_frag : 2 bits [49..50] <empty> : 1 bit [56..56] |
2825 | | * vlan CFI: 1 bit [51..51] nw_frag : 2 bits [57..59] |
2826 | | * vlan VID: 12 bits [52..63] vlan CFI: 1 bit [60..60] |
2827 | | * vlan VID: 4 bits [61..63] |
2828 | | * |
2829 | | * Layout is different for LE and BE in order to save a couple of |
2830 | | * network to host translations. |
2831 | | * */ |
2832 | 0 | return ((uint64_t) odp_to_u32(in_port) << 32) |
2833 | 0 | | ((OVS_FORCE uint32_t) dl_type << 16) |
2834 | | #if WORDS_BIGENDIAN |
2835 | | | (((uint16_t) nw_frag & FLOW_NW_FRAG_MASK) << VLAN_PCP_SHIFT) |
2836 | | #else |
2837 | 0 | | ((nw_frag & FLOW_NW_FRAG_MASK) << (VLAN_PCP_SHIFT - 8)) |
2838 | 0 | #endif |
2839 | 0 | | (OVS_FORCE uint16_t) (vlan_tci & htons(VLAN_VID_MASK | VLAN_CFI)); |
2840 | 0 | } |
2841 | | |
2842 | | static struct dp_netdev_flow * |
2843 | | dp_netdev_simple_match_lookup(const struct dp_netdev_pmd_thread *pmd, |
2844 | | odp_port_t in_port, ovs_be16 dl_type, |
2845 | | uint8_t nw_frag, ovs_be16 vlan_tci) |
2846 | 0 | { |
2847 | 0 | uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, |
2848 | 0 | nw_frag, vlan_tci); |
2849 | 0 | uint32_t hash = hash_uint64(mark); |
2850 | 0 | struct dp_netdev_flow *flow; |
2851 | 0 | bool found = false; |
2852 | |
|
2853 | 0 | CMAP_FOR_EACH_WITH_HASH (flow, simple_match_node, |
2854 | 0 | hash, &pmd->simple_match_table) { |
2855 | 0 | if (flow->simple_match_mark == mark) { |
2856 | 0 | found = true; |
2857 | 0 | break; |
2858 | 0 | } |
2859 | 0 | } |
2860 | 0 | return found ? flow : NULL; |
2861 | 0 | } |
2862 | | |
2863 | | static bool |
2864 | | dp_netdev_simple_match_enabled(const struct dp_netdev_pmd_thread *pmd, |
2865 | | odp_port_t in_port) |
2866 | 0 | { |
2867 | 0 | return ccmap_find(&pmd->n_flows, odp_to_u32(in_port)) |
2868 | 0 | == ccmap_find(&pmd->n_simple_flows, odp_to_u32(in_port)); |
2869 | 0 | } |
2870 | | |
2871 | | static void |
2872 | | dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd, |
2873 | | struct dp_netdev_flow *dp_flow) |
2874 | | OVS_REQUIRES(pmd->flow_mutex) |
2875 | 0 | { |
2876 | 0 | odp_port_t in_port = dp_flow->flow.in_port.odp_port; |
2877 | 0 | ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci; |
2878 | 0 | ovs_be16 dl_type = dp_flow->flow.dl_type; |
2879 | 0 | uint8_t nw_frag = dp_flow->flow.nw_frag; |
2880 | |
|
2881 | 0 | if (!dp_netdev_flow_ref(dp_flow)) { |
2882 | 0 | return; |
2883 | 0 | } |
2884 | | |
2885 | | /* Avoid double insertion. Should not happen in practice. */ |
2886 | 0 | dp_netdev_simple_match_remove(pmd, dp_flow); |
2887 | |
|
2888 | 0 | uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, |
2889 | 0 | nw_frag, vlan_tci); |
2890 | 0 | uint32_t hash = hash_uint64(mark); |
2891 | |
|
2892 | 0 | dp_flow->simple_match_mark = mark; |
2893 | 0 | cmap_insert(&pmd->simple_match_table, |
2894 | 0 | CONST_CAST(struct cmap_node *, &dp_flow->simple_match_node), |
2895 | 0 | hash); |
2896 | 0 | ccmap_inc(&pmd->n_simple_flows, odp_to_u32(in_port)); |
2897 | |
|
2898 | 0 | VLOG_DBG("Simple match insert: " |
2899 | 0 | "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").", |
2900 | 0 | pmd->core_id, in_port, mark); |
2901 | 0 | } |
2902 | | |
2903 | | static void |
2904 | | dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd, |
2905 | | struct dp_netdev_flow *dp_flow) |
2906 | | OVS_REQUIRES(pmd->flow_mutex) |
2907 | 0 | { |
2908 | 0 | odp_port_t in_port = dp_flow->flow.in_port.odp_port; |
2909 | 0 | ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci; |
2910 | 0 | ovs_be16 dl_type = dp_flow->flow.dl_type; |
2911 | 0 | uint8_t nw_frag = dp_flow->flow.nw_frag; |
2912 | 0 | struct dp_netdev_flow *flow; |
2913 | 0 | uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, |
2914 | 0 | nw_frag, vlan_tci); |
2915 | 0 | uint32_t hash = hash_uint64(mark); |
2916 | |
|
2917 | 0 | flow = dp_netdev_simple_match_lookup(pmd, in_port, dl_type, |
2918 | 0 | nw_frag, vlan_tci); |
2919 | 0 | if (flow == dp_flow) { |
2920 | 0 | VLOG_DBG("Simple match remove: " |
2921 | 0 | "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").", |
2922 | 0 | pmd->core_id, in_port, mark); |
2923 | 0 | cmap_remove(&pmd->simple_match_table, |
2924 | 0 | CONST_CAST(struct cmap_node *, &flow->simple_match_node), |
2925 | 0 | hash); |
2926 | 0 | ccmap_dec(&pmd->n_simple_flows, odp_to_u32(in_port)); |
2927 | 0 | dp_netdev_flow_unref(flow); |
2928 | 0 | } |
2929 | 0 | } |
2930 | | |
2931 | | static bool |
2932 | | dp_netdev_flow_is_simple_match(const struct match *match) |
2933 | 0 | { |
2934 | 0 | const struct flow *flow = &match->flow; |
2935 | 0 | const struct flow_wildcards *wc = &match->wc; |
2936 | |
|
2937 | 0 | if (flow->recirc_id || flow->packet_type != htonl(PT_ETH)) { |
2938 | 0 | return false; |
2939 | 0 | } |
2940 | | |
2941 | | /* Check that flow matches only minimal set of fields that always set. |
2942 | | * Also checking that VLAN VID+CFI is an exact match, because these |
2943 | | * are not mandatory and could be masked. */ |
2944 | 0 | struct flow_wildcards *minimal = xmalloc(sizeof *minimal); |
2945 | 0 | ovs_be16 vlan_tci_mask = htons(VLAN_VID_MASK | VLAN_CFI); |
2946 | |
|
2947 | 0 | flow_wildcards_init_catchall(minimal); |
2948 | | /* 'dpif-netdev' always has following in exact match: |
2949 | | * - recirc_id <-- recirc_id == 0 checked on input. |
2950 | | * - in_port <-- Will be checked on input. |
2951 | | * - packet_type <-- Assuming all packets are PT_ETH. |
2952 | | * - dl_type <-- Need to match with. |
2953 | | * - vlan_tci <-- Need to match with. |
2954 | | * - and nw_frag for ip packets. <-- Need to match with. |
2955 | | */ |
2956 | 0 | WC_MASK_FIELD(minimal, recirc_id); |
2957 | 0 | WC_MASK_FIELD(minimal, in_port); |
2958 | 0 | WC_MASK_FIELD(minimal, packet_type); |
2959 | 0 | WC_MASK_FIELD(minimal, dl_type); |
2960 | 0 | WC_MASK_FIELD_MASK(minimal, vlans[0].tci, vlan_tci_mask); |
2961 | 0 | WC_MASK_FIELD_MASK(minimal, nw_frag, FLOW_NW_FRAG_MASK); |
2962 | |
|
2963 | 0 | if (flow_wildcards_has_extra(minimal, wc) |
2964 | 0 | || wc->masks.vlans[0].tci != vlan_tci_mask) { |
2965 | 0 | free(minimal); |
2966 | 0 | return false; |
2967 | 0 | } |
2968 | 0 | free(minimal); |
2969 | |
|
2970 | 0 | return true; |
2971 | 0 | } |
2972 | | |
2973 | | static void |
2974 | | offload_flow_put_resume(struct dp_netdev *dp, struct dp_netdev_flow *flow, |
2975 | | struct dp_netdev_flow *previous_flow_reference, |
2976 | | unsigned pmd_id, int error) |
2977 | 0 | { |
2978 | 0 | if (error == EINPROGRESS) { |
2979 | 0 | return; |
2980 | 0 | } |
2981 | | |
2982 | 0 | if (!error) { |
2983 | 0 | flow->offloaded = true; |
2984 | 0 | } else { |
2985 | | /* If the flow was already offloaded, the new action set can no |
2986 | | * longer be offloaded. In theory, we should disassociate the |
2987 | | * offload from all PMDs that have this flow marked as offloaded. |
2988 | | * Unfortunately, there is no mechanism to inform other PMDs, so |
2989 | | * we cannot explicitly mark such flows. This situation typically |
2990 | | * occurs when the revalidator modifies the flow, so it is safe to |
2991 | | * assume it will update all affected flows and that the offload |
2992 | | * will subsequently fail. */ |
2993 | 0 | flow->offloaded = false; |
2994 | | |
2995 | | /* On error, the flow reference was not stored by the offload provider, |
2996 | | * so we should decrease the reference. */ |
2997 | 0 | dp_netdev_flow_unref(flow); |
2998 | 0 | } |
2999 | |
|
3000 | 0 | if (offload_queue_dec(flow) && flow->dead) { |
3001 | | /* If flows are processed asynchronously, modifications might |
3002 | | * still be queued up while the flow is being removed. If this |
3003 | | * was the last flow in the queue on a dead flow, we try again |
3004 | | * to see if we need to remove this flow. */ |
3005 | 0 | offload_flow_del(dp, pmd_id, flow); |
3006 | 0 | } |
3007 | |
|
3008 | 0 | if (previous_flow_reference) { |
3009 | 0 | dp_netdev_flow_unref(previous_flow_reference); |
3010 | 0 | if (previous_flow_reference != flow) { |
3011 | 0 | VLOG_DBG("Updated flow reference was from outdated flow"); |
3012 | 0 | } |
3013 | 0 | } |
3014 | 0 | } |
3015 | | |
3016 | | static void |
3017 | | offload_flow_put_resume_cb(void *aux, struct dpif_flow_stats *stats OVS_UNUSED, |
3018 | | unsigned pmd_id, void *flow_reference_, |
3019 | | void *old_flow_reference_, |
3020 | | int error) |
3021 | 0 | { |
3022 | 0 | struct dp_netdev *dp = aux; |
3023 | 0 | struct dp_netdev_flow *flow_reference = flow_reference_; |
3024 | 0 | struct dp_netdev_flow *old_flow_reference = old_flow_reference_; |
3025 | |
|
3026 | 0 | offload_flow_put_resume(dp, flow_reference, old_flow_reference, |
3027 | 0 | pmd_id, error); |
3028 | 0 | } |
3029 | | |
3030 | | static void |
3031 | | offload_flow_put(struct dp_netdev_pmd_thread *pmd, struct dp_netdev_flow *flow, |
3032 | | struct match *match, const struct nlattr *actions, |
3033 | | size_t actions_len) |
3034 | 0 | { |
3035 | 0 | struct dpif_offload_flow_put put = { |
3036 | 0 | .in_port = match->flow.in_port.odp_port, |
3037 | 0 | .orig_in_port = flow->orig_in_port, |
3038 | 0 | .pmd_id = pmd->core_id, |
3039 | 0 | .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid), |
3040 | 0 | .match = match, |
3041 | 0 | .actions = actions, |
3042 | 0 | .actions_len = actions_len, |
3043 | 0 | .stats = NULL, |
3044 | 0 | .flow_reference = flow, |
3045 | 0 | .cb_data = { |
3046 | 0 | .callback = offload_flow_put_resume_cb, |
3047 | 0 | .callback_aux = pmd->dp, |
3048 | 0 | }, |
3049 | 0 | }; |
3050 | 0 | void *previous_flow_reference = NULL; |
3051 | 0 | int error; |
3052 | |
|
3053 | 0 | if (!dpif_offload_enabled() || flow->dead || !offload_queue_inc(flow)) { |
3054 | 0 | return; |
3055 | 0 | } |
3056 | | |
3057 | 0 | dp_netdev_flow_ref(flow); |
3058 | |
|
3059 | 0 | error = dpif_offload_datapath_flow_put(pmd->dp->full_name, &put, |
3060 | 0 | &previous_flow_reference); |
3061 | 0 | offload_flow_put_resume(pmd->dp, put.flow_reference, |
3062 | 0 | previous_flow_reference, |
3063 | 0 | pmd->core_id, error); |
3064 | 0 | } |
3065 | | |
3066 | | static struct dp_netdev_flow * |
3067 | | dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd, |
3068 | | struct match *match, const ovs_u128 *ufid, |
3069 | | const struct nlattr *actions, size_t actions_len, |
3070 | | odp_port_t orig_in_port) |
3071 | | OVS_REQUIRES(pmd->flow_mutex) |
3072 | 0 | { |
3073 | 0 | struct ds extra_info = DS_EMPTY_INITIALIZER; |
3074 | 0 | struct dp_netdev_flow *flow; |
3075 | 0 | struct netdev_flow_key mask; |
3076 | 0 | struct dpcls *cls; |
3077 | 0 | size_t unit; |
3078 | | |
3079 | | /* Make sure in_port is exact matched before we read it. */ |
3080 | 0 | ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE); |
3081 | 0 | odp_port_t in_port = match->flow.in_port.odp_port; |
3082 | | |
3083 | | /* As we select the dpcls based on the port number, each netdev flow |
3084 | | * belonging to the same dpcls will have the same odp_port value. |
3085 | | * For performance reasons we wildcard odp_port here in the mask. In the |
3086 | | * typical case dp_hash is also wildcarded, and the resulting 8-byte |
3087 | | * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and |
3088 | | * will not be part of the subtable mask. |
3089 | | * This will speed up the hash computation during dpcls_lookup() because |
3090 | | * there is one less call to hash_add64() in this case. */ |
3091 | 0 | match->wc.masks.in_port.odp_port = 0; |
3092 | 0 | netdev_flow_mask_init(&mask, match); |
3093 | 0 | match->wc.masks.in_port.odp_port = ODPP_NONE; |
3094 | | |
3095 | | /* Make sure wc does not have metadata. */ |
3096 | 0 | ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata) |
3097 | 0 | && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs)); |
3098 | | |
3099 | | /* Do not allocate extra space. */ |
3100 | 0 | flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len); |
3101 | 0 | memset(&flow->stats, 0, sizeof flow->stats); |
3102 | 0 | flow->dead = false; |
3103 | 0 | flow->offloaded = false; |
3104 | 0 | atomic_init(&flow->offload_queue_depth, 0); |
3105 | 0 | flow->batch = NULL; |
3106 | 0 | flow->orig_in_port = orig_in_port; |
3107 | 0 | *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id; |
3108 | 0 | *CONST_CAST(struct flow *, &flow->flow) = match->flow; |
3109 | 0 | *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid; |
3110 | 0 | ovs_refcount_init(&flow->ref_cnt); |
3111 | 0 | ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len)); |
3112 | |
|
3113 | 0 | dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid)); |
3114 | 0 | netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask); |
3115 | | |
3116 | | /* Select dpcls for in_port. Relies on in_port to be exact match. */ |
3117 | 0 | cls = dp_netdev_pmd_find_dpcls(pmd, in_port); |
3118 | 0 | dpcls_insert(cls, &flow->cr, &mask); |
3119 | |
|
3120 | 0 | ds_put_cstr(&extra_info, "miniflow_bits("); |
3121 | 0 | FLOWMAP_FOR_EACH_UNIT (unit) { |
3122 | 0 | if (unit) { |
3123 | 0 | ds_put_char(&extra_info, ','); |
3124 | 0 | } |
3125 | 0 | ds_put_format(&extra_info, "%d", |
3126 | 0 | count_1bits(flow->cr.mask->mf.map.bits[unit])); |
3127 | 0 | } |
3128 | 0 | ds_put_char(&extra_info, ')'); |
3129 | 0 | flow->dp_extra_info = ds_steal_cstr(&extra_info); |
3130 | 0 | ds_destroy(&extra_info); |
3131 | |
|
3132 | 0 | cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node), |
3133 | 0 | dp_netdev_flow_hash(&flow->ufid)); |
3134 | 0 | ccmap_inc(&pmd->n_flows, odp_to_u32(in_port)); |
3135 | |
|
3136 | 0 | if (dp_netdev_flow_is_simple_match(match)) { |
3137 | 0 | dp_netdev_simple_match_insert(pmd, flow); |
3138 | 0 | } |
3139 | |
|
3140 | 0 | offload_flow_put(pmd, flow, match, actions, actions_len); |
3141 | 0 | log_netdev_flow_change(flow, match, NULL, actions, actions_len); |
3142 | |
|
3143 | 0 | return flow; |
3144 | 0 | } |
3145 | | |
3146 | | static int |
3147 | | flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd, |
3148 | | struct netdev_flow_key *key, |
3149 | | struct match *match, |
3150 | | ovs_u128 *ufid, |
3151 | | const struct dpif_flow_put *put, |
3152 | | struct dpif_flow_stats *stats) |
3153 | 0 | { |
3154 | 0 | struct dp_netdev_flow *netdev_flow = NULL; |
3155 | 0 | int error = 0; |
3156 | |
|
3157 | 0 | if (stats) { |
3158 | 0 | memset(stats, 0, sizeof *stats); |
3159 | 0 | } |
3160 | |
|
3161 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
3162 | 0 | if (put->ufid) { |
3163 | 0 | netdev_flow = dp_netdev_pmd_find_flow(pmd, put->ufid, |
3164 | 0 | put->key, put->key_len); |
3165 | 0 | } else { |
3166 | | /* Use key instead of the locally generated ufid |
3167 | | * to search netdev_flow. */ |
3168 | 0 | netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); |
3169 | 0 | } |
3170 | |
|
3171 | 0 | if (put->flags & DPIF_FP_CREATE) { |
3172 | 0 | if (!netdev_flow) { |
3173 | 0 | dp_netdev_flow_add(pmd, match, ufid, |
3174 | 0 | put->actions, put->actions_len, ODPP_NONE); |
3175 | 0 | } else { |
3176 | 0 | error = EEXIST; |
3177 | 0 | } |
3178 | 0 | goto exit; |
3179 | 0 | } |
3180 | | |
3181 | 0 | if (put->flags & DPIF_FP_MODIFY) { |
3182 | 0 | if (!netdev_flow) { |
3183 | 0 | error = ENOENT; |
3184 | 0 | } else { |
3185 | 0 | if (!put->ufid && !flow_equal(&match->flow, &netdev_flow->flow)) { |
3186 | | /* Overlapping flow. */ |
3187 | 0 | error = EINVAL; |
3188 | 0 | goto exit; |
3189 | 0 | } |
3190 | | |
3191 | 0 | struct dp_netdev_actions *new_actions; |
3192 | 0 | struct dp_netdev_actions *old_actions; |
3193 | |
|
3194 | 0 | new_actions = dp_netdev_actions_create(put->actions, |
3195 | 0 | put->actions_len); |
3196 | |
|
3197 | 0 | old_actions = dp_netdev_flow_get_actions(netdev_flow); |
3198 | 0 | ovsrcu_set(&netdev_flow->actions, new_actions); |
3199 | |
|
3200 | 0 | offload_flow_put(pmd, netdev_flow, match, put->actions, |
3201 | 0 | put->actions_len); |
3202 | 0 | log_netdev_flow_change(netdev_flow, match, old_actions, |
3203 | 0 | put->actions, put->actions_len); |
3204 | |
|
3205 | 0 | if (stats) { |
3206 | 0 | get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL); |
3207 | 0 | } |
3208 | 0 | if (put->flags & DPIF_FP_ZERO_STATS) { |
3209 | | /* XXX: The userspace datapath uses thread local statistics |
3210 | | * (for flows), which should be updated only by the owning |
3211 | | * thread. Since we cannot write on stats memory here, |
3212 | | * we choose not to support this flag. Please note: |
3213 | | * - This feature is currently used only by dpctl commands with |
3214 | | * option --clear. |
3215 | | * - Should the need arise, this operation can be implemented |
3216 | | * by keeping a base value (to be update here) for each |
3217 | | * counter, and subtracting it before outputting the stats */ |
3218 | 0 | error = EOPNOTSUPP; |
3219 | 0 | } |
3220 | 0 | ovsrcu_postpone(dp_netdev_actions_free, old_actions); |
3221 | 0 | } |
3222 | 0 | } |
3223 | | |
3224 | 0 | exit: |
3225 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
3226 | 0 | return error; |
3227 | 0 | } |
3228 | | |
3229 | | static int |
3230 | | dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put) |
3231 | 0 | { |
3232 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
3233 | 0 | struct netdev_flow_key key; |
3234 | 0 | struct dp_netdev_pmd_thread *pmd; |
3235 | 0 | struct match match; |
3236 | 0 | ovs_u128 ufid; |
3237 | 0 | int error; |
3238 | 0 | bool probe = put->flags & DPIF_FP_PROBE; |
3239 | |
|
3240 | 0 | if (put->stats) { |
3241 | 0 | memset(put->stats, 0, sizeof *put->stats); |
3242 | 0 | } |
3243 | 0 | error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow, |
3244 | 0 | probe); |
3245 | 0 | if (error) { |
3246 | 0 | return error; |
3247 | 0 | } |
3248 | 0 | error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len, |
3249 | 0 | put->mask, put->mask_len, |
3250 | 0 | &match.flow, &match.wc, probe); |
3251 | 0 | if (error) { |
3252 | 0 | return error; |
3253 | 0 | } |
3254 | | |
3255 | 0 | if (match.wc.masks.in_port.odp_port != ODPP_NONE) { |
3256 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
3257 | |
|
3258 | 0 | VLOG_ERR_RL(&rl, "failed to put%s flow: in_port is not an exact match", |
3259 | 0 | (put->flags & DPIF_FP_CREATE) ? "[create]" |
3260 | 0 | : (put->flags & DPIF_FP_MODIFY) ? "[modify]" : "[zero]"); |
3261 | 0 | return EINVAL; |
3262 | 0 | } |
3263 | | |
3264 | 0 | if (put->ufid) { |
3265 | 0 | ufid = *put->ufid; |
3266 | 0 | } else { |
3267 | 0 | odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid); |
3268 | 0 | } |
3269 | | |
3270 | | /* The Netlink encoding of datapath flow keys cannot express |
3271 | | * wildcarding the presence of a VLAN tag. Instead, a missing VLAN |
3272 | | * tag is interpreted as exact match on the fact that there is no |
3273 | | * VLAN. Unless we refactor a lot of code that translates between |
3274 | | * Netlink and struct flow representations, we have to do the same |
3275 | | * here. This must be in sync with 'match' in handle_packet_upcall(). */ |
3276 | 0 | if (!match.wc.masks.vlans[0].tci) { |
3277 | 0 | match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI); |
3278 | 0 | } |
3279 | | |
3280 | | /* Must produce a netdev_flow_key for lookup. |
3281 | | * Use the same method as employed to create the key when adding |
3282 | | * the flow to the dplcs to make sure they match. |
3283 | | * We need to put in the unmasked key as flow_put_on_pmd() will first try |
3284 | | * to see if an entry exists doing a packet type lookup. As masked-out |
3285 | | * fields are interpreted as zeros, they could falsely match a wider IP |
3286 | | * address mask. Installation of the flow will use the match variable. */ |
3287 | 0 | netdev_flow_key_init(&key, &match.flow); |
3288 | |
|
3289 | 0 | if (put->pmd_id == PMD_ID_NULL) { |
3290 | 0 | if (cmap_count(&dp->poll_threads) == 0) { |
3291 | 0 | return EINVAL; |
3292 | 0 | } |
3293 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
3294 | 0 | struct dpif_flow_stats pmd_stats; |
3295 | 0 | int pmd_error; |
3296 | |
|
3297 | 0 | pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, |
3298 | 0 | &pmd_stats); |
3299 | 0 | if (pmd_error) { |
3300 | 0 | error = pmd_error; |
3301 | 0 | } else if (put->stats) { |
3302 | 0 | put->stats->n_packets += pmd_stats.n_packets; |
3303 | 0 | put->stats->n_bytes += pmd_stats.n_bytes; |
3304 | 0 | put->stats->used = MAX(put->stats->used, pmd_stats.used); |
3305 | 0 | put->stats->tcp_flags |= pmd_stats.tcp_flags; |
3306 | 0 | } |
3307 | 0 | } |
3308 | 0 | } else { |
3309 | 0 | pmd = dp_netdev_get_pmd(dp, put->pmd_id); |
3310 | 0 | if (!pmd) { |
3311 | 0 | return EINVAL; |
3312 | 0 | } |
3313 | 0 | error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats); |
3314 | 0 | dp_netdev_pmd_unref(pmd); |
3315 | 0 | } |
3316 | | |
3317 | 0 | return error; |
3318 | 0 | } |
3319 | | |
3320 | | static int |
3321 | | flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd, |
3322 | | struct dpif_flow_stats *stats, |
3323 | | const struct dpif_flow_del *del) |
3324 | 0 | { |
3325 | 0 | struct dp_netdev_flow *netdev_flow; |
3326 | 0 | int error = 0; |
3327 | |
|
3328 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
3329 | 0 | netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key, |
3330 | 0 | del->key_len); |
3331 | 0 | if (netdev_flow) { |
3332 | 0 | if (stats) { |
3333 | 0 | get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL); |
3334 | 0 | } |
3335 | 0 | dp_netdev_pmd_remove_flow(pmd, netdev_flow); |
3336 | 0 | } else { |
3337 | 0 | error = ENOENT; |
3338 | 0 | } |
3339 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
3340 | |
|
3341 | 0 | return error; |
3342 | 0 | } |
3343 | | |
3344 | | static int |
3345 | | dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del) |
3346 | 0 | { |
3347 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
3348 | 0 | struct dp_netdev_pmd_thread *pmd; |
3349 | 0 | int error = 0; |
3350 | |
|
3351 | 0 | if (del->stats) { |
3352 | 0 | memset(del->stats, 0, sizeof *del->stats); |
3353 | 0 | } |
3354 | |
|
3355 | 0 | if (del->pmd_id == PMD_ID_NULL) { |
3356 | 0 | if (cmap_count(&dp->poll_threads) == 0) { |
3357 | 0 | return EINVAL; |
3358 | 0 | } |
3359 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
3360 | 0 | struct dpif_flow_stats pmd_stats; |
3361 | 0 | int pmd_error; |
3362 | |
|
3363 | 0 | pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del); |
3364 | 0 | if (pmd_error) { |
3365 | 0 | error = pmd_error; |
3366 | 0 | } else if (del->stats) { |
3367 | 0 | del->stats->n_packets += pmd_stats.n_packets; |
3368 | 0 | del->stats->n_bytes += pmd_stats.n_bytes; |
3369 | 0 | del->stats->used = MAX(del->stats->used, pmd_stats.used); |
3370 | 0 | del->stats->tcp_flags |= pmd_stats.tcp_flags; |
3371 | 0 | } |
3372 | 0 | } |
3373 | 0 | } else { |
3374 | 0 | pmd = dp_netdev_get_pmd(dp, del->pmd_id); |
3375 | 0 | if (!pmd) { |
3376 | 0 | return EINVAL; |
3377 | 0 | } |
3378 | 0 | error = flow_del_on_pmd(pmd, del->stats, del); |
3379 | 0 | dp_netdev_pmd_unref(pmd); |
3380 | 0 | } |
3381 | | |
3382 | | |
3383 | 0 | return error; |
3384 | 0 | } |
3385 | | |
/* State of one flow dump over the pmd threads of a netdev datapath.  The
 * mutable members are read and updated by dpif_netdev_flow_dump_next() while
 * it holds 'mutex'. */
struct dpif_netdev_flow_dump {
    struct dpif_flow_dump up;           /* Generic dump; see
                                         * dpif_netdev_flow_dump_cast(). */
    struct cmap_position poll_thread_pos; /* Cursor handed to
                                           * dp_netdev_pmd_get_next(). */
    struct cmap_position flow_pos;      /* Cursor within the flow table of
                                         * the pmd being dumped. */
    struct dp_netdev_pmd_thread *cur_pmd; /* Pmd being dumped; a reference is
                                           * kept across dump_next() calls. */
    int status;                         /* 0 while dumping, EOF once every
                                         * pmd has been exhausted. */
    struct ovs_mutex mutex;             /* Serializes dump_next() callers. */
};
3394 | | |
3395 | | static struct dpif_netdev_flow_dump * |
3396 | | dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump) |
3397 | 0 | { |
3398 | 0 | return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up); |
3399 | 0 | } |
3400 | | |
3401 | | static struct dpif_flow_dump * |
3402 | | dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse, |
3403 | | struct dpif_flow_dump_types *types) |
3404 | 0 | { |
3405 | 0 | struct dpif_netdev_flow_dump *dump; |
3406 | |
|
3407 | 0 | dump = xzalloc(sizeof *dump); |
3408 | 0 | dpif_flow_dump_init(&dump->up, dpif_, terse, types); |
3409 | 0 | ovs_mutex_init(&dump->mutex); |
3410 | |
|
3411 | 0 | return &dump->up; |
3412 | 0 | } |
3413 | | |
3414 | | static int |
3415 | | dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_) |
3416 | 0 | { |
3417 | 0 | struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_); |
3418 | |
|
3419 | 0 | ovs_mutex_destroy(&dump->mutex); |
3420 | 0 | free(dump); |
3421 | 0 | return 0; |
3422 | 0 | } |
3423 | | |
/* Per-thread state of a flow dump.  The key and mask buffers back the
 * 'struct ofpbuf's that dpif_netdev_flow_dump_next() fills for each flow of
 * a batch, one buffer pair per batch slot. */
struct dpif_netdev_flow_dump_thread {
    struct dpif_flow_dump_thread up;    /* Generic part; see
                                         * dpif_netdev_flow_dump_thread_cast(). */
    struct dpif_netdev_flow_dump *dump; /* Dump this thread belongs to. */
    struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];  /* Flow keys. */
    struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH]; /* Flow masks. */
};
3430 | | |
3431 | | static struct dpif_netdev_flow_dump_thread * |
3432 | | dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread) |
3433 | 0 | { |
3434 | 0 | return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up); |
3435 | 0 | } |
3436 | | |
3437 | | static struct dpif_flow_dump_thread * |
3438 | | dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_) |
3439 | 0 | { |
3440 | 0 | struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_); |
3441 | 0 | struct dpif_netdev_flow_dump_thread *thread; |
3442 | |
|
3443 | 0 | thread = xmalloc(sizeof *thread); |
3444 | 0 | dpif_flow_dump_thread_init(&thread->up, &dump->up); |
3445 | 0 | thread->dump = dump; |
3446 | 0 | return &thread->up; |
3447 | 0 | } |
3448 | | |
/* Frees the per-thread dump state created by
 * dpif_netdev_flow_dump_thread_create(). */
static void
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
{
    free(dpif_netdev_flow_dump_thread_cast(thread_));
}
3457 | | |
3458 | | static int |
3459 | | dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_, |
3460 | | struct dpif_flow *flows, int max_flows) |
3461 | 0 | { |
3462 | 0 | struct dpif_netdev_flow_dump_thread *thread |
3463 | 0 | = dpif_netdev_flow_dump_thread_cast(thread_); |
3464 | 0 | struct dpif_netdev_flow_dump *dump = thread->dump; |
3465 | 0 | struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH]; |
3466 | 0 | struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dump->dpif); |
3467 | 0 | struct dp_netdev *dp = get_dp_netdev(&dpif->dpif); |
3468 | 0 | int n_flows = 0; |
3469 | 0 | int i; |
3470 | |
|
3471 | 0 | ovs_mutex_lock(&dump->mutex); |
3472 | 0 | if (!dump->status) { |
3473 | 0 | struct dp_netdev_pmd_thread *pmd = dump->cur_pmd; |
3474 | 0 | int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH); |
3475 | | |
3476 | | /* First call to dump_next(), extracts the first pmd thread. |
3477 | | * If there is no pmd thread, returns immediately. */ |
3478 | 0 | if (!pmd) { |
3479 | 0 | pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos); |
3480 | 0 | if (!pmd) { |
3481 | 0 | ovs_mutex_unlock(&dump->mutex); |
3482 | 0 | return n_flows; |
3483 | |
|
3484 | 0 | } |
3485 | 0 | } |
3486 | | |
3487 | 0 | do { |
3488 | 0 | for (n_flows = 0; n_flows < flow_limit; n_flows++) { |
3489 | 0 | struct cmap_node *node; |
3490 | |
|
3491 | 0 | node = cmap_next_position(&pmd->flow_table, &dump->flow_pos); |
3492 | 0 | if (!node) { |
3493 | 0 | break; |
3494 | 0 | } |
3495 | 0 | netdev_flows[n_flows] = CONTAINER_OF(node, |
3496 | 0 | struct dp_netdev_flow, |
3497 | 0 | node); |
3498 | 0 | } |
3499 | | /* When finishing dumping the current pmd thread, moves to |
3500 | | * the next. */ |
3501 | 0 | if (n_flows < flow_limit) { |
3502 | 0 | memset(&dump->flow_pos, 0, sizeof dump->flow_pos); |
3503 | 0 | dp_netdev_pmd_unref(pmd); |
3504 | 0 | pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos); |
3505 | 0 | if (!pmd) { |
3506 | 0 | dump->status = EOF; |
3507 | 0 | break; |
3508 | 0 | } |
3509 | 0 | } |
3510 | | /* Keeps the reference to next caller. */ |
3511 | 0 | dump->cur_pmd = pmd; |
3512 | | |
3513 | | /* If the current dump is empty, do not exit the loop, since the |
3514 | | * remaining pmds could have flows to be dumped. Just dumps again |
3515 | | * on the new 'pmd'. */ |
3516 | 0 | } while (!n_flows); |
3517 | 0 | } |
3518 | 0 | ovs_mutex_unlock(&dump->mutex); |
3519 | |
|
3520 | 0 | for (i = 0; i < n_flows; i++) { |
3521 | 0 | struct odputil_keybuf *maskbuf = &thread->maskbuf[i]; |
3522 | 0 | struct odputil_keybuf *keybuf = &thread->keybuf[i]; |
3523 | 0 | struct dp_netdev_flow *netdev_flow = netdev_flows[i]; |
3524 | 0 | struct dpif_flow *f = &flows[i]; |
3525 | 0 | struct ofpbuf key, mask; |
3526 | |
|
3527 | 0 | ofpbuf_use_stack(&key, keybuf, sizeof *keybuf); |
3528 | 0 | ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf); |
3529 | 0 | dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f, |
3530 | 0 | dump->up.terse); |
3531 | 0 | } |
3532 | |
|
3533 | 0 | return n_flows; |
3534 | 0 | } |
3535 | | |
3536 | | static int |
3537 | | dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute) |
3538 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
3539 | 0 | { |
3540 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
3541 | 0 | struct dp_netdev_pmd_thread *pmd; |
3542 | 0 | struct dp_packet_batch pp; |
3543 | |
|
3544 | 0 | if (dp_packet_size(execute->packet) < ETH_HEADER_LEN || |
3545 | 0 | dp_packet_size(execute->packet) > UINT16_MAX) { |
3546 | 0 | return EINVAL; |
3547 | 0 | } |
3548 | | |
3549 | | /* Tries finding the 'pmd'. If NULL is returned, that means |
3550 | | * the current thread is a non-pmd thread and should use |
3551 | | * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */ |
3552 | 0 | pmd = ovsthread_getspecific(dp->per_pmd_key); |
3553 | 0 | if (!pmd) { |
3554 | 0 | pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID); |
3555 | 0 | if (!pmd) { |
3556 | 0 | return EBUSY; |
3557 | 0 | } |
3558 | 0 | } |
3559 | | |
3560 | 0 | if (execute->probe) { |
3561 | | /* If this is part of a probe, Drop the packet, since executing |
3562 | | * the action may actually cause spurious packets be sent into |
3563 | | * the network. */ |
3564 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
3565 | 0 | dp_netdev_pmd_unref(pmd); |
3566 | 0 | } |
3567 | 0 | return 0; |
3568 | 0 | } |
3569 | | |
3570 | | /* If the current thread is non-pmd thread, acquires |
3571 | | * the 'non_pmd_mutex'. */ |
3572 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
3573 | 0 | ovs_mutex_lock(&dp->non_pmd_mutex); |
3574 | 0 | } |
3575 | | |
3576 | | /* Update current time in PMD context. We don't care about EMC insertion |
3577 | | * probability, because we are on a slow path. */ |
3578 | 0 | pmd_thread_ctx_time_update(pmd); |
3579 | | |
3580 | | /* The action processing expects the RSS hash to be valid, because |
3581 | | * it's always initialized at the beginning of datapath processing. |
3582 | | * In this case, though, 'execute->packet' may not have gone through |
3583 | | * the datapath at all, it may have been generated by the upper layer |
3584 | | * (OpenFlow packet-out, BFD frame, ...). */ |
3585 | 0 | if (!dp_packet_rss_valid(execute->packet)) { |
3586 | 0 | dp_packet_set_rss_hash(execute->packet, |
3587 | 0 | flow_hash_5tuple(execute->flow, 0)); |
3588 | 0 | } |
3589 | | |
3590 | | /* Making a copy because the packet might be stolen during the execution |
3591 | | * and caller might still need it. */ |
3592 | 0 | struct dp_packet *packet_clone = dp_packet_clone(execute->packet); |
3593 | 0 | dp_packet_batch_init_packet(&pp, packet_clone); |
3594 | 0 | dp_netdev_execute_actions(pmd, &pp, false, execute->flow, |
3595 | 0 | execute->actions, execute->actions_len); |
3596 | 0 | dp_netdev_pmd_flush_output_packets(pmd, true); |
3597 | |
|
3598 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
3599 | 0 | ovs_mutex_unlock(&dp->non_pmd_mutex); |
3600 | 0 | dp_netdev_pmd_unref(pmd); |
3601 | 0 | } |
3602 | |
|
3603 | 0 | if (dp_packet_batch_size(&pp) == 1) { |
3604 | | /* Packet wasn't dropped during the execution. Swapping content with |
3605 | | * the original packet, because the caller might expect actions to |
3606 | | * modify it. Uisng the packet from a batch instead of 'packet_clone' |
3607 | | * because it maybe stolen and replaced by other packet, e.g. by |
3608 | | * the fragmentation engine. */ |
3609 | 0 | dp_packet_swap(execute->packet, pp.packets[0]); |
3610 | 0 | dp_packet_delete_batch(&pp, true); |
3611 | 0 | } else if (dp_packet_batch_size(&pp)) { |
3612 | | /* FIXME: We have more packets than expected. Likely, we got IP |
3613 | | * fragments of the reassembled packet. Dropping them here as we have |
3614 | | * no way to get them to the caller. It might be that all the required |
3615 | | * actions with them are already executed, but it also might not be a |
3616 | | * case, e.g. if dpif_netdev_execute() called to execute a single |
3617 | | * tunnel push. */ |
3618 | 0 | dp_packet_delete_batch(&pp, true); |
3619 | 0 | } |
3620 | |
|
3621 | 0 | return 0; |
3622 | 0 | } |
3623 | | |
3624 | | static void |
3625 | | dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops) |
3626 | 0 | { |
3627 | 0 | size_t i; |
3628 | |
|
3629 | 0 | for (i = 0; i < n_ops; i++) { |
3630 | 0 | struct dpif_op *op = ops[i]; |
3631 | |
|
3632 | 0 | switch (op->type) { |
3633 | 0 | case DPIF_OP_FLOW_PUT: |
3634 | 0 | op->error = dpif_netdev_flow_put(dpif, &op->flow_put); |
3635 | 0 | break; |
3636 | | |
3637 | 0 | case DPIF_OP_FLOW_DEL: |
3638 | 0 | op->error = dpif_netdev_flow_del(dpif, &op->flow_del); |
3639 | 0 | break; |
3640 | | |
3641 | 0 | case DPIF_OP_EXECUTE: |
3642 | 0 | op->error = dpif_netdev_execute(dpif, &op->execute); |
3643 | 0 | break; |
3644 | | |
3645 | 0 | case DPIF_OP_FLOW_GET: |
3646 | 0 | op->error = dpif_netdev_flow_get(dpif, &op->flow_get); |
3647 | 0 | break; |
3648 | 0 | } |
3649 | 0 | } |
3650 | 0 | } |
3651 | | |
3652 | | /* Enable or Disable PMD auto load balancing. */ |
3653 | | static void |
3654 | | set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log) |
3655 | 0 | { |
3656 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
3657 | |
|
3658 | 0 | if (pmd_alb->is_enabled != state || always_log) { |
3659 | 0 | pmd_alb->is_enabled = state; |
3660 | 0 | if (pmd_alb->is_enabled) { |
3661 | 0 | uint8_t rebalance_load_thresh; |
3662 | |
|
3663 | 0 | atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, |
3664 | 0 | &rebalance_load_thresh); |
3665 | 0 | VLOG_INFO("PMD auto load balance is enabled, " |
3666 | 0 | "interval %"PRIu64" mins, " |
3667 | 0 | "pmd load threshold %"PRIu8"%%, " |
3668 | 0 | "improvement threshold %"PRIu8"%%.", |
3669 | 0 | pmd_alb->rebalance_intvl / MIN_TO_MSEC, |
3670 | 0 | rebalance_load_thresh, |
3671 | 0 | pmd_alb->rebalance_improve_thresh); |
3672 | 0 | } else { |
3673 | 0 | pmd_alb->rebalance_poll_timer = 0; |
3674 | 0 | VLOG_INFO("PMD auto load balance is disabled."); |
3675 | 0 | } |
3676 | 0 | } |
3677 | 0 | } |
3678 | | |
3679 | | static int |
3680 | | parse_pmd_sleep_list(const char *max_sleep_list, |
3681 | | struct pmd_sleep **pmd_sleeps) |
3682 | 0 | { |
3683 | 0 | char *list, *copy, *key, *value; |
3684 | 0 | int num_vals = 0; |
3685 | |
|
3686 | 0 | if (!max_sleep_list) { |
3687 | 0 | return num_vals; |
3688 | 0 | } |
3689 | | |
3690 | 0 | list = copy = xstrdup(max_sleep_list); |
3691 | |
|
3692 | 0 | while (ofputil_parse_key_value(&list, &key, &value)) { |
3693 | 0 | uint64_t temp, pmd_max_sleep; |
3694 | 0 | char *error = NULL; |
3695 | 0 | unsigned core; |
3696 | 0 | int i; |
3697 | |
|
3698 | 0 | error = str_to_u64(key, &temp); |
3699 | 0 | if (error) { |
3700 | 0 | free(error); |
3701 | 0 | continue; |
3702 | 0 | } |
3703 | | |
3704 | 0 | if (value[0] == '\0') { |
3705 | | /* No value specified. key is dp default. */ |
3706 | 0 | core = UINT_MAX; |
3707 | 0 | pmd_max_sleep = temp; |
3708 | 0 | } else { |
3709 | 0 | error = str_to_u64(value, &pmd_max_sleep); |
3710 | 0 | if (!error && temp < UINT_MAX) { |
3711 | | /* Key is pmd core id. */ |
3712 | 0 | core = (unsigned) temp; |
3713 | 0 | } else { |
3714 | 0 | free(error); |
3715 | 0 | continue; |
3716 | 0 | } |
3717 | 0 | } |
3718 | | |
3719 | | /* Detect duplicate max sleep values. */ |
3720 | 0 | for (i = 0; i < num_vals; i++) { |
3721 | 0 | if ((*pmd_sleeps)[i].core_id == core) { |
3722 | 0 | break; |
3723 | 0 | } |
3724 | 0 | } |
3725 | 0 | if (i == num_vals) { |
3726 | | /* Not duplicate, add a new entry. */ |
3727 | 0 | *pmd_sleeps = xrealloc(*pmd_sleeps, |
3728 | 0 | (num_vals + 1) * sizeof **pmd_sleeps); |
3729 | 0 | num_vals++; |
3730 | 0 | } |
3731 | |
|
3732 | 0 | pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep); |
3733 | |
|
3734 | 0 | (*pmd_sleeps)[i].core_id = core; |
3735 | 0 | (*pmd_sleeps)[i].max_sleep = pmd_max_sleep; |
3736 | 0 | } |
3737 | |
|
3738 | 0 | free(copy); |
3739 | 0 | return num_vals; |
3740 | 0 | } |
3741 | | |
3742 | | static void |
3743 | | log_pmd_sleep(unsigned core_id, int numa_id, uint64_t pmd_max_sleep) |
3744 | 0 | { |
3745 | 0 | if (core_id == NON_PMD_CORE_ID) { |
3746 | 0 | return; |
3747 | 0 | } |
3748 | 0 | VLOG_INFO("PMD thread on numa_id: %d, core id: %2d, " |
3749 | 0 | "max sleep: %4"PRIu64" us.", numa_id, core_id, pmd_max_sleep); |
3750 | 0 | } |
3751 | | |
3752 | | static void |
3753 | | pmd_init_max_sleep(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd) |
3754 | 0 | { |
3755 | 0 | uint64_t max_sleep = dp->pmd_max_sleep_default; |
3756 | 0 | struct pmd_sleep *pmd_sleeps = NULL; |
3757 | 0 | int num_vals; |
3758 | |
|
3759 | 0 | num_vals = parse_pmd_sleep_list(dp->max_sleep_list, &pmd_sleeps); |
3760 | | |
3761 | | /* Check if the user has set a specific value for this pmd. */ |
3762 | 0 | for (int i = 0; i < num_vals; i++) { |
3763 | 0 | if (pmd_sleeps[i].core_id == pmd->core_id) { |
3764 | 0 | max_sleep = pmd_sleeps[i].max_sleep; |
3765 | 0 | break; |
3766 | 0 | } |
3767 | 0 | } |
3768 | 0 | atomic_init(&pmd->max_sleep, max_sleep); |
3769 | 0 | log_pmd_sleep(pmd->core_id, pmd->numa_id, max_sleep); |
3770 | 0 | free(pmd_sleeps); |
3771 | 0 | } |
3772 | | |
3773 | | static bool |
3774 | | assign_sleep_values_to_pmds(struct dp_netdev *dp, int num_vals, |
3775 | | struct pmd_sleep *pmd_sleeps) |
3776 | 0 | { |
3777 | 0 | struct dp_netdev_pmd_thread *pmd; |
3778 | 0 | bool value_changed = false; |
3779 | |
|
3780 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
3781 | 0 | uint64_t new_max_sleep, cur_pmd_max_sleep; |
3782 | |
|
3783 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
3784 | 0 | continue; |
3785 | 0 | } |
3786 | | |
3787 | | /* Default to global value. */ |
3788 | 0 | new_max_sleep = dp->pmd_max_sleep_default; |
3789 | | |
3790 | | /* Check for pmd specific value. */ |
3791 | 0 | for (int i = 0; i < num_vals; i++) { |
3792 | 0 | if (pmd->core_id == pmd_sleeps[i].core_id) { |
3793 | 0 | new_max_sleep = pmd_sleeps[i].max_sleep; |
3794 | 0 | break; |
3795 | 0 | } |
3796 | 0 | } |
3797 | 0 | atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep); |
3798 | 0 | if (new_max_sleep != cur_pmd_max_sleep) { |
3799 | 0 | atomic_store_relaxed(&pmd->max_sleep, new_max_sleep); |
3800 | 0 | value_changed = true; |
3801 | 0 | } |
3802 | 0 | } |
3803 | 0 | return value_changed; |
3804 | 0 | } |
3805 | | |
3806 | | static void |
3807 | | log_all_pmd_sleeps(struct dp_netdev *dp) |
3808 | 0 | { |
3809 | 0 | struct dp_netdev_pmd_thread **pmd_list = NULL; |
3810 | 0 | struct dp_netdev_pmd_thread *pmd; |
3811 | 0 | size_t n; |
3812 | |
|
3813 | 0 | VLOG_INFO("Default PMD thread max sleep: %4"PRIu64" us.", |
3814 | 0 | dp->pmd_max_sleep_default); |
3815 | |
|
3816 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
3817 | |
|
3818 | 0 | for (size_t i = 0; i < n; i++) { |
3819 | 0 | uint64_t cur_pmd_max_sleep; |
3820 | |
|
3821 | 0 | pmd = pmd_list[i]; |
3822 | 0 | atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep); |
3823 | 0 | log_pmd_sleep(pmd->core_id, pmd->numa_id, cur_pmd_max_sleep); |
3824 | 0 | } |
3825 | 0 | free(pmd_list); |
3826 | 0 | } |
3827 | | |
3828 | | static bool |
3829 | | set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config) |
3830 | 0 | { |
3831 | 0 | const char *max_sleep_list = smap_get(config, "pmd-sleep-max"); |
3832 | 0 | struct pmd_sleep *pmd_sleeps = NULL; |
3833 | 0 | uint64_t default_max_sleep = 0; |
3834 | 0 | bool default_changed = false; |
3835 | 0 | bool pmd_changed = false; |
3836 | 0 | uint64_t pmd_maxsleep; |
3837 | 0 | int num_vals = 0; |
3838 | | |
3839 | | /* Check for deprecated 'pmd-maxsleep' value. */ |
3840 | 0 | pmd_maxsleep = smap_get_ullong(config, "pmd-maxsleep", UINT64_MAX); |
3841 | 0 | if (pmd_maxsleep != UINT64_MAX && !max_sleep_list) { |
3842 | 0 | VLOG_WARN_ONCE("pmd-maxsleep is deprecated. " |
3843 | 0 | "Please use pmd-sleep-max instead."); |
3844 | 0 | default_max_sleep = pmd_maxsleep; |
3845 | 0 | } |
3846 | | |
3847 | | /* Check if there is no change in string or value. */ |
3848 | 0 | if (!!dp->max_sleep_list == !!max_sleep_list) { |
3849 | 0 | if (max_sleep_list |
3850 | 0 | ? nullable_string_is_equal(max_sleep_list, dp->max_sleep_list) |
3851 | 0 | : default_max_sleep == dp->pmd_max_sleep_default) { |
3852 | 0 | return false; |
3853 | 0 | } |
3854 | 0 | } |
3855 | | |
3856 | | /* Free existing string and copy new one (if any). */ |
3857 | 0 | free(dp->max_sleep_list); |
3858 | 0 | dp->max_sleep_list = nullable_xstrdup(max_sleep_list); |
3859 | |
|
3860 | 0 | if (max_sleep_list) { |
3861 | 0 | num_vals = parse_pmd_sleep_list(max_sleep_list, &pmd_sleeps); |
3862 | | |
3863 | | /* Check if the user has set a global value. */ |
3864 | 0 | for (int i = 0; i < num_vals; i++) { |
3865 | 0 | if (pmd_sleeps[i].core_id == UINT_MAX) { |
3866 | 0 | default_max_sleep = pmd_sleeps[i].max_sleep; |
3867 | 0 | break; |
3868 | 0 | } |
3869 | 0 | } |
3870 | 0 | } |
3871 | |
|
3872 | 0 | if (dp->pmd_max_sleep_default != default_max_sleep) { |
3873 | 0 | dp->pmd_max_sleep_default = default_max_sleep; |
3874 | 0 | default_changed = true; |
3875 | 0 | } |
3876 | 0 | pmd_changed = assign_sleep_values_to_pmds(dp, num_vals, pmd_sleeps); |
3877 | |
|
3878 | 0 | free(pmd_sleeps); |
3879 | 0 | return default_changed || pmd_changed; |
3880 | 0 | } |
3881 | | |
3882 | | /* Applies datapath configuration from the database. Some of the changes are |
3883 | | * actually applied in dpif_netdev_run(). */ |
3884 | | static int |
3885 | | dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) |
3886 | 0 | { |
3887 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
3888 | 0 | const char *cmask = smap_get(other_config, "pmd-cpu-mask"); |
3889 | 0 | const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign", |
3890 | 0 | "cycles"); |
3891 | 0 | unsigned long long insert_prob = |
3892 | 0 | smap_get_ullong(other_config, "emc-insert-inv-prob", |
3893 | 0 | DEFAULT_EM_FLOW_INSERT_INV_PROB); |
3894 | 0 | uint32_t insert_min, cur_min; |
3895 | 0 | uint32_t tx_flush_interval, cur_tx_flush_interval; |
3896 | 0 | uint64_t rebalance_intvl; |
3897 | 0 | uint8_t cur_rebalance_load; |
3898 | 0 | uint32_t rebalance_load, rebalance_improve; |
3899 | 0 | bool log_autolb = false; |
3900 | 0 | enum sched_assignment_type pmd_rxq_assign_type; |
3901 | |
|
3902 | 0 | tx_flush_interval = smap_get_int(other_config, "tx-flush-interval", |
3903 | 0 | DEFAULT_TX_FLUSH_INTERVAL); |
3904 | 0 | atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval); |
3905 | 0 | if (tx_flush_interval != cur_tx_flush_interval) { |
3906 | 0 | atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval); |
3907 | 0 | VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us", |
3908 | 0 | tx_flush_interval); |
3909 | 0 | } |
3910 | |
|
3911 | 0 | if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) { |
3912 | 0 | free(dp->pmd_cmask); |
3913 | 0 | dp->pmd_cmask = nullable_xstrdup(cmask); |
3914 | 0 | dp_netdev_request_reconfigure(dp); |
3915 | 0 | } |
3916 | |
|
3917 | 0 | atomic_read_relaxed(&dp->emc_insert_min, &cur_min); |
3918 | 0 | if (insert_prob <= UINT32_MAX) { |
3919 | 0 | insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob; |
3920 | 0 | } else { |
3921 | 0 | insert_min = DEFAULT_EM_FLOW_INSERT_MIN; |
3922 | 0 | insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB; |
3923 | 0 | } |
3924 | |
|
3925 | 0 | if (insert_min != cur_min) { |
3926 | 0 | atomic_store_relaxed(&dp->emc_insert_min, insert_min); |
3927 | 0 | if (insert_min == 0) { |
3928 | 0 | VLOG_INFO("EMC insertion probability changed to zero"); |
3929 | 0 | } else { |
3930 | 0 | VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)", |
3931 | 0 | insert_prob, (100 / (float)insert_prob)); |
3932 | 0 | } |
3933 | 0 | } |
3934 | |
|
3935 | 0 | bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false); |
3936 | 0 | bool cur_perf_enabled; |
3937 | 0 | atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled); |
3938 | 0 | if (perf_enabled != cur_perf_enabled) { |
3939 | 0 | atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled); |
3940 | 0 | if (perf_enabled) { |
3941 | 0 | VLOG_INFO("PMD performance metrics collection enabled"); |
3942 | 0 | } else { |
3943 | 0 | VLOG_INFO("PMD performance metrics collection disabled"); |
3944 | 0 | } |
3945 | 0 | } |
3946 | |
|
3947 | 0 | bool smc_enable = smap_get_bool(other_config, "smc-enable", false); |
3948 | 0 | bool cur_smc; |
3949 | 0 | atomic_read_relaxed(&dp->smc_enable_db, &cur_smc); |
3950 | 0 | if (smc_enable != cur_smc) { |
3951 | 0 | atomic_store_relaxed(&dp->smc_enable_db, smc_enable); |
3952 | 0 | if (smc_enable) { |
3953 | 0 | VLOG_INFO("SMC cache is enabled"); |
3954 | 0 | } else { |
3955 | 0 | VLOG_INFO("SMC cache is disabled"); |
3956 | 0 | } |
3957 | 0 | } |
3958 | |
|
3959 | 0 | if (!strcmp(pmd_rxq_assign, "roundrobin")) { |
3960 | 0 | pmd_rxq_assign_type = SCHED_ROUNDROBIN; |
3961 | 0 | } else if (!strcmp(pmd_rxq_assign, "cycles")) { |
3962 | 0 | pmd_rxq_assign_type = SCHED_CYCLES; |
3963 | 0 | } else if (!strcmp(pmd_rxq_assign, "group")) { |
3964 | 0 | pmd_rxq_assign_type = SCHED_GROUP; |
3965 | 0 | } else { |
3966 | | /* Default. */ |
3967 | 0 | VLOG_WARN("Unsupported rx queue to PMD assignment mode in " |
3968 | 0 | "pmd-rxq-assign. Defaulting to 'cycles'."); |
3969 | 0 | pmd_rxq_assign_type = SCHED_CYCLES; |
3970 | 0 | pmd_rxq_assign = "cycles"; |
3971 | 0 | } |
3972 | 0 | if (dp->pmd_rxq_assign_type != pmd_rxq_assign_type) { |
3973 | 0 | dp->pmd_rxq_assign_type = pmd_rxq_assign_type; |
3974 | 0 | VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.", |
3975 | 0 | pmd_rxq_assign); |
3976 | 0 | dp_netdev_request_reconfigure(dp); |
3977 | 0 | } |
3978 | |
|
3979 | 0 | bool pmd_iso = smap_get_bool(other_config, "pmd-rxq-isolate", true); |
3980 | |
|
3981 | 0 | if (pmd_rxq_assign_type != SCHED_GROUP && pmd_iso == false) { |
3982 | | /* Invalid combination. */ |
3983 | 0 | VLOG_WARN("pmd-rxq-isolate can only be set false " |
3984 | 0 | "when using pmd-rxq-assign=group"); |
3985 | 0 | pmd_iso = true; |
3986 | 0 | } |
3987 | 0 | if (dp->pmd_iso != pmd_iso) { |
3988 | 0 | dp->pmd_iso = pmd_iso; |
3989 | 0 | if (pmd_iso) { |
3990 | 0 | VLOG_INFO("pmd-rxq-affinity isolates PMD core"); |
3991 | 0 | } else { |
3992 | 0 | VLOG_INFO("pmd-rxq-affinity does not isolate PMD core"); |
3993 | 0 | } |
3994 | 0 | dp_netdev_request_reconfigure(dp); |
3995 | 0 | } |
3996 | |
|
3997 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
3998 | |
|
3999 | 0 | rebalance_intvl = smap_get_ullong(other_config, |
4000 | 0 | "pmd-auto-lb-rebal-interval", |
4001 | 0 | ALB_REBALANCE_INTERVAL); |
4002 | 0 | if (rebalance_intvl > MAX_ALB_REBALANCE_INTERVAL) { |
4003 | 0 | rebalance_intvl = ALB_REBALANCE_INTERVAL; |
4004 | 0 | } |
4005 | | |
4006 | | /* Input is in min, convert it to msec. */ |
4007 | 0 | rebalance_intvl = |
4008 | 0 | rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC; |
4009 | |
|
4010 | 0 | if (pmd_alb->rebalance_intvl != rebalance_intvl) { |
4011 | 0 | pmd_alb->rebalance_intvl = rebalance_intvl; |
4012 | 0 | VLOG_INFO("PMD auto load balance interval set to " |
4013 | 0 | "%"PRIu64" mins\n", rebalance_intvl / MIN_TO_MSEC); |
4014 | 0 | log_autolb = true; |
4015 | 0 | } |
4016 | |
|
4017 | 0 | rebalance_improve = smap_get_uint(other_config, |
4018 | 0 | "pmd-auto-lb-improvement-threshold", |
4019 | 0 | ALB_IMPROVEMENT_THRESHOLD); |
4020 | 0 | if (rebalance_improve > 100) { |
4021 | 0 | rebalance_improve = ALB_IMPROVEMENT_THRESHOLD; |
4022 | 0 | } |
4023 | 0 | if (rebalance_improve != pmd_alb->rebalance_improve_thresh) { |
4024 | 0 | pmd_alb->rebalance_improve_thresh = rebalance_improve; |
4025 | 0 | VLOG_INFO("PMD auto load balance improvement threshold set to " |
4026 | 0 | "%"PRIu32"%%", rebalance_improve); |
4027 | 0 | log_autolb = true; |
4028 | 0 | } |
4029 | |
|
4030 | 0 | rebalance_load = smap_get_uint(other_config, "pmd-auto-lb-load-threshold", |
4031 | 0 | ALB_LOAD_THRESHOLD); |
4032 | 0 | if (rebalance_load > 100) { |
4033 | 0 | rebalance_load = ALB_LOAD_THRESHOLD; |
4034 | 0 | } |
4035 | 0 | atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, &cur_rebalance_load); |
4036 | 0 | if (rebalance_load != cur_rebalance_load) { |
4037 | 0 | atomic_store_relaxed(&pmd_alb->rebalance_load_thresh, |
4038 | 0 | rebalance_load); |
4039 | 0 | VLOG_INFO("PMD auto load balance load threshold set to %"PRIu32"%%", |
4040 | 0 | rebalance_load); |
4041 | 0 | log_autolb = true; |
4042 | 0 | } |
4043 | |
|
4044 | 0 | bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false); |
4045 | |
|
4046 | 0 | set_pmd_auto_lb(dp, autolb_state, log_autolb); |
4047 | |
|
4048 | 0 | bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config); |
4049 | |
|
4050 | 0 | if (ovsthread_once_start(&dp->once_set_config)) { |
4051 | 0 | log_all_pmd_sleeps(dp); |
4052 | 0 | dpif_offload_datapath_register_flow_unreference_cb( |
4053 | 0 | dpif, offload_flow_reference_unreference_cb); |
4054 | |
|
4055 | 0 | ovsthread_once_done(&dp->once_set_config); |
4056 | 0 | } else if (sleep_changed) { |
4057 | 0 | log_all_pmd_sleeps(dp); |
4058 | 0 | } |
4059 | |
|
4060 | 0 | return 0; |
4061 | 0 | } |
4062 | | |
4063 | | static bool |
4064 | | dpif_netdev_number_handlers_required(struct dpif *dpif_ OVS_UNUSED, |
4065 | | uint32_t *n_handlers) |
4066 | 0 | { |
4067 | 0 | *n_handlers = 0; |
4068 | 0 | return true; |
4069 | 0 | } |
4070 | | |
4071 | | /* Parses affinity list and returns result in 'core_ids'. */ |
4072 | | static int |
4073 | | parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq) |
4074 | 0 | { |
4075 | 0 | unsigned i; |
4076 | 0 | char *list, *copy, *key, *value; |
4077 | 0 | int error = 0; |
4078 | |
|
4079 | 0 | for (i = 0; i < n_rxq; i++) { |
4080 | 0 | core_ids[i] = OVS_CORE_UNSPEC; |
4081 | 0 | } |
4082 | |
|
4083 | 0 | if (!affinity_list) { |
4084 | 0 | return 0; |
4085 | 0 | } |
4086 | | |
4087 | 0 | list = copy = xstrdup(affinity_list); |
4088 | |
|
4089 | 0 | while (ofputil_parse_key_value(&list, &key, &value)) { |
4090 | 0 | int rxq_id, core_id; |
4091 | |
|
4092 | 0 | if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0 |
4093 | 0 | || !str_to_int(value, 0, &core_id) || core_id < 0) { |
4094 | 0 | error = EINVAL; |
4095 | 0 | break; |
4096 | 0 | } |
4097 | | |
4098 | 0 | if (rxq_id < n_rxq) { |
4099 | 0 | core_ids[rxq_id] = core_id; |
4100 | 0 | } |
4101 | 0 | } |
4102 | |
|
4103 | 0 | free(copy); |
4104 | 0 | return error; |
4105 | 0 | } |
4106 | | |
4107 | | /* Parses 'affinity_list' and applies configuration if it is valid. */ |
4108 | | static int |
4109 | | dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port, |
4110 | | const char *affinity_list) |
4111 | 0 | { |
4112 | 0 | unsigned *core_ids, i; |
4113 | 0 | int error = 0; |
4114 | |
|
4115 | 0 | core_ids = xmalloc(port->n_rxq * sizeof *core_ids); |
4116 | 0 | if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) { |
4117 | 0 | error = EINVAL; |
4118 | 0 | goto exit; |
4119 | 0 | } |
4120 | | |
4121 | 0 | for (i = 0; i < port->n_rxq; i++) { |
4122 | 0 | port->rxqs[i].core_id = core_ids[i]; |
4123 | 0 | } |
4124 | |
|
4125 | 0 | exit: |
4126 | 0 | free(core_ids); |
4127 | 0 | return error; |
4128 | 0 | } |
4129 | | |
4130 | | /* Returns 'true' if one of the 'port's RX queues exists in 'poll_list' |
4131 | | * of given PMD thread. */ |
4132 | | static bool |
4133 | | dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd, |
4134 | | struct dp_netdev_port *port) |
4135 | | OVS_EXCLUDED(pmd->port_mutex) |
4136 | 0 | { |
4137 | 0 | struct rxq_poll *poll; |
4138 | 0 | bool found = false; |
4139 | |
|
4140 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
4141 | 0 | HMAP_FOR_EACH (poll, node, &pmd->poll_list) { |
4142 | 0 | if (port == poll->rxq->port) { |
4143 | 0 | found = true; |
4144 | 0 | break; |
4145 | 0 | } |
4146 | 0 | } |
4147 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
4148 | 0 | return found; |
4149 | 0 | } |
4150 | | |
4151 | | /* Updates port configuration from the database. The changes are actually |
4152 | | * applied in dpif_netdev_run(). */ |
4153 | | static int |
4154 | | dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no, |
4155 | | const struct smap *cfg) |
4156 | 0 | { |
4157 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
4158 | 0 | struct dp_netdev_port *port; |
4159 | 0 | int error = 0; |
4160 | 0 | const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity"); |
4161 | 0 | bool emc_enabled = smap_get_bool(cfg, "emc-enable", true); |
4162 | 0 | const char *tx_steering_mode = smap_get(cfg, "tx-steering"); |
4163 | 0 | enum txq_req_mode txq_mode; |
4164 | |
|
4165 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
4166 | 0 | error = get_port_by_number(dp, port_no, &port); |
4167 | 0 | if (error) { |
4168 | 0 | goto unlock; |
4169 | 0 | } |
4170 | | |
4171 | 0 | if (emc_enabled != port->emc_enabled) { |
4172 | 0 | struct dp_netdev_pmd_thread *pmd; |
4173 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
4174 | 0 | uint32_t cur_min, insert_prob; |
4175 | |
|
4176 | 0 | port->emc_enabled = emc_enabled; |
4177 | | /* Mark for reload all the threads that polls this port and request |
4178 | | * for reconfiguration for the actual reloading of threads. */ |
4179 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
4180 | 0 | if (dpif_netdev_pmd_polls_port(pmd, port)) { |
4181 | 0 | pmd->need_reload = true; |
4182 | 0 | } |
4183 | 0 | } |
4184 | 0 | dp_netdev_request_reconfigure(dp); |
4185 | |
|
4186 | 0 | ds_put_format(&ds, "%s: EMC has been %s.", |
4187 | 0 | netdev_get_name(port->netdev), |
4188 | 0 | (emc_enabled) ? "enabled" : "disabled"); |
4189 | 0 | if (emc_enabled) { |
4190 | 0 | ds_put_cstr(&ds, " Current insertion probability is "); |
4191 | 0 | atomic_read_relaxed(&dp->emc_insert_min, &cur_min); |
4192 | 0 | if (!cur_min) { |
4193 | 0 | ds_put_cstr(&ds, "zero."); |
4194 | 0 | } else { |
4195 | 0 | insert_prob = UINT32_MAX / cur_min; |
4196 | 0 | ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).", |
4197 | 0 | insert_prob, 100 / (float) insert_prob); |
4198 | 0 | } |
4199 | 0 | } |
4200 | 0 | VLOG_INFO("%s", ds_cstr(&ds)); |
4201 | 0 | ds_destroy(&ds); |
4202 | 0 | } |
4203 | | |
4204 | | /* Checking for RXq affinity changes. */ |
4205 | 0 | if (netdev_is_pmd(port->netdev) |
4206 | 0 | && !nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) { |
4207 | |
|
4208 | 0 | error = dpif_netdev_port_set_rxq_affinity(port, affinity_list); |
4209 | 0 | if (error) { |
4210 | 0 | goto unlock; |
4211 | 0 | } |
4212 | 0 | free(port->rxq_affinity_list); |
4213 | 0 | port->rxq_affinity_list = nullable_xstrdup(affinity_list); |
4214 | |
|
4215 | 0 | dp_netdev_request_reconfigure(dp); |
4216 | 0 | } |
4217 | | |
4218 | 0 | if (nullable_string_is_equal(tx_steering_mode, "hash")) { |
4219 | 0 | txq_mode = TXQ_REQ_MODE_HASH; |
4220 | 0 | } else { |
4221 | 0 | txq_mode = TXQ_REQ_MODE_THREAD; |
4222 | 0 | } |
4223 | |
|
4224 | 0 | if (txq_mode != port->txq_requested_mode) { |
4225 | 0 | port->txq_requested_mode = txq_mode; |
4226 | 0 | VLOG_INFO("%s: Tx packet steering mode has been set to '%s'.", |
4227 | 0 | netdev_get_name(port->netdev), |
4228 | 0 | (txq_mode == TXQ_REQ_MODE_THREAD) ? "thread" : "hash"); |
4229 | 0 | dp_netdev_request_reconfigure(dp); |
4230 | 0 | } |
4231 | |
|
4232 | 0 | unlock: |
4233 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
4234 | 0 | return error; |
4235 | 0 | } |
4236 | | |
4237 | | static int |
4238 | | dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED, |
4239 | | uint32_t queue_id, uint32_t *priority) |
4240 | 0 | { |
4241 | 0 | *priority = queue_id; |
4242 | 0 | return 0; |
4243 | 0 | } |
4244 | | |
4245 | | |
4246 | | /* Creates and returns a new 'struct dp_netdev_actions', whose actions are |
4247 | | * a copy of the 'size' bytes of 'actions' input parameters. */ |
4248 | | struct dp_netdev_actions * |
4249 | | dp_netdev_actions_create(const struct nlattr *actions, size_t size) |
4250 | 0 | { |
4251 | 0 | struct dp_netdev_actions *netdev_actions; |
4252 | |
|
4253 | 0 | netdev_actions = xmalloc(sizeof *netdev_actions + size); |
4254 | 0 | netdev_actions->size = size; |
4255 | 0 | if (size) { |
4256 | 0 | memcpy(netdev_actions->actions, actions, size); |
4257 | 0 | } |
4258 | |
|
4259 | 0 | return netdev_actions; |
4260 | 0 | } |
4261 | | |
4262 | | struct dp_netdev_actions * |
4263 | | dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow) |
4264 | 0 | { |
4265 | 0 | return ovsrcu_get(struct dp_netdev_actions *, &flow->actions); |
4266 | 0 | } |
4267 | | |
/* Releases 'actions'.  The structure is a single allocation (see
 * dp_netdev_actions_create()), so one free() is sufficient. */
static void
dp_netdev_actions_free(struct dp_netdev_actions *actions)
{
    free(actions);
}
4273 | | |
4274 | | static void |
4275 | | dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx, |
4276 | | enum rxq_cycles_counter_type type, |
4277 | | unsigned long long cycles) |
4278 | 0 | { |
4279 | 0 | atomic_store_relaxed(&rx->cycles[type], cycles); |
4280 | 0 | } |
4281 | | |
4282 | | static void |
4283 | | dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx, |
4284 | | enum rxq_cycles_counter_type type, |
4285 | | unsigned long long cycles) |
4286 | 0 | { |
4287 | 0 | non_atomic_ullong_add(&rx->cycles[type], cycles); |
4288 | 0 | } |
4289 | | |
4290 | | static uint64_t |
4291 | | dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx, |
4292 | | enum rxq_cycles_counter_type type) |
4293 | 0 | { |
4294 | 0 | unsigned long long processing_cycles; |
4295 | 0 | atomic_read_relaxed(&rx->cycles[type], &processing_cycles); |
4296 | 0 | return processing_cycles; |
4297 | 0 | } |
4298 | | |
4299 | | static void |
4300 | | dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx, |
4301 | | unsigned long long cycles) |
4302 | 0 | { |
4303 | 0 | unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX; |
4304 | 0 | atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles); |
4305 | 0 | } |
4306 | | |
4307 | | static uint64_t |
4308 | | dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx) |
4309 | 0 | { |
4310 | 0 | unsigned long long processing_cycles; |
4311 | 0 | atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles); |
4312 | 0 | return processing_cycles; |
4313 | 0 | } |
4314 | | |
4315 | | #if ATOMIC_ALWAYS_LOCK_FREE_8B |
4316 | | static inline bool |
4317 | | pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd) |
4318 | 0 | { |
4319 | 0 | bool pmd_perf_enabled; |
4320 | 0 | atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled); |
4321 | 0 | return pmd_perf_enabled; |
4322 | 0 | } |
4323 | | #else |
4324 | | /* If stores and reads of 64-bit integers are not atomic, the full PMD |
4325 | | * performance metrics are not available as locked access to 64 bit |
4326 | | * integers would be prohibitively expensive. */ |
4327 | | static inline bool |
4328 | | pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED) |
4329 | | { |
4330 | | return false; |
4331 | | } |
4332 | | #endif |
4333 | | |
4334 | | static int |
4335 | | dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd, |
4336 | | struct tx_port *p) |
4337 | 0 | { |
4338 | 0 | int i; |
4339 | 0 | int tx_qid; |
4340 | 0 | int output_cnt; |
4341 | 0 | bool concurrent_txqs; |
4342 | 0 | struct cycle_timer timer; |
4343 | 0 | uint64_t cycles; |
4344 | 0 | uint32_t tx_flush_interval; |
4345 | |
|
4346 | 0 | cycle_timer_start(&pmd->perf_stats, &timer); |
4347 | |
|
4348 | 0 | output_cnt = dp_packet_batch_size(&p->output_pkts); |
4349 | 0 | ovs_assert(output_cnt > 0); |
4350 | |
|
4351 | 0 | if (p->port->txq_mode == TXQ_MODE_XPS_HASH) { |
4352 | 0 | int n_txq = netdev_n_txq(p->port->netdev); |
4353 | | |
4354 | | /* Re-batch per txq based on packet hash. */ |
4355 | 0 | struct dp_packet *packet; |
4356 | 0 | DP_PACKET_BATCH_FOR_EACH (j, packet, &p->output_pkts) { |
4357 | 0 | uint32_t hash; |
4358 | |
|
4359 | 0 | if (OVS_LIKELY(dp_packet_rss_valid(packet))) { |
4360 | 0 | hash = dp_packet_get_rss_hash(packet); |
4361 | 0 | } else { |
4362 | 0 | struct flow flow; |
4363 | |
|
4364 | 0 | flow_extract(packet, &flow); |
4365 | 0 | hash = flow_hash_5tuple(&flow, 0); |
4366 | 0 | } |
4367 | 0 | dp_packet_batch_add(&p->txq_pkts[hash % n_txq], packet); |
4368 | 0 | } |
4369 | | |
4370 | | /* Flush batches of each Tx queues. */ |
4371 | 0 | for (i = 0; i < n_txq; i++) { |
4372 | 0 | if (dp_packet_batch_is_empty(&p->txq_pkts[i])) { |
4373 | 0 | continue; |
4374 | 0 | } |
4375 | 0 | netdev_send(p->port->netdev, i, &p->txq_pkts[i], true); |
4376 | 0 | dp_packet_batch_init(&p->txq_pkts[i]); |
4377 | 0 | } |
4378 | 0 | } else { |
4379 | 0 | if (p->port->txq_mode == TXQ_MODE_XPS) { |
4380 | 0 | tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p); |
4381 | 0 | concurrent_txqs = true; |
4382 | 0 | } else { |
4383 | 0 | tx_qid = pmd->static_tx_qid; |
4384 | 0 | concurrent_txqs = false; |
4385 | 0 | } |
4386 | 0 | netdev_send(p->port->netdev, tx_qid, &p->output_pkts, concurrent_txqs); |
4387 | 0 | } |
4388 | 0 | dp_packet_batch_init(&p->output_pkts); |
4389 | | |
4390 | | /* Update time of the next flush. */ |
4391 | 0 | atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval); |
4392 | 0 | p->flush_time = pmd->ctx.now + tx_flush_interval; |
4393 | |
|
4394 | 0 | ovs_assert(pmd->n_output_batches > 0); |
4395 | 0 | pmd->n_output_batches--; |
4396 | |
|
4397 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt); |
4398 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1); |
4399 | | |
4400 | | /* Distribute send cycles evenly among transmitted packets and assign to |
4401 | | * their respective rx queues. */ |
4402 | 0 | cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt; |
4403 | 0 | for (i = 0; i < output_cnt; i++) { |
4404 | 0 | if (p->output_pkts_rxqs[i]) { |
4405 | 0 | dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i], |
4406 | 0 | RXQ_CYCLES_PROC_CURR, cycles); |
4407 | 0 | } |
4408 | 0 | } |
4409 | |
|
4410 | 0 | return output_cnt; |
4411 | 0 | } |
4412 | | |
4413 | | static int |
4414 | | dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd, |
4415 | | bool force) |
4416 | 0 | { |
4417 | 0 | struct tx_port *p; |
4418 | 0 | int output_cnt = 0; |
4419 | |
|
4420 | 0 | if (!pmd->n_output_batches) { |
4421 | 0 | return 0; |
4422 | 0 | } |
4423 | | |
4424 | 0 | HMAP_FOR_EACH (p, node, &pmd->send_port_cache) { |
4425 | 0 | if (!dp_packet_batch_is_empty(&p->output_pkts) |
4426 | 0 | && (force || pmd->ctx.now >= p->flush_time)) { |
4427 | 0 | output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p); |
4428 | 0 | } |
4429 | 0 | } |
4430 | 0 | return output_cnt; |
4431 | 0 | } |
4432 | | |
4433 | | static int |
4434 | | dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd, |
4435 | | struct dp_netdev_rxq *rxq, |
4436 | | odp_port_t port_no) |
4437 | 0 | { |
4438 | 0 | struct pmd_perf_stats *s = &pmd->perf_stats; |
4439 | 0 | struct dp_packet_batch batch; |
4440 | 0 | struct cycle_timer timer; |
4441 | 0 | int error; |
4442 | 0 | int batch_cnt = 0; |
4443 | 0 | int rem_qlen = 0, *qlen_p = NULL; |
4444 | 0 | uint64_t cycles; |
4445 | | |
4446 | | /* Measure duration for polling and processing rx burst. */ |
4447 | 0 | cycle_timer_start(&pmd->perf_stats, &timer); |
4448 | |
|
4449 | 0 | pmd->ctx.last_rxq = rxq; |
4450 | 0 | dp_packet_batch_init(&batch); |
4451 | | |
4452 | | /* Fetch the rx queue length only for vhostuser ports. */ |
4453 | 0 | if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) { |
4454 | 0 | qlen_p = &rem_qlen; |
4455 | 0 | } |
4456 | |
|
4457 | 0 | error = netdev_rxq_recv(rxq->rx, &batch, qlen_p); |
4458 | 0 | if (!error) { |
4459 | | /* At least one packet received. */ |
4460 | 0 | *recirc_depth_get() = 0; |
4461 | 0 | pmd_thread_ctx_time_update(pmd); |
4462 | 0 | batch_cnt = dp_packet_batch_size(&batch); |
4463 | 0 | if (pmd_perf_metrics_enabled(pmd)) { |
4464 | | /* Update batch histogram. */ |
4465 | 0 | s->current.batches++; |
4466 | 0 | histogram_add_sample(&s->pkts_per_batch, batch_cnt); |
4467 | | /* Update the maximum vhost rx queue fill level. */ |
4468 | 0 | if (rxq->is_vhost && rem_qlen >= 0) { |
4469 | 0 | uint32_t qfill = batch_cnt + rem_qlen; |
4470 | 0 | if (qfill > s->current.max_vhost_qfill) { |
4471 | 0 | s->current.max_vhost_qfill = qfill; |
4472 | 0 | } |
4473 | 0 | } |
4474 | 0 | } |
4475 | | |
4476 | | /* Process packet batch. */ |
4477 | 0 | dp_netdev_input(pmd, &batch, port_no); |
4478 | | |
4479 | | /* Assign processing cycles to rx queue. */ |
4480 | 0 | cycles = cycle_timer_stop(&pmd->perf_stats, &timer); |
4481 | 0 | dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles); |
4482 | |
|
4483 | 0 | dp_netdev_pmd_flush_output_packets(pmd, false); |
4484 | 0 | } else { |
4485 | | /* Discard cycles. */ |
4486 | 0 | cycle_timer_stop(&pmd->perf_stats, &timer); |
4487 | 0 | if (error != EAGAIN && error != EOPNOTSUPP) { |
4488 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
4489 | |
|
4490 | 0 | VLOG_ERR_RL(&rl, "error receiving data from %s: %s", |
4491 | 0 | netdev_rxq_get_name(rxq->rx), ovs_strerror(error)); |
4492 | 0 | } |
4493 | 0 | } |
4494 | |
|
4495 | 0 | pmd->ctx.last_rxq = NULL; |
4496 | |
|
4497 | 0 | return batch_cnt; |
4498 | 0 | } |
4499 | | |
4500 | | static struct tx_port * |
4501 | | tx_port_lookup(const struct hmap *hmap, odp_port_t port_no) |
4502 | 0 | { |
4503 | 0 | struct tx_port *tx; |
4504 | |
|
4505 | 0 | HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) { |
4506 | 0 | if (tx->port->port_no == port_no) { |
4507 | 0 | return tx; |
4508 | 0 | } |
4509 | 0 | } |
4510 | | |
4511 | 0 | return NULL; |
4512 | 0 | } |
4513 | | |
4514 | | static struct tx_bond * |
4515 | | tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id) |
4516 | 0 | { |
4517 | 0 | uint32_t hash = hash_bond_id(bond_id); |
4518 | 0 | struct tx_bond *tx; |
4519 | |
|
4520 | 0 | CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) { |
4521 | 0 | if (tx->bond_id == bond_id) { |
4522 | 0 | return tx; |
4523 | 0 | } |
4524 | 0 | } |
4525 | 0 | return NULL; |
4526 | 0 | } |
4527 | | |
4528 | | static int |
4529 | | port_reconfigure(struct dp_netdev_port *port) |
4530 | 0 | { |
4531 | 0 | struct netdev *netdev = port->netdev; |
4532 | 0 | int i, err; |
4533 | | |
4534 | | /* Closes the existing 'rxq's. */ |
4535 | 0 | for (i = 0; i < port->n_rxq; i++) { |
4536 | 0 | netdev_rxq_close(port->rxqs[i].rx); |
4537 | 0 | port->rxqs[i].rx = NULL; |
4538 | 0 | } |
4539 | 0 | unsigned last_nrxq = port->n_rxq; |
4540 | 0 | port->n_rxq = 0; |
4541 | | |
4542 | | /* Allows 'netdev' to apply the pending configuration changes. */ |
4543 | 0 | if (netdev_is_reconf_required(netdev) || port->need_reconfigure) { |
4544 | 0 | err = netdev_reconfigure(netdev); |
4545 | 0 | if (err && (err != EOPNOTSUPP)) { |
4546 | 0 | VLOG_ERR("Failed to set interface %s new configuration", |
4547 | 0 | netdev_get_name(netdev)); |
4548 | 0 | return err; |
4549 | 0 | } |
4550 | 0 | } |
4551 | | /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */ |
4552 | 0 | port->rxqs = xrealloc(port->rxqs, |
4553 | 0 | sizeof *port->rxqs * netdev_n_rxq(netdev)); |
4554 | | /* Realloc 'used' counters for tx queues. */ |
4555 | 0 | free(port->txq_used); |
4556 | 0 | port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used); |
4557 | |
|
4558 | 0 | for (i = 0; i < netdev_n_rxq(netdev); i++) { |
4559 | 0 | bool new_queue = i >= last_nrxq; |
4560 | 0 | if (new_queue) { |
4561 | 0 | memset(&port->rxqs[i], 0, sizeof port->rxqs[i]); |
4562 | 0 | } |
4563 | |
|
4564 | 0 | port->rxqs[i].port = port; |
4565 | 0 | port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9); |
4566 | |
|
4567 | 0 | err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i); |
4568 | 0 | if (err) { |
4569 | 0 | return err; |
4570 | 0 | } |
4571 | 0 | port->n_rxq++; |
4572 | 0 | } |
4573 | | |
4574 | | /* Parse affinity list to apply configuration for new queues. */ |
4575 | 0 | dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list); |
4576 | | |
4577 | | /* If reconfiguration was successful mark it as such, so we can use it */ |
4578 | 0 | port->need_reconfigure = false; |
4579 | |
|
4580 | 0 | return 0; |
4581 | 0 | } |
4582 | | |
4583 | | struct sched_numa_list { |
4584 | | struct hmap numas; /* Contains 'struct sched_numa'. */ |
4585 | | }; |
4586 | | |
/* Per-PMD bookkeeping for out-of-place pmd rxq assignments. */
struct sched_pmd {
    struct sched_numa *numa;            /* NUMA node entry owning this pmd. */
    struct dp_netdev_pmd_thread *pmd;   /* Associated PMD thread. */
    uint64_t pmd_proc_cycles;           /* Sum of cycles of assigned rxqs. */
    struct dp_netdev_rxq **rxqs;        /* Array of assigned rxqs. */
    unsigned int n_rxq;                 /* Number of entries in 'rxqs'. */
    bool isolated;                      /* Excluded from automatic rxq
                                         * assignment when true. */
};
4597 | | |
4598 | | struct sched_numa { |
4599 | | struct hmap_node node; |
4600 | | int numa_id; |
4601 | | /* PMDs on numa node. */ |
4602 | | struct sched_pmd *pmds; |
4603 | | /* Num of PMDs on numa node. */ |
4604 | | unsigned n_pmds; |
4605 | | /* Num of isolated PMDs on numa node. */ |
4606 | | unsigned n_isolated; |
4607 | | int rr_cur_index; |
4608 | | bool rr_idx_inc; |
4609 | | }; |
4610 | | |
4611 | | static size_t |
4612 | | sched_numa_list_count(struct sched_numa_list *numa_list) |
4613 | 0 | { |
4614 | 0 | return hmap_count(&numa_list->numas); |
4615 | 0 | } |
4616 | | |
4617 | | static struct sched_numa * |
4618 | | sched_numa_list_next(struct sched_numa_list *numa_list, |
4619 | | const struct sched_numa *numa) |
4620 | 0 | { |
4621 | 0 | struct hmap_node *node = NULL; |
4622 | |
|
4623 | 0 | if (numa) { |
4624 | 0 | node = hmap_next(&numa_list->numas, &numa->node); |
4625 | 0 | } |
4626 | 0 | if (!node) { |
4627 | 0 | node = hmap_first(&numa_list->numas); |
4628 | 0 | } |
4629 | |
|
4630 | 0 | return (node) ? CONTAINER_OF(node, struct sched_numa, node) : NULL; |
4631 | 0 | } |
4632 | | |
4633 | | static struct sched_numa * |
4634 | | sched_numa_list_lookup(struct sched_numa_list *numa_list, int numa_id) |
4635 | 0 | { |
4636 | 0 | struct sched_numa *numa; |
4637 | |
|
4638 | 0 | HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), |
4639 | 0 | &numa_list->numas) { |
4640 | 0 | if (numa->numa_id == numa_id) { |
4641 | 0 | return numa; |
4642 | 0 | } |
4643 | 0 | } |
4644 | 0 | return NULL; |
4645 | 0 | } |
4646 | | |
4647 | | static int |
4648 | | compare_sched_pmd_list(const void *a_, const void *b_) |
4649 | 0 | { |
4650 | 0 | struct sched_pmd *a, *b; |
4651 | |
|
4652 | 0 | a = (struct sched_pmd *) a_; |
4653 | 0 | b = (struct sched_pmd *) b_; |
4654 | |
|
4655 | 0 | return compare_poll_thread_list(&a->pmd, &b->pmd); |
4656 | 0 | } |
4657 | | |
4658 | | static void |
4659 | | sort_numa_list_pmds(struct sched_numa_list *numa_list) |
4660 | 0 | { |
4661 | 0 | struct sched_numa *numa; |
4662 | |
|
4663 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
4664 | 0 | if (numa->n_pmds > 1) { |
4665 | 0 | qsort(numa->pmds, numa->n_pmds, sizeof *numa->pmds, |
4666 | 0 | compare_sched_pmd_list); |
4667 | 0 | } |
4668 | 0 | } |
4669 | 0 | } |
4670 | | |
4671 | | /* Populate numas and pmds on those numas. */ |
4672 | | static void |
4673 | | sched_numa_list_populate(struct sched_numa_list *numa_list, |
4674 | | struct dp_netdev *dp) |
4675 | 0 | { |
4676 | 0 | struct dp_netdev_pmd_thread *pmd; |
4677 | |
|
4678 | 0 | hmap_init(&numa_list->numas); |
4679 | | |
4680 | | /* For each pmd on this datapath. */ |
4681 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
4682 | 0 | struct sched_numa *numa; |
4683 | 0 | struct sched_pmd *sched_pmd; |
4684 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
4685 | 0 | continue; |
4686 | 0 | } |
4687 | | |
4688 | | /* Get the numa of the PMD. */ |
4689 | 0 | numa = sched_numa_list_lookup(numa_list, pmd->numa_id); |
4690 | | /* Create a new numa node for it if not already created. */ |
4691 | 0 | if (!numa) { |
4692 | 0 | numa = xzalloc(sizeof *numa); |
4693 | 0 | numa->numa_id = pmd->numa_id; |
4694 | 0 | hmap_insert(&numa_list->numas, &numa->node, |
4695 | 0 | hash_int(pmd->numa_id, 0)); |
4696 | 0 | } |
4697 | | |
4698 | | /* Create a sched_pmd on this numa for the pmd. */ |
4699 | 0 | numa->n_pmds++; |
4700 | 0 | numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds); |
4701 | 0 | sched_pmd = &numa->pmds[numa->n_pmds - 1]; |
4702 | 0 | memset(sched_pmd, 0, sizeof *sched_pmd); |
4703 | 0 | sched_pmd->numa = numa; |
4704 | 0 | sched_pmd->pmd = pmd; |
4705 | | /* At least one pmd is present so initialize curr_idx and idx_inc. */ |
4706 | 0 | numa->rr_cur_index = 0; |
4707 | 0 | numa->rr_idx_inc = true; |
4708 | 0 | } |
4709 | 0 | sort_numa_list_pmds(numa_list); |
4710 | 0 | } |
4711 | | |
4712 | | static void |
4713 | | sched_numa_list_free_entries(struct sched_numa_list *numa_list) |
4714 | 0 | { |
4715 | 0 | struct sched_numa *numa; |
4716 | |
|
4717 | 0 | HMAP_FOR_EACH_POP (numa, node, &numa_list->numas) { |
4718 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
4719 | 0 | struct sched_pmd *sched_pmd; |
4720 | |
|
4721 | 0 | sched_pmd = &numa->pmds[i]; |
4722 | 0 | sched_pmd->n_rxq = 0; |
4723 | 0 | free(sched_pmd->rxqs); |
4724 | 0 | } |
4725 | 0 | numa->n_pmds = 0; |
4726 | 0 | free(numa->pmds); |
4727 | 0 | free(numa); |
4728 | 0 | } |
4729 | 0 | hmap_destroy(&numa_list->numas); |
4730 | 0 | } |
4731 | | |
4732 | | static struct sched_pmd * |
4733 | | sched_pmd_find_by_pmd(struct sched_numa_list *numa_list, |
4734 | | struct dp_netdev_pmd_thread *pmd) |
4735 | 0 | { |
4736 | 0 | struct sched_numa *numa; |
4737 | |
|
4738 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
4739 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
4740 | 0 | struct sched_pmd *sched_pmd; |
4741 | |
|
4742 | 0 | sched_pmd = &numa->pmds[i]; |
4743 | 0 | if (pmd == sched_pmd->pmd) { |
4744 | 0 | return sched_pmd; |
4745 | 0 | } |
4746 | 0 | } |
4747 | 0 | } |
4748 | 0 | return NULL; |
4749 | 0 | } |
4750 | | |
4751 | | static void |
4752 | | sched_pmd_add_rxq(struct sched_pmd *sched_pmd, struct dp_netdev_rxq *rxq, |
4753 | | uint64_t cycles) |
4754 | 0 | { |
4755 | | /* As sched_pmd is allocated outside this fn. better to not assume |
4756 | | * rxqs is initialized to NULL. */ |
4757 | 0 | if (sched_pmd->n_rxq == 0) { |
4758 | 0 | sched_pmd->rxqs = xmalloc(sizeof *sched_pmd->rxqs); |
4759 | 0 | } else { |
4760 | 0 | sched_pmd->rxqs = xrealloc(sched_pmd->rxqs, (sched_pmd->n_rxq + 1) * |
4761 | 0 | sizeof *sched_pmd->rxqs); |
4762 | 0 | } |
4763 | |
|
4764 | 0 | sched_pmd->rxqs[sched_pmd->n_rxq++] = rxq; |
4765 | 0 | sched_pmd->pmd_proc_cycles += cycles; |
4766 | 0 | } |
4767 | | |
4768 | | static void |
4769 | | sched_numa_list_assignments(struct sched_numa_list *numa_list, |
4770 | | struct dp_netdev *dp) |
4771 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
4772 | 0 | { |
4773 | 0 | struct dp_netdev_port *port; |
4774 | | |
4775 | | /* For each port. */ |
4776 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
4777 | 0 | if (!netdev_is_pmd(port->netdev)) { |
4778 | 0 | continue; |
4779 | 0 | } |
4780 | | /* For each rxq on the port. */ |
4781 | 0 | for (unsigned qid = 0; qid < port->n_rxq; qid++) { |
4782 | 0 | struct dp_netdev_rxq *rxq = &port->rxqs[qid]; |
4783 | 0 | struct sched_pmd *sched_pmd; |
4784 | 0 | uint64_t proc_cycles = 0; |
4785 | |
|
4786 | 0 | for (int i = 0; i < PMD_INTERVAL_MAX; i++) { |
4787 | 0 | proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, i); |
4788 | 0 | } |
4789 | |
|
4790 | 0 | sched_pmd = sched_pmd_find_by_pmd(numa_list, rxq->pmd); |
4791 | 0 | if (sched_pmd) { |
4792 | 0 | if (rxq->core_id != OVS_CORE_UNSPEC && dp->pmd_iso) { |
4793 | 0 | sched_pmd->isolated = true; |
4794 | 0 | } |
4795 | 0 | sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); |
4796 | 0 | } |
4797 | 0 | } |
4798 | 0 | } |
4799 | 0 | } |
4800 | | |
4801 | | static void |
4802 | | sched_numa_list_put_in_place(struct sched_numa_list *numa_list) |
4803 | 0 | { |
4804 | 0 | struct sched_numa *numa; |
4805 | | |
4806 | | /* For each numa. */ |
4807 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
4808 | | /* For each pmd. */ |
4809 | 0 | for (int i = 0; i < numa->n_pmds; i++) { |
4810 | 0 | struct sched_pmd *sched_pmd; |
4811 | |
|
4812 | 0 | sched_pmd = &numa->pmds[i]; |
4813 | 0 | sched_pmd->pmd->isolated = sched_pmd->isolated; |
4814 | | /* For each rxq. */ |
4815 | 0 | for (unsigned k = 0; k < sched_pmd->n_rxq; k++) { |
4816 | | /* Store the new pmd from the out of place sched_numa_list |
4817 | | * struct to the dp_netdev_rxq struct */ |
4818 | 0 | sched_pmd->rxqs[k]->pmd = sched_pmd->pmd; |
4819 | 0 | } |
4820 | 0 | } |
4821 | 0 | } |
4822 | 0 | } |
4823 | | |
4824 | | /* Returns 'true' if OVS rxq scheduling algorithm assigned any unpinned rxq to |
4825 | | * a PMD thread core on a non-local numa node. */ |
4826 | | static bool |
4827 | | sched_numa_list_cross_numa_polling(struct sched_numa_list *numa_list) |
4828 | 0 | { |
4829 | 0 | struct sched_numa *numa; |
4830 | |
|
4831 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
4832 | 0 | for (int i = 0; i < numa->n_pmds; i++) { |
4833 | 0 | struct sched_pmd *sched_pmd; |
4834 | |
|
4835 | 0 | sched_pmd = &numa->pmds[i]; |
4836 | 0 | if (sched_pmd->isolated) { |
4837 | | /* All rxqs on this PMD thread core are pinned. */ |
4838 | 0 | continue; |
4839 | 0 | } |
4840 | 0 | for (unsigned k = 0; k < sched_pmd->n_rxq; k++) { |
4841 | 0 | struct dp_netdev_rxq *rxq = sched_pmd->rxqs[k]; |
4842 | | /* Check if the rxq is not pinned to a specific PMD thread core |
4843 | | * by the user AND the PMD thread core that OVS assigned is |
4844 | | * non-local to the rxq port. */ |
4845 | 0 | if (rxq->core_id == OVS_CORE_UNSPEC && |
4846 | 0 | rxq->pmd->numa_id != |
4847 | 0 | netdev_get_numa_id(rxq->port->netdev)) { |
4848 | 0 | return true; |
4849 | 0 | } |
4850 | 0 | } |
4851 | 0 | } |
4852 | 0 | } |
4853 | 0 | return false; |
4854 | 0 | } |
4855 | | |
4856 | | static unsigned |
4857 | | sched_numa_noniso_pmd_count(struct sched_numa *numa) |
4858 | 0 | { |
4859 | 0 | if (numa->n_pmds > numa->n_isolated) { |
4860 | 0 | return numa->n_pmds - numa->n_isolated; |
4861 | 0 | } |
4862 | 0 | return 0; |
4863 | 0 | } |
4864 | | |
4865 | | /* Sort Rx Queues by the processing cycles they are consuming. */ |
4866 | | static int |
4867 | | compare_rxq_cycles(const void *a, const void *b) |
4868 | 0 | { |
4869 | 0 | struct dp_netdev_rxq *qa; |
4870 | 0 | struct dp_netdev_rxq *qb; |
4871 | 0 | uint64_t cycles_qa, cycles_qb; |
4872 | |
|
4873 | 0 | qa = *(struct dp_netdev_rxq **) a; |
4874 | 0 | qb = *(struct dp_netdev_rxq **) b; |
4875 | |
|
4876 | 0 | cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST); |
4877 | 0 | cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST); |
4878 | |
|
4879 | 0 | if (cycles_qa != cycles_qb) { |
4880 | 0 | return (cycles_qa < cycles_qb) ? 1 : -1; |
4881 | 0 | } else { |
4882 | | /* Cycles are the same so tiebreak on port/queue id. |
4883 | | * Tiebreaking (as opposed to return 0) ensures consistent |
4884 | | * sort results across multiple OS's. */ |
4885 | 0 | uint32_t port_qa = odp_to_u32(qa->port->port_no); |
4886 | 0 | uint32_t port_qb = odp_to_u32(qb->port->port_no); |
4887 | 0 | if (port_qa != port_qb) { |
4888 | 0 | return port_qa > port_qb ? 1 : -1; |
4889 | 0 | } else { |
4890 | 0 | return netdev_rxq_get_queue_id(qa->rx) |
4891 | 0 | - netdev_rxq_get_queue_id(qb->rx); |
4892 | 0 | } |
4893 | 0 | } |
4894 | 0 | } |
4895 | | |
4896 | | static bool |
4897 | | sched_pmd_new_lowest(struct sched_pmd *current_lowest, struct sched_pmd *pmd, |
4898 | | bool has_proc) |
4899 | 0 | { |
4900 | 0 | uint64_t current_num, pmd_num; |
4901 | |
|
4902 | 0 | if (current_lowest == NULL) { |
4903 | 0 | return true; |
4904 | 0 | } |
4905 | | |
4906 | 0 | if (has_proc) { |
4907 | 0 | current_num = current_lowest->pmd_proc_cycles; |
4908 | 0 | pmd_num = pmd->pmd_proc_cycles; |
4909 | 0 | } else { |
4910 | 0 | current_num = current_lowest->n_rxq; |
4911 | 0 | pmd_num = pmd->n_rxq; |
4912 | 0 | } |
4913 | |
|
4914 | 0 | if (pmd_num < current_num) { |
4915 | 0 | return true; |
4916 | 0 | } |
4917 | 0 | return false; |
4918 | 0 | } |
4919 | | |
4920 | | static struct sched_pmd * |
4921 | | sched_pmd_get_lowest(struct sched_numa *numa, bool has_cyc) |
4922 | 0 | { |
4923 | 0 | struct sched_pmd *lowest_sched_pmd = NULL; |
4924 | |
|
4925 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
4926 | 0 | struct sched_pmd *sched_pmd; |
4927 | |
|
4928 | 0 | sched_pmd = &numa->pmds[i]; |
4929 | 0 | if (sched_pmd->isolated) { |
4930 | 0 | continue; |
4931 | 0 | } |
4932 | 0 | if (sched_pmd_new_lowest(lowest_sched_pmd, sched_pmd, has_cyc)) { |
4933 | 0 | lowest_sched_pmd = sched_pmd; |
4934 | 0 | } |
4935 | 0 | } |
4936 | 0 | return lowest_sched_pmd; |
4937 | 0 | } |
4938 | | |
4939 | | /* |
4940 | | * Returns the next pmd from the numa node. |
4941 | | * |
4942 | | * If 'updown' is 'true' it will alternate between selecting the next pmd in |
4943 | | * either an up or down walk, switching between up/down when the first or last |
4944 | | * core is reached. e.g. 1,2,3,3,2,1,1,2... |
4945 | | * |
4946 | | * If 'updown' is 'false' it will select the next pmd wrapping around when |
4947 | | * last core reached. e.g. 1,2,3,1,2,3,1,2... |
4948 | | */ |
4949 | | static struct sched_pmd * |
4950 | | sched_pmd_next_rr(struct sched_numa *numa, bool updown) |
4951 | 0 | { |
4952 | 0 | int numa_idx = numa->rr_cur_index; |
4953 | |
|
4954 | 0 | if (numa->rr_idx_inc == true) { |
4955 | | /* Incrementing through list of pmds. */ |
4956 | 0 | if (numa->rr_cur_index == numa->n_pmds - 1) { |
4957 | | /* Reached the last pmd. */ |
4958 | 0 | if (updown) { |
4959 | 0 | numa->rr_idx_inc = false; |
4960 | 0 | } else { |
4961 | 0 | numa->rr_cur_index = 0; |
4962 | 0 | } |
4963 | 0 | } else { |
4964 | 0 | numa->rr_cur_index++; |
4965 | 0 | } |
4966 | 0 | } else { |
4967 | | /* Decrementing through list of pmds. */ |
4968 | 0 | if (numa->rr_cur_index == 0) { |
4969 | | /* Reached the first pmd. */ |
4970 | 0 | numa->rr_idx_inc = true; |
4971 | 0 | } else { |
4972 | 0 | numa->rr_cur_index--; |
4973 | 0 | } |
4974 | 0 | } |
4975 | 0 | return &numa->pmds[numa_idx]; |
4976 | 0 | } |
4977 | | |
4978 | | static struct sched_pmd * |
4979 | | sched_pmd_next_noniso_rr(struct sched_numa *numa, bool updown) |
4980 | 0 | { |
4981 | 0 | struct sched_pmd *sched_pmd = NULL; |
4982 | | |
4983 | | /* sched_pmd_next_rr() may return duplicate PMDs before all PMDs have been |
4984 | | * returned depending on updown. Call it more than n_pmds to ensure all |
4985 | | * PMDs can be searched for the next non-isolated PMD. */ |
4986 | 0 | for (unsigned i = 0; i < numa->n_pmds * 2; i++) { |
4987 | 0 | sched_pmd = sched_pmd_next_rr(numa, updown); |
4988 | 0 | if (!sched_pmd->isolated) { |
4989 | 0 | break; |
4990 | 0 | } |
4991 | 0 | sched_pmd = NULL; |
4992 | 0 | } |
4993 | 0 | return sched_pmd; |
4994 | 0 | } |
4995 | | |
4996 | | static struct sched_pmd * |
4997 | | sched_pmd_next(struct sched_numa *numa, enum sched_assignment_type algo, |
4998 | | bool has_proc) |
4999 | 0 | { |
5000 | 0 | if (algo == SCHED_GROUP) { |
5001 | 0 | return sched_pmd_get_lowest(numa, has_proc); |
5002 | 0 | } |
5003 | | |
5004 | | /* By default RR the PMDs. */ |
5005 | 0 | return sched_pmd_next_noniso_rr(numa, algo == SCHED_CYCLES ? true : false); |
5006 | 0 | } |
5007 | | |
5008 | | static const char * |
5009 | | get_assignment_type_string(enum sched_assignment_type algo) |
5010 | 0 | { |
5011 | 0 | switch (algo) { |
5012 | 0 | case SCHED_ROUNDROBIN: return "roundrobin"; |
5013 | 0 | case SCHED_CYCLES: return "cycles"; |
5014 | 0 | case SCHED_GROUP: return "group"; |
5015 | 0 | default: return "Unknown"; |
5016 | 0 | } |
5017 | 0 | } |
5018 | | |
5019 | 0 | #define MAX_RXQ_CYC_TEXT 40 |
5020 | 0 | #define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + MAX_RXQ_CYC_TEXT) |
5021 | | |
5022 | | static char * |
5023 | | get_rxq_cyc_log(char *a, enum sched_assignment_type algo, uint64_t cycles) |
5024 | 0 | { |
5025 | 0 | int ret = 0; |
5026 | |
|
5027 | 0 | if (algo != SCHED_ROUNDROBIN) { |
5028 | 0 | ret = snprintf(a, MAX_RXQ_CYC_STRLEN, |
5029 | 0 | " (measured processing cycles %"PRIu64")", cycles); |
5030 | 0 | } |
5031 | |
|
5032 | 0 | if (algo == SCHED_ROUNDROBIN || ret <= 0) { |
5033 | 0 | a[0] = '\0'; |
5034 | 0 | } |
5035 | 0 | return a; |
5036 | 0 | } |
5037 | | |
5038 | | static void |
5039 | | sched_numa_list_schedule(struct sched_numa_list *numa_list, |
5040 | | struct dp_netdev *dp, |
5041 | | enum sched_assignment_type algo, |
5042 | | enum vlog_level level) |
5043 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5044 | 0 | { |
5045 | 0 | struct dp_netdev_port *port; |
5046 | 0 | struct dp_netdev_rxq **rxqs = NULL; |
5047 | 0 | struct sched_numa *last_cross_numa; |
5048 | 0 | unsigned n_rxqs = 0; |
5049 | 0 | bool start_logged = false; |
5050 | 0 | size_t n_numa; |
5051 | | |
5052 | | /* For each port. */ |
5053 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5054 | 0 | if (!netdev_is_pmd(port->netdev)) { |
5055 | 0 | continue; |
5056 | 0 | } |
5057 | | |
5058 | | /* For each rxq on the port. */ |
5059 | 0 | for (int qid = 0; qid < port->n_rxq; qid++) { |
5060 | 0 | struct dp_netdev_rxq *rxq = &port->rxqs[qid]; |
5061 | |
|
5062 | 0 | if (algo != SCHED_ROUNDROBIN) { |
5063 | 0 | uint64_t cycle_hist = 0; |
5064 | | |
5065 | | /* Sum the queue intervals and store the cycle history. */ |
5066 | 0 | for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) { |
5067 | 0 | cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i); |
5068 | 0 | } |
5069 | 0 | dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST, |
5070 | 0 | cycle_hist); |
5071 | 0 | } |
5072 | | |
5073 | | /* Check if this rxq is pinned. */ |
5074 | 0 | if (rxq->core_id != OVS_CORE_UNSPEC) { |
5075 | 0 | struct sched_pmd *sched_pmd; |
5076 | 0 | struct dp_netdev_pmd_thread *pmd; |
5077 | 0 | struct sched_numa *numa; |
5078 | 0 | bool iso = dp->pmd_iso; |
5079 | 0 | uint64_t proc_cycles; |
5080 | 0 | char rxq_cyc_log[MAX_RXQ_CYC_STRLEN]; |
5081 | | |
5082 | | /* This rxq should be pinned, pin it now. */ |
5083 | 0 | pmd = dp_netdev_get_pmd(dp, rxq->core_id); |
5084 | 0 | sched_pmd = sched_pmd_find_by_pmd(numa_list, pmd); |
5085 | 0 | dp_netdev_pmd_unref(pmd); |
5086 | 0 | if (!sched_pmd) { |
5087 | | /* Cannot find the PMD. Cannot pin this rxq. */ |
5088 | 0 | VLOG(level == VLL_DBG ? VLL_DBG : VLL_WARN, |
5089 | 0 | "Core %2u cannot be pinned with " |
5090 | 0 | "port \'%s\' rx queue %d. Use pmd-cpu-mask to " |
5091 | 0 | "enable a pmd on core %u. An alternative core " |
5092 | 0 | "will be assigned.", |
5093 | 0 | rxq->core_id, |
5094 | 0 | netdev_rxq_get_name(rxq->rx), |
5095 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
5096 | 0 | rxq->core_id); |
5097 | 0 | rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs); |
5098 | 0 | rxqs[n_rxqs++] = rxq; |
5099 | 0 | continue; |
5100 | 0 | } |
5101 | 0 | if (iso) { |
5102 | | /* Mark PMD as isolated if not done already. */ |
5103 | 0 | if (sched_pmd->isolated == false) { |
5104 | 0 | sched_pmd->isolated = true; |
5105 | 0 | numa = sched_pmd->numa; |
5106 | 0 | numa->n_isolated++; |
5107 | 0 | } |
5108 | 0 | } |
5109 | 0 | proc_cycles = dp_netdev_rxq_get_cycles(rxq, |
5110 | 0 | RXQ_CYCLES_PROC_HIST); |
5111 | 0 | VLOG(level, "Core %2u on numa node %d is pinned with " |
5112 | 0 | "port \'%s\' rx queue %d%s", |
5113 | 0 | sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id, |
5114 | 0 | netdev_rxq_get_name(rxq->rx), |
5115 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
5116 | 0 | get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); |
5117 | 0 | sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); |
5118 | 0 | } else { |
5119 | 0 | rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs); |
5120 | 0 | rxqs[n_rxqs++] = rxq; |
5121 | 0 | } |
5122 | 0 | } |
5123 | 0 | } |
5124 | |
|
5125 | 0 | if (n_rxqs > 1 && algo != SCHED_ROUNDROBIN) { |
5126 | | /* Sort the queues in order of the processing cycles |
5127 | | * they consumed during their last pmd interval. */ |
5128 | 0 | qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles); |
5129 | 0 | } |
5130 | |
|
5131 | 0 | last_cross_numa = NULL; |
5132 | 0 | n_numa = sched_numa_list_count(numa_list); |
5133 | 0 | for (unsigned i = 0; i < n_rxqs; i++) { |
5134 | 0 | struct dp_netdev_rxq *rxq = rxqs[i]; |
5135 | 0 | struct sched_pmd *sched_pmd = NULL; |
5136 | 0 | struct sched_numa *numa; |
5137 | 0 | int port_numa_id; |
5138 | 0 | uint64_t proc_cycles; |
5139 | 0 | char rxq_cyc_log[MAX_RXQ_CYC_STRLEN]; |
5140 | |
|
5141 | 0 | if (start_logged == false && level != VLL_DBG) { |
5142 | 0 | VLOG(level, "Performing pmd to rx queue assignment using %s " |
5143 | 0 | "algorithm.", get_assignment_type_string(algo)); |
5144 | 0 | start_logged = true; |
5145 | 0 | } |
5146 | | |
5147 | | /* Store the cycles for this rxq as we will log these later. */ |
5148 | 0 | proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST); |
5149 | |
|
5150 | 0 | port_numa_id = netdev_get_numa_id(rxq->port->netdev); |
5151 | | |
5152 | | /* Select numa. */ |
5153 | 0 | numa = sched_numa_list_lookup(numa_list, port_numa_id); |
5154 | | |
5155 | | /* Check if numa has no PMDs or no non-isolated PMDs. */ |
5156 | 0 | if (!numa || !sched_numa_noniso_pmd_count(numa)) { |
5157 | | /* Unable to use this numa to find a PMD. */ |
5158 | 0 | numa = NULL; |
5159 | | /* Find any numa with available PMDs. */ |
5160 | 0 | for (int j = 0; j < n_numa; j++) { |
5161 | 0 | numa = sched_numa_list_next(numa_list, last_cross_numa); |
5162 | 0 | last_cross_numa = numa; |
5163 | 0 | if (sched_numa_noniso_pmd_count(numa)) { |
5164 | 0 | break; |
5165 | 0 | } |
5166 | 0 | numa = NULL; |
5167 | 0 | } |
5168 | 0 | } |
5169 | |
|
5170 | 0 | if (numa) { |
5171 | | /* Select the PMD that should be used for this rxq. */ |
5172 | 0 | sched_pmd = sched_pmd_next(numa, algo, |
5173 | 0 | proc_cycles ? true : false); |
5174 | 0 | } |
5175 | | |
5176 | | /* Check that a pmd has been selected. */ |
5177 | 0 | if (sched_pmd) { |
5178 | 0 | int pmd_numa_id; |
5179 | |
|
5180 | 0 | pmd_numa_id = sched_pmd->numa->numa_id; |
5181 | | /* Check if selected pmd numa matches port numa. */ |
5182 | 0 | if (pmd_numa_id != port_numa_id) { |
5183 | 0 | VLOG(level, "There's no available (non-isolated) pmd thread " |
5184 | 0 | "on numa node %d. Port \'%s\' rx queue %d will " |
5185 | 0 | "be assigned to a pmd on numa node %d. " |
5186 | 0 | "This may lead to reduced performance.", |
5187 | 0 | port_numa_id, netdev_rxq_get_name(rxq->rx), |
5188 | 0 | netdev_rxq_get_queue_id(rxq->rx), pmd_numa_id); |
5189 | 0 | } |
5190 | 0 | VLOG(level, "Core %2u on numa node %d assigned port \'%s\' " |
5191 | 0 | "rx queue %d%s.", |
5192 | 0 | sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id, |
5193 | 0 | netdev_rxq_get_name(rxq->rx), |
5194 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
5195 | 0 | get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); |
5196 | 0 | sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); |
5197 | 0 | } else { |
5198 | 0 | VLOG(level == VLL_DBG ? level : VLL_WARN, |
5199 | 0 | "No non-isolated pmd on any numa available for " |
5200 | 0 | "port \'%s\' rx queue %d%s. " |
5201 | 0 | "This rx queue will not be polled.", |
5202 | 0 | netdev_rxq_get_name(rxq->rx), |
5203 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
5204 | 0 | get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); |
5205 | 0 | } |
5206 | 0 | } |
5207 | 0 | free(rxqs); |
5208 | 0 | } |
5209 | | |
5210 | | static void |
5211 | | rxq_scheduling(struct dp_netdev *dp) |
5212 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5213 | 0 | { |
5214 | 0 | struct sched_numa_list numa_list; |
5215 | 0 | enum sched_assignment_type algo = dp->pmd_rxq_assign_type; |
5216 | |
|
5217 | 0 | sched_numa_list_populate(&numa_list, dp); |
5218 | 0 | sched_numa_list_schedule(&numa_list, dp, algo, VLL_INFO); |
5219 | 0 | sched_numa_list_put_in_place(&numa_list); |
5220 | |
|
5221 | 0 | sched_numa_list_free_entries(&numa_list); |
5222 | 0 | } |
5223 | | |
5224 | | static uint64_t variance(uint64_t a[], int n); |
5225 | | |
5226 | | static uint64_t |
5227 | | sched_numa_variance(struct sched_numa *numa) |
5228 | 0 | { |
5229 | 0 | uint64_t *percent_busy = NULL; |
5230 | 0 | int n_proc = 0; |
5231 | 0 | uint64_t var; |
5232 | |
|
5233 | 0 | percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy); |
5234 | |
|
5235 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
5236 | 0 | struct sched_pmd *sched_pmd; |
5237 | 0 | uint64_t total_cycles = 0; |
5238 | |
|
5239 | 0 | sched_pmd = &numa->pmds[i]; |
5240 | | /* Exclude isolated PMDs from variance calculations. */ |
5241 | 0 | if (sched_pmd->isolated == true) { |
5242 | 0 | continue; |
5243 | 0 | } |
5244 | | /* Get the total pmd cycles for an interval. */ |
5245 | 0 | atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles); |
5246 | |
|
5247 | 0 | if (total_cycles) { |
5248 | | /* Estimate the cycles to cover all intervals. */ |
5249 | 0 | total_cycles *= PMD_INTERVAL_MAX; |
5250 | 0 | percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100) |
5251 | 0 | / total_cycles; |
5252 | 0 | } else { |
5253 | 0 | percent_busy[n_proc++] = 0; |
5254 | 0 | } |
5255 | 0 | } |
5256 | 0 | var = variance(percent_busy, n_proc); |
5257 | 0 | free(percent_busy); |
5258 | 0 | return var; |
5259 | 0 | } |
5260 | | |
5261 | | /* |
5262 | | * This function checks that some basic conditions needed for a rebalance to be |
5263 | | * effective are met. Such as Rxq scheduling assignment type, more than one |
5264 | | * PMD, more than 2 Rxqs on a PMD. If there was no reconfiguration change |
5265 | | * since the last check, it reuses the last result. |
5266 | | * |
5267 | | * It is not intended to be an inclusive check of every condition that may make |
5268 | | * a rebalance ineffective. It is done as a quick check so a full |
5269 | | * pmd_rebalance_dry_run() can be avoided when it is not needed. |
5270 | | */ |
5271 | | static bool |
5272 | | pmd_rebalance_dry_run_needed(struct dp_netdev *dp) |
5273 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5274 | 0 | { |
5275 | 0 | struct dp_netdev_pmd_thread *pmd; |
5276 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
5277 | 0 | unsigned int cnt = 0; |
5278 | 0 | bool multi_rxq = false; |
5279 | | |
5280 | | /* Check if there was no reconfiguration since last check. */ |
5281 | 0 | if (!pmd_alb->recheck_config) { |
5282 | 0 | if (!pmd_alb->do_dry_run) { |
5283 | 0 | VLOG_DBG("PMD auto load balance nothing to do, " |
5284 | 0 | "no configuration changes since last check."); |
5285 | 0 | return false; |
5286 | 0 | } |
5287 | 0 | return true; |
5288 | 0 | } |
5289 | 0 | pmd_alb->recheck_config = false; |
5290 | | |
5291 | | /* Check for incompatible assignment type. */ |
5292 | 0 | if (dp->pmd_rxq_assign_type == SCHED_ROUNDROBIN) { |
5293 | 0 | VLOG_DBG("PMD auto load balance nothing to do, " |
5294 | 0 | "pmd-rxq-assign=roundrobin assignment type configured."); |
5295 | 0 | return pmd_alb->do_dry_run = false; |
5296 | 0 | } |
5297 | | |
5298 | | /* Check that there is at least 2 non-isolated PMDs and |
5299 | | * one of them is polling more than one rxq. */ |
5300 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5301 | 0 | if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) { |
5302 | 0 | continue; |
5303 | 0 | } |
5304 | | |
5305 | 0 | if (hmap_count(&pmd->poll_list) > 1) { |
5306 | 0 | multi_rxq = true; |
5307 | 0 | } |
5308 | 0 | if (cnt && multi_rxq) { |
5309 | 0 | return pmd_alb->do_dry_run = true; |
5310 | 0 | } |
5311 | 0 | cnt++; |
5312 | 0 | } |
5313 | | |
5314 | 0 | VLOG_DBG("PMD auto load balance nothing to do, " |
5315 | 0 | "not enough non-isolated PMDs or RxQs."); |
5316 | 0 | return pmd_alb->do_dry_run = false; |
5317 | 0 | } |
5318 | | |
5319 | | static bool |
5320 | | pmd_rebalance_dry_run(struct dp_netdev *dp) |
5321 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5322 | 0 | { |
5323 | 0 | struct sched_numa_list numa_list_cur; |
5324 | 0 | struct sched_numa_list numa_list_est; |
5325 | 0 | bool thresh_met = false; |
5326 | |
|
5327 | 0 | VLOG_DBG("PMD auto load balance performing dry run."); |
5328 | | |
5329 | | /* Populate current assignments. */ |
5330 | 0 | sched_numa_list_populate(&numa_list_cur, dp); |
5331 | 0 | sched_numa_list_assignments(&numa_list_cur, dp); |
5332 | | |
5333 | | /* Populate estimated assignments. */ |
5334 | 0 | sched_numa_list_populate(&numa_list_est, dp); |
5335 | 0 | sched_numa_list_schedule(&numa_list_est, dp, |
5336 | 0 | dp->pmd_rxq_assign_type, VLL_DBG); |
5337 | | |
5338 | | /* Check if cross-numa polling, there is only one numa with PMDs. */ |
5339 | 0 | if (!sched_numa_list_cross_numa_polling(&numa_list_est) || |
5340 | 0 | sched_numa_list_count(&numa_list_est) == 1) { |
5341 | 0 | struct sched_numa *numa_cur; |
5342 | | |
5343 | | /* Calculate variances. */ |
5344 | 0 | HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) { |
5345 | 0 | uint64_t current_var, estimate_var; |
5346 | 0 | struct sched_numa *numa_est; |
5347 | 0 | uint64_t improvement = 0; |
5348 | |
|
5349 | 0 | numa_est = sched_numa_list_lookup(&numa_list_est, |
5350 | 0 | numa_cur->numa_id); |
5351 | 0 | if (!numa_est) { |
5352 | 0 | continue; |
5353 | 0 | } |
5354 | 0 | current_var = sched_numa_variance(numa_cur); |
5355 | 0 | estimate_var = sched_numa_variance(numa_est); |
5356 | 0 | if (estimate_var < current_var) { |
5357 | 0 | improvement = ((current_var - estimate_var) * 100) |
5358 | 0 | / current_var; |
5359 | 0 | } |
5360 | 0 | VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated " |
5361 | 0 | "variance %"PRIu64". Variance improvement %"PRIu64"%%.", |
5362 | 0 | numa_cur->numa_id, current_var, |
5363 | 0 | estimate_var, improvement); |
5364 | 0 | if (improvement >= dp->pmd_alb.rebalance_improve_thresh) { |
5365 | 0 | thresh_met = true; |
5366 | 0 | } |
5367 | 0 | } |
5368 | 0 | VLOG_DBG("PMD load variance improvement threshold %u%% is %s.", |
5369 | 0 | dp->pmd_alb.rebalance_improve_thresh, |
5370 | 0 | thresh_met ? "met" : "not met"); |
5371 | 0 | } else { |
5372 | 0 | VLOG_DBG("PMD auto load balance detected cross-numa polling with " |
5373 | 0 | "multiple numa nodes. Unable to accurately estimate."); |
5374 | 0 | } |
5375 | |
|
5376 | 0 | sched_numa_list_free_entries(&numa_list_cur); |
5377 | 0 | sched_numa_list_free_entries(&numa_list_est); |
5378 | |
|
5379 | 0 | return thresh_met; |
5380 | 0 | } |
5381 | | |
5382 | | static void |
5383 | | reload_affected_pmds(struct dp_netdev *dp) |
5384 | 0 | { |
5385 | 0 | struct dp_netdev_pmd_thread *pmd; |
5386 | |
|
5387 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5388 | 0 | if (pmd->need_reload) { |
5389 | 0 | dp_netdev_reload_pmd__(pmd); |
5390 | 0 | } |
5391 | 0 | } |
5392 | |
|
5393 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5394 | 0 | if (pmd->need_reload) { |
5395 | 0 | if (pmd->core_id != NON_PMD_CORE_ID) { |
5396 | 0 | bool reload; |
5397 | |
|
5398 | 0 | do { |
5399 | 0 | atomic_read_explicit(&pmd->reload, &reload, |
5400 | 0 | memory_order_acquire); |
5401 | 0 | } while (reload); |
5402 | 0 | } |
5403 | 0 | pmd->need_reload = false; |
5404 | 0 | } |
5405 | 0 | } |
5406 | 0 | } |
5407 | | |
5408 | | static void |
5409 | | reconfigure_pmd_threads(struct dp_netdev *dp) |
5410 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5411 | 0 | { |
5412 | 0 | struct dp_netdev_pmd_thread *pmd; |
5413 | 0 | struct ovs_numa_dump *pmd_cores; |
5414 | 0 | struct ovs_numa_info_core *core; |
5415 | 0 | struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete); |
5416 | 0 | struct hmapx_node *node; |
5417 | 0 | bool changed = false; |
5418 | 0 | bool need_to_adjust_static_tx_qids = false; |
5419 | | |
5420 | | /* The pmd threads should be started only if there's a pmd port in the |
5421 | | * datapath. If the user didn't provide any "pmd-cpu-mask", we start |
5422 | | * NR_PMD_THREADS per numa node. */ |
5423 | 0 | if (!has_pmd_port(dp)) { |
5424 | 0 | pmd_cores = ovs_numa_dump_n_cores_per_numa(0); |
5425 | 0 | } else if (dp->pmd_cmask && dp->pmd_cmask[0]) { |
5426 | 0 | pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask); |
5427 | 0 | } else { |
5428 | 0 | pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS); |
5429 | 0 | } |
5430 | | |
5431 | | /* We need to adjust 'static_tx_qid's only if we're reducing number of |
5432 | | * PMD threads. Otherwise, new threads will allocate all the freed ids. */ |
5433 | 0 | if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) { |
5434 | | /* Adjustment is required to keep 'static_tx_qid's sequential and |
5435 | | * avoid possible issues, for example, imbalanced tx queue usage |
5436 | | * and unnecessary locking caused by remapping on netdev level. */ |
5437 | 0 | need_to_adjust_static_tx_qids = true; |
5438 | 0 | } |
5439 | | |
5440 | | /* Check for unwanted pmd threads */ |
5441 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5442 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
5443 | 0 | continue; |
5444 | 0 | } |
5445 | 0 | if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id, |
5446 | 0 | pmd->core_id)) { |
5447 | 0 | hmapx_add(&to_delete, pmd); |
5448 | 0 | } else if (need_to_adjust_static_tx_qids) { |
5449 | 0 | atomic_store_relaxed(&pmd->reload_tx_qid, true); |
5450 | 0 | pmd->need_reload = true; |
5451 | 0 | } |
5452 | 0 | } |
5453 | |
|
5454 | 0 | HMAPX_FOR_EACH (node, &to_delete) { |
5455 | 0 | pmd = (struct dp_netdev_pmd_thread *) node->data; |
5456 | 0 | VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.", |
5457 | 0 | pmd->numa_id, pmd->core_id); |
5458 | 0 | dp_netdev_del_pmd(dp, pmd); |
5459 | 0 | } |
5460 | 0 | changed = !hmapx_is_empty(&to_delete); |
5461 | 0 | hmapx_destroy(&to_delete); |
5462 | |
|
5463 | 0 | if (need_to_adjust_static_tx_qids) { |
5464 | | /* 'static_tx_qid's are not sequential now. |
5465 | | * Reload remaining threads to fix this. */ |
5466 | 0 | reload_affected_pmds(dp); |
5467 | 0 | } |
5468 | | |
5469 | | /* Check for required new pmd threads */ |
5470 | 0 | FOR_EACH_CORE_ON_DUMP(core, pmd_cores) { |
5471 | 0 | pmd = dp_netdev_get_pmd(dp, core->core_id); |
5472 | 0 | if (!pmd) { |
5473 | 0 | struct ds name = DS_EMPTY_INITIALIZER; |
5474 | |
|
5475 | 0 | pmd = xzalloc(sizeof *pmd); |
5476 | 0 | dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id); |
5477 | |
|
5478 | 0 | ds_put_format(&name, "pmd-c%02d/id:", core->core_id); |
5479 | 0 | pmd->thread = ovs_thread_create(ds_cstr(&name), |
5480 | 0 | pmd_thread_main, pmd); |
5481 | 0 | ds_destroy(&name); |
5482 | |
|
5483 | 0 | VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.", |
5484 | 0 | pmd->numa_id, pmd->core_id); |
5485 | 0 | changed = true; |
5486 | 0 | } else { |
5487 | 0 | dp_netdev_pmd_unref(pmd); |
5488 | 0 | } |
5489 | 0 | } |
5490 | |
|
5491 | 0 | if (changed) { |
5492 | 0 | struct ovs_numa_info_numa *numa; |
5493 | | |
5494 | | /* Log the number of pmd threads per numa node. */ |
5495 | 0 | FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) { |
5496 | 0 | VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d", |
5497 | 0 | numa->n_cores, numa->numa_id); |
5498 | 0 | } |
5499 | 0 | } |
5500 | |
|
5501 | 0 | ovs_numa_dump_destroy(pmd_cores); |
5502 | 0 | } |
5503 | | |
5504 | | static void |
5505 | | pmd_remove_stale_ports(struct dp_netdev *dp, |
5506 | | struct dp_netdev_pmd_thread *pmd) |
5507 | | OVS_EXCLUDED(pmd->port_mutex) |
5508 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5509 | 0 | { |
5510 | 0 | struct rxq_poll *poll; |
5511 | 0 | struct tx_port *tx; |
5512 | |
|
5513 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
5514 | 0 | HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) { |
5515 | 0 | struct dp_netdev_port *port = poll->rxq->port; |
5516 | |
|
5517 | 0 | if (port->need_reconfigure |
5518 | 0 | || !hmap_contains(&dp->ports, &port->node)) { |
5519 | 0 | dp_netdev_del_rxq_from_pmd(pmd, poll); |
5520 | 0 | } |
5521 | 0 | } |
5522 | 0 | HMAP_FOR_EACH_SAFE (tx, node, &pmd->tx_ports) { |
5523 | 0 | struct dp_netdev_port *port = tx->port; |
5524 | |
|
5525 | 0 | if (port->need_reconfigure |
5526 | 0 | || !hmap_contains(&dp->ports, &port->node)) { |
5527 | 0 | dp_netdev_del_port_tx_from_pmd(pmd, tx); |
5528 | 0 | } |
5529 | 0 | } |
5530 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
5531 | 0 | } |
5532 | | |
5533 | | /* Must be called each time a port is added/removed or the cmask changes. |
5534 | | * This creates and destroys pmd threads, reconfigures ports, opens their |
5535 | | * rxqs and assigns all rxqs/txqs to pmd threads. */ |
5536 | | static void |
5537 | | reconfigure_datapath(struct dp_netdev *dp) |
5538 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5539 | 0 | { |
5540 | 0 | struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads); |
5541 | 0 | struct dp_netdev_pmd_thread *pmd; |
5542 | 0 | struct dp_netdev_port *port; |
5543 | 0 | int wanted_txqs; |
5544 | |
|
5545 | 0 | dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq); |
5546 | | |
5547 | | /* Step 1: Adjust the pmd threads based on the datapath ports, the cores |
5548 | | * on the system and the user configuration. */ |
5549 | 0 | reconfigure_pmd_threads(dp); |
5550 | |
|
5551 | 0 | wanted_txqs = cmap_count(&dp->poll_threads); |
5552 | | |
5553 | | /* The number of pmd threads might have changed, or a port can be new: |
5554 | | * adjust the txqs. */ |
5555 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5556 | 0 | netdev_set_tx_multiq(port->netdev, wanted_txqs); |
5557 | 0 | } |
5558 | | |
5559 | | /* Step 2: Remove from the pmd threads ports that have been removed or |
5560 | | * need reconfiguration. */ |
5561 | | |
5562 | | /* Check for all the ports that need reconfiguration. We cache this in |
5563 | | * 'port->need_reconfigure', because netdev_is_reconf_required() can |
5564 | | * change at any time. |
5565 | | * Also mark for reconfiguration all ports which will likely change their |
5566 | | * 'txq_mode' parameter. It's required to stop using them before |
5567 | | * changing this setting and it's simpler to mark ports here and allow |
5568 | | * 'pmd_remove_stale_ports' to remove them from threads. There will be |
5569 | | * no actual reconfiguration in 'port_reconfigure' because it's |
5570 | | * unnecessary. */ |
5571 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5572 | 0 | if (netdev_is_reconf_required(port->netdev) |
5573 | 0 | || ((port->txq_mode == TXQ_MODE_XPS) |
5574 | 0 | != (netdev_n_txq(port->netdev) < wanted_txqs)) |
5575 | 0 | || ((port->txq_mode == TXQ_MODE_XPS_HASH) |
5576 | 0 | != (port->txq_requested_mode == TXQ_REQ_MODE_HASH |
5577 | 0 | && netdev_n_txq(port->netdev) > 1))) { |
5578 | 0 | port->need_reconfigure = true; |
5579 | 0 | } |
5580 | 0 | } |
5581 | | |
5582 | | /* Remove from the pmd threads all the ports that have been deleted or |
5583 | | * need reconfiguration. */ |
5584 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5585 | 0 | pmd_remove_stale_ports(dp, pmd); |
5586 | 0 | } |
5587 | | |
5588 | | /* Reload affected pmd threads. We must wait for the pmd threads before |
5589 | | * reconfiguring the ports, because a port cannot be reconfigured while |
5590 | | * it's being used. */ |
5591 | 0 | reload_affected_pmds(dp); |
5592 | | |
5593 | | /* Step 3: Reconfigure ports. */ |
5594 | | |
5595 | | /* We only reconfigure the ports that we determined above, because they're |
5596 | | * not being used by any pmd thread at the moment. If a port fails to |
5597 | | * reconfigure we remove it from the datapath. */ |
5598 | 0 | HMAP_FOR_EACH_SAFE (port, node, &dp->ports) { |
5599 | 0 | int err; |
5600 | |
|
5601 | 0 | if (!port->need_reconfigure) { |
5602 | 0 | continue; |
5603 | 0 | } |
5604 | | |
5605 | 0 | err = port_reconfigure(port); |
5606 | 0 | if (err) { |
5607 | 0 | hmap_remove(&dp->ports, &port->node); |
5608 | 0 | seq_change(dp->port_seq); |
5609 | 0 | port_destroy(port); |
5610 | 0 | } else { |
5611 | | /* With a single queue, there is no point in using hash mode. */ |
5612 | 0 | if (port->txq_requested_mode == TXQ_REQ_MODE_HASH && |
5613 | 0 | netdev_n_txq(port->netdev) > 1) { |
5614 | 0 | port->txq_mode = TXQ_MODE_XPS_HASH; |
5615 | 0 | } else if (netdev_n_txq(port->netdev) < wanted_txqs) { |
5616 | 0 | port->txq_mode = TXQ_MODE_XPS; |
5617 | 0 | } else { |
5618 | 0 | port->txq_mode = TXQ_MODE_STATIC; |
5619 | 0 | } |
5620 | 0 | } |
5621 | 0 | } |
5622 | | |
5623 | | /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads |
5624 | | * for now, we just update the 'pmd' pointer in each rxq to point to the |
5625 | | * wanted thread according to the scheduling policy. */ |
5626 | | |
5627 | | /* Reset all the pmd threads to non isolated. */ |
5628 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5629 | 0 | pmd->isolated = false; |
5630 | 0 | } |
5631 | | |
5632 | | /* Reset all the queues to unassigned */ |
5633 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5634 | 0 | for (int i = 0; i < port->n_rxq; i++) { |
5635 | 0 | port->rxqs[i].pmd = NULL; |
5636 | 0 | } |
5637 | 0 | } |
5638 | 0 | rxq_scheduling(dp); |
5639 | | |
5640 | | /* Step 5: Remove queues not compliant with new scheduling. */ |
5641 | | |
5642 | | /* Count all the threads that will have at least one queue to poll. */ |
5643 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5644 | 0 | for (int qid = 0; qid < port->n_rxq; qid++) { |
5645 | 0 | struct dp_netdev_rxq *q = &port->rxqs[qid]; |
5646 | |
|
5647 | 0 | if (q->pmd) { |
5648 | 0 | hmapx_add(&busy_threads, q->pmd); |
5649 | 0 | } |
5650 | 0 | } |
5651 | 0 | } |
5652 | |
|
5653 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5654 | 0 | struct rxq_poll *poll; |
5655 | |
|
5656 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
5657 | 0 | HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) { |
5658 | 0 | if (poll->rxq->pmd != pmd) { |
5659 | 0 | dp_netdev_del_rxq_from_pmd(pmd, poll); |
5660 | | |
5661 | | /* This pmd might sleep after this step if it has no rxq |
5662 | | * remaining. Tell it to busy wait for new assignment if it |
5663 | | * has at least one scheduled queue. */ |
5664 | 0 | if (hmap_count(&pmd->poll_list) == 0 && |
5665 | 0 | hmapx_contains(&busy_threads, pmd)) { |
5666 | 0 | atomic_store_relaxed(&pmd->wait_for_reload, true); |
5667 | 0 | } |
5668 | 0 | } |
5669 | 0 | } |
5670 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
5671 | 0 | } |
5672 | |
|
5673 | 0 | hmapx_destroy(&busy_threads); |
5674 | | |
5675 | | /* Reload affected pmd threads. We must wait for the pmd threads to remove |
5676 | | * the old queues before readding them, otherwise a queue can be polled by |
5677 | | * two threads at the same time. */ |
5678 | 0 | reload_affected_pmds(dp); |
5679 | | |
5680 | | /* Step 6: Add queues from scheduling, if they're not there already. */ |
5681 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5682 | 0 | if (!netdev_is_pmd(port->netdev)) { |
5683 | 0 | continue; |
5684 | 0 | } |
5685 | | |
5686 | 0 | for (int qid = 0; qid < port->n_rxq; qid++) { |
5687 | 0 | struct dp_netdev_rxq *q = &port->rxqs[qid]; |
5688 | |
|
5689 | 0 | if (q->pmd) { |
5690 | 0 | ovs_mutex_lock(&q->pmd->port_mutex); |
5691 | 0 | dp_netdev_add_rxq_to_pmd(q->pmd, q); |
5692 | 0 | ovs_mutex_unlock(&q->pmd->port_mutex); |
5693 | 0 | } |
5694 | 0 | } |
5695 | 0 | } |
5696 | | |
5697 | | /* Add every port and bond to the tx port and bond caches of |
5698 | | * every pmd thread, if it's not there already and if this pmd |
5699 | | * has at least one rxq to poll. |
5700 | | */ |
5701 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5702 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
5703 | 0 | if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) { |
5704 | 0 | struct tx_bond *bond; |
5705 | |
|
5706 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5707 | 0 | dp_netdev_add_port_tx_to_pmd(pmd, port); |
5708 | 0 | } |
5709 | |
|
5710 | 0 | CMAP_FOR_EACH (bond, node, &dp->tx_bonds) { |
5711 | 0 | dp_netdev_add_bond_tx_to_pmd(pmd, bond, false); |
5712 | 0 | } |
5713 | 0 | } |
5714 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
5715 | 0 | } |
5716 | | |
5717 | | /* Reload affected pmd threads. */ |
5718 | 0 | reload_affected_pmds(dp); |
5719 | | |
5720 | | /* PMD ALB will need to recheck if dry run needed. */ |
5721 | 0 | dp->pmd_alb.recheck_config = true; |
5722 | 0 | } |
5723 | | |
5724 | | /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */ |
5725 | | static bool |
5726 | | ports_require_restart(const struct dp_netdev *dp) |
5727 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5728 | 0 | { |
5729 | 0 | struct dp_netdev_port *port; |
5730 | |
|
5731 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5732 | 0 | if (netdev_is_reconf_required(port->netdev)) { |
5733 | 0 | return true; |
5734 | 0 | } |
5735 | 0 | } |
5736 | | |
5737 | 0 | return false; |
5738 | 0 | } |
5739 | | |
/* Returns the variance of the first 'n' values of array 'a', computed with
 * integer arithmetic: the mean is the truncated 'sum / n' and the result is
 * the truncated average of the squared differences from that mean.
 *
 * Returns 0 when 'n' is 0 or when all considered values are 0.
 *
 * Usage example: 'a' holds the processing load of each PMD and 'n' is the
 * number of PMDs; the result is the variance in processing load of the
 * PMDs. */
static uint64_t
variance(uint64_t a[], int n)
{
    uint64_t total = 0;
    uint64_t sq_total = 0;
    uint64_t avg;

    if (n == 0) {
        return 0;
    }

    for (int i = 0; i < n; i++) {
        total += a[i];
    }
    if (total == 0) {
        return 0;
    }

    avg = total / n;

    /* Accumulate the squared differences with the mean.  The difference may
     * wrap around for values below the mean, but its square is still the
     * expected value in unsigned arithmetic. */
    for (int i = 0; i < n; i++) {
        uint64_t delta = a[i] - avg;

        sq_total += delta * delta;
    }

    return sq_total / n;
}
5771 | | |
5772 | | /* Return true if needs to revalidate datapath flows. */ |
5773 | | static bool |
5774 | | dpif_netdev_run(struct dpif *dpif) |
5775 | 0 | { |
5776 | 0 | struct dp_netdev_port *port; |
5777 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
5778 | 0 | struct dp_netdev_pmd_thread *non_pmd; |
5779 | 0 | uint64_t new_tnl_seq; |
5780 | 0 | bool need_to_flush = true; |
5781 | 0 | bool pmd_rebalance = false; |
5782 | 0 | long long int now = time_msec(); |
5783 | 0 | struct dp_netdev_pmd_thread *pmd; |
5784 | |
|
5785 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
5786 | 0 | non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID); |
5787 | 0 | if (non_pmd) { |
5788 | 0 | ovs_mutex_lock(&dp->non_pmd_mutex); |
5789 | |
|
5790 | 0 | atomic_read_relaxed(&dp->smc_enable_db, &non_pmd->ctx.smc_enable_db); |
5791 | |
|
5792 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5793 | 0 | if (!netdev_is_pmd(port->netdev)) { |
5794 | 0 | int i; |
5795 | |
|
5796 | 0 | if (port->emc_enabled) { |
5797 | 0 | atomic_read_relaxed(&dp->emc_insert_min, |
5798 | 0 | &non_pmd->ctx.emc_insert_min); |
5799 | 0 | } else { |
5800 | 0 | non_pmd->ctx.emc_insert_min = 0; |
5801 | 0 | } |
5802 | |
|
5803 | 0 | for (i = 0; i < port->n_rxq; i++) { |
5804 | |
|
5805 | 0 | if (!netdev_rxq_enabled(port->rxqs[i].rx)) { |
5806 | 0 | continue; |
5807 | 0 | } |
5808 | | |
5809 | 0 | if (dp_netdev_process_rxq_port(non_pmd, |
5810 | 0 | &port->rxqs[i], |
5811 | 0 | port->port_no)) { |
5812 | 0 | need_to_flush = false; |
5813 | 0 | } |
5814 | 0 | } |
5815 | 0 | } |
5816 | 0 | } |
5817 | 0 | if (need_to_flush) { |
5818 | | /* We didn't receive anything in the process loop. |
5819 | | * Check if we need to send something. |
5820 | | * There was no time updates on current iteration. */ |
5821 | 0 | pmd_thread_ctx_time_update(non_pmd); |
5822 | 0 | dp_netdev_pmd_flush_output_packets(non_pmd, false); |
5823 | 0 | } |
5824 | |
|
5825 | 0 | dpif_netdev_xps_revalidate_pmd(non_pmd, false); |
5826 | 0 | ovs_mutex_unlock(&dp->non_pmd_mutex); |
5827 | |
|
5828 | 0 | dp_netdev_pmd_unref(non_pmd); |
5829 | 0 | } |
5830 | |
|
5831 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
5832 | 0 | if (pmd_alb->is_enabled) { |
5833 | 0 | if (!pmd_alb->rebalance_poll_timer) { |
5834 | 0 | pmd_alb->rebalance_poll_timer = now; |
5835 | 0 | } else if ((pmd_alb->rebalance_poll_timer + |
5836 | 0 | pmd_alb->rebalance_intvl) < now) { |
5837 | 0 | pmd_alb->rebalance_poll_timer = now; |
5838 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5839 | 0 | if (atomic_count_get(&pmd->pmd_overloaded) >= |
5840 | 0 | PMD_INTERVAL_MAX) { |
5841 | 0 | pmd_rebalance = true; |
5842 | 0 | break; |
5843 | 0 | } |
5844 | 0 | } |
5845 | |
|
5846 | 0 | if (pmd_rebalance && |
5847 | 0 | !dp_netdev_is_reconf_required(dp) && |
5848 | 0 | !ports_require_restart(dp) && |
5849 | 0 | pmd_rebalance_dry_run_needed(dp) && |
5850 | 0 | pmd_rebalance_dry_run(dp)) { |
5851 | 0 | VLOG_INFO("PMD auto load balance dry run. " |
5852 | 0 | "Requesting datapath reconfigure."); |
5853 | 0 | dp_netdev_request_reconfigure(dp); |
5854 | 0 | } |
5855 | 0 | } |
5856 | 0 | } |
5857 | |
|
5858 | 0 | if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) { |
5859 | 0 | reconfigure_datapath(dp); |
5860 | 0 | } |
5861 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
5862 | |
|
5863 | 0 | tnl_neigh_cache_run(); |
5864 | 0 | tnl_port_map_run(); |
5865 | 0 | new_tnl_seq = seq_read(tnl_conf_seq); |
5866 | |
|
5867 | 0 | if (dp->last_tnl_conf_seq != new_tnl_seq) { |
5868 | 0 | dp->last_tnl_conf_seq = new_tnl_seq; |
5869 | 0 | return true; |
5870 | 0 | } |
5871 | 0 | return false; |
5872 | 0 | } |
5873 | | |
5874 | | static void |
5875 | | dpif_netdev_wait(struct dpif *dpif) |
5876 | 0 | { |
5877 | 0 | struct dp_netdev_port *port; |
5878 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
5879 | |
|
5880 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
5881 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
5882 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5883 | 0 | netdev_wait_reconf_required(port->netdev); |
5884 | 0 | if (!netdev_is_pmd(port->netdev)) { |
5885 | 0 | int i; |
5886 | |
|
5887 | 0 | for (i = 0; i < port->n_rxq; i++) { |
5888 | 0 | netdev_rxq_wait(port->rxqs[i].rx); |
5889 | 0 | } |
5890 | 0 | } |
5891 | 0 | } |
5892 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
5893 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
5894 | 0 | seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq); |
5895 | 0 | } |
5896 | | |
5897 | | static void |
5898 | | pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd) |
5899 | 0 | { |
5900 | 0 | struct tx_port *tx_port_cached; |
5901 | | |
5902 | | /* Flush all the queued packets. */ |
5903 | 0 | dp_netdev_pmd_flush_output_packets(pmd, true); |
5904 | | /* Free all used tx queue ids. */ |
5905 | 0 | dpif_netdev_xps_revalidate_pmd(pmd, true); |
5906 | |
|
5907 | 0 | HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) { |
5908 | 0 | free(tx_port_cached->txq_pkts); |
5909 | 0 | free(tx_port_cached); |
5910 | 0 | } |
5911 | 0 | HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) { |
5912 | 0 | free(tx_port_cached->txq_pkts); |
5913 | 0 | free(tx_port_cached); |
5914 | 0 | } |
5915 | 0 | } |
5916 | | |
5917 | | /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to |
5918 | | * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel |
5919 | | * device, otherwise to 'pmd->send_port_cache' if the port has at least |
5920 | | * one txq. */ |
5921 | | static void |
5922 | | pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd) |
5923 | | OVS_REQUIRES(pmd->port_mutex) |
5924 | 0 | { |
5925 | 0 | struct tx_port *tx_port, *tx_port_cached; |
5926 | |
|
5927 | 0 | pmd_free_cached_ports(pmd); |
5928 | 0 | hmap_shrink(&pmd->send_port_cache); |
5929 | 0 | hmap_shrink(&pmd->tnl_port_cache); |
5930 | |
|
5931 | 0 | HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) { |
5932 | 0 | int n_txq = netdev_n_txq(tx_port->port->netdev); |
5933 | 0 | struct dp_packet_batch *txq_pkts_cached; |
5934 | |
|
5935 | 0 | if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) { |
5936 | 0 | tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached); |
5937 | 0 | if (tx_port->txq_pkts) { |
5938 | 0 | txq_pkts_cached = xmemdup(tx_port->txq_pkts, |
5939 | 0 | n_txq * sizeof *tx_port->txq_pkts); |
5940 | 0 | tx_port_cached->txq_pkts = txq_pkts_cached; |
5941 | 0 | } |
5942 | 0 | hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node, |
5943 | 0 | hash_port_no(tx_port_cached->port->port_no)); |
5944 | 0 | } |
5945 | |
|
5946 | 0 | if (n_txq) { |
5947 | 0 | tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached); |
5948 | 0 | if (tx_port->txq_pkts) { |
5949 | 0 | txq_pkts_cached = xmemdup(tx_port->txq_pkts, |
5950 | 0 | n_txq * sizeof *tx_port->txq_pkts); |
5951 | 0 | tx_port_cached->txq_pkts = txq_pkts_cached; |
5952 | 0 | } |
5953 | 0 | hmap_insert(&pmd->send_port_cache, &tx_port_cached->node, |
5954 | 0 | hash_port_no(tx_port_cached->port->port_no)); |
5955 | 0 | } |
5956 | 0 | } |
5957 | 0 | } |
5958 | | |
5959 | | static void |
5960 | | pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd) |
5961 | 0 | { |
5962 | 0 | ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); |
5963 | 0 | if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) { |
5964 | 0 | VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d" |
5965 | 0 | ", numa_id %d.", pmd->core_id, pmd->numa_id); |
5966 | 0 | } |
5967 | 0 | ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); |
5968 | |
|
5969 | 0 | VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d" |
5970 | 0 | ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id); |
5971 | 0 | } |
5972 | | |
5973 | | static void |
5974 | | pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd) |
5975 | 0 | { |
5976 | 0 | ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); |
5977 | 0 | id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid); |
5978 | 0 | ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); |
5979 | 0 | } |
5980 | | |
5981 | | static int |
5982 | | pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd, |
5983 | | struct polled_queue **ppoll_list) |
5984 | 0 | { |
5985 | 0 | struct polled_queue *poll_list = *ppoll_list; |
5986 | 0 | struct rxq_poll *poll; |
5987 | 0 | int i; |
5988 | |
|
5989 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
5990 | 0 | poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list) |
5991 | 0 | * sizeof *poll_list); |
5992 | |
|
5993 | 0 | i = 0; |
5994 | 0 | HMAP_FOR_EACH (poll, node, &pmd->poll_list) { |
5995 | 0 | poll_list[i].rxq = poll->rxq; |
5996 | 0 | poll_list[i].port_no = poll->rxq->port->port_no; |
5997 | 0 | poll_list[i].emc_enabled = poll->rxq->port->emc_enabled; |
5998 | 0 | poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx); |
5999 | 0 | poll_list[i].change_seq = |
6000 | 0 | netdev_get_change_seq(poll->rxq->port->netdev); |
6001 | 0 | i++; |
6002 | 0 | } |
6003 | |
|
6004 | 0 | pmd_load_cached_ports(pmd); |
6005 | |
|
6006 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
6007 | |
|
6008 | 0 | *ppoll_list = poll_list; |
6009 | 0 | return i; |
6010 | 0 | } |
6011 | | |
/* Body of a PMD thread.  'f_' is the 'struct dp_netdev_pmd_thread' that the
 * thread serves.
 *
 * After one-time setup the thread repeatedly polls the rx queues in its
 * private 'poll_list' until 'pmd->reload' is observed set.  It then reloads
 * its queues and ports, acknowledges the reload, and either jumps back to
 * the 'reload' label or, when 'pmd->exit' was set, releases its resources
 * and returns NULL. */
static void *
pmd_thread_main(void *f_)
{
    struct dp_netdev_pmd_thread *pmd = f_;
    struct pmd_perf_stats *s = &pmd->perf_stats;
    unsigned int lc = 0;
    struct polled_queue *poll_list;
    bool wait_for_reload = false;
    bool dpdk_attached;
    bool reload_tx_qid;
    bool exiting;
    bool reload;
    int poll_cnt;
    int i;
    int process_packets = 0;
    uint64_t sleep_time = 0;

    poll_list = NULL;

    /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
    ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
    ovs_numa_thread_setaffinity_core(pmd->core_id);
    dpdk_attached = dpdk_attach_thread(pmd->core_id);
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
    dfc_cache_init(&pmd->flow_cache);
    pmd_alloc_static_tx_qid(pmd);
    set_timer_resolution(PMD_TIMER_RES_NS);

    /* Execution returns here after every completed reload, so everything
     * below up to the polling loop is re-done for each new configuration. */
reload:
    atomic_count_init(&pmd->pmd_overloaded, 0);

    pmd->intrvl_tsc_prev = 0;
    atomic_store_relaxed(&pmd->intrvl_cycles, 0);

    /* Try again if an earlier attach attempt did not succeed. */
    if (!dpdk_attached) {
        dpdk_attached = dpdk_attach_thread(pmd->core_id);
    }

    /* List port/core affinity */
    for (i = 0; i < poll_cnt; i++) {
       VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
                pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
                netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
       /* Reset the rxq current cycles counter. */
       dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
       /* Store a zero PMD_INTERVAL_MAX times for this rxq. */
       for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
           dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0);
       }
    }

    /* Nothing to poll: wait here until a reload is requested. */
    if (!poll_cnt) {
        if (wait_for_reload) {
            /* Don't sleep, control thread will ask for a reload shortly. */
            do {
                atomic_read_explicit(&pmd->reload, &reload,
                                     memory_order_acquire);
            } while (!reload);
        } else {
            /* Block in the poll loop until 'reload_seq' changes. */
            while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
                seq_wait(pmd->reload_seq, pmd->last_reload_seq);
                poll_block();
            }
        }
    }

    for (i = 0; i < PMD_INTERVAL_MAX; i++) {
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0);
    }
    atomic_count_set(&pmd->intrvl_idx, 0);
    cycles_counter_update(s);

    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;

    /* Protect pmd stats from external clearing while polling. */
    ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
    for (;;) {
        uint64_t rx_packets = 0, tx_packets = 0;
        uint64_t time_slept = 0;
        uint64_t max_sleep;

        pmd_perf_start_iteration(s);

        /* Pick up the current settings once per iteration. */
        atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db);
        atomic_read_relaxed(&pmd->max_sleep, &max_sleep);

        for (i = 0; i < poll_cnt; i++) {

            if (!poll_list[i].rxq_enabled) {
                continue;
            }

            if (poll_list[i].emc_enabled) {
                atomic_read_relaxed(&pmd->dp->emc_insert_min,
                                    &pmd->ctx.emc_insert_min);
            } else {
                pmd->ctx.emc_insert_min = 0;
            }

            process_packets =
                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
                                           poll_list[i].port_no);
            rx_packets += process_packets;
            /* A queue that returned at least PMD_SLEEP_THRESH packets
             * cancels any pending sleep. */
            if (process_packets >= PMD_SLEEP_THRESH) {
                sleep_time = 0;
            }
        }

        if (!rx_packets) {
            /* We didn't receive anything in the process loop.
             * Check if we need to send something.
             * There was no time updates on current iteration. */
            pmd_thread_ctx_time_update(pmd);
            tx_packets = dp_netdev_pmd_flush_output_packets(pmd,
                                                   max_sleep && sleep_time
                                                   ? true : false);
        }

        if (max_sleep) {
            /* Check if a sleep should happen on this iteration. */
            if (sleep_time) {
                struct cycle_timer sleep_timer;

                cycle_timer_start(&pmd->perf_stats, &sleep_timer);
                /* 'sleep_time' is scaled by 1000 for the nanosecond sleep
                 * call; presumably it is kept in microseconds, as the
                 * PMD_SLEEP_INC_US name suggests. */
                xnanosleep_no_quiesce(sleep_time * 1000);
                time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer);
                pmd_thread_ctx_time_update(pmd);
            }
            if (sleep_time < max_sleep) {
                /* Increase sleep time for next iteration. */
                sleep_time += PMD_SLEEP_INC_US;
            } else {
                sleep_time = max_sleep;
            }
        } else {
            /* Reset sleep time as max sleep policy may have been changed. */
            sleep_time = 0;
        }

        /* Do RCU synchronization at fixed interval.  This ensures that
         * synchronization would not be delayed long even at high load of
         * packet processing. */
        if (pmd->ctx.now > pmd->next_rcu_quiesce) {
            if (!ovsrcu_try_quiesce()) {
                pmd->next_rcu_quiesce =
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
            }
        }

        /* Periodic housekeeping, run once 'lc' has counted past 1024
         * iterations. */
        if (lc++ > 1024) {
            lc = 0;

            coverage_try_clear();
            dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
            if (!ovsrcu_try_quiesce()) {
                emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
                pmd->next_rcu_quiesce =
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
            }

            /* Re-read the enabled state of every rxq whose netdev change
             * sequence number moved since it was last recorded. */
            for (i = 0; i < poll_cnt; i++) {
                uint64_t current_seq =
                         netdev_get_change_seq(poll_list[i].rxq->port->netdev);
                if (poll_list[i].change_seq != current_seq) {
                    poll_list[i].change_seq = current_seq;
                    poll_list[i].rxq_enabled =
                                 netdev_rxq_enabled(poll_list[i].rxq->rx);
                }
            }
        }

        /* Leaving the loop here skips pmd_perf_end_iteration() for the
         * iteration in which the reload request was observed. */
        atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
        if (OVS_UNLIKELY(reload)) {
            break;
        }

        pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept,
                               pmd_perf_metrics_enabled(pmd));
    }
    ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);

    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
    /* Snapshot the request flags before dp_netdev_pmd_reload_done() resets
     * 'wait_for_reload' and 'reload_tx_qid'. */
    atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
    atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
    atomic_read_relaxed(&pmd->exit, &exiting);
    /* Signal here to make sure the pmd finishes
     * reloading the updated configuration. */
    dp_netdev_pmd_reload_done(pmd);

    if (reload_tx_qid) {
        pmd_free_static_tx_qid(pmd);
        pmd_alloc_static_tx_qid(pmd);
    }

    if (!exiting) {
        goto reload;
    }

    /* Exit path: release everything this thread set up above. */
    pmd_free_static_tx_qid(pmd);
    dfc_cache_uninit(&pmd->flow_cache);
    free(poll_list);
    pmd_free_cached_ports(pmd);
    if (dpdk_attached) {
        dpdk_detach_thread();
    }
    return NULL;
}
6218 | | |
/* Disables upcalls for 'dp' by taking 'dp->upcall_rwlock' for writing.  The
 * lock is still held when this function returns (see the OVS_ACQUIRES
 * annotation); dp_netdev_enable_upcall() is the function that releases
 * it. */
static void
dp_netdev_disable_upcall(struct dp_netdev *dp)
    OVS_ACQUIRES(dp->upcall_rwlock)
{
    fat_rwlock_wrlock(&dp->upcall_rwlock);
}
6225 | | |
6226 | | |
6227 | | /* Meters */ |
6228 | | static void |
6229 | | dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED, |
6230 | | struct ofputil_meter_features *features) |
6231 | 0 | { |
6232 | 0 | features->max_meters = MAX_METERS; |
6233 | 0 | features->band_types = DP_SUPPORTED_METER_BAND_TYPES; |
6234 | 0 | features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK; |
6235 | 0 | features->max_bands = MAX_BANDS; |
6236 | 0 | features->max_color = 0; |
6237 | 0 | } |
6238 | | |
6239 | | /* Tries to atomically add 'n' to 'value' in terms of saturation arithmetic, |
6240 | | * i.e., if the result will be larger than 'max_value', will store 'max_value' |
6241 | | * instead. */ |
6242 | | static void |
6243 | | atomic_sat_add(atomic_uint64_t *value, uint64_t n, uint64_t max_value) |
6244 | 0 | { |
6245 | 0 | uint64_t current, new_value; |
6246 | |
|
6247 | 0 | atomic_read_relaxed(value, ¤t); |
6248 | 0 | do { |
6249 | 0 | new_value = current + n; |
6250 | 0 | new_value = MIN(new_value, max_value); |
6251 | 0 | } while (!atomic_compare_exchange_weak_relaxed(value, ¤t, |
6252 | 0 | new_value)); |
6253 | 0 | } |
6254 | | |
6255 | | /* Tries to atomically subtract 'n' from 'value'. Does not perform the |
6256 | | * operation and returns 'false' if the result will be less than 'min_value'. |
6257 | | * Otherwise, stores the result and returns 'true'. */ |
6258 | | static bool |
6259 | | atomic_bound_sub(atomic_uint64_t *value, uint64_t n, uint64_t min_value) |
6260 | 0 | { |
6261 | 0 | uint64_t current; |
6262 | |
|
6263 | 0 | atomic_read_relaxed(value, ¤t); |
6264 | 0 | do { |
6265 | 0 | if (current < min_value + n) { |
6266 | 0 | return false; |
6267 | 0 | } |
6268 | 0 | } while (!atomic_compare_exchange_weak_relaxed(value, ¤t, |
6269 | 0 | current - n)); |
6270 | 0 | return true; |
6271 | 0 | } |
6272 | | |
/* Applies the meter identified by 'meter_id' to 'packets_'.  Packets
 * that exceed a band are dropped in-place.
 *
 * 'now_ms' is the current time in milliseconds; it is compared against,
 * and stored into, 'meter->used' to decide how far to refill the band
 * buckets.  Does nothing if 'meter_id' is out of range or no such meter
 * exists in 'dp'.
 *
 * All shared meter state (the 'used' timestamp, the band buckets and the
 * statistics counters) is updated with atomic operations only. */
static void
dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
                    uint32_t meter_id, long long int now_ms)
{
    const size_t cnt = dp_packet_batch_size(packets_);
    uint32_t exceeded_rate[NETDEV_MAX_BURST];
    uint32_t exceeded_band[NETDEV_MAX_BURST];
    uint64_t bytes, volume, meter_used, old;
    uint64_t band_packets[MAX_BANDS];
    uint64_t band_bytes[MAX_BANDS];
    struct dp_meter_band *band;
    struct dp_packet *packet;
    struct dp_meter *meter;
    bool exceeded = false;

    if (meter_id >= MAX_METERS) {
        return;
    }

    meter = dp_meter_lookup(&dp->meters, meter_id);
    if (!meter) {
        return;
    }

    /* Set every entry to UINT32_MAX (all bits one), which the drop loop
     * below treats as "no band exceeded for this packet". */
    memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
    /* Initialize as zeroes. */
    memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);

    /* Advance 'meter->used' to 'now_ms'.  On success 'meter_used' keeps the
     * previous timestamp, so the elapsed time can be computed below. */
    atomic_read_relaxed(&meter->used, &meter_used);
    do {
        if (meter_used >= now_ms) {
            /* The '>' condition means that we have several threads hitting the
             * same meter, and the other one already advanced the time. */
            meter_used = now_ms;
            break;
        }
    } while (!atomic_compare_exchange_weak_relaxed(&meter->used,
                                                   &meter_used, now_ms));

    /* Refill all buckets right away, since other threads may use them. */
    if (meter_used < now_ms) {
        /* All packets will hit the meter at the same time. */
        uint64_t delta_t = now_ms - meter_used;

        /* Make sure delta_t will not be too large, so that bucket will not
         * wrap around below. */
        delta_t = MIN(delta_t, meter->max_delta_t);

        for (int m = 0; m < meter->n_bands; m++) {
            band = &meter->bands[m];
            /* Update band's bucket.  We can't just use atomic add here,
             * because we should never add above the max capacity. */
            atomic_sat_add(&band->bucket, delta_t * band->rate,
                           band->burst_size * 1000ULL);
        }
    }

    /* Update meter stats. */
    atomic_add_relaxed(&meter->packet_count, cnt, &old);
    bytes = 0;
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
        bytes += dp_packet_size(packet);
    }
    atomic_add_relaxed(&meter->byte_count, bytes, &old);

    /* Meters can operate in terms of packets per second or kilobits per
     * second. */
    if (meter->flags & OFPMF13_PKTPS) {
        /* Rate in packets/second, bucket 1/1000 packets.
         * msec * packets/sec = 1/1000 packets. */
        volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
    } else {
        /* Rate in kbps, bucket in bits.
         * msec * kbps = bits */
        volume = bytes * 8;
    }

    /* Find the band hit with the highest rate for each packet (if any). */
    for (int m = 0; m < meter->n_bands; m++) {
        band = &meter->bands[m];

        /* Drain the bucket for all the packets, if possible. */
        if (atomic_bound_sub(&band->bucket, volume, 0)) {
            continue;
        }

        /* Band limit hit, must process packet-by-packet. */
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
            uint64_t packet_volume = (meter->flags & OFPMF13_PKTPS)
                                     ? 1000 : (dp_packet_size(packet) * 8);

            if (!atomic_bound_sub(&band->bucket, packet_volume, 0)) {
                /* Update the exceeding band for the exceeding packet.
                 * Only one band will be fired by a packet, and that can
                 * be different for each packet. */
                if (band->rate > exceeded_rate[i]) {
                    exceeded_rate[i] = band->rate;
                    exceeded_band[i] = m;
                    exceeded = true;
                }
            }
        }
    }

    /* No need to iterate over packets if there are no drops. */
    if (!exceeded) {
        return;
    }

    /* Fire the highest rate band exceeded by each packet, and drop
     * packets if needed. */

    memset(band_packets, 0, sizeof band_packets);
    memset(band_bytes, 0, sizeof band_bytes);

    size_t j;
    DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
        uint32_t m = exceeded_band[j];

        if (m != UINT32_MAX) {
            /* Meter drop packet. */
            band_packets[m]++;
            band_bytes[m] += dp_packet_size(packet);
            dp_packet_delete(packet);
        } else {
            /* Meter accepts packet. */
            dp_packet_batch_refill(packets_, packet, j);
        }
    }

    /* Publish the per-band drop totals gathered above. */
    for (int m = 0; m < meter->n_bands; m++) {
        if (!band_packets[m]) {
            continue;
        }
        band = &meter->bands[m];
        atomic_add_relaxed(&band->packet_count, band_packets[m], &old);
        atomic_add_relaxed(&band->byte_count, band_bytes[m], &old);
        COVERAGE_ADD(datapath_drop_meter, band_packets[m]);
    }
}
6416 | | |
6417 | | /* Meter set/get/del processing is still single-threaded. */ |
6418 | | static int |
6419 | | dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id, |
6420 | | struct ofputil_meter_config *config) |
6421 | 0 | { |
6422 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
6423 | 0 | uint32_t mid = meter_id.uint32; |
6424 | 0 | struct dp_meter *meter; |
6425 | 0 | int i; |
6426 | |
|
6427 | 0 | if (mid >= MAX_METERS) { |
6428 | 0 | return EFBIG; /* Meter_id out of range. */ |
6429 | 0 | } |
6430 | | |
6431 | 0 | if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) { |
6432 | 0 | return EBADF; /* Unsupported flags set */ |
6433 | 0 | } |
6434 | | |
6435 | 0 | if (config->n_bands > MAX_BANDS) { |
6436 | 0 | return EINVAL; |
6437 | 0 | } |
6438 | | |
6439 | 0 | for (i = 0; i < config->n_bands; ++i) { |
6440 | 0 | switch (config->bands[i].type) { |
6441 | 0 | case OFPMBT13_DROP: |
6442 | 0 | break; |
6443 | 0 | default: |
6444 | 0 | return ENODEV; /* Unsupported band type */ |
6445 | 0 | } |
6446 | 0 | } |
6447 | | |
6448 | | /* Allocate meter */ |
6449 | 0 | meter = xzalloc(sizeof *meter |
6450 | 0 | + config->n_bands * sizeof(struct dp_meter_band)); |
6451 | |
|
6452 | 0 | meter->flags = config->flags; |
6453 | 0 | meter->n_bands = config->n_bands; |
6454 | 0 | meter->max_delta_t = 0; |
6455 | 0 | meter->id = mid; |
6456 | 0 | atomic_init(&meter->used, time_msec()); |
6457 | | |
6458 | | /* set up bands */ |
6459 | 0 | for (i = 0; i < config->n_bands; ++i) { |
6460 | 0 | uint32_t band_max_delta_t; |
6461 | 0 | uint64_t bucket_size; |
6462 | | |
6463 | | /* Set burst size to a workable value if none specified. */ |
6464 | 0 | if (config->bands[i].burst_size == 0) { |
6465 | 0 | config->bands[i].burst_size = config->bands[i].rate; |
6466 | 0 | } |
6467 | |
|
6468 | 0 | meter->bands[i].rate = config->bands[i].rate; |
6469 | 0 | meter->bands[i].burst_size = config->bands[i].burst_size; |
6470 | | /* Start with a full bucket. */ |
6471 | 0 | bucket_size = meter->bands[i].burst_size * 1000ULL; |
6472 | 0 | atomic_init(&meter->bands[i].bucket, bucket_size); |
6473 | | |
6474 | | /* Figure out max delta_t that is enough to fill any bucket. */ |
6475 | 0 | band_max_delta_t = bucket_size / meter->bands[i].rate; |
6476 | 0 | if (band_max_delta_t > meter->max_delta_t) { |
6477 | 0 | meter->max_delta_t = band_max_delta_t; |
6478 | 0 | } |
6479 | 0 | } |
6480 | |
|
6481 | 0 | ovs_mutex_lock(&dp->meters_lock); |
6482 | |
|
6483 | 0 | dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any. */ |
6484 | 0 | dp_meter_attach(&dp->meters, meter); |
6485 | |
|
6486 | 0 | ovs_mutex_unlock(&dp->meters_lock); |
6487 | |
|
6488 | 0 | return 0; |
6489 | 0 | } |
6490 | | |
6491 | | static int |
6492 | | dpif_netdev_meter_get(const struct dpif *dpif, |
6493 | | ofproto_meter_id meter_id_, |
6494 | | struct ofputil_meter_stats *stats, uint16_t n_bands) |
6495 | 0 | { |
6496 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
6497 | 0 | uint32_t meter_id = meter_id_.uint32; |
6498 | 0 | struct dp_meter *meter; |
6499 | |
|
6500 | 0 | if (meter_id >= MAX_METERS) { |
6501 | 0 | return EFBIG; |
6502 | 0 | } |
6503 | | |
6504 | 0 | meter = dp_meter_lookup(&dp->meters, meter_id); |
6505 | 0 | if (!meter) { |
6506 | 0 | return ENOENT; |
6507 | 0 | } |
6508 | | |
6509 | 0 | if (stats) { |
6510 | 0 | int i = 0; |
6511 | |
|
6512 | 0 | atomic_read_relaxed(&meter->packet_count, &stats->packet_in_count); |
6513 | 0 | atomic_read_relaxed(&meter->byte_count, &stats->byte_in_count); |
6514 | |
|
6515 | 0 | for (i = 0; i < n_bands && i < meter->n_bands; ++i) { |
6516 | 0 | atomic_read_relaxed(&meter->bands[i].packet_count, |
6517 | 0 | &stats->bands[i].packet_count); |
6518 | 0 | atomic_read_relaxed(&meter->bands[i].byte_count, |
6519 | 0 | &stats->bands[i].byte_count); |
6520 | 0 | } |
6521 | 0 | stats->n_bands = i; |
6522 | 0 | } |
6523 | |
|
6524 | 0 | return 0; |
6525 | 0 | } |
6526 | | |
6527 | | static int |
6528 | | dpif_netdev_meter_del(struct dpif *dpif, |
6529 | | ofproto_meter_id meter_id_, |
6530 | | struct ofputil_meter_stats *stats, uint16_t n_bands) |
6531 | 0 | { |
6532 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
6533 | 0 | int error; |
6534 | |
|
6535 | 0 | error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands); |
6536 | 0 | if (!error) { |
6537 | 0 | uint32_t meter_id = meter_id_.uint32; |
6538 | |
|
6539 | 0 | ovs_mutex_lock(&dp->meters_lock); |
6540 | 0 | dp_meter_detach_free(&dp->meters, meter_id); |
6541 | 0 | ovs_mutex_unlock(&dp->meters_lock); |
6542 | 0 | } |
6543 | 0 | return error; |
6544 | 0 | } |
6545 | | |
6546 | | |
6547 | | static void |
6548 | | dpif_netdev_disable_upcall(struct dpif *dpif) |
6549 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
6550 | 0 | { |
6551 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
6552 | 0 | dp_netdev_disable_upcall(dp); |
6553 | 0 | } |
6554 | | |
/* Re-enables upcalls for 'dp' by releasing 'dp->upcall_rwlock'.  Per the
 * OVS_RELEASES annotation the caller must hold the lock on entry;
 * dp_netdev_disable_upcall() is the function that takes it. */
static void
dp_netdev_enable_upcall(struct dp_netdev *dp)
    OVS_RELEASES(dp->upcall_rwlock)
{
    fat_rwlock_unlock(&dp->upcall_rwlock);
}
6561 | | |
6562 | | static void |
6563 | | dpif_netdev_enable_upcall(struct dpif *dpif) |
6564 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
6565 | 0 | { |
6566 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
6567 | 0 | dp_netdev_enable_upcall(dp); |
6568 | 0 | } |
6569 | | |
/* Marks the reload of 'pmd' as complete.
 *
 * Resets the 'wait_for_reload' and 'reload_tx_qid' request flags and
 * records the current value of 'pmd->reload_seq' in
 * 'pmd->last_reload_seq'.  'pmd->reload' is cleared last, with release
 * ordering, so that a thread which reads it as false with acquire ordering
 * also observes the stores made before it. */
static void
dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
{
    atomic_store_relaxed(&pmd->wait_for_reload, false);
    atomic_store_relaxed(&pmd->reload_tx_qid, false);
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
    /* Must stay the final store: it publishes everything above. */
    atomic_store_explicit(&pmd->reload, false, memory_order_release);
}
6578 | | |
6579 | | /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns |
6580 | | * the pointer if succeeds, otherwise, NULL (it can return NULL even if |
6581 | | * 'core_id' is NON_PMD_CORE_ID). |
6582 | | * |
6583 | | * Caller must unrefs the returned reference. */ |
6584 | | static struct dp_netdev_pmd_thread * |
6585 | | dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id) |
6586 | 0 | { |
6587 | 0 | struct dp_netdev_pmd_thread *pmd; |
6588 | |
|
6589 | 0 | CMAP_FOR_EACH_WITH_HASH (pmd, node, hash_int(core_id, 0), |
6590 | 0 | &dp->poll_threads) { |
6591 | 0 | if (pmd->core_id == core_id) { |
6592 | 0 | return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL; |
6593 | 0 | } |
6594 | 0 | } |
6595 | | |
6596 | 0 | return NULL; |
6597 | 0 | } |
6598 | | |
6599 | | /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */ |
6600 | | static void |
6601 | | dp_netdev_set_nonpmd(struct dp_netdev *dp) |
6602 | | OVS_REQ_WRLOCK(dp->port_rwlock) |
6603 | 0 | { |
6604 | 0 | struct dp_netdev_pmd_thread *non_pmd; |
6605 | |
|
6606 | 0 | non_pmd = xzalloc(sizeof *non_pmd); |
6607 | 0 | dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC); |
6608 | 0 | } |
6609 | | |
6610 | | /* Caller must have valid pointer to 'pmd'. */ |
6611 | | static bool |
6612 | | dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd) |
6613 | 0 | { |
6614 | 0 | return ovs_refcount_try_ref_rcu(&pmd->ref_cnt); |
6615 | 0 | } |
6616 | | |
6617 | | static void |
6618 | | dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd) |
6619 | 0 | { |
6620 | 0 | if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) { |
6621 | 0 | ovsrcu_postpone(dp_netdev_destroy_pmd, pmd); |
6622 | 0 | } |
6623 | 0 | } |
6624 | | |
6625 | | /* Given cmap position 'pos', tries to ref the next node. If try_ref() |
6626 | | * fails, keeps checking for next node until reaching the end of cmap. |
6627 | | * |
6628 | | * Caller must unrefs the returned reference. */ |
6629 | | static struct dp_netdev_pmd_thread * |
6630 | | dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos) |
6631 | 0 | { |
6632 | 0 | struct dp_netdev_pmd_thread *next; |
6633 | |
|
6634 | 0 | do { |
6635 | 0 | struct cmap_node *node; |
6636 | |
|
6637 | 0 | node = cmap_next_position(&dp->poll_threads, pos); |
6638 | 0 | next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node) |
6639 | 0 | : NULL; |
6640 | 0 | } while (next && !dp_netdev_pmd_try_ref(next)); |
6641 | |
|
6642 | 0 | return next; |
6643 | 0 | } |
6644 | | |
/* Initializes 'pmd' as the poll thread of 'dp' for core 'core_id' on NUMA
 * node 'numa_id' and inserts it into 'dp->poll_threads', hashed by
 * 'core_id'.
 *
 * Sets up the reference count, reload/exit state, mutexes, flow tables,
 * timing deadlines, port maps and perf stats.  For NON_PMD_CORE_ID the flow
 * cache and the static tx queue id are set up here as well. */
static void
dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
                        unsigned core_id, int numa_id)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    pmd->dp = dp;
    pmd->core_id = core_id;
    pmd->numa_id = numa_id;
    pmd->need_reload = false;
    pmd->n_output_batches = 0;

    ovs_refcount_init(&pmd->ref_cnt);
    atomic_init(&pmd->exit, false);
    /* Start with 'last_reload_seq' equal to the fresh sequence value, i.e.
     * with no reload pending. */
    pmd->reload_seq = seq_create();
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
    atomic_init(&pmd->reload, false);
    ovs_mutex_init(&pmd->flow_mutex);
    ovs_mutex_init(&pmd->port_mutex);
    ovs_mutex_init(&pmd->bond_mutex);
    cmap_init(&pmd->flow_table);
    cmap_init(&pmd->classifiers);
    cmap_init(&pmd->simple_match_table);
    ccmap_init(&pmd->n_flows);
    ccmap_init(&pmd->n_simple_flows);
    pmd->ctx.last_rxq = NULL;
    /* Refresh 'pmd->ctx.now' first: the deadlines below are offsets from
     * it. */
    pmd_thread_ctx_time_update(pmd);
    pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
    pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
    pmd->busy_cycles_intrvl = xzalloc(PMD_INTERVAL_MAX *
                                      sizeof *pmd->busy_cycles_intrvl);
    hmap_init(&pmd->poll_list);
    hmap_init(&pmd->tx_ports);
    hmap_init(&pmd->tnl_port_cache);
    hmap_init(&pmd->send_port_cache);
    cmap_init(&pmd->tx_bonds);

    pmd_init_max_sleep(dp, pmd);

    /* init the 'flow_cache' since there is no
     * actual thread created for NON_PMD_CORE_ID. */
    if (core_id == NON_PMD_CORE_ID) {
        dfc_cache_init(&pmd->flow_cache);
        pmd_alloc_static_tx_qid(pmd);
    }
    pmd_perf_stats_init(&pmd->perf_stats);
    /* Inserted last, after all of the initialization above. */
    cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
                hash_int(core_id, 0));
}
6695 | | |
6696 | | static void |
6697 | | dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd) |
6698 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
6699 | 0 | { |
6700 | 0 | struct dpcls *cls; |
6701 | |
|
6702 | 0 | dp_netdev_pmd_flow_flush(pmd); |
6703 | 0 | hmap_destroy(&pmd->send_port_cache); |
6704 | 0 | hmap_destroy(&pmd->tnl_port_cache); |
6705 | 0 | hmap_destroy(&pmd->tx_ports); |
6706 | 0 | cmap_destroy(&pmd->tx_bonds); |
6707 | 0 | hmap_destroy(&pmd->poll_list); |
6708 | 0 | free(pmd->busy_cycles_intrvl); |
6709 | | /* All flows (including their dpcls_rules) have been deleted already */ |
6710 | 0 | CMAP_FOR_EACH (cls, node, &pmd->classifiers) { |
6711 | 0 | dpcls_destroy(cls); |
6712 | 0 | ovsrcu_postpone(free, cls); |
6713 | 0 | } |
6714 | 0 | cmap_destroy(&pmd->classifiers); |
6715 | 0 | cmap_destroy(&pmd->flow_table); |
6716 | 0 | cmap_destroy(&pmd->simple_match_table); |
6717 | 0 | ccmap_destroy(&pmd->n_flows); |
6718 | 0 | ccmap_destroy(&pmd->n_simple_flows); |
6719 | 0 | ovs_mutex_destroy(&pmd->flow_mutex); |
6720 | 0 | seq_destroy(pmd->reload_seq); |
6721 | 0 | ovs_mutex_destroy(&pmd->port_mutex); |
6722 | 0 | ovs_mutex_destroy(&pmd->bond_mutex); |
6723 | 0 | free(pmd); |
6724 | 0 | } |
6725 | | |
6726 | | /* Stops the pmd thread, removes it from the 'dp->poll_threads', |
6727 | | * and unrefs the struct. */ |
6728 | | static void |
6729 | | dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd) |
6730 | 0 | { |
6731 | | /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize, |
6732 | | * but extra cleanup is necessary */ |
6733 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
6734 | 0 | ovs_mutex_lock(&dp->non_pmd_mutex); |
6735 | 0 | dfc_cache_uninit(&pmd->flow_cache); |
6736 | 0 | pmd_free_cached_ports(pmd); |
6737 | 0 | pmd_free_static_tx_qid(pmd); |
6738 | 0 | ovs_mutex_unlock(&dp->non_pmd_mutex); |
6739 | 0 | } else { |
6740 | 0 | atomic_store_relaxed(&pmd->exit, true); |
6741 | 0 | dp_netdev_reload_pmd__(pmd); |
6742 | 0 | xpthread_join(pmd->thread, NULL); |
6743 | 0 | } |
6744 | |
|
6745 | 0 | dp_netdev_pmd_clear_ports(pmd); |
6746 | | |
6747 | | /* Purges the 'pmd''s flows after stopping the thread, but before |
6748 | | * destroying the flows, so that the flow stats can be collected. */ |
6749 | 0 | if (dp->dp_purge_cb) { |
6750 | 0 | dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id); |
6751 | 0 | } |
6752 | 0 | cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0)); |
6753 | 0 | dp_netdev_pmd_unref(pmd); |
6754 | 0 | } |
6755 | | |
6756 | | /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd |
6757 | | * thread. */ |
6758 | | static void |
6759 | | dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd) |
6760 | 0 | { |
6761 | 0 | struct dp_netdev_pmd_thread *pmd; |
6762 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
6763 | 0 | size_t k = 0, n_pmds; |
6764 | |
|
6765 | 0 | n_pmds = cmap_count(&dp->poll_threads); |
6766 | 0 | pmd_list = xcalloc(n_pmds, sizeof *pmd_list); |
6767 | |
|
6768 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6769 | 0 | if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) { |
6770 | 0 | continue; |
6771 | 0 | } |
6772 | | /* We cannot call dp_netdev_del_pmd(), since it alters |
6773 | | * 'dp->poll_threads' (while we're iterating it) and it |
6774 | | * might quiesce. */ |
6775 | 0 | ovs_assert(k < n_pmds); |
6776 | 0 | pmd_list[k++] = pmd; |
6777 | 0 | } |
6778 | |
|
6779 | 0 | for (size_t i = 0; i < k; i++) { |
6780 | 0 | dp_netdev_del_pmd(dp, pmd_list[i]); |
6781 | 0 | } |
6782 | 0 | free(pmd_list); |
6783 | 0 | } |
6784 | | |
6785 | | /* Deletes all rx queues from pmd->poll_list and all the ports from |
6786 | | * pmd->tx_ports. */ |
6787 | | static void |
6788 | | dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd) |
6789 | 0 | { |
6790 | 0 | struct rxq_poll *poll; |
6791 | 0 | struct tx_port *port; |
6792 | 0 | struct tx_bond *tx; |
6793 | |
|
6794 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
6795 | 0 | HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) { |
6796 | 0 | free(poll); |
6797 | 0 | } |
6798 | 0 | HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) { |
6799 | 0 | free(port->txq_pkts); |
6800 | 0 | free(port); |
6801 | 0 | } |
6802 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
6803 | |
|
6804 | 0 | ovs_mutex_lock(&pmd->bond_mutex); |
6805 | 0 | CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) { |
6806 | 0 | cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id)); |
6807 | 0 | ovsrcu_postpone(free, tx); |
6808 | 0 | } |
6809 | 0 | ovs_mutex_unlock(&pmd->bond_mutex); |
6810 | 0 | } |
6811 | | |
6812 | | /* Adds rx queue to poll_list of PMD thread, if it's not there already. */ |
6813 | | static void |
6814 | | dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd, |
6815 | | struct dp_netdev_rxq *rxq) |
6816 | | OVS_REQUIRES(pmd->port_mutex) |
6817 | 0 | { |
6818 | 0 | int qid = netdev_rxq_get_queue_id(rxq->rx); |
6819 | 0 | uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid); |
6820 | 0 | struct rxq_poll *poll; |
6821 | |
|
6822 | 0 | HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) { |
6823 | 0 | if (poll->rxq == rxq) { |
6824 | | /* 'rxq' is already polled by this thread. Do nothing. */ |
6825 | 0 | return; |
6826 | 0 | } |
6827 | 0 | } |
6828 | | |
6829 | 0 | poll = xmalloc(sizeof *poll); |
6830 | 0 | poll->rxq = rxq; |
6831 | 0 | hmap_insert(&pmd->poll_list, &poll->node, hash); |
6832 | |
|
6833 | 0 | pmd->need_reload = true; |
6834 | 0 | } |
6835 | | |
6836 | | /* Delete 'poll' from poll_list of PMD thread. */ |
6837 | | static void |
6838 | | dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd, |
6839 | | struct rxq_poll *poll) |
6840 | | OVS_REQUIRES(pmd->port_mutex) |
6841 | 0 | { |
6842 | 0 | hmap_remove(&pmd->poll_list, &poll->node); |
6843 | 0 | free(poll); |
6844 | |
|
6845 | 0 | pmd->need_reload = true; |
6846 | 0 | } |
6847 | | |
6848 | | /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the |
6849 | | * changes to take effect. */ |
6850 | | static void |
6851 | | dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, |
6852 | | struct dp_netdev_port *port) |
6853 | | OVS_REQUIRES(pmd->port_mutex) |
6854 | 0 | { |
6855 | 0 | struct tx_port *tx; |
6856 | |
|
6857 | 0 | tx = tx_port_lookup(&pmd->tx_ports, port->port_no); |
6858 | 0 | if (tx) { |
6859 | | /* 'port' is already on this thread tx cache. Do nothing. */ |
6860 | 0 | return; |
6861 | 0 | } |
6862 | | |
6863 | 0 | tx = xzalloc(sizeof *tx); |
6864 | |
|
6865 | 0 | tx->port = port; |
6866 | 0 | tx->qid = -1; |
6867 | 0 | tx->flush_time = 0LL; |
6868 | 0 | dp_packet_batch_init(&tx->output_pkts); |
6869 | |
|
6870 | 0 | if (tx->port->txq_mode == TXQ_MODE_XPS_HASH) { |
6871 | 0 | int i, n_txq = netdev_n_txq(tx->port->netdev); |
6872 | |
|
6873 | 0 | tx->txq_pkts = xzalloc(n_txq * sizeof *tx->txq_pkts); |
6874 | 0 | for (i = 0; i < n_txq; i++) { |
6875 | 0 | dp_packet_batch_init(&tx->txq_pkts[i]); |
6876 | 0 | } |
6877 | 0 | } |
6878 | |
|
6879 | 0 | hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no)); |
6880 | 0 | pmd->need_reload = true; |
6881 | 0 | } |
6882 | | |
6883 | | /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the |
6884 | | * changes to take effect. */ |
6885 | | static void |
6886 | | dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, |
6887 | | struct tx_port *tx) |
6888 | | OVS_REQUIRES(pmd->port_mutex) |
6889 | 0 | { |
6890 | 0 | hmap_remove(&pmd->tx_ports, &tx->node); |
6891 | 0 | free(tx->txq_pkts); |
6892 | 0 | free(tx); |
6893 | 0 | pmd->need_reload = true; |
6894 | 0 | } |
6895 | | |
6896 | | /* Add bond to the tx bond cmap of 'pmd'. */ |
6897 | | static void |
6898 | | dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, |
6899 | | struct tx_bond *bond, bool update) |
6900 | | OVS_EXCLUDED(pmd->bond_mutex) |
6901 | 0 | { |
6902 | 0 | struct tx_bond *tx; |
6903 | |
|
6904 | 0 | ovs_mutex_lock(&pmd->bond_mutex); |
6905 | 0 | tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id); |
6906 | |
|
6907 | 0 | if (tx && !update) { |
6908 | | /* It's not an update and the entry already exists. Do nothing. */ |
6909 | 0 | goto unlock; |
6910 | 0 | } |
6911 | | |
6912 | 0 | if (tx) { |
6913 | 0 | struct tx_bond *new_tx = xmemdup(bond, sizeof *bond); |
6914 | | |
6915 | | /* Copy the stats for each bucket. */ |
6916 | 0 | for (int i = 0; i < BOND_BUCKETS; i++) { |
6917 | 0 | uint64_t n_packets, n_bytes; |
6918 | |
|
6919 | 0 | atomic_read_relaxed(&tx->member_buckets[i].n_packets, &n_packets); |
6920 | 0 | atomic_read_relaxed(&tx->member_buckets[i].n_bytes, &n_bytes); |
6921 | 0 | atomic_init(&new_tx->member_buckets[i].n_packets, n_packets); |
6922 | 0 | atomic_init(&new_tx->member_buckets[i].n_bytes, n_bytes); |
6923 | 0 | } |
6924 | 0 | cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node, |
6925 | 0 | hash_bond_id(bond->bond_id)); |
6926 | 0 | ovsrcu_postpone(free, tx); |
6927 | 0 | } else { |
6928 | 0 | tx = xmemdup(bond, sizeof *bond); |
6929 | 0 | cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id)); |
6930 | 0 | } |
6931 | 0 | unlock: |
6932 | 0 | ovs_mutex_unlock(&pmd->bond_mutex); |
6933 | 0 | } |
6934 | | |
6935 | | /* Delete bond from the tx bond cmap of 'pmd'. */ |
6936 | | static void |
6937 | | dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, |
6938 | | uint32_t bond_id) |
6939 | | OVS_EXCLUDED(pmd->bond_mutex) |
6940 | 0 | { |
6941 | 0 | struct tx_bond *tx; |
6942 | |
|
6943 | 0 | ovs_mutex_lock(&pmd->bond_mutex); |
6944 | 0 | tx = tx_bond_lookup(&pmd->tx_bonds, bond_id); |
6945 | 0 | if (tx) { |
6946 | 0 | cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id)); |
6947 | 0 | ovsrcu_postpone(free, tx); |
6948 | 0 | } |
6949 | 0 | ovs_mutex_unlock(&pmd->bond_mutex); |
6950 | 0 | } |
6951 | | |
/* Returns a newly allocated copy of the string "<built-in>", obtained
 * from xstrdup(). */
static char *
dpif_netdev_get_datapath_version(void)
{
    char *version = xstrdup("<built-in>");

    return version;
}
6957 | | |
6958 | | static void |
6959 | | dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size, |
6960 | | uint16_t tcp_flags, long long now) |
6961 | 0 | { |
6962 | 0 | uint16_t flags; |
6963 | |
|
6964 | 0 | atomic_store_relaxed(&netdev_flow->stats.used, now); |
6965 | 0 | non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt); |
6966 | 0 | non_atomic_ullong_add(&netdev_flow->stats.byte_count, size); |
6967 | 0 | atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags); |
6968 | 0 | flags |= tcp_flags; |
6969 | 0 | atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags); |
6970 | 0 | } |
6971 | | |
6972 | | static int |
6973 | | dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, |
6974 | | struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid, |
6975 | | enum dpif_upcall_type type, const struct nlattr *userdata, |
6976 | | struct ofpbuf *actions, struct ofpbuf *put_actions) |
6977 | 0 | { |
6978 | 0 | struct dp_netdev *dp = pmd->dp; |
6979 | |
|
6980 | 0 | if (OVS_UNLIKELY(!dp->upcall_cb)) { |
6981 | 0 | return ENODEV; |
6982 | 0 | } |
6983 | | |
6984 | 0 | if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) { |
6985 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
6986 | 0 | char *packet_str; |
6987 | 0 | struct ofpbuf key; |
6988 | 0 | struct odp_flow_key_parms odp_parms = { |
6989 | 0 | .flow = flow, |
6990 | 0 | .mask = wc ? &wc->masks : NULL, |
6991 | 0 | .support = dp_netdev_support, |
6992 | 0 | }; |
6993 | |
|
6994 | 0 | ofpbuf_init(&key, 0); |
6995 | 0 | odp_flow_key_from_flow(&odp_parms, &key); |
6996 | 0 | packet_str = ofp_dp_packet_to_string(packet_); |
6997 | |
|
6998 | 0 | odp_flow_key_format(key.data, key.size, &ds); |
6999 | |
|
7000 | 0 | VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name, |
7001 | 0 | dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str); |
7002 | |
|
7003 | 0 | ofpbuf_uninit(&key); |
7004 | 0 | free(packet_str); |
7005 | |
|
7006 | 0 | ds_destroy(&ds); |
7007 | 0 | } |
7008 | |
|
7009 | 0 | if (type != DPIF_UC_MISS) { |
7010 | 0 | dp_packet_ol_send_prepare(packet_, 0); |
7011 | 0 | } |
7012 | |
|
7013 | 0 | return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata, |
7014 | 0 | actions, wc, put_actions, dp->upcall_aux); |
7015 | 0 | } |
7016 | | |
7017 | | static inline uint32_t |
7018 | | dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, |
7019 | | const struct miniflow *mf) |
7020 | 0 | { |
7021 | 0 | uint32_t hash, recirc_depth; |
7022 | |
|
7023 | 0 | if (OVS_LIKELY(dp_packet_rss_valid(packet))) { |
7024 | 0 | hash = dp_packet_get_rss_hash(packet); |
7025 | 0 | } else { |
7026 | 0 | hash = miniflow_hash_5tuple(mf, 0); |
7027 | 0 | dp_packet_set_rss_hash(packet, hash); |
7028 | 0 | } |
7029 | | |
7030 | | /* The RSS hash must account for the recirculation depth to avoid |
7031 | | * collisions in the exact match cache */ |
7032 | 0 | recirc_depth = *recirc_depth_get_unsafe(); |
7033 | 0 | if (OVS_UNLIKELY(recirc_depth)) { |
7034 | 0 | hash = hash_finish(hash, recirc_depth); |
7035 | 0 | } |
7036 | 0 | return hash; |
7037 | 0 | } |
7038 | | |
7039 | | struct packet_batch_per_flow { |
7040 | | unsigned int byte_count; |
7041 | | uint16_t tcp_flags; |
7042 | | struct dp_netdev_flow *flow; |
7043 | | |
7044 | | struct dp_packet_batch array; |
7045 | | }; |
7046 | | |
7047 | | static inline void |
7048 | | packet_batch_per_flow_update(struct packet_batch_per_flow *batch, |
7049 | | struct dp_packet *packet, |
7050 | | uint16_t tcp_flags) |
7051 | 0 | { |
7052 | 0 | batch->byte_count += dp_packet_size(packet); |
7053 | 0 | batch->tcp_flags |= tcp_flags; |
7054 | 0 | dp_packet_batch_add(&batch->array, packet); |
7055 | 0 | } |
7056 | | |
7057 | | static inline void |
7058 | | packet_batch_per_flow_init(struct packet_batch_per_flow *batch, |
7059 | | struct dp_netdev_flow *flow) |
7060 | 0 | { |
7061 | 0 | flow->batch = batch; |
7062 | |
|
7063 | 0 | batch->flow = flow; |
7064 | 0 | dp_packet_batch_init(&batch->array); |
7065 | 0 | batch->byte_count = 0; |
7066 | 0 | batch->tcp_flags = 0; |
7067 | 0 | } |
7068 | | |
7069 | | static inline void |
7070 | | packet_batch_per_flow_execute(struct packet_batch_per_flow *batch, |
7071 | | struct dp_netdev_pmd_thread *pmd) |
7072 | 0 | { |
7073 | 0 | struct dp_netdev_actions *actions; |
7074 | 0 | struct dp_netdev_flow *flow = batch->flow; |
7075 | |
|
7076 | 0 | dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array), |
7077 | 0 | batch->byte_count, |
7078 | 0 | batch->tcp_flags, pmd->ctx.now / 1000); |
7079 | |
|
7080 | 0 | actions = dp_netdev_flow_get_actions(flow); |
7081 | |
|
7082 | 0 | dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow, |
7083 | 0 | actions->actions, actions->size); |
7084 | 0 | } |
7085 | | |
7086 | | static inline void |
7087 | | dp_netdev_queue_batches(struct dp_packet *pkt, |
7088 | | struct dp_netdev_flow *flow, uint16_t tcp_flags, |
7089 | | struct packet_batch_per_flow *batches, |
7090 | | size_t *n_batches) |
7091 | 0 | { |
7092 | 0 | struct packet_batch_per_flow *batch = flow->batch; |
7093 | |
|
7094 | 0 | if (OVS_UNLIKELY(!batch)) { |
7095 | 0 | batch = &batches[(*n_batches)++]; |
7096 | 0 | packet_batch_per_flow_init(batch, flow); |
7097 | 0 | } |
7098 | |
|
7099 | 0 | packet_batch_per_flow_update(batch, pkt, tcp_flags); |
7100 | 0 | } |
7101 | | |
7102 | | static inline void |
7103 | | packet_enqueue_to_flow_map(struct dp_packet *packet, |
7104 | | struct dp_netdev_flow *flow, |
7105 | | uint16_t tcp_flags, |
7106 | | struct dp_packet_flow_map *flow_map, |
7107 | | size_t index) |
7108 | 0 | { |
7109 | 0 | struct dp_packet_flow_map *map = &flow_map[index]; |
7110 | 0 | map->flow = flow; |
7111 | 0 | map->packet = packet; |
7112 | 0 | map->tcp_flags = tcp_flags; |
7113 | 0 | } |
7114 | | |
7115 | | /* SMC lookup function for a batch of packets. |
7116 | | * By doing batching SMC lookup, we can use prefetch |
7117 | | * to hide memory access latency. |
7118 | | */ |
7119 | | static inline void |
7120 | | smc_lookup_batch(struct dp_netdev_pmd_thread *pmd, |
7121 | | struct netdev_flow_key *keys, |
7122 | | struct netdev_flow_key **missed_keys, |
7123 | | struct dp_packet_batch *packets_, |
7124 | | const int cnt, |
7125 | | struct dp_packet_flow_map *flow_map, |
7126 | | uint8_t *index_map) |
7127 | 0 | { |
7128 | 0 | int i; |
7129 | 0 | struct dp_packet *packet; |
7130 | 0 | size_t n_smc_hit = 0, n_missed = 0; |
7131 | 0 | struct dfc_cache *cache = &pmd->flow_cache; |
7132 | 0 | struct smc_cache *smc_cache = &cache->smc_cache; |
7133 | 0 | const struct cmap_node *flow_node; |
7134 | 0 | int recv_idx; |
7135 | 0 | uint16_t tcp_flags; |
7136 | | |
7137 | | /* Prefetch buckets for all packets */ |
7138 | 0 | for (i = 0; i < cnt; i++) { |
7139 | 0 | OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]); |
7140 | 0 | } |
7141 | |
|
7142 | 0 | DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) { |
7143 | 0 | struct dp_netdev_flow *flow = NULL; |
7144 | 0 | flow_node = smc_entry_get(pmd, keys[i].hash); |
7145 | 0 | bool hit = false; |
7146 | | /* Get the original order of this packet in received batch. */ |
7147 | 0 | recv_idx = index_map[i]; |
7148 | |
|
7149 | 0 | if (OVS_LIKELY(flow_node != NULL)) { |
7150 | 0 | CMAP_NODE_FOR_EACH (flow, node, flow_node) { |
7151 | | /* Since we dont have per-port megaflow to check the port |
7152 | | * number, we need to verify that the input ports match. */ |
7153 | 0 | if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) && |
7154 | 0 | flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) { |
7155 | 0 | tcp_flags = miniflow_get_tcp_flags(&keys[i].mf); |
7156 | | |
7157 | | /* SMC hit and emc miss, we insert into EMC */ |
7158 | 0 | keys[i].len = |
7159 | 0 | netdev_flow_key_size(miniflow_n_values(&keys[i].mf)); |
7160 | 0 | emc_probabilistic_insert(pmd, &keys[i], flow); |
7161 | | /* Add these packets into the flow map in the same order |
7162 | | * as received. |
7163 | | */ |
7164 | 0 | packet_enqueue_to_flow_map(packet, flow, tcp_flags, |
7165 | 0 | flow_map, recv_idx); |
7166 | 0 | n_smc_hit++; |
7167 | 0 | hit = true; |
7168 | 0 | break; |
7169 | 0 | } |
7170 | 0 | } |
7171 | 0 | if (hit) { |
7172 | 0 | continue; |
7173 | 0 | } |
7174 | 0 | } |
7175 | | |
7176 | | /* SMC missed. Group missed packets together at |
7177 | | * the beginning of the 'packets' array. */ |
7178 | 0 | dp_packet_batch_refill(packets_, packet, i); |
7179 | | |
7180 | | /* Preserve the order of packet for flow batching. */ |
7181 | 0 | index_map[n_missed] = recv_idx; |
7182 | | |
7183 | | /* Put missed keys to the pointer arrays return to the caller */ |
7184 | 0 | missed_keys[n_missed++] = &keys[i]; |
7185 | 0 | } |
7186 | |
|
7187 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit); |
7188 | 0 | } |
7189 | | |
7190 | | struct dp_netdev_flow * |
7191 | | smc_lookup_single(struct dp_netdev_pmd_thread *pmd, |
7192 | | struct dp_packet *packet, |
7193 | | struct netdev_flow_key *key) |
7194 | 0 | { |
7195 | 0 | const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash); |
7196 | |
|
7197 | 0 | if (OVS_LIKELY(flow_node != NULL)) { |
7198 | 0 | struct dp_netdev_flow *flow = NULL; |
7199 | |
|
7200 | 0 | CMAP_NODE_FOR_EACH (flow, node, flow_node) { |
7201 | | /* Since we dont have per-port megaflow to check the port |
7202 | | * number, we need to verify that the input ports match. */ |
7203 | 0 | if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) && |
7204 | 0 | flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) { |
7205 | |
|
7206 | 0 | return (void *) flow; |
7207 | 0 | } |
7208 | 0 | } |
7209 | 0 | } |
7210 | | |
7211 | 0 | return NULL; |
7212 | 0 | } |
7213 | | |
7214 | | static inline int |
7215 | | dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd, |
7216 | | struct dp_packet *packet, |
7217 | | struct dp_netdev_flow **flow) |
7218 | 0 | { |
7219 | 0 | struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq; |
7220 | 0 | bool post_process_api_supported; |
7221 | 0 | void *flow_reference = NULL; |
7222 | 0 | int err; |
7223 | |
|
7224 | 0 | atomic_read_relaxed(&rxq->port->netdev->hw_info.post_process_api_supported, |
7225 | 0 | &post_process_api_supported); |
7226 | |
|
7227 | 0 | if (!post_process_api_supported) { |
7228 | 0 | *flow = NULL; |
7229 | 0 | return 0; |
7230 | 0 | } |
7231 | | |
7232 | 0 | err = dpif_offload_netdev_hw_post_process(rxq->port->netdev, pmd->core_id, |
7233 | 0 | packet, &flow_reference); |
7234 | 0 | if (err && err != EOPNOTSUPP) { |
7235 | 0 | if (err != ECANCELED) { |
7236 | 0 | COVERAGE_INC(datapath_drop_hw_post_process); |
7237 | 0 | } else { |
7238 | 0 | COVERAGE_INC(datapath_drop_hw_post_process_consumed); |
7239 | 0 | } |
7240 | 0 | return -1; |
7241 | 0 | } |
7242 | | |
7243 | 0 | *flow = flow_reference; |
7244 | 0 | return 0; |
7245 | 0 | } |
7246 | | |
/* Enqueues an already classified 'packet' either into the per-flow
 * batches (when 'batch_enable' is true) or into the next free slot of
 * 'flow_map' (when it is false; '*map_cnt' is incremented). */
static inline void
dfc_processing_enqueue_classified_packet(struct dp_packet *packet,
                                         struct dp_netdev_flow *flow,
                                         uint16_t tcp_flags,
                                         bool batch_enable,
                                         struct packet_batch_per_flow *batches,
                                         size_t *n_batches,
                                         struct dp_packet_flow_map *flow_map,
                                         size_t *map_cnt)

{
    if (OVS_LIKELY(batch_enable)) {
        dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
                                n_batches);
        return;
    }

    /* Flow batching may only happen once fast-path processing has also
     * finished for the packets that missed the EMC; otherwise packets of
     * the same datapath flow would get reordered. */
    size_t slot = (*map_cnt)++;

    packet_enqueue_to_flow_map(packet, flow, tcp_flags, flow_map, slot);
}
7273 | | |
7274 | | /* Try to process all ('cnt') the 'packets' using only the datapath flow cache |
7275 | | * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the |
7276 | | * miniflow is copied into 'keys' and the packet pointer is moved at the |
7277 | | * beginning of the 'packets' array. The pointers of missed keys are put in the |
7278 | | * missed_keys pointer array for future processing. |
7279 | | * |
7280 | | * The function returns the number of packets that needs to be processed in the |
7281 | | * 'packets' array (they have been moved to the beginning of the vector). |
7282 | | * |
7283 | | * For performance reasons a caller may choose not to initialize the metadata |
7284 | | * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets' |
7285 | | * is not valid and must be initialized by this function using 'port_no'. |
7286 | | * If 'md_is_valid' is true, the metadata is already valid and 'port_no' |
7287 | | * will be ignored. |
7288 | | */ |
7289 | | static inline size_t |
7290 | | dfc_processing(struct dp_netdev_pmd_thread *pmd, |
7291 | | struct dp_packet_batch *packets_, |
7292 | | struct netdev_flow_key *keys, |
7293 | | struct netdev_flow_key **missed_keys, |
7294 | | struct packet_batch_per_flow batches[], size_t *n_batches, |
7295 | | struct dp_packet_flow_map *flow_map, |
7296 | | size_t *n_flows, uint8_t *index_map, |
7297 | | bool md_is_valid, odp_port_t port_no) |
7298 | 0 | { |
7299 | 0 | size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0, n_simple_hit = 0; |
7300 | 0 | const bool offload_enabled = dpif_offload_enabled(); |
7301 | 0 | const uint32_t recirc_depth = *recirc_depth_get(); |
7302 | 0 | const size_t cnt = dp_packet_batch_size(packets_); |
7303 | 0 | struct dfc_cache *cache = &pmd->flow_cache; |
7304 | 0 | struct netdev_flow_key *key = &keys[0]; |
7305 | 0 | struct dp_packet *packet; |
7306 | 0 | size_t map_cnt = 0; |
7307 | 0 | bool batch_enable = true; |
7308 | |
|
7309 | 0 | const bool simple_match_enabled = |
7310 | 0 | !md_is_valid && dp_netdev_simple_match_enabled(pmd, port_no); |
7311 | | /* 'simple_match_table' is a full flow table. If the flow is not there, |
7312 | | * upcall is required, and there is no chance to find a match in caches. */ |
7313 | 0 | const bool smc_enable_db = !simple_match_enabled && pmd->ctx.smc_enable_db; |
7314 | 0 | const uint32_t cur_min = simple_match_enabled |
7315 | 0 | ? 0 : pmd->ctx.emc_insert_min; |
7316 | |
|
7317 | 0 | pmd_perf_update_counter(&pmd->perf_stats, |
7318 | 0 | md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV, |
7319 | 0 | cnt); |
7320 | 0 | int i; |
7321 | 0 | DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) { |
7322 | 0 | struct dp_netdev_flow *flow = NULL; |
7323 | 0 | uint16_t tcp_flags; |
7324 | |
|
7325 | 0 | if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) { |
7326 | 0 | dp_packet_delete(packet); |
7327 | 0 | COVERAGE_INC(datapath_drop_rx_invalid_packet); |
7328 | 0 | continue; |
7329 | 0 | } |
7330 | | |
7331 | 0 | if (i != cnt - 1) { |
7332 | 0 | struct dp_packet **packets = packets_->packets; |
7333 | | /* Prefetch next packet data and metadata. */ |
7334 | 0 | OVS_PREFETCH(dp_packet_data(packets[i+1])); |
7335 | 0 | pkt_metadata_prefetch_init(&packets[i+1]->md); |
7336 | 0 | } |
7337 | |
|
7338 | 0 | if (!md_is_valid) { |
7339 | 0 | pkt_metadata_init(&packet->md, port_no); |
7340 | 0 | } |
7341 | |
|
7342 | 0 | if (offload_enabled && recirc_depth == 0) { |
7343 | 0 | if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &flow))) { |
7344 | | /* Packet restoration failed and it was dropped, do not |
7345 | | * continue processing. |
7346 | | */ |
7347 | 0 | continue; |
7348 | 0 | } |
7349 | 0 | if (OVS_LIKELY(flow)) { |
7350 | 0 | tcp_flags = parse_tcp_flags(packet, NULL, NULL, NULL); |
7351 | 0 | n_phwol_hit++; |
7352 | 0 | dfc_processing_enqueue_classified_packet( |
7353 | 0 | packet, flow, tcp_flags, batch_enable, |
7354 | 0 | batches, n_batches, flow_map, &map_cnt); |
7355 | 0 | continue; |
7356 | 0 | } |
7357 | 0 | } |
7358 | | |
7359 | 0 | if (!flow && simple_match_enabled) { |
7360 | 0 | ovs_be16 dl_type = 0, vlan_tci = 0; |
7361 | 0 | uint8_t nw_frag = 0; |
7362 | |
|
7363 | 0 | tcp_flags = parse_tcp_flags(packet, &dl_type, &nw_frag, &vlan_tci); |
7364 | 0 | flow = dp_netdev_simple_match_lookup(pmd, port_no, dl_type, |
7365 | 0 | nw_frag, vlan_tci); |
7366 | 0 | if (OVS_LIKELY(flow)) { |
7367 | 0 | n_simple_hit++; |
7368 | 0 | dfc_processing_enqueue_classified_packet( |
7369 | 0 | packet, flow, tcp_flags, batch_enable, |
7370 | 0 | batches, n_batches, flow_map, &map_cnt); |
7371 | 0 | continue; |
7372 | 0 | } |
7373 | 0 | } |
7374 | | |
7375 | 0 | miniflow_extract(packet, &key->mf); |
7376 | 0 | key->len = 0; /* Not computed yet. */ |
7377 | 0 | key->hash = |
7378 | 0 | (md_is_valid == false) |
7379 | 0 | ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf) |
7380 | 0 | : dpif_netdev_packet_get_rss_hash(packet, &key->mf); |
7381 | | |
7382 | | /* If EMC is disabled skip emc_lookup */ |
7383 | 0 | flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL; |
7384 | 0 | if (OVS_LIKELY(flow)) { |
7385 | 0 | tcp_flags = miniflow_get_tcp_flags(&key->mf); |
7386 | 0 | n_emc_hit++; |
7387 | 0 | dfc_processing_enqueue_classified_packet( |
7388 | 0 | packet, flow, tcp_flags, batch_enable, |
7389 | 0 | batches, n_batches, flow_map, &map_cnt); |
7390 | 0 | } else { |
7391 | | /* Exact match cache missed. Group missed packets together at |
7392 | | * the beginning of the 'packets' array. */ |
7393 | 0 | dp_packet_batch_refill(packets_, packet, i); |
7394 | | |
7395 | | /* Preserve the order of packet for flow batching. */ |
7396 | 0 | index_map[n_missed] = map_cnt; |
7397 | 0 | flow_map[map_cnt++].flow = NULL; |
7398 | | |
7399 | | /* 'key[n_missed]' contains the key of the current packet and it |
7400 | | * will be passed to SMC lookup. The next key should be extracted |
7401 | | * to 'keys[n_missed + 1]'. |
7402 | | * We also maintain a pointer array to keys missed both SMC and EMC |
7403 | | * which will be returned to the caller for future processing. */ |
7404 | 0 | missed_keys[n_missed] = key; |
7405 | 0 | key = &keys[++n_missed]; |
7406 | | |
7407 | | /* Skip batching for subsequent packets to avoid reordering. */ |
7408 | 0 | batch_enable = false; |
7409 | 0 | } |
7410 | 0 | } |
7411 | | /* Count of packets which are not flow batched. */ |
7412 | 0 | *n_flows = map_cnt; |
7413 | |
|
7414 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, n_phwol_hit); |
7415 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_HIT, |
7416 | 0 | n_simple_hit); |
7417 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit); |
7418 | |
|
7419 | 0 | if (!smc_enable_db) { |
7420 | 0 | return dp_packet_batch_size(packets_); |
7421 | 0 | } |
7422 | | |
7423 | | /* Packets miss EMC will do a batch lookup in SMC if enabled */ |
7424 | 0 | smc_lookup_batch(pmd, keys, missed_keys, packets_, |
7425 | 0 | n_missed, flow_map, index_map); |
7426 | |
|
7427 | 0 | return dp_packet_batch_size(packets_); |
7428 | 0 | } |
7429 | | |
7430 | | static inline int |
7431 | | handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, |
7432 | | struct dp_packet *packet, |
7433 | | const struct netdev_flow_key *key, |
7434 | | struct ofpbuf *actions, struct ofpbuf *put_actions) |
7435 | 0 | { |
7436 | 0 | struct ofpbuf *add_actions; |
7437 | 0 | struct dp_packet_batch b; |
7438 | 0 | struct match match; |
7439 | 0 | ovs_u128 ufid; |
7440 | 0 | int error; |
7441 | 0 | uint64_t cycles = cycles_counter_update(&pmd->perf_stats); |
7442 | 0 | odp_port_t orig_in_port = packet->md.orig_in_port; |
7443 | |
|
7444 | 0 | match.tun_md.valid = false; |
7445 | 0 | miniflow_expand(&key->mf, &match.flow); |
7446 | 0 | memset(&match.wc, 0, sizeof match.wc); |
7447 | |
|
7448 | 0 | ofpbuf_clear(actions); |
7449 | 0 | ofpbuf_clear(put_actions); |
7450 | |
|
7451 | 0 | odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid); |
7452 | 0 | error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc, |
7453 | 0 | &ufid, DPIF_UC_MISS, NULL, actions, |
7454 | 0 | put_actions); |
7455 | 0 | if (OVS_UNLIKELY(error && error != ENOSPC)) { |
7456 | 0 | dp_packet_delete(packet); |
7457 | 0 | COVERAGE_INC(datapath_drop_upcall_error); |
7458 | 0 | return error; |
7459 | 0 | } |
7460 | | |
7461 | | /* The Netlink encoding of datapath flow keys cannot express |
7462 | | * wildcarding the presence of a VLAN tag. Instead, a missing VLAN |
7463 | | * tag is interpreted as exact match on the fact that there is no |
7464 | | * VLAN. Unless we refactor a lot of code that translates between |
7465 | | * Netlink and struct flow representations, we have to do the same |
7466 | | * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */ |
7467 | 0 | if (!match.wc.masks.vlans[0].tci) { |
7468 | 0 | match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI); |
7469 | 0 | } |
7470 | | |
7471 | | /* We can't allow the packet batching in the next loop to execute |
7472 | | * the actions. Otherwise, if there are any slow path actions, |
7473 | | * we'll send the packet up twice. */ |
7474 | 0 | dp_packet_batch_init_packet(&b, packet); |
7475 | 0 | dp_netdev_execute_actions(pmd, &b, true, &match.flow, |
7476 | 0 | actions->data, actions->size); |
7477 | |
|
7478 | 0 | add_actions = put_actions->size ? put_actions : actions; |
7479 | 0 | if (OVS_LIKELY(error != ENOSPC)) { |
7480 | 0 | struct dp_netdev_flow *netdev_flow; |
7481 | | |
7482 | | /* XXX: There's a race window where a flow covering this packet |
7483 | | * could have already been installed since we last did the flow |
7484 | | * lookup before upcall. This could be solved by moving the |
7485 | | * mutex lock outside the loop, but that's an awful long time |
7486 | | * to be locking revalidators out of making flow modifications. */ |
7487 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
7488 | 0 | netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); |
7489 | 0 | if (OVS_LIKELY(!netdev_flow)) { |
7490 | 0 | netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid, |
7491 | 0 | add_actions->data, |
7492 | 0 | add_actions->size, orig_in_port); |
7493 | 0 | } |
7494 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
7495 | 0 | uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid); |
7496 | 0 | smc_insert(pmd, key, hash); |
7497 | 0 | emc_probabilistic_insert(pmd, key, netdev_flow); |
7498 | 0 | } |
7499 | 0 | if (pmd_perf_metrics_enabled(pmd)) { |
7500 | | /* Update upcall stats. */ |
7501 | 0 | cycles = cycles_counter_update(&pmd->perf_stats) - cycles; |
7502 | 0 | struct pmd_perf_stats *s = &pmd->perf_stats; |
7503 | 0 | s->current.upcalls++; |
7504 | 0 | s->current.upcall_cycles += cycles; |
7505 | 0 | histogram_add_sample(&s->cycles_per_upcall, cycles); |
7506 | 0 | } |
7507 | 0 | return error; |
7508 | 0 | } |
7509 | | |
7510 | | static inline void |
7511 | | fast_path_processing(struct dp_netdev_pmd_thread *pmd, |
7512 | | struct dp_packet_batch *packets_, |
7513 | | struct netdev_flow_key **keys, |
7514 | | struct dp_packet_flow_map *flow_map, |
7515 | | uint8_t *index_map, |
7516 | | odp_port_t in_port) |
7517 | 0 | { |
7518 | 0 | const size_t cnt = dp_packet_batch_size(packets_); |
7519 | 0 | #ifndef __CHECKER__ |
7520 | 0 | const size_t PKT_ARRAY_SIZE = cnt; |
7521 | | #else |
7522 | | /* Sparse doesn't like variable length array. */ |
7523 | | enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST }; |
7524 | | #endif |
7525 | 0 | struct dp_packet *packet; |
7526 | 0 | struct dpcls *cls; |
7527 | 0 | struct dpcls_rule *rules[PKT_ARRAY_SIZE]; |
7528 | 0 | struct dp_netdev *dp = pmd->dp; |
7529 | 0 | int upcall_ok_cnt = 0, upcall_fail_cnt = 0; |
7530 | 0 | int lookup_cnt = 0, add_lookup_cnt; |
7531 | 0 | bool any_miss; |
7532 | |
|
7533 | 0 | for (size_t i = 0; i < cnt; i++) { |
7534 | | /* Key length is needed in all the cases, hash computed on demand. */ |
7535 | 0 | keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf)); |
7536 | 0 | } |
7537 | | /* Get the classifier for the in_port */ |
7538 | 0 | cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
7539 | 0 | if (OVS_LIKELY(cls)) { |
7540 | 0 | any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys, |
7541 | 0 | rules, cnt, &lookup_cnt); |
7542 | 0 | } else { |
7543 | 0 | any_miss = true; |
7544 | 0 | memset(rules, 0, sizeof(rules)); |
7545 | 0 | } |
7546 | 0 | if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) { |
7547 | 0 | uint64_t actions_stub[512 / 8], slow_stub[512 / 8]; |
7548 | 0 | struct ofpbuf actions, put_actions; |
7549 | |
|
7550 | 0 | ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub); |
7551 | 0 | ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub); |
7552 | |
|
7553 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
7554 | 0 | struct dp_netdev_flow *netdev_flow; |
7555 | |
|
7556 | 0 | if (OVS_LIKELY(rules[i])) { |
7557 | 0 | continue; |
7558 | 0 | } |
7559 | | |
7560 | | /* It's possible that an earlier slow path execution installed |
7561 | | * a rule covering this flow. In this case, it's a lot cheaper |
7562 | | * to catch it here than execute a miss. */ |
7563 | 0 | netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i], |
7564 | 0 | &add_lookup_cnt); |
7565 | 0 | if (netdev_flow) { |
7566 | 0 | lookup_cnt += add_lookup_cnt; |
7567 | 0 | rules[i] = &netdev_flow->cr; |
7568 | 0 | continue; |
7569 | 0 | } |
7570 | | |
7571 | 0 | int error = handle_packet_upcall(pmd, packet, keys[i], |
7572 | 0 | &actions, &put_actions); |
7573 | |
|
7574 | 0 | if (OVS_UNLIKELY(error)) { |
7575 | 0 | upcall_fail_cnt++; |
7576 | 0 | } else { |
7577 | 0 | upcall_ok_cnt++; |
7578 | 0 | } |
7579 | 0 | } |
7580 | |
|
7581 | 0 | ofpbuf_uninit(&actions); |
7582 | 0 | ofpbuf_uninit(&put_actions); |
7583 | 0 | fat_rwlock_unlock(&dp->upcall_rwlock); |
7584 | 0 | } else if (OVS_UNLIKELY(any_miss)) { |
7585 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
7586 | 0 | if (OVS_UNLIKELY(!rules[i])) { |
7587 | 0 | dp_packet_delete(packet); |
7588 | 0 | COVERAGE_INC(datapath_drop_lock_error); |
7589 | 0 | upcall_fail_cnt++; |
7590 | 0 | } |
7591 | 0 | } |
7592 | 0 | } |
7593 | |
|
7594 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
7595 | 0 | struct dp_netdev_flow *flow; |
7596 | | /* Get the original order of this packet in received batch. */ |
7597 | 0 | int recv_idx = index_map[i]; |
7598 | 0 | uint16_t tcp_flags; |
7599 | |
|
7600 | 0 | if (OVS_UNLIKELY(!rules[i])) { |
7601 | 0 | continue; |
7602 | 0 | } |
7603 | | |
7604 | 0 | flow = dp_netdev_flow_cast(rules[i]); |
7605 | 0 | uint32_t hash = dp_netdev_flow_hash(&flow->ufid); |
7606 | 0 | smc_insert(pmd, keys[i], hash); |
7607 | |
|
7608 | 0 | emc_probabilistic_insert(pmd, keys[i], flow); |
7609 | | /* Add these packets into the flow map in the same order |
7610 | | * as received. |
7611 | | */ |
7612 | 0 | tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf); |
7613 | 0 | packet_enqueue_to_flow_map(packet, flow, tcp_flags, |
7614 | 0 | flow_map, recv_idx); |
7615 | 0 | } |
7616 | |
|
7617 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT, |
7618 | 0 | cnt - upcall_ok_cnt - upcall_fail_cnt); |
7619 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP, |
7620 | 0 | lookup_cnt); |
7621 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS, |
7622 | 0 | upcall_ok_cnt); |
7623 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST, |
7624 | 0 | upcall_fail_cnt); |
7625 | 0 | } |
7626 | | |
7627 | | /* Packets enter the datapath from a port (or from recirculation) here. |
7628 | | * |
7629 | | * When 'md_is_valid' is true the metadata in 'packets' are already valid. |
7630 | | * When false the metadata in 'packets' need to be initialized. */ |
7631 | | static void |
7632 | | dp_netdev_input__(struct dp_netdev_pmd_thread *pmd, |
7633 | | struct dp_packet_batch *packets, |
7634 | | bool md_is_valid, odp_port_t port_no) |
7635 | 0 | { |
7636 | 0 | #ifndef __CHECKER__ |
7637 | 0 | const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets); |
7638 | | #else |
7639 | | /* Sparse doesn't like variable length array. */ |
7640 | | enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST }; |
7641 | | #endif |
7642 | 0 | OVS_ALIGNED_VAR(CACHE_LINE_SIZE) |
7643 | 0 | struct netdev_flow_key keys[PKT_ARRAY_SIZE]; |
7644 | 0 | struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE]; |
7645 | 0 | struct packet_batch_per_flow batches[PKT_ARRAY_SIZE]; |
7646 | 0 | size_t n_batches; |
7647 | 0 | struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE]; |
7648 | 0 | uint8_t index_map[PKT_ARRAY_SIZE]; |
7649 | 0 | size_t n_flows, i; |
7650 | |
|
7651 | 0 | odp_port_t in_port; |
7652 | |
|
7653 | 0 | n_batches = 0; |
7654 | 0 | dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches, |
7655 | 0 | flow_map, &n_flows, index_map, md_is_valid, port_no); |
7656 | |
|
7657 | 0 | if (!dp_packet_batch_is_empty(packets)) { |
7658 | | /* Get ingress port from first packet's metadata. */ |
7659 | 0 | in_port = packets->packets[0]->md.in_port.odp_port; |
7660 | 0 | fast_path_processing(pmd, packets, missed_keys, |
7661 | 0 | flow_map, index_map, in_port); |
7662 | 0 | } |
7663 | | |
7664 | | /* Batch rest of packets which are in flow map. */ |
7665 | 0 | for (i = 0; i < n_flows; i++) { |
7666 | 0 | struct dp_packet_flow_map *map = &flow_map[i]; |
7667 | |
|
7668 | 0 | if (OVS_UNLIKELY(!map->flow)) { |
7669 | 0 | continue; |
7670 | 0 | } |
7671 | 0 | dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags, |
7672 | 0 | batches, &n_batches); |
7673 | 0 | } |
7674 | | |
7675 | | /* All the flow batches need to be reset before any call to |
7676 | | * packet_batch_per_flow_execute() as it could potentially trigger |
7677 | | * recirculation. When a packet matching flow 'j' happens to be |
7678 | | * recirculated, the nested call to dp_netdev_input__() could potentially |
7679 | | * classify the packet as matching another flow - say 'k'. It could happen |
7680 | | * that in the previous call to dp_netdev_input__() that same flow 'k' had |
7681 | | * already its own batches[k] still waiting to be served. So if its |
7682 | | * 'batch' member is not reset, the recirculated packet would be wrongly |
7683 | | * appended to batches[k] of the 1st call to dp_netdev_input__(). */ |
7684 | 0 | for (i = 0; i < n_batches; i++) { |
7685 | 0 | batches[i].flow->batch = NULL; |
7686 | 0 | } |
7687 | |
|
7688 | 0 | for (i = 0; i < n_batches; i++) { |
7689 | 0 | packet_batch_per_flow_execute(&batches[i], pmd); |
7690 | 0 | } |
7691 | 0 | } |
7692 | | |
7693 | | static void |
7694 | | dp_netdev_input(struct dp_netdev_pmd_thread *pmd, |
7695 | | struct dp_packet_batch *packets, |
7696 | | odp_port_t port_no) |
7697 | 0 | { |
7698 | 0 | dp_netdev_input__(pmd, packets, false, port_no); |
7699 | 0 | } |
7700 | | |
/* Re-injects 'packets' into the datapath via dp_netdev_input__() with
 * 'md_is_valid' set to true and a port number of 0. */
static void
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet_batch *packets)
{
    const bool md_is_valid = true;

    dp_netdev_input__(pmd, packets, md_is_valid, 0);
}
7707 | | |
7708 | | struct dp_netdev_execute_aux { /* Context that dp_netdev_execute_actions() hands to dp_execute_cb(); filled positionally, so keep the field order. */ |
7709 | | struct dp_netdev_pmd_thread *pmd; /* The 'pmd' given to dp_netdev_execute_actions(). */ |
7710 | | const struct flow *flow; /* The 'flow' given to dp_netdev_execute_actions(); dp_execute_cb() reads its 'dl_type' for conntrack. */ |
7711 | | }; |
7712 | | |
7713 | | static void |
7714 | | dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb, |
7715 | | void *aux) |
7716 | 0 | { |
7717 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
7718 | 0 | dp->dp_purge_aux = aux; |
7719 | 0 | dp->dp_purge_cb = cb; |
7720 | 0 | } |
7721 | | |
7722 | | static void |
7723 | | dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb, |
7724 | | void *aux) |
7725 | 0 | { |
7726 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
7727 | 0 | dp->upcall_aux = aux; |
7728 | 0 | dp->upcall_cb = cb; |
7729 | 0 | } |
7730 | | |
7731 | | static void |
7732 | | dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd, |
7733 | | bool purge) |
7734 | 0 | { |
7735 | 0 | struct tx_port *tx; |
7736 | 0 | struct dp_netdev_port *port; |
7737 | 0 | long long interval; |
7738 | |
|
7739 | 0 | HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) { |
7740 | 0 | if (tx->port->txq_mode != TXQ_MODE_XPS) { |
7741 | 0 | continue; |
7742 | 0 | } |
7743 | 0 | interval = pmd->ctx.now - tx->last_used; |
7744 | 0 | if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) { |
7745 | 0 | port = tx->port; |
7746 | 0 | ovs_mutex_lock(&port->txq_used_mutex); |
7747 | 0 | port->txq_used[tx->qid]--; |
7748 | 0 | ovs_mutex_unlock(&port->txq_used_mutex); |
7749 | 0 | tx->qid = -1; |
7750 | 0 | } |
7751 | 0 | } |
7752 | 0 | } |
7753 | | |
7754 | | static int |
7755 | | dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd, |
7756 | | struct tx_port *tx) |
7757 | 0 | { |
7758 | 0 | struct dp_netdev_port *port; |
7759 | 0 | long long interval; |
7760 | 0 | int i, min_cnt, min_qid; |
7761 | |
|
7762 | 0 | interval = pmd->ctx.now - tx->last_used; |
7763 | 0 | tx->last_used = pmd->ctx.now; |
7764 | |
|
7765 | 0 | if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) { |
7766 | 0 | return tx->qid; |
7767 | 0 | } |
7768 | | |
7769 | 0 | port = tx->port; |
7770 | |
|
7771 | 0 | ovs_mutex_lock(&port->txq_used_mutex); |
7772 | 0 | if (tx->qid >= 0) { |
7773 | 0 | port->txq_used[tx->qid]--; |
7774 | 0 | tx->qid = -1; |
7775 | 0 | } |
7776 | |
|
7777 | 0 | min_cnt = -1; |
7778 | 0 | min_qid = 0; |
7779 | 0 | for (i = 0; i < netdev_n_txq(port->netdev); i++) { |
7780 | 0 | if (port->txq_used[i] < min_cnt || min_cnt == -1) { |
7781 | 0 | min_cnt = port->txq_used[i]; |
7782 | 0 | min_qid = i; |
7783 | 0 | } |
7784 | 0 | } |
7785 | |
|
7786 | 0 | port->txq_used[min_qid]++; |
7787 | 0 | tx->qid = min_qid; |
7788 | |
|
7789 | 0 | ovs_mutex_unlock(&port->txq_used_mutex); |
7790 | |
|
7791 | 0 | dpif_netdev_xps_revalidate_pmd(pmd, false); |
7792 | |
|
7793 | 0 | VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.", |
7794 | 0 | pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev)); |
7795 | 0 | return min_qid; |
7796 | 0 | } |
7797 | | |
7798 | | static struct tx_port * |
7799 | | pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd, |
7800 | | odp_port_t port_no) |
7801 | 0 | { |
7802 | 0 | return tx_port_lookup(&pmd->tnl_port_cache, port_no); |
7803 | 0 | } |
7804 | | |
7805 | | static struct tx_port * |
7806 | | pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd, |
7807 | | odp_port_t port_no) |
7808 | 0 | { |
7809 | 0 | return tx_port_lookup(&pmd->send_port_cache, port_no); |
7810 | 0 | } |
7811 | | |
7812 | | static int |
7813 | | push_tnl_action(const struct dp_netdev_pmd_thread *pmd, |
7814 | | const struct nlattr *attr, |
7815 | | struct dp_packet_batch *batch) |
7816 | 0 | { |
7817 | 0 | const struct netdev *ingress_netdev = NULL; |
7818 | 0 | const struct ovs_action_push_tnl *data; |
7819 | 0 | struct tx_port *tun_port; |
7820 | 0 | int err; |
7821 | |
|
7822 | 0 | data = nl_attr_get(attr); |
7823 | |
|
7824 | 0 | tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port); |
7825 | 0 | if (!tun_port) { |
7826 | 0 | err = -EINVAL; |
7827 | 0 | goto error; |
7828 | 0 | } |
7829 | | |
7830 | 0 | if (dpif_offload_enabled() && !dp_packet_batch_is_empty(batch)) { |
7831 | | /* To avoid multiple port lookups per batch, assume that all packets |
7832 | | * in the batch originate from the same flow and therefore share the |
7833 | | * same original input port. */ |
7834 | 0 | struct tx_port *in_port = pmd_send_port_cache_lookup( |
7835 | 0 | pmd, batch->packets[0]->md.orig_in_port); |
7836 | 0 | if (in_port) { |
7837 | 0 | ingress_netdev = in_port->port->netdev; |
7838 | 0 | } |
7839 | 0 | } |
7840 | |
|
7841 | 0 | err = netdev_push_header(tun_port->port->netdev, ingress_netdev, batch, |
7842 | 0 | data); |
7843 | 0 | if (!err) { |
7844 | 0 | return 0; |
7845 | 0 | } |
7846 | 0 | error: |
7847 | 0 | dp_packet_delete_batch(batch, true); |
7848 | 0 | return err; |
7849 | 0 | } |
7850 | | |
7851 | | static void |
7852 | | dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd, |
7853 | | struct dp_packet *packet, bool should_steal, |
7854 | | struct flow *flow, ovs_u128 *ufid, |
7855 | | struct ofpbuf *actions, |
7856 | | const struct nlattr *userdata) |
7857 | 0 | { |
7858 | 0 | struct dp_packet_batch b; |
7859 | 0 | int error; |
7860 | |
|
7861 | 0 | ofpbuf_clear(actions); |
7862 | |
|
7863 | 0 | error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid, |
7864 | 0 | DPIF_UC_ACTION, userdata, actions, |
7865 | 0 | NULL); |
7866 | 0 | if (!error || error == ENOSPC) { |
7867 | 0 | dp_packet_batch_init_packet(&b, packet); |
7868 | 0 | dp_netdev_execute_actions(pmd, &b, should_steal, flow, |
7869 | 0 | actions->data, actions->size); |
7870 | 0 | } else if (should_steal) { |
7871 | 0 | dp_packet_delete(packet); |
7872 | 0 | COVERAGE_INC(datapath_drop_userspace_action_error); |
7873 | 0 | } |
7874 | 0 | } |
7875 | | |
7876 | | static bool |
7877 | | dp_execute_output_action(struct dp_netdev_pmd_thread *pmd, |
7878 | | struct dp_packet_batch *packets_, |
7879 | | bool should_steal, odp_port_t port_no) |
7880 | 0 | { |
7881 | 0 | struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no); |
7882 | 0 | struct dp_packet_batch out; |
7883 | |
|
7884 | 0 | if (!OVS_LIKELY(p)) { |
7885 | 0 | COVERAGE_ADD(datapath_drop_invalid_port, |
7886 | 0 | dp_packet_batch_size(packets_)); |
7887 | 0 | dp_packet_delete_batch(packets_, should_steal); |
7888 | 0 | return false; |
7889 | 0 | } |
7890 | 0 | if (!should_steal) { |
7891 | 0 | dp_packet_batch_clone(&out, packets_); |
7892 | 0 | dp_packet_batch_reset_cutlen(packets_); |
7893 | 0 | packets_ = &out; |
7894 | 0 | } |
7895 | 0 | dp_packet_batch_apply_cutlen(packets_); |
7896 | 0 | if (dp_packet_batch_size(&p->output_pkts) |
7897 | 0 | + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) { |
7898 | | /* Flush here to avoid overflow. */ |
7899 | 0 | dp_netdev_pmd_flush_output_on_port(pmd, p); |
7900 | 0 | } |
7901 | 0 | if (dp_packet_batch_is_empty(&p->output_pkts)) { |
7902 | 0 | pmd->n_output_batches++; |
7903 | 0 | } |
7904 | |
|
7905 | 0 | struct dp_packet *packet; |
7906 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
7907 | 0 | p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] = |
7908 | 0 | pmd->ctx.last_rxq; |
7909 | 0 | dp_packet_batch_add(&p->output_pkts, packet); |
7910 | 0 | } |
7911 | 0 | return true; |
7912 | 0 | } |
7913 | | |
7914 | | static void |
7915 | | dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd, |
7916 | | struct dp_packet_batch *packets_, |
7917 | | bool should_steal, uint32_t bond) |
7918 | 0 | { |
7919 | 0 | struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond); |
7920 | 0 | struct dp_packet_batch out; |
7921 | 0 | struct dp_packet *packet; |
7922 | |
|
7923 | 0 | if (!p_bond) { |
7924 | 0 | COVERAGE_ADD(datapath_drop_invalid_bond, |
7925 | 0 | dp_packet_batch_size(packets_)); |
7926 | 0 | dp_packet_delete_batch(packets_, should_steal); |
7927 | 0 | return; |
7928 | 0 | } |
7929 | 0 | if (!should_steal) { |
7930 | 0 | dp_packet_batch_clone(&out, packets_); |
7931 | 0 | dp_packet_batch_reset_cutlen(packets_); |
7932 | 0 | packets_ = &out; |
7933 | 0 | } |
7934 | 0 | dp_packet_batch_apply_cutlen(packets_); |
7935 | |
|
7936 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
7937 | | /* |
7938 | | * Lookup the bond-hash table using hash to get the member. |
7939 | | */ |
7940 | 0 | uint32_t hash = dp_packet_get_rss_hash(packet); |
7941 | 0 | struct member_entry *s_entry |
7942 | 0 | = &p_bond->member_buckets[hash & BOND_MASK]; |
7943 | 0 | odp_port_t bond_member = s_entry->member_id; |
7944 | 0 | uint32_t size = dp_packet_size(packet); |
7945 | 0 | struct dp_packet_batch output_pkt; |
7946 | |
|
7947 | 0 | dp_packet_batch_init_packet(&output_pkt, packet); |
7948 | 0 | if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true, |
7949 | 0 | bond_member))) { |
7950 | | /* Update member stats. */ |
7951 | 0 | non_atomic_ullong_add(&s_entry->n_packets, 1); |
7952 | 0 | non_atomic_ullong_add(&s_entry->n_bytes, size); |
7953 | 0 | } |
7954 | 0 | } |
7955 | 0 | } |
7956 | | |
7957 | | static void |
7958 | | dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, |
7959 | | const struct nlattr *a, bool should_steal) |
7960 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
7961 | 0 | { |
7962 | 0 | struct dp_netdev_execute_aux *aux = aux_; |
7963 | 0 | uint32_t *depth = recirc_depth_get(); |
7964 | 0 | struct dp_netdev_pmd_thread *pmd = aux->pmd; |
7965 | 0 | struct dp_netdev *dp = pmd->dp; |
7966 | 0 | int type = nl_attr_type(a); |
7967 | 0 | struct tx_port *p; |
7968 | 0 | uint32_t packet_count, packets_dropped; |
7969 | |
|
7970 | 0 | switch ((enum ovs_action_attr)type) { |
7971 | 0 | case OVS_ACTION_ATTR_OUTPUT: |
7972 | 0 | dp_execute_output_action(pmd, packets_, should_steal, |
7973 | 0 | nl_attr_get_odp_port(a)); |
7974 | 0 | return; |
7975 | | |
7976 | 0 | case OVS_ACTION_ATTR_LB_OUTPUT: |
7977 | 0 | dp_execute_lb_output_action(pmd, packets_, should_steal, |
7978 | 0 | nl_attr_get_u32(a)); |
7979 | 0 | return; |
7980 | | |
7981 | 0 | case OVS_ACTION_ATTR_TUNNEL_PUSH: |
7982 | 0 | if (should_steal) { |
7983 | | /* We're requested to push tunnel header, but also we need to take |
7984 | | * the ownership of these packets. Thus, we can avoid performing |
7985 | | * the action, because the caller will not use the result anyway. |
7986 | | * Just break to free the batch. */ |
7987 | 0 | break; |
7988 | 0 | } |
7989 | 0 | dp_packet_batch_apply_cutlen(packets_); |
7990 | 0 | packet_count = dp_packet_batch_size(packets_); |
7991 | 0 | if (push_tnl_action(pmd, a, packets_)) { |
7992 | 0 | COVERAGE_ADD(datapath_drop_tunnel_push_error, |
7993 | 0 | packet_count); |
7994 | 0 | } |
7995 | 0 | return; |
7996 | | |
7997 | 0 | case OVS_ACTION_ATTR_TUNNEL_POP: |
7998 | 0 | if (*depth < MAX_RECIRC_DEPTH) { |
7999 | 0 | struct dp_packet_batch *orig_packets_ = packets_; |
8000 | 0 | odp_port_t portno = nl_attr_get_odp_port(a); |
8001 | |
|
8002 | 0 | p = pmd_tnl_port_cache_lookup(pmd, portno); |
8003 | 0 | if (p) { |
8004 | 0 | struct dp_packet_batch tnl_pkt; |
8005 | |
|
8006 | 0 | if (!should_steal) { |
8007 | 0 | dp_packet_batch_clone(&tnl_pkt, packets_); |
8008 | 0 | packets_ = &tnl_pkt; |
8009 | 0 | dp_packet_batch_reset_cutlen(orig_packets_); |
8010 | 0 | } |
8011 | |
|
8012 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8013 | |
|
8014 | 0 | packet_count = dp_packet_batch_size(packets_); |
8015 | 0 | netdev_pop_header(p->port->netdev, packets_); |
8016 | 0 | packets_dropped = |
8017 | 0 | packet_count - dp_packet_batch_size(packets_); |
8018 | 0 | if (packets_dropped) { |
8019 | 0 | COVERAGE_ADD(datapath_drop_tunnel_pop_error, |
8020 | 0 | packets_dropped); |
8021 | 0 | } |
8022 | 0 | if (dp_packet_batch_is_empty(packets_)) { |
8023 | 0 | return; |
8024 | 0 | } |
8025 | | |
8026 | 0 | struct dp_packet *packet; |
8027 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8028 | 0 | packet->md.in_port.odp_port = portno; |
8029 | 0 | } |
8030 | |
|
8031 | 0 | (*depth)++; |
8032 | 0 | dp_netdev_recirculate(pmd, packets_); |
8033 | 0 | (*depth)--; |
8034 | 0 | return; |
8035 | 0 | } |
8036 | 0 | COVERAGE_ADD(datapath_drop_invalid_tnl_port, |
8037 | 0 | dp_packet_batch_size(packets_)); |
8038 | 0 | } else { |
8039 | 0 | COVERAGE_ADD(datapath_drop_recirc_error, |
8040 | 0 | dp_packet_batch_size(packets_)); |
8041 | 0 | } |
8042 | 0 | break; |
8043 | | |
8044 | 0 | case OVS_ACTION_ATTR_USERSPACE: |
8045 | 0 | if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) { |
8046 | 0 | struct dp_packet_batch *orig_packets_ = packets_; |
8047 | 0 | const struct nlattr *userdata; |
8048 | 0 | struct dp_packet_batch usr_pkt; |
8049 | 0 | struct ofpbuf actions; |
8050 | 0 | struct flow flow; |
8051 | 0 | ovs_u128 ufid; |
8052 | 0 | bool clone = false; |
8053 | |
|
8054 | 0 | userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA); |
8055 | 0 | ofpbuf_init(&actions, 0); |
8056 | |
|
8057 | 0 | if (packets_->trunc) { |
8058 | 0 | if (!should_steal) { |
8059 | 0 | dp_packet_batch_clone(&usr_pkt, packets_); |
8060 | 0 | packets_ = &usr_pkt; |
8061 | 0 | clone = true; |
8062 | 0 | dp_packet_batch_reset_cutlen(orig_packets_); |
8063 | 0 | } |
8064 | |
|
8065 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8066 | 0 | } |
8067 | |
|
8068 | 0 | struct dp_packet *packet; |
8069 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8070 | 0 | flow_extract(packet, &flow); |
8071 | 0 | odp_flow_key_hash(&flow, sizeof flow, &ufid); |
8072 | 0 | dp_execute_userspace_action(pmd, packet, should_steal, &flow, |
8073 | 0 | &ufid, &actions, userdata); |
8074 | 0 | } |
8075 | |
|
8076 | 0 | if (clone) { |
8077 | 0 | dp_packet_delete_batch(packets_, true); |
8078 | 0 | } |
8079 | |
|
8080 | 0 | ofpbuf_uninit(&actions); |
8081 | 0 | fat_rwlock_unlock(&dp->upcall_rwlock); |
8082 | |
|
8083 | 0 | return; |
8084 | 0 | } |
8085 | 0 | COVERAGE_ADD(datapath_drop_lock_error, |
8086 | 0 | dp_packet_batch_size(packets_)); |
8087 | 0 | break; |
8088 | | |
8089 | 0 | case OVS_ACTION_ATTR_RECIRC: |
8090 | 0 | if (*depth < MAX_RECIRC_DEPTH) { |
8091 | 0 | struct dp_packet_batch recirc_pkts; |
8092 | |
|
8093 | 0 | if (!should_steal) { |
8094 | 0 | dp_packet_batch_clone(&recirc_pkts, packets_); |
8095 | 0 | packets_ = &recirc_pkts; |
8096 | 0 | } |
8097 | |
|
8098 | 0 | struct dp_packet *packet; |
8099 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8100 | 0 | packet->md.recirc_id = nl_attr_get_u32(a); |
8101 | 0 | } |
8102 | |
|
8103 | 0 | (*depth)++; |
8104 | 0 | dp_netdev_recirculate(pmd, packets_); |
8105 | 0 | (*depth)--; |
8106 | |
|
8107 | 0 | return; |
8108 | 0 | } |
8109 | | |
8110 | 0 | COVERAGE_ADD(datapath_drop_recirc_error, |
8111 | 0 | dp_packet_batch_size(packets_)); |
8112 | 0 | VLOG_WARN("Packet dropped. Max recirculation depth exceeded."); |
8113 | 0 | break; |
8114 | | |
8115 | 0 | case OVS_ACTION_ATTR_CT: { |
8116 | 0 | const struct nlattr *b; |
8117 | 0 | bool force = false; |
8118 | 0 | bool commit = false; |
8119 | 0 | unsigned int left; |
8120 | 0 | uint16_t zone = 0; |
8121 | 0 | uint32_t tp_id = 0; |
8122 | 0 | const char *helper = NULL; |
8123 | 0 | const uint32_t *setmark = NULL; |
8124 | 0 | const struct ovs_key_ct_labels *setlabel = NULL; |
8125 | 0 | struct nat_action_info_t nat_action_info; |
8126 | 0 | struct nat_action_info_t *nat_action_info_ref = NULL; |
8127 | 0 | bool nat_config = false; |
8128 | |
|
8129 | 0 | NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a), |
8130 | 0 | nl_attr_get_size(a)) { |
8131 | 0 | enum ovs_ct_attr sub_type = nl_attr_type(b); |
8132 | |
|
8133 | 0 | switch(sub_type) { |
8134 | 0 | case OVS_CT_ATTR_FORCE_COMMIT: |
8135 | 0 | force = true; |
8136 | | /* fall through. */ |
8137 | 0 | case OVS_CT_ATTR_COMMIT: |
8138 | 0 | commit = true; |
8139 | 0 | break; |
8140 | 0 | case OVS_CT_ATTR_ZONE: |
8141 | 0 | zone = nl_attr_get_u16(b); |
8142 | 0 | break; |
8143 | 0 | case OVS_CT_ATTR_HELPER: |
8144 | 0 | helper = nl_attr_get_string(b); |
8145 | 0 | break; |
8146 | 0 | case OVS_CT_ATTR_MARK: |
8147 | 0 | setmark = nl_attr_get(b); |
8148 | 0 | break; |
8149 | 0 | case OVS_CT_ATTR_LABELS: |
8150 | 0 | setlabel = nl_attr_get(b); |
8151 | 0 | break; |
8152 | 0 | case OVS_CT_ATTR_EVENTMASK: |
8153 | | /* Silently ignored, as userspace datapath does not generate |
8154 | | * netlink events. */ |
8155 | 0 | break; |
8156 | 0 | case OVS_CT_ATTR_TIMEOUT: |
8157 | 0 | if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) { |
8158 | 0 | VLOG_WARN("Invalid Timeout Policy ID: %s.", |
8159 | 0 | nl_attr_get_string(b)); |
8160 | 0 | tp_id = DEFAULT_TP_ID; |
8161 | 0 | } |
8162 | 0 | break; |
8163 | 0 | case OVS_CT_ATTR_NAT: { |
8164 | 0 | const struct nlattr *b_nest; |
8165 | 0 | unsigned int left_nest; |
8166 | 0 | bool ip_min_specified = false; |
8167 | 0 | bool proto_num_min_specified = false; |
8168 | 0 | bool ip_max_specified = false; |
8169 | 0 | bool proto_num_max_specified = false; |
8170 | 0 | memset(&nat_action_info, 0, sizeof nat_action_info); |
8171 | 0 | nat_action_info_ref = &nat_action_info; |
8172 | |
|
8173 | 0 | NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) { |
8174 | 0 | enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest); |
8175 | |
|
8176 | 0 | switch (sub_type_nest) { |
8177 | 0 | case OVS_NAT_ATTR_SRC: |
8178 | 0 | case OVS_NAT_ATTR_DST: |
8179 | 0 | nat_config = true; |
8180 | 0 | nat_action_info.nat_action |= |
8181 | 0 | ((sub_type_nest == OVS_NAT_ATTR_SRC) |
8182 | 0 | ? NAT_ACTION_SRC : NAT_ACTION_DST); |
8183 | 0 | break; |
8184 | 0 | case OVS_NAT_ATTR_IP_MIN: |
8185 | 0 | memcpy(&nat_action_info.min_addr, |
8186 | 0 | nl_attr_get(b_nest), |
8187 | 0 | nl_attr_get_size(b_nest)); |
8188 | 0 | ip_min_specified = true; |
8189 | 0 | break; |
8190 | 0 | case OVS_NAT_ATTR_IP_MAX: |
8191 | 0 | memcpy(&nat_action_info.max_addr, |
8192 | 0 | nl_attr_get(b_nest), |
8193 | 0 | nl_attr_get_size(b_nest)); |
8194 | 0 | ip_max_specified = true; |
8195 | 0 | break; |
8196 | 0 | case OVS_NAT_ATTR_PROTO_MIN: |
8197 | 0 | nat_action_info.min_port = |
8198 | 0 | nl_attr_get_u16(b_nest); |
8199 | 0 | proto_num_min_specified = true; |
8200 | 0 | break; |
8201 | 0 | case OVS_NAT_ATTR_PROTO_MAX: |
8202 | 0 | nat_action_info.max_port = |
8203 | 0 | nl_attr_get_u16(b_nest); |
8204 | 0 | proto_num_max_specified = true; |
8205 | 0 | break; |
8206 | 0 | case OVS_NAT_ATTR_PROTO_RANDOM: |
8207 | 0 | nat_action_info.nat_flags |= NAT_RANGE_RANDOM; |
8208 | 0 | break; |
8209 | 0 | case OVS_NAT_ATTR_PERSISTENT: |
8210 | 0 | nat_action_info.nat_flags |= NAT_PERSISTENT; |
8211 | 0 | break; |
8212 | 0 | case OVS_NAT_ATTR_PROTO_HASH: |
8213 | 0 | break; |
8214 | 0 | case OVS_NAT_ATTR_UNSPEC: |
8215 | 0 | case __OVS_NAT_ATTR_MAX: |
8216 | 0 | OVS_NOT_REACHED(); |
8217 | 0 | } |
8218 | 0 | } |
8219 | | |
8220 | 0 | if (ip_min_specified && !ip_max_specified) { |
8221 | 0 | nat_action_info.max_addr = nat_action_info.min_addr; |
8222 | 0 | } |
8223 | 0 | if (proto_num_min_specified && !proto_num_max_specified) { |
8224 | 0 | nat_action_info.max_port = nat_action_info.min_port; |
8225 | 0 | } |
8226 | 0 | if (proto_num_min_specified || proto_num_max_specified) { |
8227 | 0 | if (nat_action_info.nat_action & NAT_ACTION_SRC) { |
8228 | 0 | nat_action_info.nat_action |= NAT_ACTION_SRC_PORT; |
8229 | 0 | } else if (nat_action_info.nat_action & NAT_ACTION_DST) { |
8230 | 0 | nat_action_info.nat_action |= NAT_ACTION_DST_PORT; |
8231 | 0 | } |
8232 | 0 | } |
8233 | 0 | break; |
8234 | 0 | } |
8235 | 0 | case OVS_CT_ATTR_UNSPEC: |
8236 | 0 | case __OVS_CT_ATTR_MAX: |
8237 | 0 | OVS_NOT_REACHED(); |
8238 | 0 | } |
8239 | 0 | } |
8240 | | |
8241 | | /* We won't be able to function properly in this case, hence |
8242 | | * complain loudly. */ |
8243 | 0 | if (nat_config && !commit) { |
8244 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); |
8245 | 0 | VLOG_WARN_RL(&rl, "NAT specified without commit."); |
8246 | 0 | } |
8247 | |
|
8248 | 0 | conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force, |
8249 | 0 | commit, zone, setmark, setlabel, helper, |
8250 | 0 | nat_action_info_ref, pmd->ctx.now / 1000, tp_id); |
8251 | 0 | break; |
8252 | 0 | } |
8253 | | |
8254 | 0 | case OVS_ACTION_ATTR_METER: |
8255 | 0 | dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a), |
8256 | 0 | pmd->ctx.now / 1000); |
8257 | 0 | break; |
8258 | | |
8259 | 0 | case OVS_ACTION_ATTR_PUSH_VLAN: |
8260 | 0 | case OVS_ACTION_ATTR_POP_VLAN: |
8261 | 0 | case OVS_ACTION_ATTR_PUSH_MPLS: |
8262 | 0 | case OVS_ACTION_ATTR_POP_MPLS: |
8263 | 0 | case OVS_ACTION_ATTR_SET: |
8264 | 0 | case OVS_ACTION_ATTR_SET_MASKED: |
8265 | 0 | case OVS_ACTION_ATTR_SAMPLE: |
8266 | 0 | case OVS_ACTION_ATTR_HASH: |
8267 | 0 | case OVS_ACTION_ATTR_UNSPEC: |
8268 | 0 | case OVS_ACTION_ATTR_TRUNC: |
8269 | 0 | case OVS_ACTION_ATTR_PUSH_ETH: |
8270 | 0 | case OVS_ACTION_ATTR_POP_ETH: |
8271 | 0 | case OVS_ACTION_ATTR_CLONE: |
8272 | 0 | case OVS_ACTION_ATTR_PUSH_NSH: |
8273 | 0 | case OVS_ACTION_ATTR_POP_NSH: |
8274 | 0 | case OVS_ACTION_ATTR_CT_CLEAR: |
8275 | 0 | case OVS_ACTION_ATTR_CHECK_PKT_LEN: |
8276 | 0 | case OVS_ACTION_ATTR_DROP: |
8277 | 0 | case OVS_ACTION_ATTR_ADD_MPLS: |
8278 | 0 | case OVS_ACTION_ATTR_DEC_TTL: |
8279 | 0 | case OVS_ACTION_ATTR_PSAMPLE: |
8280 | 0 | case __OVS_ACTION_ATTR_MAX: |
8281 | 0 | OVS_NOT_REACHED(); |
8282 | 0 | } |
8283 | | |
8284 | 0 | dp_packet_delete_batch(packets_, should_steal); |
8285 | 0 | } |
8286 | | |
8287 | | static void |
8288 | | dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd, |
8289 | | struct dp_packet_batch *packets, |
8290 | | bool should_steal, const struct flow *flow, |
8291 | | const struct nlattr *actions, size_t actions_len) |
8292 | 0 | { |
8293 | 0 | struct dp_netdev_execute_aux aux = { pmd, flow }; |
8294 | |
|
8295 | 0 | odp_execute_actions(&aux, packets, should_steal, actions, |
8296 | 0 | actions_len, dp_execute_cb); |
8297 | 0 | } |
8298 | | |
8299 | | struct dp_netdev_ct_dump { /* State of one conntrack (or expectation) dump; allocated by the *_dump_start() functions and freed by the *_dump_done() functions. */ |
8300 | | struct ct_dpif_dump_state up; /* Embedded generic state; callers hold a pointer to this member and INIT_CONTAINER() recovers the enclosing struct. */ |
8301 | | struct conntrack_dump dump; /* Iterator state passed to the conntrack_*dump_start/next/done() calls. */ |
8302 | | struct conntrack *ct; /* Set to the datapath's 'conntrack' when the dump starts. */ |
8303 | | struct dp_netdev *dp; /* Datapath the dump was started on. */ |
8304 | | }; |
8305 | | |
8306 | | static int |
8307 | | dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_, |
8308 | | const uint16_t *pzone, int *ptot_bkts) |
8309 | 0 | { |
8310 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8311 | 0 | struct dp_netdev_ct_dump *dump; |
8312 | |
|
8313 | 0 | dump = xzalloc(sizeof *dump); |
8314 | 0 | dump->dp = dp; |
8315 | 0 | dump->ct = dp->conntrack; |
8316 | |
|
8317 | 0 | conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts); |
8318 | |
|
8319 | 0 | *dump_ = &dump->up; |
8320 | |
|
8321 | 0 | return 0; |
8322 | 0 | } |
8323 | | |
8324 | | static int |
8325 | | dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED, |
8326 | | struct ct_dpif_dump_state *dump_, |
8327 | | struct ct_dpif_entry *entry) |
8328 | 0 | { |
8329 | 0 | struct dp_netdev_ct_dump *dump; |
8330 | |
|
8331 | 0 | INIT_CONTAINER(dump, dump_, up); |
8332 | |
|
8333 | 0 | return conntrack_dump_next(&dump->dump, entry); |
8334 | 0 | } |
8335 | | |
8336 | | static int |
8337 | | dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED, |
8338 | | struct ct_dpif_dump_state *dump_) |
8339 | 0 | { |
8340 | 0 | struct dp_netdev_ct_dump *dump; |
8341 | 0 | int err; |
8342 | |
|
8343 | 0 | INIT_CONTAINER(dump, dump_, up); |
8344 | |
|
8345 | 0 | err = conntrack_dump_done(&dump->dump); |
8346 | |
|
8347 | 0 | free(dump); |
8348 | |
|
8349 | 0 | return err; |
8350 | 0 | } |
8351 | | |
8352 | | static int |
8353 | | dpif_netdev_ct_exp_dump_start(struct dpif *dpif, |
8354 | | struct ct_dpif_dump_state **dump_, |
8355 | | const uint16_t *pzone) |
8356 | 0 | { |
8357 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8358 | 0 | struct dp_netdev_ct_dump *dump; |
8359 | |
|
8360 | 0 | dump = xzalloc(sizeof *dump); |
8361 | 0 | dump->dp = dp; |
8362 | 0 | dump->ct = dp->conntrack; |
8363 | |
|
8364 | 0 | conntrack_exp_dump_start(dp->conntrack, &dump->dump, pzone); |
8365 | |
|
8366 | 0 | *dump_ = &dump->up; |
8367 | |
|
8368 | 0 | return 0; |
8369 | 0 | } |
8370 | | |
8371 | | static int |
8372 | | dpif_netdev_ct_exp_dump_next(struct dpif *dpif OVS_UNUSED, |
8373 | | struct ct_dpif_dump_state *dump_, |
8374 | | struct ct_dpif_exp *entry) |
8375 | 0 | { |
8376 | 0 | struct dp_netdev_ct_dump *dump; |
8377 | |
|
8378 | 0 | INIT_CONTAINER(dump, dump_, up); |
8379 | |
|
8380 | 0 | return conntrack_exp_dump_next(&dump->dump, entry); |
8381 | 0 | } |
8382 | | |
8383 | | static int |
8384 | | dpif_netdev_ct_exp_dump_done(struct dpif *dpif OVS_UNUSED, |
8385 | | struct ct_dpif_dump_state *dump_) |
8386 | 0 | { |
8387 | 0 | struct dp_netdev_ct_dump *dump; |
8388 | 0 | int err; |
8389 | |
|
8390 | 0 | INIT_CONTAINER(dump, dump_, up); |
8391 | |
|
8392 | 0 | err = conntrack_exp_dump_done(&dump->dump); |
8393 | |
|
8394 | 0 | free(dump); |
8395 | |
|
8396 | 0 | return err; |
8397 | 0 | } |
8398 | | |
8399 | | static int |
8400 | | dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone, |
8401 | | const struct ct_dpif_tuple *tuple) |
8402 | 0 | { |
8403 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8404 | |
|
8405 | 0 | if (tuple) { |
8406 | 0 | return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0); |
8407 | 0 | } |
8408 | 0 | return conntrack_flush(dp->conntrack, zone); |
8409 | 0 | } |
8410 | | |
8411 | | static int |
8412 | | dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns) |
8413 | 0 | { |
8414 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8415 | |
|
8416 | 0 | return conntrack_set_maxconns(dp->conntrack, maxconns); |
8417 | 0 | } |
8418 | | |
8419 | | static int |
8420 | | dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns) |
8421 | 0 | { |
8422 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8423 | |
|
8424 | 0 | return conntrack_get_maxconns(dp->conntrack, maxconns); |
8425 | 0 | } |
8426 | | |
8427 | | static int |
8428 | | dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns) |
8429 | 0 | { |
8430 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8431 | |
|
8432 | 0 | return conntrack_get_nconns(dp->conntrack, nconns); |
8433 | 0 | } |
8434 | | |
8435 | | static int |
8436 | | dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled) |
8437 | 0 | { |
8438 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8439 | |
|
8440 | 0 | return conntrack_set_tcp_seq_chk(dp->conntrack, enabled); |
8441 | 0 | } |
8442 | | |
8443 | | static int |
8444 | | dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled) |
8445 | 0 | { |
8446 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8447 | 0 | *enabled = conntrack_get_tcp_seq_chk(dp->conntrack); |
8448 | 0 | return 0; |
8449 | 0 | } |
8450 | | |
8451 | | static int |
8452 | | dpif_netdev_ct_set_sweep_interval(struct dpif *dpif, uint32_t ms) |
8453 | 0 | { |
8454 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8455 | 0 | return conntrack_set_sweep_interval(dp->conntrack, ms); |
8456 | 0 | } |
8457 | | |
8458 | | static int |
8459 | | dpif_netdev_ct_get_sweep_interval(struct dpif *dpif, uint32_t *ms) |
8460 | 0 | { |
8461 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8462 | 0 | *ms = conntrack_get_sweep_interval(dp->conntrack); |
8463 | 0 | return 0; |
8464 | 0 | } |
8465 | | |
8466 | | static int |
8467 | | dpif_netdev_ct_set_limits(struct dpif *dpif, |
8468 | | const struct ovs_list *zone_limits) |
8469 | 0 | { |
8470 | 0 | int err = 0; |
8471 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8472 | |
|
8473 | 0 | struct ct_dpif_zone_limit *zone_limit; |
8474 | 0 | LIST_FOR_EACH (zone_limit, node, zone_limits) { |
8475 | 0 | err = zone_limit_update(dp->conntrack, zone_limit->zone, |
8476 | 0 | zone_limit->limit); |
8477 | 0 | if (err != 0) { |
8478 | 0 | break; |
8479 | 0 | } |
8480 | 0 | } |
8481 | 0 | return err; |
8482 | 0 | } |
8483 | | |
8484 | | static int |
8485 | | dpif_netdev_ct_get_limits(struct dpif *dpif, |
8486 | | const struct ovs_list *zone_limits_request, |
8487 | | struct ovs_list *zone_limits_reply) |
8488 | 0 | { |
8489 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8490 | 0 | struct conntrack_zone_info czl; |
8491 | |
|
8492 | 0 | if (!ovs_list_is_empty(zone_limits_request)) { |
8493 | 0 | struct ct_dpif_zone_limit *zone_limit; |
8494 | 0 | LIST_FOR_EACH (zone_limit, node, zone_limits_request) { |
8495 | 0 | czl = zone_limit_get(dp->conntrack, zone_limit->zone); |
8496 | 0 | if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) { |
8497 | 0 | ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone, |
8498 | 0 | czl.limit, |
8499 | 0 | czl.count); |
8500 | 0 | } else { |
8501 | 0 | return EINVAL; |
8502 | 0 | } |
8503 | 0 | } |
8504 | 0 | } else { |
8505 | 0 | czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE); |
8506 | 0 | if (czl.zone == DEFAULT_ZONE) { |
8507 | 0 | ct_dpif_push_zone_limit(zone_limits_reply, DEFAULT_ZONE, |
8508 | 0 | czl.limit, 0); |
8509 | 0 | } |
8510 | |
|
8511 | 0 | for (int z = MIN_ZONE; z <= MAX_ZONE; z++) { |
8512 | 0 | czl = zone_limit_get(dp->conntrack, z); |
8513 | 0 | if (czl.zone == z) { |
8514 | 0 | ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit, |
8515 | 0 | czl.count); |
8516 | 0 | } |
8517 | 0 | } |
8518 | 0 | } |
8519 | | |
8520 | 0 | return 0; |
8521 | 0 | } |
8522 | | |
8523 | | static int |
8524 | | dpif_netdev_ct_del_limits(struct dpif *dpif, |
8525 | | const struct ovs_list *zone_limits) |
8526 | 0 | { |
8527 | 0 | int err = 0; |
8528 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8529 | 0 | struct ct_dpif_zone_limit *zone_limit; |
8530 | 0 | LIST_FOR_EACH (zone_limit, node, zone_limits) { |
8531 | 0 | err = zone_limit_delete(dp->conntrack, zone_limit->zone); |
8532 | 0 | if (err != 0) { |
8533 | 0 | break; |
8534 | 0 | } |
8535 | 0 | } |
8536 | |
|
8537 | 0 | return err; |
8538 | 0 | } |
8539 | | |
8540 | | static int |
8541 | | dpif_netdev_ct_get_features(struct dpif *dpif OVS_UNUSED, |
8542 | | enum ct_features *features) |
8543 | 0 | { |
8544 | 0 | if (features != NULL) { |
8545 | 0 | *features = CONNTRACK_F_ZERO_SNAT; |
8546 | 0 | } |
8547 | 0 | return 0; |
8548 | 0 | } |
8549 | | |
8550 | | static int |
8551 | | dpif_netdev_ct_set_timeout_policy(struct dpif *dpif, |
8552 | | const struct ct_dpif_timeout_policy *dpif_tp) |
8553 | 0 | { |
8554 | 0 | struct timeout_policy tp; |
8555 | 0 | struct dp_netdev *dp; |
8556 | |
|
8557 | 0 | dp = get_dp_netdev(dpif); |
8558 | 0 | memcpy(&tp.policy, dpif_tp, sizeof tp.policy); |
8559 | 0 | return timeout_policy_update(dp->conntrack, &tp); |
8560 | 0 | } |
8561 | | |
8562 | | static int |
8563 | | dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id, |
8564 | | struct ct_dpif_timeout_policy *dpif_tp) |
8565 | 0 | { |
8566 | 0 | struct timeout_policy *tp; |
8567 | 0 | struct dp_netdev *dp; |
8568 | 0 | int err = 0; |
8569 | |
|
8570 | 0 | dp = get_dp_netdev(dpif); |
8571 | 0 | tp = timeout_policy_get(dp->conntrack, tp_id); |
8572 | 0 | if (!tp) { |
8573 | 0 | return ENOENT; |
8574 | 0 | } |
8575 | 0 | memcpy(dpif_tp, &tp->policy, sizeof tp->policy); |
8576 | 0 | return err; |
8577 | 0 | } |
8578 | | |
8579 | | static int |
8580 | | dpif_netdev_ct_del_timeout_policy(struct dpif *dpif, |
8581 | | uint32_t tp_id) |
8582 | 0 | { |
8583 | 0 | struct dp_netdev *dp; |
8584 | 0 | int err = 0; |
8585 | |
|
8586 | 0 | dp = get_dp_netdev(dpif); |
8587 | 0 | err = timeout_policy_delete(dp->conntrack, tp_id); |
8588 | 0 | return err; |
8589 | 0 | } |
8590 | | |
8591 | | static int |
8592 | | dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED, |
8593 | | uint32_t tp_id, |
8594 | | uint16_t dl_type OVS_UNUSED, |
8595 | | uint8_t nw_proto OVS_UNUSED, |
8596 | | char **tp_name, bool *is_generic) |
8597 | 0 | { |
8598 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
8599 | |
|
8600 | 0 | ds_put_format(&ds, "%"PRIu32, tp_id); |
8601 | 0 | *tp_name = ds_steal_cstr(&ds); |
8602 | 0 | *is_generic = true; |
8603 | 0 | return 0; |
8604 | 0 | } |
8605 | | |
8606 | | static int |
8607 | | dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable) |
8608 | 0 | { |
8609 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8610 | 0 | return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable); |
8611 | 0 | } |
8612 | | |
8613 | | static int |
8614 | | dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag) |
8615 | 0 | { |
8616 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8617 | 0 | return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag); |
8618 | 0 | } |
8619 | | |
8620 | | static int |
8621 | | dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags) |
8622 | 0 | { |
8623 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8624 | 0 | return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags); |
8625 | 0 | } |
8626 | | |
8627 | | /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to |
8628 | | * diverge. */ |
8629 | | static int |
8630 | | dpif_netdev_ipf_get_status(struct dpif *dpif, |
8631 | | struct dpif_ipf_status *dpif_ipf_status) |
8632 | 0 | { |
8633 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8634 | 0 | ipf_get_status(conntrack_ipf_ctx(dp->conntrack), |
8635 | 0 | (struct ipf_status *) dpif_ipf_status); |
8636 | 0 | return 0; |
8637 | 0 | } |
8638 | | |
8639 | | static int |
8640 | | dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED, |
8641 | | struct ipf_dump_ctx **ipf_dump_ctx) |
8642 | 0 | { |
8643 | 0 | return ipf_dump_start(ipf_dump_ctx); |
8644 | 0 | } |
8645 | | |
8646 | | static int |
8647 | | dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump) |
8648 | 0 | { |
8649 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8650 | 0 | return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx, |
8651 | 0 | dump); |
8652 | 0 | } |
8653 | | |
8654 | | static int |
8655 | | dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx) |
8656 | 0 | { |
8657 | 0 | return ipf_dump_done(ipf_dump_ctx); |
8658 | |
|
8659 | 0 | } |
8660 | | |
8661 | | static int |
8662 | | dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id, |
8663 | | odp_port_t *member_map) |
8664 | 0 | { |
8665 | 0 | struct tx_bond *new_tx = xzalloc(sizeof *new_tx); |
8666 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8667 | 0 | struct dp_netdev_pmd_thread *pmd; |
8668 | | |
8669 | | /* Prepare new bond mapping. */ |
8670 | 0 | new_tx->bond_id = bond_id; |
8671 | 0 | for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) { |
8672 | 0 | new_tx->member_buckets[bucket].member_id = member_map[bucket]; |
8673 | 0 | } |
8674 | |
|
8675 | 0 | ovs_mutex_lock(&dp->bond_mutex); |
8676 | | /* Check if bond already existed. */ |
8677 | 0 | struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id); |
8678 | 0 | if (old_tx) { |
8679 | 0 | cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node, |
8680 | 0 | hash_bond_id(bond_id)); |
8681 | 0 | ovsrcu_postpone(free, old_tx); |
8682 | 0 | } else { |
8683 | 0 | cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id)); |
8684 | 0 | } |
8685 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
8686 | | |
8687 | | /* Update all PMDs with new bond mapping. */ |
8688 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
8689 | 0 | dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true); |
8690 | 0 | } |
8691 | 0 | return 0; |
8692 | 0 | } |
8693 | | |
8694 | | static int |
8695 | | dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id) |
8696 | 0 | { |
8697 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8698 | 0 | struct dp_netdev_pmd_thread *pmd; |
8699 | 0 | struct tx_bond *tx; |
8700 | |
|
8701 | 0 | ovs_mutex_lock(&dp->bond_mutex); |
8702 | | /* Check if bond existed. */ |
8703 | 0 | tx = tx_bond_lookup(&dp->tx_bonds, bond_id); |
8704 | 0 | if (tx) { |
8705 | 0 | cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id)); |
8706 | 0 | ovsrcu_postpone(free, tx); |
8707 | 0 | } else { |
8708 | | /* Bond is not present. */ |
8709 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
8710 | 0 | return ENOENT; |
8711 | 0 | } |
8712 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
8713 | | |
8714 | | /* Remove the bond map in all pmds. */ |
8715 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
8716 | 0 | dp_netdev_del_bond_tx_from_pmd(pmd, bond_id); |
8717 | 0 | } |
8718 | 0 | return 0; |
8719 | 0 | } |
8720 | | |
8721 | | static int |
8722 | | dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id, |
8723 | | uint64_t *n_bytes) |
8724 | 0 | { |
8725 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8726 | 0 | struct dp_netdev_pmd_thread *pmd; |
8727 | |
|
8728 | 0 | if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) { |
8729 | 0 | return ENOENT; |
8730 | 0 | } |
8731 | | |
8732 | | /* Search the bond in all PMDs. */ |
8733 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
8734 | 0 | struct tx_bond *pmd_bond_entry |
8735 | 0 | = tx_bond_lookup(&pmd->tx_bonds, bond_id); |
8736 | |
|
8737 | 0 | if (!pmd_bond_entry) { |
8738 | 0 | continue; |
8739 | 0 | } |
8740 | | |
8741 | | /* Read bond stats. */ |
8742 | 0 | for (int i = 0; i < BOND_BUCKETS; i++) { |
8743 | 0 | uint64_t pmd_n_bytes; |
8744 | |
|
8745 | 0 | atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes, |
8746 | 0 | &pmd_n_bytes); |
8747 | 0 | n_bytes[i] += pmd_n_bytes; |
8748 | 0 | } |
8749 | 0 | } |
8750 | 0 | return 0; |
8751 | 0 | } |
8752 | | |
/* Provider class for the "netdev" datapath type.
 *
 * This is a positional initializer, so the entries must stay in the same
 * order as the members of 'struct dpif_class' (declared outside this file).
 * NULL entries are callbacks this provider leaves unset; each is labelled
 * with the member it stands for.  dpif_dummy_register__() copies this
 * structure to create the dummy datapath classes. */
const struct dpif_class dpif_netdev_class = {
    "netdev",
    true,                       /* cleanup_required */
    dpif_netdev_init,
    dpif_netdev_enumerate,
    dpif_netdev_port_open_type,
    dpif_netdev_open,
    dpif_netdev_close,
    dpif_netdev_destroy,
    dpif_netdev_run,
    dpif_netdev_wait,
    dpif_netdev_get_stats,
    NULL,                       /* set_features */
    NULL,                       /* get_features */
    /* Ports. */
    dpif_netdev_port_add,
    dpif_netdev_port_del,
    dpif_netdev_port_set_config,
    dpif_netdev_port_query_by_number,
    dpif_netdev_port_query_by_name,
    NULL,                       /* port_get_pid */
    dpif_netdev_port_dump_start,
    dpif_netdev_port_dump_next,
    dpif_netdev_port_dump_done,
    dpif_netdev_port_poll,
    dpif_netdev_port_poll_wait,
    /* Flows. */
    dpif_netdev_flow_flush,
    dpif_netdev_flow_dump_create,
    dpif_netdev_flow_dump_destroy,
    dpif_netdev_flow_dump_thread_create,
    dpif_netdev_flow_dump_thread_destroy,
    dpif_netdev_flow_dump_next,
    dpif_netdev_operate,
    NULL,                       /* recv_set */
    NULL,                       /* handlers_set */
    dpif_netdev_number_handlers_required,
    dpif_netdev_set_config,
    dpif_netdev_queue_to_priority,
    NULL,                       /* recv */
    NULL,                       /* recv_wait */
    NULL,                       /* recv_purge */
    dpif_netdev_register_dp_purge_cb,
    dpif_netdev_register_upcall_cb,
    dpif_netdev_enable_upcall,
    dpif_netdev_disable_upcall,
    dpif_netdev_get_datapath_version,
    /* Connection tracking. */
    dpif_netdev_ct_dump_start,
    dpif_netdev_ct_dump_next,
    dpif_netdev_ct_dump_done,
    dpif_netdev_ct_exp_dump_start,
    dpif_netdev_ct_exp_dump_next,
    dpif_netdev_ct_exp_dump_done,
    dpif_netdev_ct_flush,
    dpif_netdev_ct_set_maxconns,
    dpif_netdev_ct_get_maxconns,
    dpif_netdev_ct_get_nconns,
    dpif_netdev_ct_set_tcp_seq_chk,
    dpif_netdev_ct_get_tcp_seq_chk,
    dpif_netdev_ct_set_sweep_interval,
    dpif_netdev_ct_get_sweep_interval,
    dpif_netdev_ct_set_limits,
    dpif_netdev_ct_get_limits,
    dpif_netdev_ct_del_limits,
    dpif_netdev_ct_set_timeout_policy,
    dpif_netdev_ct_get_timeout_policy,
    dpif_netdev_ct_del_timeout_policy,
    NULL,                       /* ct_timeout_policy_dump_start */
    NULL,                       /* ct_timeout_policy_dump_next */
    NULL,                       /* ct_timeout_policy_dump_done */
    dpif_netdev_ct_get_timeout_policy_name,
    dpif_netdev_ct_get_features,
    /* IP fragmentation. */
    dpif_netdev_ipf_set_enabled,
    dpif_netdev_ipf_set_min_frag,
    dpif_netdev_ipf_set_max_nfrags,
    dpif_netdev_ipf_get_status,
    dpif_netdev_ipf_dump_start,
    dpif_netdev_ipf_dump_next,
    dpif_netdev_ipf_dump_done,
    /* Meters. */
    dpif_netdev_meter_get_features,
    dpif_netdev_meter_set,
    dpif_netdev_meter_get,
    dpif_netdev_meter_del,
    /* Bonds. */
    dpif_netdev_bond_add,
    dpif_netdev_bond_del,
    dpif_netdev_bond_stats_get,
    NULL,                       /* cache_get_supported_levels */
    NULL,                       /* cache_get_name */
    NULL,                       /* cache_get_size */
    NULL,                       /* cache_set_size */
};
8842 | | |
8843 | | static void |
8844 | | dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED, |
8845 | | const char *argv[], void *aux OVS_UNUSED) |
8846 | 0 | { |
8847 | 0 | struct dp_netdev_port *port; |
8848 | 0 | struct dp_netdev *dp; |
8849 | 0 | odp_port_t port_no; |
8850 | |
|
8851 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
8852 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
8853 | 0 | if (!dp || !dpif_netdev_class_is_dummy(dp->class)) { |
8854 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
8855 | 0 | unixctl_command_reply_error(conn, "unknown datapath or not a dummy"); |
8856 | 0 | return; |
8857 | 0 | } |
8858 | 0 | ovs_refcount_ref(&dp->ref_cnt); |
8859 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
8860 | |
|
8861 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
8862 | 0 | if (get_port_by_name(dp, argv[2], &port)) { |
8863 | 0 | unixctl_command_reply_error(conn, "unknown port"); |
8864 | 0 | goto exit; |
8865 | 0 | } |
8866 | | |
8867 | 0 | port_no = u32_to_odp(atoi(argv[3])); |
8868 | 0 | if (!port_no || port_no == ODPP_NONE) { |
8869 | 0 | unixctl_command_reply_error(conn, "bad port number"); |
8870 | 0 | goto exit; |
8871 | 0 | } |
8872 | 0 | if (dp_netdev_lookup_port(dp, port_no)) { |
8873 | 0 | unixctl_command_reply_error(conn, "port number already in use"); |
8874 | 0 | goto exit; |
8875 | 0 | } |
8876 | | |
8877 | | /* Remove port. */ |
8878 | 0 | hmap_remove(&dp->ports, &port->node); |
8879 | 0 | reconfigure_datapath(dp); |
8880 | | |
8881 | | /* Reinsert with new port number. */ |
8882 | 0 | port->port_no = port_no; |
8883 | 0 | hmap_insert(&dp->ports, &port->node, hash_port_no(port_no)); |
8884 | 0 | reconfigure_datapath(dp); |
8885 | |
|
8886 | 0 | seq_change(dp->port_seq); |
8887 | 0 | unixctl_command_reply(conn, NULL); |
8888 | |
|
8889 | 0 | exit: |
8890 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
8891 | 0 | dp_netdev_unref(dp); |
8892 | 0 | } |
8893 | | |
8894 | | static void |
8895 | | dpif_dummy_register__(const char *type) |
8896 | 0 | { |
8897 | 0 | struct dpif_class *class; |
8898 | |
|
8899 | 0 | class = xmalloc(sizeof *class); |
8900 | 0 | *class = dpif_netdev_class; |
8901 | 0 | class->type = xstrdup(type); |
8902 | 0 | dp_register_provider(class); |
8903 | 0 | } |
8904 | | |
/* Replaces the datapath provider 'type' with a dummy one: unregisters the
 * existing provider and, if that worked, registers the dummy in its place.
 *
 * EAFNOSUPPORT from the unregistration is tolerated so that
 * --enable-dummy=system also works with a userland-only build, which is
 * useful for the testsuite.  Any other error leaves things unchanged. */
static void
dpif_dummy_override(const char *type)
{
    int error = dp_unregister_provider(type);

    if (error && error != EAFNOSUPPORT) {
        return;
    }

    dpif_dummy_register__(type);
}
8919 | | |
8920 | | void |
8921 | | dpif_dummy_register(enum dummy_level level) |
8922 | 0 | { |
8923 | 0 | if (level == DUMMY_OVERRIDE_ALL) { |
8924 | 0 | struct sset types; |
8925 | 0 | const char *type; |
8926 | |
|
8927 | 0 | sset_init(&types); |
8928 | 0 | dp_enumerate_types(&types); |
8929 | 0 | SSET_FOR_EACH (type, &types) { |
8930 | 0 | dpif_dummy_override(type); |
8931 | 0 | } |
8932 | 0 | sset_destroy(&types); |
8933 | 0 | } else if (level == DUMMY_OVERRIDE_SYSTEM) { |
8934 | 0 | dpif_dummy_override("system"); |
8935 | 0 | } |
8936 | |
|
8937 | 0 | dpif_dummy_register__("dummy"); |
8938 | |
|
8939 | 0 | unixctl_command_register("dpif-dummy/change-port-number", |
8940 | 0 | "dp port new-number", |
8941 | 0 | 3, 3, dpif_dummy_change_port_number, NULL); |
8942 | 0 | } |
8943 | | |
8944 | | /* Datapath Classifier. */ |
8945 | | |
8946 | | static void |
8947 | | dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable) |
8948 | 0 | { |
8949 | 0 | cmap_destroy(&subtable->rules); |
8950 | 0 | ovsrcu_postpone(free, subtable->mf_masks); |
8951 | 0 | ovsrcu_postpone(free, subtable); |
8952 | 0 | } |
8953 | | |
8954 | | /* Initializes 'cls' as a classifier that initially contains no classification |
8955 | | * rules. */ |
8956 | | static void |
8957 | | dpcls_init(struct dpcls *cls) |
8958 | 0 | { |
8959 | 0 | cmap_init(&cls->subtables_map); |
8960 | 0 | pvector_init(&cls->subtables); |
8961 | 0 | } |
8962 | | |
8963 | | static void |
8964 | | dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable) |
8965 | 0 | { |
8966 | 0 | VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port); |
8967 | 0 | pvector_remove(&cls->subtables, subtable); |
8968 | 0 | cmap_remove(&cls->subtables_map, &subtable->cmap_node, |
8969 | 0 | subtable->mask.hash); |
8970 | 0 | ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable); |
8971 | 0 | } |
8972 | | |
8973 | | /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the |
8974 | | * caller's responsibility. |
8975 | | * May only be called after all the readers have been terminated. */ |
8976 | | static void |
8977 | | dpcls_destroy(struct dpcls *cls) |
8978 | 0 | { |
8979 | 0 | if (cls) { |
8980 | 0 | struct dpcls_subtable *subtable; |
8981 | |
|
8982 | 0 | CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) { |
8983 | 0 | ovs_assert(cmap_count(&subtable->rules) == 0); |
8984 | 0 | dpcls_destroy_subtable(cls, subtable); |
8985 | 0 | } |
8986 | 0 | cmap_destroy(&cls->subtables_map); |
8987 | 0 | pvector_destroy(&cls->subtables); |
8988 | 0 | } |
8989 | 0 | } |
8990 | | |
8991 | | static struct dpcls_subtable * |
8992 | | dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask) |
8993 | 0 | { |
8994 | 0 | struct dpcls_subtable *subtable; |
8995 | | |
8996 | | /* Need to add one. */ |
8997 | 0 | subtable = xmalloc(sizeof *subtable |
8998 | 0 | - sizeof subtable->mask.mf + mask->len); |
8999 | 0 | cmap_init(&subtable->rules); |
9000 | 0 | subtable->hit_cnt = 0; |
9001 | 0 | netdev_flow_key_clone(&subtable->mask, mask); |
9002 | | |
9003 | | /* The count of bits in the mask defines the space required for masks. |
9004 | | * Then call gen_masks() to create the appropriate masks, avoiding the cost |
9005 | | * of doing runtime calculations. */ |
9006 | 0 | uint32_t unit0 = count_1bits(mask->mf.map.bits[0]); |
9007 | 0 | uint32_t unit1 = count_1bits(mask->mf.map.bits[1]); |
9008 | 0 | subtable->mf_bits_set_unit0 = unit0; |
9009 | 0 | subtable->mf_bits_set_unit1 = unit1; |
9010 | 0 | subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1)); |
9011 | 0 | dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1); |
9012 | | |
9013 | | /* Get the preferred subtable search function for this (u0,u1) subtable. |
9014 | | * The function is guaranteed to always return a valid implementation, and |
9015 | | * possibly a specialized implementation. */ |
9016 | 0 | subtable->lookup_func = dpcls_subtable_lookup_probe(unit0, unit1); |
9017 | |
|
9018 | 0 | cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash); |
9019 | | /* Add the new subtable at the end of the pvector (with no hits yet) */ |
9020 | 0 | pvector_insert(&cls->subtables, subtable, 0); |
9021 | 0 | VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d", |
9022 | 0 | cmap_count(&cls->subtables_map), subtable, cls->in_port); |
9023 | 0 | pvector_publish(&cls->subtables); |
9024 | |
|
9025 | 0 | return subtable; |
9026 | 0 | } |
9027 | | |
9028 | | static inline struct dpcls_subtable * |
9029 | | dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask) |
9030 | 0 | { |
9031 | 0 | struct dpcls_subtable *subtable; |
9032 | |
|
9033 | 0 | CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash, |
9034 | 0 | &cls->subtables_map) { |
9035 | 0 | if (netdev_flow_key_equal(&subtable->mask, mask)) { |
9036 | 0 | return subtable; |
9037 | 0 | } |
9038 | 0 | } |
9039 | 0 | return dpcls_create_subtable(cls, mask); |
9040 | 0 | } |
9041 | | |
9042 | | /* Periodically sort the dpcls subtable vectors according to hit counts */ |
9043 | | static void |
9044 | | dpcls_sort_subtable_vector(struct dpcls *cls) |
9045 | 0 | { |
9046 | 0 | struct pvector *pvec = &cls->subtables; |
9047 | 0 | struct dpcls_subtable *subtable; |
9048 | |
|
9049 | 0 | PVECTOR_FOR_EACH (subtable, pvec) { |
9050 | 0 | pvector_change_priority(pvec, subtable, subtable->hit_cnt); |
9051 | 0 | subtable->hit_cnt = 0; |
9052 | 0 | } |
9053 | 0 | pvector_publish(pvec); |
9054 | 0 | } |
9055 | | |
9056 | | static inline void |
9057 | | dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, |
9058 | | struct polled_queue *poll_list, int poll_cnt) |
9059 | 0 | { |
9060 | 0 | struct dpcls *cls; |
9061 | 0 | uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0; |
9062 | 0 | unsigned int pmd_load = 0; |
9063 | |
|
9064 | 0 | if (pmd->ctx.now > pmd->next_cycle_store) { |
9065 | 0 | uint64_t curr_tsc; |
9066 | 0 | uint8_t rebalance_load_trigger; |
9067 | 0 | struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb; |
9068 | 0 | unsigned int idx; |
9069 | |
|
9070 | 0 | if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >= |
9071 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_IDLE] && |
9072 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >= |
9073 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) { |
9074 | 0 | tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] - |
9075 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_IDLE]; |
9076 | 0 | tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] - |
9077 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_BUSY]; |
9078 | 0 | tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] - |
9079 | 0 | pmd->prev_stats[PMD_CYCLES_SLEEP]; |
9080 | |
|
9081 | 0 | if (pmd_alb->is_enabled && !pmd->isolated) { |
9082 | 0 | if (tot_proc) { |
9083 | 0 | pmd_load = ((tot_proc * 100) / |
9084 | 0 | (tot_idle + tot_proc + tot_sleep)); |
9085 | 0 | } |
9086 | |
|
9087 | 0 | atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, |
9088 | 0 | &rebalance_load_trigger); |
9089 | 0 | if (pmd_load >= rebalance_load_trigger) { |
9090 | 0 | atomic_count_inc(&pmd->pmd_overloaded); |
9091 | 0 | } else { |
9092 | 0 | atomic_count_set(&pmd->pmd_overloaded, 0); |
9093 | 0 | } |
9094 | 0 | } |
9095 | 0 | } |
9096 | |
|
9097 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_IDLE] = |
9098 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE]; |
9099 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_BUSY] = |
9100 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY]; |
9101 | 0 | pmd->prev_stats[PMD_CYCLES_SLEEP] = |
9102 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP]; |
9103 | | |
9104 | | /* Get the cycles that were used to process each queue and store. */ |
9105 | 0 | for (unsigned i = 0; i < poll_cnt; i++) { |
9106 | 0 | uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq, |
9107 | 0 | RXQ_CYCLES_PROC_CURR); |
9108 | 0 | dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr); |
9109 | 0 | dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, |
9110 | 0 | 0); |
9111 | 0 | } |
9112 | 0 | curr_tsc = cycles_counter_update(&pmd->perf_stats); |
9113 | 0 | if (pmd->intrvl_tsc_prev) { |
9114 | | /* There is a prev timestamp, store a new intrvl cycle count. */ |
9115 | 0 | atomic_store_relaxed(&pmd->intrvl_cycles, |
9116 | 0 | curr_tsc - pmd->intrvl_tsc_prev); |
9117 | 0 | } |
9118 | 0 | idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX; |
9119 | 0 | atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc); |
9120 | 0 | pmd->intrvl_tsc_prev = curr_tsc; |
9121 | | /* Start new measuring interval */ |
9122 | 0 | pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN; |
9123 | 0 | } |
9124 | |
|
9125 | 0 | if (pmd->ctx.now > pmd->next_optimization) { |
9126 | | /* Try to obtain the flow lock to block out revalidator threads. |
9127 | | * If not possible, just try next time. */ |
9128 | 0 | if (!ovs_mutex_trylock(&pmd->flow_mutex)) { |
9129 | | /* Optimize each classifier */ |
9130 | 0 | CMAP_FOR_EACH (cls, node, &pmd->classifiers) { |
9131 | 0 | dpcls_sort_subtable_vector(cls); |
9132 | 0 | } |
9133 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
9134 | | /* Start new measuring interval */ |
9135 | 0 | pmd->next_optimization = pmd->ctx.now |
9136 | 0 | + DPCLS_OPTIMIZATION_INTERVAL; |
9137 | 0 | } |
9138 | 0 | } |
9139 | 0 | } |
9140 | | |
9141 | | /* Returns the sum of a specified number of newest to |
9142 | | * oldest interval values. 'cur_idx' is where the next |
9143 | | * write will be and wrap around needs to be handled. |
9144 | | */ |
9145 | | static uint64_t |
9146 | | get_interval_values(atomic_ullong *source, atomic_count *cur_idx, |
9147 | 0 | int num_to_read) { |
9148 | 0 | unsigned int i; |
9149 | 0 | uint64_t total = 0; |
9150 | |
|
9151 | 0 | i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX; |
9152 | 0 | for (int read = 0; read < num_to_read; read++) { |
9153 | 0 | uint64_t interval_value; |
9154 | |
|
9155 | 0 | i = i ? i - 1 : PMD_INTERVAL_MAX - 1; |
9156 | 0 | atomic_read_relaxed(&source[i], &interval_value); |
9157 | 0 | total += interval_value; |
9158 | 0 | } |
9159 | 0 | return total; |
9160 | 0 | } |
9161 | | |
9162 | | /* Insert 'rule' into 'cls'. */ |
9163 | | static void |
9164 | | dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule, |
9165 | | const struct netdev_flow_key *mask) |
9166 | 0 | { |
9167 | 0 | struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask); |
9168 | | |
9169 | | /* Refer to subtable's mask, also for later removal. */ |
9170 | 0 | rule->mask = &subtable->mask; |
9171 | 0 | cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash); |
9172 | 0 | } |
9173 | | |
9174 | | /* Removes 'rule' from 'cls', also destructing the 'rule'. */ |
9175 | | static void |
9176 | | dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule) |
9177 | 0 | { |
9178 | 0 | struct dpcls_subtable *subtable; |
9179 | |
|
9180 | 0 | ovs_assert(rule->mask); |
9181 | | |
9182 | | /* Get subtable from reference in rule->mask. */ |
9183 | 0 | INIT_CONTAINER(subtable, rule->mask, mask); |
9184 | 0 | if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash) |
9185 | 0 | == 0) { |
9186 | | /* Delete empty subtable. */ |
9187 | 0 | dpcls_destroy_subtable(cls, subtable); |
9188 | 0 | pvector_publish(&cls->subtables); |
9189 | 0 | } |
9190 | 0 | } |
9191 | | |
/* Generates the masks for a single unit of a miniflow map; inner loop of
 * dpcls_flow_key_gen_masks().
 *
 * 'iter' is the unit's bitmap and 'count' is the number of entries to
 * produce, expected to equal the number of bits set in 'iter'.  For the i-th
 * set bit, counting from the least significant, mf_masks[i] receives a mask
 * with every bit below that bit set, i.e. (bit - 1).  'mf_masks' must have
 * room for 'count' entries.
 *
 * Asserts that 'count' iterations consumed every set bit of 'iter'. */
static inline void
dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count,
                             uint64_t *mf_masks)
{
    /* The index has the same type as 'count' so that the loop condition does
     * not compare a signed value against an unsigned one. */
    for (uint64_t i = 0; i < count; i++) {
        /* Isolate the lowest set bit; unsigned negation is well defined. */
        uint64_t lowest_bit = iter & -iter;

        iter &= ~lowest_bit;
        mf_masks[i] = lowest_bit - 1;
    }
    /* Checks that count has covered all bits in the iter bitmap. */
    ovs_assert(iter == 0);
}
9206 | | |
9207 | | /* Generate a mask for each block in the miniflow, based on the bits set. This |
9208 | | * allows easily masking packets with the generated array here, without |
9209 | | * calculations. This replaces runtime-calculating the masks. |
9210 | | * @param key The table to generate the mf_masks for |
9211 | | * @param mf_masks Pointer to a u64 array of at least *mf_bits* in size |
9212 | | * @param mf_bits_total Number of bits set in the whole miniflow (both units) |
9213 | | * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow |
9214 | | */ |
9215 | | void |
9216 | | dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl, |
9217 | | uint64_t *mf_masks, |
9218 | | const uint32_t mf_bits_u0, |
9219 | | const uint32_t mf_bits_u1) |
9220 | 0 | { |
9221 | 0 | uint64_t iter_u0 = tbl->mf.map.bits[0]; |
9222 | 0 | uint64_t iter_u1 = tbl->mf.map.bits[1]; |
9223 | |
|
9224 | 0 | dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]); |
9225 | 0 | dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]); |
9226 | 0 | } |
9227 | | |
9228 | | /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit |
9229 | | * in 'mask' the values in 'key' and 'target' are the same. */ |
9230 | | inline bool |
9231 | | dpcls_rule_matches_key(const struct dpcls_rule *rule, |
9232 | | const struct netdev_flow_key *target) |
9233 | 0 | { |
9234 | 0 | const uint64_t *keyp = miniflow_get_values(&rule->flow.mf); |
9235 | 0 | const uint64_t *maskp = miniflow_get_values(&rule->mask->mf); |
9236 | 0 | uint64_t value; |
9237 | |
|
9238 | 0 | NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) { |
9239 | 0 | if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) { |
9240 | 0 | return false; |
9241 | 0 | } |
9242 | 0 | } |
9243 | 0 | return true; |
9244 | 0 | } |
9245 | | |
9246 | | /* For each miniflow in 'keys' performs a classifier lookup writing the result |
9247 | | * into the corresponding slot in 'rules'. If a particular entry in 'keys' is |
9248 | | * NULL it is skipped. |
9249 | | * |
9250 | | * This function is optimized for use in the userspace datapath and therefore |
9251 | | * does not implement a lot of features available in the standard |
9252 | | * classifier_lookup() function. Specifically, it does not implement |
9253 | | * priorities, instead returning any rule which matches the flow. |
9254 | | * |
9255 | | * Returns true if all miniflows found a corresponding rule. */ |
9256 | | bool |
9257 | | dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[], |
9258 | | struct dpcls_rule **rules, const size_t cnt, |
9259 | | int *num_lookups_p) |
9260 | 0 | { |
9261 | | /* The received 'cnt' miniflows are the search-keys that will be processed |
9262 | | * to find a matching entry into the available subtables. |
9263 | | * The number of bits in map_type is equal to NETDEV_MAX_BURST. */ |
9264 | 0 | #define MAP_BITS (sizeof(uint32_t) * CHAR_BIT) |
9265 | 0 | BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST); |
9266 | |
|
9267 | 0 | struct dpcls_subtable *subtable; |
9268 | 0 | uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */ |
9269 | |
|
9270 | 0 | if (cnt != MAP_BITS) { |
9271 | 0 | keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */ |
9272 | 0 | } |
9273 | 0 | memset(rules, 0, cnt * sizeof *rules); |
9274 | |
|
9275 | 0 | int lookups_match = 0, subtable_pos = 1; |
9276 | 0 | uint32_t found_map; |
9277 | | |
9278 | | /* The Datapath classifier - aka dpcls - is composed of subtables. |
9279 | | * Subtables are dynamically created as needed when new rules are inserted. |
9280 | | * Each subtable collects rules with matches on a specific subset of packet |
9281 | | * fields as defined by the subtable's mask. We proceed to process every |
9282 | | * search-key against each subtable, but when a match is found for a |
9283 | | * search-key, the search for that key can stop because the rules are |
9284 | | * non-overlapping. */ |
9285 | 0 | PVECTOR_FOR_EACH (subtable, &cls->subtables) { |
9286 | | /* Call the subtable specific lookup function. */ |
9287 | 0 | found_map = subtable->lookup_func(subtable, keys_map, keys, rules); |
9288 | | |
9289 | | /* Count the number of subtables searched for this packet match. This |
9290 | | * estimates the "spread" of subtables looked at per matched packet. */ |
9291 | 0 | uint32_t pkts_matched = count_1bits(found_map); |
9292 | 0 | lookups_match += pkts_matched * subtable_pos; |
9293 | | |
9294 | | /* Clear the found rules, and return early if all packets are found. */ |
9295 | 0 | keys_map &= ~found_map; |
9296 | 0 | if (!keys_map) { |
9297 | 0 | if (num_lookups_p) { |
9298 | 0 | *num_lookups_p = lookups_match; |
9299 | 0 | } |
9300 | 0 | return true; |
9301 | 0 | } |
9302 | 0 | subtable_pos++; |
9303 | 0 | } |
9304 | | |
9305 | 0 | if (num_lookups_p) { |
9306 | 0 | *num_lookups_p = lookups_match; |
9307 | 0 | } |
9308 | | return false; |
9309 | 0 | } |