/src/openvswitch/lib/dpif-netdev.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc. |
3 | | * |
4 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | * you may not use this file except in compliance with the License. |
6 | | * You may obtain a copy of the License at: |
7 | | * |
8 | | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | | * |
10 | | * Unless required by applicable law or agreed to in writing, software |
11 | | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | * See the License for the specific language governing permissions and |
14 | | * limitations under the License. |
15 | | */ |
16 | | |
17 | | #include <config.h> |
18 | | #include "dpif-netdev.h" |
19 | | #include "dpif-netdev-private.h" |
20 | | #include "dpif-netdev-private-dfc.h" |
21 | | #include "dpif-offload.h" |
22 | | |
23 | | #include <ctype.h> |
24 | | #include <errno.h> |
25 | | #include <fcntl.h> |
26 | | #include <inttypes.h> |
27 | | #include <net/if.h> |
28 | | #include <sys/types.h> |
29 | | #include <netinet/in.h> |
30 | | #include <stdint.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | #include <sys/ioctl.h> |
34 | | #include <sys/socket.h> |
35 | | #include <sys/stat.h> |
36 | | #include <unistd.h> |
37 | | |
38 | | #include "bitmap.h" |
39 | | #include "ccmap.h" |
40 | | #include "cmap.h" |
41 | | #include "conntrack.h" |
42 | | #include "conntrack-tp.h" |
43 | | #include "coverage.h" |
44 | | #include "ct-dpif.h" |
45 | | #include "csum.h" |
46 | | #include "dp-packet.h" |
47 | | #include "dpif.h" |
48 | | #include "dpif-netdev-lookup.h" |
49 | | #include "dpif-netdev-perf.h" |
50 | | #include "dpif-netdev-private-extract.h" |
51 | | #include "dpif-provider.h" |
52 | | #include "dummy.h" |
53 | | #include "fat-rwlock.h" |
54 | | #include "flow.h" |
55 | | #include "hmapx.h" |
56 | | #include "id-fpool.h" |
57 | | #include "id-pool.h" |
58 | | #include "ipf.h" |
59 | | #include "mov-avg.h" |
60 | | #include "mpsc-queue.h" |
61 | | #include "netdev.h" |
62 | | #include "netdev-provider.h" |
63 | | #include "netdev-vport.h" |
64 | | #include "netlink.h" |
65 | | #include "odp-execute.h" |
66 | | #include "odp-util.h" |
67 | | #include "openvswitch/dynamic-string.h" |
68 | | #include "openvswitch/list.h" |
69 | | #include "openvswitch/match.h" |
70 | | #include "openvswitch/ofp-parse.h" |
71 | | #include "openvswitch/ofp-print.h" |
72 | | #include "openvswitch/ofpbuf.h" |
73 | | #include "openvswitch/shash.h" |
74 | | #include "openvswitch/vlog.h" |
75 | | #include "ovs-numa.h" |
76 | | #include "ovs-rcu.h" |
77 | | #include "packets.h" |
78 | | #include "openvswitch/poll-loop.h" |
79 | | #include "pvector.h" |
80 | | #include "random.h" |
81 | | #include "seq.h" |
82 | | #include "smap.h" |
83 | | #include "sset.h" |
84 | | #include "timeval.h" |
85 | | #include "tnl-neigh-cache.h" |
86 | | #include "tnl-ports.h" |
87 | | #include "unixctl.h" |
88 | | #include "util.h" |
89 | | #include "uuid.h" |
90 | | |
91 | | VLOG_DEFINE_THIS_MODULE(dpif_netdev); |
92 | | |
93 | | /* Auto Load Balancing Defaults */ |
94 | 0 | #define ALB_IMPROVEMENT_THRESHOLD 25 |
95 | 0 | #define ALB_LOAD_THRESHOLD 95 |
96 | 0 | #define ALB_REBALANCE_INTERVAL 1 /* 1 Min */ |
97 | 0 | #define MAX_ALB_REBALANCE_INTERVAL 20000 /* 20000 Min */ |
98 | 0 | #define MIN_TO_MSEC 60000 |
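
Taken together, these defaults suggest a simple mapping from the user-visible rebalance interval (in minutes) to the millisecond timer the datapath works with. A hedged sketch, assuming the clamping behaviour implied by MAX_ALB_REBALANCE_INTERVAL; the helper name is hypothetical and the real handling presumably lives in the configuration parsing further down in this file:

/* Hypothetical helper, shown only to illustrate the unit conversion. */
static uint64_t
alb_requested_interval_to_msec(uint64_t interval_min)
{
    if (interval_min > MAX_ALB_REBALANCE_INTERVAL) {
        interval_min = ALB_REBALANCE_INTERVAL;  /* Fall back to 1 minute. */
    }
    return interval_min * MIN_TO_MSEC;          /* Minutes -> milliseconds. */
}
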
99 | | |
100 | | #define FLOW_DUMP_MAX_BATCH 50 |
101 | | /* Use a per-thread recirc_depth to prevent recirculation loops. */ |
102 | 0 | #define MAX_RECIRC_DEPTH 8 |
103 | | DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0) |
104 | | |
105 | | /* Use instant packet send by default. */ |
106 | 0 | #define DEFAULT_TX_FLUSH_INTERVAL 0 |
107 | | |
108 | | /* Configuration parameters. */ |
109 | | enum { MAX_METERS = 1 << 18 }; /* Maximum number of meters. */ |
110 | | enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */ |
111 | | |
112 | | COVERAGE_DEFINE(datapath_drop_meter); |
113 | | COVERAGE_DEFINE(datapath_drop_upcall_error); |
114 | | COVERAGE_DEFINE(datapath_drop_lock_error); |
115 | | COVERAGE_DEFINE(datapath_drop_userspace_action_error); |
116 | | COVERAGE_DEFINE(datapath_drop_tunnel_push_error); |
117 | | COVERAGE_DEFINE(datapath_drop_tunnel_pop_error); |
118 | | COVERAGE_DEFINE(datapath_drop_recirc_error); |
119 | | COVERAGE_DEFINE(datapath_drop_invalid_port); |
120 | | COVERAGE_DEFINE(datapath_drop_invalid_bond); |
121 | | COVERAGE_DEFINE(datapath_drop_invalid_tnl_port); |
122 | | COVERAGE_DEFINE(datapath_drop_rx_invalid_packet); |
123 | | COVERAGE_DEFINE(datapath_drop_hw_post_process); |
124 | | COVERAGE_DEFINE(datapath_drop_hw_post_process_consumed); |
125 | | |
126 | | /* Protects against changes to 'dp_netdevs'. */ |
127 | | struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER; |
128 | | |
129 | | /* Contains all 'struct dp_netdev's. */ |
130 | | static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex) |
131 | | = SHASH_INITIALIZER(&dp_netdevs); |
132 | | |
133 | | static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600); |
134 | | |
135 | 0 | #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \ |
136 | 0 | | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \ |
137 | 0 | | CS_SRC_NAT | CS_DST_NAT) |
138 | 0 | #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK) |
139 | | |
140 | | static struct odp_support dp_netdev_support = { |
141 | | .max_vlan_headers = SIZE_MAX, |
142 | | .max_mpls_depth = SIZE_MAX, |
143 | | .recirc = true, |
144 | | .ct_state = true, |
145 | | .ct_zone = true, |
146 | | .ct_mark = true, |
147 | | .ct_label = true, |
148 | | .ct_state_nat = true, |
149 | | .ct_orig_tuple = true, |
150 | | .ct_orig_tuple6 = true, |
151 | | }; |
152 | | |
153 | | |
154 | | /* Simple non-wildcarding single-priority classifier. */ |
155 | | |
156 | | /* Time in microseconds between successive optimizations of the dpcls |
157 | | * subtable vector */ |
158 | 0 | #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL |
159 | | |
160 | | /* Time in microseconds of the interval over which the rxq processing |
161 | | * cycles used for rxq to pmd assignment are measured and stored. */ |
162 | 0 | #define PMD_INTERVAL_LEN 5000000LL |
163 | | /* For converting PMD_INTERVAL_LEN to secs. */ |
164 | 0 | #define INTERVAL_USEC_TO_SEC 1000000LL |
165 | | |
166 | | /* Number of intervals for which cycles are stored |
167 | | * and used during rxq to pmd assignment. */ |
168 | 0 | #define PMD_INTERVAL_MAX 12 |
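
These constants together fix how much rxq cycle history is kept: PMD_INTERVAL_MAX intervals of PMD_INTERVAL_LEN microseconds each, i.e. 12 x 5 s = 60 s. A minimal illustration (the function is hypothetical, not part of the instrumented file):

/* Length, in seconds, of the cycle history used for rxq-to-pmd assignment. */
static long long
pmd_cycle_history_secs(void)
{
    return PMD_INTERVAL_MAX * (PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
}
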
169 | | |
170 | | /* Time in microseconds to try RCU quiescing. */ |
171 | 0 | #define PMD_RCU_QUIESCE_INTERVAL 10000LL |
172 | | |
173 | | /* Timer resolution for PMD threads in nanoseconds. */ |
174 | 0 | #define PMD_TIMER_RES_NS 1000 |
175 | | |
176 | | /* Number of pkts Rx on an interface that will stop pmd thread sleeping. */ |
177 | 0 | #define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2) |
178 | | /* Time in microseconds by which a pmd thread sleep time is incremented. */ |
179 | 0 | #define PMD_SLEEP_INC_US 1 |
180 | | |
181 | | struct pmd_sleep { |
182 | | unsigned core_id; |
183 | | uint64_t max_sleep; |
184 | | }; |
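
The intended use of PMD_SLEEP_THRESH and PMD_SLEEP_INC_US is roughly: while a poll returns fewer than PMD_SLEEP_THRESH packets, the thread's requested sleep grows by PMD_SLEEP_INC_US per iteration up to its configured maximum, and a busier poll resets it. A hedged sketch only; the real logic sits in the pmd main loop, outside the lines shown here:

/* Illustrative only: next sleep request based on the last poll's result. */
static uint64_t
next_sleep_request(uint64_t cur_sleep_us, uint64_t max_sleep_us, int n_rx)
{
    if (n_rx >= PMD_SLEEP_THRESH) {
        return 0;                               /* Busy: stop sleeping. */
    }
    return MIN(cur_sleep_us + PMD_SLEEP_INC_US, max_sleep_us);
}
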
185 | | |
186 | | struct dpcls { |
187 | | struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */ |
188 | | odp_port_t in_port; |
189 | | struct cmap subtables_map; |
190 | | struct pvector subtables; |
191 | | }; |
192 | | |
193 | | /* Data structure to keep packet order till fastpath processing. */ |
194 | | struct dp_packet_flow_map { |
195 | | struct dp_packet *packet; |
196 | | struct dp_netdev_flow *flow; |
197 | | uint16_t tcp_flags; |
198 | | }; |
199 | | |
200 | | static void dpcls_init(struct dpcls *); |
201 | | static void dpcls_destroy(struct dpcls *); |
202 | | static void dpcls_sort_subtable_vector(struct dpcls *); |
203 | | static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls); |
204 | | static void dpcls_insert(struct dpcls *, struct dpcls_rule *, |
205 | | const struct netdev_flow_key *mask); |
206 | | static void dpcls_remove(struct dpcls *, struct dpcls_rule *); |
207 | | |
208 | | /* Set of supported meter flags */ |
209 | | #define DP_SUPPORTED_METER_FLAGS_MASK \ |
210 | 0 | (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST) |
211 | | |
212 | | /* Set of supported meter band types */ |
213 | | #define DP_SUPPORTED_METER_BAND_TYPES \ |
214 | 0 | ( 1 << OFPMBT13_DROP ) |
215 | | |
216 | | struct dp_meter_band { |
217 | | uint32_t rate; |
218 | | uint32_t burst_size; |
219 | | atomic_uint64_t bucket; /* In 1/1000 packets for PKTPS, |
220 | | * or in bits for KBPS. */ |
221 | | atomic_uint64_t packet_count; |
222 | | atomic_uint64_t byte_count; |
223 | | }; |
224 | | |
225 | | struct dp_meter { |
226 | | struct cmap_node node; |
227 | | uint32_t id; |
228 | | uint16_t flags; |
229 | | uint16_t n_bands; |
230 | | uint32_t max_delta_t; |
231 | | atomic_uint64_t used; /* Time of a last use in milliseconds. */ |
232 | | atomic_uint64_t packet_count; |
233 | | atomic_uint64_t byte_count; |
234 | | struct dp_meter_band bands[]; |
235 | | }; |
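
To make the 'bucket' units concrete: for KBPS bands the bucket holds bits and 'rate' is in kbit/s, for PKTPS it holds 1/1000 packets and 'rate' is in packets/s, with elapsed time measured in milliseconds. The following is a rough token-bucket sketch under those conventions, not the datapath's actual refill/drop code:

/* Approximate refill-and-charge step for one band and one packet. */
static bool
band_admits_packet(struct dp_meter_band *band, uint16_t meter_flags,
                   long long delta_t_ms, uint32_t packet_bits)
{
    uint64_t max_bucket = (uint64_t) band->burst_size * 1000;
    uint64_t cost = (meter_flags & OFPMF13_KBPS) ? packet_bits
                                                 : 1000; /* One packet. */
    uint64_t bucket;

    atomic_read_relaxed(&band->bucket, &bucket);
    bucket = MIN(bucket + (uint64_t) band->rate * delta_t_ms, max_bucket);
    if (bucket < cost) {
        return false;                   /* Band triggers: drop the packet. */
    }
    atomic_store_relaxed(&band->bucket, bucket - cost);
    return true;
}
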
236 | | |
237 | | struct pmd_auto_lb { |
238 | | bool do_dry_run; |
239 | | bool recheck_config; |
240 | | bool is_enabled; /* Current status of Auto load balancing. */ |
241 | | uint64_t rebalance_intvl; |
242 | | uint64_t rebalance_poll_timer; |
243 | | uint8_t rebalance_improve_thresh; |
244 | | atomic_uint8_t rebalance_load_thresh; |
245 | | }; |
246 | | |
247 | | enum sched_assignment_type { |
248 | | SCHED_ROUNDROBIN, |
249 | | SCHED_CYCLES, /* Default. */ |
250 | | SCHED_GROUP |
251 | | }; |
252 | | |
253 | | /* Datapath based on the network device interface from netdev.h. |
254 | | * |
255 | | * |
256 | | * Thread-safety |
257 | | * ============= |
258 | | * |
259 | | * Some members, marked 'const', are immutable. Accessing other members |
260 | | * requires synchronization, as noted in more detail below. |
261 | | * |
262 | | * Acquisition order is, from outermost to innermost: |
263 | | * |
264 | | * dp_netdev_mutex (global) |
265 | | * port_rwlock |
266 | | * bond_mutex |
267 | | * non_pmd_mutex |
268 | | */ |
269 | | struct dp_netdev { |
270 | | const struct dpif_class *const class; |
271 | | const char *const name; |
272 | | const char *const full_name; |
273 | | struct ovs_refcount ref_cnt; |
274 | | atomic_flag destroyed; |
275 | | |
276 | | /* Ports. |
277 | | * |
278 | | * Any lookup into 'ports' or any access to the dp_netdev_ports found |
279 | | * through 'ports' requires taking 'port_rwlock'. */ |
280 | | struct ovs_rwlock port_rwlock; |
281 | | struct hmap ports; |
282 | | struct seq *port_seq; /* Incremented whenever a port changes. */ |
283 | | |
284 | | /* The time that a packet can wait in output batch for sending. */ |
285 | | atomic_uint32_t tx_flush_interval; |
286 | | |
287 | | /* Meters. */ |
288 | | struct ovs_mutex meters_lock; |
289 | | struct cmap meters; |
290 | | |
291 | | /* Probability of EMC insertions is a factor of 'emc_insert_min'. */ |
292 | | atomic_uint32_t emc_insert_min; |
293 | | /* Enable collection of PMD performance metrics. */ |
294 | | atomic_bool pmd_perf_metrics; |
295 | | /* Default max load based sleep request. */ |
296 | | uint64_t pmd_max_sleep_default; |
297 | | /* Enable the SMC cache from ovsdb config */ |
298 | | atomic_bool smc_enable_db; |
299 | | |
300 | | /* Protects access to ofproto-dpif-upcall interface during revalidator |
301 | | * thread synchronization. */ |
302 | | struct fat_rwlock upcall_rwlock; |
303 | | upcall_callback *upcall_cb; /* Callback function for executing upcalls. */ |
304 | | void *upcall_aux; |
305 | | |
306 | | /* Callback function for notifying the purging of dp flows (during |
307 | | * pmd reset or deletion). */ |
308 | | dp_purge_callback *dp_purge_cb; |
309 | | void *dp_purge_aux; |
310 | | |
311 | | /* Stores all 'struct dp_netdev_pmd_thread's. */ |
312 | | struct cmap poll_threads; |
313 | | /* id pool for per thread static_tx_qid. */ |
314 | | struct id_pool *tx_qid_pool; |
315 | | struct ovs_mutex tx_qid_pool_mutex; |
316 | | /* Rxq to pmd assignment type. */ |
317 | | enum sched_assignment_type pmd_rxq_assign_type; |
318 | | bool pmd_iso; |
319 | | |
320 | | /* Protects the access of the 'struct dp_netdev_pmd_thread' |
321 | | * instance for non-pmd thread. */ |
322 | | struct ovs_mutex non_pmd_mutex; |
323 | | |
324 | | /* Each pmd thread will store its pointer to |
325 | | * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */ |
326 | | ovsthread_key_t per_pmd_key; |
327 | | |
328 | | struct seq *reconfigure_seq; |
329 | | uint64_t last_reconfigure_seq; |
330 | | |
331 | | /* Cpu mask for pin of pmd threads. */ |
332 | | char *pmd_cmask; |
333 | | |
334 | | /* PMD max load based sleep request user string. */ |
335 | | char *max_sleep_list; |
336 | | |
337 | | uint64_t last_tnl_conf_seq; |
338 | | |
339 | | struct conntrack *conntrack; |
340 | | struct pmd_auto_lb pmd_alb; |
341 | | |
342 | | /* Bonds. */ |
343 | | struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */ |
344 | | struct cmap tx_bonds; /* Contains 'struct tx_bond'. */ |
345 | | }; |
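
The unixctl handlers later in this excerpt follow the acquisition order documented above; a minimal standalone sketch of the same pattern (the function itself is hypothetical):

static void
example_count_ports(const char *dp_name)
{
    struct dp_netdev *dp;

    ovs_mutex_lock(&dp_netdev_mutex);           /* Outermost lock first. */
    dp = shash_find_data(&dp_netdevs, dp_name);
    if (dp) {
        size_t n_ports;

        ovs_rwlock_rdlock(&dp->port_rwlock);    /* Then the port rwlock. */
        n_ports = hmap_count(&dp->ports);       /* 'ports' needs the lock. */
        ovs_rwlock_unlock(&dp->port_rwlock);
        VLOG_INFO("%s has %"PRIuSIZE" ports", dp_name, n_ports);
    }
    ovs_mutex_unlock(&dp_netdev_mutex);
}
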
346 | | |
347 | | static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp, |
348 | | odp_port_t) |
349 | | OVS_REQ_RDLOCK(dp->port_rwlock); |
350 | | |
351 | | enum rxq_cycles_counter_type { |
352 | | RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and |
353 | | processing packets during the current |
354 | | interval. */ |
355 | | RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used |
356 | | during rxq to pmd assignment. */ |
357 | | RXQ_N_CYCLES |
358 | | }; |
359 | | |
360 | 0 | #define XPS_TIMEOUT 500000LL /* In microseconds. */ |
361 | | |
362 | | /* Contained by struct dp_netdev_port's 'rxqs' member. */ |
363 | | struct dp_netdev_rxq { |
364 | | struct dp_netdev_port *port; |
365 | | struct netdev_rxq *rx; |
366 | | unsigned core_id; /* Core to which this queue should be |
367 | | pinned. OVS_CORE_UNSPEC if the |
368 | | queue doesn't need to be pinned to a |
369 | | particular core. */ |
370 | | atomic_count intrvl_idx; /* Write index for 'cycles_intrvl'. */ |
371 | | struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */ |
372 | | bool is_vhost; /* Is rxq of a vhost port. */ |
373 | | |
374 | | /* Counters of cycles spent successfully polling and processing pkts. */ |
375 | | atomic_ullong cycles[RXQ_N_CYCLES]; |
376 | | /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then |
377 | | sum them to yield the cycles used for an rxq. */ |
378 | | atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX]; |
379 | | }; |
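
The per-interval counters above are consumed by summing them, as the declarations of dp_netdev_rxq_get_intrvl_cycles() and get_interval_values() further below suggest; a hypothetical helper showing the idea:

static uint64_t
example_rxq_assignment_cycles(struct dp_netdev_rxq *rxq)
{
    uint64_t total = 0;

    for (int i = 0; i < PMD_INTERVAL_MAX; i++) {
        unsigned long long interval_cycles;

        atomic_read_relaxed(&rxq->cycles_intrvl[i], &interval_cycles);
        total += interval_cycles;
    }
    return total;
}
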
380 | | |
381 | | enum txq_req_mode { |
382 | | TXQ_REQ_MODE_THREAD, |
383 | | TXQ_REQ_MODE_HASH, |
384 | | }; |
385 | | |
386 | | enum txq_mode { |
387 | | TXQ_MODE_STATIC, |
388 | | TXQ_MODE_XPS, |
389 | | TXQ_MODE_XPS_HASH, |
390 | | }; |
391 | | |
392 | | /* A port in a netdev-based datapath. */ |
393 | | struct dp_netdev_port { |
394 | | odp_port_t port_no; |
395 | | enum txq_mode txq_mode; /* static, XPS, XPS_HASH. */ |
396 | | bool need_reconfigure; /* True if we should reconfigure netdev. */ |
397 | | struct netdev *netdev; |
398 | | struct hmap_node node; /* Node in dp_netdev's 'ports'. */ |
399 | | struct netdev_saved_flags *sf; |
400 | | struct dp_netdev_rxq *rxqs; |
401 | | unsigned n_rxq; /* Number of elements in 'rxqs' */ |
402 | | unsigned *txq_used; /* Number of threads that use each tx queue. */ |
403 | | struct ovs_mutex txq_used_mutex; |
404 | | bool emc_enabled; /* If true EMC will be used. */ |
405 | | char *type; /* Port type as requested by user. */ |
406 | | char *rxq_affinity_list; /* Requested affinity of rx queues. */ |
407 | | enum txq_req_mode txq_requested_mode; |
408 | | }; |
409 | | |
410 | | static bool dp_netdev_flow_ref(struct dp_netdev_flow *); |
411 | | static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t, |
412 | | struct flow *, bool); |
413 | | |
414 | | struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *, |
415 | | size_t); |
416 | | struct dp_netdev_actions *dp_netdev_flow_get_actions( |
417 | | const struct dp_netdev_flow *); |
418 | | static void dp_netdev_actions_free(struct dp_netdev_actions *); |
419 | | |
420 | | struct polled_queue { |
421 | | struct dp_netdev_rxq *rxq; |
422 | | odp_port_t port_no; |
423 | | bool emc_enabled; |
424 | | bool rxq_enabled; |
425 | | uint64_t change_seq; |
426 | | }; |
427 | | |
428 | | /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */ |
429 | | struct rxq_poll { |
430 | | struct dp_netdev_rxq *rxq; |
431 | | struct hmap_node node; |
432 | | }; |
433 | | |
434 | | /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache', |
435 | | * 'tnl_port_cache' or 'tx_ports'. */ |
436 | | struct tx_port { |
437 | | struct dp_netdev_port *port; |
438 | | int qid; |
439 | | long long last_used; |
440 | | struct hmap_node node; |
441 | | long long flush_time; |
442 | | struct dp_packet_batch output_pkts; |
443 | | struct dp_packet_batch *txq_pkts; /* Only for hash mode. */ |
444 | | struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST]; |
445 | | }; |
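
For context on 'last_used': with dynamic tx queue assignment (XPS), a queue id that a thread has not used for XPS_TIMEOUT microseconds is expected to be handed back, which is what the dpif_netdev_xps_*() functions declared further below revalidate. A hedged check, for illustration only:

static bool
example_tx_qid_expired(const struct tx_port *tx, long long now_us)
{
    return tx->port->txq_mode == TXQ_MODE_XPS
           && now_us - tx->last_used > XPS_TIMEOUT;
}
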
446 | | |
447 | | /* Contained by struct tx_bond 'member_buckets'. */ |
448 | | struct member_entry { |
449 | | odp_port_t member_id; |
450 | | atomic_ullong n_packets; |
451 | | atomic_ullong n_bytes; |
452 | | }; |
453 | | |
454 | | /* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */ |
455 | | struct tx_bond { |
456 | | struct cmap_node node; |
457 | | uint32_t bond_id; |
458 | | struct member_entry member_buckets[BOND_BUCKETS]; |
459 | | }; |
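
Roughly, transmit-side bond balancing hashes each packet into one of the BOND_BUCKETS member buckets and credits that member's counters. The sketch below uses a plain modulo for bucket selection and is not the datapath's actual member-selection code:

static odp_port_t
example_bond_member_for_hash(struct tx_bond *bond, uint32_t hash,
                             uint32_t n_bytes)
{
    struct member_entry *entry = &bond->member_buckets[hash % BOND_BUCKETS];

    non_atomic_ullong_add(&entry->n_packets, 1);
    non_atomic_ullong_add(&entry->n_bytes, n_bytes);
    return entry->member_id;
}
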
460 | | |
461 | | /* Interface to netdev-based datapath. */ |
462 | | struct dpif_netdev { |
463 | | struct dpif dpif; |
464 | | struct dp_netdev *dp; |
465 | | uint64_t last_port_seq; |
466 | | }; |
467 | | |
468 | | static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no, |
469 | | struct dp_netdev_port **portp) |
470 | | OVS_REQ_RDLOCK(dp->port_rwlock); |
471 | | static int get_port_by_name(struct dp_netdev *dp, const char *devname, |
472 | | struct dp_netdev_port **portp) |
473 | | OVS_REQ_RDLOCK(dp->port_rwlock); |
474 | | static void dp_netdev_free(struct dp_netdev *) |
475 | | OVS_REQUIRES(dp_netdev_mutex); |
476 | | static int do_add_port(struct dp_netdev *dp, const char *devname, |
477 | | const char *type, odp_port_t port_no) |
478 | | OVS_REQ_WRLOCK(dp->port_rwlock); |
479 | | static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *) |
480 | | OVS_REQ_WRLOCK(dp->port_rwlock); |
481 | | static int dpif_netdev_open(const struct dpif_class *, const char *name, |
482 | | bool create, struct dpif **); |
483 | | static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd, |
484 | | struct dp_packet_batch *, |
485 | | bool should_steal, |
486 | | const struct flow *flow, |
487 | | const struct nlattr *actions, |
488 | | size_t actions_len); |
489 | | static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *, |
490 | | struct dp_packet_batch *); |
491 | | |
492 | | static void dp_netdev_disable_upcall(struct dp_netdev *); |
493 | | static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd); |
494 | | static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, |
495 | | struct dp_netdev *dp, unsigned core_id, |
496 | | int numa_id); |
497 | | static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd); |
498 | | static void dp_netdev_set_nonpmd(struct dp_netdev *dp) |
499 | | OVS_REQ_WRLOCK(dp->port_rwlock); |
500 | | |
501 | | static void *pmd_thread_main(void *); |
502 | | static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp, |
503 | | unsigned core_id); |
504 | | static struct dp_netdev_pmd_thread * |
505 | | dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos); |
506 | | static void dp_netdev_del_pmd(struct dp_netdev *dp, |
507 | | struct dp_netdev_pmd_thread *pmd); |
508 | | static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd); |
509 | | static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd); |
510 | | static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, |
511 | | struct dp_netdev_port *port) |
512 | | OVS_REQUIRES(pmd->port_mutex); |
513 | | static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, |
514 | | struct tx_port *tx) |
515 | | OVS_REQUIRES(pmd->port_mutex); |
516 | | static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd, |
517 | | struct dp_netdev_rxq *rxq) |
518 | | OVS_REQUIRES(pmd->port_mutex); |
519 | | static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd, |
520 | | struct rxq_poll *poll) |
521 | | OVS_REQUIRES(pmd->port_mutex); |
522 | | static int |
523 | | dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd, |
524 | | bool force); |
525 | | static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, |
526 | | struct tx_bond *bond, bool update) |
527 | | OVS_EXCLUDED(pmd->bond_mutex); |
528 | | static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, |
529 | | uint32_t bond_id) |
530 | | OVS_EXCLUDED(pmd->bond_mutex); |
531 | | |
532 | | static void reconfigure_datapath(struct dp_netdev *dp) |
533 | | OVS_REQ_RDLOCK(dp->port_rwlock); |
534 | | static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd); |
535 | | static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd); |
536 | | static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd); |
537 | | static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd) |
538 | | OVS_REQUIRES(pmd->port_mutex); |
539 | | static inline void |
540 | | dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, |
541 | | struct polled_queue *poll_list, int poll_cnt); |
542 | | static void |
543 | | dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx, |
544 | | enum rxq_cycles_counter_type type, |
545 | | unsigned long long cycles); |
546 | | static uint64_t |
547 | | dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx, |
548 | | enum rxq_cycles_counter_type type); |
549 | | static void |
550 | | dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx, |
551 | | unsigned long long cycles); |
552 | | static uint64_t |
553 | | dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx); |
554 | | static uint64_t |
555 | | get_interval_values(atomic_ullong *source, atomic_count *cur_idx, |
556 | | int num_to_read); |
557 | | static void |
558 | | dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd, |
559 | | bool purge); |
560 | | static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd, |
561 | | struct tx_port *tx); |
562 | | inline struct dpcls * |
563 | | dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd, |
564 | | odp_port_t in_port); |
565 | | |
566 | | static void dp_netdev_request_reconfigure(struct dp_netdev *dp); |
567 | | static inline bool |
568 | | pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd); |
569 | | |
570 | | static void dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd, |
571 | | struct dp_netdev_flow *flow) |
572 | | OVS_REQUIRES(pmd->flow_mutex); |
573 | | static void dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd, |
574 | | struct dp_netdev_flow *flow) |
575 | | OVS_REQUIRES(pmd->flow_mutex); |
576 | | |
577 | | static bool dp_netdev_flow_is_simple_match(const struct match *); |
578 | | |
579 | | /* Updates the time in PMD threads context and should be called in three cases: |
580 | | * |
581 | | * 1. PMD structure initialization: |
582 | | * - dp_netdev_configure_pmd() |
583 | | * |
584 | | * 2. Before processing of the new packet batch: |
585 | | * - dpif_netdev_execute() |
586 | | * - dp_netdev_process_rxq_port() |
587 | | * |
588 | | * 3. At least once per polling iteration in main polling threads if no |
589 | | * packets received on current iteration: |
590 | | * - dpif_netdev_run() |
591 | | * - pmd_thread_main() |
592 | | * |
593 | | * 'pmd->ctx.now' should be used without update in all other cases if possible. |
594 | | */ |
595 | | static inline void |
596 | | pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd) |
597 | 0 | { |
598 | 0 | pmd->ctx.now = time_usec(); |
599 | 0 | } |
600 | | |
601 | | /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */ |
602 | | bool |
603 | | dpif_is_netdev(const struct dpif *dpif) |
604 | 0 | { |
605 | 0 | return dpif->dpif_class->open == dpif_netdev_open; |
606 | 0 | } |
607 | | |
608 | | static struct dpif_netdev * |
609 | | dpif_netdev_cast(const struct dpif *dpif) |
610 | 0 | { |
611 | 0 | ovs_assert(dpif_is_netdev(dpif)); |
612 | 0 | return CONTAINER_OF(dpif, struct dpif_netdev, dpif); |
613 | 0 | } |
614 | | |
615 | | static struct dp_netdev * |
616 | | get_dp_netdev(const struct dpif *dpif) |
617 | 0 | { |
618 | 0 | return dpif_netdev_cast(dpif)->dp; |
619 | 0 | } |
620 | | |
621 | | enum pmd_info_type { |
622 | | PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */ |
623 | | PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */ |
624 | | PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */ |
625 | | PMD_INFO_PERF_SHOW, /* Show pmd performance details. */ |
626 | | PMD_INFO_SLEEP_SHOW, /* Show max sleep configuration details. */ |
627 | | }; |
628 | | |
629 | | static void |
630 | | format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd) |
631 | 0 | { |
632 | 0 | ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID) |
633 | 0 | ? "main thread" : "pmd thread"); |
634 | 0 | if (pmd->numa_id != OVS_NUMA_UNSPEC) { |
635 | 0 | ds_put_format(reply, " numa_id %d", pmd->numa_id); |
636 | 0 | } |
637 | 0 | if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) { |
638 | 0 | ds_put_format(reply, " core_id %u", pmd->core_id); |
639 | 0 | } |
640 | 0 | ds_put_cstr(reply, ":\n"); |
641 | 0 | } |
642 | | |
643 | | static void |
644 | | pmd_info_show_stats(struct ds *reply, |
645 | | struct dp_netdev_pmd_thread *pmd) |
646 | 0 | { |
647 | 0 | uint64_t stats[PMD_N_STATS]; |
648 | 0 | uint64_t total_cycles, total_packets; |
649 | 0 | double passes_per_pkt = 0; |
650 | 0 | double lookups_per_hit = 0; |
651 | 0 | double packets_per_batch = 0; |
652 | |
|
653 | 0 | pmd_perf_read_counters(&pmd->perf_stats, stats); |
654 | 0 | total_cycles = stats[PMD_CYCLES_ITER_IDLE] |
655 | 0 | + stats[PMD_CYCLES_ITER_BUSY]; |
656 | 0 | total_packets = stats[PMD_STAT_RECV]; |
657 | |
|
658 | 0 | format_pmd_thread(reply, pmd); |
659 | |
|
660 | 0 | if (total_packets > 0) { |
661 | 0 | passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC]) |
662 | 0 | / (double) total_packets; |
663 | 0 | } |
664 | 0 | if (stats[PMD_STAT_MASKED_HIT] > 0) { |
665 | 0 | lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP] |
666 | 0 | / (double) stats[PMD_STAT_MASKED_HIT]; |
667 | 0 | } |
668 | 0 | if (stats[PMD_STAT_SENT_BATCHES] > 0) { |
669 | 0 | packets_per_batch = stats[PMD_STAT_SENT_PKTS] |
670 | 0 | / (double) stats[PMD_STAT_SENT_BATCHES]; |
671 | 0 | } |
672 | |
|
673 | 0 | ds_put_format(reply, |
674 | 0 | " packets received: %"PRIu64"\n" |
675 | 0 | " packet recirculations: %"PRIu64"\n" |
676 | 0 | " avg. datapath passes per packet: %.02f\n" |
677 | 0 | " phwol hits: %"PRIu64"\n" |
678 | 0 | " mfex opt hits: %"PRIu64"\n" |
679 | 0 | " simple match hits: %"PRIu64"\n" |
680 | 0 | " emc hits: %"PRIu64"\n" |
681 | 0 | " smc hits: %"PRIu64"\n" |
682 | 0 | " megaflow hits: %"PRIu64"\n" |
683 | 0 | " avg. subtable lookups per megaflow hit: %.02f\n" |
684 | 0 | " miss with success upcall: %"PRIu64"\n" |
685 | 0 | " miss with failed upcall: %"PRIu64"\n" |
686 | 0 | " avg. packets per output batch: %.02f\n", |
687 | 0 | total_packets, stats[PMD_STAT_RECIRC], |
688 | 0 | passes_per_pkt, stats[PMD_STAT_PHWOL_HIT], |
689 | 0 | stats[PMD_STAT_MFEX_OPT_HIT], |
690 | 0 | stats[PMD_STAT_SIMPLE_HIT], |
691 | 0 | stats[PMD_STAT_EXACT_HIT], |
692 | 0 | stats[PMD_STAT_SMC_HIT], |
693 | 0 | stats[PMD_STAT_MASKED_HIT], |
694 | 0 | lookups_per_hit, stats[PMD_STAT_MISS], stats[PMD_STAT_LOST], |
695 | 0 | packets_per_batch); |
696 | |
|
697 | 0 | if (total_cycles == 0) { |
698 | 0 | return; |
699 | 0 | } |
700 | | |
701 | 0 | ds_put_format(reply, |
702 | 0 | " idle cycles: %"PRIu64" (%.02f%%)\n" |
703 | 0 | " processing cycles: %"PRIu64" (%.02f%%)\n", |
704 | 0 | stats[PMD_CYCLES_ITER_IDLE], |
705 | 0 | stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100, |
706 | 0 | stats[PMD_CYCLES_ITER_BUSY], |
707 | 0 | stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100); |
708 | |
|
709 | 0 | if (total_packets == 0) { |
710 | 0 | return; |
711 | 0 | } |
712 | | |
713 | 0 | ds_put_format(reply, |
714 | 0 | " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n", |
715 | 0 | total_cycles / (double) total_packets, |
716 | 0 | total_cycles, total_packets); |
717 | |
|
718 | 0 | ds_put_format(reply, |
719 | 0 | " avg processing cycles per packet: " |
720 | 0 | "%.02f (%"PRIu64"/%"PRIu64")\n", |
721 | 0 | stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets, |
722 | 0 | stats[PMD_CYCLES_ITER_BUSY], total_packets); |
723 | 0 | } |
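
A short worked example, with made-up counter values, may help when reading the averages printed above:

/* Hypothetical counters:
 *   packets received            = 1000
 *   packet recirculations       =  200
 *   megaflow lookups / hits     =  450 / 300
 *   sent packets / sent batches = 6400 / 400
 * give:
 *   avg. datapath passes per packet        = (1000 + 200) / 1000 = 1.20
 *   avg. subtable lookups per megaflow hit =  450 / 300          = 1.50
 *   avg. packets per output batch          = 6400 / 400          = 16.00 */
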
724 | | |
725 | | static void |
726 | | pmd_info_show_perf(struct ds *reply, |
727 | | struct dp_netdev_pmd_thread *pmd, |
728 | | struct pmd_perf_params *par) |
729 | 0 | { |
730 | 0 | if (pmd->core_id != NON_PMD_CORE_ID) { |
731 | 0 | char *time_str = |
732 | 0 | xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true); |
733 | 0 | long long now = time_msec(); |
734 | 0 | double duration = (now - pmd->perf_stats.start_ms) / 1000.0; |
735 | |
|
736 | 0 | ds_put_cstr(reply, "\n"); |
737 | 0 | ds_put_format(reply, "Time: %s\n", time_str); |
738 | 0 | ds_put_format(reply, "Measurement duration: %.3f s\n", duration); |
739 | 0 | ds_put_cstr(reply, "\n"); |
740 | 0 | format_pmd_thread(reply, pmd); |
741 | 0 | ds_put_cstr(reply, "\n"); |
742 | 0 | pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration); |
743 | 0 | if (pmd_perf_metrics_enabled(pmd)) { |
744 | | /* Prevent parallel clearing of perf metrics. */ |
745 | 0 | ovs_mutex_lock(&pmd->perf_stats.clear_mutex); |
746 | 0 | if (par->histograms) { |
747 | 0 | ds_put_cstr(reply, "\n"); |
748 | 0 | pmd_perf_format_histograms(reply, &pmd->perf_stats); |
749 | 0 | } |
750 | 0 | if (par->iter_hist_len > 0) { |
751 | 0 | ds_put_cstr(reply, "\n"); |
752 | 0 | pmd_perf_format_iteration_history(reply, &pmd->perf_stats, |
753 | 0 | par->iter_hist_len); |
754 | 0 | } |
755 | 0 | if (par->ms_hist_len > 0) { |
756 | 0 | ds_put_cstr(reply, "\n"); |
757 | 0 | pmd_perf_format_ms_history(reply, &pmd->perf_stats, |
758 | 0 | par->ms_hist_len); |
759 | 0 | } |
760 | 0 | ovs_mutex_unlock(&pmd->perf_stats.clear_mutex); |
761 | 0 | } |
762 | 0 | free(time_str); |
763 | 0 | } |
764 | 0 | } |
765 | | |
766 | | static int |
767 | | compare_poll_list(const void *a_, const void *b_) |
768 | 0 | { |
769 | 0 | const struct rxq_poll *a = a_; |
770 | 0 | const struct rxq_poll *b = b_; |
771 | |
|
772 | 0 | const char *namea = netdev_rxq_get_name(a->rxq->rx); |
773 | 0 | const char *nameb = netdev_rxq_get_name(b->rxq->rx); |
774 | |
|
775 | 0 | int cmp = strcmp(namea, nameb); |
776 | 0 | if (!cmp) { |
777 | 0 | return netdev_rxq_get_queue_id(a->rxq->rx) |
778 | 0 | - netdev_rxq_get_queue_id(b->rxq->rx); |
779 | 0 | } else { |
780 | 0 | return cmp; |
781 | 0 | } |
782 | 0 | } |
783 | | |
784 | | static void |
785 | | sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list, |
786 | | size_t *n) |
787 | | OVS_REQUIRES(pmd->port_mutex) |
788 | 0 | { |
789 | 0 | struct rxq_poll *ret, *poll; |
790 | 0 | size_t i; |
791 | |
|
792 | 0 | *n = hmap_count(&pmd->poll_list); |
793 | 0 | if (!*n) { |
794 | 0 | ret = NULL; |
795 | 0 | } else { |
796 | 0 | ret = xcalloc(*n, sizeof *ret); |
797 | 0 | i = 0; |
798 | 0 | HMAP_FOR_EACH (poll, node, &pmd->poll_list) { |
799 | 0 | ret[i] = *poll; |
800 | 0 | i++; |
801 | 0 | } |
802 | 0 | ovs_assert(i == *n); |
803 | 0 | qsort(ret, *n, sizeof *ret, compare_poll_list); |
804 | 0 | } |
805 | |
|
806 | 0 | *list = ret; |
807 | 0 | } |
808 | | |
809 | | static void |
810 | | pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd, |
811 | | int secs) |
812 | 0 | { |
813 | 0 | if (pmd->core_id != NON_PMD_CORE_ID) { |
814 | 0 | struct rxq_poll *list; |
815 | 0 | size_t n_rxq; |
816 | 0 | uint64_t total_pmd_cycles = 0; |
817 | 0 | uint64_t busy_pmd_cycles = 0; |
818 | 0 | uint64_t total_rxq_proc_cycles = 0; |
819 | 0 | unsigned int intervals; |
820 | |
|
821 | 0 | ds_put_format(reply, |
822 | 0 | "pmd thread numa_id %d core_id %u:\n isolated : %s\n", |
823 | 0 | pmd->numa_id, pmd->core_id, (pmd->isolated) |
824 | 0 | ? "true" : "false"); |
825 | |
|
826 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
827 | 0 | sorted_poll_list(pmd, &list, &n_rxq); |
828 | | |
829 | | /* Get the total pmd cycles for an interval. */ |
830 | 0 | atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles); |
831 | | /* Calculate how many intervals are to be used. */ |
832 | 0 | intervals = DIV_ROUND_UP(secs, |
833 | 0 | PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); |
834 | | /* Estimate the cycles to cover all intervals. */ |
835 | 0 | total_pmd_cycles *= intervals; |
836 | 0 | busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl, |
837 | 0 | &pmd->intrvl_idx, |
838 | 0 | intervals); |
839 | 0 | if (busy_pmd_cycles > total_pmd_cycles) { |
840 | 0 | busy_pmd_cycles = total_pmd_cycles; |
841 | 0 | } |
842 | |
|
843 | 0 | for (int i = 0; i < n_rxq; i++) { |
844 | 0 | struct dp_netdev_rxq *rxq = list[i].rxq; |
845 | 0 | const char *name = netdev_rxq_get_name(rxq->rx); |
846 | 0 | uint64_t rxq_proc_cycles = 0; |
847 | |
|
848 | 0 | rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl, |
849 | 0 | &rxq->intrvl_idx, |
850 | 0 | intervals); |
851 | 0 | total_rxq_proc_cycles += rxq_proc_cycles; |
852 | 0 | ds_put_format(reply, " port: %-16s queue-id: %2d", name, |
853 | 0 | netdev_rxq_get_queue_id(list[i].rxq->rx)); |
854 | 0 | ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx) |
855 | 0 | ? "(enabled) " : "(disabled)"); |
856 | 0 | ds_put_format(reply, " pmd usage: "); |
857 | 0 | if (total_pmd_cycles) { |
858 | 0 | ds_put_format(reply, "%2.0f %%", |
859 | 0 | (double) (rxq_proc_cycles * 100) / |
860 | 0 | total_pmd_cycles); |
861 | 0 | } else { |
862 | 0 | ds_put_format(reply, "%s", "NOT AVAIL"); |
863 | 0 | } |
864 | 0 | ds_put_cstr(reply, "\n"); |
865 | 0 | } |
866 | |
|
867 | 0 | if (n_rxq > 0) { |
868 | 0 | ds_put_cstr(reply, " overhead: "); |
869 | 0 | if (total_pmd_cycles) { |
870 | 0 | uint64_t overhead_cycles = 0; |
871 | |
|
872 | 0 | if (total_rxq_proc_cycles < busy_pmd_cycles) { |
873 | 0 | overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles; |
874 | 0 | } |
875 | |
|
876 | 0 | ds_put_format(reply, "%2.0f %%", |
877 | 0 | (double) (overhead_cycles * 100) / |
878 | 0 | total_pmd_cycles); |
879 | 0 | } else { |
880 | 0 | ds_put_cstr(reply, "NOT AVAIL"); |
881 | 0 | } |
882 | 0 | ds_put_cstr(reply, "\n"); |
883 | 0 | } |
884 | |
|
885 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
886 | 0 | free(list); |
887 | 0 | } |
888 | 0 | } |
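
The percentages above follow from simple interval arithmetic; a worked example with made-up numbers:

/* With '-secs 30': intervals = DIV_ROUND_UP(30, 5) = 6.  If one stored
 * interval of this pmd amounts to 10e9 cycles, the window estimate is
 * total_pmd_cycles = 6 * 10e9 = 60e9.  An rxq whose stored cycles over
 * those 6 intervals sum to 15e9 is reported as
 *   pmd usage = 15e9 * 100 / 60e9 = 25 %,
 * and 'overhead' is the part of the busy cycles no rxq accounts for. */
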
889 | | |
890 | | static int |
891 | | compare_poll_thread_list(const void *a_, const void *b_) |
892 | 0 | { |
893 | 0 | const struct dp_netdev_pmd_thread *a, *b; |
894 | |
|
895 | 0 | a = *(struct dp_netdev_pmd_thread **)a_; |
896 | 0 | b = *(struct dp_netdev_pmd_thread **)b_; |
897 | |
|
898 | 0 | if (a->core_id < b->core_id) { |
899 | 0 | return -1; |
900 | 0 | } |
901 | 0 | if (a->core_id > b->core_id) { |
902 | 0 | return 1; |
903 | 0 | } |
904 | 0 | return 0; |
905 | 0 | } |
906 | | |
907 | | /* Create a sorted list of pmds from the dp->poll_threads cmap. We can use |
908 | | * this list, as long as we do not go to quiescent state. */ |
909 | | static void |
910 | | sorted_poll_thread_list(struct dp_netdev *dp, |
911 | | struct dp_netdev_pmd_thread ***list, |
912 | | size_t *n) |
913 | 0 | { |
914 | 0 | struct dp_netdev_pmd_thread *pmd; |
915 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
916 | 0 | size_t k = 0, n_pmds; |
917 | |
|
918 | 0 | n_pmds = cmap_count(&dp->poll_threads); |
919 | 0 | pmd_list = xcalloc(n_pmds, sizeof *pmd_list); |
920 | |
|
921 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
922 | 0 | if (k >= n_pmds) { |
923 | 0 | break; |
924 | 0 | } |
925 | 0 | pmd_list[k++] = pmd; |
926 | 0 | } |
927 | |
|
928 | 0 | qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list); |
929 | |
|
930 | 0 | *list = pmd_list; |
931 | 0 | *n = k; |
932 | 0 | } |
933 | | |
934 | | static void |
935 | | dpif_netdev_subtable_lookup_get(struct unixctl_conn *conn, int argc OVS_UNUSED, |
936 | | const char *argv[] OVS_UNUSED, |
937 | | void *aux OVS_UNUSED) |
938 | 0 | { |
939 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
940 | |
|
941 | 0 | dpcls_impl_print_stats(&reply); |
942 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
943 | 0 | ds_destroy(&reply); |
944 | 0 | } |
945 | | |
946 | | static void |
947 | | dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc OVS_UNUSED, |
948 | | const char *argv[], void *aux OVS_UNUSED) |
949 | 0 | { |
950 | | /* This function requires 2 parameters (argv[1] and argv[2]) to execute. |
951 | | * argv[1] is the lookup implementation (function) name |
952 | | * argv[2] is the priority |
953 | | */ |
954 | 0 | const char *func_name = argv[1]; |
955 | |
|
956 | 0 | errno = 0; |
957 | 0 | char *err_char; |
958 | 0 | uint32_t new_prio = strtoul(argv[2], &err_char, 10); |
959 | 0 | uint32_t lookup_dpcls_changed = 0; |
960 | 0 | uint32_t lookup_subtable_changed = 0; |
961 | 0 | struct shash_node *node; |
962 | 0 | if (errno != 0 || new_prio > UINT8_MAX) { |
963 | 0 | unixctl_command_reply_error(conn, |
964 | 0 | "error converting priority, use integer in range 0-255\n"); |
965 | 0 | return; |
966 | 0 | } |
967 | | |
968 | 0 | int32_t err = dpcls_subtable_set_prio(func_name, new_prio); |
969 | 0 | if (err) { |
970 | 0 | unixctl_command_reply_error(conn, |
971 | 0 | "error, subtable lookup function not found\n"); |
972 | 0 | return; |
973 | 0 | } |
974 | | |
975 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
976 | 0 | SHASH_FOR_EACH (node, &dp_netdevs) { |
977 | 0 | struct dp_netdev *dp = node->data; |
978 | | |
979 | | /* Get PMD threads list, required to get DPCLS instances. */ |
980 | 0 | size_t n; |
981 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
982 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
983 | | |
984 | | /* Take the port rwlock, as the ports hmap is iterated over below. */ |
985 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
986 | |
|
987 | 0 | for (size_t i = 0; i < n; i++) { |
988 | 0 | struct dp_netdev_pmd_thread *pmd = pmd_list[i]; |
989 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
990 | 0 | continue; |
991 | 0 | } |
992 | | |
993 | 0 | struct dp_netdev_port *port = NULL; |
994 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
995 | 0 | odp_port_t in_port = port->port_no; |
996 | 0 | struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
997 | 0 | if (!cls) { |
998 | 0 | continue; |
999 | 0 | } |
1000 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
1001 | 0 | uint32_t subtbl_changes = dpcls_subtable_lookup_reprobe(cls); |
1002 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
1003 | 0 | if (subtbl_changes) { |
1004 | 0 | lookup_dpcls_changed++; |
1005 | 0 | lookup_subtable_changed += subtbl_changes; |
1006 | 0 | } |
1007 | 0 | } |
1008 | 0 | } |
1009 | | |
1010 | | /* Release the port rwlock before the dp_netdev mutex. */ |
1011 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1012 | 0 | free(pmd_list); |
1013 | 0 | } |
1014 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1015 | |
|
1016 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1017 | 0 | ds_put_format(&reply, |
1018 | 0 | "Lookup priority change affected %d dpcls ports and %d subtables.\n", |
1019 | 0 | lookup_dpcls_changed, lookup_subtable_changed); |
1020 | 0 | const char *reply_str = ds_cstr(&reply); |
1021 | 0 | unixctl_command_reply(conn, reply_str); |
1022 | 0 | VLOG_INFO("%s", reply_str); |
1023 | 0 | ds_destroy(&reply); |
1024 | 0 | } |
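
For reference, an assumed invocation of this handler; the unixctl registration is further down in dpif_netdev_init(), outside the lines shown here, so the command name is an assumption:

/* Assumed usage:
 *   ovs-appctl dpif-netdev/subtable-lookup-prio-set autovalidator 3
 * where argv[1] names the lookup implementation and argv[2] is the new
 * priority in the range 0-255. */
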
1025 | | |
1026 | | static void |
1027 | | dpif_netdev_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED, |
1028 | | const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED) |
1029 | 0 | { |
1030 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1031 | 0 | struct shash_node *node; |
1032 | |
|
1033 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1034 | 0 | SHASH_FOR_EACH (node, &dp_netdevs) { |
1035 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
1036 | 0 | struct dp_netdev *dp = node->data; |
1037 | 0 | size_t n; |
1038 | | |
1039 | | /* Get PMD threads list, required to get the DPIF impl used by each PMD |
1040 | | * thread. */ |
1041 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
1042 | 0 | dp_netdev_impl_get(&reply, pmd_list, n); |
1043 | 0 | free(pmd_list); |
1044 | 0 | } |
1045 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1046 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1047 | 0 | ds_destroy(&reply); |
1048 | 0 | } |
1049 | | |
1050 | | static void |
1051 | | dpif_netdev_impl_set(struct unixctl_conn *conn, int argc OVS_UNUSED, |
1052 | | const char *argv[], void *aux OVS_UNUSED) |
1053 | 0 | { |
1054 | | /* This function requires just one parameter, the DPIF name. */ |
1055 | 0 | const char *dpif_name = argv[1]; |
1056 | 0 | struct shash_node *node; |
1057 | |
|
1058 | 0 | static const char *error_description[2] = { |
1059 | 0 | "Unknown DPIF implementation", |
1060 | 0 | "CPU doesn't support the required instruction for", |
1061 | 0 | }; |
1062 | |
|
1063 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1064 | 0 | int32_t err = dp_netdev_impl_set_default_by_name(dpif_name); |
1065 | |
|
1066 | 0 | if (err) { |
1067 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1068 | 0 | ds_put_format(&reply, "DPIF implementation not available: %s %s.\n", |
1069 | 0 | error_description[ (err == -ENOTSUP) ], dpif_name); |
1070 | 0 | const char *reply_str = ds_cstr(&reply); |
1071 | 0 | unixctl_command_reply_error(conn, reply_str); |
1072 | 0 | VLOG_ERR("%s", reply_str); |
1073 | 0 | ds_destroy(&reply); |
1074 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1075 | 0 | return; |
1076 | 0 | } |
1077 | | |
1078 | 0 | SHASH_FOR_EACH (node, &dp_netdevs) { |
1079 | 0 | struct dp_netdev *dp = node->data; |
1080 | | |
1081 | | /* Get PMD threads list, required to get DPCLS instances. */ |
1082 | 0 | size_t n; |
1083 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
1084 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
1085 | |
|
1086 | 0 | for (size_t i = 0; i < n; i++) { |
1087 | 0 | struct dp_netdev_pmd_thread *pmd = pmd_list[i]; |
1088 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
1089 | 0 | continue; |
1090 | 0 | } |
1091 | | |
1092 | | /* Initialize DPIF function pointer to the newly configured |
1093 | | * default. */ |
1094 | 0 | atomic_store_relaxed(&pmd->netdev_input_func, |
1095 | 0 | dp_netdev_impl_get_default()); |
1096 | 0 | }; |
1097 | |
|
1098 | 0 | free(pmd_list); |
1099 | 0 | } |
1100 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1101 | | |
1102 | | /* Reply with success to command. */ |
1103 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1104 | 0 | ds_put_format(&reply, "DPIF implementation set to %s.\n", dpif_name); |
1105 | 0 | const char *reply_str = ds_cstr(&reply); |
1106 | 0 | unixctl_command_reply(conn, reply_str); |
1107 | 0 | VLOG_INFO("%s", reply_str); |
1108 | 0 | ds_destroy(&reply); |
1109 | 0 | } |
1110 | | |
1111 | | static void |
1112 | | dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED, |
1113 | | const char *argv[] OVS_UNUSED, |
1114 | | void *aux OVS_UNUSED) |
1115 | 0 | { |
1116 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1117 | 0 | struct shash_node *node; |
1118 | |
|
1119 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1120 | 0 | SHASH_FOR_EACH (node, &dp_netdevs) { |
1121 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
1122 | 0 | struct dp_netdev *dp = node->data; |
1123 | 0 | size_t n; |
1124 | | |
1125 | | /* Get PMD threads list, required to get the DPIF impl used by each PMD |
1126 | | * thread. */ |
1127 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
1128 | 0 | dp_mfex_impl_get(&reply, pmd_list, n); |
1129 | 0 | free(pmd_list); |
1130 | 0 | } |
1131 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1132 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1133 | 0 | ds_destroy(&reply); |
1134 | 0 | } |
1135 | | |
1136 | | static void |
1137 | | dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, int argc, |
1138 | | const char *argv[], void *aux OVS_UNUSED) |
1139 | 0 | { |
1140 | | /* This command takes some optional and mandatory arguments. The function |
1141 | | * here first parses all of the options, saving results in local variables. |
1142 | | * Then the parsed values are acted on. |
1143 | | */ |
1144 | 0 | unsigned int pmd_thread_to_change = NON_PMD_CORE_ID; |
1145 | 0 | unsigned int study_count = MFEX_MAX_PKT_COUNT; |
1146 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1147 | 0 | bool pmd_thread_update_done = false; |
1148 | 0 | bool mfex_name_is_study = false; |
1149 | 0 | const char *mfex_name = NULL; |
1150 | 0 | const char *reply_str = NULL; |
1151 | 0 | struct shash_node *node; |
1152 | 0 | int err; |
1153 | |
|
1154 | 0 | while (argc > 1) { |
1155 | | /* Optional argument "-pmd" limits the command's actions to just this |
1156 | | * PMD thread. |
1157 | | */ |
1158 | 0 | if ((!strcmp(argv[1], "-pmd") && !mfex_name)) { |
1159 | 0 | if (argc < 3) { |
1160 | 0 | ds_put_format(&reply, |
1161 | 0 | "Error: -pmd option requires a thread id" |
1162 | 0 | " argument.\n"); |
1163 | 0 | goto error; |
1164 | 0 | } |
1165 | | |
1166 | | /* Ensure argument can be parsed to an integer. */ |
1167 | 0 | if (!str_to_uint(argv[2], 10, &pmd_thread_to_change) || |
1168 | 0 | (pmd_thread_to_change == NON_PMD_CORE_ID)) { |
1169 | 0 | ds_put_format(&reply, |
1170 | 0 | "Error: miniflow extract parser not changed," |
1171 | 0 | " PMD thread passed is not valid: '%s'." |
1172 | 0 | " Pass a valid pmd thread ID.\n", |
1173 | 0 | argv[2]); |
1174 | 0 | goto error; |
1175 | 0 | } |
1176 | | |
1177 | 0 | argc -= 2; |
1178 | 0 | argv += 2; |
1179 | |
|
1180 | 0 | } else if (!mfex_name) { |
1181 | | /* Name of MFEX impl requested by user. */ |
1182 | 0 | mfex_name = argv[1]; |
1183 | 0 | mfex_name_is_study = strcmp("study", mfex_name) == 0; |
1184 | 0 | argc -= 1; |
1185 | 0 | argv += 1; |
1186 | | |
1187 | | /* If name is study and more args exist, parse study_count value. */ |
1188 | 0 | } else if (mfex_name && mfex_name_is_study) { |
1189 | 0 | if (!str_to_uint(argv[1], 10, &study_count) || |
1190 | 0 | (study_count == 0)) { |
1191 | 0 | ds_put_format(&reply, |
1192 | 0 | "Error: invalid study_pkt_cnt value: %s.\n", |
1193 | 0 | argv[1]); |
1194 | 0 | goto error; |
1195 | 0 | } |
1196 | | |
1197 | 0 | argc -= 1; |
1198 | 0 | argv += 1; |
1199 | 0 | } else { |
1200 | 0 | ds_put_format(&reply, "Error: unknown argument %s.\n", argv[1]); |
1201 | 0 | goto error; |
1202 | 0 | } |
1203 | 0 | } |
1204 | | |
1205 | | /* Ensure user passed an MFEX name. */ |
1206 | 0 | if (!mfex_name) { |
1207 | 0 | ds_put_format(&reply, "Error: no miniflow extract name provided." |
1208 | 0 | " Output of miniflow-parser-get shows implementation" |
1209 | 0 | " list.\n"); |
1210 | 0 | goto error; |
1211 | 0 | } |
1212 | | |
1213 | | /* If the MFEX name is "study", set the study packet count. */ |
1214 | 0 | if (mfex_name_is_study) { |
1215 | 0 | err = mfex_set_study_pkt_cnt(study_count, mfex_name); |
1216 | 0 | if (err) { |
1217 | 0 | ds_put_format(&reply, "Error: failed to set study count %d for" |
1218 | 0 | " miniflow extract implementation %s.\n", |
1219 | 0 | study_count, mfex_name); |
1220 | 0 | goto error; |
1221 | 0 | } |
1222 | 0 | } |
1223 | | |
1224 | | /* Set the default MFEX impl only if the command was applied to all PMD |
1225 | | * threads. If a PMD thread was selected, do NOT update the default. |
1226 | | */ |
1227 | 0 | if (pmd_thread_to_change == NON_PMD_CORE_ID) { |
1228 | 0 | err = dp_mfex_impl_set_default_by_name(mfex_name); |
1229 | 0 | if (err == -ENODEV) { |
1230 | 0 | ds_put_format(&reply, |
1231 | 0 | "Error: miniflow extract not available due to CPU" |
1232 | 0 | " ISA requirements: %s", |
1233 | 0 | mfex_name); |
1234 | 0 | goto error; |
1235 | 0 | } else if (err) { |
1236 | 0 | ds_put_format(&reply, |
1237 | 0 | "Error: unknown miniflow extract implementation %s.", |
1238 | 0 | mfex_name); |
1239 | 0 | goto error; |
1240 | 0 | } |
1241 | 0 | } |
1242 | | |
1243 | | /* Get the desired MFEX function pointer and error check its usage. */ |
1244 | 0 | miniflow_extract_func mfex_func = NULL; |
1245 | 0 | err = dp_mfex_impl_get_by_name(mfex_name, &mfex_func); |
1246 | 0 | if (err) { |
1247 | 0 | if (err == -ENODEV) { |
1248 | 0 | ds_put_format(&reply, |
1249 | 0 | "Error: miniflow extract not available due to CPU" |
1250 | 0 | " ISA requirements: %s", mfex_name); |
1251 | 0 | } else { |
1252 | 0 | ds_put_format(&reply, |
1253 | 0 | "Error: unknown miniflow extract implementation %s.", |
1254 | 0 | mfex_name); |
1255 | 0 | } |
1256 | 0 | goto error; |
1257 | 0 | } |
1258 | | |
1259 | | /* Apply the MFEX pointer to each pmd thread in each netdev, filtering |
1260 | | * by the user's "-pmd" argument if required. |
1261 | | */ |
1262 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1263 | |
|
1264 | 0 | SHASH_FOR_EACH (node, &dp_netdevs) { |
1265 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
1266 | 0 | struct dp_netdev *dp = node->data; |
1267 | 0 | size_t n; |
1268 | |
|
1269 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
1270 | |
|
1271 | 0 | for (size_t i = 0; i < n; i++) { |
1272 | 0 | struct dp_netdev_pmd_thread *pmd = pmd_list[i]; |
1273 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
1274 | 0 | continue; |
1275 | 0 | } |
1276 | | |
1277 | | /* If -pmd specified, skip all other pmd threads. */ |
1278 | 0 | if ((pmd_thread_to_change != NON_PMD_CORE_ID) && |
1279 | 0 | (pmd->core_id != pmd_thread_to_change)) { |
1280 | 0 | continue; |
1281 | 0 | } |
1282 | | |
1283 | 0 | pmd_thread_update_done = true; |
1284 | 0 | atomic_store_relaxed(&pmd->miniflow_extract_opt, mfex_func); |
1285 | 0 | }; |
1286 | |
|
1287 | 0 | free(pmd_list); |
1288 | 0 | } |
1289 | |
|
1290 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1291 | | |
1292 | | /* If PMD thread was specified, but it wasn't found, return error. */ |
1293 | 0 | if (pmd_thread_to_change != NON_PMD_CORE_ID && !pmd_thread_update_done) { |
1294 | 0 | ds_put_format(&reply, |
1295 | 0 | "Error: miniflow extract parser not changed, " |
1296 | 0 | "PMD thread %d not in use, pass a valid pmd" |
1297 | 0 | " thread ID.\n", pmd_thread_to_change); |
1298 | 0 | goto error; |
1299 | 0 | } |
1300 | | |
1301 | | /* Reply with success to command. */ |
1302 | 0 | ds_put_format(&reply, "Miniflow extract implementation set to %s", |
1303 | 0 | mfex_name); |
1304 | 0 | if (pmd_thread_to_change != NON_PMD_CORE_ID) { |
1305 | 0 | ds_put_format(&reply, ", on pmd thread %d", pmd_thread_to_change); |
1306 | 0 | } |
1307 | 0 | if (mfex_name_is_study) { |
1308 | 0 | ds_put_format(&reply, ", studying %d packets", study_count); |
1309 | 0 | } |
1310 | 0 | ds_put_format(&reply, ".\n"); |
1311 | |
|
1312 | 0 | reply_str = ds_cstr(&reply); |
1313 | 0 | VLOG_INFO("%s", reply_str); |
1314 | 0 | unixctl_command_reply(conn, reply_str); |
1315 | 0 | ds_destroy(&reply); |
1316 | 0 | return; |
1317 | | |
1318 | 0 | error: |
1319 | 0 | reply_str = ds_cstr(&reply); |
1320 | 0 | VLOG_ERR("%s", reply_str); |
1321 | 0 | unixctl_command_reply_error(conn, reply_str); |
1322 | 0 | ds_destroy(&reply); |
1323 | 0 | } |
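
Similarly, an assumed invocation of the handler above, matching the argument grammar it parses (optional '-pmd <core>', the implementation name, and an optional packet count when the name is "study"); the command name itself is not shown in this excerpt, so treat it as an assumption:

/* Assumed usage:
 *   ovs-appctl dpif-netdev/miniflow-parser-set -pmd 3 study 512
 * i.e. limit the change to the pmd on core 3, select the "study"
 * implementation, and have it study 512 packets before choosing. */
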
1324 | | |
1325 | | static void |
1326 | | dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc, |
1327 | | const char *argv[], void *aux OVS_UNUSED) |
1328 | 0 | { |
1329 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1330 | 0 | struct dp_netdev *dp = NULL; |
1331 | |
|
1332 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1333 | |
|
1334 | 0 | if (argc == 2) { |
1335 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
1336 | 0 | } else if (shash_count(&dp_netdevs) == 1) { |
1337 | | /* There's only one datapath */ |
1338 | 0 | dp = shash_first(&dp_netdevs)->data; |
1339 | 0 | } |
1340 | |
|
1341 | 0 | if (!dp) { |
1342 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1343 | 0 | unixctl_command_reply_error(conn, |
1344 | 0 | "please specify an existing datapath"); |
1345 | 0 | return; |
1346 | 0 | } |
1347 | | |
1348 | 0 | dp_netdev_request_reconfigure(dp); |
1349 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1350 | 0 | ds_put_cstr(&reply, "pmd rxq rebalance requested.\n"); |
1351 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1352 | 0 | ds_destroy(&reply); |
1353 | 0 | } |
1354 | | |
1355 | | static void |
1356 | | pmd_info_show_sleep(struct ds *reply, unsigned core_id, int numa_id, |
1357 | | uint64_t pmd_max_sleep) |
1358 | 0 | { |
1359 | 0 | if (core_id == NON_PMD_CORE_ID) { |
1360 | 0 | return; |
1361 | 0 | } |
1362 | 0 | ds_put_format(reply, |
1363 | 0 | "pmd thread numa_id %d core_id %d:\n" |
1364 | 0 | " max sleep: %4"PRIu64" us\n", |
1365 | 0 | numa_id, core_id, pmd_max_sleep); |
1366 | 0 | } |
1367 | | |
1368 | | static void |
1369 | | dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], |
1370 | | void *aux) |
1371 | 0 | { |
1372 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1373 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
1374 | 0 | struct dp_netdev *dp = NULL; |
1375 | 0 | enum pmd_info_type type = *(enum pmd_info_type *) aux; |
1376 | 0 | unsigned int core_id; |
1377 | 0 | bool filter_on_pmd = false; |
1378 | 0 | size_t n; |
1379 | 0 | unsigned int secs = 0; |
1380 | 0 | unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX) |
1381 | 0 | / INTERVAL_USEC_TO_SEC; |
1382 | 0 | bool show_header = true; |
1383 | 0 | uint64_t max_sleep; |
1384 | |
|
1385 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1386 | |
|
1387 | 0 | while (argc > 1) { |
1388 | 0 | if (!strcmp(argv[1], "-pmd") && argc > 2) { |
1389 | 0 | if (str_to_uint(argv[2], 10, &core_id)) { |
1390 | 0 | filter_on_pmd = true; |
1391 | 0 | } |
1392 | 0 | argc -= 2; |
1393 | 0 | argv += 2; |
1394 | 0 | } else if (type == PMD_INFO_SHOW_RXQ && |
1395 | 0 | !strcmp(argv[1], "-secs") && |
1396 | 0 | argc > 2) { |
1397 | 0 | if (!str_to_uint(argv[2], 10, &secs)) { |
1398 | 0 | secs = max_secs; |
1399 | 0 | } |
1400 | 0 | argc -= 2; |
1401 | 0 | argv += 2; |
1402 | 0 | } else { |
1403 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
1404 | 0 | argc -= 1; |
1405 | 0 | argv += 1; |
1406 | 0 | } |
1407 | 0 | } |
1408 | |
|
1409 | 0 | if (!dp) { |
1410 | 0 | if (shash_count(&dp_netdevs) == 1) { |
1411 | | /* There's only one datapath */ |
1412 | 0 | dp = shash_first(&dp_netdevs)->data; |
1413 | 0 | } else { |
1414 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1415 | 0 | unixctl_command_reply_error(conn, |
1416 | 0 | "please specify an existing datapath"); |
1417 | 0 | return; |
1418 | 0 | } |
1419 | 0 | } |
1420 | | |
1421 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
1422 | 0 | for (size_t i = 0; i < n; i++) { |
1423 | 0 | struct dp_netdev_pmd_thread *pmd = pmd_list[i]; |
1424 | 0 | if (!pmd) { |
1425 | 0 | break; |
1426 | 0 | } |
1427 | 0 | if (filter_on_pmd && pmd->core_id != core_id) { |
1428 | 0 | continue; |
1429 | 0 | } |
1430 | 0 | if (type == PMD_INFO_SHOW_RXQ) { |
1431 | 0 | if (show_header) { |
1432 | 0 | if (!secs || secs > max_secs) { |
1433 | 0 | secs = max_secs; |
1434 | 0 | } else { |
1435 | 0 | secs = ROUND_UP(secs, |
1436 | 0 | PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); |
1437 | 0 | } |
1438 | 0 | ds_put_format(&reply, "Displaying last %u seconds " |
1439 | 0 | "pmd usage %%\n", secs); |
1440 | 0 | show_header = false; |
1441 | 0 | } |
1442 | 0 | pmd_info_show_rxq(&reply, pmd, secs); |
1443 | 0 | } else if (type == PMD_INFO_CLEAR_STATS) { |
1444 | 0 | pmd_perf_stats_clear(&pmd->perf_stats); |
1445 | 0 | } else if (type == PMD_INFO_SHOW_STATS) { |
1446 | 0 | pmd_info_show_stats(&reply, pmd); |
1447 | 0 | } else if (type == PMD_INFO_PERF_SHOW) { |
1448 | 0 | pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux); |
1449 | 0 | } else if (type == PMD_INFO_SLEEP_SHOW) { |
1450 | 0 | if (show_header) { |
1451 | 0 | ds_put_format(&reply, "Default max sleep: %4"PRIu64" us\n", |
1452 | 0 | dp->pmd_max_sleep_default); |
1453 | 0 | show_header = false; |
1454 | 0 | } |
1455 | 0 | atomic_read_relaxed(&pmd->max_sleep, &max_sleep); |
1456 | 0 | pmd_info_show_sleep(&reply, pmd->core_id, pmd->numa_id, |
1457 | 0 | max_sleep); |
1458 | 0 | } |
1459 | 0 | } |
1460 | 0 | free(pmd_list); |
1461 | |
|
1462 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1463 | |
|
1464 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1465 | 0 | ds_destroy(&reply); |
1466 | 0 | } |
1467 | | |
1468 | | static void |
1469 | | pmd_perf_show_cmd(struct unixctl_conn *conn, int argc, |
1470 | | const char *argv[], |
1471 | | void *aux OVS_UNUSED) |
1472 | 0 | { |
1473 | 0 | struct pmd_perf_params par; |
1474 | 0 | long int it_hist = 0, ms_hist = 0; |
1475 | 0 | par.histograms = true; |
1476 | |
1477 | 0 | while (argc > 1) { |
1478 | 0 | if (!strcmp(argv[1], "-nh")) { |
1479 | 0 | par.histograms = false; |
1480 | 0 | argc -= 1; |
1481 | 0 | argv += 1; |
1482 | 0 | } else if (!strcmp(argv[1], "-it") && argc > 2) { |
1483 | 0 | it_hist = strtol(argv[2], NULL, 10); |
1484 | 0 | if (it_hist < 0) { |
1485 | 0 | it_hist = 0; |
1486 | 0 | } else if (it_hist > HISTORY_LEN) { |
1487 | 0 | it_hist = HISTORY_LEN; |
1488 | 0 | } |
1489 | 0 | argc -= 2; |
1490 | 0 | argv += 2; |
1491 | 0 | } else if (!strcmp(argv[1], "-ms") && argc > 2) { |
1492 | 0 | ms_hist = strtol(argv[2], NULL, 10); |
1493 | 0 | if (ms_hist < 0) { |
1494 | 0 | ms_hist = 0; |
1495 | 0 | } else if (ms_hist > HISTORY_LEN) { |
1496 | 0 | ms_hist = HISTORY_LEN; |
1497 | 0 | } |
1498 | 0 | argc -= 2; |
1499 | 0 | argv += 2; |
1500 | 0 | } else { |
1501 | 0 | break; |
1502 | 0 | } |
1503 | 0 | } |
1504 | 0 | par.iter_hist_len = it_hist; |
1505 | 0 | par.ms_hist_len = ms_hist; |
1506 | 0 | par.command_type = PMD_INFO_PERF_SHOW; |
1507 | 0 | dpif_netdev_pmd_info(conn, argc, argv, &par); |
1508 | 0 | } |
1509 | | |
1510 | | static void |
1511 | | dpif_netdev_bond_show(struct unixctl_conn *conn, int argc, |
1512 | | const char *argv[], void *aux OVS_UNUSED) |
1513 | 0 | { |
1514 | 0 | struct ds reply = DS_EMPTY_INITIALIZER; |
1515 | 0 | struct dp_netdev *dp = NULL; |
1516 | |
1517 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1518 | 0 | if (argc == 2) { |
1519 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
1520 | 0 | } else if (shash_count(&dp_netdevs) == 1) { |
1521 | | /* There's only one datapath. */ |
1522 | 0 | dp = shash_first(&dp_netdevs)->data; |
1523 | 0 | } |
1524 | 0 | if (!dp) { |
1525 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1526 | 0 | unixctl_command_reply_error(conn, |
1527 | 0 | "please specify an existing datapath"); |
1528 | 0 | return; |
1529 | 0 | } |
1530 | | |
1531 | 0 | if (cmap_count(&dp->tx_bonds) > 0) { |
1532 | 0 | struct tx_bond *dp_bond_entry; |
1533 | |
1534 | 0 | ds_put_cstr(&reply, "Bonds:\n"); |
1535 | 0 | CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) { |
1536 | 0 | ds_put_format(&reply, " bond-id %"PRIu32":\n", |
1537 | 0 | dp_bond_entry->bond_id); |
1538 | 0 | for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) { |
1539 | 0 | uint32_t member_id = odp_to_u32( |
1540 | 0 | dp_bond_entry->member_buckets[bucket].member_id); |
1541 | 0 | ds_put_format(&reply, |
1542 | 0 | " bucket %d - member %"PRIu32"\n", |
1543 | 0 | bucket, member_id); |
1544 | 0 | } |
1545 | 0 | } |
1546 | 0 | } |
1547 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1548 | 0 | unixctl_command_reply(conn, ds_cstr(&reply)); |
1549 | 0 | ds_destroy(&reply); |
1550 | 0 | } |
1551 | | |
1552 | | |
1553 | | static int |
1554 | | dpif_netdev_init(void) |
1555 | 0 | { |
1556 | 0 | static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS, |
1557 | 0 | clear_aux = PMD_INFO_CLEAR_STATS, |
1558 | 0 | poll_aux = PMD_INFO_SHOW_RXQ, |
1559 | 0 | sleep_aux = PMD_INFO_SLEEP_SHOW; |
1560 | |
1561 | 0 | unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]", |
1562 | 0 | 0, 3, dpif_netdev_pmd_info, |
1563 | 0 | (void *)&show_aux); |
1564 | 0 | unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]", |
1565 | 0 | 0, 3, dpif_netdev_pmd_info, |
1566 | 0 | (void *)&clear_aux); |
1567 | 0 | unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] " |
1568 | 0 | "[-secs secs] [dp]", |
1569 | 0 | 0, 5, dpif_netdev_pmd_info, |
1570 | 0 | (void *)&poll_aux); |
1571 | 0 | unixctl_command_register("dpif-netdev/pmd-sleep-show", "[dp]", |
1572 | 0 | 0, 1, dpif_netdev_pmd_info, |
1573 | 0 | (void *)&sleep_aux); |
1574 | 0 | unixctl_command_register("dpif-netdev/pmd-perf-show", |
1575 | 0 | "[-nh] [-it iter-history-len]" |
1576 | 0 | " [-ms ms-history-len]" |
1577 | 0 | " [-pmd core] [dp]", |
1578 | 0 | 0, 8, pmd_perf_show_cmd, |
1579 | 0 | NULL); |
1580 | 0 | unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]", |
1581 | 0 | 0, 1, dpif_netdev_pmd_rebalance, |
1582 | 0 | NULL); |
1583 | 0 | unixctl_command_register("dpif-netdev/pmd-perf-log-set", |
1584 | 0 | "on|off [-b before] [-a after] [-e|-ne] " |
1585 | 0 | "[-us usec] [-q qlen]", |
1586 | 0 | 0, 10, pmd_perf_log_set_cmd, |
1587 | 0 | NULL); |
1588 | 0 | unixctl_command_register("dpif-netdev/bond-show", "[dp]", |
1589 | 0 | 0, 1, dpif_netdev_bond_show, |
1590 | 0 | NULL); |
1591 | 0 | unixctl_command_register("dpif-netdev/subtable-lookup-prio-set", |
1592 | 0 | "[lookup_func] [prio]", |
1593 | 0 | 2, 2, dpif_netdev_subtable_lookup_set, |
1594 | 0 | NULL); |
1595 | 0 | unixctl_command_register("dpif-netdev/subtable-lookup-info-get", "", |
1596 | 0 | 0, 0, dpif_netdev_subtable_lookup_get, |
1597 | 0 | NULL); |
1598 | 0 | unixctl_command_register("dpif-netdev/subtable-lookup-prio-get", NULL, |
1599 | 0 | 0, 0, dpif_netdev_subtable_lookup_get, |
1600 | 0 | NULL); |
1601 | 0 | unixctl_command_register("dpif-netdev/dpif-impl-set", |
1602 | 0 | "dpif_implementation_name", |
1603 | 0 | 1, 1, dpif_netdev_impl_set, |
1604 | 0 | NULL); |
1605 | 0 | unixctl_command_register("dpif-netdev/dpif-impl-get", "", |
1606 | 0 | 0, 0, dpif_netdev_impl_get, |
1607 | 0 | NULL); |
1608 | 0 | unixctl_command_register("dpif-netdev/miniflow-parser-set", |
1609 | 0 | "[-pmd core] miniflow_implementation_name" |
1610 | 0 | " [study_pkt_cnt]", |
1611 | 0 | 1, 5, dpif_miniflow_extract_impl_set, |
1612 | 0 | NULL); |
1613 | 0 | unixctl_command_register("dpif-netdev/miniflow-parser-get", "", |
1614 | 0 | 0, 0, dpif_miniflow_extract_impl_get, |
1615 | 0 | NULL); |
1616 | 0 | return 0; |
1617 | 0 | } |
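/* The commands registered above become reachable through ovs-appctl once
 * ovs-vswitchd is running.  A few illustrative invocations, assuming a single
 * userspace datapath so that the optional [dp] argument can be omitted
 * (option spellings are taken from the usage strings passed to
 * unixctl_command_register() above; output formats are not shown here):
 *
 *   ovs-appctl dpif-netdev/pmd-stats-show -pmd 3
 *   ovs-appctl dpif-netdev/pmd-rxq-show -secs 5
 *   ovs-appctl dpif-netdev/pmd-perf-show -nh -it 10
 *   ovs-appctl dpif-netdev/pmd-stats-clear
 */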
1618 | | |
1619 | | static int |
1620 | | dpif_netdev_enumerate(struct sset *all_dps, |
1621 | | const struct dpif_class *dpif_class) |
1622 | 0 | { |
1623 | 0 | struct shash_node *node; |
1624 | |
1625 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1626 | 0 | SHASH_FOR_EACH(node, &dp_netdevs) { |
1627 | 0 | struct dp_netdev *dp = node->data; |
1628 | 0 | if (dpif_class != dp->class) { |
1629 | | /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs. |
1630 | | * If the class doesn't match, skip this dpif. */ |
1631 | 0 | continue; |
1632 | 0 | } |
1633 | 0 | sset_add(all_dps, node->name); |
1634 | 0 | } |
1635 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1636 | |
1637 | 0 | return 0; |
1638 | 0 | } |
1639 | | |
1640 | | static bool |
1641 | | dpif_netdev_class_is_dummy(const struct dpif_class *class) |
1642 | 0 | { |
1643 | 0 | return class != &dpif_netdev_class; |
1644 | 0 | } |
1645 | | |
1646 | | static const char * |
1647 | | dpif_netdev_port_open_type(const struct dpif_class *class, const char *type) |
1648 | 0 | { |
1649 | 0 | return strcmp(type, "internal") ? type |
1650 | 0 | : dpif_netdev_class_is_dummy(class) ? "dummy-internal" |
1651 | 0 | : "tap"; |
1652 | 0 | } |
1653 | | |
1654 | | static struct dpif * |
1655 | | create_dpif_netdev(struct dp_netdev *dp) |
1656 | 0 | { |
1657 | 0 | uint16_t netflow_id = hash_string(dp->name, 0); |
1658 | 0 | struct dpif_netdev *dpif; |
1659 | |
1660 | 0 | ovs_refcount_ref(&dp->ref_cnt); |
1661 | |
1662 | 0 | dpif = xmalloc(sizeof *dpif); |
1663 | 0 | dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id); |
1664 | 0 | dpif->dp = dp; |
1665 | 0 | dpif->last_port_seq = seq_read(dp->port_seq); |
1666 | |
1667 | 0 | return &dpif->dpif; |
1668 | 0 | } |
1669 | | |
1670 | | /* Choose an unused, non-zero port number and return it on success. |
1671 | | * Return ODPP_NONE on failure. */ |
1672 | | static odp_port_t |
1673 | | choose_port(struct dp_netdev *dp, const char *name) |
1674 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
1675 | 0 | { |
1676 | 0 | uint32_t port_no; |
1677 | |
1678 | 0 | if (dp->class != &dpif_netdev_class) { |
1679 | 0 | const char *p; |
1680 | 0 | int start_no = 0; |
1681 | | |
1682 | | /* If the port name begins with "br", start the number search at |
1683 | | * 100 to make writing tests easier. */ |
1684 | 0 | if (!strncmp(name, "br", 2)) { |
1685 | 0 | start_no = 100; |
1686 | 0 | } |
1687 | | |
1688 | | /* If the port name contains a number, try to assign that port number. |
1689 | | * This can make writing unit tests easier because port numbers are |
1690 | | * predictable. */ |
1691 | 0 | for (p = name; *p != '\0'; p++) { |
1692 | 0 | if (isdigit((unsigned char) *p)) { |
1693 | 0 | port_no = start_no + strtol(p, NULL, 10); |
1694 | 0 | if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE) |
1695 | 0 | && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) { |
1696 | 0 | return u32_to_odp(port_no); |
1697 | 0 | } |
1698 | 0 | break; |
1699 | 0 | } |
1700 | 0 | } |
1701 | 0 | } |
1702 | | |
1703 | 0 | for (port_no = 1; port_no <= UINT16_MAX; port_no++) { |
1704 | 0 | if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) { |
1705 | 0 | return u32_to_odp(port_no); |
1706 | 0 | } |
1707 | 0 | } |
1708 | | |
1709 | 0 | return ODPP_NONE; |
1710 | 0 | } |
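/* A minimal, stand-alone sketch of the naming heuristic used by choose_port()
 * above: a digit string inside the name suggests the port number, and names
 * beginning with "br" are offset by 100.  The function name and the omission
 * of the "already in use" check are simplifications for illustration only. */
#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static uint32_t
guess_port_from_name(const char *name)
{
    uint32_t start_no = !strncmp(name, "br", 2) ? 100 : 0;
    const char *p;

    for (p = name; *p != '\0'; p++) {
        if (isdigit((unsigned char) *p)) {
            /* First digit found: use the numeric suffix plus the offset. */
            return start_no + (uint32_t) strtol(p, NULL, 10);
        }
    }
    return 0;   /* No digits: the caller falls back to a sequential scan. */
}

int
main(void)
{
    printf("%u\n", guess_port_from_name("br7"));    /* 107 */
    printf("%u\n", guess_port_from_name("dpdk3"));  /* 3   */
    printf("%u\n", guess_port_from_name("vhost"));  /* 0   */
    return 0;
}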
1711 | | |
1712 | | static uint32_t |
1713 | | dp_meter_hash(uint32_t meter_id) |
1714 | 0 | { |
1715 | |     /* In the ofproto-dpif layer, the id-pool allocates meter ids
1716 | |      * sequentially (e.g. 1, 2, ... N), which already gives a good hash
1717 | |      * distribution.  Use the id directly instead of a hash_xxx() function
1718 | |      * to achieve high performance. */
1719 | 0 | return meter_id; |
1720 | 0 | } |
1721 | | |
1722 | | static void |
1723 | | dp_netdev_meter_destroy(struct dp_netdev *dp) |
1724 | 0 | { |
1725 | 0 | struct dp_meter *m; |
1726 | |
1727 | 0 | ovs_mutex_lock(&dp->meters_lock); |
1728 | 0 | CMAP_FOR_EACH (m, node, &dp->meters) { |
1729 | 0 | cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id)); |
1730 | 0 | ovsrcu_postpone(free, m); |
1731 | 0 | } |
1732 | |
1733 | 0 | cmap_destroy(&dp->meters); |
1734 | 0 | ovs_mutex_unlock(&dp->meters_lock); |
1735 | 0 | ovs_mutex_destroy(&dp->meters_lock); |
1736 | 0 | } |
1737 | | |
1738 | | static struct dp_meter * |
1739 | | dp_meter_lookup(struct cmap *meters, uint32_t meter_id) |
1740 | 0 | { |
1741 | 0 | uint32_t hash = dp_meter_hash(meter_id); |
1742 | 0 | struct dp_meter *m; |
1743 | |
1744 | 0 | CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) { |
1745 | 0 | if (m->id == meter_id) { |
1746 | 0 | return m; |
1747 | 0 | } |
1748 | 0 | } |
1749 | | |
1750 | 0 | return NULL; |
1751 | 0 | } |
1752 | | |
1753 | | static void |
1754 | | dp_meter_detach_free(struct cmap *meters, uint32_t meter_id) |
1755 | 0 | { |
1756 | 0 | struct dp_meter *m = dp_meter_lookup(meters, meter_id); |
1757 | |
1758 | 0 | if (m) { |
1759 | 0 | cmap_remove(meters, &m->node, dp_meter_hash(meter_id)); |
1760 | 0 | ovsrcu_postpone(free, m); |
1761 | 0 | } |
1762 | 0 | } |
1763 | | |
1764 | | static void |
1765 | | dp_meter_attach(struct cmap *meters, struct dp_meter *meter) |
1766 | 0 | { |
1767 | 0 | cmap_insert(meters, &meter->node, dp_meter_hash(meter->id)); |
1768 | 0 | } |
1769 | | |
1770 | | static int |
1771 | | create_dp_netdev(const char *name, const struct dpif_class *class, |
1772 | | struct dp_netdev **dpp) |
1773 | | OVS_REQUIRES(dp_netdev_mutex) |
1774 | 0 | { |
1775 | 0 | static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER; |
1776 | 0 | struct dp_netdev *dp; |
1777 | 0 | int error; |
1778 | | |
1779 | |     /* Avoid estimating the TSC frequency for the dummy datapath, so that
1780 | |      * unit tests are not slowed down. */
1781 | 0 | if (!dpif_netdev_class_is_dummy(class) |
1782 | 0 | && ovsthread_once_start(&tsc_freq_check)) { |
1783 | 0 | pmd_perf_estimate_tsc_frequency(); |
1784 | 0 | ovsthread_once_done(&tsc_freq_check); |
1785 | 0 | } |
1786 | |
1787 | 0 | dp = xzalloc(sizeof *dp); |
1788 | 0 | shash_add(&dp_netdevs, name, dp); |
1789 | |
1790 | 0 | *CONST_CAST(const struct dpif_class **, &dp->class) = class; |
1791 | 0 | *CONST_CAST(const char **, &dp->name) = xstrdup(name); |
1792 | 0 | *CONST_CAST(const char **, &dp->full_name) = xasprintf("%s@%s", |
1793 | 0 | class->type, name); |
1794 | 0 | ovs_refcount_init(&dp->ref_cnt); |
1795 | 0 | atomic_flag_clear(&dp->destroyed); |
1796 | |
1797 | 0 | ovs_rwlock_init(&dp->port_rwlock); |
1798 | 0 | hmap_init(&dp->ports); |
1799 | 0 | dp->port_seq = seq_create(); |
1800 | 0 | ovs_mutex_init(&dp->bond_mutex); |
1801 | 0 | cmap_init(&dp->tx_bonds); |
1802 | |
1803 | 0 | fat_rwlock_init(&dp->upcall_rwlock); |
1804 | |
1805 | 0 | dp->reconfigure_seq = seq_create(); |
1806 | 0 | dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq); |
1807 | | |
1808 | | /* Init meter resources. */ |
1809 | 0 | cmap_init(&dp->meters); |
1810 | 0 | ovs_mutex_init(&dp->meters_lock); |
1811 | | |
1812 | | /* Disable upcalls by default. */ |
1813 | 0 | dp_netdev_disable_upcall(dp); |
1814 | 0 | dp->upcall_aux = NULL; |
1815 | 0 | dp->upcall_cb = NULL; |
1816 | |
|
1817 | 0 | dp->conntrack = conntrack_init(); |
1818 | |
1819 | 0 | dpif_miniflow_extract_init(); |
1820 | |
1821 | 0 | atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN); |
1822 | 0 | atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL); |
1823 | |
1824 | 0 | cmap_init(&dp->poll_threads); |
1825 | 0 | dp->pmd_rxq_assign_type = SCHED_CYCLES; |
1826 | |
1827 | 0 | ovs_mutex_init(&dp->tx_qid_pool_mutex); |
1828 | | /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */ |
1829 | 0 | dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1); |
1830 | |
1831 | 0 | ovs_mutex_init_recursive(&dp->non_pmd_mutex); |
1832 | 0 | ovsthread_key_create(&dp->per_pmd_key, NULL); |
1833 | |
1834 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
1835 | |     /* The non-PMD thread will be created before all other threads and will
1836 | |      * allocate static_tx_qid = 0. */
1837 | 0 | dp_netdev_set_nonpmd(dp); |
1838 | |
1839 | 0 | error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class, |
1840 | 0 | "internal"), |
1841 | 0 | ODPP_LOCAL); |
1842 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1843 | 0 | if (error) { |
1844 | 0 | dp_netdev_free(dp); |
1845 | 0 | return error; |
1846 | 0 | } |
1847 | | |
1848 | 0 | dp->max_sleep_list = NULL; |
1849 | |
1850 | 0 | dp->last_tnl_conf_seq = seq_read(tnl_conf_seq); |
1851 | 0 | *dpp = dp; |
1852 | 0 | return 0; |
1853 | 0 | } |
1854 | | |
1855 | | static void |
1856 | | dp_netdev_request_reconfigure(struct dp_netdev *dp) |
1857 | 0 | { |
1858 | 0 | seq_change(dp->reconfigure_seq); |
1859 | 0 | } |
1860 | | |
1861 | | static bool |
1862 | | dp_netdev_is_reconf_required(struct dp_netdev *dp) |
1863 | 0 | { |
1864 | 0 | return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq; |
1865 | 0 | } |
1866 | | |
1867 | | static int |
1868 | | dpif_netdev_open(const struct dpif_class *class, const char *name, |
1869 | | bool create, struct dpif **dpifp) |
1870 | 0 | { |
1871 | 0 | struct dp_netdev *dp; |
1872 | 0 | int error; |
1873 | |
1874 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1875 | 0 | dp = shash_find_data(&dp_netdevs, name); |
1876 | 0 | if (!dp) { |
1877 | 0 | error = create ? create_dp_netdev(name, class, &dp) : ENODEV; |
1878 | 0 | } else { |
1879 | 0 | error = (dp->class != class ? EINVAL |
1880 | 0 | : create ? EEXIST |
1881 | 0 | : 0); |
1882 | 0 | } |
1883 | 0 | if (!error) { |
1884 | 0 | *dpifp = create_dpif_netdev(dp); |
1885 | 0 | } |
1886 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1887 | |
1888 | 0 | return error; |
1889 | 0 | } |
1890 | | |
1891 | | static void |
1892 | | dp_netdev_destroy_upcall_lock(struct dp_netdev *dp) |
1893 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
1894 | 0 | { |
1895 | | /* Check that upcalls are disabled, i.e. that the rwlock is taken */ |
1896 | 0 | ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock)); |
1897 | | |
1898 | | /* Before freeing a lock we should release it */ |
1899 | 0 | fat_rwlock_unlock(&dp->upcall_rwlock); |
1900 | 0 | fat_rwlock_destroy(&dp->upcall_rwlock); |
1901 | 0 | } |
1902 | | |
1903 | | static uint32_t |
1904 | | hash_bond_id(uint32_t bond_id) |
1905 | 0 | { |
1906 | 0 | return hash_int(bond_id, 0); |
1907 | 0 | } |
1908 | | |
1909 | | /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp' |
1910 | | * through the 'dp_netdevs' shash while freeing 'dp'. */ |
1911 | | static void |
1912 | | dp_netdev_free(struct dp_netdev *dp) |
1913 | | OVS_REQUIRES(dp_netdev_mutex) |
1914 | 0 | { |
1915 | 0 | struct dp_netdev_port *port; |
1916 | 0 | struct tx_bond *bond; |
1917 | |
1918 | 0 | shash_find_and_delete(&dp_netdevs, dp->name); |
1919 | |
1920 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
1921 | 0 | HMAP_FOR_EACH_SAFE (port, node, &dp->ports) { |
1922 | 0 | do_del_port(dp, port); |
1923 | 0 | } |
1924 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
1925 | |
1926 | 0 | ovs_mutex_lock(&dp->bond_mutex); |
1927 | 0 | CMAP_FOR_EACH (bond, node, &dp->tx_bonds) { |
1928 | 0 | cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id)); |
1929 | 0 | ovsrcu_postpone(free, bond); |
1930 | 0 | } |
1931 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
1932 | |
1933 | 0 | dp_netdev_destroy_all_pmds(dp, true); |
1934 | 0 | cmap_destroy(&dp->poll_threads); |
1935 | |
1936 | 0 | ovs_mutex_destroy(&dp->tx_qid_pool_mutex); |
1937 | 0 | id_pool_destroy(dp->tx_qid_pool); |
1938 | |
1939 | 0 | ovs_mutex_destroy(&dp->non_pmd_mutex); |
1940 | 0 | ovsthread_key_delete(dp->per_pmd_key); |
1941 | |
1942 | 0 | conntrack_destroy(dp->conntrack); |
1943 | | |
1944 | |
1945 | 0 | seq_destroy(dp->reconfigure_seq); |
1946 | |
1947 | 0 | seq_destroy(dp->port_seq); |
1948 | 0 | hmap_destroy(&dp->ports); |
1949 | 0 | ovs_rwlock_destroy(&dp->port_rwlock); |
1950 | |
1951 | 0 | cmap_destroy(&dp->tx_bonds); |
1952 | 0 | ovs_mutex_destroy(&dp->bond_mutex); |
1953 | | |
1954 | | /* Upcalls must be disabled at this point */ |
1955 | 0 | dp_netdev_destroy_upcall_lock(dp); |
1956 | |
1957 | 0 | dp_netdev_meter_destroy(dp); |
1958 | |
1959 | 0 | free(dp->max_sleep_list); |
1960 | 0 | free(dp->pmd_cmask); |
1961 | 0 | free(CONST_CAST(char *, dp->name)); |
1962 | 0 | free(CONST_CAST(char *, dp->full_name)); |
1963 | 0 | free(dp); |
1964 | 0 | } |
1965 | | |
1966 | | static void |
1967 | | dp_netdev_unref(struct dp_netdev *dp) |
1968 | 0 | { |
1969 | 0 | if (dp) { |
1970 | | /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't |
1971 | | * get a new reference to 'dp' through the 'dp_netdevs' shash. */ |
1972 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
1973 | 0 | if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) { |
1974 | 0 | dp_netdev_free(dp); |
1975 | 0 | } |
1976 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
1977 | 0 | } |
1978 | 0 | } |
1979 | | |
1980 | | static void |
1981 | | dpif_netdev_close(struct dpif *dpif) |
1982 | 0 | { |
1983 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
1984 | |
1985 | 0 | dp_netdev_unref(dp); |
1986 | 0 | free(dpif); |
1987 | 0 | } |
1988 | | |
1989 | | static int |
1990 | | dpif_netdev_destroy(struct dpif *dpif) |
1991 | 0 | { |
1992 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
1993 | |
1994 | 0 | if (!atomic_flag_test_and_set(&dp->destroyed)) { |
1995 | 0 | if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) { |
1996 | | /* Can't happen: 'dpif' still owns a reference to 'dp'. */ |
1997 | 0 | OVS_NOT_REACHED(); |
1998 | 0 | } |
1999 | 0 | } |
2000 | | |
2001 | 0 | return 0; |
2002 | 0 | } |
2003 | | |
2004 | | /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed |
2005 | | * load/store semantics. While the increment is not atomic, the load and |
2006 | | * store operations are, making it impossible to read inconsistent values. |
2007 | | * |
2008 | | * This is used to update thread local stats counters. */ |
2009 | | static void |
2010 | | non_atomic_ullong_add(atomic_ullong *var, unsigned long long n) |
2011 | 0 | { |
2012 | 0 | unsigned long long tmp; |
2013 | |
2014 | 0 | atomic_read_relaxed(var, &tmp); |
2015 | 0 | tmp += n; |
2016 | 0 | atomic_store_relaxed(var, tmp); |
2017 | 0 | } |
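/* A self-contained C11 illustration of the pattern above: a single writer
 * thread bumps a counter with relaxed load/store (the addition itself is not
 * atomic), while other threads may read it at any time without seeing a torn
 * value.  This mirrors the intent of non_atomic_ullong_add(); the OVS
 * atomic_* wrappers map onto these C11 primitives when the compiler provides
 * <stdatomic.h>. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ullong packets;   /* Written by exactly one thread. */

static void
counter_add(atomic_ullong *var, unsigned long long n)
{
    unsigned long long tmp = atomic_load_explicit(var, memory_order_relaxed);

    tmp += n;                   /* Plain addition: safe with a single writer. */
    atomic_store_explicit(var, tmp, memory_order_relaxed);
}

int
main(void)
{
    counter_add(&packets, 32);
    counter_add(&packets, 10);
    printf("%llu\n", atomic_load_explicit(&packets, memory_order_relaxed));
    return 0;
}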
2018 | | |
2019 | | static int |
2020 | | dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats) |
2021 | 0 | { |
2022 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2023 | 0 | struct dp_netdev_pmd_thread *pmd; |
2024 | 0 | uint64_t pmd_stats[PMD_N_STATS]; |
2025 | |
2026 | 0 | stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0; |
2027 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
2028 | 0 | stats->n_flows += cmap_count(&pmd->flow_table); |
2029 | 0 | pmd_perf_read_counters(&pmd->perf_stats, pmd_stats); |
2030 | 0 | stats->n_hit += pmd_stats[PMD_STAT_PHWOL_HIT]; |
2031 | 0 | stats->n_hit += pmd_stats[PMD_STAT_SIMPLE_HIT]; |
2032 | 0 | stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT]; |
2033 | 0 | stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT]; |
2034 | 0 | stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT]; |
2035 | 0 | stats->n_missed += pmd_stats[PMD_STAT_MISS]; |
2036 | 0 | stats->n_lost += pmd_stats[PMD_STAT_LOST]; |
2037 | 0 | } |
2038 | 0 | stats->n_masks = UINT32_MAX; |
2039 | 0 | stats->n_mask_hit = UINT64_MAX; |
2040 | 0 | stats->n_cache_hit = UINT64_MAX; |
2041 | |
2042 | 0 | return 0; |
2043 | 0 | } |
2044 | | |
2045 | | static void |
2046 | | dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd) |
2047 | 0 | { |
2048 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
2049 | 0 | ovs_mutex_lock(&pmd->dp->non_pmd_mutex); |
2050 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
2051 | 0 | pmd_load_cached_ports(pmd); |
2052 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
2053 | 0 | ovs_mutex_unlock(&pmd->dp->non_pmd_mutex); |
2054 | 0 | return; |
2055 | 0 | } |
2056 | | |
2057 | 0 | seq_change(pmd->reload_seq); |
2058 | 0 | atomic_store_explicit(&pmd->reload, true, memory_order_release); |
2059 | 0 | } |
2060 | | |
2061 | | static uint32_t |
2062 | | hash_port_no(odp_port_t port_no) |
2063 | 0 | { |
2064 | 0 | return hash_int(odp_to_u32(port_no), 0); |
2065 | 0 | } |
2066 | | |
2067 | | static int |
2068 | | port_create(const char *devname, const char *type, |
2069 | | odp_port_t port_no, struct dp_netdev_port **portp) |
2070 | 0 | { |
2071 | 0 | struct dp_netdev_port *port; |
2072 | 0 | enum netdev_flags flags; |
2073 | 0 | struct netdev *netdev; |
2074 | 0 | int error; |
2075 | |
2076 | 0 | *portp = NULL; |
2077 | | |
2078 | | /* Open and validate network device. */ |
2079 | 0 | error = netdev_open(devname, type, &netdev); |
2080 | 0 | if (error) { |
2081 | 0 | return error; |
2082 | 0 | } |
2083 | | /* XXX reject non-Ethernet devices */ |
2084 | | |
2085 | 0 | netdev_get_flags(netdev, &flags); |
2086 | 0 | if (flags & NETDEV_LOOPBACK) { |
2087 | 0 | VLOG_ERR("%s: cannot add a loopback device", devname); |
2088 | 0 | error = EINVAL; |
2089 | 0 | goto out; |
2090 | 0 | } |
2091 | | |
2092 | 0 | port = xzalloc(sizeof *port); |
2093 | 0 | port->port_no = port_no; |
2094 | 0 | port->netdev = netdev; |
2095 | 0 | port->type = xstrdup(type); |
2096 | 0 | port->sf = NULL; |
2097 | 0 | port->emc_enabled = true; |
2098 | 0 | port->need_reconfigure = true; |
2099 | 0 | ovs_mutex_init(&port->txq_used_mutex); |
2100 | |
2101 | 0 | *portp = port; |
2102 | |
2103 | 0 | return 0; |
2104 | | |
2105 | 0 | out: |
2106 | 0 | netdev_close(netdev); |
2107 | 0 | return error; |
2108 | 0 | } |
2109 | | |
2110 | | static int |
2111 | | do_add_port(struct dp_netdev *dp, const char *devname, const char *type, |
2112 | | odp_port_t port_no) |
2113 | | OVS_REQ_WRLOCK(dp->port_rwlock) |
2114 | 0 | { |
2115 | 0 | struct netdev_saved_flags *sf; |
2116 | 0 | struct dp_netdev_port *port; |
2117 | 0 | int error; |
2118 | | |
2119 | | /* Reject devices already in 'dp'. */ |
2120 | 0 | if (!get_port_by_name(dp, devname, &port)) { |
2121 | 0 | return EEXIST; |
2122 | 0 | } |
2123 | | |
2124 | 0 | error = port_create(devname, type, port_no, &port); |
2125 | 0 | if (error) { |
2126 | 0 | return error; |
2127 | 0 | } |
2128 | | |
2129 | 0 | hmap_insert(&dp->ports, &port->node, hash_port_no(port_no)); |
2130 | 0 | seq_change(dp->port_seq); |
2131 | |
2132 | 0 | reconfigure_datapath(dp); |
2133 | | |
2134 | | /* Check that port was successfully configured. */ |
2135 | 0 | if (!dp_netdev_lookup_port(dp, port_no)) { |
2136 | 0 | return EINVAL; |
2137 | 0 | } |
2138 | | |
2139 | |     /* Updating device flags triggers an if_notifier, which triggers a bridge
2140 | |      * reconfiguration and another attempt to add this port, leading to an
2141 | |      * infinite loop if the device is configured incorrectly and cannot be
2142 | |      * added.  Therefore, set the promisc mode only after a successful
2143 | |      * reconfiguration, when we already know the device is properly configured. */
2144 | 0 | error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf); |
2145 | 0 | if (error) { |
2146 | 0 | VLOG_ERR("%s: cannot set promisc flag", devname); |
2147 | 0 | do_del_port(dp, port); |
2148 | 0 | return error; |
2149 | 0 | } |
2150 | 0 | port->sf = sf; |
2151 | |
2152 | 0 | return 0; |
2153 | 0 | } |
2154 | | |
2155 | | static int |
2156 | | dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev, |
2157 | | odp_port_t *port_nop) |
2158 | 0 | { |
2159 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2160 | 0 | char namebuf[NETDEV_VPORT_NAME_BUFSIZE]; |
2161 | 0 | const char *dpif_port; |
2162 | 0 | odp_port_t port_no; |
2163 | 0 | int error; |
2164 | |
2165 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
2166 | 0 | dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf); |
2167 | 0 | if (*port_nop != ODPP_NONE) { |
2168 | 0 | port_no = *port_nop; |
2169 | 0 | error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0; |
2170 | 0 | } else { |
2171 | 0 | port_no = choose_port(dp, dpif_port); |
2172 | 0 | error = port_no == ODPP_NONE ? EFBIG : 0; |
2173 | 0 | } |
2174 | 0 | if (!error) { |
2175 | 0 | *port_nop = port_no; |
2176 | 0 | error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no); |
2177 | 0 | } |
2178 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2179 | |
2180 | 0 | return error; |
2181 | 0 | } |
2182 | | |
2183 | | static int |
2184 | | dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no) |
2185 | 0 | { |
2186 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2187 | 0 | int error; |
2188 | |
2189 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
2190 | 0 | if (port_no == ODPP_LOCAL) { |
2191 | 0 | error = EINVAL; |
2192 | 0 | } else { |
2193 | 0 | struct dp_netdev_port *port; |
2194 | |
2195 | 0 | error = get_port_by_number(dp, port_no, &port); |
2196 | 0 | if (!error) { |
2197 | 0 | do_del_port(dp, port); |
2198 | 0 | } |
2199 | 0 | } |
2200 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2201 | |
2202 | 0 | return error; |
2203 | 0 | } |
2204 | | |
2205 | | static bool |
2206 | | is_valid_port_number(odp_port_t port_no) |
2207 | 0 | { |
2208 | 0 | return port_no != ODPP_NONE; |
2209 | 0 | } |
2210 | | |
2211 | | static struct dp_netdev_port * |
2212 | | dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no) |
2213 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
2214 | 0 | { |
2215 | 0 | struct dp_netdev_port *port; |
2216 | |
2217 | 0 | HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) { |
2218 | 0 | if (port->port_no == port_no) { |
2219 | 0 | return port; |
2220 | 0 | } |
2221 | 0 | } |
2222 | 0 | return NULL; |
2223 | 0 | } |
2224 | | |
2225 | | static int |
2226 | | get_port_by_number(struct dp_netdev *dp, |
2227 | | odp_port_t port_no, struct dp_netdev_port **portp) |
2228 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
2229 | 0 | { |
2230 | 0 | if (!is_valid_port_number(port_no)) { |
2231 | 0 | *portp = NULL; |
2232 | 0 | return EINVAL; |
2233 | 0 | } else { |
2234 | 0 | *portp = dp_netdev_lookup_port(dp, port_no); |
2235 | 0 | return *portp ? 0 : ENODEV; |
2236 | 0 | } |
2237 | 0 | } |
2238 | | |
2239 | | static void |
2240 | | port_destroy(struct dp_netdev_port *port) |
2241 | 0 | { |
2242 | 0 | if (!port) { |
2243 | 0 | return; |
2244 | 0 | } |
2245 | | |
2246 | 0 | netdev_close(port->netdev); |
2247 | 0 | netdev_restore_flags(port->sf); |
2248 | |
2249 | 0 | for (unsigned i = 0; i < port->n_rxq; i++) { |
2250 | 0 | netdev_rxq_close(port->rxqs[i].rx); |
2251 | 0 | } |
2252 | 0 | ovs_mutex_destroy(&port->txq_used_mutex); |
2253 | 0 | free(port->rxq_affinity_list); |
2254 | 0 | free(port->txq_used); |
2255 | 0 | free(port->rxqs); |
2256 | 0 | free(port->type); |
2257 | 0 | free(port); |
2258 | 0 | } |
2259 | | |
2260 | | static int |
2261 | | get_port_by_name(struct dp_netdev *dp, |
2262 | | const char *devname, struct dp_netdev_port **portp) |
2263 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
2264 | 0 | { |
2265 | 0 | struct dp_netdev_port *port; |
2266 | |
2267 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
2268 | 0 | if (!strcmp(netdev_get_name(port->netdev), devname)) { |
2269 | 0 | *portp = port; |
2270 | 0 | return 0; |
2271 | 0 | } |
2272 | 0 | } |
2273 | | |
2274 | |     /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2275 | |      * non-existent port. */
2276 | 0 | return ENODEV; |
2277 | 0 | } |
2278 | | |
2279 | | /* Returns 'true' if there is a port with pmd netdev. */ |
2280 | | static bool |
2281 | | has_pmd_port(struct dp_netdev *dp) |
2282 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
2283 | 0 | { |
2284 | 0 | struct dp_netdev_port *port; |
2285 | |
|
2286 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
2287 | 0 | if (netdev_is_pmd(port->netdev)) { |
2288 | 0 | return true; |
2289 | 0 | } |
2290 | 0 | } |
2291 | | |
2292 | 0 | return false; |
2293 | 0 | } |
2294 | | |
2295 | | static void |
2296 | | do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port) |
2297 | | OVS_REQ_WRLOCK(dp->port_rwlock) |
2298 | 0 | { |
2299 | 0 | hmap_remove(&dp->ports, &port->node); |
2300 | 0 | seq_change(dp->port_seq); |
2301 | |
2302 | 0 | reconfigure_datapath(dp); |
2303 | 0 | port_destroy(port); |
2304 | 0 | } |
2305 | | |
2306 | | static void |
2307 | | answer_port_query(const struct dp_netdev_port *port, |
2308 | | struct dpif_port *dpif_port) |
2309 | 0 | { |
2310 | 0 | dpif_port->name = xstrdup(netdev_get_name(port->netdev)); |
2311 | 0 | dpif_port->type = xstrdup(port->type); |
2312 | 0 | dpif_port->port_no = port->port_no; |
2313 | 0 | } |
2314 | | |
2315 | | static int |
2316 | | dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no, |
2317 | | struct dpif_port *dpif_port) |
2318 | 0 | { |
2319 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2320 | 0 | struct dp_netdev_port *port; |
2321 | 0 | int error; |
2322 | |
2323 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
2324 | 0 | error = get_port_by_number(dp, port_no, &port); |
2325 | 0 | if (!error && dpif_port) { |
2326 | 0 | answer_port_query(port, dpif_port); |
2327 | 0 | } |
2328 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2329 | |
2330 | 0 | return error; |
2331 | 0 | } |
2332 | | |
2333 | | static int |
2334 | | dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname, |
2335 | | struct dpif_port *dpif_port) |
2336 | 0 | { |
2337 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2338 | 0 | struct dp_netdev_port *port; |
2339 | 0 | int error; |
2340 | |
2341 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
2342 | 0 | error = get_port_by_name(dp, devname, &port); |
2343 | 0 | if (!error && dpif_port) { |
2344 | 0 | answer_port_query(port, dpif_port); |
2345 | 0 | } |
2346 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2347 | |
2348 | 0 | return error; |
2349 | 0 | } |
2350 | | |
2351 | | static void |
2352 | | dp_netdev_flow_free(struct dp_netdev_flow *flow) |
2353 | 0 | { |
2354 | 0 | dp_netdev_actions_free(dp_netdev_flow_get_actions(flow)); |
2355 | 0 | free(flow->dp_extra_info); |
2356 | 0 | free(flow); |
2357 | 0 | } |
2358 | | |
2359 | | void dp_netdev_flow_unref(struct dp_netdev_flow *flow) |
2360 | 0 | { |
2361 | 0 | if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) { |
2362 | 0 | ovsrcu_postpone(dp_netdev_flow_free, flow); |
2363 | 0 | } |
2364 | 0 | } |
2365 | | |
2366 | | inline struct dpcls * |
2367 | | dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd, |
2368 | | odp_port_t in_port) |
2369 | 0 | { |
2370 | 0 | struct dpcls *cls; |
2371 | 0 | uint32_t hash = hash_port_no(in_port); |
2372 | 0 | CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) { |
2373 | 0 | if (cls->in_port == in_port) { |
2374 | | /* Port classifier exists already */ |
2375 | 0 | return cls; |
2376 | 0 | } |
2377 | 0 | } |
2378 | 0 | return NULL; |
2379 | 0 | } |
2380 | | |
2381 | | static inline struct dpcls * |
2382 | | dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd, |
2383 | | odp_port_t in_port) |
2384 | | OVS_REQUIRES(pmd->flow_mutex) |
2385 | 0 | { |
2386 | 0 | struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
2387 | |
2388 | 0 | if (!cls) { |
2389 | 0 | uint32_t hash = hash_port_no(in_port); |
2390 | | |
2391 | | /* Create new classifier for in_port */ |
2392 | 0 | cls = xmalloc(sizeof(*cls)); |
2393 | 0 | dpcls_init(cls); |
2394 | 0 | cls->in_port = in_port; |
2395 | 0 | cmap_insert(&pmd->classifiers, &cls->node, hash); |
2396 | 0 | VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port); |
2397 | 0 | } |
2398 | 0 | return cls; |
2399 | 0 | } |
2400 | | |
2401 | | static void |
2402 | | log_netdev_flow_change(const struct dp_netdev_flow *flow, |
2403 | | const struct match *match, |
2404 | | const struct dp_netdev_actions *old_actions, |
2405 | | const struct nlattr *actions, |
2406 | | size_t actions_len) |
2407 | 0 | { |
2408 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
2409 | 0 | struct ofpbuf key_buf, mask_buf; |
2410 | 0 | struct odp_flow_key_parms odp_parms = { |
2411 | 0 | .flow = &match->flow, |
2412 | 0 | .mask = &match->wc.masks, |
2413 | 0 | .support = dp_netdev_support, |
2414 | 0 | }; |
2415 | |
2416 | 0 | if (OVS_LIKELY(VLOG_DROP_DBG((&upcall_rl)))) { |
2417 | 0 | return; |
2418 | 0 | } |
2419 | | |
2420 | 0 | ofpbuf_init(&key_buf, 0); |
2421 | 0 | ofpbuf_init(&mask_buf, 0); |
2422 | |
2423 | 0 | odp_flow_key_from_flow(&odp_parms, &key_buf); |
2424 | 0 | odp_parms.key_buf = &key_buf; |
2425 | 0 | odp_flow_key_from_mask(&odp_parms, &mask_buf); |
2426 | |
2427 | 0 | if (old_actions) { |
2428 | 0 | ds_put_cstr(&ds, "flow_mod: "); |
2429 | 0 | } else { |
2430 | 0 | ds_put_cstr(&ds, "flow_add: "); |
2431 | 0 | } |
2432 | 0 | odp_format_ufid(&flow->ufid, &ds); |
2433 | 0 | ds_put_cstr(&ds, " mega_"); |
2434 | 0 | odp_format_ufid(&flow->mega_ufid, &ds); |
2435 | 0 | ds_put_cstr(&ds, " "); |
2436 | 0 | odp_flow_format(key_buf.data, key_buf.size, |
2437 | 0 | mask_buf.data, mask_buf.size, |
2438 | 0 | NULL, &ds, false, true); |
2439 | 0 | if (old_actions) { |
2440 | 0 | ds_put_cstr(&ds, ", old_actions:"); |
2441 | 0 | format_odp_actions(&ds, old_actions->actions, old_actions->size, |
2442 | 0 | NULL); |
2443 | 0 | } |
2444 | 0 | ds_put_cstr(&ds, ", actions:"); |
2445 | 0 | format_odp_actions(&ds, actions, actions_len, NULL); |
2446 | |
2447 | 0 | VLOG_DBG("%s", ds_cstr(&ds)); |
2448 | |
2449 | 0 | ofpbuf_uninit(&key_buf); |
2450 | 0 | ofpbuf_uninit(&mask_buf); |
2451 | | |
2452 | | /* Add a printout of the actual match installed. */ |
2453 | 0 | struct match m; |
2454 | 0 | ds_clear(&ds); |
2455 | 0 | ds_put_cstr(&ds, "flow match: "); |
2456 | 0 | miniflow_expand(&flow->cr.flow.mf, &m.flow); |
2457 | 0 | miniflow_expand(&flow->cr.mask->mf, &m.wc.masks); |
2458 | 0 | memset(&m.tun_md, 0, sizeof m.tun_md); |
2459 | 0 | match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY); |
2460 | |
2461 | 0 | VLOG_DBG("%s", ds_cstr(&ds)); |
2462 | |
|
2463 | 0 | ds_destroy(&ds); |
2464 | 0 | } |
2465 | | |
2466 | | /* Offloaded flows can be handled asynchronously, so we do not always know |
2467 | | * whether a specific flow is offloaded or not. It might still be pending; |
2468 | | * in fact, multiple modifications can be pending, and the actual offload |
2469 | | * state depends on the completion of each modification. |
2470 | | * |
2471 | | * To correctly determine whether a flow is offloaded when it is being |
2472 | | * destroyed (and therefore requires cleanup), we must ensure that all |
2473 | | * operations have completed. To achieve this, we track the number of |
2474 | | * outstanding offloaded flow modifications. */ |
2475 | | static bool |
2476 | | offload_queue_inc(struct dp_netdev_flow *flow) |
2477 | 0 | { |
2478 | 0 | int current; |
2479 | |
2480 | 0 | while (true) { |
2481 | 0 | atomic_read(&flow->offload_queue_depth, ¤t); |
2482 | 0 | if (current < 0) { |
2483 | | /* We are cleaning up, so no longer enqueue operations. */ |
2484 | 0 | return false; |
2485 | 0 | } |
2486 | | |
2487 | | /* Here we try to atomically increase the value. If we do not succeed, |
2488 | | * someone else has modified it, and we need to check again for a |
2489 | | * current negative value. */ |
2490 | 0 | if (atomic_compare_exchange_strong(&flow->offload_queue_depth, |
2491 | 0 | ¤t, current + 1)) { |
2492 | 0 | return true; |
2493 | 0 | } |
2494 | 0 | } |
2495 | 0 | } |
2496 | | |
2497 | | static bool |
2498 | | offload_queue_dec(struct dp_netdev_flow *flow) |
2499 | 0 | { |
2500 | 0 | int old; |
2501 | |
2502 | 0 | atomic_sub(&flow->offload_queue_depth, 1, &old); |
2503 | 0 | ovs_assert(old >= 1); |
2504 | |
2505 | 0 | if (old == 1) { |
2506 | | /* Note that this only indicates that the queue might be empty. */ |
2507 | 0 | return true; |
2508 | 0 | } |
2509 | 0 | return false; |
2510 | 0 | } |
2511 | | |
2512 | | static bool |
2513 | | offload_queue_complete(struct dp_netdev_flow *flow) |
2514 | 0 | { |
2515 | | /* This function returns false if the queue is still in use. |
2516 | | * If the queue is empty, it will attempt to atomically mark it as |
2517 | | * 'not in use' by making the queue depth negative. This prevents |
2518 | | * other flow operations from being added. If successful, it returns |
2519 | | * true. */ |
2520 | 0 | int expected_val = 0; |
2521 | |
2522 | 0 | return atomic_compare_exchange_strong(&flow->offload_queue_depth, |
2523 | 0 | &expected_val, -1); |
2524 | 0 | } |
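/* A stand-alone sketch of the counting scheme implemented by the three
 * helpers above: a depth counter tracks in-flight operations, and the
 * "complete" step atomically turns an empty queue (depth == 0) into a
 * negative sentinel so that later enqueue attempts are refused.  All names
 * here are illustrative. */
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int depth;        /* >= 0: in-flight count, -1: closed. */

static bool
queue_inc(void)
{
    int cur = atomic_load(&depth);

    while (cur >= 0) {
        /* On CAS failure 'cur' is reloaded, so the sentinel is re-checked. */
        if (atomic_compare_exchange_strong(&depth, &cur, cur + 1)) {
            return true;
        }
    }
    return false;               /* Already closed, refuse new work. */
}

static bool
queue_dec(void)
{
    int old = atomic_fetch_sub(&depth, 1);

    assert(old >= 1);
    return old == 1;            /* Queue *might* be empty now. */
}

static bool
queue_complete(void)
{
    int expected = 0;

    return atomic_compare_exchange_strong(&depth, &expected, -1);
}

int
main(void)
{
    assert(queue_inc());        /* depth: 1 */
    assert(queue_dec());        /* depth: 0, possibly empty */
    assert(queue_complete());   /* depth: -1, closed */
    assert(!queue_inc());       /* refused once closed */
    printf("ok\n");
    return 0;
}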
2525 | | |
2526 | | static void |
2527 | | offload_flow_reference_unreference_cb(unsigned pmd_id OVS_UNUSED, |
2528 | | void *flow_reference_) |
2529 | 0 | { |
2530 | 0 | struct dp_netdev_flow *flow_reference = flow_reference_; |
2531 | |
2532 | 0 | if (flow_reference) { |
2533 | 0 | flow_reference->offloaded = false; |
2534 | 0 | dp_netdev_flow_unref(flow_reference); |
2535 | 0 | } |
2536 | 0 | } |
2537 | | |
2538 | | static void |
2539 | | offload_flow_del_resume(struct dp_netdev_flow *flow_reference, |
2540 | | int error) |
2541 | 0 | { |
2542 | 0 | if (error == EINPROGRESS) { |
2543 | 0 | return; |
2544 | 0 | } |
2545 | | |
2546 | 0 | if (error) { |
2547 | 0 | odp_port_t in_port = flow_reference->flow.in_port.odp_port; |
2548 | |
2549 | 0 | VLOG_DBG( |
2550 | 0 | "Failed removing offload flow ufid " UUID_FMT " from port %d: %d", |
2551 | 0 | UUID_ARGS((struct uuid *)&flow_reference->mega_ufid), in_port, |
2552 | 0 | error); |
2553 | 0 | } else { |
2554 | | /* Release because we successfully removed the reference. */ |
2555 | 0 | dp_netdev_flow_unref(flow_reference); |
2556 | 0 | } |
2557 | | |
2558 | | /* Release as we took a reference in offload_flow_del(). */ |
2559 | 0 | dp_netdev_flow_unref(flow_reference); |
2560 | 0 | } |
2561 | | |
2562 | | static void |
2563 | | offload_flow_del_resume_cb(void *aux OVS_UNUSED, |
2564 | | struct dpif_flow_stats *stats OVS_UNUSED, |
2565 | | unsigned pmd_id OVS_UNUSED, |
2566 | | void *flow_reference, |
2567 | | void *previous_flow_reference OVS_UNUSED, int error) |
2568 | 0 | { |
2569 | 0 | offload_flow_del_resume(flow_reference, error); |
2570 | 0 | } |
2571 | | |
2572 | | static void |
2573 | | offload_flow_del(struct dp_netdev *dp, unsigned pmd_id, |
2574 | | struct dp_netdev_flow *flow) |
2575 | 0 | { |
2576 | 0 | odp_port_t in_port = flow->flow.in_port.odp_port; |
2577 | 0 | struct dpif_offload_flow_del del = { |
2578 | 0 | .in_port = in_port, |
2579 | 0 | .pmd_id = pmd_id, |
2580 | 0 | .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid), |
2581 | 0 | .flow_reference = flow, |
2582 | 0 | .stats = NULL, |
2583 | 0 | .cb_data = { .callback = offload_flow_del_resume_cb }, |
2584 | 0 | }; |
2585 | 0 | int error; |
2586 | |
2587 | 0 | if (!dpif_offload_enabled()) { |
2588 | 0 | return; |
2589 | 0 | } |
2590 | | |
2591 | | /* This offload flow delete is only called when the actual flow is |
2592 | |      * destroyed. However, we can only trust the state of flow->offloaded
2593 | | * if no more flow_put operations are pending. Below, we check whether |
2594 | | * the queue can be marked as complete, and then determine if we need |
2595 | | * to schedule a removal. If not, the delete will be rescheduled later |
2596 | | * in the last offload_flow_put_resume_cb() callback. */ |
2597 | 0 | ovs_assert(flow->dead); |
2598 | 0 | if (!offload_queue_complete(flow) || !flow->offloaded) { |
2599 | 0 | return; |
2600 | 0 | } |
2601 | | |
2602 | 0 | flow->offloaded = false; |
2603 | 0 | dp_netdev_flow_ref(flow); |
2604 | | |
2605 | | /* It's the responsibility of the offload provider to remove the |
2606 | | * actual rule from hardware only if none of the other PMD threads |
2607 | | * have the rule installed in hardware. */ |
2608 | 0 | error = dpif_offload_datapath_flow_del(dp->full_name, &del); |
2609 | 0 | offload_flow_del_resume(flow, error); |
2610 | 0 | } |
2611 | | |
2612 | | static void |
2613 | | dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd, |
2614 | | struct dp_netdev_flow *flow) |
2615 | | OVS_REQUIRES(pmd->flow_mutex) |
2616 | 0 | { |
2617 | 0 | struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node); |
2618 | 0 | struct dpcls *cls; |
2619 | 0 | odp_port_t in_port = flow->flow.in_port.odp_port; |
2620 | |
2621 | 0 | cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
2622 | 0 | ovs_assert(cls != NULL); |
2623 | 0 | dpcls_remove(cls, &flow->cr); |
2624 | 0 | dp_netdev_simple_match_remove(pmd, flow); |
2625 | 0 | cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid)); |
2626 | 0 | ccmap_dec(&pmd->n_flows, odp_to_u32(in_port)); |
2627 | 0 | flow->dead = true; |
2628 | 0 | offload_flow_del(pmd->dp, pmd->core_id, flow); |
2629 | |
2630 | 0 | dp_netdev_flow_unref(flow); |
2631 | 0 | } |
2632 | | |
2633 | | static void |
2634 | | dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd) |
2635 | 0 | { |
2636 | 0 | struct dp_netdev_flow *netdev_flow; |
2637 | |
2638 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
2639 | 0 | CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) { |
2640 | 0 | dp_netdev_pmd_remove_flow(pmd, netdev_flow); |
2641 | 0 | } |
2642 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
2643 | 0 | } |
2644 | | |
2645 | | static int |
2646 | | dpif_netdev_flow_flush(struct dpif *dpif) |
2647 | 0 | { |
2648 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2649 | 0 | struct dp_netdev_pmd_thread *pmd; |
2650 | |
2651 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
2652 | 0 | dp_netdev_pmd_flow_flush(pmd); |
2653 | 0 | } |
2654 | |
2655 | 0 | return 0; |
2656 | 0 | } |
2657 | | |
2658 | | struct dp_netdev_port_state { |
2659 | | struct hmap_position position; |
2660 | | char *name; |
2661 | | }; |
2662 | | |
2663 | | static int |
2664 | | dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep) |
2665 | 0 | { |
2666 | 0 | *statep = xzalloc(sizeof(struct dp_netdev_port_state)); |
2667 | 0 | return 0; |
2668 | 0 | } |
2669 | | |
2670 | | static int |
2671 | | dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_, |
2672 | | struct dpif_port *dpif_port) |
2673 | 0 | { |
2674 | 0 | struct dp_netdev_port_state *state = state_; |
2675 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
2676 | 0 | struct hmap_node *node; |
2677 | 0 | int retval; |
2678 | |
2679 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
2680 | 0 | node = hmap_at_position(&dp->ports, &state->position); |
2681 | 0 | if (node) { |
2682 | 0 | struct dp_netdev_port *port; |
2683 | |
2684 | 0 | port = CONTAINER_OF(node, struct dp_netdev_port, node); |
2685 | |
2686 | 0 | free(state->name); |
2687 | 0 | state->name = xstrdup(netdev_get_name(port->netdev)); |
2688 | 0 | dpif_port->name = state->name; |
2689 | 0 | dpif_port->type = port->type; |
2690 | 0 | dpif_port->port_no = port->port_no; |
2691 | |
2692 | 0 | retval = 0; |
2693 | 0 | } else { |
2694 | 0 | retval = EOF; |
2695 | 0 | } |
2696 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
2697 | |
|
2698 | 0 | return retval; |
2699 | 0 | } |
2700 | | |
2701 | | static int |
2702 | | dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_) |
2703 | 0 | { |
2704 | 0 | struct dp_netdev_port_state *state = state_; |
2705 | 0 | free(state->name); |
2706 | 0 | free(state); |
2707 | 0 | return 0; |
2708 | 0 | } |
2709 | | |
2710 | | static int |
2711 | | dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED) |
2712 | 0 | { |
2713 | 0 | struct dpif_netdev *dpif = dpif_netdev_cast(dpif_); |
2714 | 0 | uint64_t new_port_seq; |
2715 | 0 | int error; |
2716 | |
2717 | 0 | new_port_seq = seq_read(dpif->dp->port_seq); |
2718 | 0 | if (dpif->last_port_seq != new_port_seq) { |
2719 | 0 | dpif->last_port_seq = new_port_seq; |
2720 | 0 | error = ENOBUFS; |
2721 | 0 | } else { |
2722 | 0 | error = EAGAIN; |
2723 | 0 | } |
2724 | |
2725 | 0 | return error; |
2726 | 0 | } |
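/* A minimal stand-alone model of the change-detection pattern above: a
 * global sequence number is bumped on every change and each observer caches
 * the last value it saw.  ENOBUFS/EAGAIN are the return codes used by the
 * dpif layer; everything else here is illustrative. */
#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_ullong port_seq;  /* Bumped on every port change. */

static void
ports_changed(void)
{
    atomic_fetch_add(&port_seq, 1);
}

static int
poll_ports(unsigned long long *last_seen)
{
    unsigned long long now = atomic_load(&port_seq);

    if (*last_seen != now) {
        *last_seen = now;
        return ENOBUFS;         /* Something changed; caller should re-dump. */
    }
    return EAGAIN;              /* Nothing new since the last poll. */
}

int
main(void)
{
    unsigned long long seen = atomic_load(&port_seq);

    printf("%d\n", poll_ports(&seen) == EAGAIN);    /* 1: no change yet */
    ports_changed();
    printf("%d\n", poll_ports(&seen) == ENOBUFS);   /* 1: change noticed */
    return 0;
}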
2727 | | |
2728 | | static void |
2729 | | dpif_netdev_port_poll_wait(const struct dpif *dpif_) |
2730 | 0 | { |
2731 | 0 | struct dpif_netdev *dpif = dpif_netdev_cast(dpif_); |
2732 | |
2733 | 0 | seq_wait(dpif->dp->port_seq, dpif->last_port_seq); |
2734 | 0 | } |
2735 | | |
2736 | | static struct dp_netdev_flow * |
2737 | | dp_netdev_flow_cast(const struct dpcls_rule *cr) |
2738 | 0 | { |
2739 | 0 | return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL; |
2740 | 0 | } |
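/* CONTAINER_OF recovers the enclosing structure from a pointer to one of its
 * members; that is how the dpcls_rule embedded in dp_netdev_flow is mapped
 * back to the flow in dp_netdev_flow_cast() above.  A stand-alone
 * illustration using the classic offsetof() formulation (all names below are
 * illustrative): */
#include <stddef.h>
#include <stdio.h>

#define CONTAINER_OF_(ptr, type, member) \
    ((type *) ((char *) (ptr) - offsetof(type, member)))

struct rule {
    int priority;
};

struct flow_entry {
    int id;
    struct rule cr;             /* Embedded member, like 'cr' above. */
};

int
main(void)
{
    struct flow_entry f = { .id = 7, .cr = { .priority = 3 } };
    struct rule *r = &f.cr;
    struct flow_entry *back = CONTAINER_OF_(r, struct flow_entry, cr);

    printf("%d\n", back->id);   /* 7 */
    return 0;
}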
2741 | | |
2742 | | static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow) |
2743 | 0 | { |
2744 | 0 | return ovs_refcount_try_ref_rcu(&flow->ref_cnt); |
2745 | 0 | } |
2746 | | |
2747 | | /* netdev_flow_key utilities. |
2748 | | * |
2749 | | * netdev_flow_key is basically a miniflow. We use these functions |
2750 | | * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow |
2751 | | * functions (miniflow_clone_inline, miniflow_equal, ...), because: |
2752 | | * |
2753 | | * - Since we are dealing exclusively with miniflows created by |
2754 | | * miniflow_extract(), if the map is different the miniflow is different. |
2755 | | * Therefore we can be faster by comparing the map and the miniflow in a |
2756 | | * single memcmp(). |
2757 | | * - These functions can be inlined by the compiler. */ |
2758 | | |
2759 | | static inline bool |
2760 | | netdev_flow_key_equal(const struct netdev_flow_key *a, |
2761 | | const struct netdev_flow_key *b) |
2762 | 0 | { |
2763 | |     /* 'b->len' may not be set yet. */
2764 | 0 | return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len); |
2765 | 0 | } |
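/* A stand-alone sketch of the comparison trick above: a variable-length key
 * that carries its own hash and byte length can be compared with one cheap
 * hash test plus a single memcmp() that covers both the "map" header and the
 * packed values.  The layout below is illustrative, not the real miniflow. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_WORDS 8

struct packed_key {
    uint32_t hash;              /* Precomputed over the packed data. */
    uint32_t len;               /* Bytes to compare, starting at 'map'. */
    uint64_t map;               /* Bitmap of which fields are present. */
    uint64_t values[MAX_WORDS]; /* Only the present fields, packed. */
};

static bool
packed_key_equal(const struct packed_key *a, const struct packed_key *b)
{
    /* Keys built by the same extractor have equal maps iff the same fields
     * are present, so a differing map makes the memcmp() fail without ever
     * expanding the keys into full flows. */
    return a->hash == b->hash && !memcmp(&a->map, &b->map, a->len);
}

int
main(void)
{
    struct packed_key a = { .hash = 42, .len = 3 * sizeof(uint64_t),
                            .map = 0x5, .values = { 10, 20 } };
    struct packed_key b = a;

    printf("%d\n", packed_key_equal(&a, &b));   /* 1 */
    b.values[1] = 21;
    printf("%d\n", packed_key_equal(&a, &b));   /* 0 */
    return 0;
}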
2766 | | |
2767 | | static inline void |
2768 | | netdev_flow_key_clone(struct netdev_flow_key *dst, |
2769 | | const struct netdev_flow_key *src) |
2770 | 0 | { |
2771 | 0 | memcpy(dst, src, |
2772 | 0 | offsetof(struct netdev_flow_key, mf) + src->len); |
2773 | 0 | } |
2774 | | |
2775 | | /* Initialize a netdev_flow_key 'mask' from 'match'. */ |
2776 | | static inline void |
2777 | | netdev_flow_mask_init(struct netdev_flow_key *mask, |
2778 | | const struct match *match) |
2779 | 0 | { |
2780 | 0 | uint64_t *dst = miniflow_values(&mask->mf); |
2781 | 0 | struct flowmap fmap; |
2782 | 0 | uint32_t hash = 0; |
2783 | 0 | size_t idx; |
2784 | | |
2785 | | /* Only check masks that make sense for the flow. */ |
2786 | 0 | flow_wc_map(&match->flow, &fmap); |
2787 | 0 | flowmap_init(&mask->mf.map); |
2788 | |
2789 | 0 | FLOWMAP_FOR_EACH_INDEX(idx, fmap) { |
2790 | 0 | uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx); |
2791 | |
2792 | 0 | if (mask_u64) { |
2793 | 0 | flowmap_set(&mask->mf.map, idx, 1); |
2794 | 0 | *dst++ = mask_u64; |
2795 | 0 | hash = hash_add64(hash, mask_u64); |
2796 | 0 | } |
2797 | 0 | } |
2798 | |
2799 | 0 | map_t map; |
2800 | |
2801 | 0 | FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) { |
2802 | 0 | hash = hash_add64(hash, map); |
2803 | 0 | } |
2804 | |
2805 | 0 | size_t n = dst - miniflow_get_values(&mask->mf); |
2806 | |
2807 | 0 | mask->hash = hash_finish(hash, n * 8); |
2808 | 0 | mask->len = netdev_flow_key_size(n); |
2809 | 0 | } |
2810 | | |
2811 | | /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */ |
2812 | | static inline void |
2813 | | netdev_flow_key_init_masked(struct netdev_flow_key *dst, |
2814 | | const struct flow *flow, |
2815 | | const struct netdev_flow_key *mask) |
2816 | 0 | { |
2817 | 0 | uint64_t *dst_u64 = miniflow_values(&dst->mf); |
2818 | 0 | const uint64_t *mask_u64 = miniflow_get_values(&mask->mf); |
2819 | 0 | uint32_t hash = 0; |
2820 | 0 | uint64_t value; |
2821 | |
|
2822 | 0 | dst->len = mask->len; |
2823 | 0 | dst->mf = mask->mf; /* Copy maps. */ |
2824 | |
|
2825 | 0 | FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) { |
2826 | 0 | *dst_u64 = value & *mask_u64++; |
2827 | 0 | hash = hash_add64(hash, *dst_u64++); |
2828 | 0 | } |
2829 | 0 | dst->hash = hash_finish(hash, |
2830 | 0 | (dst_u64 - miniflow_get_values(&dst->mf)) * 8); |
2831 | 0 | } |
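/* A stand-alone sketch of the single-pass mask-and-hash loop above: each
 * present field is ANDed with its mask word while an incremental hash is
 * folded over the result, so masking and hashing need only one walk over the
 * packed values.  The 64-bit FNV-1a step is only a self-contained stand-in
 * for OVS's hash_add64()/hash_finish() chain. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t
fnv1a_add64(uint64_t hash, uint64_t value)
{
    return (hash ^ value) * UINT64_C(1099511628211);
}

static uint64_t
mask_and_hash(uint64_t *dst, const uint64_t *flow, const uint64_t *mask,
              size_t n_words)
{
    uint64_t hash = UINT64_C(14695981039346656037);

    for (size_t i = 0; i < n_words; i++) {
        dst[i] = flow[i] & mask[i];         /* Masked copy... */
        hash = fnv1a_add64(hash, dst[i]);   /* ...hashed in the same pass. */
    }
    return hash;
}

int
main(void)
{
    uint64_t flow[] = { 0x1234, 0xabcd, 0xffff };
    uint64_t mask[] = { 0xff00, 0xffff, 0x000f };
    uint64_t out[3];
    uint64_t hash = mask_and_hash(out, flow, mask, 3);

    printf("%llx %llx %llx -> %llx\n",
           (unsigned long long) out[0], (unsigned long long) out[1],
           (unsigned long long) out[2], (unsigned long long) hash);
    return 0;
}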
2832 | | |
2833 | | /* Initializes 'key' as a copy of 'flow'. */ |
2834 | | static inline void |
2835 | | netdev_flow_key_init(struct netdev_flow_key *key, |
2836 | | const struct flow *flow) |
2837 | 0 | { |
2838 | 0 | uint32_t hash = 0; |
2839 | 0 | uint64_t value; |
2840 | |
2841 | 0 | miniflow_map_init(&key->mf, flow); |
2842 | 0 | miniflow_init(&key->mf, flow); |
2843 | |
2844 | 0 | size_t n = miniflow_n_values(&key->mf); |
2845 | |
2846 | 0 | FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) { |
2847 | 0 | hash = hash_add64(hash, value); |
2848 | 0 | } |
2849 | |
2850 | 0 | key->hash = hash_finish(hash, n * 8); |
2851 | 0 | key->len = netdev_flow_key_size(n); |
2852 | 0 | } |
2853 | | |
2854 | | static inline void |
2855 | | emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow, |
2856 | | const struct netdev_flow_key *key) |
2857 | 0 | { |
2858 | 0 | if (ce->flow != flow) { |
2859 | 0 | if (ce->flow) { |
2860 | 0 | dp_netdev_flow_unref(ce->flow); |
2861 | 0 | } |
2862 | |
2863 | 0 | if (dp_netdev_flow_ref(flow)) { |
2864 | 0 | ce->flow = flow; |
2865 | 0 | } else { |
2866 | 0 | ce->flow = NULL; |
2867 | 0 | } |
2868 | 0 | } |
2869 | 0 | if (key) { |
2870 | 0 | netdev_flow_key_clone(&ce->key, key); |
2871 | 0 | } |
2872 | 0 | } |
2873 | | |
2874 | | static inline void |
2875 | | emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key, |
2876 | | struct dp_netdev_flow *flow) |
2877 | 0 | { |
2878 | 0 | struct emc_entry *to_be_replaced = NULL; |
2879 | 0 | struct emc_entry *current_entry; |
2880 | |
2881 | 0 | EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) { |
2882 | 0 | if (netdev_flow_key_equal(¤t_entry->key, key)) { |
2883 | | /* We found the entry with the 'mf' miniflow */ |
2884 | 0 | emc_change_entry(current_entry, flow, NULL); |
2885 | 0 | return; |
2886 | 0 | } |
2887 | | |
2888 | |         /* Replacement policy: put the flow in an empty (not alive) entry, or
2889 | |          * in the first entry where it can be placed. */
2890 | 0 | if (!to_be_replaced |
2891 | 0 | || (emc_entry_alive(to_be_replaced) |
2892 | 0 | && !emc_entry_alive(current_entry)) |
2893 | 0 | || current_entry->key.hash < to_be_replaced->key.hash) { |
2894 | 0 | to_be_replaced = current_entry; |
2895 | 0 | } |
2896 | 0 | } |
2897 | | /* We didn't find the miniflow in the cache. |
2898 | | * The 'to_be_replaced' entry is where the new flow will be stored */ |
2899 | | |
2900 | 0 | emc_change_entry(to_be_replaced, flow, key); |
2901 | 0 | } |
2902 | | |
2903 | | static inline void |
2904 | | emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd, |
2905 | | const struct netdev_flow_key *key, |
2906 | | struct dp_netdev_flow *flow) |
2907 | 0 | { |
2908 | |     /* Insert an entry into the EMC based on the probability value 'min'. By
2909 | |      * default the value is UINT32_MAX / 100, which yields an insertion
2910 | |      * probability of 1/100, i.e. 1%. */
2911 | |
2912 | 0 | uint32_t min = pmd->ctx.emc_insert_min; |
2913 | |
2914 | 0 | if (min && random_uint32() <= min) { |
2915 | 0 | emc_insert(&(pmd->flow_cache).emc_cache, key, flow); |
2916 | 0 | } |
2917 | 0 | } |
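/* A stand-alone sketch of the probabilistic-insert test above: an inverse
 * probability N is turned into the threshold UINT32_MAX / N, and an insert
 * happens when a uniform 32-bit random draw falls at or below that threshold
 * (N == 100 therefore gives roughly a 1% insert rate).  The xorshift PRNG
 * below only stands in for OVS's random_uint32(). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
xorshift32(uint32_t *state)
{
    uint32_t x = *state;

    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return *state = x;
}

static bool
should_insert(uint32_t *prng, uint32_t inverse_prob)
{
    uint32_t min = inverse_prob ? UINT32_MAX / inverse_prob : 0;

    return min && xorshift32(prng) <= min;
}

int
main(void)
{
    uint32_t prng = 0x12345678;
    unsigned int inserted = 0;

    for (int i = 0; i < 1000000; i++) {
        inserted += should_insert(&prng, 100);
    }
    printf("%u\n", inserted);   /* Roughly 10000, i.e. about 1%. */
    return 0;
}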
2918 | | |
2919 | | static inline const struct cmap_node * |
2920 | | smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash) |
2921 | 0 | { |
2922 | 0 | struct smc_cache *cache = &(pmd->flow_cache).smc_cache; |
2923 | 0 | struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK]; |
2924 | 0 | uint16_t sig = hash >> 16; |
2925 | 0 | uint16_t index = UINT16_MAX; |
2926 | |
|
2927 | 0 | for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { |
2928 | 0 | if (bucket->sig[i] == sig) { |
2929 | 0 | index = bucket->flow_idx[i]; |
2930 | 0 | break; |
2931 | 0 | } |
2932 | 0 | } |
2933 | 0 | if (index != UINT16_MAX) { |
2934 | 0 | return cmap_find_by_index(&pmd->flow_table, index); |
2935 | 0 | } |
2936 | 0 | return NULL; |
2937 | 0 | } |
2938 | | |
2939 | | /* Insert the flow_table index into the SMC. Insertion may fail when 1) the
2940 | |  * SMC is turned off, or 2) the flow_table index is larger than a uint16_t
2941 | |  * can hold. If an SMC entry with the same signature already exists, its
2942 | |  * index is updated. If there is no such entry but an empty slot is
2943 | |  * available, the empty slot is taken. If there is neither, a random entry
2944 | |  * from the hashed bucket is replaced. */
2945 | | static inline void |
2946 | | smc_insert(struct dp_netdev_pmd_thread *pmd, |
2947 | | const struct netdev_flow_key *key, |
2948 | | uint32_t hash) |
2949 | 0 | { |
2950 | 0 | struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache; |
2951 | 0 | struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK]; |
2952 | 0 | uint16_t index; |
2953 | 0 | uint32_t cmap_index; |
2954 | 0 | int i; |
2955 | |
2956 | 0 | if (!pmd->ctx.smc_enable_db) { |
2957 | 0 | return; |
2958 | 0 | } |
2959 | | |
2960 | 0 | cmap_index = cmap_find_index(&pmd->flow_table, hash); |
2961 | 0 | index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index; |
2962 | | |
2963 | |     /* If the index is larger than the SMC can handle (uint16_t), we don't
2964 | |      * insert. */
2965 | 0 | if (index == UINT16_MAX) { |
2966 | 0 | return; |
2967 | 0 | } |
2968 | | |
2969 | |     /* If an entry with the same signature already exists, update its index. */
2970 | 0 | uint16_t sig = key->hash >> 16; |
2971 | 0 | for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { |
2972 | 0 | if (bucket->sig[i] == sig) { |
2973 | 0 | bucket->flow_idx[i] = index; |
2974 | 0 | return; |
2975 | 0 | } |
2976 | 0 | } |
2977 | | /* If there is an empty entry, occupy it. */ |
2978 | 0 | for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { |
2979 | 0 | if (bucket->flow_idx[i] == UINT16_MAX) { |
2980 | 0 | bucket->sig[i] = sig; |
2981 | 0 | bucket->flow_idx[i] = index; |
2982 | 0 | return; |
2983 | 0 | } |
2984 | 0 | } |
2985 | | /* Otherwise, pick a random entry. */ |
2986 | 0 | i = random_uint32() % SMC_ENTRY_PER_BUCKET; |
2987 | 0 | bucket->sig[i] = sig; |
2988 | 0 | bucket->flow_idx[i] = index; |
2989 | 0 | } |
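/* A stand-alone sketch of the signature-cache layout used by smc_entry_get()
 * and smc_insert() above: the low bits of the hash pick a bucket, the high
 * 16 bits become a signature, and only a 16-bit index into an external flow
 * table is stored.  Sizes, names and the eviction of slot 0 (instead of a
 * random slot) are simplifications, not the real SMC geometry. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define N_BUCKETS 64                    /* Power of two. */
#define BUCKET_MASK (N_BUCKETS - 1)
#define ENTRIES_PER_BUCKET 4
#define EMPTY_IDX UINT16_MAX

struct sig_bucket {
    uint16_t sig[ENTRIES_PER_BUCKET];
    uint16_t idx[ENTRIES_PER_BUCKET];
};

static struct sig_bucket cache[N_BUCKETS];

static void
cache_init(void)
{
    memset(cache, 0xff, sizeof cache);  /* Mark every slot empty. */
}

static void
cache_insert(uint32_t hash, uint16_t index)
{
    struct sig_bucket *b = &cache[hash & BUCKET_MASK];
    uint16_t sig = hash >> 16;

    for (int i = 0; i < ENTRIES_PER_BUCKET; i++) {
        if (b->sig[i] == sig) {         /* Same signature: update the index. */
            b->idx[i] = index;
            return;
        }
    }
    for (int i = 0; i < ENTRIES_PER_BUCKET; i++) {
        if (b->idx[i] == EMPTY_IDX) {   /* Take the first empty slot. */
            b->sig[i] = sig;
            b->idx[i] = index;
            return;
        }
    }
    /* Full bucket: evict slot 0 here; the code above picks a random slot. */
    b->sig[0] = sig;
    b->idx[0] = index;
}

static uint16_t
cache_lookup(uint32_t hash)
{
    const struct sig_bucket *b = &cache[hash & BUCKET_MASK];
    uint16_t sig = hash >> 16;

    for (int i = 0; i < ENTRIES_PER_BUCKET; i++) {
        if (b->sig[i] == sig) {
            /* May be a false positive; the caller re-checks the flow. */
            return b->idx[i];
        }
    }
    return EMPTY_IDX;
}

int
main(void)
{
    cache_init();
    cache_insert(0xdeadbeef, 12);
    printf("%u %u\n", cache_lookup(0xdeadbeef), cache_lookup(0x12345678));
    return 0;
}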
2990 | | |
2991 | | inline void |
2992 | | emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd, |
2993 | | const struct netdev_flow_key *keys, |
2994 | | struct dpcls_rule **rules, |
2995 | | uint32_t emc_insert_mask) |
2996 | 0 | { |
2997 | 0 | while (emc_insert_mask) { |
2998 | 0 | uint32_t i = raw_ctz(emc_insert_mask); |
2999 | 0 | emc_insert_mask &= emc_insert_mask - 1; |
3000 | |         /* Get the required parameters for EMC/SMC from the rule. */
3001 | 0 | struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]); |
3002 | | /* Insert the key into EMC/SMC. */ |
3003 | 0 | emc_probabilistic_insert(pmd, &keys[i], flow); |
3004 | 0 | } |
3005 | 0 | } |
3006 | | |
3007 | | inline void |
3008 | | smc_insert_batch(struct dp_netdev_pmd_thread *pmd, |
3009 | | const struct netdev_flow_key *keys, |
3010 | | struct dpcls_rule **rules, |
3011 | | uint32_t smc_insert_mask) |
3012 | 0 | { |
3013 | 0 | while (smc_insert_mask) { |
3014 | 0 | uint32_t i = raw_ctz(smc_insert_mask); |
3015 | 0 | smc_insert_mask &= smc_insert_mask - 1; |
3016 | |         /* Get the required parameters for EMC/SMC from the rule. */
3017 | 0 | struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]); |
3018 | 0 | uint32_t hash = dp_netdev_flow_hash(&flow->ufid); |
3019 | | /* Insert the key into EMC/SMC. */ |
3020 | 0 | smc_insert(pmd, &keys[i], hash); |
3021 | 0 | } |
3022 | 0 | } |
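/* Illustrative usage sketch (not part of the original source): a caller that
 * has classified a batch of packets can mark which positions should be cached
 * by setting bit 'i' in a mask and then handing the masks to the batch
 * helpers above.  The names 'hits' and 'n_keys' are hypothetical; 'pmd',
 * 'keys' and 'rules' stand for the caller's batch state.
 *
 *     uint32_t emc_insert_mask = 0, smc_insert_mask = 0;
 *
 *     for (uint32_t i = 0; i < n_keys; i++) {
 *         if (hits[i]) {
 *             emc_insert_mask |= UINT32_C(1) << i;
 *             smc_insert_mask |= UINT32_C(1) << i;
 *         }
 *     }
 *     emc_probabilistic_insert_batch(pmd, keys, rules, emc_insert_mask);
 *     smc_insert_batch(pmd, keys, rules, smc_insert_mask);
 */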
3023 | | |
3024 | | static struct dp_netdev_flow * |
3025 | | dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd, |
3026 | | const struct netdev_flow_key *key, |
3027 | | int *lookup_num_p) |
3028 | 0 | { |
3029 | 0 | struct dpcls *cls; |
3030 | 0 | struct dpcls_rule *rule = NULL; |
3031 | 0 | odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf, |
3032 | 0 | in_port.odp_port)); |
3033 | 0 | struct dp_netdev_flow *netdev_flow = NULL; |
3034 | |
|
3035 | 0 | cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
3036 | 0 | if (OVS_LIKELY(cls)) { |
3037 | 0 | dpcls_lookup(cls, &key, &rule, 1, lookup_num_p); |
3038 | 0 | netdev_flow = dp_netdev_flow_cast(rule); |
3039 | 0 | } |
3040 | 0 | return netdev_flow; |
3041 | 0 | } |
3042 | | |
3043 | | static struct dp_netdev_flow * |
3044 | | dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd, |
3045 | | const ovs_u128 *ufidp, const struct nlattr *key, |
3046 | | size_t key_len) |
3047 | 0 | { |
3048 | 0 | struct dp_netdev_flow *netdev_flow; |
3049 | 0 | struct flow flow; |
3050 | 0 | ovs_u128 ufid; |
3051 | | |
3052 | | /* If a UFID is not provided, determine one based on the key. */ |
3053 | 0 | if (!ufidp && key && key_len |
3054 | 0 | && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) { |
3055 | 0 | odp_flow_key_hash(&flow, sizeof flow, &ufid); |
3056 | 0 | ufidp = &ufid; |
3057 | 0 | } |
3058 | |
|
3059 | 0 | if (ufidp) { |
3060 | 0 | CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp), |
3061 | 0 | &pmd->flow_table) { |
3062 | 0 | if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) { |
3063 | 0 | return netdev_flow; |
3064 | 0 | } |
3065 | 0 | } |
3066 | 0 | } |
3067 | | |
3068 | 0 | return NULL; |
3069 | 0 | } |
3070 | | |
3071 | | static void |
3072 | | get_dpif_flow_status(const struct dp_netdev *dp, |
3073 | | const struct dp_netdev_flow *netdev_flow_, |
3074 | | struct dpif_flow_stats *stats, |
3075 | | struct dpif_flow_attrs *attrs) |
3076 | 0 | { |
3077 | 0 | struct dpif_flow_stats offload_stats; |
3078 | 0 | struct dpif_flow_attrs offload_attrs; |
3079 | 0 | struct dp_netdev_flow *netdev_flow; |
3080 | 0 | unsigned long long n; |
3081 | 0 | long long used; |
3082 | 0 | uint16_t flags; |
3083 | |
|
3084 | 0 | netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_); |
3085 | |
|
3086 | 0 | atomic_read_relaxed(&netdev_flow->stats.packet_count, &n); |
3087 | 0 | stats->n_packets = n; |
3088 | 0 | atomic_read_relaxed(&netdev_flow->stats.byte_count, &n); |
3089 | 0 | stats->n_bytes = n; |
3090 | 0 | atomic_read_relaxed(&netdev_flow->stats.used, &used); |
3091 | 0 | stats->used = used; |
3092 | 0 | atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags); |
3093 | 0 | stats->tcp_flags = flags; |
3094 | |
|
3095 | 0 | if (dpif_offload_datapath_flow_stats(dp->full_name, |
3096 | 0 | netdev_flow->flow.in_port.odp_port, |
3097 | 0 | &netdev_flow->mega_ufid, |
3098 | 0 | &offload_stats, &offload_attrs)) { |
3099 | 0 | stats->n_packets += offload_stats.n_packets; |
3100 | 0 | stats->n_bytes += offload_stats.n_bytes; |
3101 | 0 | stats->used = MAX(stats->used, offload_stats.used); |
3102 | 0 | stats->tcp_flags |= offload_stats.tcp_flags; |
3103 | 0 | if (attrs) { |
3104 | 0 | attrs->offloaded = offload_attrs.offloaded; |
3105 | 0 | attrs->dp_layer = offload_attrs.dp_layer; |
3106 | 0 | } |
3107 | 0 | } else if (attrs) { |
3108 | 0 | attrs->offloaded = false; |
3109 | 0 | attrs->dp_layer = "ovs"; |
3110 | 0 | } |
3111 | 0 | } |
3112 | | |
3113 | | /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for |
3114 | | * storing the netlink-formatted key/mask. 'key_buf' may be the same as |
3115 | | * 'mask_buf'. Actions will be returned without copying, by relying on RCU to |
3116 | | * protect them. */ |
3117 | | static void |
3118 | | dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp, |
3119 | | const struct dp_netdev_flow *netdev_flow, |
3120 | | struct ofpbuf *key_buf, struct ofpbuf *mask_buf, |
3121 | | struct dpif_flow *flow, bool terse) |
3122 | 0 | { |
3123 | 0 | if (terse) { |
3124 | 0 | memset(flow, 0, sizeof *flow); |
3125 | 0 | } else { |
3126 | 0 | struct flow_wildcards wc; |
3127 | 0 | struct dp_netdev_actions *actions; |
3128 | 0 | size_t offset; |
3129 | 0 | struct odp_flow_key_parms odp_parms = { |
3130 | 0 | .flow = &netdev_flow->flow, |
3131 | 0 | .mask = &wc.masks, |
3132 | 0 | .support = dp_netdev_support, |
3133 | 0 | }; |
3134 | |
|
3135 | 0 | miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks); |
3136 | | /* in_port is exact-matched, but we have left it out of the mask for
3137 | | * optimization reasons. Add in_port back to the mask. */
3138 | 0 | wc.masks.in_port.odp_port = ODPP_NONE; |
3139 | | |
3140 | | /* Key */ |
3141 | 0 | offset = key_buf->size; |
3142 | 0 | flow->key = ofpbuf_tail(key_buf); |
3143 | 0 | odp_flow_key_from_flow(&odp_parms, key_buf); |
3144 | 0 | flow->key_len = key_buf->size - offset; |
3145 | | |
3146 | | /* Mask */ |
3147 | 0 | offset = mask_buf->size; |
3148 | 0 | flow->mask = ofpbuf_tail(mask_buf); |
3149 | 0 | odp_parms.key_buf = key_buf; |
3150 | 0 | odp_flow_key_from_mask(&odp_parms, mask_buf); |
3151 | 0 | flow->mask_len = mask_buf->size - offset; |
3152 | | |
3153 | | /* Actions */ |
3154 | 0 | actions = dp_netdev_flow_get_actions(netdev_flow); |
3155 | 0 | flow->actions = actions->actions; |
3156 | 0 | flow->actions_len = actions->size; |
3157 | 0 | } |
3158 | |
|
3159 | 0 | flow->ufid = netdev_flow->ufid; |
3160 | 0 | flow->ufid_present = true; |
3161 | 0 | flow->pmd_id = netdev_flow->pmd_id; |
3162 | |
|
3163 | 0 | get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs); |
3164 | 0 | flow->attrs.dp_extra_info = netdev_flow->dp_extra_info; |
3165 | 0 | } |
3166 | | |
3167 | | static int |
3168 | | dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len, |
3169 | | const struct nlattr *mask_key, |
3170 | | uint32_t mask_key_len, const struct flow *flow, |
3171 | | struct flow_wildcards *wc, bool probe) |
3172 | 0 | { |
3173 | 0 | enum odp_key_fitness fitness; |
3174 | |
|
3175 | 0 | fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL); |
3176 | 0 | if (fitness) { |
3177 | 0 | if (!probe) { |
3178 | | /* This should not happen: it indicates that |
3179 | | * odp_flow_key_from_mask() and odp_flow_key_to_mask() |
3180 | | * disagree on the acceptable form of a mask. Log the problem |
3181 | | * as an error, with enough details to enable debugging. */ |
3182 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
3183 | |
|
3184 | 0 | if (!VLOG_DROP_ERR(&rl)) { |
3185 | 0 | struct ds s; |
3186 | |
|
3187 | 0 | ds_init(&s); |
3188 | 0 | odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s, |
3189 | 0 | true, true); |
3190 | 0 | VLOG_ERR("internal error parsing flow mask %s (%s)", |
3191 | 0 | ds_cstr(&s), odp_key_fitness_to_string(fitness)); |
3192 | 0 | ds_destroy(&s); |
3193 | 0 | } |
3194 | 0 | } |
3195 | |
|
3196 | 0 | return EINVAL; |
3197 | 0 | } |
3198 | | |
3199 | 0 | return 0; |
3200 | 0 | } |
3201 | | |
3202 | | static int |
3203 | | dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len, |
3204 | | struct flow *flow, bool probe) |
3205 | 0 | { |
3206 | 0 | if (odp_flow_key_to_flow(key, key_len, flow, NULL)) { |
3207 | 0 | if (!probe) { |
3208 | | /* This should not happen: it indicates that |
3209 | | * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on |
3210 | | * the acceptable form of a flow. Log the problem as an error, |
3211 | | * with enough details to enable debugging. */ |
3212 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
3213 | |
|
3214 | 0 | if (!VLOG_DROP_ERR(&rl)) { |
3215 | 0 | struct ds s; |
3216 | |
|
3217 | 0 | ds_init(&s); |
3218 | 0 | odp_flow_format(key, key_len, NULL, 0, NULL, &s, true, false); |
3219 | 0 | VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s)); |
3220 | 0 | ds_destroy(&s); |
3221 | 0 | } |
3222 | 0 | } |
3223 | |
|
3224 | 0 | return EINVAL; |
3225 | 0 | } |
3226 | | |
3227 | 0 | if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) { |
3228 | 0 | return EINVAL; |
3229 | 0 | } |
3230 | | |
3231 | 0 | return 0; |
3232 | 0 | } |
3233 | | |
3234 | | static int |
3235 | | dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get) |
3236 | 0 | { |
3237 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
3238 | 0 | struct dp_netdev_flow *netdev_flow; |
3239 | 0 | struct dp_netdev_pmd_thread *pmd; |
3240 | 0 | struct hmapx to_find = HMAPX_INITIALIZER(&to_find); |
3241 | 0 | struct hmapx_node *node; |
3242 | 0 | int error = EINVAL; |
3243 | |
|
3244 | 0 | if (get->pmd_id == PMD_ID_NULL) { |
3245 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
3246 | 0 | if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) { |
3247 | 0 | dp_netdev_pmd_unref(pmd); |
3248 | 0 | } |
3249 | 0 | } |
3250 | 0 | } else { |
3251 | 0 | pmd = dp_netdev_get_pmd(dp, get->pmd_id); |
3252 | 0 | if (!pmd) { |
3253 | 0 | goto out; |
3254 | 0 | } |
3255 | 0 | hmapx_add(&to_find, pmd); |
3256 | 0 | } |
3257 | | |
3258 | 0 | if (!hmapx_count(&to_find)) { |
3259 | 0 | goto out; |
3260 | 0 | } |
3261 | | |
3262 | 0 | HMAPX_FOR_EACH (node, &to_find) { |
3263 | 0 | pmd = (struct dp_netdev_pmd_thread *) node->data; |
3264 | 0 | netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key, |
3265 | 0 | get->key_len); |
3266 | 0 | if (netdev_flow) { |
3267 | 0 | dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer, |
3268 | 0 | get->buffer, get->flow, false); |
3269 | 0 | error = 0; |
3270 | 0 | break; |
3271 | 0 | } else { |
3272 | 0 | error = ENOENT; |
3273 | 0 | } |
3274 | 0 | } |
3275 | |
|
3276 | 0 | HMAPX_FOR_EACH (node, &to_find) { |
3277 | 0 | pmd = (struct dp_netdev_pmd_thread *) node->data; |
3278 | 0 | dp_netdev_pmd_unref(pmd); |
3279 | 0 | } |
3280 | 0 | out: |
3281 | 0 | hmapx_destroy(&to_find); |
3282 | 0 | return error; |
3283 | 0 | } |
3284 | | |
3285 | | static void |
3286 | | dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid) |
3287 | 0 | { |
3288 | 0 | struct flow masked_flow; |
3289 | 0 | size_t i; |
3290 | |
|
3291 | 0 | for (i = 0; i < sizeof(struct flow); i++) { |
3292 | 0 | ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] & |
3293 | 0 | ((uint8_t *)&match->wc)[i]; |
3294 | 0 | } |
3295 | 0 | odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid); |
3296 | 0 | } |
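/* Illustrative note (not part of the original source): because the flow is
 * masked with the match's wildcards before hashing, any two matches with the
 * same mask and the same masked bits yield the same mega_ufid.  This is the
 * identifier used for offload bookkeeping elsewhere in this file (e.g. the
 * ufid passed in offload_flow_put() and to
 * dpif_offload_datapath_flow_stats()), for example:
 *
 *     ovs_u128 mega_ufid;
 *     dp_netdev_get_mega_ufid(match, &mega_ufid);
 */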
3297 | | |
3298 | | uint64_t |
3299 | | dp_netdev_simple_match_mark(odp_port_t in_port, ovs_be16 dl_type, |
3300 | | uint8_t nw_frag, ovs_be16 vlan_tci) |
3301 | 0 | { |
3302 | | /* Simple Match Mark: |
3303 | | * |
3304 | | * BE: |
3305 | | * +-----------------+-------------++---------+---+-----------+ |
3306 | | * | in_port | dl_type || nw_frag |CFI| VID(12) | |
3307 | | * +-----------------+-------------++---------+---+-----------+ |
3308 | | * 0 32 47 49 51 52 63 |
3309 | | * |
3310 | | * LE: |
3311 | | * +-----------------+-------------+------++-------+---+------+ |
3312 | | * | in_port | dl_type |VID(8)||nw_frag|CFI|VID(4)| |
3313 | | * +-----------------+-------------+------++-------+---+------+ |
3314 | | * 0 32 47 48 55 57 59 60 61 63 |
3315 | | * |
3316 | | * Big Endian Little Endian |
3317 | | * in_port : 32 bits [ 0..31] in_port : 32 bits [ 0..31] |
3318 | | * dl_type : 16 bits [32..47] dl_type : 16 bits [32..47] |
3319 | | * <empty> : 1 bit [48..48] vlan VID: 8 bits [48..55] |
3320 | | * nw_frag : 2 bits [49..50] <empty> : 1 bit [56..56] |
3321 | | * vlan CFI: 1 bit [51..51] nw_frag : 2 bits [57..59] |
3322 | | * vlan VID: 12 bits [52..63] vlan CFI: 1 bit [60..60] |
3323 | | * vlan VID: 4 bits [61..63] |
3324 | | * |
3325 | | * The layout differs between LE and BE in order to save a couple of
3326 | | * network-to-host byte-order translations.
3327 | | * */ |
3328 | 0 | return ((uint64_t) odp_to_u32(in_port) << 32) |
3329 | 0 | | ((OVS_FORCE uint32_t) dl_type << 16) |
3330 | | #if WORDS_BIGENDIAN |
3331 | | | (((uint16_t) nw_frag & FLOW_NW_FRAG_MASK) << VLAN_PCP_SHIFT) |
3332 | | #else |
3333 | 0 | | ((nw_frag & FLOW_NW_FRAG_MASK) << (VLAN_PCP_SHIFT - 8)) |
3334 | 0 | #endif |
3335 | 0 | | (OVS_FORCE uint16_t) (vlan_tci & htons(VLAN_VID_MASK | VLAN_CFI)); |
3336 | 0 | } |
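/* Illustrative sketch (not part of the original source): the insert and
 * lookup paths below must derive the same hash from the same mark, e.g.:
 *
 *     uint64_t mark = dp_netdev_simple_match_mark(u32_to_odp(1),
 *                                                 htons(ETH_TYPE_IP),
 *                                                 0, htons(0));
 *     uint32_t hash = hash_uint64(mark);
 *
 * Only in_port, dl_type, nw_frag and the VLAN VID/CFI bits feed the mark, so
 * flows that differ only in other fields map to the same mark and hash. */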
3337 | | |
3338 | | struct dp_netdev_flow * |
3339 | | dp_netdev_simple_match_lookup(const struct dp_netdev_pmd_thread *pmd, |
3340 | | odp_port_t in_port, ovs_be16 dl_type, |
3341 | | uint8_t nw_frag, ovs_be16 vlan_tci) |
3342 | 0 | { |
3343 | 0 | uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, |
3344 | 0 | nw_frag, vlan_tci); |
3345 | 0 | uint32_t hash = hash_uint64(mark); |
3346 | 0 | struct dp_netdev_flow *flow; |
3347 | 0 | bool found = false; |
3348 | |
|
3349 | 0 | CMAP_FOR_EACH_WITH_HASH (flow, simple_match_node, |
3350 | 0 | hash, &pmd->simple_match_table) { |
3351 | 0 | if (flow->simple_match_mark == mark) { |
3352 | 0 | found = true; |
3353 | 0 | break; |
3354 | 0 | } |
3355 | 0 | } |
3356 | 0 | return found ? flow : NULL; |
3357 | 0 | } |
3358 | | |
3359 | | bool |
3360 | | dp_netdev_simple_match_enabled(const struct dp_netdev_pmd_thread *pmd, |
3361 | | odp_port_t in_port) |
3362 | 0 | { |
3363 | 0 | return ccmap_find(&pmd->n_flows, odp_to_u32(in_port)) |
3364 | 0 | == ccmap_find(&pmd->n_simple_flows, odp_to_u32(in_port)); |
3365 | 0 | } |
3366 | | |
3367 | | static void |
3368 | | dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd, |
3369 | | struct dp_netdev_flow *dp_flow) |
3370 | | OVS_REQUIRES(pmd->flow_mutex) |
3371 | 0 | { |
3372 | 0 | odp_port_t in_port = dp_flow->flow.in_port.odp_port; |
3373 | 0 | ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci; |
3374 | 0 | ovs_be16 dl_type = dp_flow->flow.dl_type; |
3375 | 0 | uint8_t nw_frag = dp_flow->flow.nw_frag; |
3376 | |
|
3377 | 0 | if (!dp_netdev_flow_ref(dp_flow)) { |
3378 | 0 | return; |
3379 | 0 | } |
3380 | | |
3381 | | /* Avoid double insertion. Should not happen in practice. */ |
3382 | 0 | dp_netdev_simple_match_remove(pmd, dp_flow); |
3383 | |
|
3384 | 0 | uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, |
3385 | 0 | nw_frag, vlan_tci); |
3386 | 0 | uint32_t hash = hash_uint64(mark); |
3387 | |
|
3388 | 0 | dp_flow->simple_match_mark = mark; |
3389 | 0 | cmap_insert(&pmd->simple_match_table, |
3390 | 0 | CONST_CAST(struct cmap_node *, &dp_flow->simple_match_node), |
3391 | 0 | hash); |
3392 | 0 | ccmap_inc(&pmd->n_simple_flows, odp_to_u32(in_port)); |
3393 | |
|
3394 | 0 | VLOG_DBG("Simple match insert: " |
3395 | 0 | "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").", |
3396 | 0 | pmd->core_id, in_port, mark); |
3397 | 0 | } |
3398 | | |
3399 | | static void |
3400 | | dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd, |
3401 | | struct dp_netdev_flow *dp_flow) |
3402 | | OVS_REQUIRES(pmd->flow_mutex) |
3403 | 0 | { |
3404 | 0 | odp_port_t in_port = dp_flow->flow.in_port.odp_port; |
3405 | 0 | ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci; |
3406 | 0 | ovs_be16 dl_type = dp_flow->flow.dl_type; |
3407 | 0 | uint8_t nw_frag = dp_flow->flow.nw_frag; |
3408 | 0 | struct dp_netdev_flow *flow; |
3409 | 0 | uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, |
3410 | 0 | nw_frag, vlan_tci); |
3411 | 0 | uint32_t hash = hash_uint64(mark); |
3412 | |
|
3413 | 0 | flow = dp_netdev_simple_match_lookup(pmd, in_port, dl_type, |
3414 | 0 | nw_frag, vlan_tci); |
3415 | 0 | if (flow == dp_flow) { |
3416 | 0 | VLOG_DBG("Simple match remove: " |
3417 | 0 | "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").", |
3418 | 0 | pmd->core_id, in_port, mark); |
3419 | 0 | cmap_remove(&pmd->simple_match_table, |
3420 | 0 | CONST_CAST(struct cmap_node *, &flow->simple_match_node), |
3421 | 0 | hash); |
3422 | 0 | ccmap_dec(&pmd->n_simple_flows, odp_to_u32(in_port)); |
3423 | 0 | dp_netdev_flow_unref(flow); |
3424 | 0 | } |
3425 | 0 | } |
3426 | | |
3427 | | static bool |
3428 | | dp_netdev_flow_is_simple_match(const struct match *match) |
3429 | 0 | { |
3430 | 0 | const struct flow *flow = &match->flow; |
3431 | 0 | const struct flow_wildcards *wc = &match->wc; |
3432 | |
|
3433 | 0 | if (flow->recirc_id || flow->packet_type != htonl(PT_ETH)) { |
3434 | 0 | return false; |
3435 | 0 | } |
3436 | | |
3437 | | /* Check that the flow matches only the minimal set of fields that are
3438 | | * always set. Also check that VLAN VID+CFI is an exact match, because
3439 | | * these fields are not mandatory and could be masked. */
3440 | 0 | struct flow_wildcards *minimal = xmalloc(sizeof *minimal); |
3441 | 0 | ovs_be16 vlan_tci_mask = htons(VLAN_VID_MASK | VLAN_CFI); |
3442 | |
|
3443 | 0 | flow_wildcards_init_catchall(minimal); |
3444 | | /* 'dpif-netdev' always has the following in exact match:
3445 | | * - recirc_id <-- recirc_id == 0 checked on input. |
3446 | | * - in_port <-- Will be checked on input. |
3447 | | * - packet_type <-- Assuming all packets are PT_ETH. |
3448 | | * - dl_type <-- Need to match with. |
3449 | | * - vlan_tci <-- Need to match with. |
3450 | | * - and nw_frag for ip packets. <-- Need to match with. |
3451 | | */ |
3452 | 0 | WC_MASK_FIELD(minimal, recirc_id); |
3453 | 0 | WC_MASK_FIELD(minimal, in_port); |
3454 | 0 | WC_MASK_FIELD(minimal, packet_type); |
3455 | 0 | WC_MASK_FIELD(minimal, dl_type); |
3456 | 0 | WC_MASK_FIELD_MASK(minimal, vlans[0].tci, vlan_tci_mask); |
3457 | 0 | WC_MASK_FIELD_MASK(minimal, nw_frag, FLOW_NW_FRAG_MASK); |
3458 | |
|
3459 | 0 | if (flow_wildcards_has_extra(minimal, wc) |
3460 | 0 | || wc->masks.vlans[0].tci != vlan_tci_mask) { |
3461 | 0 | free(minimal); |
3462 | 0 | return false; |
3463 | 0 | } |
3464 | 0 | free(minimal); |
3465 | |
|
3466 | 0 | return true; |
3467 | 0 | } |
3468 | | |
3469 | | static void |
3470 | | offload_flow_put_resume(struct dp_netdev *dp, struct dp_netdev_flow *flow, |
3471 | | struct dp_netdev_flow *previous_flow_reference, |
3472 | | unsigned pmd_id, int error) |
3473 | 0 | { |
3474 | 0 | if (error == EINPROGRESS) { |
3475 | 0 | return; |
3476 | 0 | } |
3477 | | |
3478 | 0 | if (!error) { |
3479 | 0 | flow->offloaded = true; |
3480 | 0 | } else { |
3481 | | /* If the flow was already offloaded, the new action set can no |
3482 | | * longer be offloaded. In theory, we should disassociate the |
3483 | | * offload from all PMDs that have this flow marked as offloaded. |
3484 | | * Unfortunately, there is no mechanism to inform other PMDs, so |
3485 | | * we cannot explicitly mark such flows. This situation typically |
3486 | | * occurs when the revalidator modifies the flow, so it is safe to |
3487 | | * assume it will update all affected flows and that the offload |
3488 | | * will subsequently fail. */ |
3489 | 0 | flow->offloaded = false; |
3490 | | |
3491 | | /* On error, the flow reference was not stored by the offload provider, |
3492 | | * so we should decrease the reference. */ |
3493 | 0 | dp_netdev_flow_unref(flow); |
3494 | 0 | } |
3495 | |
|
3496 | 0 | if (offload_queue_dec(flow) && flow->dead) { |
3497 | | /* If flows are processed asynchronously, modifications might |
3498 | | * still be queued up while the flow is being removed. If this |
3499 | | * was the last queued operation on a dead flow, we try again
3500 | | * to see whether the flow still needs to be removed. */
3501 | 0 | offload_flow_del(dp, pmd_id, flow); |
3502 | 0 | } |
3503 | |
|
3504 | 0 | if (previous_flow_reference) { |
3505 | 0 | dp_netdev_flow_unref(previous_flow_reference); |
3506 | 0 | if (previous_flow_reference != flow) { |
3507 | 0 | VLOG_DBG("Updated flow reference was from outdated flow"); |
3508 | 0 | } |
3509 | 0 | } |
3510 | 0 | } |
3511 | | |
3512 | | static void |
3513 | | offload_flow_put_resume_cb(void *aux, struct dpif_flow_stats *stats OVS_UNUSED, |
3514 | | unsigned pmd_id, void *flow_reference_, |
3515 | | void *old_flow_reference_, |
3516 | | int error) |
3517 | 0 | { |
3518 | 0 | struct dp_netdev *dp = aux; |
3519 | 0 | struct dp_netdev_flow *flow_reference = flow_reference_; |
3520 | 0 | struct dp_netdev_flow *old_flow_reference = old_flow_reference_; |
3521 | |
|
3522 | 0 | offload_flow_put_resume(dp, flow_reference, old_flow_reference, |
3523 | 0 | pmd_id, error); |
3524 | 0 | } |
3525 | | |
3526 | | static void |
3527 | | offload_flow_put(struct dp_netdev_pmd_thread *pmd, struct dp_netdev_flow *flow, |
3528 | | struct match *match, const struct nlattr *actions, |
3529 | | size_t actions_len) |
3530 | 0 | { |
3531 | 0 | struct dpif_offload_flow_put put = { |
3532 | 0 | .in_port = match->flow.in_port.odp_port, |
3533 | 0 | .orig_in_port = flow->orig_in_port, |
3534 | 0 | .pmd_id = pmd->core_id, |
3535 | 0 | .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid), |
3536 | 0 | .match = match, |
3537 | 0 | .actions = actions, |
3538 | 0 | .actions_len = actions_len, |
3539 | 0 | .stats = NULL, |
3540 | 0 | .flow_reference = flow, |
3541 | 0 | .cb_data = { |
3542 | 0 | .callback = offload_flow_put_resume_cb, |
3543 | 0 | .callback_aux = pmd->dp, |
3544 | 0 | }, |
3545 | 0 | }; |
3546 | 0 | void *previous_flow_reference = NULL; |
3547 | 0 | int error; |
3548 | |
|
3549 | 0 | if (!dpif_offload_enabled() || flow->dead || !offload_queue_inc(flow)) { |
3550 | 0 | return; |
3551 | 0 | } |
3552 | | |
3553 | 0 | dp_netdev_flow_ref(flow); |
3554 | |
|
3555 | 0 | error = dpif_offload_datapath_flow_put(pmd->dp->full_name, &put, |
3556 | 0 | &previous_flow_reference); |
3557 | 0 | offload_flow_put_resume(pmd->dp, put.flow_reference, |
3558 | 0 | previous_flow_reference, |
3559 | 0 | pmd->core_id, error); |
3560 | 0 | } |
3561 | | |
3562 | | static struct dp_netdev_flow * |
3563 | | dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd, |
3564 | | struct match *match, const ovs_u128 *ufid, |
3565 | | const struct nlattr *actions, size_t actions_len, |
3566 | | odp_port_t orig_in_port) |
3567 | | OVS_REQUIRES(pmd->flow_mutex) |
3568 | 0 | { |
3569 | 0 | struct ds extra_info = DS_EMPTY_INITIALIZER; |
3570 | 0 | struct dp_netdev_flow *flow; |
3571 | 0 | struct netdev_flow_key mask; |
3572 | 0 | struct dpcls *cls; |
3573 | 0 | size_t unit; |
3574 | | |
3575 | | /* Make sure in_port is exact matched before we read it. */ |
3576 | 0 | ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE); |
3577 | 0 | odp_port_t in_port = match->flow.in_port.odp_port; |
3578 | | |
3579 | | /* As we select the dpcls based on the port number, each netdev flow |
3580 | | * belonging to the same dpcls will have the same odp_port value. |
3581 | | * For performance reasons we wildcard odp_port here in the mask. In the |
3582 | | * typical case dp_hash is also wildcarded, and the resulting 8-byte |
3583 | | * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and |
3584 | | * will not be part of the subtable mask. |
3585 | | * This will speed up the hash computation during dpcls_lookup() because |
3586 | | * there is one less call to hash_add64() in this case. */ |
3587 | 0 | match->wc.masks.in_port.odp_port = 0; |
3588 | 0 | netdev_flow_mask_init(&mask, match); |
3589 | 0 | match->wc.masks.in_port.odp_port = ODPP_NONE; |
3590 | | |
3591 | | /* Make sure wc does not have metadata. */ |
3592 | 0 | ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata) |
3593 | 0 | && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs)); |
3594 | | |
3595 | | /* Do not allocate extra space. */ |
3596 | 0 | flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len); |
3597 | 0 | memset(&flow->stats, 0, sizeof flow->stats); |
3598 | 0 | flow->dead = false; |
3599 | 0 | flow->offloaded = false; |
3600 | 0 | atomic_init(&flow->offload_queue_depth, 0); |
3601 | 0 | flow->batch = NULL; |
3602 | 0 | flow->orig_in_port = orig_in_port; |
3603 | 0 | *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id; |
3604 | 0 | *CONST_CAST(struct flow *, &flow->flow) = match->flow; |
3605 | 0 | *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid; |
3606 | 0 | ovs_refcount_init(&flow->ref_cnt); |
3607 | 0 | ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len)); |
3608 | |
|
3609 | 0 | dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid)); |
3610 | 0 | netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask); |
3611 | | |
3612 | | /* Select dpcls for in_port. Relies on in_port to be exact match. */ |
3613 | 0 | cls = dp_netdev_pmd_find_dpcls(pmd, in_port); |
3614 | 0 | dpcls_insert(cls, &flow->cr, &mask); |
3615 | |
|
3616 | 0 | ds_put_cstr(&extra_info, "miniflow_bits("); |
3617 | 0 | FLOWMAP_FOR_EACH_UNIT (unit) { |
3618 | 0 | if (unit) { |
3619 | 0 | ds_put_char(&extra_info, ','); |
3620 | 0 | } |
3621 | 0 | ds_put_format(&extra_info, "%d", |
3622 | 0 | count_1bits(flow->cr.mask->mf.map.bits[unit])); |
3623 | 0 | } |
3624 | 0 | ds_put_char(&extra_info, ')'); |
3625 | 0 | flow->dp_extra_info = ds_steal_cstr(&extra_info); |
3626 | 0 | ds_destroy(&extra_info); |
3627 | |
|
3628 | 0 | cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node), |
3629 | 0 | dp_netdev_flow_hash(&flow->ufid)); |
3630 | 0 | ccmap_inc(&pmd->n_flows, odp_to_u32(in_port)); |
3631 | |
|
3632 | 0 | if (dp_netdev_flow_is_simple_match(match)) { |
3633 | 0 | dp_netdev_simple_match_insert(pmd, flow); |
3634 | 0 | } |
3635 | |
|
3636 | 0 | offload_flow_put(pmd, flow, match, actions, actions_len); |
3637 | 0 | log_netdev_flow_change(flow, match, NULL, actions, actions_len); |
3638 | |
|
3639 | 0 | return flow; |
3640 | 0 | } |
3641 | | |
3642 | | static int |
3643 | | flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd, |
3644 | | struct netdev_flow_key *key, |
3645 | | struct match *match, |
3646 | | ovs_u128 *ufid, |
3647 | | const struct dpif_flow_put *put, |
3648 | | struct dpif_flow_stats *stats) |
3649 | 0 | { |
3650 | 0 | struct dp_netdev_flow *netdev_flow = NULL; |
3651 | 0 | int error = 0; |
3652 | |
|
3653 | 0 | if (stats) { |
3654 | 0 | memset(stats, 0, sizeof *stats); |
3655 | 0 | } |
3656 | |
|
3657 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
3658 | 0 | if (put->ufid) { |
3659 | 0 | netdev_flow = dp_netdev_pmd_find_flow(pmd, put->ufid, |
3660 | 0 | put->key, put->key_len); |
3661 | 0 | } else { |
3662 | | /* Use key instead of the locally generated ufid |
3663 | | * to search netdev_flow. */ |
3664 | 0 | netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); |
3665 | 0 | } |
3666 | |
|
3667 | 0 | if (put->flags & DPIF_FP_CREATE) { |
3668 | 0 | if (!netdev_flow) { |
3669 | 0 | dp_netdev_flow_add(pmd, match, ufid, |
3670 | 0 | put->actions, put->actions_len, ODPP_NONE); |
3671 | 0 | } else { |
3672 | 0 | error = EEXIST; |
3673 | 0 | } |
3674 | 0 | goto exit; |
3675 | 0 | } |
3676 | | |
3677 | 0 | if (put->flags & DPIF_FP_MODIFY) { |
3678 | 0 | if (!netdev_flow) { |
3679 | 0 | error = ENOENT; |
3680 | 0 | } else { |
3681 | 0 | if (!put->ufid && !flow_equal(&match->flow, &netdev_flow->flow)) { |
3682 | | /* Overlapping flow. */ |
3683 | 0 | error = EINVAL; |
3684 | 0 | goto exit; |
3685 | 0 | } |
3686 | | |
3687 | 0 | struct dp_netdev_actions *new_actions; |
3688 | 0 | struct dp_netdev_actions *old_actions; |
3689 | |
|
3690 | 0 | new_actions = dp_netdev_actions_create(put->actions, |
3691 | 0 | put->actions_len); |
3692 | |
|
3693 | 0 | old_actions = dp_netdev_flow_get_actions(netdev_flow); |
3694 | 0 | ovsrcu_set(&netdev_flow->actions, new_actions); |
3695 | |
|
3696 | 0 | offload_flow_put(pmd, netdev_flow, match, put->actions, |
3697 | 0 | put->actions_len); |
3698 | 0 | log_netdev_flow_change(netdev_flow, match, old_actions, |
3699 | 0 | put->actions, put->actions_len); |
3700 | |
|
3701 | 0 | if (stats) { |
3702 | 0 | get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL); |
3703 | 0 | } |
3704 | 0 | if (put->flags & DPIF_FP_ZERO_STATS) { |
3705 | | /* XXX: The userspace datapath uses thread-local statistics
3706 | | * (for flows), which should be updated only by the owning
3707 | | * thread. Since we cannot write to the stats memory here,
3708 | | * we choose not to support this flag. Please note:
3709 | | * - This feature is currently used only by dpctl commands with
3710 | | * the --clear option.
3711 | | * - Should the need arise, this operation can be implemented
3712 | | * by keeping a base value (to be updated here) for each
3713 | | * counter, and subtracting it before outputting the stats. */
3714 | 0 | error = EOPNOTSUPP; |
3715 | 0 | } |
3716 | 0 | ovsrcu_postpone(dp_netdev_actions_free, old_actions); |
3717 | 0 | } |
3718 | 0 | } |
3719 | | |
3720 | 0 | exit: |
3721 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
3722 | 0 | return error; |
3723 | 0 | } |
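/* Sketch of the base-value approach mentioned in the DPIF_FP_ZERO_STATS
 * comment above (illustrative only; 'stats_base' is a hypothetical field
 * that does not exist in struct dp_netdev_flow):
 *
 *     On "zero", record the current totals without touching the
 *     thread-local counters:
 *
 *         flow->stats_base = *stats;
 *
 *     On every later read, report the totals relative to that base:
 *
 *         stats->n_packets -= flow->stats_base.n_packets;
 *         stats->n_bytes   -= flow->stats_base.n_bytes;
 */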
3724 | | |
3725 | | static int |
3726 | | dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put) |
3727 | 0 | { |
3728 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
3729 | 0 | struct netdev_flow_key key; |
3730 | 0 | struct dp_netdev_pmd_thread *pmd; |
3731 | 0 | struct match match; |
3732 | 0 | ovs_u128 ufid; |
3733 | 0 | int error; |
3734 | 0 | bool probe = put->flags & DPIF_FP_PROBE; |
3735 | |
|
3736 | 0 | if (put->stats) { |
3737 | 0 | memset(put->stats, 0, sizeof *put->stats); |
3738 | 0 | } |
3739 | 0 | error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow, |
3740 | 0 | probe); |
3741 | 0 | if (error) { |
3742 | 0 | return error; |
3743 | 0 | } |
3744 | 0 | error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len, |
3745 | 0 | put->mask, put->mask_len, |
3746 | 0 | &match.flow, &match.wc, probe); |
3747 | 0 | if (error) { |
3748 | 0 | return error; |
3749 | 0 | } |
3750 | | |
3751 | 0 | if (match.wc.masks.in_port.odp_port != ODPP_NONE) { |
3752 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
3753 | |
|
3754 | 0 | VLOG_ERR_RL(&rl, "failed to put%s flow: in_port is not an exact match", |
3755 | 0 | (put->flags & DPIF_FP_CREATE) ? "[create]" |
3756 | 0 | : (put->flags & DPIF_FP_MODIFY) ? "[modify]" : "[zero]"); |
3757 | 0 | return EINVAL; |
3758 | 0 | } |
3759 | | |
3760 | 0 | if (put->ufid) { |
3761 | 0 | ufid = *put->ufid; |
3762 | 0 | } else { |
3763 | 0 | odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid); |
3764 | 0 | } |
3765 | | |
3766 | | /* The Netlink encoding of datapath flow keys cannot express |
3767 | | * wildcarding the presence of a VLAN tag. Instead, a missing VLAN |
3768 | | * tag is interpreted as exact match on the fact that there is no |
3769 | | * VLAN. Unless we refactor a lot of code that translates between |
3770 | | * Netlink and struct flow representations, we have to do the same |
3771 | | * here. This must be in sync with 'match' in handle_packet_upcall(). */ |
3772 | 0 | if (!match.wc.masks.vlans[0].tci) { |
3773 | 0 | match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI); |
3774 | 0 | } |
3775 | | |
3776 | | /* Must produce a netdev_flow_key for lookup. |
3777 | | * Use the same method as employed to create the key when adding |
3778 | | * the flow to the dpcls to make sure they match.
3779 | | * We need to put in the unmasked key as flow_put_on_pmd() will first try |
3780 | | * to see if an entry exists doing a packet type lookup. As masked-out |
3781 | | * fields are interpreted as zeros, they could falsely match a wider IP |
3782 | | * address mask. Installation of the flow will use the match variable. */ |
3783 | 0 | netdev_flow_key_init(&key, &match.flow); |
3784 | |
|
3785 | 0 | if (put->pmd_id == PMD_ID_NULL) { |
3786 | 0 | if (cmap_count(&dp->poll_threads) == 0) { |
3787 | 0 | return EINVAL; |
3788 | 0 | } |
3789 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
3790 | 0 | struct dpif_flow_stats pmd_stats; |
3791 | 0 | int pmd_error; |
3792 | |
|
3793 | 0 | pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, |
3794 | 0 | &pmd_stats); |
3795 | 0 | if (pmd_error) { |
3796 | 0 | error = pmd_error; |
3797 | 0 | } else if (put->stats) { |
3798 | 0 | put->stats->n_packets += pmd_stats.n_packets; |
3799 | 0 | put->stats->n_bytes += pmd_stats.n_bytes; |
3800 | 0 | put->stats->used = MAX(put->stats->used, pmd_stats.used); |
3801 | 0 | put->stats->tcp_flags |= pmd_stats.tcp_flags; |
3802 | 0 | } |
3803 | 0 | } |
3804 | 0 | } else { |
3805 | 0 | pmd = dp_netdev_get_pmd(dp, put->pmd_id); |
3806 | 0 | if (!pmd) { |
3807 | 0 | return EINVAL; |
3808 | 0 | } |
3809 | 0 | error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats); |
3810 | 0 | dp_netdev_pmd_unref(pmd); |
3811 | 0 | } |
3812 | | |
3813 | 0 | return error; |
3814 | 0 | } |
3815 | | |
3816 | | static int |
3817 | | flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd, |
3818 | | struct dpif_flow_stats *stats, |
3819 | | const struct dpif_flow_del *del) |
3820 | 0 | { |
3821 | 0 | struct dp_netdev_flow *netdev_flow; |
3822 | 0 | int error = 0; |
3823 | |
|
3824 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
3825 | 0 | netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key, |
3826 | 0 | del->key_len); |
3827 | 0 | if (netdev_flow) { |
3828 | 0 | if (stats) { |
3829 | 0 | get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL); |
3830 | 0 | } |
3831 | 0 | dp_netdev_pmd_remove_flow(pmd, netdev_flow); |
3832 | 0 | } else { |
3833 | 0 | error = ENOENT; |
3834 | 0 | } |
3835 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
3836 | |
|
3837 | 0 | return error; |
3838 | 0 | } |
3839 | | |
3840 | | static int |
3841 | | dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del) |
3842 | 0 | { |
3843 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
3844 | 0 | struct dp_netdev_pmd_thread *pmd; |
3845 | 0 | int error = 0; |
3846 | |
|
3847 | 0 | if (del->stats) { |
3848 | 0 | memset(del->stats, 0, sizeof *del->stats); |
3849 | 0 | } |
3850 | |
|
3851 | 0 | if (del->pmd_id == PMD_ID_NULL) { |
3852 | 0 | if (cmap_count(&dp->poll_threads) == 0) { |
3853 | 0 | return EINVAL; |
3854 | 0 | } |
3855 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
3856 | 0 | struct dpif_flow_stats pmd_stats; |
3857 | 0 | int pmd_error; |
3858 | |
|
3859 | 0 | pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del); |
3860 | 0 | if (pmd_error) { |
3861 | 0 | error = pmd_error; |
3862 | 0 | } else if (del->stats) { |
3863 | 0 | del->stats->n_packets += pmd_stats.n_packets; |
3864 | 0 | del->stats->n_bytes += pmd_stats.n_bytes; |
3865 | 0 | del->stats->used = MAX(del->stats->used, pmd_stats.used); |
3866 | 0 | del->stats->tcp_flags |= pmd_stats.tcp_flags; |
3867 | 0 | } |
3868 | 0 | } |
3869 | 0 | } else { |
3870 | 0 | pmd = dp_netdev_get_pmd(dp, del->pmd_id); |
3871 | 0 | if (!pmd) { |
3872 | 0 | return EINVAL; |
3873 | 0 | } |
3874 | 0 | error = flow_del_on_pmd(pmd, del->stats, del); |
3875 | 0 | dp_netdev_pmd_unref(pmd); |
3876 | 0 | } |
3877 | | |
3878 | | |
3879 | 0 | return error; |
3880 | 0 | } |
3881 | | |
3882 | | struct dpif_netdev_flow_dump { |
3883 | | struct dpif_flow_dump up; |
3884 | | struct cmap_position poll_thread_pos; |
3885 | | struct cmap_position flow_pos; |
3886 | | struct dp_netdev_pmd_thread *cur_pmd; |
3887 | | int status; |
3888 | | struct ovs_mutex mutex; |
3889 | | }; |
3890 | | |
3891 | | static struct dpif_netdev_flow_dump * |
3892 | | dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump) |
3893 | 0 | { |
3894 | 0 | return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up); |
3895 | 0 | } |
3896 | | |
3897 | | static struct dpif_flow_dump * |
3898 | | dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse, |
3899 | | struct dpif_flow_dump_types *types) |
3900 | 0 | { |
3901 | 0 | struct dpif_netdev_flow_dump *dump; |
3902 | |
|
3903 | 0 | dump = xzalloc(sizeof *dump); |
3904 | 0 | dpif_flow_dump_init(&dump->up, dpif_, terse, types); |
3905 | 0 | ovs_mutex_init(&dump->mutex); |
3906 | |
|
3907 | 0 | return &dump->up; |
3908 | 0 | } |
3909 | | |
3910 | | static int |
3911 | | dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_) |
3912 | 0 | { |
3913 | 0 | struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_); |
3914 | |
|
3915 | 0 | ovs_mutex_destroy(&dump->mutex); |
3916 | 0 | free(dump); |
3917 | 0 | return 0; |
3918 | 0 | } |
3919 | | |
3920 | | struct dpif_netdev_flow_dump_thread { |
3921 | | struct dpif_flow_dump_thread up; |
3922 | | struct dpif_netdev_flow_dump *dump; |
3923 | | struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH]; |
3924 | | struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH]; |
3925 | | }; |
3926 | | |
3927 | | static struct dpif_netdev_flow_dump_thread * |
3928 | | dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread) |
3929 | 0 | { |
3930 | 0 | return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up); |
3931 | 0 | } |
3932 | | |
3933 | | static struct dpif_flow_dump_thread * |
3934 | | dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_) |
3935 | 0 | { |
3936 | 0 | struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_); |
3937 | 0 | struct dpif_netdev_flow_dump_thread *thread; |
3938 | |
|
3939 | 0 | thread = xmalloc(sizeof *thread); |
3940 | 0 | dpif_flow_dump_thread_init(&thread->up, &dump->up); |
3941 | 0 | thread->dump = dump; |
3942 | 0 | return &thread->up; |
3943 | 0 | } |
3944 | | |
3945 | | static void |
3946 | | dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_) |
3947 | 0 | { |
3948 | 0 | struct dpif_netdev_flow_dump_thread *thread |
3949 | 0 | = dpif_netdev_flow_dump_thread_cast(thread_); |
3950 | |
|
3951 | 0 | free(thread); |
3952 | 0 | } |
3953 | | |
3954 | | static int |
3955 | | dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_, |
3956 | | struct dpif_flow *flows, int max_flows) |
3957 | 0 | { |
3958 | 0 | struct dpif_netdev_flow_dump_thread *thread |
3959 | 0 | = dpif_netdev_flow_dump_thread_cast(thread_); |
3960 | 0 | struct dpif_netdev_flow_dump *dump = thread->dump; |
3961 | 0 | struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH]; |
3962 | 0 | struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dump->dpif); |
3963 | 0 | struct dp_netdev *dp = get_dp_netdev(&dpif->dpif); |
3964 | 0 | int n_flows = 0; |
3965 | 0 | int i; |
3966 | |
|
3967 | 0 | ovs_mutex_lock(&dump->mutex); |
3968 | 0 | if (!dump->status) { |
3969 | 0 | struct dp_netdev_pmd_thread *pmd = dump->cur_pmd; |
3970 | 0 | int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH); |
3971 | | |
3972 | | /* On the first call to dump_next(), extract the first pmd thread.
3973 | | * If there is no pmd thread, return immediately. */
3974 | 0 | if (!pmd) { |
3975 | 0 | pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos); |
3976 | 0 | if (!pmd) { |
3977 | 0 | ovs_mutex_unlock(&dump->mutex); |
3978 | 0 | return n_flows; |
3979 | |
|
3980 | 0 | } |
3981 | 0 | } |
3982 | | |
3983 | 0 | do { |
3984 | 0 | for (n_flows = 0; n_flows < flow_limit; n_flows++) { |
3985 | 0 | struct cmap_node *node; |
3986 | |
|
3987 | 0 | node = cmap_next_position(&pmd->flow_table, &dump->flow_pos); |
3988 | 0 | if (!node) { |
3989 | 0 | break; |
3990 | 0 | } |
3991 | 0 | netdev_flows[n_flows] = CONTAINER_OF(node, |
3992 | 0 | struct dp_netdev_flow, |
3993 | 0 | node); |
3994 | 0 | } |
3995 | | /* When finished dumping the current pmd thread, move on to
3996 | | * the next one. */
3997 | 0 | if (n_flows < flow_limit) { |
3998 | 0 | memset(&dump->flow_pos, 0, sizeof dump->flow_pos); |
3999 | 0 | dp_netdev_pmd_unref(pmd); |
4000 | 0 | pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos); |
4001 | 0 | if (!pmd) { |
4002 | 0 | dump->status = EOF; |
4003 | 0 | break; |
4004 | 0 | } |
4005 | 0 | } |
4006 | | /* Keep the reference for the next caller. */
4007 | 0 | dump->cur_pmd = pmd; |
4008 | | |
4009 | | /* If the current dump is empty, do not exit the loop, since the |
4010 | | * remaining pmds could have flows to be dumped. Just dump again
4011 | | * on the new 'pmd'. */
4012 | 0 | } while (!n_flows); |
4013 | 0 | } |
4014 | 0 | ovs_mutex_unlock(&dump->mutex); |
4015 | |
|
4016 | 0 | for (i = 0; i < n_flows; i++) { |
4017 | 0 | struct odputil_keybuf *maskbuf = &thread->maskbuf[i]; |
4018 | 0 | struct odputil_keybuf *keybuf = &thread->keybuf[i]; |
4019 | 0 | struct dp_netdev_flow *netdev_flow = netdev_flows[i]; |
4020 | 0 | struct dpif_flow *f = &flows[i]; |
4021 | 0 | struct ofpbuf key, mask; |
4022 | |
|
4023 | 0 | ofpbuf_use_stack(&key, keybuf, sizeof *keybuf); |
4024 | 0 | ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf); |
4025 | 0 | dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f, |
4026 | 0 | dump->up.terse); |
4027 | 0 | } |
4028 | |
|
4029 | 0 | return n_flows; |
4030 | 0 | } |
4031 | | |
4032 | | static int |
4033 | | dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute) |
4034 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
4035 | 0 | { |
4036 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
4037 | 0 | struct dp_netdev_pmd_thread *pmd; |
4038 | 0 | struct dp_packet_batch pp; |
4039 | |
|
4040 | 0 | if (dp_packet_size(execute->packet) < ETH_HEADER_LEN || |
4041 | 0 | dp_packet_size(execute->packet) > UINT16_MAX) { |
4042 | 0 | return EINVAL; |
4043 | 0 | } |
4044 | | |
4045 | | /* Try to find the 'pmd'. If NULL is returned, that means
4046 | | * the current thread is a non-pmd thread and should use |
4047 | | * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */ |
4048 | 0 | pmd = ovsthread_getspecific(dp->per_pmd_key); |
4049 | 0 | if (!pmd) { |
4050 | 0 | pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID); |
4051 | 0 | if (!pmd) { |
4052 | 0 | return EBUSY; |
4053 | 0 | } |
4054 | 0 | } |
4055 | | |
4056 | 0 | if (execute->probe) { |
4057 | | /* If this is part of a probe, drop the packet, since executing
4058 | | * the action may actually cause spurious packets to be sent into
4059 | | * the network. */
4060 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
4061 | 0 | dp_netdev_pmd_unref(pmd); |
4062 | 0 | } |
4063 | 0 | return 0; |
4064 | 0 | } |
4065 | | |
4066 | | /* If the current thread is a non-pmd thread, acquire
4067 | | * the 'non_pmd_mutex'. */ |
4068 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
4069 | 0 | ovs_mutex_lock(&dp->non_pmd_mutex); |
4070 | 0 | } |
4071 | | |
4072 | | /* Update current time in PMD context. We don't care about EMC insertion |
4073 | | * probability, because we are on a slow path. */ |
4074 | 0 | pmd_thread_ctx_time_update(pmd); |
4075 | | |
4076 | | /* The action processing expects the RSS hash to be valid, because |
4077 | | * it's always initialized at the beginning of datapath processing. |
4078 | | * In this case, though, 'execute->packet' may not have gone through |
4079 | | * the datapath at all; it may have been generated by the upper layer
4080 | | * (OpenFlow packet-out, BFD frame, ...). */ |
4081 | 0 | if (!dp_packet_rss_valid(execute->packet)) { |
4082 | 0 | dp_packet_set_rss_hash(execute->packet, |
4083 | 0 | flow_hash_5tuple(execute->flow, 0)); |
4084 | 0 | } |
4085 | | |
4086 | | /* Make a copy because the packet might be stolen during the execution
4087 | | * and the caller might still need it. */
4088 | 0 | struct dp_packet *packet_clone = dp_packet_clone(execute->packet); |
4089 | 0 | dp_packet_batch_init_packet(&pp, packet_clone); |
4090 | 0 | dp_netdev_execute_actions(pmd, &pp, false, execute->flow, |
4091 | 0 | execute->actions, execute->actions_len); |
4092 | 0 | dp_netdev_pmd_flush_output_packets(pmd, true); |
4093 | |
|
4094 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
4095 | 0 | ovs_mutex_unlock(&dp->non_pmd_mutex); |
4096 | 0 | dp_netdev_pmd_unref(pmd); |
4097 | 0 | } |
4098 | |
|
4099 | 0 | if (dp_packet_batch_size(&pp) == 1) { |
4100 | | /* The packet wasn't dropped during the execution. Swap its content with
4101 | | * the original packet, because the caller might expect actions to
4102 | | * modify it. Use the packet from the batch instead of 'packet_clone'
4103 | | * because it may have been stolen and replaced by another packet, e.g. by
4104 | | * the fragmentation engine. */
4105 | 0 | dp_packet_swap(execute->packet, pp.packets[0]); |
4106 | 0 | dp_packet_delete_batch(&pp, true); |
4107 | 0 | } else if (dp_packet_batch_size(&pp)) { |
4108 | | /* FIXME: We have more packets than expected. Likely, we got IP
4109 | | * fragments of the reassembled packet. Drop them here as we have
4110 | | * no way to get them to the caller. It might be that all the required
4111 | | * actions on them have already been executed, but that might not be the
4112 | | * case, e.g. if dpif_netdev_execute() was called to execute a single
4113 | | * tunnel push. */
4114 | 0 | dp_packet_delete_batch(&pp, true); |
4115 | 0 | } |
4116 | |
|
4117 | 0 | return 0; |
4118 | 0 | } |
4119 | | |
4120 | | static void |
4121 | | dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops) |
4122 | 0 | { |
4123 | 0 | size_t i; |
4124 | |
|
4125 | 0 | for (i = 0; i < n_ops; i++) { |
4126 | 0 | struct dpif_op *op = ops[i]; |
4127 | |
|
4128 | 0 | switch (op->type) { |
4129 | 0 | case DPIF_OP_FLOW_PUT: |
4130 | 0 | op->error = dpif_netdev_flow_put(dpif, &op->flow_put); |
4131 | 0 | break; |
4132 | | |
4133 | 0 | case DPIF_OP_FLOW_DEL: |
4134 | 0 | op->error = dpif_netdev_flow_del(dpif, &op->flow_del); |
4135 | 0 | break; |
4136 | | |
4137 | 0 | case DPIF_OP_EXECUTE: |
4138 | 0 | op->error = dpif_netdev_execute(dpif, &op->execute); |
4139 | 0 | break; |
4140 | | |
4141 | 0 | case DPIF_OP_FLOW_GET: |
4142 | 0 | op->error = dpif_netdev_flow_get(dpif, &op->flow_get); |
4143 | 0 | break; |
4144 | 0 | } |
4145 | 0 | } |
4146 | 0 | } |
4147 | | |
4148 | | /* Enable or disable PMD auto load balancing. */
4149 | | static void |
4150 | | set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log) |
4151 | 0 | { |
4152 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
4153 | |
|
4154 | 0 | if (pmd_alb->is_enabled != state || always_log) { |
4155 | 0 | pmd_alb->is_enabled = state; |
4156 | 0 | if (pmd_alb->is_enabled) { |
4157 | 0 | uint8_t rebalance_load_thresh; |
4158 | |
|
4159 | 0 | atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, |
4160 | 0 | &rebalance_load_thresh); |
4161 | 0 | VLOG_INFO("PMD auto load balance is enabled, " |
4162 | 0 | "interval %"PRIu64" mins, " |
4163 | 0 | "pmd load threshold %"PRIu8"%%, " |
4164 | 0 | "improvement threshold %"PRIu8"%%.", |
4165 | 0 | pmd_alb->rebalance_intvl / MIN_TO_MSEC, |
4166 | 0 | rebalance_load_thresh, |
4167 | 0 | pmd_alb->rebalance_improve_thresh); |
4168 | 0 | } else { |
4169 | 0 | pmd_alb->rebalance_poll_timer = 0; |
4170 | 0 | VLOG_INFO("PMD auto load balance is disabled."); |
4171 | 0 | } |
4172 | 0 | } |
4173 | 0 | } |
4174 | | |
4175 | | static int |
4176 | | parse_pmd_sleep_list(const char *max_sleep_list, |
4177 | | struct pmd_sleep **pmd_sleeps) |
4178 | 0 | { |
4179 | 0 | char *list, *copy, *key, *value; |
4180 | 0 | int num_vals = 0; |
4181 | |
|
4182 | 0 | if (!max_sleep_list) { |
4183 | 0 | return num_vals; |
4184 | 0 | } |
4185 | | |
4186 | 0 | list = copy = xstrdup(max_sleep_list); |
4187 | |
|
4188 | 0 | while (ofputil_parse_key_value(&list, &key, &value)) { |
4189 | 0 | uint64_t temp, pmd_max_sleep; |
4190 | 0 | char *error = NULL; |
4191 | 0 | unsigned core; |
4192 | 0 | int i; |
4193 | |
|
4194 | 0 | error = str_to_u64(key, &temp); |
4195 | 0 | if (error) { |
4196 | 0 | free(error); |
4197 | 0 | continue; |
4198 | 0 | } |
4199 | | |
4200 | 0 | if (value[0] == '\0') { |
4201 | | /* No value specified; the key is the datapath-wide default. */
4202 | 0 | core = UINT_MAX; |
4203 | 0 | pmd_max_sleep = temp; |
4204 | 0 | } else { |
4205 | 0 | error = str_to_u64(value, &pmd_max_sleep); |
4206 | 0 | if (!error && temp < UINT_MAX) { |
4207 | | /* Key is pmd core id. */ |
4208 | 0 | core = (unsigned) temp; |
4209 | 0 | } else { |
4210 | 0 | free(error); |
4211 | 0 | continue; |
4212 | 0 | } |
4213 | 0 | } |
4214 | | |
4215 | | /* Detect duplicate entries for the same core; the last value wins. */
4216 | 0 | for (i = 0; i < num_vals; i++) { |
4217 | 0 | if ((*pmd_sleeps)[i].core_id == core) { |
4218 | 0 | break; |
4219 | 0 | } |
4220 | 0 | } |
4221 | 0 | if (i == num_vals) { |
4222 | | /* Not duplicate, add a new entry. */ |
4223 | 0 | *pmd_sleeps = xrealloc(*pmd_sleeps, |
4224 | 0 | (num_vals + 1) * sizeof **pmd_sleeps); |
4225 | 0 | num_vals++; |
4226 | 0 | } |
4227 | |
|
4228 | 0 | pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep); |
4229 | |
|
4230 | 0 | (*pmd_sleeps)[i].core_id = core; |
4231 | 0 | (*pmd_sleeps)[i].max_sleep = pmd_max_sleep; |
4232 | 0 | } |
4233 | |
|
4234 | 0 | free(copy); |
4235 | 0 | return num_vals; |
4236 | 0 | } |
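/* Illustrative example (not part of the original source): parsing the
 * hypothetical string "50,3:100" would yield two entries, subject to the
 * PMD_RCU_QUIESCE_INTERVAL cap applied above:
 *
 *     struct pmd_sleep *sleeps = NULL;
 *     int n = parse_pmd_sleep_list("50,3:100", &sleeps);
 *
 * giving n == 2 with sleeps[0] = { .core_id = UINT_MAX, .max_sleep = 50 }
 * (the datapath-wide default) and sleeps[1] = { .core_id = 3,
 * .max_sleep = 100 } (an override for the PMD on core 3).  The caller is
 * responsible for free()ing the returned array. */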
4237 | | |
4238 | | static void |
4239 | | log_pmd_sleep(unsigned core_id, int numa_id, uint64_t pmd_max_sleep) |
4240 | 0 | { |
4241 | 0 | if (core_id == NON_PMD_CORE_ID) { |
4242 | 0 | return; |
4243 | 0 | } |
4244 | 0 | VLOG_INFO("PMD thread on numa_id: %d, core id: %2d, " |
4245 | 0 | "max sleep: %4"PRIu64" us.", numa_id, core_id, pmd_max_sleep); |
4246 | 0 | } |
4247 | | |
4248 | | static void |
4249 | | pmd_init_max_sleep(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd) |
4250 | 0 | { |
4251 | 0 | uint64_t max_sleep = dp->pmd_max_sleep_default; |
4252 | 0 | struct pmd_sleep *pmd_sleeps = NULL; |
4253 | 0 | int num_vals; |
4254 | |
|
4255 | 0 | num_vals = parse_pmd_sleep_list(dp->max_sleep_list, &pmd_sleeps); |
4256 | | |
4257 | | /* Check if the user has set a specific value for this pmd. */ |
4258 | 0 | for (int i = 0; i < num_vals; i++) { |
4259 | 0 | if (pmd_sleeps[i].core_id == pmd->core_id) { |
4260 | 0 | max_sleep = pmd_sleeps[i].max_sleep; |
4261 | 0 | break; |
4262 | 0 | } |
4263 | 0 | } |
4264 | 0 | atomic_init(&pmd->max_sleep, max_sleep); |
4265 | 0 | log_pmd_sleep(pmd->core_id, pmd->numa_id, max_sleep); |
4266 | 0 | free(pmd_sleeps); |
4267 | 0 | } |
4268 | | |
4269 | | static bool |
4270 | | assign_sleep_values_to_pmds(struct dp_netdev *dp, int num_vals, |
4271 | | struct pmd_sleep *pmd_sleeps) |
4272 | 0 | { |
4273 | 0 | struct dp_netdev_pmd_thread *pmd; |
4274 | 0 | bool value_changed = false; |
4275 | |
|
4276 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
4277 | 0 | uint64_t new_max_sleep, cur_pmd_max_sleep; |
4278 | |
|
4279 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
4280 | 0 | continue; |
4281 | 0 | } |
4282 | | |
4283 | | /* Default to global value. */ |
4284 | 0 | new_max_sleep = dp->pmd_max_sleep_default; |
4285 | | |
4286 | | /* Check for pmd specific value. */ |
4287 | 0 | for (int i = 0; i < num_vals; i++) { |
4288 | 0 | if (pmd->core_id == pmd_sleeps[i].core_id) { |
4289 | 0 | new_max_sleep = pmd_sleeps[i].max_sleep; |
4290 | 0 | break; |
4291 | 0 | } |
4292 | 0 | } |
4293 | 0 | atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep); |
4294 | 0 | if (new_max_sleep != cur_pmd_max_sleep) { |
4295 | 0 | atomic_store_relaxed(&pmd->max_sleep, new_max_sleep); |
4296 | 0 | value_changed = true; |
4297 | 0 | } |
4298 | 0 | } |
4299 | 0 | return value_changed; |
4300 | 0 | } |
4301 | | |
4302 | | static void |
4303 | | log_all_pmd_sleeps(struct dp_netdev *dp) |
4304 | 0 | { |
4305 | 0 | struct dp_netdev_pmd_thread **pmd_list = NULL; |
4306 | 0 | struct dp_netdev_pmd_thread *pmd; |
4307 | 0 | size_t n; |
4308 | |
|
4309 | 0 | VLOG_INFO("Default PMD thread max sleep: %4"PRIu64" us.", |
4310 | 0 | dp->pmd_max_sleep_default); |
4311 | |
|
4312 | 0 | sorted_poll_thread_list(dp, &pmd_list, &n); |
4313 | |
|
4314 | 0 | for (size_t i = 0; i < n; i++) { |
4315 | 0 | uint64_t cur_pmd_max_sleep; |
4316 | |
|
4317 | 0 | pmd = pmd_list[i]; |
4318 | 0 | atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep); |
4319 | 0 | log_pmd_sleep(pmd->core_id, pmd->numa_id, cur_pmd_max_sleep); |
4320 | 0 | } |
4321 | 0 | free(pmd_list); |
4322 | 0 | } |
4323 | | |
4324 | | static bool |
4325 | | set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config) |
4326 | 0 | { |
4327 | 0 | const char *max_sleep_list = smap_get(config, "pmd-sleep-max"); |
4328 | 0 | struct pmd_sleep *pmd_sleeps = NULL; |
4329 | 0 | uint64_t default_max_sleep = 0; |
4330 | 0 | bool default_changed = false; |
4331 | 0 | bool pmd_changed = false; |
4332 | 0 | uint64_t pmd_maxsleep; |
4333 | 0 | int num_vals = 0; |
4334 | | |
4335 | | /* Check for deprecated 'pmd-maxsleep' value. */ |
4336 | 0 | pmd_maxsleep = smap_get_ullong(config, "pmd-maxsleep", UINT64_MAX); |
4337 | 0 | if (pmd_maxsleep != UINT64_MAX && !max_sleep_list) { |
4338 | 0 | VLOG_WARN_ONCE("pmd-maxsleep is deprecated. " |
4339 | 0 | "Please use pmd-sleep-max instead."); |
4340 | 0 | default_max_sleep = pmd_maxsleep; |
4341 | 0 | } |
4342 | | |
4343 | | /* Check if there is no change in string or value. */ |
4344 | 0 | if (!!dp->max_sleep_list == !!max_sleep_list) { |
4345 | 0 | if (max_sleep_list |
4346 | 0 | ? nullable_string_is_equal(max_sleep_list, dp->max_sleep_list) |
4347 | 0 | : default_max_sleep == dp->pmd_max_sleep_default) { |
4348 | 0 | return false; |
4349 | 0 | } |
4350 | 0 | } |
4351 | | |
4352 | | /* Free existing string and copy new one (if any). */ |
4353 | 0 | free(dp->max_sleep_list); |
4354 | 0 | dp->max_sleep_list = nullable_xstrdup(max_sleep_list); |
4355 | |
|
4356 | 0 | if (max_sleep_list) { |
4357 | 0 | num_vals = parse_pmd_sleep_list(max_sleep_list, &pmd_sleeps); |
4358 | | |
4359 | | /* Check if the user has set a global value. */ |
4360 | 0 | for (int i = 0; i < num_vals; i++) { |
4361 | 0 | if (pmd_sleeps[i].core_id == UINT_MAX) { |
4362 | 0 | default_max_sleep = pmd_sleeps[i].max_sleep; |
4363 | 0 | break; |
4364 | 0 | } |
4365 | 0 | } |
4366 | 0 | } |
4367 | |
|
4368 | 0 | if (dp->pmd_max_sleep_default != default_max_sleep) { |
4369 | 0 | dp->pmd_max_sleep_default = default_max_sleep; |
4370 | 0 | default_changed = true; |
4371 | 0 | } |
4372 | 0 | pmd_changed = assign_sleep_values_to_pmds(dp, num_vals, pmd_sleeps); |
4373 | |
|
4374 | 0 | free(pmd_sleeps); |
4375 | 0 | return default_changed || pmd_changed; |
4376 | 0 | } |
4377 | | |
4378 | | /* Applies datapath configuration from the database. Some of the changes are |
4379 | | * actually applied in dpif_netdev_run(). */ |
4380 | | static int |
4381 | | dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) |
4382 | 0 | { |
4383 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
4384 | 0 | const char *cmask = smap_get(other_config, "pmd-cpu-mask"); |
4385 | 0 | const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign", |
4386 | 0 | "cycles"); |
4387 | 0 | unsigned long long insert_prob = |
4388 | 0 | smap_get_ullong(other_config, "emc-insert-inv-prob", |
4389 | 0 | DEFAULT_EM_FLOW_INSERT_INV_PROB); |
4390 | 0 | uint32_t insert_min, cur_min; |
4391 | 0 | uint32_t tx_flush_interval, cur_tx_flush_interval; |
4392 | 0 | uint64_t rebalance_intvl; |
4393 | 0 | uint8_t cur_rebalance_load; |
4394 | 0 | uint32_t rebalance_load, rebalance_improve; |
4395 | 0 | bool log_autolb = false; |
4396 | 0 | enum sched_assignment_type pmd_rxq_assign_type; |
4397 | 0 | static bool first_set_config = true; |
4398 | |
|
4399 | 0 | tx_flush_interval = smap_get_int(other_config, "tx-flush-interval", |
4400 | 0 | DEFAULT_TX_FLUSH_INTERVAL); |
4401 | 0 | atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval); |
4402 | 0 | if (tx_flush_interval != cur_tx_flush_interval) { |
4403 | 0 | atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval); |
4404 | 0 | VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us", |
4405 | 0 | tx_flush_interval); |
4406 | 0 | } |
4407 | |
|
4408 | 0 | if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) { |
4409 | 0 | free(dp->pmd_cmask); |
4410 | 0 | dp->pmd_cmask = nullable_xstrdup(cmask); |
4411 | 0 | dp_netdev_request_reconfigure(dp); |
4412 | 0 | } |
4413 | |
|
4414 | 0 | atomic_read_relaxed(&dp->emc_insert_min, &cur_min); |
4415 | 0 | if (insert_prob <= UINT32_MAX) { |
4416 | 0 | insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob; |
4417 | 0 | } else { |
4418 | 0 | insert_min = DEFAULT_EM_FLOW_INSERT_MIN; |
4419 | 0 | insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB; |
4420 | 0 | } |
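/* Worked example (illustrative, not part of the original source): with
 * emc-insert-inv-prob=100, insert_min becomes UINT32_MAX / 100 (~42949672),
 * so a flow is inserted into the EMC only when a 32-bit random draw falls
 * at or below that threshold, i.e. with a probability of roughly 1/100 (1%).
 * An inverse probability of 0 yields insert_min == 0, which disables EMC
 * insertion entirely. */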
4421 | |
|
4422 | 0 | if (insert_min != cur_min) { |
4423 | 0 | atomic_store_relaxed(&dp->emc_insert_min, insert_min); |
4424 | 0 | if (insert_min == 0) { |
4425 | 0 | VLOG_INFO("EMC insertion probability changed to zero"); |
4426 | 0 | } else { |
4427 | 0 | VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)", |
4428 | 0 | insert_prob, (100 / (float)insert_prob)); |
4429 | 0 | } |
4430 | 0 | } |
4431 | |
|
4432 | 0 | bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false); |
4433 | 0 | bool cur_perf_enabled; |
4434 | 0 | atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled); |
4435 | 0 | if (perf_enabled != cur_perf_enabled) { |
4436 | 0 | atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled); |
4437 | 0 | if (perf_enabled) { |
4438 | 0 | VLOG_INFO("PMD performance metrics collection enabled"); |
4439 | 0 | } else { |
4440 | 0 | VLOG_INFO("PMD performance metrics collection disabled"); |
4441 | 0 | } |
4442 | 0 | } |
4443 | |
4444 | 0 | bool smc_enable = smap_get_bool(other_config, "smc-enable", false); |
4445 | 0 | bool cur_smc; |
4446 | 0 | atomic_read_relaxed(&dp->smc_enable_db, &cur_smc); |
4447 | 0 | if (smc_enable != cur_smc) { |
4448 | 0 | atomic_store_relaxed(&dp->smc_enable_db, smc_enable); |
4449 | 0 | if (smc_enable) { |
4450 | 0 | VLOG_INFO("SMC cache is enabled"); |
4451 | 0 | } else { |
4452 | 0 | VLOG_INFO("SMC cache is disabled"); |
4453 | 0 | } |
4454 | 0 | } |
4455 | |
4456 | 0 | if (!strcmp(pmd_rxq_assign, "roundrobin")) { |
4457 | 0 | pmd_rxq_assign_type = SCHED_ROUNDROBIN; |
4458 | 0 | } else if (!strcmp(pmd_rxq_assign, "cycles")) { |
4459 | 0 | pmd_rxq_assign_type = SCHED_CYCLES; |
4460 | 0 | } else if (!strcmp(pmd_rxq_assign, "group")) { |
4461 | 0 | pmd_rxq_assign_type = SCHED_GROUP; |
4462 | 0 | } else { |
4463 | | /* Default. */ |
4464 | 0 | VLOG_WARN("Unsupported rx queue to PMD assignment mode in " |
4465 | 0 | "pmd-rxq-assign. Defaulting to 'cycles'."); |
4466 | 0 | pmd_rxq_assign_type = SCHED_CYCLES; |
4467 | 0 | pmd_rxq_assign = "cycles"; |
4468 | 0 | } |
4469 | 0 | if (dp->pmd_rxq_assign_type != pmd_rxq_assign_type) { |
4470 | 0 | dp->pmd_rxq_assign_type = pmd_rxq_assign_type; |
4471 | 0 | VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.", |
4472 | 0 | pmd_rxq_assign); |
4473 | 0 | dp_netdev_request_reconfigure(dp); |
4474 | 0 | } |
4475 | |
4476 | 0 | bool pmd_iso = smap_get_bool(other_config, "pmd-rxq-isolate", true); |
4477 | |
4478 | 0 | if (pmd_rxq_assign_type != SCHED_GROUP && pmd_iso == false) { |
4479 | | /* Invalid combination. */ |
4480 | 0 | VLOG_WARN("pmd-rxq-isolate can only be set false " |
4481 | 0 | "when using pmd-rxq-assign=group"); |
4482 | 0 | pmd_iso = true; |
4483 | 0 | } |
4484 | 0 | if (dp->pmd_iso != pmd_iso) { |
4485 | 0 | dp->pmd_iso = pmd_iso; |
4486 | 0 | if (pmd_iso) { |
4487 | 0 | VLOG_INFO("pmd-rxq-affinity isolates PMD core"); |
4488 | 0 | } else { |
4489 | 0 | VLOG_INFO("pmd-rxq-affinity does not isolate PMD core"); |
4490 | 0 | } |
4491 | 0 | dp_netdev_request_reconfigure(dp); |
4492 | 0 | } |
4493 | |
4494 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
4495 | |
4496 | 0 | rebalance_intvl = smap_get_ullong(other_config, |
4497 | 0 | "pmd-auto-lb-rebal-interval", |
4498 | 0 | ALB_REBALANCE_INTERVAL); |
4499 | 0 | if (rebalance_intvl > MAX_ALB_REBALANCE_INTERVAL) { |
4500 | 0 | rebalance_intvl = ALB_REBALANCE_INTERVAL; |
4501 | 0 | } |
4502 | | |
4503 | | /* Input is in min, convert it to msec. */ |
4504 | 0 | rebalance_intvl = |
4505 | 0 | rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC; |
4506 | |
4507 | 0 | if (pmd_alb->rebalance_intvl != rebalance_intvl) { |
4508 | 0 | pmd_alb->rebalance_intvl = rebalance_intvl; |
4509 | 0 | VLOG_INFO("PMD auto load balance interval set to " |
4510 | 0 | "%"PRIu64" mins\n", rebalance_intvl / MIN_TO_MSEC); |
4511 | 0 | log_autolb = true; |
4512 | 0 | } |
4513 | |
4514 | 0 | rebalance_improve = smap_get_uint(other_config, |
4515 | 0 | "pmd-auto-lb-improvement-threshold", |
4516 | 0 | ALB_IMPROVEMENT_THRESHOLD); |
4517 | 0 | if (rebalance_improve > 100) { |
4518 | 0 | rebalance_improve = ALB_IMPROVEMENT_THRESHOLD; |
4519 | 0 | } |
4520 | 0 | if (rebalance_improve != pmd_alb->rebalance_improve_thresh) { |
4521 | 0 | pmd_alb->rebalance_improve_thresh = rebalance_improve; |
4522 | 0 | VLOG_INFO("PMD auto load balance improvement threshold set to " |
4523 | 0 | "%"PRIu32"%%", rebalance_improve); |
4524 | 0 | log_autolb = true; |
4525 | 0 | } |
4526 | |
4527 | 0 | rebalance_load = smap_get_uint(other_config, "pmd-auto-lb-load-threshold", |
4528 | 0 | ALB_LOAD_THRESHOLD); |
4529 | 0 | if (rebalance_load > 100) { |
4530 | 0 | rebalance_load = ALB_LOAD_THRESHOLD; |
4531 | 0 | } |
4532 | 0 | atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, &cur_rebalance_load); |
4533 | 0 | if (rebalance_load != cur_rebalance_load) { |
4534 | 0 | atomic_store_relaxed(&pmd_alb->rebalance_load_thresh, |
4535 | 0 | rebalance_load); |
4536 | 0 | VLOG_INFO("PMD auto load balance load threshold set to %"PRIu32"%%", |
4537 | 0 | rebalance_load); |
4538 | 0 | log_autolb = true; |
4539 | 0 | } |
4540 | |
4541 | 0 | bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false); |
4542 | |
4543 | 0 | set_pmd_auto_lb(dp, autolb_state, log_autolb); |
4544 | |
4545 | 0 | bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config); |
4546 | 0 | if (first_set_config || sleep_changed) { |
4547 | 0 | log_all_pmd_sleeps(dp); |
4548 | 0 | } |
4549 | |
4550 | 0 | if (first_set_config) { |
4551 | 0 | dpif_offload_datapath_register_flow_unreference_cb( |
4552 | 0 | dpif, offload_flow_reference_unreference_cb); |
4553 | 0 | } |
4554 | |
4555 | 0 | first_set_config = false; |
4556 | 0 | return 0; |
4557 | 0 | } |
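
The arithmetic in dpif_netdev_set_config() above is compact; the following standalone sketch (illustrative values only, not the OVS API) shows how 'emc-insert-inv-prob' becomes the 'insert_min' threshold and how the auto-load-balance interval is converted from minutes to milliseconds:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: mirrors the two conversions performed in
     * dpif_netdev_set_config() above.  The input values are made up. */
    int
    main(void)
    {
        /* emc-insert-inv-prob=100 means "insert roughly 1 flow per 100 hits". */
        unsigned long long insert_prob = 100;
        uint32_t insert_min = insert_prob ? UINT32_MAX / insert_prob : 0;

        /* A flow is inserted when a pseudo-random 32-bit value falls below
         * 'insert_min', i.e. with probability of about 1/insert_prob. */
        printf("insert_min = %"PRIu32" (~%.2f%% of the hash space)\n",
               insert_min, 100.0 * insert_min / UINT32_MAX);

        /* pmd-auto-lb-rebal-interval is configured in minutes but stored in
         * milliseconds; zero falls back to the one-minute default. */
        unsigned long long rebalance_intvl_min = 5;
        unsigned long long min_to_msec = 60000;
        unsigned long long rebalance_intvl_msec =
            rebalance_intvl_min ? rebalance_intvl_min * min_to_msec : min_to_msec;
        printf("rebalance interval = %llu ms\n", rebalance_intvl_msec);

        return 0;
    }
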
4558 | | |
4559 | | static bool |
4560 | | dpif_netdev_number_handlers_required(struct dpif *dpif_ OVS_UNUSED, |
4561 | | uint32_t *n_handlers) |
4562 | 0 | { |
4563 | 0 | *n_handlers = 0; |
4564 | 0 | return true; |
4565 | 0 | } |
4566 | | |
4567 | | /* Parses affinity list and returns result in 'core_ids'. */ |
4568 | | static int |
4569 | | parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq) |
4570 | 0 | { |
4571 | 0 | unsigned i; |
4572 | 0 | char *list, *copy, *key, *value; |
4573 | 0 | int error = 0; |
4574 | |
4575 | 0 | for (i = 0; i < n_rxq; i++) { |
4576 | 0 | core_ids[i] = OVS_CORE_UNSPEC; |
4577 | 0 | } |
4578 | |
|
4579 | 0 | if (!affinity_list) { |
4580 | 0 | return 0; |
4581 | 0 | } |
4582 | | |
4583 | 0 | list = copy = xstrdup(affinity_list); |
4584 | |
4585 | 0 | while (ofputil_parse_key_value(&list, &key, &value)) { |
4586 | 0 | int rxq_id, core_id; |
4587 | |
4588 | 0 | if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0 |
4589 | 0 | || !str_to_int(value, 0, &core_id) || core_id < 0) { |
4590 | 0 | error = EINVAL; |
4591 | 0 | break; |
4592 | 0 | } |
4593 | | |
4594 | 0 | if (rxq_id < n_rxq) { |
4595 | 0 | core_ids[rxq_id] = core_id; |
4596 | 0 | } |
4597 | 0 | } |
4598 | |
4599 | 0 | free(copy); |
4600 | 0 | return error; |
4601 | 0 | } |
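
parse_affinity_list() consumes a comma-separated list of "rxq:core" pairs (for example "0:3,1:7"). The sketch below reproduces the idea with only the standard library; ofputil_parse_key_value() is OVS-specific, so strtok() and sscanf() stand in for it, and the input string is invented:

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Toy parser for an "rxq:core,rxq:core,..." affinity list. */
    int
    main(void)
    {
        const unsigned n_rxq = 4;
        unsigned core_ids[4];
        char list[] = "0:3,1:7";            /* Example input, not from the source. */

        for (unsigned i = 0; i < n_rxq; i++) {
            core_ids[i] = UINT_MAX;          /* Stands in for OVS_CORE_UNSPEC. */
        }

        for (char *kv = strtok(list, ","); kv; kv = strtok(NULL, ",")) {
            int rxq_id, core_id;

            if (sscanf(kv, "%d:%d", &rxq_id, &core_id) != 2
                || rxq_id < 0 || core_id < 0) {
                fprintf(stderr, "invalid entry '%s'\n", kv);
                return EXIT_FAILURE;
            }
            if ((unsigned) rxq_id < n_rxq) {  /* Out-of-range rxq ids are ignored. */
                core_ids[rxq_id] = core_id;
            }
        }

        for (unsigned i = 0; i < n_rxq; i++) {
            printf("rxq %u -> core %d\n", i,
                   core_ids[i] == UINT_MAX ? -1 : (int) core_ids[i]);
        }
        return 0;
    }
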
4602 | | |
4603 | | /* Parses 'affinity_list' and applies configuration if it is valid. */ |
4604 | | static int |
4605 | | dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port, |
4606 | | const char *affinity_list) |
4607 | 0 | { |
4608 | 0 | unsigned *core_ids, i; |
4609 | 0 | int error = 0; |
4610 | |
4611 | 0 | core_ids = xmalloc(port->n_rxq * sizeof *core_ids); |
4612 | 0 | if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) { |
4613 | 0 | error = EINVAL; |
4614 | 0 | goto exit; |
4615 | 0 | } |
4616 | | |
4617 | 0 | for (i = 0; i < port->n_rxq; i++) { |
4618 | 0 | port->rxqs[i].core_id = core_ids[i]; |
4619 | 0 | } |
4620 | |
4621 | 0 | exit: |
4622 | 0 | free(core_ids); |
4623 | 0 | return error; |
4624 | 0 | } |
4625 | | |
4626 | | /* Returns 'true' if one of the 'port's RX queues exists in 'poll_list' |
4627 | | * of given PMD thread. */ |
4628 | | static bool |
4629 | | dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd, |
4630 | | struct dp_netdev_port *port) |
4631 | | OVS_EXCLUDED(pmd->port_mutex) |
4632 | 0 | { |
4633 | 0 | struct rxq_poll *poll; |
4634 | 0 | bool found = false; |
4635 | |
4636 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
4637 | 0 | HMAP_FOR_EACH (poll, node, &pmd->poll_list) { |
4638 | 0 | if (port == poll->rxq->port) { |
4639 | 0 | found = true; |
4640 | 0 | break; |
4641 | 0 | } |
4642 | 0 | } |
4643 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
4644 | 0 | return found; |
4645 | 0 | } |
4646 | | |
4647 | | /* Updates port configuration from the database. The changes are actually |
4648 | | * applied in dpif_netdev_run(). */ |
4649 | | static int |
4650 | | dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no, |
4651 | | const struct smap *cfg) |
4652 | 0 | { |
4653 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
4654 | 0 | struct dp_netdev_port *port; |
4655 | 0 | int error = 0; |
4656 | 0 | const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity"); |
4657 | 0 | bool emc_enabled = smap_get_bool(cfg, "emc-enable", true); |
4658 | 0 | const char *tx_steering_mode = smap_get(cfg, "tx-steering"); |
4659 | 0 | enum txq_req_mode txq_mode; |
4660 | |
4661 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
4662 | 0 | error = get_port_by_number(dp, port_no, &port); |
4663 | 0 | if (error) { |
4664 | 0 | goto unlock; |
4665 | 0 | } |
4666 | | |
4667 | 0 | if (emc_enabled != port->emc_enabled) { |
4668 | 0 | struct dp_netdev_pmd_thread *pmd; |
4669 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
4670 | 0 | uint32_t cur_min, insert_prob; |
4671 | |
4672 | 0 | port->emc_enabled = emc_enabled; |
4673 | | /* Mark all the threads that poll this port for reload and request |
4674 | | * a reconfiguration so that the reload actually takes place. */ |
4675 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
4676 | 0 | if (dpif_netdev_pmd_polls_port(pmd, port)) { |
4677 | 0 | pmd->need_reload = true; |
4678 | 0 | } |
4679 | 0 | } |
4680 | 0 | dp_netdev_request_reconfigure(dp); |
4681 | |
4682 | 0 | ds_put_format(&ds, "%s: EMC has been %s.", |
4683 | 0 | netdev_get_name(port->netdev), |
4684 | 0 | (emc_enabled) ? "enabled" : "disabled"); |
4685 | 0 | if (emc_enabled) { |
4686 | 0 | ds_put_cstr(&ds, " Current insertion probability is "); |
4687 | 0 | atomic_read_relaxed(&dp->emc_insert_min, &cur_min); |
4688 | 0 | if (!cur_min) { |
4689 | 0 | ds_put_cstr(&ds, "zero."); |
4690 | 0 | } else { |
4691 | 0 | insert_prob = UINT32_MAX / cur_min; |
4692 | 0 | ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).", |
4693 | 0 | insert_prob, 100 / (float) insert_prob); |
4694 | 0 | } |
4695 | 0 | } |
4696 | 0 | VLOG_INFO("%s", ds_cstr(&ds)); |
4697 | 0 | ds_destroy(&ds); |
4698 | 0 | } |
4699 | | |
4700 | | /* Checking for RXq affinity changes. */ |
4701 | 0 | if (netdev_is_pmd(port->netdev) |
4702 | 0 | && !nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) { |
4703 | |
4704 | 0 | error = dpif_netdev_port_set_rxq_affinity(port, affinity_list); |
4705 | 0 | if (error) { |
4706 | 0 | goto unlock; |
4707 | 0 | } |
4708 | 0 | free(port->rxq_affinity_list); |
4709 | 0 | port->rxq_affinity_list = nullable_xstrdup(affinity_list); |
4710 | |
4711 | 0 | dp_netdev_request_reconfigure(dp); |
4712 | 0 | } |
4713 | | |
4714 | 0 | if (nullable_string_is_equal(tx_steering_mode, "hash")) { |
4715 | 0 | txq_mode = TXQ_REQ_MODE_HASH; |
4716 | 0 | } else { |
4717 | 0 | txq_mode = TXQ_REQ_MODE_THREAD; |
4718 | 0 | } |
4719 | |
4720 | 0 | if (txq_mode != port->txq_requested_mode) { |
4721 | 0 | port->txq_requested_mode = txq_mode; |
4722 | 0 | VLOG_INFO("%s: Tx packet steering mode has been set to '%s'.", |
4723 | 0 | netdev_get_name(port->netdev), |
4724 | 0 | (txq_mode == TXQ_REQ_MODE_THREAD) ? "thread" : "hash"); |
4725 | 0 | dp_netdev_request_reconfigure(dp); |
4726 | 0 | } |
4727 | |
4728 | 0 | unlock: |
4729 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
4730 | 0 | return error; |
4731 | 0 | } |
4732 | | |
4733 | | static int |
4734 | | dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED, |
4735 | | uint32_t queue_id, uint32_t *priority) |
4736 | 0 | { |
4737 | 0 | *priority = queue_id; |
4738 | 0 | return 0; |
4739 | 0 | } |
4740 | | |
4741 | | |
4742 | | /* Creates and returns a new 'struct dp_netdev_actions', whose actions are |
4743 | | * a copy of the 'size' bytes of 'actions' input parameters. */ |
4744 | | struct dp_netdev_actions * |
4745 | | dp_netdev_actions_create(const struct nlattr *actions, size_t size) |
4746 | 0 | { |
4747 | 0 | struct dp_netdev_actions *netdev_actions; |
4748 | |
4749 | 0 | netdev_actions = xmalloc(sizeof *netdev_actions + size); |
4750 | 0 | netdev_actions->size = size; |
4751 | 0 | if (size) { |
4752 | 0 | memcpy(netdev_actions->actions, actions, size); |
4753 | 0 | } |
4754 | |
4755 | 0 | return netdev_actions; |
4756 | 0 | } |
4757 | | |
4758 | | struct dp_netdev_actions * |
4759 | | dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow) |
4760 | 0 | { |
4761 | 0 | return ovsrcu_get(struct dp_netdev_actions *, &flow->actions); |
4762 | 0 | } |
4763 | | |
4764 | | static void |
4765 | | dp_netdev_actions_free(struct dp_netdev_actions *actions) |
4766 | 0 | { |
4767 | 0 | free(actions); |
4768 | 0 | } |
4769 | | |
4770 | | static void |
4771 | | dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx, |
4772 | | enum rxq_cycles_counter_type type, |
4773 | | unsigned long long cycles) |
4774 | 0 | { |
4775 | 0 | atomic_store_relaxed(&rx->cycles[type], cycles); |
4776 | 0 | } |
4777 | | |
4778 | | static void |
4779 | | dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx, |
4780 | | enum rxq_cycles_counter_type type, |
4781 | | unsigned long long cycles) |
4782 | 0 | { |
4783 | 0 | non_atomic_ullong_add(&rx->cycles[type], cycles); |
4784 | 0 | } |
4785 | | |
4786 | | static uint64_t |
4787 | | dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx, |
4788 | | enum rxq_cycles_counter_type type) |
4789 | 0 | { |
4790 | 0 | unsigned long long processing_cycles; |
4791 | 0 | atomic_read_relaxed(&rx->cycles[type], &processing_cycles); |
4792 | 0 | return processing_cycles; |
4793 | 0 | } |
4794 | | |
4795 | | static void |
4796 | | dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx, |
4797 | | unsigned long long cycles) |
4798 | 0 | { |
4799 | 0 | unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX; |
4800 | 0 | atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles); |
4801 | 0 | } |
4802 | | |
4803 | | static uint64_t |
4804 | | dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx) |
4805 | 0 | { |
4806 | 0 | unsigned long long processing_cycles; |
4807 | 0 | atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles); |
4808 | 0 | return processing_cycles; |
4809 | 0 | } |
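
dp_netdev_rxq_set_intrvl_cycles() and dp_netdev_rxq_get_intrvl_cycles() above implement a small circular history: an atomically incremented counter selects which slot to overwrite. A stripped-down sketch of the same pattern with plain C11 atomics (N_INTERVALS is an assumption standing in for PMD_INTERVAL_MAX):

    #include <stdatomic.h>
    #include <stdio.h>

    #define N_INTERVALS 12              /* Stand-in for PMD_INTERVAL_MAX. */

    struct rxq_hist {
        atomic_uint idx;                         /* Monotonically increasing. */
        _Atomic unsigned long long cycles[N_INTERVALS];
    };

    static void
    hist_store(struct rxq_hist *h, unsigned long long cycles)
    {
        /* The oldest slot is overwritten once the buffer wraps. */
        unsigned slot = atomic_fetch_add(&h->idx, 1) % N_INTERVALS;
        atomic_store_explicit(&h->cycles[slot], cycles, memory_order_relaxed);
    }

    int
    main(void)
    {
        struct rxq_hist h = { 0 };
        unsigned long long sum = 0;

        for (unsigned long long i = 1; i <= 20; i++) {
            hist_store(&h, i * 1000);            /* Pretend interval measurements. */
        }
        for (int i = 0; i < N_INTERVALS; i++) {
            sum += atomic_load_explicit(&h.cycles[i], memory_order_relaxed);
        }
        printf("sum of last %d intervals: %llu\n", N_INTERVALS, sum);
        return 0;
    }
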
4810 | | |
4811 | | #if ATOMIC_ALWAYS_LOCK_FREE_8B |
4812 | | static inline bool |
4813 | | pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd) |
4814 | 0 | { |
4815 | 0 | bool pmd_perf_enabled; |
4816 | 0 | atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled); |
4817 | 0 | return pmd_perf_enabled; |
4818 | 0 | } |
4819 | | #else |
4820 | | /* If stores and reads of 64-bit integers are not atomic, the full PMD |
4821 | | * performance metrics are not available as locked access to 64 bit |
4822 | | * integers would be prohibitively expensive. */ |
4823 | | static inline bool |
4824 | | pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED) |
4825 | | { |
4826 | | return false; |
4827 | | } |
4828 | | #endif |
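
The #if above compiles the detailed metrics path only when 64-bit loads and stores are always lock-free. A quick standalone check of what a given platform provides, using standard C11 (this is a separate test program, not part of OVS):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        _Atomic uint64_t counter = 0;

        /* 2 means "always lock-free"; that is the property the guard relies on. */
        printf("ATOMIC_LLONG_LOCK_FREE = %d\n", ATOMIC_LLONG_LOCK_FREE);
        printf("this 64-bit object is lock-free: %s\n",
               atomic_is_lock_free(&counter) ? "yes" : "no");
        return 0;
    }
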
4829 | | |
4830 | | static int |
4831 | | dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd, |
4832 | | struct tx_port *p) |
4833 | 0 | { |
4834 | 0 | int i; |
4835 | 0 | int tx_qid; |
4836 | 0 | int output_cnt; |
4837 | 0 | bool concurrent_txqs; |
4838 | 0 | struct cycle_timer timer; |
4839 | 0 | uint64_t cycles; |
4840 | 0 | uint32_t tx_flush_interval; |
4841 | |
4842 | 0 | cycle_timer_start(&pmd->perf_stats, &timer); |
4843 | |
4844 | 0 | output_cnt = dp_packet_batch_size(&p->output_pkts); |
4845 | 0 | ovs_assert(output_cnt > 0); |
4846 | |
4847 | 0 | if (p->port->txq_mode == TXQ_MODE_XPS_HASH) { |
4848 | 0 | int n_txq = netdev_n_txq(p->port->netdev); |
4849 | | |
4850 | | /* Re-batch per txq based on packet hash. */ |
4851 | 0 | struct dp_packet *packet; |
4852 | 0 | DP_PACKET_BATCH_FOR_EACH (j, packet, &p->output_pkts) { |
4853 | 0 | uint32_t hash; |
4854 | |
4855 | 0 | if (OVS_LIKELY(dp_packet_rss_valid(packet))) { |
4856 | 0 | hash = dp_packet_get_rss_hash(packet); |
4857 | 0 | } else { |
4858 | 0 | struct flow flow; |
4859 | |
4860 | 0 | flow_extract(packet, &flow); |
4861 | 0 | hash = flow_hash_5tuple(&flow, 0); |
4862 | 0 | } |
4863 | 0 | dp_packet_batch_add(&p->txq_pkts[hash % n_txq], packet); |
4864 | 0 | } |
4865 | | |
4866 | | /* Flush the batch of each Tx queue. */ |
4867 | 0 | for (i = 0; i < n_txq; i++) { |
4868 | 0 | if (dp_packet_batch_is_empty(&p->txq_pkts[i])) { |
4869 | 0 | continue; |
4870 | 0 | } |
4871 | 0 | netdev_send(p->port->netdev, i, &p->txq_pkts[i], true); |
4872 | 0 | dp_packet_batch_init(&p->txq_pkts[i]); |
4873 | 0 | } |
4874 | 0 | } else { |
4875 | 0 | if (p->port->txq_mode == TXQ_MODE_XPS) { |
4876 | 0 | tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p); |
4877 | 0 | concurrent_txqs = true; |
4878 | 0 | } else { |
4879 | 0 | tx_qid = pmd->static_tx_qid; |
4880 | 0 | concurrent_txqs = false; |
4881 | 0 | } |
4882 | 0 | netdev_send(p->port->netdev, tx_qid, &p->output_pkts, concurrent_txqs); |
4883 | 0 | } |
4884 | 0 | dp_packet_batch_init(&p->output_pkts); |
4885 | | |
4886 | | /* Update time of the next flush. */ |
4887 | 0 | atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval); |
4888 | 0 | p->flush_time = pmd->ctx.now + tx_flush_interval; |
4889 | |
4890 | 0 | ovs_assert(pmd->n_output_batches > 0); |
4891 | 0 | pmd->n_output_batches--; |
4892 | |
4893 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt); |
4894 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1); |
4895 | | |
4896 | | /* Distribute send cycles evenly among transmitted packets and assign to |
4897 | | * their respective rx queues. */ |
4898 | 0 | cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt; |
4899 | 0 | for (i = 0; i < output_cnt; i++) { |
4900 | 0 | if (p->output_pkts_rxqs[i]) { |
4901 | 0 | dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i], |
4902 | 0 | RXQ_CYCLES_PROC_CURR, cycles); |
4903 | 0 | } |
4904 | 0 | } |
4905 | |
4906 | 0 | return output_cnt; |
4907 | 0 | } |
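
With tx-steering=hash, dp_netdev_pmd_flush_output_on_port() above spreads one output batch over the available txqs with a plain 'hash % n_txq'. A self-contained illustration of that re-batching step (fixed arrays instead of dp_packet batches, and the hash values are invented):

    #include <stdint.h>
    #include <stdio.h>

    #define N_TXQ 3
    #define BATCH_SIZE 8

    int
    main(void)
    {
        /* Pretend RSS hashes of the packets in one output batch. */
        uint32_t rss[BATCH_SIZE] = { 7, 19, 3, 42, 8, 15, 23, 4 };
        int txq_pkts[N_TXQ][BATCH_SIZE];
        int txq_cnt[N_TXQ] = { 0 };

        /* Re-batch per txq based on packet hash, as in TXQ_MODE_XPS_HASH. */
        for (int i = 0; i < BATCH_SIZE; i++) {
            uint32_t txq = rss[i] % N_TXQ;
            txq_pkts[txq][txq_cnt[txq]++] = i;
        }

        /* "Send" each non-empty per-txq batch. */
        for (int q = 0; q < N_TXQ; q++) {
            printf("txq %d:", q);
            for (int i = 0; i < txq_cnt[q]; i++) {
                printf(" pkt%d", txq_pkts[q][i]);
            }
            printf("\n");
        }
        return 0;
    }
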
4908 | | |
4909 | | static int |
4910 | | dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd, |
4911 | | bool force) |
4912 | 0 | { |
4913 | 0 | struct tx_port *p; |
4914 | 0 | int output_cnt = 0; |
4915 | |
4916 | 0 | if (!pmd->n_output_batches) { |
4917 | 0 | return 0; |
4918 | 0 | } |
4919 | | |
4920 | 0 | HMAP_FOR_EACH (p, node, &pmd->send_port_cache) { |
4921 | 0 | if (!dp_packet_batch_is_empty(&p->output_pkts) |
4922 | 0 | && (force || pmd->ctx.now >= p->flush_time)) { |
4923 | 0 | output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p); |
4924 | 0 | } |
4925 | 0 | } |
4926 | 0 | return output_cnt; |
4927 | 0 | } |
4928 | | |
4929 | | static int |
4930 | | dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd, |
4931 | | struct dp_netdev_rxq *rxq, |
4932 | | odp_port_t port_no) |
4933 | 0 | { |
4934 | 0 | struct pmd_perf_stats *s = &pmd->perf_stats; |
4935 | 0 | struct dp_packet_batch batch; |
4936 | 0 | struct cycle_timer timer; |
4937 | 0 | int error; |
4938 | 0 | int batch_cnt = 0; |
4939 | 0 | int rem_qlen = 0, *qlen_p = NULL; |
4940 | 0 | uint64_t cycles; |
4941 | | |
4942 | | /* Measure duration for polling and processing rx burst. */ |
4943 | 0 | cycle_timer_start(&pmd->perf_stats, &timer); |
4944 | |
4945 | 0 | pmd->ctx.last_rxq = rxq; |
4946 | 0 | dp_packet_batch_init(&batch); |
4947 | | |
4948 | | /* Fetch the rx queue length only for vhostuser ports. */ |
4949 | 0 | if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) { |
4950 | 0 | qlen_p = &rem_qlen; |
4951 | 0 | } |
4952 | |
4953 | 0 | error = netdev_rxq_recv(rxq->rx, &batch, qlen_p); |
4954 | 0 | if (!error) { |
4955 | | /* At least one packet received. */ |
4956 | 0 | *recirc_depth_get() = 0; |
4957 | 0 | pmd_thread_ctx_time_update(pmd); |
4958 | 0 | batch_cnt = dp_packet_batch_size(&batch); |
4959 | 0 | if (pmd_perf_metrics_enabled(pmd)) { |
4960 | | /* Update batch histogram. */ |
4961 | 0 | s->current.batches++; |
4962 | 0 | histogram_add_sample(&s->pkts_per_batch, batch_cnt); |
4963 | | /* Update the maximum vhost rx queue fill level. */ |
4964 | 0 | if (rxq->is_vhost && rem_qlen >= 0) { |
4965 | 0 | uint32_t qfill = batch_cnt + rem_qlen; |
4966 | 0 | if (qfill > s->current.max_vhost_qfill) { |
4967 | 0 | s->current.max_vhost_qfill = qfill; |
4968 | 0 | } |
4969 | 0 | } |
4970 | 0 | } |
4971 | | |
4972 | | /* Process packet batch. */ |
4973 | 0 | int ret = pmd->netdev_input_func(pmd, &batch, port_no); |
4974 | 0 | if (ret) { |
4975 | 0 | dp_netdev_input(pmd, &batch, port_no); |
4976 | 0 | } |
4977 | | |
4978 | | /* Assign processing cycles to rx queue. */ |
4979 | 0 | cycles = cycle_timer_stop(&pmd->perf_stats, &timer); |
4980 | 0 | dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles); |
4981 | |
4982 | 0 | dp_netdev_pmd_flush_output_packets(pmd, false); |
4983 | 0 | } else { |
4984 | | /* Discard cycles. */ |
4985 | 0 | cycle_timer_stop(&pmd->perf_stats, &timer); |
4986 | 0 | if (error != EAGAIN && error != EOPNOTSUPP) { |
4987 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
4988 | |
4989 | 0 | VLOG_ERR_RL(&rl, "error receiving data from %s: %s", |
4990 | 0 | netdev_rxq_get_name(rxq->rx), ovs_strerror(error)); |
4991 | 0 | } |
4992 | 0 | } |
4993 | |
4994 | 0 | pmd->ctx.last_rxq = NULL; |
4995 | |
4996 | 0 | return batch_cnt; |
4997 | 0 | } |
4998 | | |
4999 | | static struct tx_port * |
5000 | | tx_port_lookup(const struct hmap *hmap, odp_port_t port_no) |
5001 | 0 | { |
5002 | 0 | struct tx_port *tx; |
5003 | |
5004 | 0 | HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) { |
5005 | 0 | if (tx->port->port_no == port_no) { |
5006 | 0 | return tx; |
5007 | 0 | } |
5008 | 0 | } |
5009 | | |
5010 | 0 | return NULL; |
5011 | 0 | } |
5012 | | |
5013 | | static struct tx_bond * |
5014 | | tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id) |
5015 | 0 | { |
5016 | 0 | uint32_t hash = hash_bond_id(bond_id); |
5017 | 0 | struct tx_bond *tx; |
5018 | |
5019 | 0 | CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) { |
5020 | 0 | if (tx->bond_id == bond_id) { |
5021 | 0 | return tx; |
5022 | 0 | } |
5023 | 0 | } |
5024 | 0 | return NULL; |
5025 | 0 | } |
5026 | | |
5027 | | static int |
5028 | | port_reconfigure(struct dp_netdev_port *port) |
5029 | 0 | { |
5030 | 0 | struct netdev *netdev = port->netdev; |
5031 | 0 | int i, err; |
5032 | | |
5033 | | /* Closes the existing 'rxq's. */ |
5034 | 0 | for (i = 0; i < port->n_rxq; i++) { |
5035 | 0 | netdev_rxq_close(port->rxqs[i].rx); |
5036 | 0 | port->rxqs[i].rx = NULL; |
5037 | 0 | } |
5038 | 0 | unsigned last_nrxq = port->n_rxq; |
5039 | 0 | port->n_rxq = 0; |
5040 | | |
5041 | | /* Allows 'netdev' to apply the pending configuration changes. */ |
5042 | 0 | if (netdev_is_reconf_required(netdev) || port->need_reconfigure) { |
5043 | 0 | err = netdev_reconfigure(netdev); |
5044 | 0 | if (err && (err != EOPNOTSUPP)) { |
5045 | 0 | VLOG_ERR("Failed to set interface %s new configuration", |
5046 | 0 | netdev_get_name(netdev)); |
5047 | 0 | return err; |
5048 | 0 | } |
5049 | 0 | } |
5050 | | /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */ |
5051 | 0 | port->rxqs = xrealloc(port->rxqs, |
5052 | 0 | sizeof *port->rxqs * netdev_n_rxq(netdev)); |
5053 | | /* Realloc 'used' counters for tx queues. */ |
5054 | 0 | free(port->txq_used); |
5055 | 0 | port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used); |
5056 | |
5057 | 0 | for (i = 0; i < netdev_n_rxq(netdev); i++) { |
5058 | 0 | bool new_queue = i >= last_nrxq; |
5059 | 0 | if (new_queue) { |
5060 | 0 | memset(&port->rxqs[i], 0, sizeof port->rxqs[i]); |
5061 | 0 | } |
5062 | |
5063 | 0 | port->rxqs[i].port = port; |
5064 | 0 | port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9); |
5065 | |
5066 | 0 | err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i); |
5067 | 0 | if (err) { |
5068 | 0 | return err; |
5069 | 0 | } |
5070 | 0 | port->n_rxq++; |
5071 | 0 | } |
5072 | | |
5073 | | /* Parse affinity list to apply configuration for new queues. */ |
5074 | 0 | dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list); |
5075 | | |
5076 | | /* If reconfiguration was successful, mark it as such so the port can be used. */ |
5077 | 0 | port->need_reconfigure = false; |
5078 | |
5079 | 0 | return 0; |
5080 | 0 | } |
5081 | | |
5082 | | struct sched_numa_list { |
5083 | | struct hmap numas; /* Contains 'struct sched_numa'. */ |
5084 | | }; |
5085 | | |
5086 | | /* Meta data for out-of-place pmd rxq assignments. */ |
5087 | | struct sched_pmd { |
5088 | | struct sched_numa *numa; |
5089 | | /* Associated PMD thread. */ |
5090 | | struct dp_netdev_pmd_thread *pmd; |
5091 | | uint64_t pmd_proc_cycles; |
5092 | | struct dp_netdev_rxq **rxqs; |
5093 | | unsigned n_rxq; |
5094 | | bool isolated; |
5095 | | }; |
5096 | | |
5097 | | struct sched_numa { |
5098 | | struct hmap_node node; |
5099 | | int numa_id; |
5100 | | /* PMDs on numa node. */ |
5101 | | struct sched_pmd *pmds; |
5102 | | /* Num of PMDs on numa node. */ |
5103 | | unsigned n_pmds; |
5104 | | /* Num of isolated PMDs on numa node. */ |
5105 | | unsigned n_isolated; |
5106 | | int rr_cur_index; |
5107 | | bool rr_idx_inc; |
5108 | | }; |
5109 | | |
5110 | | static size_t |
5111 | | sched_numa_list_count(struct sched_numa_list *numa_list) |
5112 | 0 | { |
5113 | 0 | return hmap_count(&numa_list->numas); |
5114 | 0 | } |
5115 | | |
5116 | | static struct sched_numa * |
5117 | | sched_numa_list_next(struct sched_numa_list *numa_list, |
5118 | | const struct sched_numa *numa) |
5119 | 0 | { |
5120 | 0 | struct hmap_node *node = NULL; |
5121 | |
5122 | 0 | if (numa) { |
5123 | 0 | node = hmap_next(&numa_list->numas, &numa->node); |
5124 | 0 | } |
5125 | 0 | if (!node) { |
5126 | 0 | node = hmap_first(&numa_list->numas); |
5127 | 0 | } |
5128 | |
5129 | 0 | return (node) ? CONTAINER_OF(node, struct sched_numa, node) : NULL; |
5130 | 0 | } |
5131 | | |
5132 | | static struct sched_numa * |
5133 | | sched_numa_list_lookup(struct sched_numa_list *numa_list, int numa_id) |
5134 | 0 | { |
5135 | 0 | struct sched_numa *numa; |
5136 | |
5137 | 0 | HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), |
5138 | 0 | &numa_list->numas) { |
5139 | 0 | if (numa->numa_id == numa_id) { |
5140 | 0 | return numa; |
5141 | 0 | } |
5142 | 0 | } |
5143 | 0 | return NULL; |
5144 | 0 | } |
5145 | | |
5146 | | static int |
5147 | | compare_sched_pmd_list(const void *a_, const void *b_) |
5148 | 0 | { |
5149 | 0 | struct sched_pmd *a, *b; |
5150 | |
5151 | 0 | a = (struct sched_pmd *) a_; |
5152 | 0 | b = (struct sched_pmd *) b_; |
5153 | |
5154 | 0 | return compare_poll_thread_list(&a->pmd, &b->pmd); |
5155 | 0 | } |
5156 | | |
5157 | | static void |
5158 | | sort_numa_list_pmds(struct sched_numa_list *numa_list) |
5159 | 0 | { |
5160 | 0 | struct sched_numa *numa; |
5161 | |
5162 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
5163 | 0 | if (numa->n_pmds > 1) { |
5164 | 0 | qsort(numa->pmds, numa->n_pmds, sizeof *numa->pmds, |
5165 | 0 | compare_sched_pmd_list); |
5166 | 0 | } |
5167 | 0 | } |
5168 | 0 | } |
5169 | | |
5170 | | /* Populate numas and pmds on those numas. */ |
5171 | | static void |
5172 | | sched_numa_list_populate(struct sched_numa_list *numa_list, |
5173 | | struct dp_netdev *dp) |
5174 | 0 | { |
5175 | 0 | struct dp_netdev_pmd_thread *pmd; |
5176 | |
5177 | 0 | hmap_init(&numa_list->numas); |
5178 | | |
5179 | | /* For each pmd on this datapath. */ |
5180 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5181 | 0 | struct sched_numa *numa; |
5182 | 0 | struct sched_pmd *sched_pmd; |
5183 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
5184 | 0 | continue; |
5185 | 0 | } |
5186 | | |
5187 | | /* Get the numa of the PMD. */ |
5188 | 0 | numa = sched_numa_list_lookup(numa_list, pmd->numa_id); |
5189 | | /* Create a new numa node for it if not already created. */ |
5190 | 0 | if (!numa) { |
5191 | 0 | numa = xzalloc(sizeof *numa); |
5192 | 0 | numa->numa_id = pmd->numa_id; |
5193 | 0 | hmap_insert(&numa_list->numas, &numa->node, |
5194 | 0 | hash_int(pmd->numa_id, 0)); |
5195 | 0 | } |
5196 | | |
5197 | | /* Create a sched_pmd on this numa for the pmd. */ |
5198 | 0 | numa->n_pmds++; |
5199 | 0 | numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds); |
5200 | 0 | sched_pmd = &numa->pmds[numa->n_pmds - 1]; |
5201 | 0 | memset(sched_pmd, 0, sizeof *sched_pmd); |
5202 | 0 | sched_pmd->numa = numa; |
5203 | 0 | sched_pmd->pmd = pmd; |
5204 | | /* At least one pmd is present so initialize curr_idx and idx_inc. */ |
5205 | 0 | numa->rr_cur_index = 0; |
5206 | 0 | numa->rr_idx_inc = true; |
5207 | 0 | } |
5208 | 0 | sort_numa_list_pmds(numa_list); |
5209 | 0 | } |
5210 | | |
5211 | | static void |
5212 | | sched_numa_list_free_entries(struct sched_numa_list *numa_list) |
5213 | 0 | { |
5214 | 0 | struct sched_numa *numa; |
5215 | |
5216 | 0 | HMAP_FOR_EACH_POP (numa, node, &numa_list->numas) { |
5217 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
5218 | 0 | struct sched_pmd *sched_pmd; |
5219 | |
5220 | 0 | sched_pmd = &numa->pmds[i]; |
5221 | 0 | sched_pmd->n_rxq = 0; |
5222 | 0 | free(sched_pmd->rxqs); |
5223 | 0 | } |
5224 | 0 | numa->n_pmds = 0; |
5225 | 0 | free(numa->pmds); |
5226 | 0 | free(numa); |
5227 | 0 | } |
5228 | 0 | hmap_destroy(&numa_list->numas); |
5229 | 0 | } |
5230 | | |
5231 | | static struct sched_pmd * |
5232 | | sched_pmd_find_by_pmd(struct sched_numa_list *numa_list, |
5233 | | struct dp_netdev_pmd_thread *pmd) |
5234 | 0 | { |
5235 | 0 | struct sched_numa *numa; |
5236 | |
5237 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
5238 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
5239 | 0 | struct sched_pmd *sched_pmd; |
5240 | |
5241 | 0 | sched_pmd = &numa->pmds[i]; |
5242 | 0 | if (pmd == sched_pmd->pmd) { |
5243 | 0 | return sched_pmd; |
5244 | 0 | } |
5245 | 0 | } |
5246 | 0 | } |
5247 | 0 | return NULL; |
5248 | 0 | } |
5249 | | |
5250 | | static void |
5251 | | sched_pmd_add_rxq(struct sched_pmd *sched_pmd, struct dp_netdev_rxq *rxq, |
5252 | | uint64_t cycles) |
5253 | 0 | { |
5254 | | /* 'sched_pmd' is allocated outside this function, so do not assume |
5255 | | * that 'rxqs' is initialized to NULL. */ |
5256 | 0 | if (sched_pmd->n_rxq == 0) { |
5257 | 0 | sched_pmd->rxqs = xmalloc(sizeof *sched_pmd->rxqs); |
5258 | 0 | } else { |
5259 | 0 | sched_pmd->rxqs = xrealloc(sched_pmd->rxqs, (sched_pmd->n_rxq + 1) * |
5260 | 0 | sizeof *sched_pmd->rxqs); |
5261 | 0 | } |
5262 | |
5263 | 0 | sched_pmd->rxqs[sched_pmd->n_rxq++] = rxq; |
5264 | 0 | sched_pmd->pmd_proc_cycles += cycles; |
5265 | 0 | } |
5266 | | |
5267 | | static void |
5268 | | sched_numa_list_assignments(struct sched_numa_list *numa_list, |
5269 | | struct dp_netdev *dp) |
5270 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5271 | 0 | { |
5272 | 0 | struct dp_netdev_port *port; |
5273 | | |
5274 | | /* For each port. */ |
5275 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5276 | 0 | if (!netdev_is_pmd(port->netdev)) { |
5277 | 0 | continue; |
5278 | 0 | } |
5279 | | /* For each rxq on the port. */ |
5280 | 0 | for (unsigned qid = 0; qid < port->n_rxq; qid++) { |
5281 | 0 | struct dp_netdev_rxq *rxq = &port->rxqs[qid]; |
5282 | 0 | struct sched_pmd *sched_pmd; |
5283 | 0 | uint64_t proc_cycles = 0; |
5284 | |
5285 | 0 | for (int i = 0; i < PMD_INTERVAL_MAX; i++) { |
5286 | 0 | proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, i); |
5287 | 0 | } |
5288 | |
5289 | 0 | sched_pmd = sched_pmd_find_by_pmd(numa_list, rxq->pmd); |
5290 | 0 | if (sched_pmd) { |
5291 | 0 | if (rxq->core_id != OVS_CORE_UNSPEC && dp->pmd_iso) { |
5292 | 0 | sched_pmd->isolated = true; |
5293 | 0 | } |
5294 | 0 | sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); |
5295 | 0 | } |
5296 | 0 | } |
5297 | 0 | } |
5298 | 0 | } |
5299 | | |
5300 | | static void |
5301 | | sched_numa_list_put_in_place(struct sched_numa_list *numa_list) |
5302 | 0 | { |
5303 | 0 | struct sched_numa *numa; |
5304 | | |
5305 | | /* For each numa. */ |
5306 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
5307 | | /* For each pmd. */ |
5308 | 0 | for (int i = 0; i < numa->n_pmds; i++) { |
5309 | 0 | struct sched_pmd *sched_pmd; |
5310 | |
5311 | 0 | sched_pmd = &numa->pmds[i]; |
5312 | 0 | sched_pmd->pmd->isolated = sched_pmd->isolated; |
5313 | | /* For each rxq. */ |
5314 | 0 | for (unsigned k = 0; k < sched_pmd->n_rxq; k++) { |
5315 | | /* Store the new pmd from the out of place sched_numa_list |
5316 | | * struct to the dp_netdev_rxq struct */ |
5317 | 0 | sched_pmd->rxqs[k]->pmd = sched_pmd->pmd; |
5318 | 0 | } |
5319 | 0 | } |
5320 | 0 | } |
5321 | 0 | } |
5322 | | |
5323 | | /* Returns 'true' if OVS rxq scheduling algorithm assigned any unpinned rxq to |
5324 | | * a PMD thread core on a non-local numa node. */ |
5325 | | static bool |
5326 | | sched_numa_list_cross_numa_polling(struct sched_numa_list *numa_list) |
5327 | 0 | { |
5328 | 0 | struct sched_numa *numa; |
5329 | |
5330 | 0 | HMAP_FOR_EACH (numa, node, &numa_list->numas) { |
5331 | 0 | for (int i = 0; i < numa->n_pmds; i++) { |
5332 | 0 | struct sched_pmd *sched_pmd; |
5333 | |
5334 | 0 | sched_pmd = &numa->pmds[i]; |
5335 | 0 | if (sched_pmd->isolated) { |
5336 | | /* All rxqs on this PMD thread core are pinned. */ |
5337 | 0 | continue; |
5338 | 0 | } |
5339 | 0 | for (unsigned k = 0; k < sched_pmd->n_rxq; k++) { |
5340 | 0 | struct dp_netdev_rxq *rxq = sched_pmd->rxqs[k]; |
5341 | | /* Check if the rxq is not pinned to a specific PMD thread core |
5342 | | * by the user AND the PMD thread core that OVS assigned is |
5343 | | * non-local to the rxq port. */ |
5344 | 0 | if (rxq->core_id == OVS_CORE_UNSPEC && |
5345 | 0 | rxq->pmd->numa_id != |
5346 | 0 | netdev_get_numa_id(rxq->port->netdev)) { |
5347 | 0 | return true; |
5348 | 0 | } |
5349 | 0 | } |
5350 | 0 | } |
5351 | 0 | } |
5352 | 0 | return false; |
5353 | 0 | } |
5354 | | |
5355 | | static unsigned |
5356 | | sched_numa_noniso_pmd_count(struct sched_numa *numa) |
5357 | 0 | { |
5358 | 0 | if (numa->n_pmds > numa->n_isolated) { |
5359 | 0 | return numa->n_pmds - numa->n_isolated; |
5360 | 0 | } |
5361 | 0 | return 0; |
5362 | 0 | } |
5363 | | |
5364 | | /* Sort Rx Queues by the processing cycles they are consuming. */ |
5365 | | static int |
5366 | | compare_rxq_cycles(const void *a, const void *b) |
5367 | 0 | { |
5368 | 0 | struct dp_netdev_rxq *qa; |
5369 | 0 | struct dp_netdev_rxq *qb; |
5370 | 0 | uint64_t cycles_qa, cycles_qb; |
5371 | |
|
5372 | 0 | qa = *(struct dp_netdev_rxq **) a; |
5373 | 0 | qb = *(struct dp_netdev_rxq **) b; |
5374 | |
|
5375 | 0 | cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST); |
5376 | 0 | cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST); |
5377 | |
|
5378 | 0 | if (cycles_qa != cycles_qb) { |
5379 | 0 | return (cycles_qa < cycles_qb) ? 1 : -1; |
5380 | 0 | } else { |
5381 | | /* Cycles are the same so tiebreak on port/queue id. |
5382 | | * Tiebreaking (as opposed to return 0) ensures consistent |
5383 | | * sort results across multiple OS's. */ |
5384 | 0 | uint32_t port_qa = odp_to_u32(qa->port->port_no); |
5385 | 0 | uint32_t port_qb = odp_to_u32(qb->port->port_no); |
5386 | 0 | if (port_qa != port_qb) { |
5387 | 0 | return port_qa > port_qb ? 1 : -1; |
5388 | 0 | } else { |
5389 | 0 | return netdev_rxq_get_queue_id(qa->rx) |
5390 | 0 | - netdev_rxq_get_queue_id(qb->rx); |
5391 | 0 | } |
5392 | 0 | } |
5393 | 0 | } |
5394 | | |
5395 | | static bool |
5396 | | sched_pmd_new_lowest(struct sched_pmd *current_lowest, struct sched_pmd *pmd, |
5397 | | bool has_proc) |
5398 | 0 | { |
5399 | 0 | uint64_t current_num, pmd_num; |
5400 | |
|
5401 | 0 | if (current_lowest == NULL) { |
5402 | 0 | return true; |
5403 | 0 | } |
5404 | | |
5405 | 0 | if (has_proc) { |
5406 | 0 | current_num = current_lowest->pmd_proc_cycles; |
5407 | 0 | pmd_num = pmd->pmd_proc_cycles; |
5408 | 0 | } else { |
5409 | 0 | current_num = current_lowest->n_rxq; |
5410 | 0 | pmd_num = pmd->n_rxq; |
5411 | 0 | } |
5412 | |
|
5413 | 0 | if (pmd_num < current_num) { |
5414 | 0 | return true; |
5415 | 0 | } |
5416 | 0 | return false; |
5417 | 0 | } |
5418 | | |
5419 | | static struct sched_pmd * |
5420 | | sched_pmd_get_lowest(struct sched_numa *numa, bool has_cyc) |
5421 | 0 | { |
5422 | 0 | struct sched_pmd *lowest_sched_pmd = NULL; |
5423 | |
5424 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
5425 | 0 | struct sched_pmd *sched_pmd; |
5426 | |
5427 | 0 | sched_pmd = &numa->pmds[i]; |
5428 | 0 | if (sched_pmd->isolated) { |
5429 | 0 | continue; |
5430 | 0 | } |
5431 | 0 | if (sched_pmd_new_lowest(lowest_sched_pmd, sched_pmd, has_cyc)) { |
5432 | 0 | lowest_sched_pmd = sched_pmd; |
5433 | 0 | } |
5434 | 0 | } |
5435 | 0 | return lowest_sched_pmd; |
5436 | 0 | } |
5437 | | |
5438 | | /* |
5439 | | * Returns the next pmd from the numa node. |
5440 | | * |
5441 | | * If 'updown' is 'true' it will alternate between selecting the next pmd in |
5442 | | * either an up or down walk, switching between up/down when the first or last |
5443 | | * core is reached. e.g. 1,2,3,3,2,1,1,2... |
5444 | | * |
5445 | | * If 'updown' is 'false' it will select the next pmd wrapping around when |
5446 | | * last core reached. e.g. 1,2,3,1,2,3,1,2... |
5447 | | */ |
5448 | | static struct sched_pmd * |
5449 | | sched_pmd_next_rr(struct sched_numa *numa, bool updown) |
5450 | 0 | { |
5451 | 0 | int numa_idx = numa->rr_cur_index; |
5452 | |
5453 | 0 | if (numa->rr_idx_inc == true) { |
5454 | | /* Incrementing through list of pmds. */ |
5455 | 0 | if (numa->rr_cur_index == numa->n_pmds - 1) { |
5456 | | /* Reached the last pmd. */ |
5457 | 0 | if (updown) { |
5458 | 0 | numa->rr_idx_inc = false; |
5459 | 0 | } else { |
5460 | 0 | numa->rr_cur_index = 0; |
5461 | 0 | } |
5462 | 0 | } else { |
5463 | 0 | numa->rr_cur_index++; |
5464 | 0 | } |
5465 | 0 | } else { |
5466 | | /* Decrementing through list of pmds. */ |
5467 | 0 | if (numa->rr_cur_index == 0) { |
5468 | | /* Reached the first pmd. */ |
5469 | 0 | numa->rr_idx_inc = true; |
5470 | 0 | } else { |
5471 | 0 | numa->rr_cur_index--; |
5472 | 0 | } |
5473 | 0 | } |
5474 | 0 | return &numa->pmds[numa_idx]; |
5475 | 0 | } |
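
The comment above describes the two walk orders in words; running just the index logic for three PMDs and eight picks makes the difference concrete (standalone sketch, 0-based indices):

    #include <stdbool.h>
    #include <stdio.h>

    /* Same index walk as sched_pmd_next_rr(), detached from the OVS structs. */
    static int
    next_rr(int *cur, bool *inc, int n, bool updown)
    {
        int idx = *cur;

        if (*inc) {
            if (*cur == n - 1) {
                if (updown) {
                    *inc = false;
                } else {
                    *cur = 0;
                }
            } else {
                (*cur)++;
            }
        } else {
            if (*cur == 0) {
                *inc = true;
            } else {
                (*cur)--;
            }
        }
        return idx;
    }

    int
    main(void)
    {
        for (int updown = 0; updown <= 1; updown++) {
            int cur = 0;
            bool inc = true;

            printf("updown=%d:", updown);
            for (int i = 0; i < 8; i++) {
                printf(" %d", next_rr(&cur, &inc, 3, updown));
            }
            printf("\n");   /* 0 1 2 0 1 2 0 1  vs  0 1 2 2 1 0 0 1 */
        }
        return 0;
    }
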
5476 | | |
5477 | | static struct sched_pmd * |
5478 | | sched_pmd_next_noniso_rr(struct sched_numa *numa, bool updown) |
5479 | 0 | { |
5480 | 0 | struct sched_pmd *sched_pmd = NULL; |
5481 | | |
5482 | | /* sched_pmd_next_rr() may return duplicate PMDs before all PMDs have been |
5483 | | * returned, depending on 'updown'. Call it more than n_pmds times so that all |
5484 | | * PMDs can be searched for the next non-isolated PMD. */ |
5485 | 0 | for (unsigned i = 0; i < numa->n_pmds * 2; i++) { |
5486 | 0 | sched_pmd = sched_pmd_next_rr(numa, updown); |
5487 | 0 | if (!sched_pmd->isolated) { |
5488 | 0 | break; |
5489 | 0 | } |
5490 | 0 | sched_pmd = NULL; |
5491 | 0 | } |
5492 | 0 | return sched_pmd; |
5493 | 0 | } |
5494 | | |
5495 | | static struct sched_pmd * |
5496 | | sched_pmd_next(struct sched_numa *numa, enum sched_assignment_type algo, |
5497 | | bool has_proc) |
5498 | 0 | { |
5499 | 0 | if (algo == SCHED_GROUP) { |
5500 | 0 | return sched_pmd_get_lowest(numa, has_proc); |
5501 | 0 | } |
5502 | | |
5503 | | /* By default RR the PMDs. */ |
5504 | 0 | return sched_pmd_next_noniso_rr(numa, algo == SCHED_CYCLES ? true : false); |
5505 | 0 | } |
5506 | | |
5507 | | static const char * |
5508 | | get_assignment_type_string(enum sched_assignment_type algo) |
5509 | 0 | { |
5510 | 0 | switch (algo) { |
5511 | 0 | case SCHED_ROUNDROBIN: return "roundrobin"; |
5512 | 0 | case SCHED_CYCLES: return "cycles"; |
5513 | 0 | case SCHED_GROUP: return "group"; |
5514 | 0 | default: return "Unknown"; |
5515 | 0 | } |
5516 | 0 | } |
5517 | | |
5518 | 0 | #define MAX_RXQ_CYC_TEXT 40 |
5519 | 0 | #define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + MAX_RXQ_CYC_TEXT) |
5520 | | |
5521 | | static char * |
5522 | | get_rxq_cyc_log(char *a, enum sched_assignment_type algo, uint64_t cycles) |
5523 | 0 | { |
5524 | 0 | int ret = 0; |
5525 | |
|
5526 | 0 | if (algo != SCHED_ROUNDROBIN) { |
5527 | 0 | ret = snprintf(a, MAX_RXQ_CYC_STRLEN, |
5528 | 0 | " (measured processing cycles %"PRIu64")", cycles); |
5529 | 0 | } |
5530 | |
5531 | 0 | if (algo == SCHED_ROUNDROBIN || ret <= 0) { |
5532 | 0 | a[0] = '\0'; |
5533 | 0 | } |
5534 | 0 | return a; |
5535 | 0 | } |
5536 | | |
5537 | | static void |
5538 | | sched_numa_list_schedule(struct sched_numa_list *numa_list, |
5539 | | struct dp_netdev *dp, |
5540 | | enum sched_assignment_type algo, |
5541 | | enum vlog_level level) |
5542 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5543 | 0 | { |
5544 | 0 | struct dp_netdev_port *port; |
5545 | 0 | struct dp_netdev_rxq **rxqs = NULL; |
5546 | 0 | struct sched_numa *last_cross_numa; |
5547 | 0 | unsigned n_rxqs = 0; |
5548 | 0 | bool start_logged = false; |
5549 | 0 | size_t n_numa; |
5550 | | |
5551 | | /* For each port. */ |
5552 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
5553 | 0 | if (!netdev_is_pmd(port->netdev)) { |
5554 | 0 | continue; |
5555 | 0 | } |
5556 | | |
5557 | | /* For each rxq on the port. */ |
5558 | 0 | for (int qid = 0; qid < port->n_rxq; qid++) { |
5559 | 0 | struct dp_netdev_rxq *rxq = &port->rxqs[qid]; |
5560 | |
5561 | 0 | if (algo != SCHED_ROUNDROBIN) { |
5562 | 0 | uint64_t cycle_hist = 0; |
5563 | | |
5564 | | /* Sum the queue intervals and store the cycle history. */ |
5565 | 0 | for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) { |
5566 | 0 | cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i); |
5567 | 0 | } |
5568 | 0 | dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST, |
5569 | 0 | cycle_hist); |
5570 | 0 | } |
5571 | | |
5572 | | /* Check if this rxq is pinned. */ |
5573 | 0 | if (rxq->core_id != OVS_CORE_UNSPEC) { |
5574 | 0 | struct sched_pmd *sched_pmd; |
5575 | 0 | struct dp_netdev_pmd_thread *pmd; |
5576 | 0 | struct sched_numa *numa; |
5577 | 0 | bool iso = dp->pmd_iso; |
5578 | 0 | uint64_t proc_cycles; |
5579 | 0 | char rxq_cyc_log[MAX_RXQ_CYC_STRLEN]; |
5580 | | |
5581 | | /* This rxq should be pinned, pin it now. */ |
5582 | 0 | pmd = dp_netdev_get_pmd(dp, rxq->core_id); |
5583 | 0 | sched_pmd = sched_pmd_find_by_pmd(numa_list, pmd); |
5584 | 0 | dp_netdev_pmd_unref(pmd); |
5585 | 0 | if (!sched_pmd) { |
5586 | | /* Cannot find the PMD. Cannot pin this rxq. */ |
5587 | 0 | VLOG(level == VLL_DBG ? VLL_DBG : VLL_WARN, |
5588 | 0 | "Core %2u cannot be pinned with " |
5589 | 0 | "port \'%s\' rx queue %d. Use pmd-cpu-mask to " |
5590 | 0 | "enable a pmd on core %u. An alternative core " |
5591 | 0 | "will be assigned.", |
5592 | 0 | rxq->core_id, |
5593 | 0 | netdev_rxq_get_name(rxq->rx), |
5594 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
5595 | 0 | rxq->core_id); |
5596 | 0 | rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs); |
5597 | 0 | rxqs[n_rxqs++] = rxq; |
5598 | 0 | continue; |
5599 | 0 | } |
5600 | 0 | if (iso) { |
5601 | | /* Mark PMD as isolated if not done already. */ |
5602 | 0 | if (sched_pmd->isolated == false) { |
5603 | 0 | sched_pmd->isolated = true; |
5604 | 0 | numa = sched_pmd->numa; |
5605 | 0 | numa->n_isolated++; |
5606 | 0 | } |
5607 | 0 | } |
5608 | 0 | proc_cycles = dp_netdev_rxq_get_cycles(rxq, |
5609 | 0 | RXQ_CYCLES_PROC_HIST); |
5610 | 0 | VLOG(level, "Core %2u on numa node %d is pinned with " |
5611 | 0 | "port \'%s\' rx queue %d%s", |
5612 | 0 | sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id, |
5613 | 0 | netdev_rxq_get_name(rxq->rx), |
5614 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
5615 | 0 | get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); |
5616 | 0 | sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); |
5617 | 0 | } else { |
5618 | 0 | rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs); |
5619 | 0 | rxqs[n_rxqs++] = rxq; |
5620 | 0 | } |
5621 | 0 | } |
5622 | 0 | } |
5623 | |
|
5624 | 0 | if (n_rxqs > 1 && algo != SCHED_ROUNDROBIN) { |
5625 | | /* Sort the queues in order of the processing cycles |
5626 | | * they consumed during their last pmd interval. */ |
5627 | 0 | qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles); |
5628 | 0 | } |
5629 | |
|
5630 | 0 | last_cross_numa = NULL; |
5631 | 0 | n_numa = sched_numa_list_count(numa_list); |
5632 | 0 | for (unsigned i = 0; i < n_rxqs; i++) { |
5633 | 0 | struct dp_netdev_rxq *rxq = rxqs[i]; |
5634 | 0 | struct sched_pmd *sched_pmd = NULL; |
5635 | 0 | struct sched_numa *numa; |
5636 | 0 | int port_numa_id; |
5637 | 0 | uint64_t proc_cycles; |
5638 | 0 | char rxq_cyc_log[MAX_RXQ_CYC_STRLEN]; |
5639 | |
5640 | 0 | if (start_logged == false && level != VLL_DBG) { |
5641 | 0 | VLOG(level, "Performing pmd to rx queue assignment using %s " |
5642 | 0 | "algorithm.", get_assignment_type_string(algo)); |
5643 | 0 | start_logged = true; |
5644 | 0 | } |
5645 | | |
5646 | | /* Store the cycles for this rxq as we will log these later. */ |
5647 | 0 | proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST); |
5648 | |
5649 | 0 | port_numa_id = netdev_get_numa_id(rxq->port->netdev); |
5650 | | |
5651 | | /* Select numa. */ |
5652 | 0 | numa = sched_numa_list_lookup(numa_list, port_numa_id); |
5653 | | |
5654 | | /* Check if numa has no PMDs or no non-isolated PMDs. */ |
5655 | 0 | if (!numa || !sched_numa_noniso_pmd_count(numa)) { |
5656 | | /* Unable to use this numa to find a PMD. */ |
5657 | 0 | numa = NULL; |
5658 | | /* Find any numa with available PMDs. */ |
5659 | 0 | for (int j = 0; j < n_numa; j++) { |
5660 | 0 | numa = sched_numa_list_next(numa_list, last_cross_numa); |
5661 | 0 | last_cross_numa = numa; |
5662 | 0 | if (sched_numa_noniso_pmd_count(numa)) { |
5663 | 0 | break; |
5664 | 0 | } |
5665 | 0 | numa = NULL; |
5666 | 0 | } |
5667 | 0 | } |
5668 | |
5669 | 0 | if (numa) { |
5670 | | /* Select the PMD that should be used for this rxq. */ |
5671 | 0 | sched_pmd = sched_pmd_next(numa, algo, |
5672 | 0 | proc_cycles ? true : false); |
5673 | 0 | } |
5674 | | |
5675 | | /* Check that a pmd has been selected. */ |
5676 | 0 | if (sched_pmd) { |
5677 | 0 | int pmd_numa_id; |
5678 | |
5679 | 0 | pmd_numa_id = sched_pmd->numa->numa_id; |
5680 | | /* Check if selected pmd numa matches port numa. */ |
5681 | 0 | if (pmd_numa_id != port_numa_id) { |
5682 | 0 | VLOG(level, "There's no available (non-isolated) pmd thread " |
5683 | 0 | "on numa node %d. Port \'%s\' rx queue %d will " |
5684 | 0 | "be assigned to a pmd on numa node %d. " |
5685 | 0 | "This may lead to reduced performance.", |
5686 | 0 | port_numa_id, netdev_rxq_get_name(rxq->rx), |
5687 | 0 | netdev_rxq_get_queue_id(rxq->rx), pmd_numa_id); |
5688 | 0 | } |
5689 | 0 | VLOG(level, "Core %2u on numa node %d assigned port \'%s\' " |
5690 | 0 | "rx queue %d%s.", |
5691 | 0 | sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id, |
5692 | 0 | netdev_rxq_get_name(rxq->rx), |
5693 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
5694 | 0 | get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); |
5695 | 0 | sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); |
5696 | 0 | } else { |
5697 | 0 | VLOG(level == VLL_DBG ? level : VLL_WARN, |
5698 | 0 | "No non-isolated pmd on any numa available for " |
5699 | 0 | "port \'%s\' rx queue %d%s. " |
5700 | 0 | "This rx queue will not be polled.", |
5701 | 0 | netdev_rxq_get_name(rxq->rx), |
5702 | 0 | netdev_rxq_get_queue_id(rxq->rx), |
5703 | 0 | get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); |
5704 | 0 | } |
5705 | 0 | } |
5706 | 0 | free(rxqs); |
5707 | 0 | } |
5708 | | |
5709 | | static void |
5710 | | rxq_scheduling(struct dp_netdev *dp) |
5711 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5712 | 0 | { |
5713 | 0 | struct sched_numa_list numa_list; |
5714 | 0 | enum sched_assignment_type algo = dp->pmd_rxq_assign_type; |
5715 | |
5716 | 0 | sched_numa_list_populate(&numa_list, dp); |
5717 | 0 | sched_numa_list_schedule(&numa_list, dp, algo, VLL_INFO); |
5718 | 0 | sched_numa_list_put_in_place(&numa_list); |
5719 | |
5720 | 0 | sched_numa_list_free_entries(&numa_list); |
5721 | 0 | } |
5722 | | |
5723 | | static uint64_t variance(uint64_t a[], int n); |
5724 | | |
5725 | | static uint64_t |
5726 | | sched_numa_variance(struct sched_numa *numa) |
5727 | 0 | { |
5728 | 0 | uint64_t *percent_busy = NULL; |
5729 | 0 | int n_proc = 0; |
5730 | 0 | uint64_t var; |
5731 | |
5732 | 0 | percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy); |
5733 | |
5734 | 0 | for (unsigned i = 0; i < numa->n_pmds; i++) { |
5735 | 0 | struct sched_pmd *sched_pmd; |
5736 | 0 | uint64_t total_cycles = 0; |
5737 | |
5738 | 0 | sched_pmd = &numa->pmds[i]; |
5739 | | /* Exclude isolated PMDs from variance calculations. */ |
5740 | 0 | if (sched_pmd->isolated == true) { |
5741 | 0 | continue; |
5742 | 0 | } |
5743 | | /* Get the total pmd cycles for an interval. */ |
5744 | 0 | atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles); |
5745 | |
5746 | 0 | if (total_cycles) { |
5747 | | /* Estimate the cycles to cover all intervals. */ |
5748 | 0 | total_cycles *= PMD_INTERVAL_MAX; |
5749 | 0 | percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100) |
5750 | 0 | / total_cycles; |
5751 | 0 | } else { |
5752 | 0 | percent_busy[n_proc++] = 0; |
5753 | 0 | } |
5754 | 0 | } |
5755 | 0 | var = variance(percent_busy, n_proc); |
5756 | 0 | free(percent_busy); |
5757 | 0 | return var; |
5758 | 0 | } |
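
sched_numa_variance() reduces each non-isolated PMD to a "percent busy" figure and hands the list to the file's variance() helper. Below is a self-contained version of that calculation, including the improvement percentage the dry run later derives from it; the sample numbers are invented, and variance_of() is only an assumed stand-in for the real helper:

    #include <inttypes.h>
    #include <stdio.h>

    /* Population variance over n values, integer arithmetic only. */
    static uint64_t
    variance_of(const uint64_t a[], int n)
    {
        uint64_t mean = 0, sqdiff = 0;

        if (!n) {
            return 0;
        }
        for (int i = 0; i < n; i++) {
            mean += a[i];
        }
        mean /= n;
        for (int i = 0; i < n; i++) {
            uint64_t d = a[i] > mean ? a[i] - mean : mean - a[i];
            sqdiff += d * d;
        }
        return sqdiff / n;
    }

    int
    main(void)
    {
        /* Percent-busy of three non-isolated PMDs before and after a
         * hypothetical reassignment. */
        uint64_t current[] = { 90, 40, 20 };
        uint64_t estimate[] = { 55, 50, 45 };
        uint64_t cur_var = variance_of(current, 3);
        uint64_t est_var = variance_of(estimate, 3);
        uint64_t improvement =
            est_var < cur_var ? (cur_var - est_var) * 100 / cur_var : 0;

        printf("current %"PRIu64", estimated %"PRIu64", improvement %"PRIu64"%%\n",
               cur_var, est_var, improvement);
        return 0;
    }
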
5759 | | |
5760 | | /* |
5761 | | * This function checks that some basic conditions needed for a rebalance to be |
5762 | | * effective are met, such as the Rxq scheduling assignment type, more than |
5763 | | * one non-isolated PMD, and a PMD polling more than one Rxq. If there was |
5764 | | * no reconfiguration change since the last check, it reuses the last result. |
5765 | | * |
5766 | | * It is not intended to be an inclusive check of every condition that may make |
5767 | | * a rebalance ineffective. It is done as a quick check so a full |
5768 | | * pmd_rebalance_dry_run() can be avoided when it is not needed. |
5769 | | */ |
5770 | | static bool |
5771 | | pmd_rebalance_dry_run_needed(struct dp_netdev *dp) |
5772 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5773 | 0 | { |
5774 | 0 | struct dp_netdev_pmd_thread *pmd; |
5775 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
5776 | 0 | unsigned int cnt = 0; |
5777 | 0 | bool multi_rxq = false; |
5778 | | |
5779 | | /* Check if there was no reconfiguration since last check. */ |
5780 | 0 | if (!pmd_alb->recheck_config) { |
5781 | 0 | if (!pmd_alb->do_dry_run) { |
5782 | 0 | VLOG_DBG("PMD auto load balance nothing to do, " |
5783 | 0 | "no configuration changes since last check."); |
5784 | 0 | return false; |
5785 | 0 | } |
5786 | 0 | return true; |
5787 | 0 | } |
5788 | 0 | pmd_alb->recheck_config = false; |
5789 | | |
5790 | | /* Check for incompatible assignment type. */ |
5791 | 0 | if (dp->pmd_rxq_assign_type == SCHED_ROUNDROBIN) { |
5792 | 0 | VLOG_DBG("PMD auto load balance nothing to do, " |
5793 | 0 | "pmd-rxq-assign=roundrobin assignment type configured."); |
5794 | 0 | return pmd_alb->do_dry_run = false; |
5795 | 0 | } |
5796 | | |
5797 | | /* Check that there is at least 2 non-isolated PMDs and |
5798 | | * one of them is polling more than one rxq. */ |
5799 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5800 | 0 | if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) { |
5801 | 0 | continue; |
5802 | 0 | } |
5803 | | |
5804 | 0 | if (hmap_count(&pmd->poll_list) > 1) { |
5805 | 0 | multi_rxq = true; |
5806 | 0 | } |
5807 | 0 | if (cnt && multi_rxq) { |
5808 | 0 | return pmd_alb->do_dry_run = true; |
5809 | 0 | } |
5810 | 0 | cnt++; |
5811 | 0 | } |
5812 | | |
5813 | 0 | VLOG_DBG("PMD auto load balance nothing to do, " |
5814 | 0 | "not enough non-isolated PMDs or RxQs."); |
5815 | 0 | return pmd_alb->do_dry_run = false; |
5816 | 0 | } |
5817 | | |
5818 | | static bool |
5819 | | pmd_rebalance_dry_run(struct dp_netdev *dp) |
5820 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5821 | 0 | { |
5822 | 0 | struct sched_numa_list numa_list_cur; |
5823 | 0 | struct sched_numa_list numa_list_est; |
5824 | 0 | bool thresh_met = false; |
5825 | |
5826 | 0 | VLOG_DBG("PMD auto load balance performing dry run."); |
5827 | | |
5828 | | /* Populate current assignments. */ |
5829 | 0 | sched_numa_list_populate(&numa_list_cur, dp); |
5830 | 0 | sched_numa_list_assignments(&numa_list_cur, dp); |
5831 | | |
5832 | | /* Populate estimated assignments. */ |
5833 | 0 | sched_numa_list_populate(&numa_list_est, dp); |
5834 | 0 | sched_numa_list_schedule(&numa_list_est, dp, |
5835 | 0 | dp->pmd_rxq_assign_type, VLL_DBG); |
5836 | | |
5837 | | /* Proceed only if there is no cross-numa polling or only one numa has PMDs. */ |
5838 | 0 | if (!sched_numa_list_cross_numa_polling(&numa_list_est) || |
5839 | 0 | sched_numa_list_count(&numa_list_est) == 1) { |
5840 | 0 | struct sched_numa *numa_cur; |
5841 | | |
5842 | | /* Calculate variances. */ |
5843 | 0 | HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) { |
5844 | 0 | uint64_t current_var, estimate_var; |
5845 | 0 | struct sched_numa *numa_est; |
5846 | 0 | uint64_t improvement = 0; |
5847 | |
5848 | 0 | numa_est = sched_numa_list_lookup(&numa_list_est, |
5849 | 0 | numa_cur->numa_id); |
5850 | 0 | if (!numa_est) { |
5851 | 0 | continue; |
5852 | 0 | } |
5853 | 0 | current_var = sched_numa_variance(numa_cur); |
5854 | 0 | estimate_var = sched_numa_variance(numa_est); |
5855 | 0 | if (estimate_var < current_var) { |
5856 | 0 | improvement = ((current_var - estimate_var) * 100) |
5857 | 0 | / current_var; |
5858 | 0 | } |
5859 | 0 | VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated " |
5860 | 0 | "variance %"PRIu64". Variance improvement %"PRIu64"%%.", |
5861 | 0 | numa_cur->numa_id, current_var, |
5862 | 0 | estimate_var, improvement); |
5863 | 0 | if (improvement >= dp->pmd_alb.rebalance_improve_thresh) { |
5864 | 0 | thresh_met = true; |
5865 | 0 | } |
5866 | 0 | } |
5867 | 0 | VLOG_DBG("PMD load variance improvement threshold %u%% is %s.", |
5868 | 0 | dp->pmd_alb.rebalance_improve_thresh, |
5869 | 0 | thresh_met ? "met" : "not met"); |
5870 | 0 | } else { |
5871 | 0 | VLOG_DBG("PMD auto load balance detected cross-numa polling with " |
5872 | 0 | "multiple numa nodes. Unable to accurately estimate."); |
5873 | 0 | } |
5874 | |
5875 | 0 | sched_numa_list_free_entries(&numa_list_cur); |
5876 | 0 | sched_numa_list_free_entries(&numa_list_est); |
5877 | |
5878 | 0 | return thresh_met; |
5879 | 0 | } |
5880 | | |
5881 | | static void |
5882 | | reload_affected_pmds(struct dp_netdev *dp) |
5883 | 0 | { |
5884 | 0 | struct dp_netdev_pmd_thread *pmd; |
5885 | |
5886 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5887 | 0 | if (pmd->need_reload) { |
5888 | 0 | dp_netdev_reload_pmd__(pmd); |
5889 | 0 | } |
5890 | 0 | } |
5891 | |
5892 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5893 | 0 | if (pmd->need_reload) { |
5894 | 0 | if (pmd->core_id != NON_PMD_CORE_ID) { |
5895 | 0 | bool reload; |
5896 | |
5897 | 0 | do { |
5898 | 0 | atomic_read_explicit(&pmd->reload, &reload, |
5899 | 0 | memory_order_acquire); |
5900 | 0 | } while (reload); |
5901 | 0 | } |
5902 | 0 | pmd->need_reload = false; |
5903 | 0 | } |
5904 | 0 | } |
5905 | 0 | } |
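
The second loop in reload_affected_pmds() spins on each PMD's 'reload' flag with acquire semantics, pairing with the release store the PMD thread issues once it has picked up the new configuration. A minimal standalone C11 illustration of that handshake (names invented; OVS uses its own atomics and threading wrappers, not <threads.h>):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <threads.h>

    static atomic_bool reload = true;
    static int applied_cfg;     /* Plain data published by the release store. */

    static int
    pmd_thread(void *arg)
    {
        (void) arg;
        applied_cfg = 42;                       /* "Apply" the configuration. */
        /* Release store: everything written above becomes visible to any
         * thread that observes reload == false with an acquire load. */
        atomic_store_explicit(&reload, false, memory_order_release);
        return 0;
    }

    int
    main(void)
    {
        thrd_t t;

        thrd_create(&t, pmd_thread, NULL);

        /* Same shape as the wait loop in reload_affected_pmds(). */
        while (atomic_load_explicit(&reload, memory_order_acquire)) {
            /* Busy wait until the PMD signals it has reloaded. */
        }
        printf("pmd applied cfg %d\n", applied_cfg);   /* Guaranteed to be 42. */

        thrd_join(t, NULL);
        return 0;
    }
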
5906 | | |
5907 | | static void |
5908 | | reconfigure_pmd_threads(struct dp_netdev *dp) |
5909 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
5910 | 0 | { |
5911 | 0 | struct dp_netdev_pmd_thread *pmd; |
5912 | 0 | struct ovs_numa_dump *pmd_cores; |
5913 | 0 | struct ovs_numa_info_core *core; |
5914 | 0 | struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete); |
5915 | 0 | struct hmapx_node *node; |
5916 | 0 | bool changed = false; |
5917 | 0 | bool need_to_adjust_static_tx_qids = false; |
5918 | | |
5919 | | /* The pmd threads should be started only if there's a pmd port in the |
5920 | | * datapath. If the user didn't provide any "pmd-cpu-mask", we start |
5921 | | * NR_PMD_THREADS per numa node. */ |
5922 | 0 | if (!has_pmd_port(dp)) { |
5923 | 0 | pmd_cores = ovs_numa_dump_n_cores_per_numa(0); |
5924 | 0 | } else if (dp->pmd_cmask && dp->pmd_cmask[0]) { |
5925 | 0 | pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask); |
5926 | 0 | } else { |
5927 | 0 | pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS); |
5928 | 0 | } |
5929 | | |
5930 | | /* We need to adjust 'static_tx_qid's only if we're reducing number of |
5931 | | * PMD threads. Otherwise, new threads will allocate all the freed ids. */ |
5932 | 0 | if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) { |
5933 | | /* Adjustment is required to keep 'static_tx_qid's sequential and |
5934 | | * avoid possible issues, for example, imbalanced tx queue usage |
5935 | | * and unnecessary locking caused by remapping on netdev level. */ |
5936 | 0 | need_to_adjust_static_tx_qids = true; |
5937 | 0 | } |
5938 | | |
5939 | | /* Check for unwanted pmd threads */ |
5940 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
5941 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
5942 | 0 | continue; |
5943 | 0 | } |
5944 | 0 | if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id, |
5945 | 0 | pmd->core_id)) { |
5946 | 0 | hmapx_add(&to_delete, pmd); |
5947 | 0 | } else if (need_to_adjust_static_tx_qids) { |
5948 | 0 | atomic_store_relaxed(&pmd->reload_tx_qid, true); |
5949 | 0 | pmd->need_reload = true; |
5950 | 0 | } |
5951 | 0 | } |
5952 | |
|
5953 | 0 | HMAPX_FOR_EACH (node, &to_delete) { |
5954 | 0 | pmd = (struct dp_netdev_pmd_thread *) node->data; |
5955 | 0 | VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.", |
5956 | 0 | pmd->numa_id, pmd->core_id); |
5957 | 0 | dp_netdev_del_pmd(dp, pmd); |
5958 | 0 | } |
5959 | 0 | changed = !hmapx_is_empty(&to_delete); |
5960 | 0 | hmapx_destroy(&to_delete); |
5961 | |
|
5962 | 0 | if (need_to_adjust_static_tx_qids) { |
5963 | | /* 'static_tx_qid's are not sequential now. |
5964 | | * Reload remaining threads to fix this. */ |
5965 | 0 | reload_affected_pmds(dp); |
5966 | 0 | } |
5967 | | |
5968 | | /* Check for required new pmd threads */ |
5969 | 0 | FOR_EACH_CORE_ON_DUMP(core, pmd_cores) { |
5970 | 0 | pmd = dp_netdev_get_pmd(dp, core->core_id); |
5971 | 0 | if (!pmd) { |
5972 | 0 | struct ds name = DS_EMPTY_INITIALIZER; |
5973 | |
|
5974 | 0 | pmd = xzalloc(sizeof *pmd); |
5975 | 0 | dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id); |
5976 | |
|
5977 | 0 | ds_put_format(&name, "pmd-c%02d/id:", core->core_id); |
5978 | 0 | pmd->thread = ovs_thread_create(ds_cstr(&name), |
5979 | 0 | pmd_thread_main, pmd); |
5980 | 0 | ds_destroy(&name); |
5981 | |
|
5982 | 0 | VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.", |
5983 | 0 | pmd->numa_id, pmd->core_id); |
5984 | 0 | changed = true; |
5985 | 0 | } else { |
5986 | 0 | dp_netdev_pmd_unref(pmd); |
5987 | 0 | } |
5988 | 0 | } |
5989 | |
|
5990 | 0 | if (changed) { |
5991 | 0 | struct ovs_numa_info_numa *numa; |
5992 | | |
5993 | | /* Log the number of pmd threads per numa node. */ |
5994 | 0 | FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) { |
5995 | 0 | VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d", |
5996 | 0 | numa->n_cores, numa->numa_id); |
5997 | 0 | } |
5998 | 0 | } |
5999 | |
|
6000 | 0 | ovs_numa_dump_destroy(pmd_cores); |
6001 | 0 | } |
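
For illustration of the 'need_to_adjust_static_tx_qids' check above: 'dp->poll_threads' also contains the non-PMD thread, hence the '- 1' in the comparison. A minimal standalone sketch with made-up counts (the variable names below are illustrative, not OVS symbols):

    #include <stdbool.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Suppose 4 PMD threads plus the non-PMD thread are currently in
         * 'dp->poll_threads' (5 entries) and the new core dump selects only
         * 2 cores.  2 < 5 - 1, so the surviving threads are reloaded and
         * re-allocate their 'static_tx_qid's to keep the ids sequential. */
        int n_new_cores = 2;
        int n_poll_threads = 5;
        bool need_to_adjust = n_new_cores < n_poll_threads - 1;

        printf("need_to_adjust_static_tx_qids = %d\n", need_to_adjust);
        return 0;
    }
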
6002 | | |
6003 | | static void |
6004 | | pmd_remove_stale_ports(struct dp_netdev *dp, |
6005 | | struct dp_netdev_pmd_thread *pmd) |
6006 | | OVS_EXCLUDED(pmd->port_mutex) |
6007 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
6008 | 0 | { |
6009 | 0 | struct rxq_poll *poll; |
6010 | 0 | struct tx_port *tx; |
6011 | |
|
6012 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
6013 | 0 | HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) { |
6014 | 0 | struct dp_netdev_port *port = poll->rxq->port; |
6015 | |
|
6016 | 0 | if (port->need_reconfigure |
6017 | 0 | || !hmap_contains(&dp->ports, &port->node)) { |
6018 | 0 | dp_netdev_del_rxq_from_pmd(pmd, poll); |
6019 | 0 | } |
6020 | 0 | } |
6021 | 0 | HMAP_FOR_EACH_SAFE (tx, node, &pmd->tx_ports) { |
6022 | 0 | struct dp_netdev_port *port = tx->port; |
6023 | |
|
6024 | 0 | if (port->need_reconfigure |
6025 | 0 | || !hmap_contains(&dp->ports, &port->node)) { |
6026 | 0 | dp_netdev_del_port_tx_from_pmd(pmd, tx); |
6027 | 0 | } |
6028 | 0 | } |
6029 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
6030 | 0 | } |
6031 | | |
6032 | | /* Must be called each time a port is added/removed or the cmask changes. |
6033 | | * This creates and destroys pmd threads, reconfigures ports, opens their |
6034 | | * rxqs and assigns all rxqs/txqs to pmd threads. */ |
6035 | | static void |
6036 | | reconfigure_datapath(struct dp_netdev *dp) |
6037 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
6038 | 0 | { |
6039 | 0 | struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads); |
6040 | 0 | struct dp_netdev_pmd_thread *pmd; |
6041 | 0 | struct dp_netdev_port *port; |
6042 | 0 | int wanted_txqs; |
6043 | |
|
6044 | 0 | dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq); |
6045 | | |
6046 | | /* Step 1: Adjust the pmd threads based on the datapath ports, the cores |
6047 | | * on the system and the user configuration. */ |
6048 | 0 | reconfigure_pmd_threads(dp); |
6049 | |
|
6050 | 0 | wanted_txqs = cmap_count(&dp->poll_threads); |
6051 | | |
6052 | | /* The number of pmd threads might have changed, or a port can be new: |
6053 | | * adjust the txqs. */ |
6054 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6055 | 0 | netdev_set_tx_multiq(port->netdev, wanted_txqs); |
6056 | 0 | } |
6057 | | |
6058 | | /* Step 2: Remove from the pmd threads ports that have been removed or |
6059 | | * need reconfiguration. */ |
6060 | | |
6061 | | /* Check for all the ports that need reconfiguration. We cache this in |
6062 | | * 'port->need_reconfigure', because netdev_is_reconf_required() can |
6063 | | * change at any time. |
6064 | | * Also mark for reconfiguration all ports which will likely change their |
6065 | | * 'txq_mode' parameter. It's required to stop using them before |
6066 | | * changing this setting and it's simpler to mark ports here and allow |
6067 | | * 'pmd_remove_stale_ports' to remove them from threads. There will be |
6068 | | * no actual reconfiguration in 'port_reconfigure' because it's |
6069 | | * unnecessary. */ |
6070 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6071 | 0 | if (netdev_is_reconf_required(port->netdev) |
6072 | 0 | || ((port->txq_mode == TXQ_MODE_XPS) |
6073 | 0 | != (netdev_n_txq(port->netdev) < wanted_txqs)) |
6074 | 0 | || ((port->txq_mode == TXQ_MODE_XPS_HASH) |
6075 | 0 | != (port->txq_requested_mode == TXQ_REQ_MODE_HASH |
6076 | 0 | && netdev_n_txq(port->netdev) > 1))) { |
6077 | 0 | port->need_reconfigure = true; |
6078 | 0 | } |
6079 | 0 | } |
6080 | | |
6081 | | /* Remove from the pmd threads all the ports that have been deleted or |
6082 | | * need reconfiguration. */ |
6083 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6084 | 0 | pmd_remove_stale_ports(dp, pmd); |
6085 | 0 | } |
6086 | | |
6087 | | /* Reload affected pmd threads. We must wait for the pmd threads before |
6088 | | * reconfiguring the ports, because a port cannot be reconfigured while |
6089 | | * it's being used. */ |
6090 | 0 | reload_affected_pmds(dp); |
6091 | | |
6092 | | /* Step 3: Reconfigure ports. */ |
6093 | | |
6094 | | /* We only reconfigure the ports that we determined above, because they're |
6095 | | * not being used by any pmd thread at the moment. If a port fails to |
6096 | | * reconfigure we remove it from the datapath. */ |
6097 | 0 | HMAP_FOR_EACH_SAFE (port, node, &dp->ports) { |
6098 | 0 | int err; |
6099 | |
|
6100 | 0 | if (!port->need_reconfigure) { |
6101 | 0 | continue; |
6102 | 0 | } |
6103 | | |
6104 | 0 | err = port_reconfigure(port); |
6105 | 0 | if (err) { |
6106 | 0 | hmap_remove(&dp->ports, &port->node); |
6107 | 0 | seq_change(dp->port_seq); |
6108 | 0 | port_destroy(port); |
6109 | 0 | } else { |
6110 | | /* With a single queue, there is no point in using hash mode. */ |
6111 | 0 | if (port->txq_requested_mode == TXQ_REQ_MODE_HASH && |
6112 | 0 | netdev_n_txq(port->netdev) > 1) { |
6113 | 0 | port->txq_mode = TXQ_MODE_XPS_HASH; |
6114 | 0 | } else if (netdev_n_txq(port->netdev) < wanted_txqs) { |
6115 | 0 | port->txq_mode = TXQ_MODE_XPS; |
6116 | 0 | } else { |
6117 | 0 | port->txq_mode = TXQ_MODE_STATIC; |
6118 | 0 | } |
6119 | 0 | } |
6120 | 0 | } |
6121 | | |
6122 | | /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads |
6123 | | * for now, we just update the 'pmd' pointer in each rxq to point to the |
6124 | | * wanted thread according to the scheduling policy. */ |
6125 | | |
6126 | | /* Reset all the pmd threads to non-isolated. */
6127 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6128 | 0 | pmd->isolated = false; |
6129 | 0 | } |
6130 | | |
6131 | | /* Reset all the queues to unassigned */ |
6132 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6133 | 0 | for (int i = 0; i < port->n_rxq; i++) { |
6134 | 0 | port->rxqs[i].pmd = NULL; |
6135 | 0 | } |
6136 | 0 | } |
6137 | 0 | rxq_scheduling(dp); |
6138 | | |
6139 | | /* Step 5: Remove queues not compliant with new scheduling. */ |
6140 | | |
6141 | | /* Count all the threads that will have at least one queue to poll. */ |
6142 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6143 | 0 | for (int qid = 0; qid < port->n_rxq; qid++) { |
6144 | 0 | struct dp_netdev_rxq *q = &port->rxqs[qid]; |
6145 | |
|
6146 | 0 | if (q->pmd) { |
6147 | 0 | hmapx_add(&busy_threads, q->pmd); |
6148 | 0 | } |
6149 | 0 | } |
6150 | 0 | } |
6151 | |
|
6152 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6153 | 0 | struct rxq_poll *poll; |
6154 | |
|
6155 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
6156 | 0 | HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) { |
6157 | 0 | if (poll->rxq->pmd != pmd) { |
6158 | 0 | dp_netdev_del_rxq_from_pmd(pmd, poll); |
6159 | | |
6160 | | /* This pmd might sleep after this step if it has no rxq |
6161 | | * remaining. Tell it to busy-wait for a new assignment if it
6162 | | * has at least one scheduled queue. */ |
6163 | 0 | if (hmap_count(&pmd->poll_list) == 0 && |
6164 | 0 | hmapx_contains(&busy_threads, pmd)) { |
6165 | 0 | atomic_store_relaxed(&pmd->wait_for_reload, true); |
6166 | 0 | } |
6167 | 0 | } |
6168 | 0 | } |
6169 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
6170 | 0 | } |
6171 | |
|
6172 | 0 | hmapx_destroy(&busy_threads); |
6173 | | |
6174 | | /* Reload affected pmd threads. We must wait for the pmd threads to remove |
6175 | | * the old queues before re-adding them, otherwise a queue can be polled by
6176 | | * two threads at the same time. */ |
6177 | 0 | reload_affected_pmds(dp); |
6178 | | |
6179 | | /* Step 6: Add queues from scheduling, if they're not there already. */ |
6180 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6181 | 0 | if (!netdev_is_pmd(port->netdev)) { |
6182 | 0 | continue; |
6183 | 0 | } |
6184 | | |
6185 | 0 | for (int qid = 0; qid < port->n_rxq; qid++) { |
6186 | 0 | struct dp_netdev_rxq *q = &port->rxqs[qid]; |
6187 | |
|
6188 | 0 | if (q->pmd) { |
6189 | 0 | ovs_mutex_lock(&q->pmd->port_mutex); |
6190 | 0 | dp_netdev_add_rxq_to_pmd(q->pmd, q); |
6191 | 0 | ovs_mutex_unlock(&q->pmd->port_mutex); |
6192 | 0 | } |
6193 | 0 | } |
6194 | 0 | } |
6195 | | |
6196 | | /* Add every port and bond to the tx port and bond caches of |
6197 | | * every pmd thread, if it's not there already and if this pmd |
6198 | | * has at least one rxq to poll. |
6199 | | */ |
6200 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6201 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
6202 | 0 | if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) { |
6203 | 0 | struct tx_bond *bond; |
6204 | |
|
6205 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6206 | 0 | dp_netdev_add_port_tx_to_pmd(pmd, port); |
6207 | 0 | } |
6208 | |
|
6209 | 0 | CMAP_FOR_EACH (bond, node, &dp->tx_bonds) { |
6210 | 0 | dp_netdev_add_bond_tx_to_pmd(pmd, bond, false); |
6211 | 0 | } |
6212 | 0 | } |
6213 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
6214 | 0 | } |
6215 | | |
6216 | | /* Reload affected pmd threads. */ |
6217 | 0 | reload_affected_pmds(dp); |
6218 | | |
6219 | | /* PMD ALB will need to recheck if dry run needed. */ |
6220 | 0 | dp->pmd_alb.recheck_config = true; |
6221 | 0 | } |
6222 | | |
6223 | | /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */ |
6224 | | static bool |
6225 | | ports_require_restart(const struct dp_netdev *dp) |
6226 | | OVS_REQ_RDLOCK(dp->port_rwlock) |
6227 | 0 | { |
6228 | 0 | struct dp_netdev_port *port; |
6229 | |
|
6230 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6231 | 0 | if (netdev_is_reconf_required(port->netdev)) { |
6232 | 0 | return true; |
6233 | 0 | } |
6234 | 0 | } |
6235 | | |
6236 | 0 | return false; |
6237 | 0 | } |
6238 | | |
6239 | | /* Calculates the variance of the values stored in array 'a'. 'n' is the
6240 | | * number of elements in the array to consider when calculating the
6241 | | * variance. Usage example: data array 'a' contains the processing load of
6242 | | * each pmd and 'n' is the number of PMDs. It returns the variance in the
6243 | | * processing load of the PMDs. */
6244 | | static uint64_t |
6245 | | variance(uint64_t a[], int n) |
6246 | 0 | { |
6247 | | /* Compute mean (average of elements). */ |
6248 | 0 | uint64_t sum = 0; |
6249 | 0 | uint64_t mean = 0; |
6250 | 0 | uint64_t sqDiff = 0; |
6251 | |
|
6252 | 0 | if (!n) { |
6253 | 0 | return 0; |
6254 | 0 | } |
6255 | | |
6256 | 0 | for (int i = 0; i < n; i++) { |
6257 | 0 | sum += a[i]; |
6258 | 0 | } |
6259 | |
|
6260 | 0 | if (sum) { |
6261 | 0 | mean = sum / n; |
6262 | | |
6263 | | /* Compute sum squared differences with mean. */ |
6264 | 0 | for (int i = 0; i < n; i++) { |
6265 | 0 | sqDiff += (a[i] - mean)*(a[i] - mean); |
6266 | 0 | } |
6267 | 0 | } |
6268 | 0 | return (sqDiff ? (sqDiff / n) : 0); |
6269 | 0 | } |
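
For reference, a simplified standalone version of the integer arithmetic used by variance() above, applied to made-up per-PMD load figures. The unsigned wrap-around in '(a[i] - mean)' when a[i] < mean is harmless because the difference is immediately squared:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t
    variance_demo(const uint64_t a[], int n)
    {
        uint64_t sum = 0, mean, sq_diff = 0;

        if (!n) {
            return 0;
        }
        for (int i = 0; i < n; i++) {
            sum += a[i];
        }
        mean = sum / n;                 /* Truncating integer mean. */
        for (int i = 0; i < n; i++) {
            sq_diff += (a[i] - mean) * (a[i] - mean);
        }
        return sq_diff / n;
    }

    int
    main(void)
    {
        /* Made-up per-PMD processing loads, in percent. */
        uint64_t loads[] = { 90, 50, 10, 30 };

        /* mean = 45; squared diffs = 2025 + 25 + 1225 + 225 = 3500;
         * variance = 3500 / 4 = 875. */
        printf("variance = %"PRIu64"\n", variance_demo(loads, 4));
        return 0;
    }
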
6270 | | |
6271 | | /* Return true if needs to revalidate datapath flows. */ |
6272 | | static bool |
6273 | | dpif_netdev_run(struct dpif *dpif) |
6274 | 0 | { |
6275 | 0 | struct dp_netdev_port *port; |
6276 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
6277 | 0 | struct dp_netdev_pmd_thread *non_pmd; |
6278 | 0 | uint64_t new_tnl_seq; |
6279 | 0 | bool need_to_flush = true; |
6280 | 0 | bool pmd_rebalance = false; |
6281 | 0 | long long int now = time_msec(); |
6282 | 0 | struct dp_netdev_pmd_thread *pmd; |
6283 | |
|
6284 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
6285 | 0 | non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID); |
6286 | 0 | if (non_pmd) { |
6287 | 0 | ovs_mutex_lock(&dp->non_pmd_mutex); |
6288 | |
|
6289 | 0 | atomic_read_relaxed(&dp->smc_enable_db, &non_pmd->ctx.smc_enable_db); |
6290 | |
|
6291 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6292 | 0 | if (!netdev_is_pmd(port->netdev)) { |
6293 | 0 | int i; |
6294 | |
|
6295 | 0 | if (port->emc_enabled) { |
6296 | 0 | atomic_read_relaxed(&dp->emc_insert_min, |
6297 | 0 | &non_pmd->ctx.emc_insert_min); |
6298 | 0 | } else { |
6299 | 0 | non_pmd->ctx.emc_insert_min = 0; |
6300 | 0 | } |
6301 | |
|
6302 | 0 | for (i = 0; i < port->n_rxq; i++) { |
6303 | |
|
6304 | 0 | if (!netdev_rxq_enabled(port->rxqs[i].rx)) { |
6305 | 0 | continue; |
6306 | 0 | } |
6307 | | |
6308 | 0 | if (dp_netdev_process_rxq_port(non_pmd, |
6309 | 0 | &port->rxqs[i], |
6310 | 0 | port->port_no)) { |
6311 | 0 | need_to_flush = false; |
6312 | 0 | } |
6313 | 0 | } |
6314 | 0 | } |
6315 | 0 | } |
6316 | 0 | if (need_to_flush) { |
6317 | | /* We didn't receive anything in the process loop. |
6318 | | * Check if we need to send something. |
6319 | | * There were no time updates on the current iteration. */
6320 | 0 | pmd_thread_ctx_time_update(non_pmd); |
6321 | 0 | dp_netdev_pmd_flush_output_packets(non_pmd, false); |
6322 | 0 | } |
6323 | |
|
6324 | 0 | dpif_netdev_xps_revalidate_pmd(non_pmd, false); |
6325 | 0 | ovs_mutex_unlock(&dp->non_pmd_mutex); |
6326 | |
|
6327 | 0 | dp_netdev_pmd_unref(non_pmd); |
6328 | 0 | } |
6329 | |
|
6330 | 0 | struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; |
6331 | 0 | if (pmd_alb->is_enabled) { |
6332 | 0 | if (!pmd_alb->rebalance_poll_timer) { |
6333 | 0 | pmd_alb->rebalance_poll_timer = now; |
6334 | 0 | } else if ((pmd_alb->rebalance_poll_timer + |
6335 | 0 | pmd_alb->rebalance_intvl) < now) { |
6336 | 0 | pmd_alb->rebalance_poll_timer = now; |
6337 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
6338 | 0 | if (atomic_count_get(&pmd->pmd_overloaded) >= |
6339 | 0 | PMD_INTERVAL_MAX) { |
6340 | 0 | pmd_rebalance = true; |
6341 | 0 | break; |
6342 | 0 | } |
6343 | 0 | } |
6344 | |
|
6345 | 0 | if (pmd_rebalance && |
6346 | 0 | !dp_netdev_is_reconf_required(dp) && |
6347 | 0 | !ports_require_restart(dp) && |
6348 | 0 | pmd_rebalance_dry_run_needed(dp) && |
6349 | 0 | pmd_rebalance_dry_run(dp)) { |
6350 | 0 | VLOG_INFO("PMD auto load balance dry run. " |
6351 | 0 | "Requesting datapath reconfigure."); |
6352 | 0 | dp_netdev_request_reconfigure(dp); |
6353 | 0 | } |
6354 | 0 | } |
6355 | 0 | } |
6356 | |
|
6357 | 0 | if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) { |
6358 | 0 | reconfigure_datapath(dp); |
6359 | 0 | } |
6360 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
6361 | |
|
6362 | 0 | tnl_neigh_cache_run(); |
6363 | 0 | tnl_port_map_run(); |
6364 | 0 | new_tnl_seq = seq_read(tnl_conf_seq); |
6365 | |
|
6366 | 0 | if (dp->last_tnl_conf_seq != new_tnl_seq) { |
6367 | 0 | dp->last_tnl_conf_seq = new_tnl_seq; |
6368 | 0 | return true; |
6369 | 0 | } |
6370 | 0 | return false; |
6371 | 0 | } |
6372 | | |
6373 | | static void |
6374 | | dpif_netdev_wait(struct dpif *dpif) |
6375 | 0 | { |
6376 | 0 | struct dp_netdev_port *port; |
6377 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
6378 | |
|
6379 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
6380 | 0 | ovs_rwlock_rdlock(&dp->port_rwlock); |
6381 | 0 | HMAP_FOR_EACH (port, node, &dp->ports) { |
6382 | 0 | netdev_wait_reconf_required(port->netdev); |
6383 | 0 | if (!netdev_is_pmd(port->netdev)) { |
6384 | 0 | int i; |
6385 | |
|
6386 | 0 | for (i = 0; i < port->n_rxq; i++) { |
6387 | 0 | netdev_rxq_wait(port->rxqs[i].rx); |
6388 | 0 | } |
6389 | 0 | } |
6390 | 0 | } |
6391 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
6392 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
6393 | 0 | seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq); |
6394 | 0 | } |
6395 | | |
6396 | | static void |
6397 | | pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd) |
6398 | 0 | { |
6399 | 0 | struct tx_port *tx_port_cached; |
6400 | | |
6401 | | /* Flush all the queued packets. */ |
6402 | 0 | dp_netdev_pmd_flush_output_packets(pmd, true); |
6403 | | /* Free all used tx queue ids. */ |
6404 | 0 | dpif_netdev_xps_revalidate_pmd(pmd, true); |
6405 | |
|
6406 | 0 | HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) { |
6407 | 0 | free(tx_port_cached->txq_pkts); |
6408 | 0 | free(tx_port_cached); |
6409 | 0 | } |
6410 | 0 | HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) { |
6411 | 0 | free(tx_port_cached->txq_pkts); |
6412 | 0 | free(tx_port_cached); |
6413 | 0 | } |
6414 | 0 | } |
6415 | | |
6416 | | /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to |
6417 | | * thread-local copies. A port is copied to 'pmd->tnl_port_cache' if it is
6418 | | * a tunnel device, and to 'pmd->send_port_cache' if it has at least
6419 | | * one txq. */
6420 | | static void |
6421 | | pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd) |
6422 | | OVS_REQUIRES(pmd->port_mutex) |
6423 | 0 | { |
6424 | 0 | struct tx_port *tx_port, *tx_port_cached; |
6425 | |
|
6426 | 0 | pmd_free_cached_ports(pmd); |
6427 | 0 | hmap_shrink(&pmd->send_port_cache); |
6428 | 0 | hmap_shrink(&pmd->tnl_port_cache); |
6429 | |
|
6430 | 0 | HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) { |
6431 | 0 | int n_txq = netdev_n_txq(tx_port->port->netdev); |
6432 | 0 | struct dp_packet_batch *txq_pkts_cached; |
6433 | |
|
6434 | 0 | if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) { |
6435 | 0 | tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached); |
6436 | 0 | if (tx_port->txq_pkts) { |
6437 | 0 | txq_pkts_cached = xmemdup(tx_port->txq_pkts, |
6438 | 0 | n_txq * sizeof *tx_port->txq_pkts); |
6439 | 0 | tx_port_cached->txq_pkts = txq_pkts_cached; |
6440 | 0 | } |
6441 | 0 | hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node, |
6442 | 0 | hash_port_no(tx_port_cached->port->port_no)); |
6443 | 0 | } |
6444 | |
|
6445 | 0 | if (n_txq) { |
6446 | 0 | tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached); |
6447 | 0 | if (tx_port->txq_pkts) { |
6448 | 0 | txq_pkts_cached = xmemdup(tx_port->txq_pkts, |
6449 | 0 | n_txq * sizeof *tx_port->txq_pkts); |
6450 | 0 | tx_port_cached->txq_pkts = txq_pkts_cached; |
6451 | 0 | } |
6452 | 0 | hmap_insert(&pmd->send_port_cache, &tx_port_cached->node, |
6453 | 0 | hash_port_no(tx_port_cached->port->port_no)); |
6454 | 0 | } |
6455 | 0 | } |
6456 | 0 | } |
6457 | | |
6458 | | static void |
6459 | | pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd) |
6460 | 0 | { |
6461 | 0 | ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); |
6462 | 0 | if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) { |
6463 | 0 | VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d" |
6464 | 0 | ", numa_id %d.", pmd->core_id, pmd->numa_id); |
6465 | 0 | } |
6466 | 0 | ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); |
6467 | |
|
6468 | 0 | VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d" |
6469 | 0 | ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id); |
6470 | 0 | } |
6471 | | |
6472 | | static void |
6473 | | pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd) |
6474 | 0 | { |
6475 | 0 | ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); |
6476 | 0 | id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid); |
6477 | 0 | ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); |
6478 | 0 | } |
6479 | | |
6480 | | static int |
6481 | | pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd, |
6482 | | struct polled_queue **ppoll_list) |
6483 | 0 | { |
6484 | 0 | struct polled_queue *poll_list = *ppoll_list; |
6485 | 0 | struct rxq_poll *poll; |
6486 | 0 | int i; |
6487 | |
|
6488 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
6489 | 0 | poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list) |
6490 | 0 | * sizeof *poll_list); |
6491 | |
|
6492 | 0 | i = 0; |
6493 | 0 | HMAP_FOR_EACH (poll, node, &pmd->poll_list) { |
6494 | 0 | poll_list[i].rxq = poll->rxq; |
6495 | 0 | poll_list[i].port_no = poll->rxq->port->port_no; |
6496 | 0 | poll_list[i].emc_enabled = poll->rxq->port->emc_enabled; |
6497 | 0 | poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx); |
6498 | 0 | poll_list[i].change_seq = |
6499 | 0 | netdev_get_change_seq(poll->rxq->port->netdev); |
6500 | 0 | i++; |
6501 | 0 | } |
6502 | |
|
6503 | 0 | pmd_load_cached_ports(pmd); |
6504 | |
|
6505 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
6506 | |
|
6507 | 0 | *ppoll_list = poll_list; |
6508 | 0 | return i; |
6509 | 0 | } |
6510 | | |
6511 | | static void * |
6512 | | pmd_thread_main(void *f_) |
6513 | 0 | { |
6514 | 0 | struct dp_netdev_pmd_thread *pmd = f_; |
6515 | 0 | struct pmd_perf_stats *s = &pmd->perf_stats; |
6516 | 0 | unsigned int lc = 0; |
6517 | 0 | struct polled_queue *poll_list; |
6518 | 0 | bool wait_for_reload = false; |
6519 | 0 | bool dpdk_attached; |
6520 | 0 | bool reload_tx_qid; |
6521 | 0 | bool exiting; |
6522 | 0 | bool reload; |
6523 | 0 | int poll_cnt; |
6524 | 0 | int i; |
6525 | 0 | int process_packets = 0; |
6526 | 0 | uint64_t sleep_time = 0; |
6527 | |
|
6528 | 0 | poll_list = NULL; |
6529 | | |
6530 | | /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ |
6531 | 0 | ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); |
6532 | 0 | ovs_numa_thread_setaffinity_core(pmd->core_id); |
6533 | 0 | dpdk_attached = dpdk_attach_thread(pmd->core_id); |
6534 | 0 | poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); |
6535 | 0 | dfc_cache_init(&pmd->flow_cache); |
6536 | 0 | pmd_alloc_static_tx_qid(pmd); |
6537 | 0 | set_timer_resolution(PMD_TIMER_RES_NS); |
6538 | |
|
6539 | 0 | reload: |
6540 | 0 | atomic_count_init(&pmd->pmd_overloaded, 0); |
6541 | |
|
6542 | 0 | pmd->intrvl_tsc_prev = 0; |
6543 | 0 | atomic_store_relaxed(&pmd->intrvl_cycles, 0); |
6544 | |
|
6545 | 0 | if (!dpdk_attached) { |
6546 | 0 | dpdk_attached = dpdk_attach_thread(pmd->core_id); |
6547 | 0 | } |
6548 | | |
6549 | | /* List port/core affinity */ |
6550 | 0 | for (i = 0; i < poll_cnt; i++) { |
6551 | 0 | VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n", |
6552 | 0 | pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx), |
6553 | 0 | netdev_rxq_get_queue_id(poll_list[i].rxq->rx)); |
6554 | | /* Reset the rxq current cycles counter. */ |
6555 | 0 | dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0); |
6556 | 0 | for (int j = 0; j < PMD_INTERVAL_MAX; j++) { |
6557 | 0 | dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0); |
6558 | 0 | } |
6559 | 0 | } |
6560 | |
|
6561 | 0 | if (!poll_cnt) { |
6562 | 0 | if (wait_for_reload) { |
6563 | | /* Don't sleep, control thread will ask for a reload shortly. */ |
6564 | 0 | do { |
6565 | 0 | atomic_read_explicit(&pmd->reload, &reload, |
6566 | 0 | memory_order_acquire); |
6567 | 0 | } while (!reload); |
6568 | 0 | } else { |
6569 | 0 | while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) { |
6570 | 0 | seq_wait(pmd->reload_seq, pmd->last_reload_seq); |
6571 | 0 | poll_block(); |
6572 | 0 | } |
6573 | 0 | } |
6574 | 0 | } |
6575 | |
|
6576 | 0 | for (i = 0; i < PMD_INTERVAL_MAX; i++) { |
6577 | 0 | atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0); |
6578 | 0 | } |
6579 | 0 | atomic_count_set(&pmd->intrvl_idx, 0); |
6580 | 0 | cycles_counter_update(s); |
6581 | |
|
6582 | 0 | pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; |
6583 | | |
6584 | | /* Protect pmd stats from external clearing while polling. */ |
6585 | 0 | ovs_mutex_lock(&pmd->perf_stats.stats_mutex); |
6586 | 0 | for (;;) { |
6587 | 0 | uint64_t rx_packets = 0, tx_packets = 0; |
6588 | 0 | uint64_t time_slept = 0; |
6589 | 0 | uint64_t max_sleep; |
6590 | |
|
6591 | 0 | pmd_perf_start_iteration(s); |
6592 | |
|
6593 | 0 | atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db); |
6594 | 0 | atomic_read_relaxed(&pmd->max_sleep, &max_sleep); |
6595 | |
|
6596 | 0 | for (i = 0; i < poll_cnt; i++) { |
6597 | |
|
6598 | 0 | if (!poll_list[i].rxq_enabled) { |
6599 | 0 | continue; |
6600 | 0 | } |
6601 | | |
6602 | 0 | if (poll_list[i].emc_enabled) { |
6603 | 0 | atomic_read_relaxed(&pmd->dp->emc_insert_min, |
6604 | 0 | &pmd->ctx.emc_insert_min); |
6605 | 0 | } else { |
6606 | 0 | pmd->ctx.emc_insert_min = 0; |
6607 | 0 | } |
6608 | |
|
6609 | 0 | process_packets = |
6610 | 0 | dp_netdev_process_rxq_port(pmd, poll_list[i].rxq, |
6611 | 0 | poll_list[i].port_no); |
6612 | 0 | rx_packets += process_packets; |
6613 | 0 | if (process_packets >= PMD_SLEEP_THRESH) { |
6614 | 0 | sleep_time = 0; |
6615 | 0 | } |
6616 | 0 | } |
6617 | |
|
6618 | 0 | if (!rx_packets) { |
6619 | | /* We didn't receive anything in the process loop. |
6620 | | * Check if we need to send something. |
6621 | | * There were no time updates on the current iteration. */
6622 | 0 | pmd_thread_ctx_time_update(pmd); |
6623 | 0 | tx_packets = dp_netdev_pmd_flush_output_packets(pmd, |
6624 | 0 | max_sleep && sleep_time |
6625 | 0 | ? true : false); |
6626 | 0 | } |
6627 | |
|
6628 | 0 | if (max_sleep) { |
6629 | | /* Check if a sleep should happen on this iteration. */ |
6630 | 0 | if (sleep_time) { |
6631 | 0 | struct cycle_timer sleep_timer; |
6632 | |
|
6633 | 0 | cycle_timer_start(&pmd->perf_stats, &sleep_timer); |
6634 | 0 | xnanosleep_no_quiesce(sleep_time * 1000); |
6635 | 0 | time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer); |
6636 | 0 | pmd_thread_ctx_time_update(pmd); |
6637 | 0 | } |
6638 | 0 | if (sleep_time < max_sleep) { |
6639 | | /* Increase sleep time for next iteration. */ |
6640 | 0 | sleep_time += PMD_SLEEP_INC_US; |
6641 | 0 | } else { |
6642 | 0 | sleep_time = max_sleep; |
6643 | 0 | } |
6644 | 0 | } else { |
6645 | | /* Reset sleep time as max sleep policy may have been changed. */ |
6646 | 0 | sleep_time = 0; |
6647 | 0 | } |
6648 | | |
6649 | | /* Do RCU synchronization at a fixed interval. This ensures that
6650 | | * synchronization is not delayed for long even under a high
6651 | | * packet-processing load. */
6652 | 0 | if (pmd->ctx.now > pmd->next_rcu_quiesce) { |
6653 | 0 | if (!ovsrcu_try_quiesce()) { |
6654 | 0 | pmd->next_rcu_quiesce = |
6655 | 0 | pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; |
6656 | 0 | } |
6657 | 0 | } |
6658 | |
|
6659 | 0 | if (lc++ > 1024) { |
6660 | 0 | lc = 0; |
6661 | |
|
6662 | 0 | coverage_try_clear(); |
6663 | 0 | dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt); |
6664 | 0 | if (!ovsrcu_try_quiesce()) { |
6665 | 0 | emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache)); |
6666 | 0 | pmd->next_rcu_quiesce = |
6667 | 0 | pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; |
6668 | 0 | } |
6669 | |
|
6670 | 0 | for (i = 0; i < poll_cnt; i++) { |
6671 | 0 | uint64_t current_seq = |
6672 | 0 | netdev_get_change_seq(poll_list[i].rxq->port->netdev); |
6673 | 0 | if (poll_list[i].change_seq != current_seq) { |
6674 | 0 | poll_list[i].change_seq = current_seq; |
6675 | 0 | poll_list[i].rxq_enabled = |
6676 | 0 | netdev_rxq_enabled(poll_list[i].rxq->rx); |
6677 | 0 | } |
6678 | 0 | } |
6679 | 0 | } |
6680 | |
|
6681 | 0 | atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire); |
6682 | 0 | if (OVS_UNLIKELY(reload)) { |
6683 | 0 | break; |
6684 | 0 | } |
6685 | | |
6686 | 0 | pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept, |
6687 | 0 | pmd_perf_metrics_enabled(pmd)); |
6688 | 0 | } |
6689 | 0 | ovs_mutex_unlock(&pmd->perf_stats.stats_mutex); |
6690 | |
|
6691 | 0 | poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); |
6692 | 0 | atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload); |
6693 | 0 | atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid); |
6694 | 0 | atomic_read_relaxed(&pmd->exit, &exiting); |
6695 | | /* Signal here to make sure the pmd finishes |
6696 | | * reloading the updated configuration. */ |
6697 | 0 | dp_netdev_pmd_reload_done(pmd); |
6698 | |
|
6699 | 0 | if (reload_tx_qid) { |
6700 | 0 | pmd_free_static_tx_qid(pmd); |
6701 | 0 | pmd_alloc_static_tx_qid(pmd); |
6702 | 0 | } |
6703 | |
|
6704 | 0 | if (!exiting) { |
6705 | 0 | goto reload; |
6706 | 0 | } |
6707 | | |
6708 | 0 | pmd_free_static_tx_qid(pmd); |
6709 | 0 | dfc_cache_uninit(&pmd->flow_cache); |
6710 | 0 | free(poll_list); |
6711 | 0 | pmd_free_cached_ports(pmd); |
6712 | 0 | if (dpdk_attached) { |
6713 | 0 | dpdk_detach_thread(); |
6714 | 0 | } |
6715 | 0 | return NULL; |
6716 | 0 | } |
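
The load-based sleeping in the loop above follows a simple policy: the sleep request is cleared whenever any rx queue delivers at least PMD_SLEEP_THRESH packets, and otherwise ramps up by PMD_SLEEP_INC_US per iteration until it reaches the configured maximum. A minimal standalone model of that policy; SLEEP_THRESH and SLEEP_INC_US below are stand-in constants, not the values OVS uses:

    #include <stdint.h>
    #include <stdio.h>

    #define SLEEP_THRESH 16   /* Stand-in for PMD_SLEEP_THRESH. */
    #define SLEEP_INC_US 1    /* Stand-in for PMD_SLEEP_INC_US. */

    /* One iteration of the policy: 'burst' is the largest rx batch seen in
     * this iteration, 'sleep_time' the request carried over from the last
     * one.  Returns the request for the next iteration. */
    static uint64_t
    next_sleep_request(int burst, uint64_t sleep_time, uint64_t max_sleep)
    {
        if (burst >= SLEEP_THRESH) {
            sleep_time = 0;   /* Busy queue: no sleep on this iteration. */
        }
        if (!max_sleep) {
            return 0;         /* Sleeping disabled by configuration. */
        }
        /* (The real loop nanosleeps for 'sleep_time' microseconds here if
         * it is non-zero.)  Ramp up for the next iteration, capped. */
        return sleep_time < max_sleep ? sleep_time + SLEEP_INC_US : max_sleep;
    }

    int
    main(void)
    {
        uint64_t sleep_time = 0;
        int bursts[] = { 0, 0, 0, 32, 0 };

        for (int i = 0; i < 5; i++) {
            sleep_time = next_sleep_request(bursts[i], sleep_time, 3);
            printf("iteration %d: next sleep request %llu us\n", i,
                   (unsigned long long) sleep_time);
        }
        return 0;
    }
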
6717 | | |
6718 | | static void |
6719 | | dp_netdev_disable_upcall(struct dp_netdev *dp) |
6720 | | OVS_ACQUIRES(dp->upcall_rwlock) |
6721 | 0 | { |
6722 | 0 | fat_rwlock_wrlock(&dp->upcall_rwlock); |
6723 | 0 | } |
6724 | | |
6725 | | |
6726 | | /* Meters */ |
6727 | | static void |
6728 | | dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED, |
6729 | | struct ofputil_meter_features *features) |
6730 | 0 | { |
6731 | 0 | features->max_meters = MAX_METERS; |
6732 | 0 | features->band_types = DP_SUPPORTED_METER_BAND_TYPES; |
6733 | 0 | features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK; |
6734 | 0 | features->max_bands = MAX_BANDS; |
6735 | 0 | features->max_color = 0; |
6736 | 0 | } |
6737 | | |
6738 | | /* Tries to atomically add 'n' to 'value' using saturation arithmetic,
6739 | | * i.e., if the result would be larger than 'max_value', stores 'max_value'
6740 | | * instead. */
6741 | | static void |
6742 | | atomic_sat_add(atomic_uint64_t *value, uint64_t n, uint64_t max_value) |
6743 | 0 | { |
6744 | 0 | uint64_t current, new_value; |
6745 | |
|
6746 | 0 | atomic_read_relaxed(value, ¤t); |
6747 | 0 | do { |
6748 | 0 | new_value = current + n; |
6749 | 0 | new_value = MIN(new_value, max_value); |
6750 | 0 | } while (!atomic_compare_exchange_weak_relaxed(value, ¤t, |
6751 | 0 | new_value)); |
6752 | 0 | } |
6753 | | |
6754 | | /* Tries to atomically subtract 'n' from 'value'. Does not perform the |
6755 | | * operation and returns 'false' if the result will be less than 'min_value'. |
6756 | | * Otherwise, stores the result and returns 'true'. */ |
6757 | | static bool |
6758 | | atomic_bound_sub(atomic_uint64_t *value, uint64_t n, uint64_t min_value) |
6759 | 0 | { |
6760 | 0 | uint64_t current; |
6761 | |
|
6762 | 0 | atomic_read_relaxed(value, ¤t); |
6763 | 0 | do { |
6764 | 0 | if (current < min_value + n) { |
6765 | 0 | return false; |
6766 | 0 | } |
6767 | 0 | } while (!atomic_compare_exchange_weak_relaxed(value, ¤t, |
6768 | 0 | current - n)); |
6769 | 0 | return true; |
6770 | 0 | } |
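
atomic_sat_add() and atomic_bound_sub() above are the two token-bucket primitives used by the meter code below: a refill that saturates at the bucket capacity and a drain that refuses to go under a floor. A minimal standalone sketch of the same compare-and-swap loops written with plain C11 <stdatomic.h> rather than the OVS atomics wrappers:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Add 'n' tokens but never exceed 'max_value' (saturating add). */
    static void
    sat_add(_Atomic uint64_t *value, uint64_t n, uint64_t max_value)
    {
        uint64_t current = atomic_load_explicit(value, memory_order_relaxed);
        uint64_t desired;

        do {
            desired = current + n;
            if (desired > max_value) {
                desired = max_value;
            }
        } while (!atomic_compare_exchange_weak_explicit(
                     value, &current, desired,
                     memory_order_relaxed, memory_order_relaxed));
    }

    /* Take 'n' tokens, failing if that would drop below 'min_value'. */
    static bool
    bound_sub(_Atomic uint64_t *value, uint64_t n, uint64_t min_value)
    {
        uint64_t current = atomic_load_explicit(value, memory_order_relaxed);

        do {
            if (current < min_value + n) {
                return false;
            }
        } while (!atomic_compare_exchange_weak_explicit(
                     value, &current, current - n,
                     memory_order_relaxed, memory_order_relaxed));
        return true;
    }

    int
    main(void)
    {
        _Atomic uint64_t bucket;

        atomic_init(&bucket, 800);
        sat_add(&bucket, 500, 1000);                /* Capped at 1000. */
        printf("%d\n", bound_sub(&bucket, 600, 0)); /* 1: 1000 -> 400. */
        printf("%d\n", bound_sub(&bucket, 600, 0)); /* 0: only 400 left. */
        return 0;
    }
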
6771 | | |
6772 | | /* Applies the meter identified by 'meter_id' to 'packets_'. Packets |
6773 | | * that exceed a band are dropped in-place. */ |
6774 | | static void |
6775 | | dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_, |
6776 | | uint32_t meter_id, long long int now_ms) |
6777 | 0 | { |
6778 | 0 | const size_t cnt = dp_packet_batch_size(packets_); |
6779 | 0 | uint32_t exceeded_rate[NETDEV_MAX_BURST]; |
6780 | 0 | uint32_t exceeded_band[NETDEV_MAX_BURST]; |
6781 | 0 | uint64_t bytes, volume, meter_used, old; |
6782 | 0 | uint64_t band_packets[MAX_BANDS]; |
6783 | 0 | uint64_t band_bytes[MAX_BANDS]; |
6784 | 0 | struct dp_meter_band *band; |
6785 | 0 | struct dp_packet *packet; |
6786 | 0 | struct dp_meter *meter; |
6787 | 0 | bool exceeded = false; |
6788 | |
|
6789 | 0 | if (meter_id >= MAX_METERS) { |
6790 | 0 | return; |
6791 | 0 | } |
6792 | | |
6793 | 0 | meter = dp_meter_lookup(&dp->meters, meter_id); |
6794 | 0 | if (!meter) { |
6795 | 0 | return; |
6796 | 0 | } |
6797 | | |
6798 | | /* Initialize as negative values. */ |
6799 | 0 | memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band); |
6800 | | /* Initialize as zeroes. */ |
6801 | 0 | memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate); |
6802 | |
|
6803 | 0 | atomic_read_relaxed(&meter->used, &meter_used); |
6804 | 0 | do { |
6805 | 0 | if (meter_used >= now_ms) { |
6806 | | /* The '>' condition means that we have several threads hitting the |
6807 | | * same meter, and the other one already advanced the time. */ |
6808 | 0 | meter_used = now_ms; |
6809 | 0 | break; |
6810 | 0 | } |
6811 | 0 | } while (!atomic_compare_exchange_weak_relaxed(&meter->used, |
6812 | 0 | &meter_used, now_ms)); |
6813 | | |
6814 | | /* Refill all buckets right away, since other threads may use them. */ |
6815 | 0 | if (meter_used < now_ms) { |
6816 | | /* All packets will hit the meter at the same time. */ |
6817 | 0 | uint64_t delta_t = now_ms - meter_used; |
6818 | | |
6819 | | /* Make sure delta_t will not be too large, so that bucket will not |
6820 | | * wrap around below. */ |
6821 | 0 | delta_t = MIN(delta_t, meter->max_delta_t); |
6822 | |
|
6823 | 0 | for (int m = 0; m < meter->n_bands; m++) { |
6824 | 0 | band = &meter->bands[m]; |
6825 | | /* Update band's bucket. We can't just use atomic add here, |
6826 | | * because we should never add above the max capacity. */ |
6827 | 0 | atomic_sat_add(&band->bucket, delta_t * band->rate, |
6828 | 0 | band->burst_size * 1000ULL); |
6829 | 0 | } |
6830 | 0 | } |
6831 | | |
6832 | | /* Update meter stats. */ |
6833 | 0 | atomic_add_relaxed(&meter->packet_count, cnt, &old); |
6834 | 0 | bytes = 0; |
6835 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
6836 | 0 | bytes += dp_packet_size(packet); |
6837 | 0 | } |
6838 | 0 | atomic_add_relaxed(&meter->byte_count, bytes, &old); |
6839 | | |
6840 | | /* Meters can operate in terms of packets per second or kilobits per |
6841 | | * second. */ |
6842 | 0 | if (meter->flags & OFPMF13_PKTPS) { |
6843 | | /* Rate in packets/second, bucket 1/1000 packets. |
6844 | | * msec * packets/sec = 1/1000 packets. */ |
6845 | 0 | volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */ |
6846 | 0 | } else { |
6847 | | /* Rate in kbps, bucket in bits. |
6848 | | * msec * kbps = bits */ |
6849 | 0 | volume = bytes * 8; |
6850 | 0 | } |
6851 | | |
6852 | | /* Find the band hit with the highest rate for each packet (if any). */ |
6853 | 0 | for (int m = 0; m < meter->n_bands; m++) { |
6854 | 0 | band = &meter->bands[m]; |
6855 | | |
6856 | | /* Drain the bucket for all the packets, if possible. */ |
6857 | 0 | if (atomic_bound_sub(&band->bucket, volume, 0)) { |
6858 | 0 | continue; |
6859 | 0 | } |
6860 | | |
6861 | | /* Band limit hit, must process packet-by-packet. */ |
6862 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
6863 | 0 | uint64_t packet_volume = (meter->flags & OFPMF13_PKTPS) |
6864 | 0 | ? 1000 : (dp_packet_size(packet) * 8); |
6865 | |
|
6866 | 0 | if (!atomic_bound_sub(&band->bucket, packet_volume, 0)) { |
6867 | | /* Update the exceeding band for the exceeding packet. |
6868 | | * Only one band will be fired by a packet, and that can |
6869 | | * be different for each packet. */ |
6870 | 0 | if (band->rate > exceeded_rate[i]) { |
6871 | 0 | exceeded_rate[i] = band->rate; |
6872 | 0 | exceeded_band[i] = m; |
6873 | 0 | exceeded = true; |
6874 | 0 | } |
6875 | 0 | } |
6876 | 0 | } |
6877 | 0 | } |
6878 | | |
6879 | | /* No need to iterate over packets if there are no drops. */ |
6880 | 0 | if (!exceeded) { |
6881 | 0 | return; |
6882 | 0 | } |
6883 | | |
6884 | | /* Fire the highest rate band exceeded by each packet, and drop |
6885 | | * packets if needed. */ |
6886 | | |
6887 | 0 | memset(band_packets, 0, sizeof band_packets); |
6888 | 0 | memset(band_bytes, 0, sizeof band_bytes); |
6889 | |
|
6890 | 0 | size_t j; |
6891 | 0 | DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) { |
6892 | 0 | uint32_t m = exceeded_band[j]; |
6893 | |
|
6894 | 0 | if (m != UINT32_MAX) { |
6895 | | /* Meter drop packet. */ |
6896 | 0 | band_packets[m]++; |
6897 | 0 | band_bytes[m] += dp_packet_size(packet); |
6898 | 0 | dp_packet_delete(packet); |
6899 | 0 | } else { |
6900 | | /* Meter accepts packet. */ |
6901 | 0 | dp_packet_batch_refill(packets_, packet, j); |
6902 | 0 | } |
6903 | 0 | } |
6904 | |
|
6905 | 0 | for (int m = 0; m < meter->n_bands; m++) { |
6906 | 0 | if (!band_packets[m]) { |
6907 | 0 | continue; |
6908 | 0 | } |
6909 | 0 | band = &meter->bands[m]; |
6910 | 0 | atomic_add_relaxed(&band->packet_count, band_packets[m], &old); |
6911 | 0 | atomic_add_relaxed(&band->byte_count, band_bytes[m], &old); |
6912 | 0 | COVERAGE_ADD(datapath_drop_meter, band_packets[m]); |
6913 | 0 | } |
6914 | 0 | } |
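
A worked example of the unit conventions in dp_netdev_run_meter() above, with made-up rate and batch sizes: buckets hold bits for kbps meters and 1/1000 packets for OFPMF13_PKTPS meters, so "elapsed milliseconds x rate" is already in bucket units:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* kbps meter: rate in kbit/s, bucket in bits.
         * A 10 ms gap at 1000 kbps refills 10 * 1000 = 10000 bits. */
        uint64_t delta_ms = 10, rate_kbps = 1000;
        uint64_t refill_bits = delta_ms * rate_kbps;

        /* A batch of four 1500-byte packets drains bytes * 8 bits. */
        uint64_t batch_bits = 4 * 1500 * 8;

        printf("refill %"PRIu64" bits, batch drains %"PRIu64" bits\n",
               refill_bits, batch_bits);

        /* pktps meter: rate in packets/s, bucket in 1/1000 packets.
         * The same 10 ms gap at 100 pps refills 1000 milli-packets;
         * the four-packet batch drains 4 * 1000 milli-packets. */
        uint64_t rate_pps = 100;
        uint64_t refill_mpkts = delta_ms * rate_pps;
        uint64_t batch_mpkts = 4 * 1000;

        printf("refill %"PRIu64" milli-packets, batch drains %"PRIu64"\n",
               refill_mpkts, batch_mpkts);
        return 0;
    }
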
6915 | | |
6916 | | /* Meter set/get/del processing is still single-threaded. */ |
6917 | | static int |
6918 | | dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id, |
6919 | | struct ofputil_meter_config *config) |
6920 | 0 | { |
6921 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
6922 | 0 | uint32_t mid = meter_id.uint32; |
6923 | 0 | struct dp_meter *meter; |
6924 | 0 | int i; |
6925 | |
|
6926 | 0 | if (mid >= MAX_METERS) { |
6927 | 0 | return EFBIG; /* Meter_id out of range. */ |
6928 | 0 | } |
6929 | | |
6930 | 0 | if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) { |
6931 | 0 | return EBADF; /* Unsupported flags set */ |
6932 | 0 | } |
6933 | | |
6934 | 0 | if (config->n_bands > MAX_BANDS) { |
6935 | 0 | return EINVAL; |
6936 | 0 | } |
6937 | | |
6938 | 0 | for (i = 0; i < config->n_bands; ++i) { |
6939 | 0 | switch (config->bands[i].type) { |
6940 | 0 | case OFPMBT13_DROP: |
6941 | 0 | break; |
6942 | 0 | default: |
6943 | 0 | return ENODEV; /* Unsupported band type */ |
6944 | 0 | } |
6945 | 0 | } |
6946 | | |
6947 | | /* Allocate meter */ |
6948 | 0 | meter = xzalloc(sizeof *meter |
6949 | 0 | + config->n_bands * sizeof(struct dp_meter_band)); |
6950 | |
|
6951 | 0 | meter->flags = config->flags; |
6952 | 0 | meter->n_bands = config->n_bands; |
6953 | 0 | meter->max_delta_t = 0; |
6954 | 0 | meter->id = mid; |
6955 | 0 | atomic_init(&meter->used, time_msec()); |
6956 | | |
6957 | | /* set up bands */ |
6958 | 0 | for (i = 0; i < config->n_bands; ++i) { |
6959 | 0 | uint32_t band_max_delta_t; |
6960 | 0 | uint64_t bucket_size; |
6961 | | |
6962 | | /* Set burst size to a workable value if none specified. */ |
6963 | 0 | if (config->bands[i].burst_size == 0) { |
6964 | 0 | config->bands[i].burst_size = config->bands[i].rate; |
6965 | 0 | } |
6966 | |
|
6967 | 0 | meter->bands[i].rate = config->bands[i].rate; |
6968 | 0 | meter->bands[i].burst_size = config->bands[i].burst_size; |
6969 | | /* Start with a full bucket. */ |
6970 | 0 | bucket_size = meter->bands[i].burst_size * 1000ULL; |
6971 | 0 | atomic_init(&meter->bands[i].bucket, bucket_size); |
6972 | | |
6973 | | /* Figure out max delta_t that is enough to fill any bucket. */ |
6974 | 0 | band_max_delta_t = bucket_size / meter->bands[i].rate; |
6975 | 0 | if (band_max_delta_t > meter->max_delta_t) { |
6976 | 0 | meter->max_delta_t = band_max_delta_t; |
6977 | 0 | } |
6978 | 0 | } |
6979 | |
|
6980 | 0 | ovs_mutex_lock(&dp->meters_lock); |
6981 | |
|
6982 | 0 | dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any. */ |
6983 | 0 | dp_meter_attach(&dp->meters, meter); |
6984 | |
|
6985 | 0 | ovs_mutex_unlock(&dp->meters_lock); |
6986 | |
|
6987 | 0 | return 0; |
6988 | 0 | } |
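
A worked example of the 'band_max_delta_t' computation above, with a made-up band configuration: the bucket starts full at 'burst_size * 1000', and the cap is the time a completely drained bucket needs to refill at 'rate' (the same formula covers pktps bands in their milli-packet units):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* kbps band: rate 1000 kbps, burst 500 kbits.
         * bucket_size = 500 * 1000 = 500000 bits;
         * max_delta_t = 500000 bits / 1000 kbps = 500 ms. */
        uint64_t rate = 1000, burst_size = 500;
        uint64_t bucket_size = burst_size * 1000;
        uint32_t band_max_delta_t = bucket_size / rate;

        printf("band_max_delta_t = %"PRIu32" ms\n", band_max_delta_t);
        return 0;
    }
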
6989 | | |
6990 | | static int |
6991 | | dpif_netdev_meter_get(const struct dpif *dpif, |
6992 | | ofproto_meter_id meter_id_, |
6993 | | struct ofputil_meter_stats *stats, uint16_t n_bands) |
6994 | 0 | { |
6995 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
6996 | 0 | uint32_t meter_id = meter_id_.uint32; |
6997 | 0 | struct dp_meter *meter; |
6998 | |
|
6999 | 0 | if (meter_id >= MAX_METERS) { |
7000 | 0 | return EFBIG; |
7001 | 0 | } |
7002 | | |
7003 | 0 | meter = dp_meter_lookup(&dp->meters, meter_id); |
7004 | 0 | if (!meter) { |
7005 | 0 | return ENOENT; |
7006 | 0 | } |
7007 | | |
7008 | 0 | if (stats) { |
7009 | 0 | int i = 0; |
7010 | |
|
7011 | 0 | atomic_read_relaxed(&meter->packet_count, &stats->packet_in_count); |
7012 | 0 | atomic_read_relaxed(&meter->byte_count, &stats->byte_in_count); |
7013 | |
|
7014 | 0 | for (i = 0; i < n_bands && i < meter->n_bands; ++i) { |
7015 | 0 | atomic_read_relaxed(&meter->bands[i].packet_count, |
7016 | 0 | &stats->bands[i].packet_count); |
7017 | 0 | atomic_read_relaxed(&meter->bands[i].byte_count, |
7018 | 0 | &stats->bands[i].byte_count); |
7019 | 0 | } |
7020 | 0 | stats->n_bands = i; |
7021 | 0 | } |
7022 | |
|
7023 | 0 | return 0; |
7024 | 0 | } |
7025 | | |
7026 | | static int |
7027 | | dpif_netdev_meter_del(struct dpif *dpif, |
7028 | | ofproto_meter_id meter_id_, |
7029 | | struct ofputil_meter_stats *stats, uint16_t n_bands) |
7030 | 0 | { |
7031 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
7032 | 0 | int error; |
7033 | |
|
7034 | 0 | error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands); |
7035 | 0 | if (!error) { |
7036 | 0 | uint32_t meter_id = meter_id_.uint32; |
7037 | |
|
7038 | 0 | ovs_mutex_lock(&dp->meters_lock); |
7039 | 0 | dp_meter_detach_free(&dp->meters, meter_id); |
7040 | 0 | ovs_mutex_unlock(&dp->meters_lock); |
7041 | 0 | } |
7042 | 0 | return error; |
7043 | 0 | } |
7044 | | |
7045 | | |
7046 | | static void |
7047 | | dpif_netdev_disable_upcall(struct dpif *dpif) |
7048 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
7049 | 0 | { |
7050 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
7051 | 0 | dp_netdev_disable_upcall(dp); |
7052 | 0 | } |
7053 | | |
7054 | | static void |
7055 | | dp_netdev_enable_upcall(struct dp_netdev *dp) |
7056 | | OVS_RELEASES(dp->upcall_rwlock) |
7057 | 0 | { |
7058 | 0 | fat_rwlock_unlock(&dp->upcall_rwlock); |
7059 | 0 | } |
7060 | | |
7061 | | static void |
7062 | | dpif_netdev_enable_upcall(struct dpif *dpif) |
7063 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
7064 | 0 | { |
7065 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
7066 | 0 | dp_netdev_enable_upcall(dp); |
7067 | 0 | } |
7068 | | |
7069 | | static void |
7070 | | dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd) |
7071 | 0 | { |
7072 | 0 | atomic_store_relaxed(&pmd->wait_for_reload, false); |
7073 | 0 | atomic_store_relaxed(&pmd->reload_tx_qid, false); |
7074 | 0 | pmd->last_reload_seq = seq_read(pmd->reload_seq); |
7075 | 0 | atomic_store_explicit(&pmd->reload, false, memory_order_release); |
7076 | 0 | } |
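
dp_netdev_pmd_reload_done() above is the worker-side half of the release/acquire handshake on 'pmd->reload' that reload_affected_pmds() (earlier in this section) waits on: the release store that clears the flag publishes the thread's reconfigured state to the control thread's acquire load. A minimal standalone sketch of that pattern with C11 atomics and pthreads; the names here are illustrative, not the OVS ones:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct worker {
        atomic_bool reload;   /* Set by the control thread, cleared by the worker. */
        int config;           /* Plain data published by the handshake. */
    };

    static void *
    worker_main(void *arg)
    {
        struct worker *w = arg;

        /* Wait for a reload request (acquire pairs with the control
         * thread's release store). */
        while (!atomic_load_explicit(&w->reload, memory_order_acquire)) {
            ;
        }
        w->config = 42;       /* "Reconfigure". */

        /* Release store: makes the new 'config' visible to whoever
         * observes reload == false with an acquire load. */
        atomic_store_explicit(&w->reload, false, memory_order_release);
        return NULL;
    }

    int
    main(void)
    {
        struct worker w;
        pthread_t tid;

        w.config = 0;
        atomic_init(&w.reload, false);
        pthread_create(&tid, NULL, worker_main, &w);

        /* Request a reload, then wait for the worker to acknowledge it,
         * as reload_affected_pmds() does. */
        atomic_store_explicit(&w.reload, true, memory_order_release);
        while (atomic_load_explicit(&w.reload, memory_order_acquire)) {
            ;
        }
        printf("config = %d\n", w.config);   /* Prints 42. */

        pthread_join(tid, NULL);
        return 0;
    }
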
7077 | | |
7078 | | /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns |
7079 | | * the pointer if it succeeds, otherwise NULL (it can return NULL even if
7080 | | * 'core_id' is NON_PMD_CORE_ID).
7081 | | *
7082 | | * The caller must unref the returned reference. */
7083 | | static struct dp_netdev_pmd_thread * |
7084 | | dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id) |
7085 | 0 | { |
7086 | 0 | struct dp_netdev_pmd_thread *pmd; |
7087 | |
|
7088 | 0 | CMAP_FOR_EACH_WITH_HASH (pmd, node, hash_int(core_id, 0), |
7089 | 0 | &dp->poll_threads) { |
7090 | 0 | if (pmd->core_id == core_id) { |
7091 | 0 | return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL; |
7092 | 0 | } |
7093 | 0 | } |
7094 | | |
7095 | 0 | return NULL; |
7096 | 0 | } |
7097 | | |
7098 | | /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */ |
7099 | | static void |
7100 | | dp_netdev_set_nonpmd(struct dp_netdev *dp) |
7101 | | OVS_REQ_WRLOCK(dp->port_rwlock) |
7102 | 0 | { |
7103 | 0 | struct dp_netdev_pmd_thread *non_pmd; |
7104 | |
|
7105 | 0 | non_pmd = xzalloc(sizeof *non_pmd); |
7106 | 0 | dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC); |
7107 | 0 | } |
7108 | | |
7109 | | /* Caller must have valid pointer to 'pmd'. */ |
7110 | | static bool |
7111 | | dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd) |
7112 | 0 | { |
7113 | 0 | return ovs_refcount_try_ref_rcu(&pmd->ref_cnt); |
7114 | 0 | } |
7115 | | |
7116 | | static void |
7117 | | dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd) |
7118 | 0 | { |
7119 | 0 | if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) { |
7120 | 0 | ovsrcu_postpone(dp_netdev_destroy_pmd, pmd); |
7121 | 0 | } |
7122 | 0 | } |
7123 | | |
7124 | | /* Given cmap position 'pos', tries to ref the next node. If try_ref() |
7125 | | * fails, keeps checking for the next node until reaching the end of the cmap.
7126 | | *
7127 | | * The caller must unref the returned reference. */
7128 | | static struct dp_netdev_pmd_thread * |
7129 | | dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos) |
7130 | 0 | { |
7131 | 0 | struct dp_netdev_pmd_thread *next; |
7132 | |
|
7133 | 0 | do { |
7134 | 0 | struct cmap_node *node; |
7135 | |
|
7136 | 0 | node = cmap_next_position(&dp->poll_threads, pos); |
7137 | 0 | next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node) |
7138 | 0 | : NULL; |
7139 | 0 | } while (next && !dp_netdev_pmd_try_ref(next)); |
7140 | |
|
7141 | 0 | return next; |
7142 | 0 | } |
7143 | | |
7144 | | /* Configures the 'pmd' based on the input argument. */ |
7145 | | static void |
7146 | | dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp, |
7147 | | unsigned core_id, int numa_id) |
7148 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
7149 | 0 | { |
7150 | 0 | pmd->dp = dp; |
7151 | 0 | pmd->core_id = core_id; |
7152 | 0 | pmd->numa_id = numa_id; |
7153 | 0 | pmd->need_reload = false; |
7154 | 0 | pmd->n_output_batches = 0; |
7155 | |
|
7156 | 0 | ovs_refcount_init(&pmd->ref_cnt); |
7157 | 0 | atomic_init(&pmd->exit, false); |
7158 | 0 | pmd->reload_seq = seq_create(); |
7159 | 0 | pmd->last_reload_seq = seq_read(pmd->reload_seq); |
7160 | 0 | atomic_init(&pmd->reload, false); |
7161 | 0 | ovs_mutex_init(&pmd->flow_mutex); |
7162 | 0 | ovs_mutex_init(&pmd->port_mutex); |
7163 | 0 | ovs_mutex_init(&pmd->bond_mutex); |
7164 | 0 | cmap_init(&pmd->flow_table); |
7165 | 0 | cmap_init(&pmd->classifiers); |
7166 | 0 | cmap_init(&pmd->simple_match_table); |
7167 | 0 | ccmap_init(&pmd->n_flows); |
7168 | 0 | ccmap_init(&pmd->n_simple_flows); |
7169 | 0 | pmd->ctx.last_rxq = NULL; |
7170 | 0 | pmd_thread_ctx_time_update(pmd); |
7171 | 0 | pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL; |
7172 | 0 | pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; |
7173 | 0 | pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN; |
7174 | 0 | pmd->busy_cycles_intrvl = xzalloc(PMD_INTERVAL_MAX * |
7175 | 0 | sizeof *pmd->busy_cycles_intrvl); |
7176 | 0 | hmap_init(&pmd->poll_list); |
7177 | 0 | hmap_init(&pmd->tx_ports); |
7178 | 0 | hmap_init(&pmd->tnl_port_cache); |
7179 | 0 | hmap_init(&pmd->send_port_cache); |
7180 | 0 | cmap_init(&pmd->tx_bonds); |
7181 | |
|
7182 | 0 | pmd_init_max_sleep(dp, pmd); |
7183 | | |
7184 | | /* Initialize DPIF function pointer to the default configured version. */ |
7185 | 0 | atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default()); |
7186 | | |
7187 | | /* Init default miniflow_extract function */ |
7188 | 0 | atomic_init(&pmd->miniflow_extract_opt, dp_mfex_impl_get_default()); |
7189 | | |
7190 | | /* init the 'flow_cache' since there is no |
7191 | | * actual thread created for NON_PMD_CORE_ID. */ |
7192 | 0 | if (core_id == NON_PMD_CORE_ID) { |
7193 | 0 | dfc_cache_init(&pmd->flow_cache); |
7194 | 0 | pmd_alloc_static_tx_qid(pmd); |
7195 | 0 | } |
7196 | 0 | pmd_perf_stats_init(&pmd->perf_stats); |
7197 | 0 | cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node), |
7198 | 0 | hash_int(core_id, 0)); |
7199 | 0 | } |
7200 | | |
7201 | | static void |
7202 | | dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd) |
7203 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
7204 | 0 | { |
7205 | 0 | struct dpcls *cls; |
7206 | |
|
7207 | 0 | dp_netdev_pmd_flow_flush(pmd); |
7208 | 0 | hmap_destroy(&pmd->send_port_cache); |
7209 | 0 | hmap_destroy(&pmd->tnl_port_cache); |
7210 | 0 | hmap_destroy(&pmd->tx_ports); |
7211 | 0 | cmap_destroy(&pmd->tx_bonds); |
7212 | 0 | hmap_destroy(&pmd->poll_list); |
7213 | 0 | free(pmd->busy_cycles_intrvl); |
7214 | | /* All flows (including their dpcls_rules) have been deleted already */ |
7215 | 0 | CMAP_FOR_EACH (cls, node, &pmd->classifiers) { |
7216 | 0 | dpcls_destroy(cls); |
7217 | 0 | ovsrcu_postpone(free, cls); |
7218 | 0 | } |
7219 | 0 | cmap_destroy(&pmd->classifiers); |
7220 | 0 | cmap_destroy(&pmd->flow_table); |
7221 | 0 | cmap_destroy(&pmd->simple_match_table); |
7222 | 0 | ccmap_destroy(&pmd->n_flows); |
7223 | 0 | ccmap_destroy(&pmd->n_simple_flows); |
7224 | 0 | ovs_mutex_destroy(&pmd->flow_mutex); |
7225 | 0 | seq_destroy(pmd->reload_seq); |
7226 | 0 | ovs_mutex_destroy(&pmd->port_mutex); |
7227 | 0 | ovs_mutex_destroy(&pmd->bond_mutex); |
7228 | 0 | free(pmd->netdev_input_func_userdata); |
7229 | 0 | free(pmd); |
7230 | 0 | } |
7231 | | |
7232 | | /* Stops the pmd thread, removes it from the 'dp->poll_threads', |
7233 | | * and unrefs the struct. */ |
7234 | | static void |
7235 | | dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd) |
7236 | 0 | { |
7237 | | /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize, |
7238 | | * but extra cleanup is necessary */ |
7239 | 0 | if (pmd->core_id == NON_PMD_CORE_ID) { |
7240 | 0 | ovs_mutex_lock(&dp->non_pmd_mutex); |
7241 | 0 | dfc_cache_uninit(&pmd->flow_cache); |
7242 | 0 | pmd_free_cached_ports(pmd); |
7243 | 0 | pmd_free_static_tx_qid(pmd); |
7244 | 0 | ovs_mutex_unlock(&dp->non_pmd_mutex); |
7245 | 0 | } else { |
7246 | 0 | atomic_store_relaxed(&pmd->exit, true); |
7247 | 0 | dp_netdev_reload_pmd__(pmd); |
7248 | 0 | xpthread_join(pmd->thread, NULL); |
7249 | 0 | } |
7250 | |
|
7251 | 0 | dp_netdev_pmd_clear_ports(pmd); |
7252 | | |
7253 | | /* Purges the 'pmd''s flows after stopping the thread, but before |
7254 | | * destroying the flows, so that the flow stats can be collected. */ |
7255 | 0 | if (dp->dp_purge_cb) { |
7256 | 0 | dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id); |
7257 | 0 | } |
7258 | 0 | cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0)); |
7259 | 0 | dp_netdev_pmd_unref(pmd); |
7260 | 0 | } |
7261 | | |
7262 | | /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd |
7263 | | * thread. */ |
7264 | | static void |
7265 | | dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd) |
7266 | 0 | { |
7267 | 0 | struct dp_netdev_pmd_thread *pmd; |
7268 | 0 | struct dp_netdev_pmd_thread **pmd_list; |
7269 | 0 | size_t k = 0, n_pmds; |
7270 | |
|
7271 | 0 | n_pmds = cmap_count(&dp->poll_threads); |
7272 | 0 | pmd_list = xcalloc(n_pmds, sizeof *pmd_list); |
7273 | |
|
7274 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
7275 | 0 | if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) { |
7276 | 0 | continue; |
7277 | 0 | } |
7278 | | /* We cannot call dp_netdev_del_pmd(), since it alters |
7279 | | * 'dp->poll_threads' (while we're iterating it) and it |
7280 | | * might quiesce. */ |
7281 | 0 | ovs_assert(k < n_pmds); |
7282 | 0 | pmd_list[k++] = pmd; |
7283 | 0 | } |
7284 | |
|
7285 | 0 | for (size_t i = 0; i < k; i++) { |
7286 | 0 | dp_netdev_del_pmd(dp, pmd_list[i]); |
7287 | 0 | } |
7288 | 0 | free(pmd_list); |
7289 | 0 | } |
7290 | | |
7291 | | /* Deletes all rx queues from pmd->poll_list and all the ports from |
7292 | | * pmd->tx_ports. */ |
7293 | | static void |
7294 | | dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd) |
7295 | 0 | { |
7296 | 0 | struct rxq_poll *poll; |
7297 | 0 | struct tx_port *port; |
7298 | 0 | struct tx_bond *tx; |
7299 | |
|
7300 | 0 | ovs_mutex_lock(&pmd->port_mutex); |
7301 | 0 | HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) { |
7302 | 0 | free(poll); |
7303 | 0 | } |
7304 | 0 | HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) { |
7305 | 0 | free(port->txq_pkts); |
7306 | 0 | free(port); |
7307 | 0 | } |
7308 | 0 | ovs_mutex_unlock(&pmd->port_mutex); |
7309 | |
|
7310 | 0 | ovs_mutex_lock(&pmd->bond_mutex); |
7311 | 0 | CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) { |
7312 | 0 | cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id)); |
7313 | 0 | ovsrcu_postpone(free, tx); |
7314 | 0 | } |
7315 | 0 | ovs_mutex_unlock(&pmd->bond_mutex); |
7316 | 0 | } |
7317 | | |
7318 | | /* Adds rx queue to poll_list of PMD thread, if it's not there already. */ |
7319 | | static void |
7320 | | dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd, |
7321 | | struct dp_netdev_rxq *rxq) |
7322 | | OVS_REQUIRES(pmd->port_mutex) |
7323 | 0 | { |
7324 | 0 | int qid = netdev_rxq_get_queue_id(rxq->rx); |
7325 | 0 | uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid); |
7326 | 0 | struct rxq_poll *poll; |
7327 | |
|
7328 | 0 | HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) { |
7329 | 0 | if (poll->rxq == rxq) { |
7330 | | /* 'rxq' is already polled by this thread. Do nothing. */ |
7331 | 0 | return; |
7332 | 0 | } |
7333 | 0 | } |
7334 | | |
7335 | 0 | poll = xmalloc(sizeof *poll); |
7336 | 0 | poll->rxq = rxq; |
7337 | 0 | hmap_insert(&pmd->poll_list, &poll->node, hash); |
7338 | |
|
7339 | 0 | pmd->need_reload = true; |
7340 | 0 | } |
7341 | | |
7342 | | /* Delete 'poll' from poll_list of PMD thread. */ |
7343 | | static void |
7344 | | dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd, |
7345 | | struct rxq_poll *poll) |
7346 | | OVS_REQUIRES(pmd->port_mutex) |
7347 | 0 | { |
7348 | 0 | hmap_remove(&pmd->poll_list, &poll->node); |
7349 | 0 | free(poll); |
7350 | |
|
7351 | 0 | pmd->need_reload = true; |
7352 | 0 | } |
7353 | | |
7354 | | /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the |
7355 | | * changes to take effect. */ |
7356 | | static void |
7357 | | dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, |
7358 | | struct dp_netdev_port *port) |
7359 | | OVS_REQUIRES(pmd->port_mutex) |
7360 | 0 | { |
7361 | 0 | struct tx_port *tx; |
7362 | |
|
7363 | 0 | tx = tx_port_lookup(&pmd->tx_ports, port->port_no); |
7364 | 0 | if (tx) { |
7365 | | /* 'port' is already on this thread tx cache. Do nothing. */ |
7366 | 0 | return; |
7367 | 0 | } |
7368 | | |
7369 | 0 | tx = xzalloc(sizeof *tx); |
7370 | |
|
7371 | 0 | tx->port = port; |
7372 | 0 | tx->qid = -1; |
7373 | 0 | tx->flush_time = 0LL; |
7374 | 0 | dp_packet_batch_init(&tx->output_pkts); |
7375 | |
|
7376 | 0 | if (tx->port->txq_mode == TXQ_MODE_XPS_HASH) { |
7377 | 0 | int i, n_txq = netdev_n_txq(tx->port->netdev); |
7378 | |
|
7379 | 0 | tx->txq_pkts = xzalloc(n_txq * sizeof *tx->txq_pkts); |
7380 | 0 | for (i = 0; i < n_txq; i++) { |
7381 | 0 | dp_packet_batch_init(&tx->txq_pkts[i]); |
7382 | 0 | } |
7383 | 0 | } |
7384 | |
|
7385 | 0 | hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no)); |
7386 | 0 | pmd->need_reload = true; |
7387 | 0 | } |
7388 | | |
7389 | | /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the |
7390 | | * changes to take effect. */ |
7391 | | static void |
7392 | | dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, |
7393 | | struct tx_port *tx) |
7394 | | OVS_REQUIRES(pmd->port_mutex) |
7395 | 0 | { |
7396 | 0 | hmap_remove(&pmd->tx_ports, &tx->node); |
7397 | 0 | free(tx->txq_pkts); |
7398 | 0 | free(tx); |
7399 | 0 | pmd->need_reload = true; |
7400 | 0 | } |
7401 | | |
7402 | | /* Add bond to the tx bond cmap of 'pmd'. */ |
7403 | | static void |
7404 | | dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, |
7405 | | struct tx_bond *bond, bool update) |
7406 | | OVS_EXCLUDED(pmd->bond_mutex) |
7407 | 0 | { |
7408 | 0 | struct tx_bond *tx; |
7409 | |
|
7410 | 0 | ovs_mutex_lock(&pmd->bond_mutex); |
7411 | 0 | tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id); |
7412 | |
|
7413 | 0 | if (tx && !update) { |
7414 | | /* It's not an update and the entry already exists. Do nothing. */ |
7415 | 0 | goto unlock; |
7416 | 0 | } |
7417 | | |
7418 | 0 | if (tx) { |
7419 | 0 | struct tx_bond *new_tx = xmemdup(bond, sizeof *bond); |
7420 | | |
7421 | | /* Copy the stats for each bucket. */ |
7422 | 0 | for (int i = 0; i < BOND_BUCKETS; i++) { |
7423 | 0 | uint64_t n_packets, n_bytes; |
7424 | |
|
7425 | 0 | atomic_read_relaxed(&tx->member_buckets[i].n_packets, &n_packets); |
7426 | 0 | atomic_read_relaxed(&tx->member_buckets[i].n_bytes, &n_bytes); |
7427 | 0 | atomic_init(&new_tx->member_buckets[i].n_packets, n_packets); |
7428 | 0 | atomic_init(&new_tx->member_buckets[i].n_bytes, n_bytes); |
7429 | 0 | } |
7430 | 0 | cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node, |
7431 | 0 | hash_bond_id(bond->bond_id)); |
7432 | 0 | ovsrcu_postpone(free, tx); |
7433 | 0 | } else { |
7434 | 0 | tx = xmemdup(bond, sizeof *bond); |
7435 | 0 | cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id)); |
7436 | 0 | } |
7437 | 0 | unlock: |
7438 | 0 | ovs_mutex_unlock(&pmd->bond_mutex); |
7439 | 0 | } |
7440 | | |
7441 | | /* Delete bond from the tx bond cmap of 'pmd'. */ |
7442 | | static void |
7443 | | dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, |
7444 | | uint32_t bond_id) |
7445 | | OVS_EXCLUDED(pmd->bond_mutex) |
7446 | 0 | { |
7447 | 0 | struct tx_bond *tx; |
7448 | |
|
7449 | 0 | ovs_mutex_lock(&pmd->bond_mutex); |
7450 | 0 | tx = tx_bond_lookup(&pmd->tx_bonds, bond_id); |
7451 | 0 | if (tx) { |
7452 | 0 | cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id)); |
7453 | 0 | ovsrcu_postpone(free, tx); |
7454 | 0 | } |
7455 | 0 | ovs_mutex_unlock(&pmd->bond_mutex); |
7456 | 0 | } |
7457 | | |
7458 | | static char * |
7459 | | dpif_netdev_get_datapath_version(void) |
7460 | 0 | { |
7461 | 0 | return xstrdup("<built-in>"); |
7462 | 0 | } |
7463 | | |
7464 | | static void |
7465 | | dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size, |
7466 | | uint16_t tcp_flags, long long now) |
7467 | 0 | { |
7468 | 0 | uint16_t flags; |
7469 | |
|
7470 | 0 | atomic_store_relaxed(&netdev_flow->stats.used, now); |
7471 | 0 | non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt); |
7472 | 0 | non_atomic_ullong_add(&netdev_flow->stats.byte_count, size); |
7473 | 0 | atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags); |
7474 | 0 | flags |= tcp_flags; |
7475 | 0 | atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags); |
7476 | 0 | } |
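/*
 * A minimal standalone sketch of the statistics update above, using C11
 * stdatomic in place of the OVS atomic wrappers (an assumption for the
 * sketch): the timestamp and TCP flags are relaxed atomics, while the
 * packet/byte counters are plain adds because a flow's stats are written
 * by a single PMD thread at a time.
 */
#include <stdatomic.h>
#include <stdint.h>

struct sketch_flow_stats {
    atomic_llong used;                  /* Last-used timestamp, in ms. */
    unsigned long long packet_count;    /* Single writer, plain add. */
    unsigned long long byte_count;
    atomic_uint_fast16_t tcp_flags;     /* OR-accumulated TCP flags. */
};

static void
sketch_flow_used(struct sketch_flow_stats *stats, int cnt, int bytes,
                 uint16_t tcp_flags, long long now_ms)
{
    atomic_store_explicit(&stats->used, now_ms, memory_order_relaxed);
    stats->packet_count += cnt;
    stats->byte_count += bytes;

    /* Load+store rather than fetch_or mirrors the single-writer assumption
     * of the original; concurrent writers would need an atomic fetch_or. */
    uint_fast16_t flags =
        atomic_load_explicit(&stats->tcp_flags, memory_order_relaxed);
    atomic_store_explicit(&stats->tcp_flags, flags | tcp_flags,
                          memory_order_relaxed);
}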
7477 | | |
7478 | | static int |
7479 | | dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, |
7480 | | struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid, |
7481 | | enum dpif_upcall_type type, const struct nlattr *userdata, |
7482 | | struct ofpbuf *actions, struct ofpbuf *put_actions) |
7483 | 0 | { |
7484 | 0 | struct dp_netdev *dp = pmd->dp; |
7485 | |
|
7486 | 0 | if (OVS_UNLIKELY(!dp->upcall_cb)) { |
7487 | 0 | return ENODEV; |
7488 | 0 | } |
7489 | | |
7490 | 0 | if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) { |
7491 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
7492 | 0 | char *packet_str; |
7493 | 0 | struct ofpbuf key; |
7494 | 0 | struct odp_flow_key_parms odp_parms = { |
7495 | 0 | .flow = flow, |
7496 | 0 | .mask = wc ? &wc->masks : NULL, |
7497 | 0 | .support = dp_netdev_support, |
7498 | 0 | }; |
7499 | |
|
7500 | 0 | ofpbuf_init(&key, 0); |
7501 | 0 | odp_flow_key_from_flow(&odp_parms, &key); |
7502 | 0 | packet_str = ofp_dp_packet_to_string(packet_); |
7503 | |
|
7504 | 0 | odp_flow_key_format(key.data, key.size, &ds); |
7505 | |
|
7506 | 0 | VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name, |
7507 | 0 | dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str); |
7508 | |
|
7509 | 0 | ofpbuf_uninit(&key); |
7510 | 0 | free(packet_str); |
7511 | |
|
7512 | 0 | ds_destroy(&ds); |
7513 | 0 | } |
7514 | |
|
7515 | 0 | if (type != DPIF_UC_MISS) { |
7516 | 0 | dp_packet_ol_send_prepare(packet_, 0); |
7517 | 0 | } |
7518 | |
|
7519 | 0 | return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata, |
7520 | 0 | actions, wc, put_actions, dp->upcall_aux); |
7521 | 0 | } |
7522 | | |
7523 | | static inline uint32_t |
7524 | | dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, |
7525 | | const struct miniflow *mf) |
7526 | 0 | { |
7527 | 0 | uint32_t hash, recirc_depth; |
7528 | |
|
7529 | 0 | if (OVS_LIKELY(dp_packet_rss_valid(packet))) { |
7530 | 0 | hash = dp_packet_get_rss_hash(packet); |
7531 | 0 | } else { |
7532 | 0 | hash = miniflow_hash_5tuple(mf, 0); |
7533 | 0 | dp_packet_set_rss_hash(packet, hash); |
7534 | 0 | } |
7535 | | |
7536 | | /* The RSS hash must account for the recirculation depth to avoid |
7537 | |      * collisions in the exact match cache. */
7538 | 0 | recirc_depth = *recirc_depth_get_unsafe(); |
7539 | 0 | if (OVS_UNLIKELY(recirc_depth)) { |
7540 | 0 | hash = hash_finish(hash, recirc_depth); |
7541 | 0 | } |
7542 | 0 | return hash; |
7543 | 0 | } |
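/*
 * A minimal standalone sketch of the hashing idea above: the packet's RSS
 * hash doubles as the cache key hash, but after recirculation the same
 * packet must hash differently so it cannot collide with its
 * pre-recirculation exact-match-cache entry.  The finisher below is a
 * simplified stand-in for OVS's hash_finish(), not the real routine.
 */
#include <stdint.h>

static uint32_t
sketch_hash_finish(uint32_t hash, uint32_t final)
{
    /* Any reasonable avalanche step works for the sketch. */
    hash ^= final;
    hash *= UINT32_C(0x9e3779b1);
    hash ^= hash >> 16;
    return hash;
}

static uint32_t
sketch_packet_cache_hash(uint32_t rss_hash, uint32_t recirc_depth)
{
    /* Depth 0 (the common case) keeps the RSS hash as-is; each deeper
     * recirculation yields a distinct hash for the same packet. */
    return recirc_depth ? sketch_hash_finish(rss_hash, recirc_depth)
                        : rss_hash;
}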
7544 | | |
7545 | | struct packet_batch_per_flow { |
7546 | | unsigned int byte_count; |
7547 | | uint16_t tcp_flags; |
7548 | | struct dp_netdev_flow *flow; |
7549 | | |
7550 | | struct dp_packet_batch array; |
7551 | | }; |
7552 | | |
7553 | | static inline void |
7554 | | packet_batch_per_flow_update(struct packet_batch_per_flow *batch, |
7555 | | struct dp_packet *packet, |
7556 | | uint16_t tcp_flags) |
7557 | 0 | { |
7558 | 0 | batch->byte_count += dp_packet_size(packet); |
7559 | 0 | batch->tcp_flags |= tcp_flags; |
7560 | 0 | dp_packet_batch_add(&batch->array, packet); |
7561 | 0 | } |
7562 | | |
7563 | | static inline void |
7564 | | packet_batch_per_flow_init(struct packet_batch_per_flow *batch, |
7565 | | struct dp_netdev_flow *flow) |
7566 | 0 | { |
7567 | 0 | flow->batch = batch; |
7568 | |
|
7569 | 0 | batch->flow = flow; |
7570 | 0 | dp_packet_batch_init(&batch->array); |
7571 | 0 | batch->byte_count = 0; |
7572 | 0 | batch->tcp_flags = 0; |
7573 | 0 | } |
7574 | | |
7575 | | static inline void |
7576 | | packet_batch_per_flow_execute(struct packet_batch_per_flow *batch, |
7577 | | struct dp_netdev_pmd_thread *pmd) |
7578 | 0 | { |
7579 | 0 | struct dp_netdev_actions *actions; |
7580 | 0 | struct dp_netdev_flow *flow = batch->flow; |
7581 | |
|
7582 | 0 | dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array), |
7583 | 0 | batch->byte_count, |
7584 | 0 | batch->tcp_flags, pmd->ctx.now / 1000); |
7585 | |
|
7586 | 0 | actions = dp_netdev_flow_get_actions(flow); |
7587 | |
|
7588 | 0 | dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow, |
7589 | 0 | actions->actions, actions->size); |
7590 | 0 | } |
7591 | | |
7592 | | void |
7593 | | dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd, |
7594 | | struct dp_packet_batch *packets, |
7595 | | struct dpcls_rule *rule, |
7596 | | uint32_t bytes, |
7597 | | uint16_t tcp_flags) |
7598 | 0 | { |
7599 | | /* Gets action* from the rule. */ |
7600 | 0 | struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule); |
7601 | 0 | struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow); |
7602 | |
|
7603 | 0 | dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes, |
7604 | 0 | tcp_flags, pmd->ctx.now / 1000); |
7605 | 0 | const uint32_t steal = 1; |
7606 | 0 | dp_netdev_execute_actions(pmd, packets, steal, &flow->flow, |
7607 | 0 | actions->actions, actions->size); |
7608 | 0 | } |
7609 | | |
7610 | | static inline void |
7611 | | dp_netdev_queue_batches(struct dp_packet *pkt, |
7612 | | struct dp_netdev_flow *flow, uint16_t tcp_flags, |
7613 | | struct packet_batch_per_flow *batches, |
7614 | | size_t *n_batches) |
7615 | 0 | { |
7616 | 0 | struct packet_batch_per_flow *batch = flow->batch; |
7617 | |
|
7618 | 0 | if (OVS_UNLIKELY(!batch)) { |
7619 | 0 | batch = &batches[(*n_batches)++]; |
7620 | 0 | packet_batch_per_flow_init(batch, flow); |
7621 | 0 | } |
7622 | |
|
7623 | 0 | packet_batch_per_flow_update(batch, pkt, tcp_flags); |
7624 | 0 | } |
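/*
 * A minimal standalone sketch of the per-flow batching done by
 * dp_netdev_queue_batches() above: the first packet of a flow claims a slot
 * in the caller's on-stack batch array and records it in the flow, so every
 * later packet of the same burst is appended in O(1) and the flow's actions
 * run once per batch instead of once per packet.  Types are simplified
 * placeholders for the dp_netdev structures; the burst is assumed to hold
 * at most 32 packets, as in the datapath.
 */
#include <stddef.h>

#define SKETCH_FLOW_BURST 32

struct sketch_batch;

struct sketch_flow {
    struct sketch_batch *batch;     /* NULL until this burst touches it. */
};

struct sketch_batch {
    struct sketch_flow *flow;
    void *pkts[SKETCH_FLOW_BURST];
    size_t n;
};

static void
sketch_queue_batches(void *pkt, struct sketch_flow *flow,
                     struct sketch_batch *batches, size_t *n_batches)
{
    struct sketch_batch *b = flow->batch;

    if (!b) {
        b = &batches[(*n_batches)++];
        b->flow = flow;
        b->n = 0;
        flow->batch = b;
    }
    b->pkts[b->n++] = pkt;
}

/* As the comment in dp_netdev_input__() below explains, the caller must
 * reset flow->batch to NULL before executing the batches so recirculated
 * packets start from a clean state. */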
7625 | | |
7626 | | static inline void |
7627 | | packet_enqueue_to_flow_map(struct dp_packet *packet, |
7628 | | struct dp_netdev_flow *flow, |
7629 | | uint16_t tcp_flags, |
7630 | | struct dp_packet_flow_map *flow_map, |
7631 | | size_t index) |
7632 | 0 | { |
7633 | 0 | struct dp_packet_flow_map *map = &flow_map[index]; |
7634 | 0 | map->flow = flow; |
7635 | 0 | map->packet = packet; |
7636 | 0 | map->tcp_flags = tcp_flags; |
7637 | 0 | } |
7638 | | |
7639 | | /* SMC lookup function for a batch of packets. |
7640 | |  * By batching the SMC lookups, we can use prefetching
7641 | | * to hide memory access latency. |
7642 | | */ |
7643 | | static inline void |
7644 | | smc_lookup_batch(struct dp_netdev_pmd_thread *pmd, |
7645 | | struct netdev_flow_key *keys, |
7646 | | struct netdev_flow_key **missed_keys, |
7647 | | struct dp_packet_batch *packets_, |
7648 | | const int cnt, |
7649 | | struct dp_packet_flow_map *flow_map, |
7650 | | uint8_t *index_map) |
7651 | 0 | { |
7652 | 0 | int i; |
7653 | 0 | struct dp_packet *packet; |
7654 | 0 | size_t n_smc_hit = 0, n_missed = 0; |
7655 | 0 | struct dfc_cache *cache = &pmd->flow_cache; |
7656 | 0 | struct smc_cache *smc_cache = &cache->smc_cache; |
7657 | 0 | const struct cmap_node *flow_node; |
7658 | 0 | int recv_idx; |
7659 | 0 | uint16_t tcp_flags; |
7660 | | |
7661 | | /* Prefetch buckets for all packets */ |
7662 | 0 | for (i = 0; i < cnt; i++) { |
7663 | 0 | OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]); |
7664 | 0 | } |
7665 | |
|
7666 | 0 | DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) { |
7667 | 0 | struct dp_netdev_flow *flow = NULL; |
7668 | 0 | flow_node = smc_entry_get(pmd, keys[i].hash); |
7669 | 0 | bool hit = false; |
7670 | | /* Get the original order of this packet in received batch. */ |
7671 | 0 | recv_idx = index_map[i]; |
7672 | |
|
7673 | 0 | if (OVS_LIKELY(flow_node != NULL)) { |
7674 | 0 | CMAP_NODE_FOR_EACH (flow, node, flow_node) { |
7675 | |                 /* Since we don't have per-port megaflow to check the port
7676 | | * number, we need to verify that the input ports match. */ |
7677 | 0 | if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) && |
7678 | 0 | flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) { |
7679 | 0 | tcp_flags = miniflow_get_tcp_flags(&keys[i].mf); |
7680 | | |
7681 | | /* SMC hit and emc miss, we insert into EMC */ |
7682 | 0 | keys[i].len = |
7683 | 0 | netdev_flow_key_size(miniflow_n_values(&keys[i].mf)); |
7684 | 0 | emc_probabilistic_insert(pmd, &keys[i], flow); |
7685 | | /* Add these packets into the flow map in the same order |
7686 | | * as received. |
7687 | | */ |
7688 | 0 | packet_enqueue_to_flow_map(packet, flow, tcp_flags, |
7689 | 0 | flow_map, recv_idx); |
7690 | 0 | n_smc_hit++; |
7691 | 0 | hit = true; |
7692 | 0 | break; |
7693 | 0 | } |
7694 | 0 | } |
7695 | 0 | if (hit) { |
7696 | 0 | continue; |
7697 | 0 | } |
7698 | 0 | } |
7699 | | |
7700 | | /* SMC missed. Group missed packets together at |
7701 | | * the beginning of the 'packets' array. */ |
7702 | 0 | dp_packet_batch_refill(packets_, packet, i); |
7703 | | |
7704 | | /* Preserve the order of packet for flow batching. */ |
7705 | 0 | index_map[n_missed] = recv_idx; |
7706 | | |
7707 | |         /* Put missed keys into the pointer array returned to the caller. */
7708 | 0 | missed_keys[n_missed++] = &keys[i]; |
7709 | 0 | } |
7710 | |
|
7711 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit); |
7712 | 0 | } |
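/*
 * A minimal standalone sketch of the prefetch pattern in smc_lookup_batch()
 * above: first issue a prefetch for every bucket the burst will touch, then
 * probe them, so the memory latency of one lookup overlaps with the others.
 * The table is a simplified direct-mapped signature cache, not the real SMC
 * layout, and __builtin_prefetch stands in for OVS_PREFETCH.
 */
#include <stddef.h>
#include <stdint.h>

#define SKETCH_SMC_ENTRIES 4096                 /* Power of two. */
#define SKETCH_SMC_MASK (SKETCH_SMC_ENTRIES - 1)

struct sketch_smc_entry {
    uint32_t sig;           /* Hash signature of the stored flow. */
    uint32_t flow_idx;      /* Index into a flow array; 0 means empty. */
};

static size_t
sketch_smc_lookup_batch(const struct sketch_smc_entry *table,
                        const uint32_t *hashes, uint32_t *flow_idx_out,
                        size_t cnt)
{
    size_t hits = 0;

    /* Pass 1: prefetch every bucket the burst will probe. */
    for (size_t i = 0; i < cnt; i++) {
        __builtin_prefetch(&table[hashes[i] & SKETCH_SMC_MASK]);
    }

    /* Pass 2: probe; earlier buckets should now be resident in cache. */
    for (size_t i = 0; i < cnt; i++) {
        const struct sketch_smc_entry *e =
            &table[hashes[i] & SKETCH_SMC_MASK];

        if (e->flow_idx && e->sig == hashes[i]) {
            flow_idx_out[i] = e->flow_idx;
            hits++;
        } else {
            flow_idx_out[i] = 0;    /* Miss; caller falls back to dpcls. */
        }
    }
    return hits;
}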
7713 | | |
7714 | | struct dp_netdev_flow * |
7715 | | smc_lookup_single(struct dp_netdev_pmd_thread *pmd, |
7716 | | struct dp_packet *packet, |
7717 | | struct netdev_flow_key *key) |
7718 | 0 | { |
7719 | 0 | const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash); |
7720 | |
|
7721 | 0 | if (OVS_LIKELY(flow_node != NULL)) { |
7722 | 0 | struct dp_netdev_flow *flow = NULL; |
7723 | |
|
7724 | 0 | CMAP_NODE_FOR_EACH (flow, node, flow_node) { |
7725 | |             /* Since we don't have per-port megaflow to check the port
7726 | | * number, we need to verify that the input ports match. */ |
7727 | 0 | if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) && |
7728 | 0 | flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) { |
7729 | |
|
7730 | 0 | return (void *) flow; |
7731 | 0 | } |
7732 | 0 | } |
7733 | 0 | } |
7734 | | |
7735 | 0 | return NULL; |
7736 | 0 | } |
7737 | | |
7738 | | inline int |
7739 | | dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd, |
7740 | | struct dp_packet *packet, |
7741 | | struct dp_netdev_flow **flow) |
7742 | 0 | { |
7743 | 0 | struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq; |
7744 | 0 | bool post_process_api_supported; |
7745 | 0 | void *flow_reference = NULL; |
7746 | 0 | int err; |
7747 | |
|
7748 | 0 | atomic_read_relaxed(&rxq->port->netdev->hw_info.post_process_api_supported, |
7749 | 0 | &post_process_api_supported); |
7750 | |
|
7751 | 0 | if (!post_process_api_supported) { |
7752 | 0 | *flow = NULL; |
7753 | 0 | return 0; |
7754 | 0 | } |
7755 | | |
7756 | 0 | err = dpif_offload_netdev_hw_post_process(rxq->port->netdev, pmd->core_id, |
7757 | 0 | packet, &flow_reference); |
7758 | 0 | if (err && err != EOPNOTSUPP) { |
7759 | 0 | if (err != ECANCELED) { |
7760 | 0 | COVERAGE_INC(datapath_drop_hw_post_process); |
7761 | 0 | } else { |
7762 | 0 | COVERAGE_INC(datapath_drop_hw_post_process_consumed); |
7763 | 0 | } |
7764 | 0 | return -1; |
7765 | 0 | } |
7766 | | |
7767 | 0 | *flow = flow_reference; |
7768 | 0 | return 0; |
7769 | 0 | } |
7770 | | |
7771 | | /* Enqueues an already classified packet into the per-flow batches or the
7772 | |  * flow map, depending on whether batching is enabled. */
7773 | | static inline void |
7774 | | dfc_processing_enqueue_classified_packet(struct dp_packet *packet, |
7775 | | struct dp_netdev_flow *flow, |
7776 | | uint16_t tcp_flags, |
7777 | | bool batch_enable, |
7778 | | struct packet_batch_per_flow *batches, |
7779 | | size_t *n_batches, |
7780 | | struct dp_packet_flow_map *flow_map, |
7781 | | size_t *map_cnt) |
7782 | | |
7783 | 0 | { |
7784 | 0 | if (OVS_LIKELY(batch_enable)) { |
7785 | 0 | dp_netdev_queue_batches(packet, flow, tcp_flags, batches, |
7786 | 0 | n_batches); |
7787 | 0 | } else { |
7788 | | /* Flow batching should be performed only after fast-path |
7789 | | * processing is also completed for packets with emc miss |
7790 | | * or else it will result in reordering of packets with |
7791 | | * same datapath flows. */ |
7792 | 0 | packet_enqueue_to_flow_map(packet, flow, tcp_flags, |
7793 | 0 | flow_map, (*map_cnt)++); |
7794 | 0 | } |
7795 | |
|
7796 | 0 | } |
7797 | | |
7798 | | /* Try to process all ('cnt') the 'packets' using only the datapath flow cache |
7799 | | * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the |
7800 | | * miniflow is copied into 'keys' and the packet pointer is moved at the |
7801 | | * beginning of the 'packets' array. The pointers of missed keys are put in the |
7802 | | * missed_keys pointer array for future processing. |
7803 | | * |
7804 | |  * The function returns the number of packets that need to be processed in the
7805 | | * 'packets' array (they have been moved to the beginning of the vector). |
7806 | | * |
7807 | | * For performance reasons a caller may choose not to initialize the metadata |
7808 | | * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets' |
7809 | | * is not valid and must be initialized by this function using 'port_no'. |
7810 | | * If 'md_is_valid' is true, the metadata is already valid and 'port_no' |
7811 | | * will be ignored. |
7812 | | */ |
7813 | | static inline size_t |
7814 | | dfc_processing(struct dp_netdev_pmd_thread *pmd, |
7815 | | struct dp_packet_batch *packets_, |
7816 | | struct netdev_flow_key *keys, |
7817 | | struct netdev_flow_key **missed_keys, |
7818 | | struct packet_batch_per_flow batches[], size_t *n_batches, |
7819 | | struct dp_packet_flow_map *flow_map, |
7820 | | size_t *n_flows, uint8_t *index_map, |
7821 | | bool md_is_valid, odp_port_t port_no) |
7822 | 0 | { |
7823 | 0 | const bool offload_enabled = dpif_offload_enabled(); |
7824 | 0 | const uint32_t recirc_depth = *recirc_depth_get(); |
7825 | 0 | const size_t cnt = dp_packet_batch_size(packets_); |
7826 | 0 | size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0; |
7827 | 0 | size_t n_mfex_opt_hit = 0, n_simple_hit = 0; |
7828 | 0 | struct dfc_cache *cache = &pmd->flow_cache; |
7829 | 0 | struct netdev_flow_key *key = &keys[0]; |
7830 | 0 | struct dp_packet *packet; |
7831 | 0 | size_t map_cnt = 0; |
7832 | 0 | bool batch_enable = true; |
7833 | |
|
7834 | 0 | const bool simple_match_enabled = |
7835 | 0 | !md_is_valid && dp_netdev_simple_match_enabled(pmd, port_no); |
7836 | | /* 'simple_match_table' is a full flow table. If the flow is not there, |
7837 | | * upcall is required, and there is no chance to find a match in caches. */ |
7838 | 0 | const bool smc_enable_db = !simple_match_enabled && pmd->ctx.smc_enable_db; |
7839 | 0 | const uint32_t cur_min = simple_match_enabled |
7840 | 0 | ? 0 : pmd->ctx.emc_insert_min; |
7841 | |
|
7842 | 0 | pmd_perf_update_counter(&pmd->perf_stats, |
7843 | 0 | md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV, |
7844 | 0 | cnt); |
7845 | 0 | int i; |
7846 | 0 | DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) { |
7847 | 0 | struct dp_netdev_flow *flow = NULL; |
7848 | 0 | uint16_t tcp_flags; |
7849 | |
|
7850 | 0 | if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) { |
7851 | 0 | dp_packet_delete(packet); |
7852 | 0 | COVERAGE_INC(datapath_drop_rx_invalid_packet); |
7853 | 0 | continue; |
7854 | 0 | } |
7855 | | |
7856 | 0 | if (i != cnt - 1) { |
7857 | 0 | struct dp_packet **packets = packets_->packets; |
7858 | | /* Prefetch next packet data and metadata. */ |
7859 | 0 | OVS_PREFETCH(dp_packet_data(packets[i+1])); |
7860 | 0 | pkt_metadata_prefetch_init(&packets[i+1]->md); |
7861 | 0 | } |
7862 | |
|
7863 | 0 | if (!md_is_valid) { |
7864 | 0 | pkt_metadata_init(&packet->md, port_no); |
7865 | 0 | } |
7866 | |
|
7867 | 0 | if (offload_enabled && recirc_depth == 0) { |
7868 | 0 | if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &flow))) { |
7869 | | /* Packet restoration failed and it was dropped, do not |
7870 | | * continue processing. |
7871 | | */ |
7872 | 0 | continue; |
7873 | 0 | } |
7874 | 0 | if (OVS_LIKELY(flow)) { |
7875 | 0 | tcp_flags = parse_tcp_flags(packet, NULL, NULL, NULL); |
7876 | 0 | n_phwol_hit++; |
7877 | 0 | dfc_processing_enqueue_classified_packet( |
7878 | 0 | packet, flow, tcp_flags, batch_enable, |
7879 | 0 | batches, n_batches, flow_map, &map_cnt); |
7880 | 0 | continue; |
7881 | 0 | } |
7882 | 0 | } |
7883 | | |
7884 | 0 | if (!flow && simple_match_enabled) { |
7885 | 0 | ovs_be16 dl_type = 0, vlan_tci = 0; |
7886 | 0 | uint8_t nw_frag = 0; |
7887 | |
|
7888 | 0 | tcp_flags = parse_tcp_flags(packet, &dl_type, &nw_frag, &vlan_tci); |
7889 | 0 | flow = dp_netdev_simple_match_lookup(pmd, port_no, dl_type, |
7890 | 0 | nw_frag, vlan_tci); |
7891 | 0 | if (OVS_LIKELY(flow)) { |
7892 | 0 | n_simple_hit++; |
7893 | 0 | dfc_processing_enqueue_classified_packet( |
7894 | 0 | packet, flow, tcp_flags, batch_enable, |
7895 | 0 | batches, n_batches, flow_map, &map_cnt); |
7896 | 0 | continue; |
7897 | 0 | } |
7898 | 0 | } |
7899 | | |
7900 | 0 | miniflow_extract(packet, &key->mf); |
7901 | 0 | key->len = 0; /* Not computed yet. */ |
7902 | 0 | key->hash = |
7903 | 0 | (md_is_valid == false) |
7904 | 0 | ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf) |
7905 | 0 | : dpif_netdev_packet_get_rss_hash(packet, &key->mf); |
7906 | | |
7907 | |         /* If EMC is disabled, skip emc_lookup. */
7908 | 0 | flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL; |
7909 | 0 | if (OVS_LIKELY(flow)) { |
7910 | 0 | tcp_flags = miniflow_get_tcp_flags(&key->mf); |
7911 | 0 | n_emc_hit++; |
7912 | 0 | dfc_processing_enqueue_classified_packet( |
7913 | 0 | packet, flow, tcp_flags, batch_enable, |
7914 | 0 | batches, n_batches, flow_map, &map_cnt); |
7915 | 0 | } else { |
7916 | | /* Exact match cache missed. Group missed packets together at |
7917 | | * the beginning of the 'packets' array. */ |
7918 | 0 | dp_packet_batch_refill(packets_, packet, i); |
7919 | | |
7920 | | /* Preserve the order of packet for flow batching. */ |
7921 | 0 | index_map[n_missed] = map_cnt; |
7922 | 0 | flow_map[map_cnt++].flow = NULL; |
7923 | | |
7924 | | /* 'key[n_missed]' contains the key of the current packet and it |
7925 | | * will be passed to SMC lookup. The next key should be extracted |
7926 | | * to 'keys[n_missed + 1]'. |
7927 | |              * We also maintain a pointer array to the keys that missed both
7928 | |              * SMC and EMC, which will be returned to the caller for future
7929 | |              * processing. */
7929 | 0 | missed_keys[n_missed] = key; |
7930 | 0 | key = &keys[++n_missed]; |
7931 | | |
7932 | | /* Skip batching for subsequent packets to avoid reordering. */ |
7933 | 0 | batch_enable = false; |
7934 | 0 | } |
7935 | 0 | } |
7936 | | /* Count of packets which are not flow batched. */ |
7937 | 0 | *n_flows = map_cnt; |
7938 | |
|
7939 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, n_phwol_hit); |
7940 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MFEX_OPT_HIT, |
7941 | 0 | n_mfex_opt_hit); |
7942 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_HIT, |
7943 | 0 | n_simple_hit); |
7944 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit); |
7945 | |
|
7946 | 0 | if (!smc_enable_db) { |
7947 | 0 | return dp_packet_batch_size(packets_); |
7948 | 0 | } |
7949 | | |
7950 | |     /* Packets that miss the EMC do a batch lookup in the SMC, if enabled. */
7951 | 0 | smc_lookup_batch(pmd, keys, missed_keys, packets_, |
7952 | 0 | n_missed, flow_map, index_map); |
7953 | |
|
7954 | 0 | return dp_packet_batch_size(packets_); |
7955 | 0 | } |
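/*
 * A minimal standalone sketch of the order-preserving miss handling used in
 * dfc_processing() above: cache hits are recorded in a slot matching their
 * original position in the burst, while misses are compacted to the front
 * of the packet array and 'index_map' remembers each miss's original
 * position, so later lookup stages can put their results back in received
 * order.  Types and the hit test are placeholders; the burst is assumed to
 * hold at most 255 packets so an 8-bit index suffices, as in the datapath.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct sketch_flow_slot {
    void *pkt;              /* NULL means "still unresolved". */
};

static size_t
sketch_compact_misses(void **pkts, size_t cnt,
                      bool (*cache_hit)(void *pkt),
                      struct sketch_flow_slot *slots, uint8_t *index_map)
{
    size_t n_missed = 0;

    for (size_t i = 0; i < cnt; i++) {
        if (cache_hit(pkts[i])) {
            slots[i].pkt = pkts[i];             /* Resolved, original slot. */
        } else {
            index_map[n_missed] = (uint8_t) i;  /* Remember original slot. */
            slots[i].pkt = NULL;
            pkts[n_missed++] = pkts[i];         /* Compact miss to front. */
        }
    }
    return n_missed;        /* Misses now occupy pkts[0 .. n_missed - 1]. */
}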
7956 | | |
7957 | | static inline int |
7958 | | handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, |
7959 | | struct dp_packet *packet, |
7960 | | const struct netdev_flow_key *key, |
7961 | | struct ofpbuf *actions, struct ofpbuf *put_actions) |
7962 | 0 | { |
7963 | 0 | struct ofpbuf *add_actions; |
7964 | 0 | struct dp_packet_batch b; |
7965 | 0 | struct match match; |
7966 | 0 | ovs_u128 ufid; |
7967 | 0 | int error; |
7968 | 0 | uint64_t cycles = cycles_counter_update(&pmd->perf_stats); |
7969 | 0 | odp_port_t orig_in_port = packet->md.orig_in_port; |
7970 | |
|
7971 | 0 | match.tun_md.valid = false; |
7972 | 0 | miniflow_expand(&key->mf, &match.flow); |
7973 | 0 | memset(&match.wc, 0, sizeof match.wc); |
7974 | |
|
7975 | 0 | ofpbuf_clear(actions); |
7976 | 0 | ofpbuf_clear(put_actions); |
7977 | |
|
7978 | 0 | odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid); |
7979 | 0 | error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc, |
7980 | 0 | &ufid, DPIF_UC_MISS, NULL, actions, |
7981 | 0 | put_actions); |
7982 | 0 | if (OVS_UNLIKELY(error && error != ENOSPC)) { |
7983 | 0 | dp_packet_delete(packet); |
7984 | 0 | COVERAGE_INC(datapath_drop_upcall_error); |
7985 | 0 | return error; |
7986 | 0 | } |
7987 | | |
7988 | | /* The Netlink encoding of datapath flow keys cannot express |
7989 | | * wildcarding the presence of a VLAN tag. Instead, a missing VLAN |
7990 | | * tag is interpreted as exact match on the fact that there is no |
7991 | | * VLAN. Unless we refactor a lot of code that translates between |
7992 | | * Netlink and struct flow representations, we have to do the same |
7993 | | * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */ |
7994 | 0 | if (!match.wc.masks.vlans[0].tci) { |
7995 | 0 | match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI); |
7996 | 0 | } |
7997 | | |
7998 | | /* We can't allow the packet batching in the next loop to execute |
7999 | | * the actions. Otherwise, if there are any slow path actions, |
8000 | | * we'll send the packet up twice. */ |
8001 | 0 | dp_packet_batch_init_packet(&b, packet); |
8002 | 0 | dp_netdev_execute_actions(pmd, &b, true, &match.flow, |
8003 | 0 | actions->data, actions->size); |
8004 | |
|
8005 | 0 | add_actions = put_actions->size ? put_actions : actions; |
8006 | 0 | if (OVS_LIKELY(error != ENOSPC)) { |
8007 | 0 | struct dp_netdev_flow *netdev_flow; |
8008 | | |
8009 | | /* XXX: There's a race window where a flow covering this packet |
8010 | | * could have already been installed since we last did the flow |
8011 | | * lookup before upcall. This could be solved by moving the |
8012 | | * mutex lock outside the loop, but that's an awful long time |
8013 | | * to be locking revalidators out of making flow modifications. */ |
8014 | 0 | ovs_mutex_lock(&pmd->flow_mutex); |
8015 | 0 | netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); |
8016 | 0 | if (OVS_LIKELY(!netdev_flow)) { |
8017 | 0 | netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid, |
8018 | 0 | add_actions->data, |
8019 | 0 | add_actions->size, orig_in_port); |
8020 | 0 | } |
8021 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
8022 | 0 | uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid); |
8023 | 0 | smc_insert(pmd, key, hash); |
8024 | 0 | emc_probabilistic_insert(pmd, key, netdev_flow); |
8025 | 0 | } |
8026 | 0 | if (pmd_perf_metrics_enabled(pmd)) { |
8027 | | /* Update upcall stats. */ |
8028 | 0 | cycles = cycles_counter_update(&pmd->perf_stats) - cycles; |
8029 | 0 | struct pmd_perf_stats *s = &pmd->perf_stats; |
8030 | 0 | s->current.upcalls++; |
8031 | 0 | s->current.upcall_cycles += cycles; |
8032 | 0 | histogram_add_sample(&s->cycles_per_upcall, cycles); |
8033 | 0 | } |
8034 | 0 | return error; |
8035 | 0 | } |
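/*
 * A minimal standalone sketch of the "look up again under the lock" step in
 * handle_packet_upcall() above: between the miss and taking pmd->flow_mutex,
 * another upcall may already have installed a covering flow, so the table is
 * re-checked while the lock is held and the flow is added only if it is
 * still missing.  The table plus the find and add callbacks are placeholders,
 * not the dp_netdev flow table API.
 */
#include <pthread.h>

static void *
sketch_install_if_missing(void *table, pthread_mutex_t *mutex,
                          const void *key,
                          void *(*find)(void *table, const void *key),
                          void *(*add)(void *table, const void *key))
{
    void *flow;

    pthread_mutex_lock(mutex);
    flow = find(table, key);        /* A racing upcall may have won. */
    if (!flow) {
        flow = add(table, key);
    }
    pthread_mutex_unlock(mutex);
    return flow;
}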
8036 | | |
8037 | | static inline void |
8038 | | fast_path_processing(struct dp_netdev_pmd_thread *pmd, |
8039 | | struct dp_packet_batch *packets_, |
8040 | | struct netdev_flow_key **keys, |
8041 | | struct dp_packet_flow_map *flow_map, |
8042 | | uint8_t *index_map, |
8043 | | odp_port_t in_port) |
8044 | 0 | { |
8045 | 0 | const size_t cnt = dp_packet_batch_size(packets_); |
8046 | 0 | #if !defined(__CHECKER__) && !defined(_WIN32) |
8047 | 0 | const size_t PKT_ARRAY_SIZE = cnt; |
8048 | | #else |
8049 | | /* Sparse or MSVC doesn't like variable length array. */ |
8050 | | enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST }; |
8051 | | #endif |
8052 | 0 | struct dp_packet *packet; |
8053 | 0 | struct dpcls *cls; |
8054 | 0 | struct dpcls_rule *rules[PKT_ARRAY_SIZE]; |
8055 | 0 | struct dp_netdev *dp = pmd->dp; |
8056 | 0 | int upcall_ok_cnt = 0, upcall_fail_cnt = 0; |
8057 | 0 | int lookup_cnt = 0, add_lookup_cnt; |
8058 | 0 | bool any_miss; |
8059 | |
|
8060 | 0 | for (size_t i = 0; i < cnt; i++) { |
8061 | | /* Key length is needed in all the cases, hash computed on demand. */ |
8062 | 0 | keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf)); |
8063 | 0 | } |
8064 | | /* Get the classifier for the in_port */ |
8065 | 0 | cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); |
8066 | 0 | if (OVS_LIKELY(cls)) { |
8067 | 0 | any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys, |
8068 | 0 | rules, cnt, &lookup_cnt); |
8069 | 0 | } else { |
8070 | 0 | any_miss = true; |
8071 | 0 | memset(rules, 0, sizeof(rules)); |
8072 | 0 | } |
8073 | 0 | if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) { |
8074 | 0 | uint64_t actions_stub[512 / 8], slow_stub[512 / 8]; |
8075 | 0 | struct ofpbuf actions, put_actions; |
8076 | |
|
8077 | 0 | ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub); |
8078 | 0 | ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub); |
8079 | |
|
8080 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8081 | 0 | struct dp_netdev_flow *netdev_flow; |
8082 | |
|
8083 | 0 | if (OVS_LIKELY(rules[i])) { |
8084 | 0 | continue; |
8085 | 0 | } |
8086 | | |
8087 | | /* It's possible that an earlier slow path execution installed |
8088 | | * a rule covering this flow. In this case, it's a lot cheaper |
8089 | | * to catch it here than execute a miss. */ |
8090 | 0 | netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i], |
8091 | 0 | &add_lookup_cnt); |
8092 | 0 | if (netdev_flow) { |
8093 | 0 | lookup_cnt += add_lookup_cnt; |
8094 | 0 | rules[i] = &netdev_flow->cr; |
8095 | 0 | continue; |
8096 | 0 | } |
8097 | | |
8098 | 0 | int error = handle_packet_upcall(pmd, packet, keys[i], |
8099 | 0 | &actions, &put_actions); |
8100 | |
|
8101 | 0 | if (OVS_UNLIKELY(error)) { |
8102 | 0 | upcall_fail_cnt++; |
8103 | 0 | } else { |
8104 | 0 | upcall_ok_cnt++; |
8105 | 0 | } |
8106 | 0 | } |
8107 | |
|
8108 | 0 | ofpbuf_uninit(&actions); |
8109 | 0 | ofpbuf_uninit(&put_actions); |
8110 | 0 | fat_rwlock_unlock(&dp->upcall_rwlock); |
8111 | 0 | } else if (OVS_UNLIKELY(any_miss)) { |
8112 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8113 | 0 | if (OVS_UNLIKELY(!rules[i])) { |
8114 | 0 | dp_packet_delete(packet); |
8115 | 0 | COVERAGE_INC(datapath_drop_lock_error); |
8116 | 0 | upcall_fail_cnt++; |
8117 | 0 | } |
8118 | 0 | } |
8119 | 0 | } |
8120 | |
|
8121 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8122 | 0 | struct dp_netdev_flow *flow; |
8123 | | /* Get the original order of this packet in received batch. */ |
8124 | 0 | int recv_idx = index_map[i]; |
8125 | 0 | uint16_t tcp_flags; |
8126 | |
|
8127 | 0 | if (OVS_UNLIKELY(!rules[i])) { |
8128 | 0 | continue; |
8129 | 0 | } |
8130 | | |
8131 | 0 | flow = dp_netdev_flow_cast(rules[i]); |
8132 | 0 | uint32_t hash = dp_netdev_flow_hash(&flow->ufid); |
8133 | 0 | smc_insert(pmd, keys[i], hash); |
8134 | |
|
8135 | 0 | emc_probabilistic_insert(pmd, keys[i], flow); |
8136 | | /* Add these packets into the flow map in the same order |
8137 | | * as received. |
8138 | | */ |
8139 | 0 | tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf); |
8140 | 0 | packet_enqueue_to_flow_map(packet, flow, tcp_flags, |
8141 | 0 | flow_map, recv_idx); |
8142 | 0 | } |
8143 | |
|
8144 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT, |
8145 | 0 | cnt - upcall_ok_cnt - upcall_fail_cnt); |
8146 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP, |
8147 | 0 | lookup_cnt); |
8148 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS, |
8149 | 0 | upcall_ok_cnt); |
8150 | 0 | pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST, |
8151 | 0 | upcall_fail_cnt); |
8152 | 0 | } |
8153 | | |
8154 | | /* Packets enter the datapath from a port (or from recirculation) here. |
8155 | | * |
8156 | | * When 'md_is_valid' is true the metadata in 'packets' are already valid. |
8157 | | * When false the metadata in 'packets' need to be initialized. */ |
8158 | | static void |
8159 | | dp_netdev_input__(struct dp_netdev_pmd_thread *pmd, |
8160 | | struct dp_packet_batch *packets, |
8161 | | bool md_is_valid, odp_port_t port_no) |
8162 | 0 | { |
8163 | 0 | #if !defined(__CHECKER__) && !defined(_WIN32) |
8164 | 0 | const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets); |
8165 | | #else |
8166 | | /* Sparse or MSVC doesn't like variable length array. */ |
8167 | | enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST }; |
8168 | | #endif |
8169 | 0 | OVS_ALIGNED_VAR(CACHE_LINE_SIZE) |
8170 | 0 | struct netdev_flow_key keys[PKT_ARRAY_SIZE]; |
8171 | 0 | struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE]; |
8172 | 0 | struct packet_batch_per_flow batches[PKT_ARRAY_SIZE]; |
8173 | 0 | size_t n_batches; |
8174 | 0 | struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE]; |
8175 | 0 | uint8_t index_map[PKT_ARRAY_SIZE]; |
8176 | 0 | size_t n_flows, i; |
8177 | |
|
8178 | 0 | odp_port_t in_port; |
8179 | |
|
8180 | 0 | n_batches = 0; |
8181 | 0 | dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches, |
8182 | 0 | flow_map, &n_flows, index_map, md_is_valid, port_no); |
8183 | |
|
8184 | 0 | if (!dp_packet_batch_is_empty(packets)) { |
8185 | | /* Get ingress port from first packet's metadata. */ |
8186 | 0 | in_port = packets->packets[0]->md.in_port.odp_port; |
8187 | 0 | fast_path_processing(pmd, packets, missed_keys, |
8188 | 0 | flow_map, index_map, in_port); |
8189 | 0 | } |
8190 | | |
8191 | | /* Batch rest of packets which are in flow map. */ |
8192 | 0 | for (i = 0; i < n_flows; i++) { |
8193 | 0 | struct dp_packet_flow_map *map = &flow_map[i]; |
8194 | |
|
8195 | 0 | if (OVS_UNLIKELY(!map->flow)) { |
8196 | 0 | continue; |
8197 | 0 | } |
8198 | 0 | dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags, |
8199 | 0 | batches, &n_batches); |
8200 | 0 | } |
8201 | | |
8202 | | /* All the flow batches need to be reset before any call to |
8203 | | * packet_batch_per_flow_execute() as it could potentially trigger |
8204 | | * recirculation. When a packet matching flow 'j' happens to be |
8205 | | * recirculated, the nested call to dp_netdev_input__() could potentially |
8206 | | * classify the packet as matching another flow - say 'k'. It could happen |
8207 | | * that in the previous call to dp_netdev_input__() that same flow 'k' had |
8208 | | * already its own batches[k] still waiting to be served. So if its |
8209 | | * 'batch' member is not reset, the recirculated packet would be wrongly |
8210 | | * appended to batches[k] of the 1st call to dp_netdev_input__(). */ |
8211 | 0 | for (i = 0; i < n_batches; i++) { |
8212 | 0 | batches[i].flow->batch = NULL; |
8213 | 0 | } |
8214 | |
|
8215 | 0 | for (i = 0; i < n_batches; i++) { |
8216 | 0 | packet_batch_per_flow_execute(&batches[i], pmd); |
8217 | 0 | } |
8218 | 0 | } |
8219 | | |
8220 | | int32_t |
8221 | | dp_netdev_input(struct dp_netdev_pmd_thread *pmd, |
8222 | | struct dp_packet_batch *packets, |
8223 | | odp_port_t port_no) |
8224 | 0 | { |
8225 | 0 | dp_netdev_input__(pmd, packets, false, port_no); |
8226 | 0 | return 0; |
8227 | 0 | } |
8228 | | |
8229 | | static void |
8230 | | dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd, |
8231 | | struct dp_packet_batch *packets) |
8232 | 0 | { |
8233 | 0 | dp_netdev_input__(pmd, packets, true, 0); |
8234 | 0 | } |
8235 | | |
8236 | | struct dp_netdev_execute_aux { |
8237 | | struct dp_netdev_pmd_thread *pmd; |
8238 | | const struct flow *flow; |
8239 | | }; |
8240 | | |
8241 | | static void |
8242 | | dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb, |
8243 | | void *aux) |
8244 | 0 | { |
8245 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8246 | 0 | dp->dp_purge_aux = aux; |
8247 | 0 | dp->dp_purge_cb = cb; |
8248 | 0 | } |
8249 | | |
8250 | | static void |
8251 | | dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb, |
8252 | | void *aux) |
8253 | 0 | { |
8254 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8255 | 0 | dp->upcall_aux = aux; |
8256 | 0 | dp->upcall_cb = cb; |
8257 | 0 | } |
8258 | | |
8259 | | static void |
8260 | | dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd, |
8261 | | bool purge) |
8262 | 0 | { |
8263 | 0 | struct tx_port *tx; |
8264 | 0 | struct dp_netdev_port *port; |
8265 | 0 | long long interval; |
8266 | |
|
8267 | 0 | HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) { |
8268 | 0 | if (tx->port->txq_mode != TXQ_MODE_XPS) { |
8269 | 0 | continue; |
8270 | 0 | } |
8271 | 0 | interval = pmd->ctx.now - tx->last_used; |
8272 | 0 | if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) { |
8273 | 0 | port = tx->port; |
8274 | 0 | ovs_mutex_lock(&port->txq_used_mutex); |
8275 | 0 | port->txq_used[tx->qid]--; |
8276 | 0 | ovs_mutex_unlock(&port->txq_used_mutex); |
8277 | 0 | tx->qid = -1; |
8278 | 0 | } |
8279 | 0 | } |
8280 | 0 | } |
8281 | | |
8282 | | static int |
8283 | | dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd, |
8284 | | struct tx_port *tx) |
8285 | 0 | { |
8286 | 0 | struct dp_netdev_port *port; |
8287 | 0 | long long interval; |
8288 | 0 | int i, min_cnt, min_qid; |
8289 | |
|
8290 | 0 | interval = pmd->ctx.now - tx->last_used; |
8291 | 0 | tx->last_used = pmd->ctx.now; |
8292 | |
|
8293 | 0 | if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) { |
8294 | 0 | return tx->qid; |
8295 | 0 | } |
8296 | | |
8297 | 0 | port = tx->port; |
8298 | |
|
8299 | 0 | ovs_mutex_lock(&port->txq_used_mutex); |
8300 | 0 | if (tx->qid >= 0) { |
8301 | 0 | port->txq_used[tx->qid]--; |
8302 | 0 | tx->qid = -1; |
8303 | 0 | } |
8304 | |
|
8305 | 0 | min_cnt = -1; |
8306 | 0 | min_qid = 0; |
8307 | 0 | for (i = 0; i < netdev_n_txq(port->netdev); i++) { |
8308 | 0 | if (port->txq_used[i] < min_cnt || min_cnt == -1) { |
8309 | 0 | min_cnt = port->txq_used[i]; |
8310 | 0 | min_qid = i; |
8311 | 0 | } |
8312 | 0 | } |
8313 | |
|
8314 | 0 | port->txq_used[min_qid]++; |
8315 | 0 | tx->qid = min_qid; |
8316 | |
|
8317 | 0 | ovs_mutex_unlock(&port->txq_used_mutex); |
8318 | |
|
8319 | 0 | dpif_netdev_xps_revalidate_pmd(pmd, false); |
8320 | |
|
8321 | 0 | VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.", |
8322 | 0 | pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev)); |
8323 | 0 | return min_qid; |
8324 | 0 | } |
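/*
 * A minimal standalone sketch of the XPS queue choice above: scan the
 * per-port usage counters, pick the least-used tx queue, and charge it to
 * the caller.  The real code additionally caches the choice in 'tx->qid'
 * and only rescans after XPS_TIMEOUT; here the usage array is plain ints
 * protected by whatever lock the caller holds.
 */
static int
sketch_pick_least_used_txq(int *txq_used, int n_txq)
{
    int min_cnt = -1;
    int min_qid = 0;

    for (int i = 0; i < n_txq; i++) {
        if (min_cnt == -1 || txq_used[i] < min_cnt) {
            min_cnt = txq_used[i];
            min_qid = i;
        }
    }
    txq_used[min_qid]++;    /* The caller now owns one share of this queue. */
    return min_qid;
}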
8325 | | |
8326 | | static struct tx_port * |
8327 | | pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd, |
8328 | | odp_port_t port_no) |
8329 | 0 | { |
8330 | 0 | return tx_port_lookup(&pmd->tnl_port_cache, port_no); |
8331 | 0 | } |
8332 | | |
8333 | | static struct tx_port * |
8334 | | pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd, |
8335 | | odp_port_t port_no) |
8336 | 0 | { |
8337 | 0 | return tx_port_lookup(&pmd->send_port_cache, port_no); |
8338 | 0 | } |
8339 | | |
8340 | | static int |
8341 | | push_tnl_action(const struct dp_netdev_pmd_thread *pmd, |
8342 | | const struct nlattr *attr, |
8343 | | struct dp_packet_batch *batch) |
8344 | 0 | { |
8345 | 0 | struct tx_port *tun_port; |
8346 | 0 | const struct ovs_action_push_tnl *data; |
8347 | 0 | int err; |
8348 | |
|
8349 | 0 | data = nl_attr_get(attr); |
8350 | |
|
8351 | 0 | tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port); |
8352 | 0 | if (!tun_port) { |
8353 | 0 | err = -EINVAL; |
8354 | 0 | goto error; |
8355 | 0 | } |
8356 | 0 | err = netdev_push_header(tun_port->port->netdev, batch, data); |
8357 | 0 | if (!err) { |
8358 | 0 | return 0; |
8359 | 0 | } |
8360 | 0 | error: |
8361 | 0 | dp_packet_delete_batch(batch, true); |
8362 | 0 | return err; |
8363 | 0 | } |
8364 | | |
8365 | | static void |
8366 | | dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd, |
8367 | | struct dp_packet *packet, bool should_steal, |
8368 | | struct flow *flow, ovs_u128 *ufid, |
8369 | | struct ofpbuf *actions, |
8370 | | const struct nlattr *userdata) |
8371 | 0 | { |
8372 | 0 | struct dp_packet_batch b; |
8373 | 0 | int error; |
8374 | |
|
8375 | 0 | ofpbuf_clear(actions); |
8376 | |
|
8377 | 0 | error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid, |
8378 | 0 | DPIF_UC_ACTION, userdata, actions, |
8379 | 0 | NULL); |
8380 | 0 | if (!error || error == ENOSPC) { |
8381 | 0 | dp_packet_batch_init_packet(&b, packet); |
8382 | 0 | dp_netdev_execute_actions(pmd, &b, should_steal, flow, |
8383 | 0 | actions->data, actions->size); |
8384 | 0 | } else if (should_steal) { |
8385 | 0 | dp_packet_delete(packet); |
8386 | 0 | COVERAGE_INC(datapath_drop_userspace_action_error); |
8387 | 0 | } |
8388 | 0 | } |
8389 | | |
8390 | | static bool |
8391 | | dp_execute_output_action(struct dp_netdev_pmd_thread *pmd, |
8392 | | struct dp_packet_batch *packets_, |
8393 | | bool should_steal, odp_port_t port_no) |
8394 | 0 | { |
8395 | 0 | struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no); |
8396 | 0 | struct dp_packet_batch out; |
8397 | |
|
8398 | 0 | if (!OVS_LIKELY(p)) { |
8399 | 0 | COVERAGE_ADD(datapath_drop_invalid_port, |
8400 | 0 | dp_packet_batch_size(packets_)); |
8401 | 0 | dp_packet_delete_batch(packets_, should_steal); |
8402 | 0 | return false; |
8403 | 0 | } |
8404 | 0 | if (!should_steal) { |
8405 | 0 | dp_packet_batch_clone(&out, packets_); |
8406 | 0 | dp_packet_batch_reset_cutlen(packets_); |
8407 | 0 | packets_ = &out; |
8408 | 0 | } |
8409 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8410 | 0 | if (dp_packet_batch_size(&p->output_pkts) |
8411 | 0 | + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) { |
8412 | | /* Flush here to avoid overflow. */ |
8413 | 0 | dp_netdev_pmd_flush_output_on_port(pmd, p); |
8414 | 0 | } |
8415 | 0 | if (dp_packet_batch_is_empty(&p->output_pkts)) { |
8416 | 0 | pmd->n_output_batches++; |
8417 | 0 | } |
8418 | |
|
8419 | 0 | struct dp_packet *packet; |
8420 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8421 | 0 | p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] = |
8422 | 0 | pmd->ctx.last_rxq; |
8423 | 0 | dp_packet_batch_add(&p->output_pkts, packet); |
8424 | 0 | } |
8425 | 0 | return true; |
8426 | 0 | } |
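/*
 * A minimal standalone sketch of the flush-before-overflow logic above:
 * packets for the same output port accumulate in a fixed-size per-port
 * batch, and the batch is flushed whenever appending the incoming burst
 * would exceed its capacity.  The flush callback stands in for
 * dp_netdev_pmd_flush_output_on_port(); the incoming burst is assumed to be
 * no larger than the batch capacity, as in the datapath.
 */
#include <stddef.h>

#define SKETCH_OUT_BURST 32

struct sketch_out_batch {
    void *pkts[SKETCH_OUT_BURST];
    size_t n;
};

static void
sketch_output_append(struct sketch_out_batch *out, void **pkts, size_t cnt,
                     void (*flush)(struct sketch_out_batch *))
{
    if (out->n + cnt > SKETCH_OUT_BURST) {
        flush(out);                 /* Transmit what is queued so far... */
        out->n = 0;                 /* ...and start a fresh batch. */
    }
    for (size_t i = 0; i < cnt; i++) {
        out->pkts[out->n++] = pkts[i];
    }
}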
8427 | | |
8428 | | static void |
8429 | | dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd, |
8430 | | struct dp_packet_batch *packets_, |
8431 | | bool should_steal, uint32_t bond) |
8432 | 0 | { |
8433 | 0 | struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond); |
8434 | 0 | struct dp_packet_batch out; |
8435 | 0 | struct dp_packet *packet; |
8436 | |
|
8437 | 0 | if (!p_bond) { |
8438 | 0 | COVERAGE_ADD(datapath_drop_invalid_bond, |
8439 | 0 | dp_packet_batch_size(packets_)); |
8440 | 0 | dp_packet_delete_batch(packets_, should_steal); |
8441 | 0 | return; |
8442 | 0 | } |
8443 | 0 | if (!should_steal) { |
8444 | 0 | dp_packet_batch_clone(&out, packets_); |
8445 | 0 | dp_packet_batch_reset_cutlen(packets_); |
8446 | 0 | packets_ = &out; |
8447 | 0 | } |
8448 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8449 | |
|
8450 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8451 | | /* |
8452 | |          * Look up the bond hash table using the packet hash to get the member.
8453 | | */ |
8454 | 0 | uint32_t hash = dp_packet_get_rss_hash(packet); |
8455 | 0 | struct member_entry *s_entry |
8456 | 0 | = &p_bond->member_buckets[hash & BOND_MASK]; |
8457 | 0 | odp_port_t bond_member = s_entry->member_id; |
8458 | 0 | uint32_t size = dp_packet_size(packet); |
8459 | 0 | struct dp_packet_batch output_pkt; |
8460 | |
|
8461 | 0 | dp_packet_batch_init_packet(&output_pkt, packet); |
8462 | 0 | if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true, |
8463 | 0 | bond_member))) { |
8464 | | /* Update member stats. */ |
8465 | 0 | non_atomic_ullong_add(&s_entry->n_packets, 1); |
8466 | 0 | non_atomic_ullong_add(&s_entry->n_bytes, size); |
8467 | 0 | } |
8468 | 0 | } |
8469 | 0 | } |
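/*
 * A minimal standalone sketch of the bucket selection above: the packet's
 * RSS hash, masked to a power-of-two bucket count, picks a bucket, each
 * bucket maps to one bond member port, and per-bucket packet/byte counters
 * feed later rebalancing.  A 256-bucket table is assumed here; the struct
 * is a placeholder, not the real member_entry.
 */
#include <stdint.h>

#define SKETCH_BOND_BUCKETS 256
#define SKETCH_BOND_MASK (SKETCH_BOND_BUCKETS - 1)

struct sketch_member_entry {
    uint32_t member_port;   /* Output port backing this bucket. */
    uint64_t n_packets;
    uint64_t n_bytes;
};

static uint32_t
sketch_bond_pick_member(struct sketch_member_entry *buckets,
                        uint32_t rss_hash, uint32_t pkt_bytes)
{
    struct sketch_member_entry *e = &buckets[rss_hash & SKETCH_BOND_MASK];

    e->n_packets += 1;
    e->n_bytes += pkt_bytes;
    return e->member_port;
}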
8470 | | |
8471 | | static void |
8472 | | dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, |
8473 | | const struct nlattr *a, bool should_steal) |
8474 | | OVS_NO_THREAD_SAFETY_ANALYSIS |
8475 | 0 | { |
8476 | 0 | struct dp_netdev_execute_aux *aux = aux_; |
8477 | 0 | uint32_t *depth = recirc_depth_get(); |
8478 | 0 | struct dp_netdev_pmd_thread *pmd = aux->pmd; |
8479 | 0 | struct dp_netdev *dp = pmd->dp; |
8480 | 0 | int type = nl_attr_type(a); |
8481 | 0 | struct tx_port *p; |
8482 | 0 | uint32_t packet_count, packets_dropped; |
8483 | |
|
8484 | 0 | switch ((enum ovs_action_attr)type) { |
8485 | 0 | case OVS_ACTION_ATTR_OUTPUT: |
8486 | 0 | dp_execute_output_action(pmd, packets_, should_steal, |
8487 | 0 | nl_attr_get_odp_port(a)); |
8488 | 0 | return; |
8489 | | |
8490 | 0 | case OVS_ACTION_ATTR_LB_OUTPUT: |
8491 | 0 | dp_execute_lb_output_action(pmd, packets_, should_steal, |
8492 | 0 | nl_attr_get_u32(a)); |
8493 | 0 | return; |
8494 | | |
8495 | 0 | case OVS_ACTION_ATTR_TUNNEL_PUSH: |
8496 | 0 | if (should_steal) { |
8497 | | /* We're requested to push tunnel header, but also we need to take |
8498 | | * the ownership of these packets. Thus, we can avoid performing |
8499 | | * the action, because the caller will not use the result anyway. |
8500 | | * Just break to free the batch. */ |
8501 | 0 | break; |
8502 | 0 | } |
8503 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8504 | 0 | packet_count = dp_packet_batch_size(packets_); |
8505 | 0 | if (push_tnl_action(pmd, a, packets_)) { |
8506 | 0 | COVERAGE_ADD(datapath_drop_tunnel_push_error, |
8507 | 0 | packet_count); |
8508 | 0 | } |
8509 | 0 | return; |
8510 | | |
8511 | 0 | case OVS_ACTION_ATTR_TUNNEL_POP: |
8512 | 0 | if (*depth < MAX_RECIRC_DEPTH) { |
8513 | 0 | struct dp_packet_batch *orig_packets_ = packets_; |
8514 | 0 | odp_port_t portno = nl_attr_get_odp_port(a); |
8515 | |
|
8516 | 0 | p = pmd_tnl_port_cache_lookup(pmd, portno); |
8517 | 0 | if (p) { |
8518 | 0 | struct dp_packet_batch tnl_pkt; |
8519 | |
|
8520 | 0 | if (!should_steal) { |
8521 | 0 | dp_packet_batch_clone(&tnl_pkt, packets_); |
8522 | 0 | packets_ = &tnl_pkt; |
8523 | 0 | dp_packet_batch_reset_cutlen(orig_packets_); |
8524 | 0 | } |
8525 | |
|
8526 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8527 | |
|
8528 | 0 | packet_count = dp_packet_batch_size(packets_); |
8529 | 0 | netdev_pop_header(p->port->netdev, packets_); |
8530 | 0 | packets_dropped = |
8531 | 0 | packet_count - dp_packet_batch_size(packets_); |
8532 | 0 | if (packets_dropped) { |
8533 | 0 | COVERAGE_ADD(datapath_drop_tunnel_pop_error, |
8534 | 0 | packets_dropped); |
8535 | 0 | } |
8536 | 0 | if (dp_packet_batch_is_empty(packets_)) { |
8537 | 0 | return; |
8538 | 0 | } |
8539 | | |
8540 | 0 | struct dp_packet *packet; |
8541 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8542 | 0 | packet->md.in_port.odp_port = portno; |
8543 | 0 | } |
8544 | |
|
8545 | 0 | (*depth)++; |
8546 | 0 | dp_netdev_recirculate(pmd, packets_); |
8547 | 0 | (*depth)--; |
8548 | 0 | return; |
8549 | 0 | } |
8550 | 0 | COVERAGE_ADD(datapath_drop_invalid_tnl_port, |
8551 | 0 | dp_packet_batch_size(packets_)); |
8552 | 0 | } else { |
8553 | 0 | COVERAGE_ADD(datapath_drop_recirc_error, |
8554 | 0 | dp_packet_batch_size(packets_)); |
8555 | 0 | } |
8556 | 0 | break; |
8557 | | |
8558 | 0 | case OVS_ACTION_ATTR_USERSPACE: |
8559 | 0 | if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) { |
8560 | 0 | struct dp_packet_batch *orig_packets_ = packets_; |
8561 | 0 | const struct nlattr *userdata; |
8562 | 0 | struct dp_packet_batch usr_pkt; |
8563 | 0 | struct ofpbuf actions; |
8564 | 0 | struct flow flow; |
8565 | 0 | ovs_u128 ufid; |
8566 | 0 | bool clone = false; |
8567 | |
|
8568 | 0 | userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA); |
8569 | 0 | ofpbuf_init(&actions, 0); |
8570 | |
|
8571 | 0 | if (packets_->trunc) { |
8572 | 0 | if (!should_steal) { |
8573 | 0 | dp_packet_batch_clone(&usr_pkt, packets_); |
8574 | 0 | packets_ = &usr_pkt; |
8575 | 0 | clone = true; |
8576 | 0 | dp_packet_batch_reset_cutlen(orig_packets_); |
8577 | 0 | } |
8578 | |
|
8579 | 0 | dp_packet_batch_apply_cutlen(packets_); |
8580 | 0 | } |
8581 | |
|
8582 | 0 | struct dp_packet *packet; |
8583 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8584 | 0 | flow_extract(packet, &flow); |
8585 | 0 | odp_flow_key_hash(&flow, sizeof flow, &ufid); |
8586 | 0 | dp_execute_userspace_action(pmd, packet, should_steal, &flow, |
8587 | 0 | &ufid, &actions, userdata); |
8588 | 0 | } |
8589 | |
|
8590 | 0 | if (clone) { |
8591 | 0 | dp_packet_delete_batch(packets_, true); |
8592 | 0 | } |
8593 | |
|
8594 | 0 | ofpbuf_uninit(&actions); |
8595 | 0 | fat_rwlock_unlock(&dp->upcall_rwlock); |
8596 | |
|
8597 | 0 | return; |
8598 | 0 | } |
8599 | 0 | COVERAGE_ADD(datapath_drop_lock_error, |
8600 | 0 | dp_packet_batch_size(packets_)); |
8601 | 0 | break; |
8602 | | |
8603 | 0 | case OVS_ACTION_ATTR_RECIRC: |
8604 | 0 | if (*depth < MAX_RECIRC_DEPTH) { |
8605 | 0 | struct dp_packet_batch recirc_pkts; |
8606 | |
|
8607 | 0 | if (!should_steal) { |
8608 | 0 | dp_packet_batch_clone(&recirc_pkts, packets_); |
8609 | 0 | packets_ = &recirc_pkts; |
8610 | 0 | } |
8611 | |
|
8612 | 0 | struct dp_packet *packet; |
8613 | 0 | DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { |
8614 | 0 | packet->md.recirc_id = nl_attr_get_u32(a); |
8615 | 0 | } |
8616 | |
|
8617 | 0 | (*depth)++; |
8618 | 0 | dp_netdev_recirculate(pmd, packets_); |
8619 | 0 | (*depth)--; |
8620 | |
|
8621 | 0 | return; |
8622 | 0 | } |
8623 | | |
8624 | 0 | COVERAGE_ADD(datapath_drop_recirc_error, |
8625 | 0 | dp_packet_batch_size(packets_)); |
8626 | 0 | VLOG_WARN("Packet dropped. Max recirculation depth exceeded."); |
8627 | 0 | break; |
8628 | | |
8629 | 0 | case OVS_ACTION_ATTR_CT: { |
8630 | 0 | const struct nlattr *b; |
8631 | 0 | bool force = false; |
8632 | 0 | bool commit = false; |
8633 | 0 | unsigned int left; |
8634 | 0 | uint16_t zone = 0; |
8635 | 0 | uint32_t tp_id = 0; |
8636 | 0 | const char *helper = NULL; |
8637 | 0 | const uint32_t *setmark = NULL; |
8638 | 0 | const struct ovs_key_ct_labels *setlabel = NULL; |
8639 | 0 | struct nat_action_info_t nat_action_info; |
8640 | 0 | struct nat_action_info_t *nat_action_info_ref = NULL; |
8641 | 0 | bool nat_config = false; |
8642 | |
|
8643 | 0 | NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a), |
8644 | 0 | nl_attr_get_size(a)) { |
8645 | 0 | enum ovs_ct_attr sub_type = nl_attr_type(b); |
8646 | |
|
8647 | 0 | switch(sub_type) { |
8648 | 0 | case OVS_CT_ATTR_FORCE_COMMIT: |
8649 | 0 | force = true; |
8650 | | /* fall through. */ |
8651 | 0 | case OVS_CT_ATTR_COMMIT: |
8652 | 0 | commit = true; |
8653 | 0 | break; |
8654 | 0 | case OVS_CT_ATTR_ZONE: |
8655 | 0 | zone = nl_attr_get_u16(b); |
8656 | 0 | break; |
8657 | 0 | case OVS_CT_ATTR_HELPER: |
8658 | 0 | helper = nl_attr_get_string(b); |
8659 | 0 | break; |
8660 | 0 | case OVS_CT_ATTR_MARK: |
8661 | 0 | setmark = nl_attr_get(b); |
8662 | 0 | break; |
8663 | 0 | case OVS_CT_ATTR_LABELS: |
8664 | 0 | setlabel = nl_attr_get(b); |
8665 | 0 | break; |
8666 | 0 | case OVS_CT_ATTR_EVENTMASK: |
8667 | | /* Silently ignored, as userspace datapath does not generate |
8668 | | * netlink events. */ |
8669 | 0 | break; |
8670 | 0 | case OVS_CT_ATTR_TIMEOUT: |
8671 | 0 | if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) { |
8672 | 0 | VLOG_WARN("Invalid Timeout Policy ID: %s.", |
8673 | 0 | nl_attr_get_string(b)); |
8674 | 0 | tp_id = DEFAULT_TP_ID; |
8675 | 0 | } |
8676 | 0 | break; |
8677 | 0 | case OVS_CT_ATTR_NAT: { |
8678 | 0 | const struct nlattr *b_nest; |
8679 | 0 | unsigned int left_nest; |
8680 | 0 | bool ip_min_specified = false; |
8681 | 0 | bool proto_num_min_specified = false; |
8682 | 0 | bool ip_max_specified = false; |
8683 | 0 | bool proto_num_max_specified = false; |
8684 | 0 | memset(&nat_action_info, 0, sizeof nat_action_info); |
8685 | 0 | nat_action_info_ref = &nat_action_info; |
8686 | |
|
8687 | 0 | NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) { |
8688 | 0 | enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest); |
8689 | |
|
8690 | 0 | switch (sub_type_nest) { |
8691 | 0 | case OVS_NAT_ATTR_SRC: |
8692 | 0 | case OVS_NAT_ATTR_DST: |
8693 | 0 | nat_config = true; |
8694 | 0 | nat_action_info.nat_action |= |
8695 | 0 | ((sub_type_nest == OVS_NAT_ATTR_SRC) |
8696 | 0 | ? NAT_ACTION_SRC : NAT_ACTION_DST); |
8697 | 0 | break; |
8698 | 0 | case OVS_NAT_ATTR_IP_MIN: |
8699 | 0 | memcpy(&nat_action_info.min_addr, |
8700 | 0 | nl_attr_get(b_nest), |
8701 | 0 | nl_attr_get_size(b_nest)); |
8702 | 0 | ip_min_specified = true; |
8703 | 0 | break; |
8704 | 0 | case OVS_NAT_ATTR_IP_MAX: |
8705 | 0 | memcpy(&nat_action_info.max_addr, |
8706 | 0 | nl_attr_get(b_nest), |
8707 | 0 | nl_attr_get_size(b_nest)); |
8708 | 0 | ip_max_specified = true; |
8709 | 0 | break; |
8710 | 0 | case OVS_NAT_ATTR_PROTO_MIN: |
8711 | 0 | nat_action_info.min_port = |
8712 | 0 | nl_attr_get_u16(b_nest); |
8713 | 0 | proto_num_min_specified = true; |
8714 | 0 | break; |
8715 | 0 | case OVS_NAT_ATTR_PROTO_MAX: |
8716 | 0 | nat_action_info.max_port = |
8717 | 0 | nl_attr_get_u16(b_nest); |
8718 | 0 | proto_num_max_specified = true; |
8719 | 0 | break; |
8720 | 0 | case OVS_NAT_ATTR_PROTO_RANDOM: |
8721 | 0 | nat_action_info.nat_flags |= NAT_RANGE_RANDOM; |
8722 | 0 | break; |
8723 | 0 | case OVS_NAT_ATTR_PERSISTENT: |
8724 | 0 | nat_action_info.nat_flags |= NAT_PERSISTENT; |
8725 | 0 | break; |
8726 | 0 | case OVS_NAT_ATTR_PROTO_HASH: |
8727 | 0 | break; |
8728 | 0 | case OVS_NAT_ATTR_UNSPEC: |
8729 | 0 | case __OVS_NAT_ATTR_MAX: |
8730 | 0 | OVS_NOT_REACHED(); |
8731 | 0 | } |
8732 | 0 | } |
8733 | | |
8734 | 0 | if (ip_min_specified && !ip_max_specified) { |
8735 | 0 | nat_action_info.max_addr = nat_action_info.min_addr; |
8736 | 0 | } |
8737 | 0 | if (proto_num_min_specified && !proto_num_max_specified) { |
8738 | 0 | nat_action_info.max_port = nat_action_info.min_port; |
8739 | 0 | } |
8740 | 0 | if (proto_num_min_specified || proto_num_max_specified) { |
8741 | 0 | if (nat_action_info.nat_action & NAT_ACTION_SRC) { |
8742 | 0 | nat_action_info.nat_action |= NAT_ACTION_SRC_PORT; |
8743 | 0 | } else if (nat_action_info.nat_action & NAT_ACTION_DST) { |
8744 | 0 | nat_action_info.nat_action |= NAT_ACTION_DST_PORT; |
8745 | 0 | } |
8746 | 0 | } |
8747 | 0 | break; |
8748 | 0 | } |
8749 | 0 | case OVS_CT_ATTR_UNSPEC: |
8750 | 0 | case __OVS_CT_ATTR_MAX: |
8751 | 0 | OVS_NOT_REACHED(); |
8752 | 0 | } |
8753 | 0 | } |
8754 | | |
8755 | | /* We won't be able to function properly in this case, hence |
8756 | | * complain loudly. */ |
8757 | 0 | if (nat_config && !commit) { |
8758 | 0 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); |
8759 | 0 | VLOG_WARN_RL(&rl, "NAT specified without commit."); |
8760 | 0 | } |
8761 | |
|
8762 | 0 | conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force, |
8763 | 0 | commit, zone, setmark, setlabel, helper, |
8764 | 0 | nat_action_info_ref, pmd->ctx.now / 1000, tp_id); |
8765 | 0 | break; |
8766 | 0 | } |
8767 | | |
8768 | 0 | case OVS_ACTION_ATTR_METER: |
8769 | 0 | dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a), |
8770 | 0 | pmd->ctx.now / 1000); |
8771 | 0 | break; |
8772 | | |
8773 | 0 | case OVS_ACTION_ATTR_PUSH_VLAN: |
8774 | 0 | case OVS_ACTION_ATTR_POP_VLAN: |
8775 | 0 | case OVS_ACTION_ATTR_PUSH_MPLS: |
8776 | 0 | case OVS_ACTION_ATTR_POP_MPLS: |
8777 | 0 | case OVS_ACTION_ATTR_SET: |
8778 | 0 | case OVS_ACTION_ATTR_SET_MASKED: |
8779 | 0 | case OVS_ACTION_ATTR_SAMPLE: |
8780 | 0 | case OVS_ACTION_ATTR_HASH: |
8781 | 0 | case OVS_ACTION_ATTR_UNSPEC: |
8782 | 0 | case OVS_ACTION_ATTR_TRUNC: |
8783 | 0 | case OVS_ACTION_ATTR_PUSH_ETH: |
8784 | 0 | case OVS_ACTION_ATTR_POP_ETH: |
8785 | 0 | case OVS_ACTION_ATTR_CLONE: |
8786 | 0 | case OVS_ACTION_ATTR_PUSH_NSH: |
8787 | 0 | case OVS_ACTION_ATTR_POP_NSH: |
8788 | 0 | case OVS_ACTION_ATTR_CT_CLEAR: |
8789 | 0 | case OVS_ACTION_ATTR_CHECK_PKT_LEN: |
8790 | 0 | case OVS_ACTION_ATTR_DROP: |
8791 | 0 | case OVS_ACTION_ATTR_ADD_MPLS: |
8792 | 0 | case OVS_ACTION_ATTR_DEC_TTL: |
8793 | 0 | case OVS_ACTION_ATTR_PSAMPLE: |
8794 | 0 | case __OVS_ACTION_ATTR_MAX: |
8795 | 0 | OVS_NOT_REACHED(); |
8796 | 0 | } |
8797 | | |
8798 | 0 | dp_packet_delete_batch(packets_, should_steal); |
8799 | 0 | } |
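/*
 * A minimal standalone sketch of the recirculation depth guard used by the
 * OVS_ACTION_ATTR_RECIRC and TUNNEL_POP cases above: a per-thread depth
 * counter brackets the recursive re-entry into packet processing, and once
 * the limit is reached the batch is refused so the caller can drop and
 * count it.  A limit of 6 is assumed for the sketch; the reprocess callback
 * stands in for dp_netdev_recirculate().
 */
#include <stdbool.h>

#define SKETCH_MAX_RECIRC_DEPTH 6

static bool
sketch_recirculate(unsigned int *depth,
                   void (*reprocess)(void *batch), void *batch)
{
    if (*depth >= SKETCH_MAX_RECIRC_DEPTH) {
        return false;       /* Caller drops the batch and bumps a counter. */
    }
    (*depth)++;
    reprocess(batch);       /* May recirculate again, bounded by 'depth'. */
    (*depth)--;
    return true;
}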
8800 | | |
8801 | | static void |
8802 | | dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd, |
8803 | | struct dp_packet_batch *packets, |
8804 | | bool should_steal, const struct flow *flow, |
8805 | | const struct nlattr *actions, size_t actions_len) |
8806 | 0 | { |
8807 | 0 | struct dp_netdev_execute_aux aux = { pmd, flow }; |
8808 | |
|
8809 | 0 | odp_execute_actions(&aux, packets, should_steal, actions, |
8810 | 0 | actions_len, dp_execute_cb); |
8811 | 0 | } |
8812 | | |
8813 | | struct dp_netdev_ct_dump { |
8814 | | struct ct_dpif_dump_state up; |
8815 | | struct conntrack_dump dump; |
8816 | | struct conntrack *ct; |
8817 | | struct dp_netdev *dp; |
8818 | | }; |
8819 | | |
8820 | | static int |
8821 | | dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_, |
8822 | | const uint16_t *pzone, int *ptot_bkts) |
8823 | 0 | { |
8824 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8825 | 0 | struct dp_netdev_ct_dump *dump; |
8826 | |
|
8827 | 0 | dump = xzalloc(sizeof *dump); |
8828 | 0 | dump->dp = dp; |
8829 | 0 | dump->ct = dp->conntrack; |
8830 | |
|
8831 | 0 | conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts); |
8832 | |
|
8833 | 0 | *dump_ = &dump->up; |
8834 | |
|
8835 | 0 | return 0; |
8836 | 0 | } |
8837 | | |
8838 | | static int |
8839 | | dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED, |
8840 | | struct ct_dpif_dump_state *dump_, |
8841 | | struct ct_dpif_entry *entry) |
8842 | 0 | { |
8843 | 0 | struct dp_netdev_ct_dump *dump; |
8844 | |
|
8845 | 0 | INIT_CONTAINER(dump, dump_, up); |
8846 | |
|
8847 | 0 | return conntrack_dump_next(&dump->dump, entry); |
8848 | 0 | } |
8849 | | |
8850 | | static int |
8851 | | dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED, |
8852 | | struct ct_dpif_dump_state *dump_) |
8853 | 0 | { |
8854 | 0 | struct dp_netdev_ct_dump *dump; |
8855 | 0 | int err; |
8856 | |
|
8857 | 0 | INIT_CONTAINER(dump, dump_, up); |
8858 | |
|
8859 | 0 | err = conntrack_dump_done(&dump->dump); |
8860 | |
|
8861 | 0 | free(dump); |
8862 | |
|
8863 | 0 | return err; |
8864 | 0 | } |
8865 | | |
8866 | | static int |
8867 | | dpif_netdev_ct_exp_dump_start(struct dpif *dpif, |
8868 | | struct ct_dpif_dump_state **dump_, |
8869 | | const uint16_t *pzone) |
8870 | 0 | { |
8871 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8872 | 0 | struct dp_netdev_ct_dump *dump; |
8873 | |
|
8874 | 0 | dump = xzalloc(sizeof *dump); |
8875 | 0 | dump->dp = dp; |
8876 | 0 | dump->ct = dp->conntrack; |
8877 | |
|
8878 | 0 | conntrack_exp_dump_start(dp->conntrack, &dump->dump, pzone); |
8879 | |
8880 | 0 | *dump_ = &dump->up; |
8881 | |
8882 | 0 | return 0; |
8883 | 0 | } |
8884 | | |
8885 | | static int |
8886 | | dpif_netdev_ct_exp_dump_next(struct dpif *dpif OVS_UNUSED, |
8887 | | struct ct_dpif_dump_state *dump_, |
8888 | | struct ct_dpif_exp *entry) |
8889 | 0 | { |
8890 | 0 | struct dp_netdev_ct_dump *dump; |
8891 | |
8892 | 0 | INIT_CONTAINER(dump, dump_, up); |
8893 | |
8894 | 0 | return conntrack_exp_dump_next(&dump->dump, entry); |
8895 | 0 | } |
8896 | | |
8897 | | static int |
8898 | | dpif_netdev_ct_exp_dump_done(struct dpif *dpif OVS_UNUSED, |
8899 | | struct ct_dpif_dump_state *dump_) |
8900 | 0 | { |
8901 | 0 | struct dp_netdev_ct_dump *dump; |
8902 | 0 | int err; |
8903 | |
8904 | 0 | INIT_CONTAINER(dump, dump_, up); |
8905 | |
8906 | 0 | err = conntrack_exp_dump_done(&dump->dump); |
8907 | |
8908 | 0 | free(dump); |
8909 | |
8910 | 0 | return err; |
8911 | 0 | } |
8912 | | |
8913 | | static int |
8914 | | dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone, |
8915 | | const struct ct_dpif_tuple *tuple) |
8916 | 0 | { |
8917 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8918 | |
8919 | 0 | if (tuple) { |
8920 | 0 | return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0); |
8921 | 0 | } |
8922 | 0 | return conntrack_flush(dp->conntrack, zone); |
8923 | 0 | } |
8924 | | |
8925 | | static int |
8926 | | dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns) |
8927 | 0 | { |
8928 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8929 | |
8930 | 0 | return conntrack_set_maxconns(dp->conntrack, maxconns); |
8931 | 0 | } |
8932 | | |
8933 | | static int |
8934 | | dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns) |
8935 | 0 | { |
8936 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8937 | |
8938 | 0 | return conntrack_get_maxconns(dp->conntrack, maxconns); |
8939 | 0 | } |
8940 | | |
8941 | | static int |
8942 | | dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns) |
8943 | 0 | { |
8944 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8945 | |
8946 | 0 | return conntrack_get_nconns(dp->conntrack, nconns); |
8947 | 0 | } |
8948 | | |
8949 | | static int |
8950 | | dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled) |
8951 | 0 | { |
8952 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8953 | |
8954 | 0 | return conntrack_set_tcp_seq_chk(dp->conntrack, enabled); |
8955 | 0 | } |
8956 | | |
8957 | | static int |
8958 | | dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled) |
8959 | 0 | { |
8960 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8961 | 0 | *enabled = conntrack_get_tcp_seq_chk(dp->conntrack); |
8962 | 0 | return 0; |
8963 | 0 | } |
8964 | | |
8965 | | static int |
8966 | | dpif_netdev_ct_set_sweep_interval(struct dpif *dpif, uint32_t ms) |
8967 | 0 | { |
8968 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8969 | 0 | return conntrack_set_sweep_interval(dp->conntrack, ms); |
8970 | 0 | } |
8971 | | |
8972 | | static int |
8973 | | dpif_netdev_ct_get_sweep_interval(struct dpif *dpif, uint32_t *ms) |
8974 | 0 | { |
8975 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8976 | 0 | *ms = conntrack_get_sweep_interval(dp->conntrack); |
8977 | 0 | return 0; |
8978 | 0 | } |
8979 | | |
8980 | | static int |
8981 | | dpif_netdev_ct_set_limits(struct dpif *dpif, |
8982 | | const struct ovs_list *zone_limits) |
8983 | 0 | { |
8984 | 0 | int err = 0; |
8985 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
8986 | |
8987 | 0 | struct ct_dpif_zone_limit *zone_limit; |
8988 | 0 | LIST_FOR_EACH (zone_limit, node, zone_limits) { |
8989 | 0 | err = zone_limit_update(dp->conntrack, zone_limit->zone, |
8990 | 0 | zone_limit->limit); |
8991 | 0 | if (err != 0) { |
8992 | 0 | break; |
8993 | 0 | } |
8994 | 0 | } |
8995 | 0 | return err; |
8996 | 0 | } |
8997 | | |
8998 | | static int |
8999 | | dpif_netdev_ct_get_limits(struct dpif *dpif, |
9000 | | const struct ovs_list *zone_limits_request, |
9001 | | struct ovs_list *zone_limits_reply) |
9002 | 0 | { |
9003 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9004 | 0 | struct conntrack_zone_info czl; |
9005 | |
9006 | 0 | if (!ovs_list_is_empty(zone_limits_request)) { |
9007 | 0 | struct ct_dpif_zone_limit *zone_limit; |
9008 | 0 | LIST_FOR_EACH (zone_limit, node, zone_limits_request) { |
9009 | 0 | czl = zone_limit_get(dp->conntrack, zone_limit->zone); |
9010 | 0 | if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) { |
9011 | 0 | ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone, |
9012 | 0 | czl.limit, |
9013 | 0 | czl.count); |
9014 | 0 | } else { |
9015 | 0 | return EINVAL; |
9016 | 0 | } |
9017 | 0 | } |
9018 | 0 | } else { |
9019 | 0 | czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE); |
9020 | 0 | if (czl.zone == DEFAULT_ZONE) { |
9021 | 0 | ct_dpif_push_zone_limit(zone_limits_reply, DEFAULT_ZONE, |
9022 | 0 | czl.limit, 0); |
9023 | 0 | } |
9024 | |
9025 | 0 | for (int z = MIN_ZONE; z <= MAX_ZONE; z++) { |
9026 | 0 | czl = zone_limit_get(dp->conntrack, z); |
9027 | 0 | if (czl.zone == z) { |
9028 | 0 | ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit, |
9029 | 0 | czl.count); |
9030 | 0 | } |
9031 | 0 | } |
9032 | 0 | } |
9033 | | |
9034 | 0 | return 0; |
9035 | 0 | } |
9036 | | |
9037 | | static int |
9038 | | dpif_netdev_ct_del_limits(struct dpif *dpif, |
9039 | | const struct ovs_list *zone_limits) |
9040 | 0 | { |
9041 | 0 | int err = 0; |
9042 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9043 | 0 | struct ct_dpif_zone_limit *zone_limit; |
9044 | 0 | LIST_FOR_EACH (zone_limit, node, zone_limits) { |
9045 | 0 | err = zone_limit_delete(dp->conntrack, zone_limit->zone); |
9046 | 0 | if (err != 0) { |
9047 | 0 | break; |
9048 | 0 | } |
9049 | 0 | } |
9050 | |
9051 | 0 | return err; |
9052 | 0 | } |
9053 | | |
9054 | | static int |
9055 | | dpif_netdev_ct_get_features(struct dpif *dpif OVS_UNUSED, |
9056 | | enum ct_features *features) |
9057 | 0 | { |
9058 | 0 | if (features != NULL) { |
9059 | 0 | *features = CONNTRACK_F_ZERO_SNAT; |
9060 | 0 | } |
9061 | 0 | return 0; |
9062 | 0 | } |
9063 | | |
9064 | | static int |
9065 | | dpif_netdev_ct_set_timeout_policy(struct dpif *dpif, |
9066 | | const struct ct_dpif_timeout_policy *dpif_tp) |
9067 | 0 | { |
9068 | 0 | struct timeout_policy tp; |
9069 | 0 | struct dp_netdev *dp; |
9070 | |
9071 | 0 | dp = get_dp_netdev(dpif); |
9072 | 0 | memcpy(&tp.policy, dpif_tp, sizeof tp.policy); |
9073 | 0 | return timeout_policy_update(dp->conntrack, &tp); |
9074 | 0 | } |
9075 | | |
9076 | | static int |
9077 | | dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id, |
9078 | | struct ct_dpif_timeout_policy *dpif_tp) |
9079 | 0 | { |
9080 | 0 | struct timeout_policy *tp; |
9081 | 0 | struct dp_netdev *dp; |
9082 | 0 | int err = 0; |
9083 | |
9084 | 0 | dp = get_dp_netdev(dpif); |
9085 | 0 | tp = timeout_policy_get(dp->conntrack, tp_id); |
9086 | 0 | if (!tp) { |
9087 | 0 | return ENOENT; |
9088 | 0 | } |
9089 | 0 | memcpy(dpif_tp, &tp->policy, sizeof tp->policy); |
9090 | 0 | return err; |
9091 | 0 | } |
9092 | | |
9093 | | static int |
9094 | | dpif_netdev_ct_del_timeout_policy(struct dpif *dpif, |
9095 | | uint32_t tp_id) |
9096 | 0 | { |
9097 | 0 | struct dp_netdev *dp; |
9098 | 0 | int err = 0; |
9099 | |
9100 | 0 | dp = get_dp_netdev(dpif); |
9101 | 0 | err = timeout_policy_delete(dp->conntrack, tp_id); |
9102 | 0 | return err; |
9103 | 0 | } |
9104 | | |
9105 | | static int |
9106 | | dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED, |
9107 | | uint32_t tp_id, |
9108 | | uint16_t dl_type OVS_UNUSED, |
9109 | | uint8_t nw_proto OVS_UNUSED, |
9110 | | char **tp_name, bool *is_generic) |
9111 | 0 | { |
9112 | 0 | struct ds ds = DS_EMPTY_INITIALIZER; |
9113 | |
9114 | 0 | ds_put_format(&ds, "%"PRIu32, tp_id); |
9115 | 0 | *tp_name = ds_steal_cstr(&ds); |
9116 | 0 | *is_generic = true; |
9117 | 0 | return 0; |
9118 | 0 | } |
9119 | | |
9120 | | static int |
9121 | | dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable) |
9122 | 0 | { |
9123 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9124 | 0 | return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable); |
9125 | 0 | } |
9126 | | |
9127 | | static int |
9128 | | dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag) |
9129 | 0 | { |
9130 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9131 | 0 | return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag); |
9132 | 0 | } |
9133 | | |
9134 | | static int |
9135 | | dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags) |
9136 | 0 | { |
9137 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9138 | 0 | return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags); |
9139 | 0 | } |
9140 | | |
9141 | | /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to |
9142 | | * diverge. */ |
9143 | | static int |
9144 | | dpif_netdev_ipf_get_status(struct dpif *dpif, |
9145 | | struct dpif_ipf_status *dpif_ipf_status) |
9146 | 0 | { |
9147 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9148 | 0 | ipf_get_status(conntrack_ipf_ctx(dp->conntrack), |
9149 | 0 | (struct ipf_status *) dpif_ipf_status); |
9150 | 0 | return 0; |
9151 | 0 | } |
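The cast in dpif_netdev_ipf_get_status() relies on 'struct dpif_ipf_status' and 'struct ipf_status' having identical layouts, which is exactly what the comment above the function warns about. One way such an assumption could be made to fail loudly at compile time is a size check along these lines (a sketch only, not something this file does; both struct definitions would have to be visible at the point of the check):

    BUILD_ASSERT_DECL(sizeof(struct dpif_ipf_status) == sizeof(struct ipf_status));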
9152 | | |
9153 | | static int |
9154 | | dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED, |
9155 | | struct ipf_dump_ctx **ipf_dump_ctx) |
9156 | 0 | { |
9157 | 0 | return ipf_dump_start(ipf_dump_ctx); |
9158 | 0 | } |
9159 | | |
9160 | | static int |
9161 | | dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump) |
9162 | 0 | { |
9163 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9164 | 0 | return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx, |
9165 | 0 | dump); |
9166 | 0 | } |
9167 | | |
9168 | | static int |
9169 | | dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx) |
9170 | 0 | { |
9171 | 0 | return ipf_dump_done(ipf_dump_ctx); |
9172 | |
9173 | 0 | } |
9174 | | |
9175 | | static int |
9176 | | dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id, |
9177 | | odp_port_t *member_map) |
9178 | 0 | { |
9179 | 0 | struct tx_bond *new_tx = xzalloc(sizeof *new_tx); |
9180 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9181 | 0 | struct dp_netdev_pmd_thread *pmd; |
9182 | | |
9183 | | /* Prepare new bond mapping. */ |
9184 | 0 | new_tx->bond_id = bond_id; |
9185 | 0 | for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) { |
9186 | 0 | new_tx->member_buckets[bucket].member_id = member_map[bucket]; |
9187 | 0 | } |
9188 | |
9189 | 0 | ovs_mutex_lock(&dp->bond_mutex); |
9190 | | /* Check whether the bond already exists. */
9191 | 0 | struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id); |
9192 | 0 | if (old_tx) { |
9193 | 0 | cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node, |
9194 | 0 | hash_bond_id(bond_id)); |
9195 | 0 | ovsrcu_postpone(free, old_tx); |
9196 | 0 | } else { |
9197 | 0 | cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id)); |
9198 | 0 | } |
9199 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
9200 | | |
9201 | | /* Update all PMDs with new bond mapping. */ |
9202 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
9203 | 0 | dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true); |
9204 | 0 | } |
9205 | 0 | return 0; |
9206 | 0 | } |
9207 | | |
9208 | | static int |
9209 | | dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id) |
9210 | 0 | { |
9211 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9212 | 0 | struct dp_netdev_pmd_thread *pmd; |
9213 | 0 | struct tx_bond *tx; |
9214 | |
|
9215 | 0 | ovs_mutex_lock(&dp->bond_mutex); |
9216 | | /* Check whether the bond exists. */
9217 | 0 | tx = tx_bond_lookup(&dp->tx_bonds, bond_id); |
9218 | 0 | if (tx) { |
9219 | 0 | cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id)); |
9220 | 0 | ovsrcu_postpone(free, tx); |
9221 | 0 | } else { |
9222 | | /* Bond is not present. */ |
9223 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
9224 | 0 | return ENOENT; |
9225 | 0 | } |
9226 | 0 | ovs_mutex_unlock(&dp->bond_mutex); |
9227 | | |
9228 | | /* Remove the bond mapping from all PMDs. */
9229 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
9230 | 0 | dp_netdev_del_bond_tx_from_pmd(pmd, bond_id); |
9231 | 0 | } |
9232 | 0 | return 0; |
9233 | 0 | } |
9234 | | |
9235 | | static int |
9236 | | dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id, |
9237 | | uint64_t *n_bytes) |
9238 | 0 | { |
9239 | 0 | struct dp_netdev *dp = get_dp_netdev(dpif); |
9240 | 0 | struct dp_netdev_pmd_thread *pmd; |
9241 | |
9242 | 0 | if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) { |
9243 | 0 | return ENOENT; |
9244 | 0 | } |
9245 | | |
9246 | | /* Search the bond in all PMDs. */ |
9247 | 0 | CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { |
9248 | 0 | struct tx_bond *pmd_bond_entry |
9249 | 0 | = tx_bond_lookup(&pmd->tx_bonds, bond_id); |
9250 | |
9251 | 0 | if (!pmd_bond_entry) { |
9252 | 0 | continue; |
9253 | 0 | } |
9254 | | |
9255 | | /* Read bond stats. */ |
9256 | 0 | for (int i = 0; i < BOND_BUCKETS; i++) { |
9257 | 0 | uint64_t pmd_n_bytes; |
9258 | |
9259 | 0 | atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes, |
9260 | 0 | &pmd_n_bytes); |
9261 | 0 | n_bytes[i] += pmd_n_bytes; |
9262 | 0 | } |
9263 | 0 | } |
9264 | 0 | return 0; |
9265 | 0 | } |
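Note that dpif_netdev_bond_stats_get() only accumulates with '+=' into the caller's array, once per PMD that carries the bond, so the caller is expected to pass in a zeroed array of BOND_BUCKETS counters. A minimal caller sketch (illustrative only; it assumes the vlog/inttypes macros already available in this file):

    static void
    log_bond_bucket_bytes(struct dpif *dpif, uint32_t bond_id)
    {
        /* Must be zero-initialized: the callback only adds per-PMD counts. */
        uint64_t n_bytes[BOND_BUCKETS] = {0};

        if (!dpif_netdev_bond_stats_get(dpif, bond_id, n_bytes)) {
            for (int i = 0; i < BOND_BUCKETS; i++) {
                VLOG_DBG("bucket %d: %"PRIu64" bytes", i, n_bytes[i]);
            }
        }
    }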
9266 | | |
9267 | | const struct dpif_class dpif_netdev_class = { |
9268 | | "netdev", |
9269 | | true, /* cleanup_required */ |
9270 | | dpif_netdev_init, |
9271 | | dpif_netdev_enumerate, |
9272 | | dpif_netdev_port_open_type, |
9273 | | dpif_netdev_open, |
9274 | | dpif_netdev_close, |
9275 | | dpif_netdev_destroy, |
9276 | | dpif_netdev_run, |
9277 | | dpif_netdev_wait, |
9278 | | dpif_netdev_get_stats, |
9279 | | NULL, /* set_features */ |
9280 | | NULL, /* get_features */ |
9281 | | dpif_netdev_port_add, |
9282 | | dpif_netdev_port_del, |
9283 | | dpif_netdev_port_set_config, |
9284 | | dpif_netdev_port_query_by_number, |
9285 | | dpif_netdev_port_query_by_name, |
9286 | | NULL, /* port_get_pid */ |
9287 | | dpif_netdev_port_dump_start, |
9288 | | dpif_netdev_port_dump_next, |
9289 | | dpif_netdev_port_dump_done, |
9290 | | dpif_netdev_port_poll, |
9291 | | dpif_netdev_port_poll_wait, |
9292 | | dpif_netdev_flow_flush, |
9293 | | dpif_netdev_flow_dump_create, |
9294 | | dpif_netdev_flow_dump_destroy, |
9295 | | dpif_netdev_flow_dump_thread_create, |
9296 | | dpif_netdev_flow_dump_thread_destroy, |
9297 | | dpif_netdev_flow_dump_next, |
9298 | | dpif_netdev_operate, |
9299 | | NULL, /* recv_set */ |
9300 | | NULL, /* handlers_set */ |
9301 | | dpif_netdev_number_handlers_required, |
9302 | | dpif_netdev_set_config, |
9303 | | dpif_netdev_queue_to_priority, |
9304 | | NULL, /* recv */ |
9305 | | NULL, /* recv_wait */ |
9306 | | NULL, /* recv_purge */ |
9307 | | dpif_netdev_register_dp_purge_cb, |
9308 | | dpif_netdev_register_upcall_cb, |
9309 | | dpif_netdev_enable_upcall, |
9310 | | dpif_netdev_disable_upcall, |
9311 | | dpif_netdev_get_datapath_version, |
9312 | | dpif_netdev_ct_dump_start, |
9313 | | dpif_netdev_ct_dump_next, |
9314 | | dpif_netdev_ct_dump_done, |
9315 | | dpif_netdev_ct_exp_dump_start, |
9316 | | dpif_netdev_ct_exp_dump_next, |
9317 | | dpif_netdev_ct_exp_dump_done, |
9318 | | dpif_netdev_ct_flush, |
9319 | | dpif_netdev_ct_set_maxconns, |
9320 | | dpif_netdev_ct_get_maxconns, |
9321 | | dpif_netdev_ct_get_nconns, |
9322 | | dpif_netdev_ct_set_tcp_seq_chk, |
9323 | | dpif_netdev_ct_get_tcp_seq_chk, |
9324 | | dpif_netdev_ct_set_sweep_interval, |
9325 | | dpif_netdev_ct_get_sweep_interval, |
9326 | | dpif_netdev_ct_set_limits, |
9327 | | dpif_netdev_ct_get_limits, |
9328 | | dpif_netdev_ct_del_limits, |
9329 | | dpif_netdev_ct_set_timeout_policy, |
9330 | | dpif_netdev_ct_get_timeout_policy, |
9331 | | dpif_netdev_ct_del_timeout_policy, |
9332 | | NULL, /* ct_timeout_policy_dump_start */ |
9333 | | NULL, /* ct_timeout_policy_dump_next */ |
9334 | | NULL, /* ct_timeout_policy_dump_done */ |
9335 | | dpif_netdev_ct_get_timeout_policy_name, |
9336 | | dpif_netdev_ct_get_features, |
9337 | | dpif_netdev_ipf_set_enabled, |
9338 | | dpif_netdev_ipf_set_min_frag, |
9339 | | dpif_netdev_ipf_set_max_nfrags, |
9340 | | dpif_netdev_ipf_get_status, |
9341 | | dpif_netdev_ipf_dump_start, |
9342 | | dpif_netdev_ipf_dump_next, |
9343 | | dpif_netdev_ipf_dump_done, |
9344 | | dpif_netdev_meter_get_features, |
9345 | | dpif_netdev_meter_set, |
9346 | | dpif_netdev_meter_get, |
9347 | | dpif_netdev_meter_del, |
9348 | | dpif_netdev_bond_add, |
9349 | | dpif_netdev_bond_del, |
9350 | | dpif_netdev_bond_stats_get, |
9351 | | NULL, /* cache_get_supported_levels */ |
9352 | | NULL, /* cache_get_name */ |
9353 | | NULL, /* cache_get_size */ |
9354 | | NULL, /* cache_set_size */ |
9355 | | }; |
9356 | | |
9357 | | static void |
9358 | | dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED, |
9359 | | const char *argv[], void *aux OVS_UNUSED) |
9360 | 0 | { |
9361 | 0 | struct dp_netdev_port *port; |
9362 | 0 | struct dp_netdev *dp; |
9363 | 0 | odp_port_t port_no; |
9364 | |
9365 | 0 | ovs_mutex_lock(&dp_netdev_mutex); |
9366 | 0 | dp = shash_find_data(&dp_netdevs, argv[1]); |
9367 | 0 | if (!dp || !dpif_netdev_class_is_dummy(dp->class)) { |
9368 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
9369 | 0 | unixctl_command_reply_error(conn, "unknown datapath or not a dummy"); |
9370 | 0 | return; |
9371 | 0 | } |
9372 | 0 | ovs_refcount_ref(&dp->ref_cnt); |
9373 | 0 | ovs_mutex_unlock(&dp_netdev_mutex); |
9374 | |
9375 | 0 | ovs_rwlock_wrlock(&dp->port_rwlock); |
9376 | 0 | if (get_port_by_name(dp, argv[2], &port)) { |
9377 | 0 | unixctl_command_reply_error(conn, "unknown port"); |
9378 | 0 | goto exit; |
9379 | 0 | } |
9380 | | |
9381 | 0 | port_no = u32_to_odp(atoi(argv[3])); |
9382 | 0 | if (!port_no || port_no == ODPP_NONE) { |
9383 | 0 | unixctl_command_reply_error(conn, "bad port number"); |
9384 | 0 | goto exit; |
9385 | 0 | } |
9386 | 0 | if (dp_netdev_lookup_port(dp, port_no)) { |
9387 | 0 | unixctl_command_reply_error(conn, "port number already in use"); |
9388 | 0 | goto exit; |
9389 | 0 | } |
9390 | | |
9391 | | /* Remove port. */ |
9392 | 0 | hmap_remove(&dp->ports, &port->node); |
9393 | 0 | reconfigure_datapath(dp); |
9394 | | |
9395 | | /* Reinsert with new port number. */ |
9396 | 0 | port->port_no = port_no; |
9397 | 0 | hmap_insert(&dp->ports, &port->node, hash_port_no(port_no)); |
9398 | 0 | reconfigure_datapath(dp); |
9399 | |
9400 | 0 | seq_change(dp->port_seq); |
9401 | 0 | unixctl_command_reply(conn, NULL); |
9402 | |
9403 | 0 | exit: |
9404 | 0 | ovs_rwlock_unlock(&dp->port_rwlock); |
9405 | 0 | dp_netdev_unref(dp); |
9406 | 0 | } |
9407 | | |
9408 | | static void |
9409 | | dpif_dummy_register__(const char *type) |
9410 | 0 | { |
9411 | 0 | struct dpif_class *class; |
9412 | |
9413 | 0 | class = xmalloc(sizeof *class); |
9414 | 0 | *class = dpif_netdev_class; |
9415 | 0 | class->type = xstrdup(type); |
9416 | 0 | dp_register_provider(class); |
9417 | 0 | } |
9418 | | |
9419 | | static void |
9420 | | dpif_dummy_override(const char *type) |
9421 | 0 | { |
9422 | 0 | int error; |
9423 | | |
9424 | | /* |
9425 | | * Ignore EAFNOSUPPORT to allow --enable-dummy=system with |
9426 | | * a userland-only build; this is useful for the testsuite.
9427 | | */ |
9428 | 0 | error = dp_unregister_provider(type); |
9429 | 0 | if (error == 0 || error == EAFNOSUPPORT) { |
9430 | 0 | dpif_dummy_register__(type); |
9431 | 0 | } |
9432 | 0 | } |
9433 | | |
9434 | | void |
9435 | | dpif_dummy_register(enum dummy_level level) |
9436 | 0 | { |
9437 | 0 | if (level == DUMMY_OVERRIDE_ALL) { |
9438 | 0 | struct sset types; |
9439 | 0 | const char *type; |
9440 | |
|
9441 | 0 | sset_init(&types); |
9442 | 0 | dp_enumerate_types(&types); |
9443 | 0 | SSET_FOR_EACH (type, &types) { |
9444 | 0 | dpif_dummy_override(type); |
9445 | 0 | } |
9446 | 0 | sset_destroy(&types); |
9447 | 0 | } else if (level == DUMMY_OVERRIDE_SYSTEM) { |
9448 | 0 | dpif_dummy_override("system"); |
9449 | 0 | } |
9450 | |
9451 | 0 | dpif_dummy_register__("dummy"); |
9452 | |
9453 | 0 | unixctl_command_register("dpif-dummy/change-port-number", |
9454 | 0 | "dp port new-number", |
9455 | 0 | 3, 3, dpif_dummy_change_port_number, NULL); |
9456 | 0 | } |
9457 | | |
9458 | | /* Datapath Classifier. */ |
9459 | | |
9460 | | static void |
9461 | | dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable) |
9462 | 0 | { |
9463 | 0 | cmap_destroy(&subtable->rules); |
9464 | 0 | ovsrcu_postpone(free, subtable->mf_masks); |
9465 | 0 | ovsrcu_postpone(free, subtable); |
9466 | 0 | } |
9467 | | |
9468 | | /* Initializes 'cls' as a classifier that initially contains no classification |
9469 | | * rules. */ |
9470 | | static void |
9471 | | dpcls_init(struct dpcls *cls) |
9472 | 0 | { |
9473 | 0 | cmap_init(&cls->subtables_map); |
9474 | 0 | pvector_init(&cls->subtables); |
9475 | 0 | } |
9476 | | |
9477 | | static void |
9478 | | dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable) |
9479 | 0 | { |
9480 | 0 | VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port); |
9481 | 0 | pvector_remove(&cls->subtables, subtable); |
9482 | 0 | cmap_remove(&cls->subtables_map, &subtable->cmap_node, |
9483 | 0 | subtable->mask.hash); |
9484 | 0 | dpcls_info_dec_usage(subtable->lookup_func_info); |
9485 | 0 | ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable); |
9486 | 0 | } |
9487 | | |
9488 | | /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the |
9489 | | * caller's responsibility. |
9490 | | * May only be called after all the readers have been terminated. */ |
9491 | | static void |
9492 | | dpcls_destroy(struct dpcls *cls) |
9493 | 0 | { |
9494 | 0 | if (cls) { |
9495 | 0 | struct dpcls_subtable *subtable; |
9496 | |
9497 | 0 | CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) { |
9498 | 0 | ovs_assert(cmap_count(&subtable->rules) == 0); |
9499 | 0 | dpcls_destroy_subtable(cls, subtable); |
9500 | 0 | } |
9501 | 0 | cmap_destroy(&cls->subtables_map); |
9502 | 0 | pvector_destroy(&cls->subtables); |
9503 | 0 | } |
9504 | 0 | } |
9505 | | |
9506 | | static struct dpcls_subtable * |
9507 | | dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask) |
9508 | 0 | { |
9509 | 0 | struct dpcls_subtable *subtable; |
9510 | | |
9511 | | /* Need to add one. */ |
9512 | 0 | subtable = xmalloc(sizeof *subtable |
9513 | 0 | - sizeof subtable->mask.mf + mask->len); |
9514 | 0 | cmap_init(&subtable->rules); |
9515 | 0 | subtable->hit_cnt = 0; |
9516 | 0 | netdev_flow_key_clone(&subtable->mask, mask); |
9517 | | |
9518 | | /* The count of bits in the mask defines the space required for masks. |
9519 | | * Then call gen_masks() to create the appropriate masks, avoiding the cost |
9520 | | * of doing runtime calculations. */ |
9521 | 0 | uint32_t unit0 = count_1bits(mask->mf.map.bits[0]); |
9522 | 0 | uint32_t unit1 = count_1bits(mask->mf.map.bits[1]); |
9523 | 0 | subtable->mf_bits_set_unit0 = unit0; |
9524 | 0 | subtable->mf_bits_set_unit1 = unit1; |
9525 | 0 | subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1)); |
9526 | 0 | dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1); |
9527 | | |
9528 | | /* Get the preferred subtable search function for this (u0,u1) subtable. |
9529 | | * The getter is guaranteed to return a valid implementation, possibly one
9530 | | * that is ISA-optimized and/or specialized for this shape. Initialize
9531 | | * the subtable search function atomically to avoid garbage data being read |
9532 | | * by the PMD thread. |
9533 | | */ |
9534 | 0 | atomic_init(&subtable->lookup_func, |
9535 | 0 | dpcls_subtable_get_best_impl(unit0, unit1, |
9536 | 0 | &subtable->lookup_func_info)); |
9537 | 0 | dpcls_info_inc_usage(subtable->lookup_func_info); |
9538 | |
9539 | 0 | cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash); |
9540 | | /* Add the new subtable at the end of the pvector (with no hits yet) */ |
9541 | 0 | pvector_insert(&cls->subtables, subtable, 0); |
9542 | 0 | VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d", |
9543 | 0 | cmap_count(&cls->subtables_map), subtable, cls->in_port); |
9544 | 0 | pvector_publish(&cls->subtables); |
9545 | |
9546 | 0 | return subtable; |
9547 | 0 | } |
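The xmalloc() in dpcls_create_subtable() sizes the allocation as 'sizeof *subtable - sizeof subtable->mask.mf + mask->len': the embedded netdev_flow_key ends in a miniflow, so only as many trailing bytes are allocated as this particular mask actually uses. A self-contained sketch of the same allocation pattern with hypothetical types (my_key/my_entry are placeholders, not OVS structures):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    /* Fixed-capacity value buffer of which only 'n_used' blocks matter. */
    struct my_key {
        size_t n_used;
        uint64_t buf[64];
    };

    /* The variable-length key must be the last member for this to work. */
    struct my_entry {
        unsigned int hit_cnt;
        struct my_key key;
    };

    static struct my_entry *
    my_entry_create(const struct my_key *key)
    {
        /* Subtract the full declared buffer, then add back only the part in
         * use, mirroring 'sizeof *subtable - sizeof subtable->mask.mf +
         * mask->len' above. */
        struct my_entry *e = malloc(sizeof *e - sizeof e->key.buf
                                    + key->n_used * sizeof key->buf[0]);

        if (e) {
            e->hit_cnt = 0;
            e->key.n_used = key->n_used;
            memcpy(e->key.buf, key->buf, key->n_used * sizeof key->buf[0]);
        }
        return e;
    }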
9548 | | |
9549 | | static inline struct dpcls_subtable * |
9550 | | dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask) |
9551 | 0 | { |
9552 | 0 | struct dpcls_subtable *subtable; |
9553 | |
9554 | 0 | CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash, |
9555 | 0 | &cls->subtables_map) { |
9556 | 0 | if (netdev_flow_key_equal(&subtable->mask, mask)) { |
9557 | 0 | return subtable; |
9558 | 0 | } |
9559 | 0 | } |
9560 | 0 | return dpcls_create_subtable(cls, mask); |
9561 | 0 | } |
9562 | | |
9563 | | /* Checks for the best available implementation for each subtable lookup |
9564 | | * function, and assigns it as the lookup function pointer for each subtable. |
9565 | | * Returns the number of subtables that have changed lookup implementation. |
9566 | | * This function requires holding a flow_mutex when called. This is to make |
9567 | | * sure modifications done by this function are not overwritten. This could |
9568 | | * happen if dpcls_sort_subtable_vector() is called at the same time as this |
9569 | | * function. |
9570 | | */ |
9571 | | static uint32_t |
9572 | | dpcls_subtable_lookup_reprobe(struct dpcls *cls) |
9573 | 0 | { |
9574 | 0 | struct pvector *pvec = &cls->subtables; |
9575 | 0 | uint32_t subtables_changed = 0; |
9576 | 0 | struct dpcls_subtable *subtable = NULL; |
9577 | |
9578 | 0 | PVECTOR_FOR_EACH (subtable, pvec) { |
9579 | 0 | uint32_t u0_bits = subtable->mf_bits_set_unit0; |
9580 | 0 | uint32_t u1_bits = subtable->mf_bits_set_unit1; |
9581 | 0 | void *old_func = subtable->lookup_func; |
9582 | 0 | struct dpcls_subtable_lookup_info_t *old_info; |
9583 | 0 | old_info = subtable->lookup_func_info; |
9584 | | /* Set the subtable lookup function atomically to avoid garbage data |
9585 | | * being read by the PMD thread. */ |
9586 | 0 | atomic_store_relaxed(&subtable->lookup_func, |
9587 | 0 | dpcls_subtable_get_best_impl(u0_bits, u1_bits, |
9588 | 0 | &subtable->lookup_func_info)); |
9589 | 0 | if (old_func != subtable->lookup_func) { |
9590 | 0 | subtables_changed += 1; |
9591 | 0 | } |
9592 | |
9593 | 0 | if (old_info != subtable->lookup_func_info) { |
9594 | | /* In theory, functions can be shared between implementations, so |
9595 | | * do an explicit check on the function info structures. */ |
9596 | 0 | dpcls_info_dec_usage(old_info); |
9597 | 0 | dpcls_info_inc_usage(subtable->lookup_func_info); |
9598 | 0 | } |
9599 | 0 | } |
9600 | |
9601 | 0 | return subtables_changed; |
9602 | 0 | } |
9603 | | |
9604 | | /* Periodically sort the dpcls subtable vectors according to hit counts */ |
9605 | | static void |
9606 | | dpcls_sort_subtable_vector(struct dpcls *cls) |
9607 | 0 | { |
9608 | 0 | struct pvector *pvec = &cls->subtables; |
9609 | 0 | struct dpcls_subtable *subtable; |
9610 | |
9611 | 0 | PVECTOR_FOR_EACH (subtable, pvec) { |
9612 | 0 | pvector_change_priority(pvec, subtable, subtable->hit_cnt); |
9613 | 0 | subtable->hit_cnt = 0; |
9614 | 0 | } |
9615 | 0 | pvector_publish(pvec); |
9616 | 0 | } |
9617 | | |
9618 | | static inline void |
9619 | | dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, |
9620 | | struct polled_queue *poll_list, int poll_cnt) |
9621 | 0 | { |
9622 | 0 | struct dpcls *cls; |
9623 | 0 | uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0; |
9624 | 0 | unsigned int pmd_load = 0; |
9625 | |
9626 | 0 | if (pmd->ctx.now > pmd->next_cycle_store) { |
9627 | 0 | uint64_t curr_tsc; |
9628 | 0 | uint8_t rebalance_load_trigger; |
9629 | 0 | struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb; |
9630 | 0 | unsigned int idx; |
9631 | |
9632 | 0 | if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >= |
9633 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_IDLE] && |
9634 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >= |
9635 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) { |
9636 | 0 | tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] - |
9637 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_IDLE]; |
9638 | 0 | tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] - |
9639 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_BUSY]; |
9640 | 0 | tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] - |
9641 | 0 | pmd->prev_stats[PMD_CYCLES_SLEEP]; |
9642 | |
9643 | 0 | if (pmd_alb->is_enabled && !pmd->isolated) { |
9644 | 0 | if (tot_proc) { |
9645 | 0 | pmd_load = ((tot_proc * 100) / |
9646 | 0 | (tot_idle + tot_proc + tot_sleep)); |
9647 | 0 | } |
9648 | |
9649 | 0 | atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, |
9650 | 0 | &rebalance_load_trigger); |
9651 | 0 | if (pmd_load >= rebalance_load_trigger) { |
9652 | 0 | atomic_count_inc(&pmd->pmd_overloaded); |
9653 | 0 | } else { |
9654 | 0 | atomic_count_set(&pmd->pmd_overloaded, 0); |
9655 | 0 | } |
9656 | 0 | } |
9657 | 0 | } |
9658 | |
9659 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_IDLE] = |
9660 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE]; |
9661 | 0 | pmd->prev_stats[PMD_CYCLES_ITER_BUSY] = |
9662 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY]; |
9663 | 0 | pmd->prev_stats[PMD_CYCLES_SLEEP] = |
9664 | 0 | pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP]; |
9665 | | |
9666 | | /* Get the cycles that were used to process each queue and store. */ |
9667 | 0 | for (unsigned i = 0; i < poll_cnt; i++) { |
9668 | 0 | uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq, |
9669 | 0 | RXQ_CYCLES_PROC_CURR); |
9670 | 0 | dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr); |
9671 | 0 | dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, |
9672 | 0 | 0); |
9673 | 0 | } |
9674 | 0 | curr_tsc = cycles_counter_update(&pmd->perf_stats); |
9675 | 0 | if (pmd->intrvl_tsc_prev) { |
9676 | | /* There is a prev timestamp, store a new intrvl cycle count. */ |
9677 | 0 | atomic_store_relaxed(&pmd->intrvl_cycles, |
9678 | 0 | curr_tsc - pmd->intrvl_tsc_prev); |
9679 | 0 | } |
9680 | 0 | idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX; |
9681 | 0 | atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc); |
9682 | 0 | pmd->intrvl_tsc_prev = curr_tsc; |
9683 | | /* Start new measuring interval */ |
9684 | 0 | pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN; |
9685 | 0 | } |
9686 | |
9687 | 0 | if (pmd->ctx.now > pmd->next_optimization) { |
9688 | | /* Try to obtain the flow lock to block out revalidator threads. |
9689 | | * If not possible, just try next time. */ |
9690 | 0 | if (!ovs_mutex_trylock(&pmd->flow_mutex)) { |
9691 | | /* Optimize each classifier */ |
9692 | 0 | CMAP_FOR_EACH (cls, node, &pmd->classifiers) { |
9693 | 0 | dpcls_sort_subtable_vector(cls); |
9694 | 0 | } |
9695 | 0 | ovs_mutex_unlock(&pmd->flow_mutex); |
9696 | | /* Start new measuring interval */ |
9697 | 0 | pmd->next_optimization = pmd->ctx.now |
9698 | 0 | + DPCLS_OPTIMIZATION_INTERVAL; |
9699 | 0 | } |
9700 | 0 | } |
9701 | 0 | } |
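The load figure used above is simply the busy share of the last interval in percent, pmd_load = 100 * busy / (idle + busy + sleep), and the per-PMD overload counter is only bumped while that figure stays at or above the configured rebalance threshold. A tiny numeric check of the formula (the cycle counts are invented for illustration):

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* Invented cycle totals for one measurement interval. */
        uint64_t tot_idle = 300, tot_proc = 650, tot_sleep = 50;
        unsigned int pmd_load = 0;

        if (tot_proc) {
            pmd_load = (tot_proc * 100) / (tot_idle + tot_proc + tot_sleep);
        }
        assert(pmd_load == 65);   /* 650 busy out of 1000 cycles -> 65%. */
        return 0;
    }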
9702 | | |
9703 | | /* Returns the sum of a specified number of interval values, from the
9704 | | * newest to the oldest. 'cur_idx' is the slot where the next write will
9705 | | * go, so the walk starts just before it and wraps around as needed.
9706 | | */
9707 | | static uint64_t |
9708 | | get_interval_values(atomic_ullong *source, atomic_count *cur_idx, |
9709 | 0 | int num_to_read) { |
9710 | 0 | unsigned int i; |
9711 | 0 | uint64_t total = 0; |
9712 | |
9713 | 0 | i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX; |
9714 | 0 | for (int read = 0; read < num_to_read; read++) { |
9715 | 0 | uint64_t interval_value; |
9716 | |
9717 | 0 | i = i ? i - 1 : PMD_INTERVAL_MAX - 1; |
9718 | 0 | atomic_read_relaxed(&source[i], &interval_value); |
9719 | 0 | total += interval_value; |
9720 | 0 | } |
9721 | 0 | return total; |
9722 | 0 | } |
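Because 'cur_idx' names the slot that will be written next, the walk above starts one slot before it and steps backwards, wrapping to the top of the array at zero. For example, with a hypothetical ring of 6 slots and cur_idx at 14, reading the three newest intervals visits slots 1, 0 and then 5. The same walk without the atomics, as a standalone sketch (N_SLOTS stands in for PMD_INTERVAL_MAX):

    #include <stdint.h>

    #define N_SLOTS 6   /* Placeholder; the real PMD_INTERVAL_MAX differs. */

    static uint64_t
    sum_newest(const uint64_t slots[N_SLOTS], unsigned int next_write_idx,
               int num_to_read)
    {
        unsigned int i = next_write_idx % N_SLOTS;
        uint64_t total = 0;

        for (int read = 0; read < num_to_read; read++) {
            i = i ? i - 1 : N_SLOTS - 1;   /* Step back, wrapping at 0. */
            total += slots[i];
        }
        return total;
    }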
9723 | | |
9724 | | /* Insert 'rule' into 'cls'. */ |
9725 | | static void |
9726 | | dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule, |
9727 | | const struct netdev_flow_key *mask) |
9728 | 0 | { |
9729 | 0 | struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask); |
9730 | | |
9731 | | /* Refer to subtable's mask, also for later removal. */ |
9732 | 0 | rule->mask = &subtable->mask; |
9733 | 0 | cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash); |
9734 | 0 | } |
9735 | | |
9736 | | /* Removes 'rule' from 'cls', also destructing the 'rule'. */ |
9737 | | static void |
9738 | | dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule) |
9739 | 0 | { |
9740 | 0 | struct dpcls_subtable *subtable; |
9741 | |
9742 | 0 | ovs_assert(rule->mask); |
9743 | | |
9744 | | /* Get subtable from reference in rule->mask. */ |
9745 | 0 | INIT_CONTAINER(subtable, rule->mask, mask); |
9746 | 0 | if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash) |
9747 | 0 | == 0) { |
9748 | | /* Delete empty subtable. */ |
9749 | 0 | dpcls_destroy_subtable(cls, subtable); |
9750 | 0 | pvector_publish(&cls->subtables); |
9751 | 0 | } |
9752 | 0 | } |
9753 | | |
9754 | | /* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks. */ |
9755 | | static inline void |
9756 | | dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count, |
9757 | | uint64_t *mf_masks) |
9758 | 0 | { |
9759 | 0 | int i; |
9760 | 0 | for (i = 0; i < count; i++) { |
9761 | 0 | uint64_t lowest_bit = (iter & -iter); |
9762 | 0 | iter &= ~lowest_bit; |
9763 | 0 | mf_masks[i] = (lowest_bit - 1); |
9764 | 0 | } |
9765 | | /* Checks that count has covered all bits in the iter bitmap. */ |
9766 | 0 | ovs_assert(iter == 0); |
9767 | 0 | } |
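Each round of dpcls_flow_key_gen_mask_unit() isolates the lowest set bit of 'iter' with 'iter & -iter' and stores 'lowest_bit - 1', i.e. a mask of every bit position below that bit. A standalone worked example of the values it produces (plain C, nothing OVS-specific):

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* Bits 2, 4 and 7 set, standing in for one miniflow map unit. */
        uint64_t iter = (1ULL << 2) | (1ULL << 4) | (1ULL << 7);
        uint64_t mf_masks[3];

        for (int i = 0; i < 3; i++) {
            uint64_t lowest_bit = iter & -iter;   /* Isolate lowest set bit. */
            iter &= ~lowest_bit;                  /* Clear it for next round. */
            mf_masks[i] = lowest_bit - 1;         /* All bits below it. */
        }

        /* One mask per set bit, in ascending bit order. */
        assert(mf_masks[0] == 0x03);
        assert(mf_masks[1] == 0x0f);
        assert(mf_masks[2] == 0x7f);
        assert(iter == 0);
        return 0;
    }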
9768 | | |
9769 | | /* Generate a mask for each block in the miniflow, based on the bits set. This |
9770 | | * allows easily masking packets with the generated array here, without |
9771 | | * calculations. This replaces runtime-calculating the masks. |
9772 | | * @param tbl The table to generate the mf_masks for
9773 | | * @param mf_masks Pointer to a u64 array of at least mf_bits_u0 + mf_bits_u1 in size
9774 | | * @param mf_bits_u0 Number of bits set in unit0 of the miniflow
9775 | | * @param mf_bits_u1 Number of bits set in unit1 of the miniflow
9776 | | */ |
9777 | | void |
9778 | | dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl, |
9779 | | uint64_t *mf_masks, |
9780 | | const uint32_t mf_bits_u0, |
9781 | | const uint32_t mf_bits_u1) |
9782 | 0 | { |
9783 | 0 | uint64_t iter_u0 = tbl->mf.map.bits[0]; |
9784 | 0 | uint64_t iter_u1 = tbl->mf.map.bits[1]; |
9785 | |
9786 | 0 | dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]); |
9787 | 0 | dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]); |
9788 | 0 | } |
9789 | | |
9790 | | /* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit in the
9791 | | * rule's mask the corresponding values in the rule's flow and in 'target' match. */
9792 | | inline bool |
9793 | | dpcls_rule_matches_key(const struct dpcls_rule *rule, |
9794 | | const struct netdev_flow_key *target) |
9795 | 0 | { |
9796 | 0 | const uint64_t *keyp = miniflow_get_values(&rule->flow.mf); |
9797 | 0 | const uint64_t *maskp = miniflow_get_values(&rule->mask->mf); |
9798 | 0 | uint64_t value; |
9799 | |
9800 | 0 | NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) { |
9801 | 0 | if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) { |
9802 | 0 | return false; |
9803 | 0 | } |
9804 | 0 | } |
9805 | 0 | return true; |
9806 | 0 | } |
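The loop in dpcls_rule_matches_key() is a block-by-block masked comparison: the rule's flow is stored in masked form, so each packet block only has to be ANDed with the rule's mask and compared. A reduced, self-contained version over plain arrays (the miniflow iteration macros are left out; 'key' is assumed to be pre-masked, as dpcls rules are):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    static bool
    masked_match(const uint64_t *key, const uint64_t *mask,
                 const uint64_t *target, size_t n_blocks)
    {
        for (size_t i = 0; i < n_blocks; i++) {
            if ((target[i] & mask[i]) != key[i]) {
                return false;
            }
        }
        return true;
    }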
9807 | | |
9808 | | /* For each miniflow in 'keys' performs a classifier lookup writing the result |
9809 | | * into the corresponding slot in 'rules'. If a particular entry in 'keys' is |
9810 | | * NULL it is skipped. |
9811 | | * |
9812 | | * This function is optimized for use in the userspace datapath and therefore |
9813 | | * does not implement a lot of features available in the standard |
9814 | | * classifier_lookup() function. Specifically, it does not implement |
9815 | | * priorities, instead returning any rule which matches the flow. |
9816 | | * |
9817 | | * Returns true if all miniflows found a corresponding rule. */ |
9818 | | bool |
9819 | | dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[], |
9820 | | struct dpcls_rule **rules, const size_t cnt, |
9821 | | int *num_lookups_p) |
9822 | 0 | { |
9823 | | /* The received 'cnt' miniflows are the search-keys that will be processed |
9824 | | * to find a matching entry into the available subtables. |
9825 | | * The number of bits in map_type is equal to NETDEV_MAX_BURST. */ |
9826 | 0 | #define MAP_BITS (sizeof(uint32_t) * CHAR_BIT) |
9827 | 0 | BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST); |
9828 | |
9829 | 0 | struct dpcls_subtable *subtable; |
9830 | 0 | uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */ |
9831 | |
9832 | 0 | if (cnt != MAP_BITS) { |
9833 | 0 | keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */ |
9834 | 0 | } |
9835 | 0 | memset(rules, 0, cnt * sizeof *rules); |
9836 | |
9837 | 0 | int lookups_match = 0, subtable_pos = 1; |
9838 | 0 | uint32_t found_map; |
9839 | | |
9840 | | /* The Datapath classifier - aka dpcls - is composed of subtables. |
9841 | | * Subtables are dynamically created as needed when new rules are inserted. |
9842 | | * Each subtable collects rules with matches on a specific subset of packet |
9843 | | * fields as defined by the subtable's mask. We proceed to process every |
9844 | | * search-key against each subtable, but when a match is found for a |
9845 | | * search-key, the search for that key can stop because the rules are |
9846 | | * non-overlapping. */ |
9847 | 0 | PVECTOR_FOR_EACH (subtable, &cls->subtables) { |
9848 | | /* Call the subtable specific lookup function. */ |
9849 | 0 | found_map = subtable->lookup_func(subtable, keys_map, keys, rules); |
9850 | | |
9851 | | /* Count the number of subtables searched for this packet match. This |
9852 | | * estimates the "spread" of subtables looked at per matched packet. */ |
9853 | 0 | uint32_t pkts_matched = count_1bits(found_map); |
9854 | 0 | lookups_match += pkts_matched * subtable_pos; |
9855 | | |
9856 | | /* Clear the found rules, and return early if all packets are found. */ |
9857 | 0 | keys_map &= ~found_map; |
9858 | 0 | if (!keys_map) { |
9859 | 0 | if (num_lookups_p) { |
9860 | 0 | *num_lookups_p = lookups_match; |
9861 | 0 | } |
9862 | 0 | return true; |
9863 | 0 | } |
9864 | 0 | subtable_pos++; |
9865 | 0 | } |
9866 | | |
9867 | 0 | if (num_lookups_p) { |
9868 | 0 | *num_lookups_p = lookups_match; |
9869 | 0 | } |
9870 | | return false; |
9871 | 0 | } |
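'keys_map' carries one bit per miniflow in the batch; each subtable lookup returns 'found_map', whose bits are cleared from 'keys_map' so later subtables only see the still-unmatched packets. A worked example of the bit bookkeeping for a 4-packet batch (standalone C; MAP_BITS is 32 as in the code above):

    #include <assert.h>
    #include <limits.h>
    #include <stddef.h>
    #include <stdint.h>

    int
    main(void)
    {
        enum { MAP_BITS = sizeof(uint32_t) * CHAR_BIT };
        size_t cnt = 4;                      /* Packets in this batch. */
        uint32_t keys_map = UINT32_MAX;      /* Start with all bits set. */

        if (cnt != MAP_BITS) {
            keys_map >>= MAP_BITS - cnt;     /* Keep only 'cnt' low bits. */
        }
        assert(keys_map == 0xf);

        /* Suppose the first subtable matched packets 0 and 2. */
        uint32_t found_map = (1u << 0) | (1u << 2);
        keys_map &= ~found_map;              /* Packets 1 and 3 remain. */
        assert(keys_map == 0xa);

        /* dpcls_lookup() returns early once keys_map reaches zero. */
        return 0;
    }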