Coverage Report

Created: 2026-06-22 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openvswitch/lib/dpif-netdev.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
#include "dpif-netdev.h"
19
20
#include <ctype.h>
21
#include <errno.h>
22
#include <fcntl.h>
23
#include <inttypes.h>
24
#include <net/if.h>
25
#include <sys/types.h>
26
#include <netinet/in.h>
27
#include <stdint.h>
28
#include <stdlib.h>
29
#include <string.h>
30
#include <sys/ioctl.h>
31
#include <sys/socket.h>
32
#include <sys/stat.h>
33
#include <unistd.h>
34
35
#include "bitmap.h"
36
#include "ccmap.h"
37
#include "cmap.h"
38
#include "conntrack.h"
39
#include "conntrack-tp.h"
40
#include "coverage.h"
41
#include "ct-dpif.h"
42
#include "csum.h"
43
#include "dp-packet.h"
44
#include "dpif.h"
45
#include "dpif-netdev-dfc.h"
46
#include "dpif-netdev-dpcls.h"
47
#include "dpif-netdev-flow.h"
48
#include "dpif-netdev-perf.h"
49
#include "dpif-netdev-thread.h"
50
#include "dpif-offload.h"
51
#include "dpif-provider.h"
52
#include "dummy.h"
53
#include "fat-rwlock.h"
54
#include "flow.h"
55
#include "hmapx.h"
56
#include "id-fpool.h"
57
#include "id-pool.h"
58
#include "ipf.h"
59
#include "mov-avg.h"
60
#include "mpsc-queue.h"
61
#include "netdev.h"
62
#include "netdev-provider.h"
63
#include "netdev-vport.h"
64
#include "netlink.h"
65
#include "odp-execute.h"
66
#include "odp-util.h"
67
#include "openvswitch/dynamic-string.h"
68
#include "openvswitch/list.h"
69
#include "openvswitch/match.h"
70
#include "openvswitch/ofp-parse.h"
71
#include "openvswitch/ofp-print.h"
72
#include "openvswitch/ofpbuf.h"
73
#include "openvswitch/shash.h"
74
#include "openvswitch/vlog.h"
75
#include "ovs-numa.h"
76
#include "ovs-rcu.h"
77
#include "packets.h"
78
#include "openvswitch/poll-loop.h"
79
#include "pvector.h"
80
#include "random.h"
81
#include "seq.h"
82
#include "smap.h"
83
#include "sset.h"
84
#include "timeval.h"
85
#include "tnl-neigh-cache.h"
86
#include "tnl-ports.h"
87
#include "unixctl.h"
88
#include "util.h"
89
#include "uuid.h"
90
91
VLOG_DEFINE_THIS_MODULE(dpif_netdev);
92
93
/* Auto Load Balancing Defaults */
94
0
#define ALB_IMPROVEMENT_THRESHOLD    25
95
0
#define ALB_LOAD_THRESHOLD           95
96
0
#define ALB_REBALANCE_INTERVAL       1     /* 1 Min */
97
0
#define MAX_ALB_REBALANCE_INTERVAL   20000 /* 20000 Min */
98
0
#define MIN_TO_MSEC                  60000
99
100
#define FLOW_DUMP_MAX_BATCH 50
101
/* Use per thread recirc_depth to prevent recirculation loop. */
102
0
#define MAX_RECIRC_DEPTH 8
103
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
104
105
/* Use instant packet send by default. */
106
0
#define DEFAULT_TX_FLUSH_INTERVAL 0
107
108
/* Configuration parameters. */
109
enum { MAX_METERS = 1 << 18 };  /* Maximum number of meters. */
110
enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
111
112
COVERAGE_DEFINE(datapath_drop_meter);
113
COVERAGE_DEFINE(datapath_drop_upcall_error);
114
COVERAGE_DEFINE(datapath_drop_lock_error);
115
COVERAGE_DEFINE(datapath_drop_userspace_action_error);
116
COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
117
COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
118
COVERAGE_DEFINE(datapath_drop_recirc_error);
119
COVERAGE_DEFINE(datapath_drop_invalid_port);
120
COVERAGE_DEFINE(datapath_drop_invalid_bond);
121
COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
122
COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
123
COVERAGE_DEFINE(datapath_drop_hw_post_process);
124
COVERAGE_DEFINE(datapath_drop_hw_post_process_consumed);
125
126
/* Protects against changes to 'dp_netdevs'. */
127
static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
128
129
/* Contains all 'struct dp_netdev's. */
130
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
131
    = SHASH_INITIALIZER(&dp_netdevs);
132
133
static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
134
135
0
#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
136
0
                                     | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
137
0
                                     | CS_SRC_NAT | CS_DST_NAT)
138
0
#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
139
140
static struct odp_support dp_netdev_support = {
141
    .max_vlan_headers = SIZE_MAX,
142
    .max_mpls_depth = SIZE_MAX,
143
    .recirc = true,
144
    .ct_state = true,
145
    .ct_zone = true,
146
    .ct_mark = true,
147
    .ct_label = true,
148
    .ct_state_nat = true,
149
    .ct_orig_tuple = true,
150
    .ct_orig_tuple6 = true,
151
};
152
153

154
/* Simple non-wildcarding single-priority classifier. */
155
156
/* Time in microseconds between successive optimizations of the dpcls
157
 * subtable vector. */
158
0
#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
159
160
/* Time in microseconds of the interval in which rxq processing cycles used
161
 * in rxq to pmd assignments is measured and stored. */
162
0
#define PMD_INTERVAL_LEN 5000000LL
163
/* For converting PMD_INTERVAL_LEN to secs. */
164
0
#define INTERVAL_USEC_TO_SEC 1000000LL
165
166
/* Number of intervals for which cycles are stored
167
 * and used during rxq to pmd assignment. */
168
0
#define PMD_INTERVAL_MAX 12
169
170
/* Time in microseconds to try RCU quiescing. */
171
0
#define PMD_RCU_QUIESCE_INTERVAL 10000LL
172
173
/* Timer resolution for PMD threads in nanoseconds. */
174
0
#define PMD_TIMER_RES_NS 1000
175
176
/* Number of pkts Rx on an interface that will stop pmd thread sleeping. */
177
0
#define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2)
178
/* Time in uS to increment a pmd thread sleep time. */
179
0
#define PMD_SLEEP_INC_US 1
180
181
/* Associates a core id with a maximum sleep value.
 *
 * NOTE(review): presumably one parsed entry of the PMD "max sleep"
 * configuration (cf. 'max_sleep_list' in struct dp_netdev) -- confirm
 * against the code that fills it in. */
struct pmd_sleep {
    /* Core this entry applies to. */
    unsigned core_id;

    /* Maximum sleep for that core.  NOTE(review): unit is not visible in
     * this declaration -- confirm. */
    uint64_t max_sleep;
};
185
186
struct dpcls {
187
    struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
188
    odp_port_t in_port;
189
    struct cmap subtables_map;
190
    struct pvector subtables;
191
};
192
193
/* Data structure to keep packet order till fastpath processing.
 *
 * Each entry ties one packet to a flow pointer and a set of TCP flags. */
struct dp_packet_flow_map {
    /* The packet this entry describes. */
    struct dp_packet *packet;

    /* Flow associated with 'packet'.  NOTE(review): whether this may be NULL
     * is not visible here -- confirm against the users. */
    struct dp_netdev_flow *flow;

    /* TCP flags recorded for 'packet'. */
    uint16_t tcp_flags;
};
199
200
static void dpcls_init(struct dpcls *);
201
static void dpcls_destroy(struct dpcls *);
202
static void dpcls_sort_subtable_vector(struct dpcls *);
203
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
204
                         const struct netdev_flow_key *mask);
205
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
206
207
/* Set of supported meter flags */
208
#define DP_SUPPORTED_METER_FLAGS_MASK \
209
0
    (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
210
211
/* Set of supported meter band types */
212
#define DP_SUPPORTED_METER_BAND_TYPES           \
213
0
    ( 1 << OFPMBT13_DROP )
214
215
struct dp_meter_band {
216
    uint32_t rate;
217
    uint32_t burst_size;
218
    atomic_uint64_t bucket;          /* In 1/1000 packets for PKTPS,
219
                                      * or in bits for KBPS. */
220
    atomic_uint64_t packet_count;
221
    atomic_uint64_t byte_count;
222
};
223
224
struct dp_meter {
225
    struct cmap_node node;
226
    uint32_t id;
227
    uint16_t flags;
228
    uint16_t n_bands;
229
    uint32_t max_delta_t;
230
    atomic_uint64_t used;  /* Time of a last use in milliseconds. */
231
    atomic_uint64_t packet_count;
232
    atomic_uint64_t byte_count;
233
    struct dp_meter_band bands[];
234
};
235
236
struct pmd_auto_lb {
237
    bool do_dry_run;
238
    bool recheck_config;
239
    bool is_enabled;            /* Current status of Auto load balancing. */
240
    uint64_t rebalance_intvl;
241
    uint64_t rebalance_poll_timer;
242
    uint8_t rebalance_improve_thresh;
243
    atomic_uint8_t rebalance_load_thresh;
244
};
245
246
/* Rxq to pmd assignment types (stored in 'pmd_rxq_assign_type' of
 * struct dp_netdev). */
enum sched_assignment_type {
    SCHED_ROUNDROBIN,

    /* Default. */
    SCHED_CYCLES,

    SCHED_GROUP
};
251
252
/* Datapath based on the network device interface from netdev.h.
253
 *
254
 *
255
 * Thread-safety
256
 * =============
257
 *
258
 * Some members, marked 'const', are immutable.  Accessing other members
259
 * requires synchronization, as noted in more detail below.
260
 *
261
 * Acquisition order is, from outermost to innermost:
262
 *
263
 *    dp_netdev_mutex (global)
264
 *    port_rwlock
265
 *    bond_mutex
266
 *    non_pmd_mutex
267
 */
268
struct dp_netdev {
269
    const struct dpif_class *const class;
270
    const char *const name;
271
    const char *const full_name;
272
    struct ovs_refcount ref_cnt;
273
    atomic_flag destroyed;
274
275
    /* Ports.
276
     *
277
     * Any lookup into 'ports' or any access to the dp_netdev_ports found
278
     * through 'ports' requires taking 'port_rwlock'. */
279
    struct ovs_rwlock port_rwlock;
280
    struct hmap ports;
281
    struct seq *port_seq;       /* Incremented whenever a port changes. */
282
283
    /* The time that a packet can wait in output batch for sending. */
284
    atomic_uint32_t tx_flush_interval;
285
286
    /* Meters. */
287
    struct ovs_mutex meters_lock;
288
    struct cmap meters;
289
290
    /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
291
    atomic_uint32_t emc_insert_min;
292
    /* Enable collection of PMD performance metrics. */
293
    atomic_bool pmd_perf_metrics;
294
    /* Default max load based sleep request. */
295
    uint64_t pmd_max_sleep_default;
296
    /* Enable the SMC cache from ovsdb config */
297
    atomic_bool smc_enable_db;
298
299
    /* Protects access to ofproto-dpif-upcall interface during revalidator
300
     * thread synchronization. */
301
    struct fat_rwlock upcall_rwlock;
302
    upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
303
    void *upcall_aux;
304
305
    /* Callback function for notifying the purging of dp flows (during
306
     * reseting pmd deletion). */
307
    dp_purge_callback *dp_purge_cb;
308
    void *dp_purge_aux;
309
310
    /* Stores all 'struct dp_netdev_pmd_thread's. */
311
    struct cmap poll_threads;
312
    /* id pool for per thread static_tx_qid. */
313
    struct id_pool *tx_qid_pool;
314
    struct ovs_mutex tx_qid_pool_mutex;
315
    /* Rxq to pmd assignment type. */
316
    enum sched_assignment_type pmd_rxq_assign_type;
317
    bool pmd_iso;
318
319
    /* Protects the access of the 'struct dp_netdev_pmd_thread'
320
     * instance for non-pmd thread. */
321
    struct ovs_mutex non_pmd_mutex;
322
323
    /* Each pmd thread will store its pointer to
324
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
325
    ovsthread_key_t per_pmd_key;
326
327
    struct seq *reconfigure_seq;
328
    uint64_t last_reconfigure_seq;
329
    struct ovsthread_once once_set_config;
330
331
    /* Cpu mask for pin of pmd threads. */
332
    char *pmd_cmask;
333
334
    /* PMD max load based sleep request user string. */
335
    char *max_sleep_list;
336
337
    uint64_t last_tnl_conf_seq;
338
339
    struct conntrack *conntrack;
340
    struct pmd_auto_lb pmd_alb;
341
342
    /* Bonds. */
343
    struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
344
    struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
345
};
346
347
static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
348
                                                    odp_port_t)
349
    OVS_REQ_RDLOCK(dp->port_rwlock);
350
351
/* Indexes into the 'cycles' array of struct dp_netdev_rxq. */
enum rxq_cycles_counter_type {
    /* Cycles spent successfully polling and processing packets during the
     * current interval. */
    RXQ_CYCLES_PROC_CURR,

    /* Total cycles of all intervals that are used during rxq to pmd
     * assignment. */
    RXQ_CYCLES_PROC_HIST,

    /* Number of counter types; sizes the 'cycles' array. */
    RXQ_N_CYCLES
};
359
360
0
#define XPS_TIMEOUT 500000LL    /* In microseconds. */
361
362
/* Contained by struct dp_netdev_port's 'rxqs' member.  */
363
struct dp_netdev_rxq {
364
    struct dp_netdev_port *port;
365
    struct netdev_rxq *rx;
366
    unsigned core_id;                  /* Core to which this queue should be
367
                                          pinned. OVS_CORE_UNSPEC if the
368
                                          queue doesn't need to be pinned to a
369
                                          particular core. */
370
    atomic_count intrvl_idx;           /* Write index for 'cycles_intrvl'. */
371
    struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this queue. */
372
    bool is_vhost;                     /* Is rxq of a vhost port. */
373
374
    /* Counters of cycles spent successfully polling and processing pkts. */
375
    atomic_ullong cycles[RXQ_N_CYCLES];
376
    /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then
377
       sum them to yield the cycles used for an rxq. */
378
    atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX];
379
};
380
381
/* Requested tx queue mode of a port (type of 'txq_requested_mode' in
 * struct dp_netdev_port). */
enum txq_req_mode {
    TXQ_REQ_MODE_THREAD,
    TXQ_REQ_MODE_HASH,
};
385
386
/* Tx queue mode of a port: static, XPS or XPS_HASH (type of 'txq_mode' in
 * struct dp_netdev_port). */
enum txq_mode {
    TXQ_MODE_STATIC,
    TXQ_MODE_XPS,
    TXQ_MODE_XPS_HASH,
};
391
392
/* A port in a netdev-based datapath. */
393
struct dp_netdev_port {
394
    odp_port_t port_no;
395
    enum txq_mode txq_mode;     /* static, XPS, XPS_HASH. */
396
    bool need_reconfigure;      /* True if we should reconfigure netdev. */
397
    struct netdev *netdev;
398
    struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
399
    struct netdev_saved_flags *sf;
400
    struct dp_netdev_rxq *rxqs;
401
    unsigned n_rxq;             /* Number of elements in 'rxqs' */
402
    unsigned *txq_used;         /* Number of threads that use each tx queue. */
403
    struct ovs_mutex txq_used_mutex;
404
    bool emc_enabled;           /* If true EMC will be used. */
405
    char *type;                 /* Port type as requested by user. */
406
    char *rxq_affinity_list;    /* Requested affinity of rx queues. */
407
    enum txq_req_mode txq_requested_mode;
408
};
409
410
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
411
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
412
                                         struct flow *, bool);
413
414
struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
415
                                                   size_t);
416
struct dp_netdev_actions *dp_netdev_flow_get_actions(
417
    const struct dp_netdev_flow *);
418
static void dp_netdev_actions_free(struct dp_netdev_actions *);
419
420
struct polled_queue {
421
    struct dp_netdev_rxq *rxq;
422
    odp_port_t port_no;
423
    bool emc_enabled;
424
    bool rxq_enabled;
425
    uint64_t change_seq;
426
};
427
428
/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
429
struct rxq_poll {
430
    struct dp_netdev_rxq *rxq;
431
    struct hmap_node node;
432
};
433
434
/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
435
 * 'tnl_port_cache' or 'tx_ports'. */
436
struct tx_port {
437
    struct dp_netdev_port *port;
438
    int qid;
439
    long long last_used;
440
    struct hmap_node node;
441
    long long flush_time;
442
    struct dp_packet_batch output_pkts;
443
    struct dp_packet_batch *txq_pkts; /* Only for hash mode. */
444
    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
445
};
446
447
/* Contained by struct tx_bond 'member_buckets'. */
448
struct member_entry {
449
    odp_port_t member_id;
450
    atomic_ullong n_packets;
451
    atomic_ullong n_bytes;
452
};
453
454
/* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */
455
struct tx_bond {
456
    struct cmap_node node;
457
    uint32_t bond_id;
458
    struct member_entry member_buckets[BOND_BUCKETS];
459
};
460
461
/* Interface to netdev-based datapath. */
462
struct dpif_netdev {
463
    struct dpif dpif;
464
    struct dp_netdev *dp;
465
    uint64_t last_port_seq;
466
};
467
468
static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
469
                              struct dp_netdev_port **portp)
470
    OVS_REQ_RDLOCK(dp->port_rwlock);
471
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
472
                            struct dp_netdev_port **portp)
473
    OVS_REQ_RDLOCK(dp->port_rwlock);
474
static void dp_netdev_free(struct dp_netdev *)
475
    OVS_REQUIRES(dp_netdev_mutex);
476
static int do_add_port(struct dp_netdev *dp, const char *devname,
477
                       const char *type, odp_port_t port_no)
478
    OVS_REQ_WRLOCK(dp->port_rwlock);
479
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
480
    OVS_REQ_WRLOCK(dp->port_rwlock);
481
static int dpif_netdev_open(const struct dpif_class *, const char *name,
482
                            bool create, struct dpif **);
483
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
484
                                      struct dp_packet_batch *,
485
                                      bool should_steal,
486
                                      const struct flow *flow,
487
                                      const struct nlattr *actions,
488
                                      size_t actions_len);
489
static void dp_netdev_input(struct dp_netdev_pmd_thread *,
490
                            struct dp_packet_batch *, odp_port_t port_no);
491
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
492
                                  struct dp_packet_batch *);
493
494
static void dp_netdev_disable_upcall(struct dp_netdev *);
495
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
496
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
497
                                    struct dp_netdev *dp, unsigned core_id,
498
                                    int numa_id);
499
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
500
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
501
    OVS_REQ_WRLOCK(dp->port_rwlock);
502
503
static void *pmd_thread_main(void *);
504
static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
505
                                                      unsigned core_id);
506
static struct dp_netdev_pmd_thread *
507
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
508
static void dp_netdev_del_pmd(struct dp_netdev *dp,
509
                              struct dp_netdev_pmd_thread *pmd);
510
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
511
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
512
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
513
                                         struct dp_netdev_port *port)
514
    OVS_REQUIRES(pmd->port_mutex);
515
static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
516
                                           struct tx_port *tx)
517
    OVS_REQUIRES(pmd->port_mutex);
518
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
519
                                     struct dp_netdev_rxq *rxq)
520
    OVS_REQUIRES(pmd->port_mutex);
521
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
522
                                       struct rxq_poll *poll)
523
    OVS_REQUIRES(pmd->port_mutex);
524
static int
525
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
526
                                   bool force);
527
static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
528
                                         struct tx_bond *bond, bool update)
529
    OVS_EXCLUDED(pmd->bond_mutex);
530
static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
531
                                           uint32_t bond_id)
532
    OVS_EXCLUDED(pmd->bond_mutex);
533
534
static void reconfigure_datapath(struct dp_netdev *dp)
535
    OVS_REQ_RDLOCK(dp->port_rwlock);
536
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
537
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
538
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
539
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
540
    OVS_REQUIRES(pmd->port_mutex);
541
static inline void
542
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
543
                           struct polled_queue *poll_list, int poll_cnt);
544
static void
545
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
546
                         enum rxq_cycles_counter_type type,
547
                         unsigned long long cycles);
548
static uint64_t
549
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
550
                         enum rxq_cycles_counter_type type);
551
static void
552
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
553
                           unsigned long long cycles);
554
static uint64_t
555
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
556
static uint64_t
557
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
558
                    int num_to_read);
559
static void
560
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
561
                               bool purge);
562
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
563
                                      struct tx_port *tx);
564
static inline struct dpcls *dp_netdev_pmd_lookup_dpcls(
565
    struct dp_netdev_pmd_thread *pmd, odp_port_t in_port);
566
567
static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
568
static inline bool
569
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
570
571
static void dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
572
                                          struct dp_netdev_flow *flow)
573
    OVS_REQUIRES(pmd->flow_mutex);
574
static void dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
575
                                          struct dp_netdev_flow *flow)
576
    OVS_REQUIRES(pmd->flow_mutex);
577
578
static bool dp_netdev_flow_is_simple_match(const struct match *);
579
580
/* Updates the time in PMD threads context and should be called in three cases:
581
 *
582
 *     1. PMD structure initialization:
583
 *         - dp_netdev_configure_pmd()
584
 *
585
 *     2. Before processing of the new packet batch:
586
 *         - dpif_netdev_execute()
587
 *         - dp_netdev_process_rxq_port()
588
 *
589
 *     3. At least once per polling iteration in main polling threads if no
590
 *        packets received on current iteration:
591
 *         - dpif_netdev_run()
592
 *         - pmd_thread_main()
593
 *
594
 * 'pmd->ctx.now' should be used without update in all other cases if possible.
595
 */
596
static inline void
597
pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
598
0
{
599
0
    pmd->ctx.now = time_usec();
600
0
}
601
602
/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
603
bool
604
dpif_is_netdev(const struct dpif *dpif)
605
0
{
606
0
    return dpif->dpif_class->open == dpif_netdev_open;
607
0
}
608
609
static struct dpif_netdev *
610
dpif_netdev_cast(const struct dpif *dpif)
611
0
{
612
0
    ovs_assert(dpif_is_netdev(dpif));
613
0
    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
614
0
}
615
616
static struct dp_netdev *
617
get_dp_netdev(const struct dpif *dpif)
618
0
{
619
0
    return dpif_netdev_cast(dpif)->dp;
620
0
}
621

622
/* Kinds of per-pmd information request. */
enum pmd_info_type {
    /* Set the cycle and the packet counters to 0. */
    PMD_INFO_CLEAR_STATS,

    /* Show poll lists of pmd threads. */
    PMD_INFO_SHOW_RXQ,

    /* Show pmd performance details. */
    PMD_INFO_PERF_SHOW,

    /* Show max sleep configuration details. */
    PMD_INFO_SLEEP_SHOW,
};
628
629
static void
630
format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
631
0
{
632
0
    ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
633
0
                        ? "main thread" : "pmd thread");
634
0
    if (pmd->numa_id != OVS_NUMA_UNSPEC) {
635
0
        ds_put_format(reply, " numa_id %d", pmd->numa_id);
636
0
    }
637
0
    if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
638
0
        ds_put_format(reply, " core_id %u", pmd->core_id);
639
0
    }
640
0
    ds_put_cstr(reply, ":\n");
641
0
}
642
643
static void
644
pmd_info_show_perf(struct ds *reply,
645
                   struct dp_netdev_pmd_thread *pmd,
646
                   struct pmd_perf_params *par)
647
0
{
648
0
    char *time_str = xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
649
0
    long long now = time_msec();
650
0
    double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
651
652
0
    ds_put_cstr(reply, "\n");
653
0
    ds_put_format(reply, "Time: %s\n", time_str);
654
0
    ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
655
0
    ds_put_cstr(reply, "\n");
656
0
    format_pmd_thread(reply, pmd);
657
0
    ds_put_cstr(reply, "\n");
658
0
    pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration,
659
0
                                  pmd->core_id != NON_PMD_CORE_ID);
660
0
    if (pmd_perf_metrics_enabled(pmd) && pmd->core_id != NON_PMD_CORE_ID) {
661
        /* Prevent parallel clearing of perf metrics. */
662
0
        ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
663
0
        if (par->histograms) {
664
0
            ds_put_cstr(reply, "\n");
665
0
            pmd_perf_format_histograms(reply, &pmd->perf_stats);
666
0
        }
667
0
        if (par->iter_hist_len > 0) {
668
0
            ds_put_cstr(reply, "\n");
669
0
            pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
670
0
                    par->iter_hist_len);
671
0
        }
672
0
        if (par->ms_hist_len > 0) {
673
0
            ds_put_cstr(reply, "\n");
674
0
            pmd_perf_format_ms_history(reply, &pmd->perf_stats,
675
0
                    par->ms_hist_len);
676
0
        }
677
0
        ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
678
0
    }
679
0
    free(time_str);
680
0
}
681
682
static int
683
compare_poll_list(const void *a_, const void *b_)
684
0
{
685
0
    const struct rxq_poll *a = a_;
686
0
    const struct rxq_poll *b = b_;
687
688
0
    const char *namea = netdev_rxq_get_name(a->rxq->rx);
689
0
    const char *nameb = netdev_rxq_get_name(b->rxq->rx);
690
691
0
    int cmp = strcmp(namea, nameb);
692
0
    if (!cmp) {
693
0
        return netdev_rxq_get_queue_id(a->rxq->rx)
694
0
               - netdev_rxq_get_queue_id(b->rxq->rx);
695
0
    } else {
696
0
        return cmp;
697
0
    }
698
0
}
699
700
static void
701
sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
702
                 size_t *n)
703
    OVS_REQUIRES(pmd->port_mutex)
704
0
{
705
0
    struct rxq_poll *ret, *poll;
706
0
    size_t i;
707
708
0
    *n = hmap_count(&pmd->poll_list);
709
0
    if (!*n) {
710
0
        ret = NULL;
711
0
    } else {
712
0
        ret = xcalloc(*n, sizeof *ret);
713
0
        i = 0;
714
0
        HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
715
0
            ret[i] = *poll;
716
0
            i++;
717
0
        }
718
0
        ovs_assert(i == *n);
719
0
        qsort(ret, *n, sizeof *ret, compare_poll_list);
720
0
    }
721
722
0
    *list = ret;
723
0
}
724
725
static void
726
pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd,
727
                  int secs)
728
0
{
729
0
    if (pmd->core_id != NON_PMD_CORE_ID) {
730
0
        struct rxq_poll *list;
731
0
        size_t n_rxq;
732
0
        uint64_t total_pmd_cycles = 0;
733
0
        uint64_t busy_pmd_cycles = 0;
734
0
        uint64_t total_rxq_proc_cycles = 0;
735
0
        unsigned int intervals;
736
737
0
        ds_put_format(reply,
738
0
                      "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
739
0
                      pmd->numa_id, pmd->core_id, (pmd->isolated)
740
0
                                                  ? "true" : "false");
741
742
0
        ovs_mutex_lock(&pmd->port_mutex);
743
0
        sorted_poll_list(pmd, &list, &n_rxq);
744
745
        /* Get the total pmd cycles for an interval. */
746
0
        atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles);
747
        /* Calculate how many intervals are to be used. */
748
0
        intervals = DIV_ROUND_UP(secs,
749
0
                                 PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
750
        /* Estimate the cycles to cover all intervals. */
751
0
        total_pmd_cycles *= intervals;
752
0
        busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl,
753
0
                                              &pmd->intrvl_idx,
754
0
                                              intervals);
755
0
        if (busy_pmd_cycles > total_pmd_cycles) {
756
0
            busy_pmd_cycles = total_pmd_cycles;
757
0
        }
758
759
0
        for (int i = 0; i < n_rxq; i++) {
760
0
            struct dp_netdev_rxq *rxq = list[i].rxq;
761
0
            const char *name = netdev_rxq_get_name(rxq->rx);
762
0
            uint64_t rxq_proc_cycles = 0;
763
764
0
            rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl,
765
0
                                                  &rxq->intrvl_idx,
766
0
                                                  intervals);
767
0
            total_rxq_proc_cycles += rxq_proc_cycles;
768
0
            ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
769
0
                          netdev_rxq_get_queue_id(list[i].rxq->rx));
770
0
            ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
771
0
                                        ? "(enabled) " : "(disabled)");
772
0
            ds_put_format(reply, "  pmd usage: ");
773
0
            if (total_pmd_cycles) {
774
0
                ds_put_format(reply, "%2.0f %%",
775
0
                              (double) (rxq_proc_cycles * 100) /
776
0
                              total_pmd_cycles);
777
0
            } else {
778
0
                ds_put_format(reply, "%s", "NOT AVAIL");
779
0
            }
780
0
            ds_put_cstr(reply, "\n");
781
0
        }
782
783
0
        if (n_rxq > 0) {
784
0
            ds_put_cstr(reply, "  overhead: ");
785
0
            if (total_pmd_cycles) {
786
0
                uint64_t overhead_cycles = 0;
787
788
0
                if (total_rxq_proc_cycles < busy_pmd_cycles) {
789
0
                    overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles;
790
0
                }
791
792
0
                ds_put_format(reply, "%2.0f %%",
793
0
                              (double) (overhead_cycles * 100) /
794
0
                              total_pmd_cycles);
795
0
            } else {
796
0
                ds_put_cstr(reply, "NOT AVAIL");
797
0
            }
798
0
            ds_put_cstr(reply, "\n");
799
0
        }
800
801
0
        ovs_mutex_unlock(&pmd->port_mutex);
802
0
        free(list);
803
0
    }
804
0
}
805
806
static int
807
compare_poll_thread_list(const void *a_, const void *b_)
808
0
{
809
0
    const struct dp_netdev_pmd_thread *a, *b;
810
811
0
    a = *(struct dp_netdev_pmd_thread **)a_;
812
0
    b = *(struct dp_netdev_pmd_thread **)b_;
813
814
0
    if (a->core_id < b->core_id) {
815
0
        return -1;
816
0
    }
817
0
    if (a->core_id > b->core_id) {
818
0
        return 1;
819
0
    }
820
0
    return 0;
821
0
}
822
823
/* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
824
 * this list, as long as we do not go to quiescent state. */
825
static void
826
sorted_poll_thread_list(struct dp_netdev *dp,
827
                        struct dp_netdev_pmd_thread ***list,
828
                        size_t *n)
829
0
{
830
0
    struct dp_netdev_pmd_thread *pmd;
831
0
    struct dp_netdev_pmd_thread **pmd_list;
832
0
    size_t k = 0, n_pmds;
833
834
0
    n_pmds = cmap_count(&dp->poll_threads);
835
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
836
837
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
838
0
        if (k >= n_pmds) {
839
0
            break;
840
0
        }
841
0
        pmd_list[k++] = pmd;
842
0
    }
843
844
0
    qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
845
846
0
    *list = pmd_list;
847
0
    *n = k;
848
0
}
849
850
static void
851
dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
852
                          const char *argv[], void *aux OVS_UNUSED)
853
0
{
854
0
    struct ds reply = DS_EMPTY_INITIALIZER;
855
0
    struct dp_netdev *dp = NULL;
856
857
0
    ovs_mutex_lock(&dp_netdev_mutex);
858
859
0
    if (argc == 2) {
860
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
861
0
    } else if (shash_count(&dp_netdevs) == 1) {
862
        /* There's only one datapath */
863
0
        dp = shash_first(&dp_netdevs)->data;
864
0
    }
865
866
0
    if (!dp) {
867
0
        ovs_mutex_unlock(&dp_netdev_mutex);
868
0
        unixctl_command_reply_error(conn,
869
0
                                    "please specify an existing datapath");
870
0
        return;
871
0
    }
872
873
0
    dp_netdev_request_reconfigure(dp);
874
0
    ovs_mutex_unlock(&dp_netdev_mutex);
875
0
    ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
876
0
    unixctl_command_reply(conn, ds_cstr(&reply));
877
0
    ds_destroy(&reply);
878
0
}
879
880
static void
881
pmd_info_show_sleep(struct ds *reply, unsigned core_id, int numa_id,
882
                    uint64_t pmd_max_sleep)
883
0
{
884
0
    if (core_id == NON_PMD_CORE_ID) {
885
0
        return;
886
0
    }
887
0
    ds_put_format(reply,
888
0
                  "pmd thread numa_id %d core_id %d:\n"
889
0
                  "  max sleep: %4"PRIu64" us\n",
890
0
                  numa_id, core_id, pmd_max_sleep);
891
0
}
892
893
static void
894
dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
895
                     void *aux)
896
0
{
897
0
    struct ds reply = DS_EMPTY_INITIALIZER;
898
0
    struct dp_netdev_pmd_thread **pmd_list;
899
0
    struct dp_netdev *dp = NULL;
900
0
    enum pmd_info_type type = *(enum pmd_info_type *) aux;
901
0
    unsigned int core_id;
902
0
    bool filter_on_pmd = false;
903
0
    size_t n;
904
0
    unsigned int secs = 0;
905
0
    unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX)
906
0
                                      / INTERVAL_USEC_TO_SEC;
907
0
    bool show_header = true;
908
0
    uint64_t max_sleep;
909
910
0
    ovs_mutex_lock(&dp_netdev_mutex);
911
912
0
    while (argc > 1) {
913
0
        if (!strcmp(argv[1], "-pmd") && argc > 2) {
914
0
            if (str_to_uint(argv[2], 10, &core_id)) {
915
0
                filter_on_pmd = true;
916
0
            }
917
0
            argc -= 2;
918
0
            argv += 2;
919
0
        } else if (type == PMD_INFO_SHOW_RXQ &&
920
0
                       !strcmp(argv[1], "-secs") &&
921
0
                       argc > 2) {
922
0
            if (!str_to_uint(argv[2], 10, &secs)) {
923
0
                secs = max_secs;
924
0
            }
925
0
            argc -= 2;
926
0
            argv += 2;
927
0
        } else {
928
0
            dp = shash_find_data(&dp_netdevs, argv[1]);
929
0
            argc -= 1;
930
0
            argv += 1;
931
0
        }
932
0
    }
933
934
0
    if (!dp) {
935
0
        if (shash_count(&dp_netdevs) == 1) {
936
            /* There's only one datapath */
937
0
            dp = shash_first(&dp_netdevs)->data;
938
0
        } else {
939
0
            ovs_mutex_unlock(&dp_netdev_mutex);
940
0
            unixctl_command_reply_error(conn,
941
0
                                        "please specify an existing datapath");
942
0
            return;
943
0
        }
944
0
    }
945
946
0
    sorted_poll_thread_list(dp, &pmd_list, &n);
947
0
    for (size_t i = 0; i < n; i++) {
948
0
        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
949
0
        if (!pmd) {
950
0
            break;
951
0
        }
952
0
        if (filter_on_pmd && pmd->core_id != core_id) {
953
0
            continue;
954
0
        }
955
0
        if (type == PMD_INFO_SHOW_RXQ) {
956
0
            if (show_header) {
957
0
                if (!secs || secs > max_secs) {
958
0
                    secs = max_secs;
959
0
                } else {
960
0
                    secs = ROUND_UP(secs,
961
0
                                    PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
962
0
                }
963
0
                ds_put_format(&reply, "Displaying last %u seconds "
964
0
                              "pmd usage %%\n", secs);
965
0
                show_header = false;
966
0
            }
967
0
            pmd_info_show_rxq(&reply, pmd, secs);
968
0
        } else if (type == PMD_INFO_CLEAR_STATS) {
969
0
            pmd_perf_stats_clear(&pmd->perf_stats);
970
0
        } else if (type == PMD_INFO_PERF_SHOW) {
971
0
            pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
972
0
        } else if (type == PMD_INFO_SLEEP_SHOW) {
973
0
            if (show_header) {
974
0
                ds_put_format(&reply, "Default max sleep: %4"PRIu64" us\n",
975
0
                              dp->pmd_max_sleep_default);
976
0
                show_header = false;
977
0
            }
978
0
            atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
979
0
            pmd_info_show_sleep(&reply, pmd->core_id, pmd->numa_id,
980
0
                                max_sleep);
981
0
        }
982
0
    }
983
0
    free(pmd_list);
984
985
0
    ovs_mutex_unlock(&dp_netdev_mutex);
986
987
0
    unixctl_command_reply(conn, ds_cstr(&reply));
988
0
    ds_destroy(&reply);
989
0
}
990
991
static void
992
pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
993
                          const char *argv[],
994
                          void *aux OVS_UNUSED)
995
0
{
996
0
    struct pmd_perf_params par;
997
0
    long int it_hist = 0, ms_hist = 0;
998
0
    par.histograms = true;
999
1000
0
    while (argc > 1) {
1001
0
        if (!strcmp(argv[1], "-nh")) {
1002
0
            par.histograms = false;
1003
0
            argc -= 1;
1004
0
            argv += 1;
1005
0
        } else if (!strcmp(argv[1], "-it") && argc > 2) {
1006
0
            it_hist = strtol(argv[2], NULL, 10);
1007
0
            if (it_hist < 0) {
1008
0
                it_hist = 0;
1009
0
            } else if (it_hist > HISTORY_LEN) {
1010
0
                it_hist = HISTORY_LEN;
1011
0
            }
1012
0
            argc -= 2;
1013
0
            argv += 2;
1014
0
        } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1015
0
            ms_hist = strtol(argv[2], NULL, 10);
1016
0
            if (ms_hist < 0) {
1017
0
                ms_hist = 0;
1018
0
            } else if (ms_hist > HISTORY_LEN) {
1019
0
                ms_hist = HISTORY_LEN;
1020
0
            }
1021
0
            argc -= 2;
1022
0
            argv += 2;
1023
0
        } else {
1024
0
            break;
1025
0
        }
1026
0
    }
1027
0
    par.iter_hist_len = it_hist;
1028
0
    par.ms_hist_len = ms_hist;
1029
0
    par.command_type = PMD_INFO_PERF_SHOW;
1030
0
    dpif_netdev_pmd_info(conn, argc, argv, &par);
1031
0
}
1032
1033
static void
1034
dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
1035
                      const char *argv[], void *aux OVS_UNUSED)
1036
0
{
1037
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1038
0
    struct dp_netdev *dp = NULL;
1039
1040
0
    ovs_mutex_lock(&dp_netdev_mutex);
1041
0
    if (argc == 2) {
1042
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
1043
0
    } else if (shash_count(&dp_netdevs) == 1) {
1044
        /* There's only one datapath. */
1045
0
        dp = shash_first(&dp_netdevs)->data;
1046
0
    }
1047
0
    if (!dp) {
1048
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1049
0
        unixctl_command_reply_error(conn,
1050
0
                                    "please specify an existing datapath");
1051
0
        return;
1052
0
    }
1053
1054
0
    if (cmap_count(&dp->tx_bonds) > 0) {
1055
0
        struct tx_bond *dp_bond_entry;
1056
1057
0
        ds_put_cstr(&reply, "Bonds:\n");
1058
0
        CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
1059
0
            ds_put_format(&reply, "  bond-id %"PRIu32":\n",
1060
0
                          dp_bond_entry->bond_id);
1061
0
            for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
1062
0
                uint32_t member_id = odp_to_u32(
1063
0
                    dp_bond_entry->member_buckets[bucket].member_id);
1064
0
                ds_put_format(&reply,
1065
0
                              "    bucket %d - member %"PRIu32"\n",
1066
0
                              bucket, member_id);
1067
0
            }
1068
0
        }
1069
0
    }
1070
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1071
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1072
0
    ds_destroy(&reply);
1073
0
}
1074
1075

1076
static int
1077
dpif_netdev_init(void)
1078
0
{
1079
0
    static enum pmd_info_type clear_aux = PMD_INFO_CLEAR_STATS,
1080
0
                              poll_aux = PMD_INFO_SHOW_RXQ,
1081
0
                              sleep_aux = PMD_INFO_SLEEP_SHOW;
1082
1083
0
    unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1084
0
                             0, 3, dpif_netdev_pmd_info,
1085
0
                             (void *)&clear_aux);
1086
0
    unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] "
1087
0
                             "[-secs secs] [dp]",
1088
0
                             0, 5, dpif_netdev_pmd_info,
1089
0
                             (void *)&poll_aux);
1090
0
    unixctl_command_register("dpif-netdev/pmd-sleep-show", "[dp]",
1091
0
                             0, 1, dpif_netdev_pmd_info,
1092
0
                             (void *)&sleep_aux);
1093
0
    unixctl_command_register("dpif-netdev/pmd-perf-show",
1094
0
                             "[-nh] [-it iter-history-len]"
1095
0
                             " [-ms ms-history-len]"
1096
0
                             " [-pmd core] [dp]",
1097
0
                             0, 8, pmd_perf_show_cmd,
1098
0
                             NULL);
1099
    /* 'pmd-stats-show' is just an undocumented alias for 'pmd-perf-show',
1100
     * for compatibility with old muscle memory. */
1101
0
    unixctl_command_register("dpif-netdev/pmd-stats-show", NULL,
1102
0
                             0, 8, pmd_perf_show_cmd, NULL);
1103
0
    unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1104
0
                             0, 1, dpif_netdev_pmd_rebalance,
1105
0
                             NULL);
1106
0
    unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1107
0
                             "on|off [-b before] [-a after] [-e|-ne] "
1108
0
                             "[-us usec] [-q qlen]",
1109
0
                             0, 10, pmd_perf_log_set_cmd,
1110
0
                             NULL);
1111
0
    unixctl_command_register("dpif-netdev/bond-show", "[dp]",
1112
0
                             0, 1, dpif_netdev_bond_show,
1113
0
                             NULL);
1114
0
    return 0;
1115
0
}
1116
1117
static int
1118
dpif_netdev_enumerate(struct sset *all_dps,
1119
                      const struct dpif_class *dpif_class)
1120
0
{
1121
0
    struct shash_node *node;
1122
1123
0
    ovs_mutex_lock(&dp_netdev_mutex);
1124
0
    SHASH_FOR_EACH(node, &dp_netdevs) {
1125
0
        struct dp_netdev *dp = node->data;
1126
0
        if (dpif_class != dp->class) {
1127
            /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1128
             * If the class doesn't match, skip this dpif. */
1129
0
             continue;
1130
0
        }
1131
0
        sset_add(all_dps, node->name);
1132
0
    }
1133
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1134
1135
0
    return 0;
1136
0
}
1137
1138
static bool
1139
dpif_netdev_class_is_dummy(const struct dpif_class *class)
1140
0
{
1141
0
    return class != &dpif_netdev_class;
1142
0
}
1143
1144
/* Maps the port 'type' to the netdev type to open for a datapath of 'class'.
 *
 * Every type other than "internal" is returned unchanged.  "internal" becomes
 * "dummy-internal" for a dummy class and "tap" otherwise. */
static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
    if (strcmp(type, "internal")) {
        return type;
    }
    if (dpif_netdev_class_is_dummy(class)) {
        return "dummy-internal";
    }
    return "tap";
}
1151
1152
static struct dpif *
1153
create_dpif_netdev(struct dp_netdev *dp)
1154
0
{
1155
0
    uint16_t netflow_id = hash_string(dp->name, 0);
1156
0
    struct dpif_netdev *dpif;
1157
1158
0
    ovs_refcount_ref(&dp->ref_cnt);
1159
1160
0
    dpif = xmalloc(sizeof *dpif);
1161
0
    dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1162
0
    dpif->dp = dp;
1163
0
    dpif->last_port_seq = seq_read(dp->port_seq);
1164
1165
0
    return &dpif->dpif;
1166
0
}
1167
1168
/* Choose an unused, non-zero port number and return it on success.
1169
 * Return ODPP_NONE on failure. */
1170
static odp_port_t
1171
choose_port(struct dp_netdev *dp, const char *name)
1172
    OVS_REQ_RDLOCK(dp->port_rwlock)
1173
0
{
1174
0
    uint32_t port_no;
1175
1176
0
    if (dp->class != &dpif_netdev_class) {
1177
0
        const char *p;
1178
0
        int start_no = 0;
1179
1180
        /* If the port name begins with "br", start the number search at
1181
         * 100 to make writing tests easier. */
1182
0
        if (!strncmp(name, "br", 2)) {
1183
0
            start_no = 100;
1184
0
        }
1185
1186
        /* If the port name contains a number, try to assign that port number.
1187
         * This can make writing unit tests easier because port numbers are
1188
         * predictable. */
1189
0
        for (p = name; *p != '\0'; p++) {
1190
0
            if (isdigit((unsigned char) *p)) {
1191
0
                port_no = start_no + strtol(p, NULL, 10);
1192
0
                if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1193
0
                    && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1194
0
                    return u32_to_odp(port_no);
1195
0
                }
1196
0
                break;
1197
0
            }
1198
0
        }
1199
0
    }
1200
1201
0
    for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1202
0
        if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1203
0
            return u32_to_odp(port_no);
1204
0
        }
1205
0
    }
1206
1207
0
    return ODPP_NONE;
1208
0
}
1209
1210
/* Returns the hash under which the meter 'meter_id' is stored in a meters
 * cmap, which is 'meter_id' itself.
 *
 * In the ofproto-dpif layer, meter ids are allocated in order from an id-pool
 * (e.g. 1, 2, ... N), which already gives a good hash distribution, so the
 * id is used directly instead of a hash_xxx function for performance. */
static uint32_t
dp_meter_hash(uint32_t meter_id)
{
    uint32_t hash = meter_id;

    return hash;
}
1219
1220
static void
1221
dp_netdev_meter_destroy(struct dp_netdev *dp)
1222
0
{
1223
0
    struct dp_meter *m;
1224
1225
0
    ovs_mutex_lock(&dp->meters_lock);
1226
0
    CMAP_FOR_EACH (m, node, &dp->meters) {
1227
0
        cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id));
1228
0
        ovsrcu_postpone(free, m);
1229
0
    }
1230
1231
0
    cmap_destroy(&dp->meters);
1232
0
    ovs_mutex_unlock(&dp->meters_lock);
1233
0
    ovs_mutex_destroy(&dp->meters_lock);
1234
0
}
1235
1236
static struct dp_meter *
1237
dp_meter_lookup(struct cmap *meters, uint32_t meter_id)
1238
0
{
1239
0
    uint32_t hash = dp_meter_hash(meter_id);
1240
0
    struct dp_meter *m;
1241
1242
0
    CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) {
1243
0
        if (m->id == meter_id) {
1244
0
            return m;
1245
0
        }
1246
0
    }
1247
1248
0
    return NULL;
1249
0
}
1250
1251
static void
1252
dp_meter_detach_free(struct cmap *meters, uint32_t meter_id)
1253
0
{
1254
0
    struct dp_meter *m = dp_meter_lookup(meters, meter_id);
1255
1256
0
    if (m) {
1257
0
        cmap_remove(meters, &m->node, dp_meter_hash(meter_id));
1258
0
        ovsrcu_postpone(free, m);
1259
0
    }
1260
0
}
1261
1262
static void
1263
dp_meter_attach(struct cmap *meters, struct dp_meter *meter)
1264
0
{
1265
0
    cmap_insert(meters, &meter->node, dp_meter_hash(meter->id));
1266
0
}
1267
1268
static int
1269
create_dp_netdev(const char *name, const struct dpif_class *class,
1270
                 struct dp_netdev **dpp)
1271
    OVS_REQUIRES(dp_netdev_mutex)
1272
0
{
1273
0
    static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
1274
0
    struct dp_netdev *dp;
1275
0
    int error;
1276
1277
    /* Avoid estimating TSC frequency for dummy datapath to not slow down
1278
     * unit tests. */
1279
0
    if (!dpif_netdev_class_is_dummy(class)
1280
0
        && ovsthread_once_start(&tsc_freq_check)) {
1281
0
        pmd_perf_estimate_tsc_frequency();
1282
0
        ovsthread_once_done(&tsc_freq_check);
1283
0
    }
1284
1285
0
    dp = xzalloc(sizeof *dp);
1286
0
    shash_add(&dp_netdevs, name, dp);
1287
1288
0
    *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1289
0
    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1290
0
    *CONST_CAST(const char **, &dp->full_name) = xasprintf("%s@%s",
1291
0
                                                           class->type, name);
1292
0
    ovs_refcount_init(&dp->ref_cnt);
1293
0
    atomic_flag_clear(&dp->destroyed);
1294
1295
0
    ovs_rwlock_init(&dp->port_rwlock);
1296
0
    hmap_init(&dp->ports);
1297
0
    dp->port_seq = seq_create();
1298
0
    ovs_mutex_init(&dp->bond_mutex);
1299
0
    cmap_init(&dp->tx_bonds);
1300
1301
0
    fat_rwlock_init(&dp->upcall_rwlock);
1302
1303
0
    dp->reconfigure_seq = seq_create();
1304
0
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1305
0
    dp->once_set_config = (struct ovsthread_once) OVSTHREAD_ONCE_INITIALIZER;
1306
1307
    /* Init meter resources. */
1308
0
    cmap_init(&dp->meters);
1309
0
    ovs_mutex_init(&dp->meters_lock);
1310
1311
    /* Disable upcalls by default. */
1312
0
    dp_netdev_disable_upcall(dp);
1313
0
    dp->upcall_aux = NULL;
1314
0
    dp->upcall_cb = NULL;
1315
1316
0
    dp->conntrack = conntrack_init();
1317
1318
0
    atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1319
0
    atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1320
1321
0
    cmap_init(&dp->poll_threads);
1322
0
    dp->pmd_rxq_assign_type = SCHED_CYCLES;
1323
1324
0
    ovs_mutex_init(&dp->tx_qid_pool_mutex);
1325
    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1326
0
    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1327
1328
0
    ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1329
0
    ovsthread_key_create(&dp->per_pmd_key, NULL);
1330
1331
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1332
    /* non-PMD will be created before all other threads and will
1333
     * allocate static_tx_qid = 0. */
1334
0
    dp_netdev_set_nonpmd(dp);
1335
1336
0
    error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1337
0
                                                             "internal"),
1338
0
                        ODPP_LOCAL);
1339
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1340
0
    if (error) {
1341
0
        dp_netdev_free(dp);
1342
0
        return error;
1343
0
    }
1344
1345
0
    dp->max_sleep_list = NULL;
1346
1347
0
    dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1348
0
    *dpp = dp;
1349
0
    return 0;
1350
0
}
1351
1352
static void
1353
dp_netdev_request_reconfigure(struct dp_netdev *dp)
1354
0
{
1355
0
    seq_change(dp->reconfigure_seq);
1356
0
}
1357
1358
static bool
1359
dp_netdev_is_reconf_required(struct dp_netdev *dp)
1360
0
{
1361
0
    return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1362
0
}
1363
1364
static int
1365
dpif_netdev_open(const struct dpif_class *class, const char *name,
1366
                 bool create, struct dpif **dpifp)
1367
0
{
1368
0
    struct dp_netdev *dp;
1369
0
    int error;
1370
1371
0
    ovs_mutex_lock(&dp_netdev_mutex);
1372
0
    dp = shash_find_data(&dp_netdevs, name);
1373
0
    if (!dp) {
1374
0
        error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1375
0
    } else {
1376
0
        error = (dp->class != class ? EINVAL
1377
0
                 : create ? EEXIST
1378
0
                 : 0);
1379
0
    }
1380
0
    if (!error) {
1381
0
        *dpifp = create_dpif_netdev(dp);
1382
0
    }
1383
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1384
1385
0
    return error;
1386
0
}
1387
1388
static void
1389
dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1390
    OVS_NO_THREAD_SAFETY_ANALYSIS
1391
0
{
1392
    /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1393
0
    ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1394
1395
    /* Before freeing a lock we should release it */
1396
0
    fat_rwlock_unlock(&dp->upcall_rwlock);
1397
0
    fat_rwlock_destroy(&dp->upcall_rwlock);
1398
0
}
1399
1400
/* Returns the hash under which the bond 'bond_id' is stored in a datapath's
 * 'tx_bonds' cmap. */
static uint32_t
hash_bond_id(uint32_t bond_id)
{
    uint32_t hash = hash_int(bond_id, 0);

    return hash;
}
1405
1406
/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1407
 * through the 'dp_netdevs' shash while freeing 'dp'. */
1408
static void
1409
dp_netdev_free(struct dp_netdev *dp)
1410
    OVS_REQUIRES(dp_netdev_mutex)
1411
0
{
1412
0
    struct dp_netdev_port *port;
1413
0
    struct tx_bond *bond;
1414
1415
0
    shash_find_and_delete(&dp_netdevs, dp->name);
1416
1417
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1418
0
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
1419
0
        do_del_port(dp, port);
1420
0
    }
1421
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1422
1423
0
    ovs_mutex_lock(&dp->bond_mutex);
1424
0
    CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
1425
0
        cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
1426
0
        ovsrcu_postpone(free, bond);
1427
0
    }
1428
0
    ovs_mutex_unlock(&dp->bond_mutex);
1429
1430
0
    dp_netdev_destroy_all_pmds(dp, true);
1431
0
    cmap_destroy(&dp->poll_threads);
1432
1433
0
    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1434
0
    id_pool_destroy(dp->tx_qid_pool);
1435
1436
0
    ovs_mutex_destroy(&dp->non_pmd_mutex);
1437
0
    ovsthread_key_delete(dp->per_pmd_key);
1438
1439
0
    conntrack_destroy(dp->conntrack);
1440
1441
1442
0
    seq_destroy(dp->reconfigure_seq);
1443
0
    ovsthread_once_destroy(&dp->once_set_config);
1444
1445
0
    seq_destroy(dp->port_seq);
1446
0
    hmap_destroy(&dp->ports);
1447
0
    ovs_rwlock_destroy(&dp->port_rwlock);
1448
1449
0
    cmap_destroy(&dp->tx_bonds);
1450
0
    ovs_mutex_destroy(&dp->bond_mutex);
1451
1452
    /* Upcalls must be disabled at this point */
1453
0
    dp_netdev_destroy_upcall_lock(dp);
1454
1455
0
    dp_netdev_meter_destroy(dp);
1456
1457
0
    free(dp->max_sleep_list);
1458
0
    free(dp->pmd_cmask);
1459
0
    free(CONST_CAST(char *, dp->name));
1460
0
    free(CONST_CAST(char *, dp->full_name));
1461
0
    free(dp);
1462
0
}
1463
1464
static void
1465
dp_netdev_unref(struct dp_netdev *dp)
1466
0
{
1467
0
    if (dp) {
1468
        /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1469
         * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1470
0
        ovs_mutex_lock(&dp_netdev_mutex);
1471
0
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1472
0
            dp_netdev_free(dp);
1473
0
        }
1474
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1475
0
    }
1476
0
}
1477
1478
/* Closes 'dpif': drops the reference it holds on its datapath, then frees the
 * handle itself. */
static void
dpif_netdev_close(struct dpif *dpif)
{
    dp_netdev_unref(get_dp_netdev(dpif));
    free(dpif);
}
1486
1487
static int
1488
dpif_netdev_destroy(struct dpif *dpif)
1489
0
{
1490
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1491
1492
0
    if (!atomic_flag_test_and_set(&dp->destroyed)) {
1493
0
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1494
            /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1495
0
            OVS_NOT_REACHED();
1496
0
        }
1497
0
    }
1498
1499
0
    return 0;
1500
0
}
1501
1502
/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1503
 * load/store semantics.  While the increment is not atomic, the load and
1504
 * store operations are, making it impossible to read inconsistent values.
1505
 *
1506
 * This is used to update thread local stats counters. */
1507
static void
1508
non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1509
0
{
1510
0
    unsigned long long tmp;
1511
1512
0
    atomic_read_relaxed(var, &tmp);
1513
0
    tmp += n;
1514
0
    atomic_store_relaxed(var, tmp);
1515
0
}
1516
1517
static int
1518
dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1519
0
{
1520
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1521
0
    struct dp_netdev_pmd_thread *pmd;
1522
0
    uint64_t pmd_stats[PMD_N_STATS];
1523
1524
0
    stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1525
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1526
0
        stats->n_flows += cmap_count(&pmd->flow_table);
1527
0
        pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1528
0
        stats->n_hit += pmd_stats[PMD_STAT_PHWOL_HIT];
1529
0
        stats->n_hit += pmd_stats[PMD_STAT_SIMPLE_HIT];
1530
0
        stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1531
0
        stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
1532
0
        stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1533
0
        stats->n_missed += pmd_stats[PMD_STAT_MISS];
1534
0
        stats->n_lost += pmd_stats[PMD_STAT_LOST];
1535
0
    }
1536
0
    stats->n_masks = UINT32_MAX;
1537
0
    stats->n_mask_hit = UINT64_MAX;
1538
0
    stats->n_cache_hit = UINT64_MAX;
1539
1540
0
    return 0;
1541
0
}
1542
1543
static void
1544
dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1545
0
{
1546
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
1547
0
        ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1548
0
        ovs_mutex_lock(&pmd->port_mutex);
1549
0
        pmd_load_cached_ports(pmd);
1550
0
        ovs_mutex_unlock(&pmd->port_mutex);
1551
0
        ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1552
0
        return;
1553
0
    }
1554
1555
0
    seq_change(pmd->reload_seq);
1556
0
    atomic_store_explicit(&pmd->reload, true, memory_order_release);
1557
0
}
1558
1559
static uint32_t
1560
hash_port_no(odp_port_t port_no)
1561
0
{
1562
0
    return hash_int(odp_to_u32(port_no), 0);
1563
0
}
1564
1565
static int
1566
port_create(const char *devname, const char *type,
1567
            odp_port_t port_no, struct dp_netdev_port **portp)
1568
0
{
1569
0
    struct dp_netdev_port *port;
1570
0
    enum netdev_flags flags;
1571
0
    struct netdev *netdev;
1572
0
    int error;
1573
1574
0
    *portp = NULL;
1575
1576
    /* Open and validate network device. */
1577
0
    error = netdev_open(devname, type, &netdev);
1578
0
    if (error) {
1579
0
        return error;
1580
0
    }
1581
    /* XXX reject non-Ethernet devices */
1582
1583
0
    netdev_get_flags(netdev, &flags);
1584
0
    if (flags & NETDEV_LOOPBACK) {
1585
0
        VLOG_ERR("%s: cannot add a loopback device", devname);
1586
0
        error = EINVAL;
1587
0
        goto out;
1588
0
    }
1589
1590
0
    port = xzalloc(sizeof *port);
1591
0
    port->port_no = port_no;
1592
0
    port->netdev = netdev;
1593
0
    port->type = xstrdup(type);
1594
0
    port->sf = NULL;
1595
0
    port->emc_enabled = true;
1596
0
    port->need_reconfigure = true;
1597
0
    ovs_mutex_init(&port->txq_used_mutex);
1598
1599
0
    *portp = port;
1600
1601
0
    return 0;
1602
1603
0
out:
1604
0
    netdev_close(netdev);
1605
0
    return error;
1606
0
}
1607
1608
static int
1609
do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1610
            odp_port_t port_no)
1611
    OVS_REQ_WRLOCK(dp->port_rwlock)
1612
0
{
1613
0
    struct netdev_saved_flags *sf;
1614
0
    struct dp_netdev_port *port;
1615
0
    int error;
1616
1617
    /* Reject devices already in 'dp'. */
1618
0
    if (!get_port_by_name(dp, devname, &port)) {
1619
0
        return EEXIST;
1620
0
    }
1621
1622
0
    error = port_create(devname, type, port_no, &port);
1623
0
    if (error) {
1624
0
        return error;
1625
0
    }
1626
1627
0
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1628
0
    seq_change(dp->port_seq);
1629
1630
0
    reconfigure_datapath(dp);
1631
1632
    /* Check that port was successfully configured. */
1633
0
    if (!dp_netdev_lookup_port(dp, port_no)) {
1634
0
        return EINVAL;
1635
0
    }
1636
1637
    /* Updating device flags triggers an if_notifier, which triggers a bridge
1638
     * reconfiguration and another attempt to add this port, leading to an
1639
     * infinite loop if the device is configured incorrectly and cannot be
1640
     * added.  Setting the promisc mode after a successful reconfiguration,
1641
     * since we already know that the device is somehow properly configured. */
1642
0
    error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
1643
0
    if (error) {
1644
0
        VLOG_ERR("%s: cannot set promisc flag", devname);
1645
0
        do_del_port(dp, port);
1646
0
        return error;
1647
0
    }
1648
0
    port->sf = sf;
1649
1650
0
    return 0;
1651
0
}
1652
1653
static int
1654
dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1655
                     odp_port_t *port_nop)
1656
0
{
1657
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1658
0
    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1659
0
    const char *dpif_port;
1660
0
    odp_port_t port_no;
1661
0
    int error;
1662
1663
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1664
0
    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1665
0
    if (*port_nop != ODPP_NONE) {
1666
0
        port_no = *port_nop;
1667
0
        error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1668
0
    } else {
1669
0
        port_no = choose_port(dp, dpif_port);
1670
0
        error = port_no == ODPP_NONE ? EFBIG : 0;
1671
0
    }
1672
0
    if (!error) {
1673
0
        *port_nop = port_no;
1674
0
        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1675
0
    }
1676
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1677
1678
0
    return error;
1679
0
}
1680
1681
static int
1682
dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1683
0
{
1684
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1685
0
    int error;
1686
1687
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1688
0
    if (port_no == ODPP_LOCAL) {
1689
0
        error = EINVAL;
1690
0
    } else {
1691
0
        struct dp_netdev_port *port;
1692
1693
0
        error = get_port_by_number(dp, port_no, &port);
1694
0
        if (!error) {
1695
0
            do_del_port(dp, port);
1696
0
        }
1697
0
    }
1698
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1699
1700
0
    return error;
1701
0
}
1702
1703
static bool
1704
is_valid_port_number(odp_port_t port_no)
1705
0
{
1706
0
    return port_no != ODPP_NONE;
1707
0
}
1708
1709
static struct dp_netdev_port *
1710
dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1711
    OVS_REQ_RDLOCK(dp->port_rwlock)
1712
0
{
1713
0
    struct dp_netdev_port *port;
1714
1715
0
    HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1716
0
        if (port->port_no == port_no) {
1717
0
            return port;
1718
0
        }
1719
0
    }
1720
0
    return NULL;
1721
0
}
1722
1723
static int
1724
get_port_by_number(struct dp_netdev *dp,
1725
                   odp_port_t port_no, struct dp_netdev_port **portp)
1726
    OVS_REQ_RDLOCK(dp->port_rwlock)
1727
0
{
1728
0
    if (!is_valid_port_number(port_no)) {
1729
0
        *portp = NULL;
1730
0
        return EINVAL;
1731
0
    } else {
1732
0
        *portp = dp_netdev_lookup_port(dp, port_no);
1733
0
        return *portp ? 0 : ENODEV;
1734
0
    }
1735
0
}
1736
1737
static void
1738
port_destroy(struct dp_netdev_port *port)
1739
0
{
1740
0
    if (!port) {
1741
0
        return;
1742
0
    }
1743
1744
0
    netdev_close(port->netdev);
1745
0
    netdev_restore_flags(port->sf);
1746
1747
0
    for (unsigned i = 0; i < port->n_rxq; i++) {
1748
0
        netdev_rxq_close(port->rxqs[i].rx);
1749
0
    }
1750
0
    ovs_mutex_destroy(&port->txq_used_mutex);
1751
0
    free(port->rxq_affinity_list);
1752
0
    free(port->txq_used);
1753
0
    free(port->rxqs);
1754
0
    free(port->type);
1755
0
    free(port);
1756
0
}
1757
1758
static int
1759
get_port_by_name(struct dp_netdev *dp,
1760
                 const char *devname, struct dp_netdev_port **portp)
1761
    OVS_REQ_RDLOCK(dp->port_rwlock)
1762
0
{
1763
0
    struct dp_netdev_port *port;
1764
1765
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
1766
0
        if (!strcmp(netdev_get_name(port->netdev), devname)) {
1767
0
            *portp = port;
1768
0
            return 0;
1769
0
        }
1770
0
    }
1771
1772
    /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a non
1773
     * existing port. */
1774
0
    return ENODEV;
1775
0
}
1776
1777
/* Returns 'true' if there is a port with pmd netdev. */
1778
static bool
1779
has_pmd_port(struct dp_netdev *dp)
1780
    OVS_REQ_RDLOCK(dp->port_rwlock)
1781
0
{
1782
0
    struct dp_netdev_port *port;
1783
1784
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
1785
0
        if (netdev_is_pmd(port->netdev)) {
1786
0
            return true;
1787
0
        }
1788
0
    }
1789
1790
0
    return false;
1791
0
}
1792
1793
static void
1794
do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
1795
    OVS_REQ_WRLOCK(dp->port_rwlock)
1796
0
{
1797
0
    hmap_remove(&dp->ports, &port->node);
1798
0
    seq_change(dp->port_seq);
1799
1800
0
    reconfigure_datapath(dp);
1801
0
    port_destroy(port);
1802
0
}
1803
1804
static void
1805
answer_port_query(const struct dp_netdev_port *port,
1806
                  struct dpif_port *dpif_port)
1807
0
{
1808
0
    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
1809
0
    dpif_port->type = xstrdup(port->type);
1810
0
    dpif_port->port_no = port->port_no;
1811
0
}
1812
1813
static int
1814
dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
1815
                                 struct dpif_port *dpif_port)
1816
0
{
1817
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1818
0
    struct dp_netdev_port *port;
1819
0
    int error;
1820
1821
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
1822
0
    error = get_port_by_number(dp, port_no, &port);
1823
0
    if (!error && dpif_port) {
1824
0
        answer_port_query(port, dpif_port);
1825
0
    }
1826
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1827
1828
0
    return error;
1829
0
}
1830
1831
static int
1832
dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
1833
                               struct dpif_port *dpif_port)
1834
0
{
1835
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1836
0
    struct dp_netdev_port *port;
1837
0
    int error;
1838
1839
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
1840
0
    error = get_port_by_name(dp, devname, &port);
1841
0
    if (!error && dpif_port) {
1842
0
        answer_port_query(port, dpif_port);
1843
0
    }
1844
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1845
1846
0
    return error;
1847
0
}
1848
1849
static void
1850
dp_netdev_flow_free(struct dp_netdev_flow *flow)
1851
0
{
1852
0
    dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
1853
0
    free(flow->dp_extra_info);
1854
0
    free(flow);
1855
0
}
1856
1857
void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1858
0
{
1859
0
    if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1860
0
        ovsrcu_postpone(dp_netdev_flow_free, flow);
1861
0
    }
1862
0
}
1863
1864
static inline struct dpcls *
1865
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
1866
                           odp_port_t in_port)
1867
0
{
1868
0
    struct dpcls *cls;
1869
0
    uint32_t hash = hash_port_no(in_port);
1870
0
    CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
1871
0
        if (cls->in_port == in_port) {
1872
            /* Port classifier exists already */
1873
0
            return cls;
1874
0
        }
1875
0
    }
1876
0
    return NULL;
1877
0
}
1878
1879
static inline struct dpcls *
1880
dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
1881
                         odp_port_t in_port)
1882
    OVS_REQUIRES(pmd->flow_mutex)
1883
0
{
1884
0
    struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1885
1886
0
    if (!cls) {
1887
0
        uint32_t hash = hash_port_no(in_port);
1888
1889
        /* Create new classifier for in_port */
1890
0
        cls = xmalloc(sizeof(*cls));
1891
0
        dpcls_init(cls);
1892
0
        cls->in_port = in_port;
1893
0
        cmap_insert(&pmd->classifiers, &cls->node, hash);
1894
0
        VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
1895
0
    }
1896
0
    return cls;
1897
0
}
1898
1899
static void
1900
log_netdev_flow_change(const struct dp_netdev_flow *flow,
1901
                       const struct match *match,
1902
                       const struct dp_netdev_actions *old_actions,
1903
                       const struct nlattr *actions,
1904
                       size_t actions_len)
1905
0
{
1906
0
    struct ds ds = DS_EMPTY_INITIALIZER;
1907
0
    struct ofpbuf key_buf, mask_buf;
1908
0
    struct odp_flow_key_parms odp_parms = {
1909
0
        .flow = &match->flow,
1910
0
        .mask = &match->wc.masks,
1911
0
        .support = dp_netdev_support,
1912
0
    };
1913
1914
0
    if (OVS_LIKELY(VLOG_DROP_DBG((&upcall_rl)))) {
1915
0
        return;
1916
0
    }
1917
1918
0
    ofpbuf_init(&key_buf, 0);
1919
0
    ofpbuf_init(&mask_buf, 0);
1920
1921
0
    odp_flow_key_from_flow(&odp_parms, &key_buf);
1922
0
    odp_parms.key_buf = &key_buf;
1923
0
    odp_flow_key_from_mask(&odp_parms, &mask_buf);
1924
1925
0
    if (old_actions) {
1926
0
        ds_put_cstr(&ds, "flow_mod: ");
1927
0
    } else {
1928
0
        ds_put_cstr(&ds, "flow_add: ");
1929
0
    }
1930
0
    odp_format_ufid(&flow->ufid, &ds);
1931
0
    ds_put_cstr(&ds, " mega_");
1932
0
    odp_format_ufid(&flow->mega_ufid, &ds);
1933
0
    ds_put_cstr(&ds, " ");
1934
0
    odp_flow_format(key_buf.data, key_buf.size,
1935
0
                    mask_buf.data, mask_buf.size,
1936
0
                    NULL, &ds, false, true);
1937
0
    if (old_actions) {
1938
0
        ds_put_cstr(&ds, ", old_actions:");
1939
0
        format_odp_actions(&ds, old_actions->actions, old_actions->size,
1940
0
                           NULL);
1941
0
    }
1942
0
    ds_put_cstr(&ds, ", actions:");
1943
0
    format_odp_actions(&ds, actions, actions_len, NULL);
1944
1945
0
    VLOG_DBG("%s", ds_cstr(&ds));
1946
1947
0
    ofpbuf_uninit(&key_buf);
1948
0
    ofpbuf_uninit(&mask_buf);
1949
1950
    /* Add a printout of the actual match installed. */
1951
0
    struct match m;
1952
0
    ds_clear(&ds);
1953
0
    ds_put_cstr(&ds, "flow match: ");
1954
0
    miniflow_expand(&flow->cr.flow.mf, &m.flow);
1955
0
    miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
1956
0
    memset(&m.tun_md, 0, sizeof m.tun_md);
1957
0
    match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
1958
1959
0
    VLOG_DBG("%s", ds_cstr(&ds));
1960
1961
0
    ds_destroy(&ds);
1962
0
}
1963
1964
/* Offloaded flows can be handled asynchronously, so we do not always know
1965
 * whether a specific flow is offloaded or not.  It might still be pending;
1966
 * in fact, multiple modifications can be pending, and the actual offload
1967
 * state depends on the completion of each modification.
1968
 *
1969
 * To correctly determine whether a flow is offloaded when it is being
1970
 * destroyed (and therefore requires cleanup), we must ensure that all
1971
 * operations have completed.  To achieve this, we track the number of
1972
 * outstanding offloaded flow modifications. */
1973
static bool
1974
offload_queue_inc(struct dp_netdev_flow *flow)
1975
0
{
1976
0
    int current;
1977
1978
0
    while (true) {
1979
0
        atomic_read(&flow->offload_queue_depth, &current);
1980
0
        if (current < 0) {
1981
            /* We are cleaning up, so no longer enqueue operations. */
1982
0
            return false;
1983
0
        }
1984
1985
        /* Here we try to atomically increase the value.  If we do not succeed,
1986
         * someone else has modified it, and we need to check again for a
1987
         * current negative value. */
1988
0
        if (atomic_compare_exchange_strong(&flow->offload_queue_depth,
1989
0
                                           &current, current + 1)) {
1990
0
            return true;
1991
0
        }
1992
0
    }
1993
0
}
1994
1995
static bool
1996
offload_queue_dec(struct dp_netdev_flow *flow)
1997
0
{
1998
0
    int old;
1999
2000
0
    atomic_sub(&flow->offload_queue_depth, 1, &old);
2001
0
    ovs_assert(old >= 1);
2002
2003
0
    if (old == 1) {
2004
        /* Note that this only indicates that the queue might be empty. */
2005
0
        return true;
2006
0
    }
2007
0
    return false;
2008
0
}
2009
2010
static bool
2011
offload_queue_complete(struct dp_netdev_flow *flow)
2012
0
{
2013
    /* This function returns false if the queue is still in use.
2014
     * If the queue is empty, it will attempt to atomically mark it as
2015
     * 'not in use' by making the queue depth negative.  This prevents
2016
     * other flow operations from being added.  If successful, it returns
2017
     * true. */
2018
0
     int expected_val = 0;
2019
2020
0
    return atomic_compare_exchange_strong(&flow->offload_queue_depth,
2021
0
                                          &expected_val, -1);
2022
0
}
2023
2024
static void
2025
offload_flow_reference_unreference_cb(unsigned pmd_id OVS_UNUSED,
2026
                                      void *flow_reference_)
2027
0
{
2028
0
    struct dp_netdev_flow *flow_reference = flow_reference_;
2029
2030
0
    if (flow_reference) {
2031
0
        flow_reference->offloaded = false;
2032
0
        dp_netdev_flow_unref(flow_reference);
2033
0
    }
2034
0
}
2035
2036
static void
2037
offload_flow_del_resume(struct dp_netdev_flow *flow_reference,
2038
                        int error)
2039
0
{
2040
0
    if (error == EINPROGRESS) {
2041
0
        return;
2042
0
    }
2043
2044
0
    if (error) {
2045
0
        odp_port_t in_port = flow_reference->flow.in_port.odp_port;
2046
2047
0
        VLOG_DBG(
2048
0
            "Failed removing offload flow ufid " UUID_FMT " from port %d: %d",
2049
0
            UUID_ARGS((struct uuid *)&flow_reference->mega_ufid), in_port,
2050
0
            error);
2051
0
    } else {
2052
        /* Release because we successfully removed the reference. */
2053
0
        dp_netdev_flow_unref(flow_reference);
2054
0
    }
2055
2056
    /* Release as we took a reference in offload_flow_del(). */
2057
0
    dp_netdev_flow_unref(flow_reference);
2058
0
}
2059
2060
static void
2061
offload_flow_del_resume_cb(void *aux OVS_UNUSED,
2062
                           struct dpif_flow_stats *stats OVS_UNUSED,
2063
                           unsigned pmd_id OVS_UNUSED,
2064
                           void *flow_reference,
2065
                           void *previous_flow_reference OVS_UNUSED, int error)
2066
0
{
2067
0
    offload_flow_del_resume(flow_reference, error);
2068
0
}
2069
2070
static void
2071
offload_flow_del(struct dp_netdev *dp, unsigned pmd_id,
2072
                 struct dp_netdev_flow *flow)
2073
0
{
2074
0
    odp_port_t in_port = flow->flow.in_port.odp_port;
2075
0
    struct dpif_offload_flow_del del = {
2076
0
        .in_port = in_port,
2077
0
        .pmd_id = pmd_id,
2078
0
        .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid),
2079
0
        .flow_reference = flow,
2080
0
        .stats = NULL,
2081
0
        .cb_data = { .callback = offload_flow_del_resume_cb },
2082
0
    };
2083
0
    int error;
2084
2085
0
    if (!dpif_offload_enabled()) {
2086
0
        return;
2087
0
    }
2088
2089
    /* This offload flow delete is only called when the actual flow is
2090
     * destructed.  However, we can only trust the state of flow->offloaded
2091
     * if no more flow_put operations are pending.  Below, we check whether
2092
     * the queue can be marked as complete, and then determine if we need
2093
     * to schedule a removal.  If not, the delete will be rescheduled later
2094
     * in the last offload_flow_put_resume_cb() callback. */
2095
0
    ovs_assert(flow->dead);
2096
0
    if (!offload_queue_complete(flow) || !flow->offloaded) {
2097
0
        return;
2098
0
    }
2099
2100
0
    flow->offloaded = false;
2101
0
    dp_netdev_flow_ref(flow);
2102
2103
    /* It's the responsibility of the offload provider to remove the
2104
     * actual rule from hardware only if none of the other PMD threads
2105
     * have the rule installed in hardware. */
2106
0
    error = dpif_offload_datapath_flow_del(dp->full_name, &del);
2107
0
    offload_flow_del_resume(flow, error);
2108
0
}
2109
2110
static void
2111
dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2112
                          struct dp_netdev_flow *flow)
2113
    OVS_REQUIRES(pmd->flow_mutex)
2114
0
{
2115
0
    struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2116
0
    struct dpcls *cls;
2117
0
    odp_port_t in_port = flow->flow.in_port.odp_port;
2118
2119
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2120
0
    ovs_assert(cls != NULL);
2121
0
    dpcls_remove(cls, &flow->cr);
2122
0
    dp_netdev_simple_match_remove(pmd, flow);
2123
0
    cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2124
0
    ccmap_dec(&pmd->n_flows, odp_to_u32(in_port));
2125
0
    flow->dead = true;
2126
0
    offload_flow_del(pmd->dp, pmd->core_id, flow);
2127
2128
0
    dp_netdev_flow_unref(flow);
2129
0
}
2130
2131
static void
2132
dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2133
0
{
2134
0
    struct dp_netdev_flow *netdev_flow;
2135
2136
0
    ovs_mutex_lock(&pmd->flow_mutex);
2137
0
    CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2138
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2139
0
    }
2140
0
    ovs_mutex_unlock(&pmd->flow_mutex);
2141
0
}
2142
2143
static int
2144
dpif_netdev_flow_flush(struct dpif *dpif)
2145
0
{
2146
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2147
0
    struct dp_netdev_pmd_thread *pmd;
2148
2149
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2150
0
        dp_netdev_pmd_flow_flush(pmd);
2151
0
    }
2152
2153
0
    return 0;
2154
0
}
2155
2156
struct dp_netdev_port_state {
2157
    struct hmap_position position;
2158
    char *name;
2159
};
2160
2161
static int
2162
dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2163
0
{
2164
0
    *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2165
0
    return 0;
2166
0
}
2167
2168
static int
2169
dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2170
                           struct dpif_port *dpif_port)
2171
0
{
2172
0
    struct dp_netdev_port_state *state = state_;
2173
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2174
0
    struct hmap_node *node;
2175
0
    int retval;
2176
2177
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
2178
0
    node = hmap_at_position(&dp->ports, &state->position);
2179
0
    if (node) {
2180
0
        struct dp_netdev_port *port;
2181
2182
0
        port = CONTAINER_OF(node, struct dp_netdev_port, node);
2183
2184
0
        free(state->name);
2185
0
        state->name = xstrdup(netdev_get_name(port->netdev));
2186
0
        dpif_port->name = state->name;
2187
0
        dpif_port->type = port->type;
2188
0
        dpif_port->port_no = port->port_no;
2189
2190
0
        retval = 0;
2191
0
    } else {
2192
0
        retval = EOF;
2193
0
    }
2194
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2195
2196
0
    return retval;
2197
0
}
2198
2199
static int
2200
dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2201
0
{
2202
0
    struct dp_netdev_port_state *state = state_;
2203
0
    free(state->name);
2204
0
    free(state);
2205
0
    return 0;
2206
0
}
2207
2208
static int
2209
dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2210
0
{
2211
0
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2212
0
    uint64_t new_port_seq;
2213
0
    int error;
2214
2215
0
    new_port_seq = seq_read(dpif->dp->port_seq);
2216
0
    if (dpif->last_port_seq != new_port_seq) {
2217
0
        dpif->last_port_seq = new_port_seq;
2218
0
        error = ENOBUFS;
2219
0
    } else {
2220
0
        error = EAGAIN;
2221
0
    }
2222
2223
0
    return error;
2224
0
}
2225
2226
static void
2227
dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2228
0
{
2229
0
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2230
2231
0
    seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2232
0
}
2233
2234
static struct dp_netdev_flow *
2235
dp_netdev_flow_cast(const struct dpcls_rule *cr)
2236
0
{
2237
0
    return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2238
0
}
2239
2240
static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2241
0
{
2242
0
    return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2243
0
}
2244
2245
/* netdev_flow_key utilities.
2246
 *
2247
 * netdev_flow_key is basically a miniflow.  We use these functions
2248
 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2249
 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2250
 *
2251
 * - Since we are dealing exclusively with miniflows created by
2252
 *   miniflow_extract(), if the map is different the miniflow is different.
2253
 *   Therefore we can be faster by comparing the map and the miniflow in a
2254
 *   single memcmp().
2255
 * - These functions can be inlined by the compiler. */
2256
2257
static inline bool
2258
netdev_flow_key_equal(const struct netdev_flow_key *a,
2259
                      const struct netdev_flow_key *b)
2260
0
{
2261
    /* 'b->len' may be not set yet. */
2262
0
    return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2263
0
}
2264
2265
static inline void
2266
netdev_flow_key_clone(struct netdev_flow_key *dst,
2267
                      const struct netdev_flow_key *src)
2268
0
{
2269
0
    memcpy(dst, src,
2270
0
           offsetof(struct netdev_flow_key, mf) + src->len);
2271
0
}
2272
2273
/* Initialize a netdev_flow_key 'mask' from 'match'. */
2274
static inline void
2275
netdev_flow_mask_init(struct netdev_flow_key *mask,
2276
                      const struct match *match)
2277
0
{
2278
0
    uint64_t *dst = miniflow_values(&mask->mf);
2279
0
    struct flowmap fmap;
2280
0
    uint32_t hash = 0;
2281
0
    size_t idx;
2282
2283
    /* Only check masks that make sense for the flow. */
2284
0
    flow_wc_map(&match->flow, &fmap);
2285
0
    flowmap_init(&mask->mf.map);
2286
2287
0
    FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2288
0
        uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2289
2290
0
        if (mask_u64) {
2291
0
            flowmap_set(&mask->mf.map, idx, 1);
2292
0
            *dst++ = mask_u64;
2293
0
            hash = hash_add64(hash, mask_u64);
2294
0
        }
2295
0
    }
2296
2297
0
    map_t map;
2298
2299
0
    FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2300
0
        hash = hash_add64(hash, map);
2301
0
    }
2302
2303
0
    size_t n = dst - miniflow_get_values(&mask->mf);
2304
2305
0
    mask->hash = hash_finish(hash, n * 8);
2306
0
    mask->len = netdev_flow_key_size(n);
2307
0
}
2308
2309
/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2310
static inline void
2311
netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2312
                            const struct flow *flow,
2313
                            const struct netdev_flow_key *mask)
2314
0
{
2315
0
    uint64_t *dst_u64 = miniflow_values(&dst->mf);
2316
0
    const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2317
0
    uint32_t hash = 0;
2318
0
    uint64_t value;
2319
2320
0
    dst->len = mask->len;
2321
0
    dst->mf = mask->mf;   /* Copy maps. */
2322
2323
0
    FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2324
0
        *dst_u64 = value & *mask_u64++;
2325
0
        hash = hash_add64(hash, *dst_u64++);
2326
0
    }
2327
0
    dst->hash = hash_finish(hash,
2328
0
                            (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2329
0
}
2330
2331
/* Initializes 'key' as a copy of 'flow'. */
2332
static inline void
2333
netdev_flow_key_init(struct netdev_flow_key *key,
2334
                     const struct flow *flow)
2335
0
{
2336
0
    uint32_t hash = 0;
2337
0
    uint64_t value;
2338
2339
0
    miniflow_map_init(&key->mf, flow);
2340
0
    miniflow_init(&key->mf, flow);
2341
2342
0
    size_t n = miniflow_n_values(&key->mf);
2343
2344
0
    FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) {
2345
0
        hash = hash_add64(hash, value);
2346
0
    }
2347
2348
0
    key->hash = hash_finish(hash, n * 8);
2349
0
    key->len = netdev_flow_key_size(n);
2350
0
}
2351
2352
static inline void
2353
emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2354
                 const struct netdev_flow_key *key)
2355
0
{
2356
0
    if (ce->flow != flow) {
2357
0
        if (ce->flow) {
2358
0
            dp_netdev_flow_unref(ce->flow);
2359
0
        }
2360
2361
0
        if (dp_netdev_flow_ref(flow)) {
2362
0
            ce->flow = flow;
2363
0
        } else {
2364
0
            ce->flow = NULL;
2365
0
        }
2366
0
    }
2367
0
    if (key) {
2368
0
        netdev_flow_key_clone(&ce->key, key);
2369
0
    }
2370
0
}
2371
2372
static inline void
2373
emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2374
           struct dp_netdev_flow *flow)
2375
0
{
2376
0
    struct emc_entry *to_be_replaced = NULL;
2377
0
    struct emc_entry *current_entry;
2378
2379
0
    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2380
0
        if (netdev_flow_key_equal(&current_entry->key, key)) {
2381
            /* We found the entry with the 'mf' miniflow */
2382
0
            emc_change_entry(current_entry, flow, NULL);
2383
0
            return;
2384
0
        }
2385
2386
        /* Replacement policy: put the flow in an empty (not alive) entry, or
2387
         * in the first entry where it can be */
2388
0
        if (!to_be_replaced
2389
0
            || (emc_entry_alive(to_be_replaced)
2390
0
                && !emc_entry_alive(current_entry))
2391
0
            || current_entry->key.hash < to_be_replaced->key.hash) {
2392
0
            to_be_replaced = current_entry;
2393
0
        }
2394
0
    }
2395
    /* We didn't find the miniflow in the cache.
2396
     * The 'to_be_replaced' entry is where the new flow will be stored */
2397
2398
0
    emc_change_entry(to_be_replaced, flow, key);
2399
0
}
2400
2401
static inline void
2402
emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2403
                         const struct netdev_flow_key *key,
2404
                         struct dp_netdev_flow *flow)
2405
0
{
2406
    /* Insert an entry into the EMC based on probability value 'min'. By
2407
     * default the value is UINT32_MAX / 100 which yields an insertion
2408
     * probability of 1/100 ie. 1% */
2409
2410
0
    uint32_t min = pmd->ctx.emc_insert_min;
2411
2412
0
    if (min && random_uint32() <= min) {
2413
0
        emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
2414
0
    }
2415
0
}
2416
2417
static inline const struct cmap_node *
2418
smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
2419
0
{
2420
0
    struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
2421
0
    struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
2422
0
    uint16_t sig = hash >> 16;
2423
0
    uint16_t index = UINT16_MAX;
2424
2425
0
    for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2426
0
        if (bucket->sig[i] == sig) {
2427
0
            index = bucket->flow_idx[i];
2428
0
            break;
2429
0
        }
2430
0
    }
2431
0
    if (index != UINT16_MAX) {
2432
0
        return cmap_find_by_index(&pmd->flow_table, index);
2433
0
    }
2434
0
    return NULL;
2435
0
}
2436
2437
/* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
2438
 * turned off, 2) the flow_table index is larger than uint16_t can handle.
2439
 * If there is already an SMC entry having same signature, the index will be
2440
 * updated. If there is no existing entry, but an empty entry is available,
2441
 * the empty entry will be taken. If no empty entry or existing same signature,
2442
 * a random entry from the hashed bucket will be picked. */
2443
static inline void
2444
smc_insert(struct dp_netdev_pmd_thread *pmd,
2445
           const struct netdev_flow_key *key,
2446
           uint32_t hash)
2447
0
{
2448
0
    struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
2449
0
    struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
2450
0
    uint16_t index;
2451
0
    uint32_t cmap_index;
2452
0
    int i;
2453
2454
0
    if (!pmd->ctx.smc_enable_db) {
2455
0
        return;
2456
0
    }
2457
2458
0
    cmap_index = cmap_find_index(&pmd->flow_table, hash);
2459
0
    index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
2460
2461
    /* If the index is larger than SMC can handle (uint16_t), we don't
2462
     * insert */
2463
0
    if (index == UINT16_MAX) {
2464
0
        return;
2465
0
    }
2466
2467
    /* If an entry with same signature already exists, update the index */
2468
0
    uint16_t sig = key->hash >> 16;
2469
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2470
0
        if (bucket->sig[i] == sig) {
2471
0
            bucket->flow_idx[i] = index;
2472
0
            return;
2473
0
        }
2474
0
    }
2475
    /* If there is an empty entry, occupy it. */
2476
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2477
0
        if (bucket->flow_idx[i] == UINT16_MAX) {
2478
0
            bucket->sig[i] = sig;
2479
0
            bucket->flow_idx[i] = index;
2480
0
            return;
2481
0
        }
2482
0
    }
2483
    /* Otherwise, pick a random entry. */
2484
0
    i = random_uint32() % SMC_ENTRY_PER_BUCKET;
2485
0
    bucket->sig[i] = sig;
2486
0
    bucket->flow_idx[i] = index;
2487
0
}
2488
2489
inline void
2490
emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
2491
                               const struct netdev_flow_key *keys,
2492
                               struct dpcls_rule **rules,
2493
                               uint32_t emc_insert_mask)
2494
0
{
2495
0
    while (emc_insert_mask) {
2496
0
        uint32_t i = raw_ctz(emc_insert_mask);
2497
0
        emc_insert_mask &= emc_insert_mask - 1;
2498
        /* Get the require parameters for EMC/SMC from the rule */
2499
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
2500
        /* Insert the key into EMC/SMC. */
2501
0
        emc_probabilistic_insert(pmd, &keys[i], flow);
2502
0
    }
2503
0
}
2504
2505
inline void
2506
smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
2507
                 const struct netdev_flow_key *keys,
2508
                 struct dpcls_rule **rules,
2509
                 uint32_t smc_insert_mask)
2510
0
{
2511
0
    while (smc_insert_mask) {
2512
0
        uint32_t i = raw_ctz(smc_insert_mask);
2513
0
        smc_insert_mask &= smc_insert_mask - 1;
2514
        /* Get the require parameters for EMC/SMC from the rule */
2515
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
2516
0
        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
2517
        /* Insert the key into EMC/SMC. */
2518
0
        smc_insert(pmd, &keys[i], hash);
2519
0
    }
2520
0
}
2521
2522
static struct dp_netdev_flow *
2523
dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2524
                          const struct netdev_flow_key *key,
2525
                          int *lookup_num_p)
2526
0
{
2527
0
    struct dpcls *cls;
2528
0
    struct dpcls_rule *rule = NULL;
2529
0
    odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
2530
0
                                                     in_port.odp_port));
2531
0
    struct dp_netdev_flow *netdev_flow = NULL;
2532
2533
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2534
0
    if (OVS_LIKELY(cls)) {
2535
0
        dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
2536
0
        netdev_flow = dp_netdev_flow_cast(rule);
2537
0
    }
2538
0
    return netdev_flow;
2539
0
}
2540
2541
static struct dp_netdev_flow *
2542
dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
2543
                        const ovs_u128 *ufidp, const struct nlattr *key,
2544
                        size_t key_len)
2545
0
{
2546
0
    struct dp_netdev_flow *netdev_flow;
2547
0
    struct flow flow;
2548
0
    ovs_u128 ufid;
2549
2550
    /* If a UFID is not provided, determine one based on the key. */
2551
0
    if (!ufidp && key && key_len
2552
0
        && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
2553
0
        odp_flow_key_hash(&flow, sizeof flow, &ufid);
2554
0
        ufidp = &ufid;
2555
0
    }
2556
2557
0
    if (ufidp) {
2558
0
        CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
2559
0
                                 &pmd->flow_table) {
2560
0
            if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
2561
0
                return netdev_flow;
2562
0
            }
2563
0
        }
2564
0
    }
2565
2566
0
    return NULL;
2567
0
}
2568
2569
static void
2570
get_dpif_flow_status(const struct dp_netdev *dp,
2571
                     const struct dp_netdev_flow *netdev_flow_,
2572
                     struct dpif_flow_stats *stats,
2573
                     struct dpif_flow_attrs *attrs)
2574
0
{
2575
0
    struct dpif_flow_stats offload_stats;
2576
0
    struct dpif_flow_attrs offload_attrs;
2577
0
    struct dp_netdev_flow *netdev_flow;
2578
0
    unsigned long long n;
2579
0
    long long used;
2580
0
    uint16_t flags;
2581
2582
0
    netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
2583
2584
0
    atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
2585
0
    stats->n_packets = n;
2586
0
    atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
2587
0
    stats->n_bytes = n;
2588
0
    atomic_read_relaxed(&netdev_flow->stats.used, &used);
2589
0
    stats->used = used;
2590
0
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
2591
0
    stats->tcp_flags = flags;
2592
2593
0
    if (dpif_offload_datapath_flow_stats(dp->full_name,
2594
0
                                         netdev_flow->flow.in_port.odp_port,
2595
0
                                         &netdev_flow->mega_ufid,
2596
0
                                         &offload_stats, &offload_attrs)) {
2597
0
        stats->n_packets += offload_stats.n_packets;
2598
0
        stats->n_bytes += offload_stats.n_bytes;
2599
0
        stats->used = MAX(stats->used, offload_stats.used);
2600
0
        stats->tcp_flags |= offload_stats.tcp_flags;
2601
0
        if (attrs) {
2602
0
            attrs->offloaded = offload_attrs.offloaded;
2603
0
            attrs->dp_layer = offload_attrs.dp_layer;
2604
0
        }
2605
0
    } else if (attrs) {
2606
0
        attrs->offloaded = false;
2607
0
        attrs->dp_layer = "ovs";
2608
0
    }
2609
0
}
2610
2611
/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
2612
 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
2613
 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
2614
 * protect them. */
2615
static void
2616
dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
2617
                            const struct dp_netdev_flow *netdev_flow,
2618
                            struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
2619
                            struct dpif_flow *flow, bool terse)
2620
0
{
2621
0
    if (terse) {
2622
0
        memset(flow, 0, sizeof *flow);
2623
0
    } else {
2624
0
        struct flow_wildcards wc;
2625
0
        struct dp_netdev_actions *actions;
2626
0
        size_t offset;
2627
0
        struct odp_flow_key_parms odp_parms = {
2628
0
            .flow = &netdev_flow->flow,
2629
0
            .mask = &wc.masks,
2630
0
            .support = dp_netdev_support,
2631
0
        };
2632
2633
0
        miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
2634
        /* in_port is exact matched, but we have left it out from the mask for
2635
         * optimnization reasons. Add in_port back to the mask. */
2636
0
        wc.masks.in_port.odp_port = ODPP_NONE;
2637
2638
        /* Key */
2639
0
        offset = key_buf->size;
2640
0
        flow->key = ofpbuf_tail(key_buf);
2641
0
        odp_flow_key_from_flow(&odp_parms, key_buf);
2642
0
        flow->key_len = key_buf->size - offset;
2643
2644
        /* Mask */
2645
0
        offset = mask_buf->size;
2646
0
        flow->mask = ofpbuf_tail(mask_buf);
2647
0
        odp_parms.key_buf = key_buf;
2648
0
        odp_flow_key_from_mask(&odp_parms, mask_buf);
2649
0
        flow->mask_len = mask_buf->size - offset;
2650
2651
        /* Actions */
2652
0
        actions = dp_netdev_flow_get_actions(netdev_flow);
2653
0
        flow->actions = actions->actions;
2654
0
        flow->actions_len = actions->size;
2655
0
    }
2656
2657
0
    flow->ufid = netdev_flow->ufid;
2658
0
    flow->ufid_present = true;
2659
0
    flow->pmd_id = netdev_flow->pmd_id;
2660
2661
0
    get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
2662
0
    flow->attrs.dp_extra_info = netdev_flow->dp_extra_info;
2663
0
}
2664
2665
static int
2666
dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2667
                              const struct nlattr *mask_key,
2668
                              uint32_t mask_key_len, const struct flow *flow,
2669
                              struct flow_wildcards *wc, bool probe)
2670
0
{
2671
0
    enum odp_key_fitness fitness;
2672
2673
0
    fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
2674
0
    if (fitness) {
2675
0
        if (!probe) {
2676
            /* This should not happen: it indicates that
2677
             * odp_flow_key_from_mask() and odp_flow_key_to_mask()
2678
             * disagree on the acceptable form of a mask.  Log the problem
2679
             * as an error, with enough details to enable debugging. */
2680
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2681
2682
0
            if (!VLOG_DROP_ERR(&rl)) {
2683
0
                struct ds s;
2684
2685
0
                ds_init(&s);
2686
0
                odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
2687
0
                                true, true);
2688
0
                VLOG_ERR("internal error parsing flow mask %s (%s)",
2689
0
                ds_cstr(&s), odp_key_fitness_to_string(fitness));
2690
0
                ds_destroy(&s);
2691
0
            }
2692
0
        }
2693
2694
0
        return EINVAL;
2695
0
    }
2696
2697
0
    return 0;
2698
0
}
2699
2700
static int
2701
dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2702
                              struct flow *flow, bool probe)
2703
0
{
2704
0
    if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
2705
0
        if (!probe) {
2706
            /* This should not happen: it indicates that
2707
             * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
2708
             * the acceptable form of a flow.  Log the problem as an error,
2709
             * with enough details to enable debugging. */
2710
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2711
2712
0
            if (!VLOG_DROP_ERR(&rl)) {
2713
0
                struct ds s;
2714
2715
0
                ds_init(&s);
2716
0
                odp_flow_format(key, key_len, NULL, 0, NULL, &s, true, false);
2717
0
                VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
2718
0
                ds_destroy(&s);
2719
0
            }
2720
0
        }
2721
2722
0
        return EINVAL;
2723
0
    }
2724
2725
0
    if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
2726
0
        return EINVAL;
2727
0
    }
2728
2729
0
    return 0;
2730
0
}
2731
2732
static int
2733
dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
2734
0
{
2735
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2736
0
    struct dp_netdev_flow *netdev_flow;
2737
0
    struct dp_netdev_pmd_thread *pmd;
2738
0
    struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
2739
0
    struct hmapx_node *node;
2740
0
    int error = EINVAL;
2741
2742
0
    if (get->pmd_id == PMD_ID_NULL) {
2743
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2744
0
            if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
2745
0
                dp_netdev_pmd_unref(pmd);
2746
0
            }
2747
0
        }
2748
0
    } else {
2749
0
        pmd = dp_netdev_get_pmd(dp, get->pmd_id);
2750
0
        if (!pmd) {
2751
0
            goto out;
2752
0
        }
2753
0
        hmapx_add(&to_find, pmd);
2754
0
    }
2755
2756
0
    if (!hmapx_count(&to_find)) {
2757
0
        goto out;
2758
0
    }
2759
2760
0
    HMAPX_FOR_EACH (node, &to_find) {
2761
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
2762
0
        netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
2763
0
                                              get->key_len);
2764
0
        if (netdev_flow) {
2765
0
            dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
2766
0
                                        get->buffer, get->flow, false);
2767
0
            error = 0;
2768
0
            break;
2769
0
        } else {
2770
0
            error = ENOENT;
2771
0
        }
2772
0
    }
2773
2774
0
    HMAPX_FOR_EACH (node, &to_find) {
2775
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
2776
0
        dp_netdev_pmd_unref(pmd);
2777
0
    }
2778
0
out:
2779
0
    hmapx_destroy(&to_find);
2780
0
    return error;
2781
0
}
2782
2783
static void
2784
dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
2785
0
{
2786
0
    struct {
2787
0
        struct flow masked_flow;
2788
0
        struct flow wc;
2789
0
    } key;
2790
0
    size_t i;
2791
2792
0
    memset(&key, 0, sizeof key);
2793
0
    for (i = 0; i < sizeof(struct flow); i++) {
2794
0
        ((uint8_t *)&key.masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
2795
0
                                           ((uint8_t *)&match->wc)[i];
2796
0
        ((uint8_t *)&key.wc)[i] = ((uint8_t *)&match->wc)[i];
2797
0
    }
2798
2799
0
    odp_flow_key_hash(&key, sizeof key, mega_ufid);
2800
0
}
2801
2802
static uint64_t
2803
dp_netdev_simple_match_mark(odp_port_t in_port, ovs_be16 dl_type,
2804
                            uint8_t nw_frag, ovs_be16 vlan_tci)
2805
0
{
2806
    /* Simple Match Mark:
2807
     *
2808
     * BE:
2809
     * +-----------------+-------------++---------+---+-----------+
2810
     * |     in_port     |   dl_type   || nw_frag |CFI|  VID(12)  |
2811
     * +-----------------+-------------++---------+---+-----------+
2812
     * 0                 32          47 49         51  52     63
2813
     *
2814
     * LE:
2815
     * +-----------------+-------------+------++-------+---+------+
2816
     * |     in_port     |   dl_type   |VID(8)||nw_frag|CFI|VID(4)|
2817
     * +-----------------+-------------+------++-------+---+------+
2818
     * 0                 32          47 48  55  57   59 60  61   63
2819
     *
2820
     *         Big Endian              Little Endian
2821
     * in_port : 32 bits [ 0..31]  in_port : 32 bits [ 0..31]
2822
     * dl_type : 16 bits [32..47]  dl_type : 16 bits [32..47]
2823
     * <empty> :  1 bit  [48..48]  vlan VID:  8 bits [48..55]
2824
     * nw_frag :  2 bits [49..50]  <empty> :  1 bit  [56..56]
2825
     * vlan CFI:  1 bit  [51..51]  nw_frag :  2 bits [57..59]
2826
     * vlan VID: 12 bits [52..63]  vlan CFI:  1 bit  [60..60]
2827
     *                             vlan VID:  4 bits [61..63]
2828
     *
2829
     * Layout is different for LE and BE in order to save a couple of
2830
     * network to host translations.
2831
     * */
2832
0
    return ((uint64_t) odp_to_u32(in_port) << 32)
2833
0
           | ((OVS_FORCE uint32_t) dl_type << 16)
2834
#if WORDS_BIGENDIAN
2835
           | (((uint16_t) nw_frag & FLOW_NW_FRAG_MASK) << VLAN_PCP_SHIFT)
2836
#else
2837
0
           | ((nw_frag & FLOW_NW_FRAG_MASK) << (VLAN_PCP_SHIFT - 8))
2838
0
#endif
2839
0
           | (OVS_FORCE uint16_t) (vlan_tci & htons(VLAN_VID_MASK | VLAN_CFI));
2840
0
}
2841
2842
static struct dp_netdev_flow *
2843
dp_netdev_simple_match_lookup(const struct dp_netdev_pmd_thread *pmd,
2844
                              odp_port_t in_port, ovs_be16 dl_type,
2845
                              uint8_t nw_frag, ovs_be16 vlan_tci)
2846
0
{
2847
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
2848
0
                                                nw_frag, vlan_tci);
2849
0
    uint32_t hash = hash_uint64(mark);
2850
0
    struct dp_netdev_flow *flow;
2851
0
    bool found = false;
2852
2853
0
    CMAP_FOR_EACH_WITH_HASH (flow, simple_match_node,
2854
0
                             hash, &pmd->simple_match_table) {
2855
0
        if (flow->simple_match_mark == mark) {
2856
0
            found = true;
2857
0
            break;
2858
0
        }
2859
0
    }
2860
0
    return found ? flow : NULL;
2861
0
}
2862
2863
static bool
2864
dp_netdev_simple_match_enabled(const struct dp_netdev_pmd_thread *pmd,
2865
                               odp_port_t in_port)
2866
0
{
2867
0
    return ccmap_find(&pmd->n_flows, odp_to_u32(in_port))
2868
0
           == ccmap_find(&pmd->n_simple_flows, odp_to_u32(in_port));
2869
0
}
2870
2871
static void
2872
dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
2873
                              struct dp_netdev_flow *dp_flow)
2874
    OVS_REQUIRES(pmd->flow_mutex)
2875
0
{
2876
0
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
2877
0
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
2878
0
    ovs_be16 dl_type = dp_flow->flow.dl_type;
2879
0
    uint8_t nw_frag = dp_flow->flow.nw_frag;
2880
2881
0
    if (!dp_netdev_flow_ref(dp_flow)) {
2882
0
        return;
2883
0
    }
2884
2885
    /* Avoid double insertion.  Should not happen in practice. */
2886
0
    dp_netdev_simple_match_remove(pmd, dp_flow);
2887
2888
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
2889
0
                                                nw_frag, vlan_tci);
2890
0
    uint32_t hash = hash_uint64(mark);
2891
2892
0
    dp_flow->simple_match_mark = mark;
2893
0
    cmap_insert(&pmd->simple_match_table,
2894
0
                CONST_CAST(struct cmap_node *, &dp_flow->simple_match_node),
2895
0
                hash);
2896
0
    ccmap_inc(&pmd->n_simple_flows, odp_to_u32(in_port));
2897
2898
0
    VLOG_DBG("Simple match insert: "
2899
0
             "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
2900
0
             pmd->core_id, in_port, mark);
2901
0
}
2902
2903
static void
2904
dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
2905
                               struct dp_netdev_flow *dp_flow)
2906
    OVS_REQUIRES(pmd->flow_mutex)
2907
0
{
2908
0
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
2909
0
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
2910
0
    ovs_be16 dl_type = dp_flow->flow.dl_type;
2911
0
    uint8_t nw_frag = dp_flow->flow.nw_frag;
2912
0
    struct dp_netdev_flow *flow;
2913
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
2914
0
                                                nw_frag, vlan_tci);
2915
0
    uint32_t hash = hash_uint64(mark);
2916
2917
0
    flow = dp_netdev_simple_match_lookup(pmd, in_port, dl_type,
2918
0
                                         nw_frag, vlan_tci);
2919
0
    if (flow == dp_flow) {
2920
0
        VLOG_DBG("Simple match remove: "
2921
0
                 "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
2922
0
                 pmd->core_id, in_port, mark);
2923
0
        cmap_remove(&pmd->simple_match_table,
2924
0
                    CONST_CAST(struct cmap_node *, &flow->simple_match_node),
2925
0
                    hash);
2926
0
        ccmap_dec(&pmd->n_simple_flows, odp_to_u32(in_port));
2927
0
        dp_netdev_flow_unref(flow);
2928
0
    }
2929
0
}
2930
2931
static bool
2932
dp_netdev_flow_is_simple_match(const struct match *match)
2933
0
{
2934
0
    const struct flow *flow = &match->flow;
2935
0
    const struct flow_wildcards *wc = &match->wc;
2936
2937
0
    if (flow->recirc_id || flow->packet_type != htonl(PT_ETH)) {
2938
0
        return false;
2939
0
    }
2940
2941
    /* Check that flow matches only minimal set of fields that always set.
2942
     * Also checking that VLAN VID+CFI is an exact match, because these
2943
     * are not mandatory and could be masked. */
2944
0
    struct flow_wildcards *minimal = xmalloc(sizeof *minimal);
2945
0
    ovs_be16 vlan_tci_mask = htons(VLAN_VID_MASK | VLAN_CFI);
2946
2947
0
    flow_wildcards_init_catchall(minimal);
2948
    /* 'dpif-netdev' always has following in exact match:
2949
     *   - recirc_id                   <-- recirc_id == 0 checked on input.
2950
     *   - in_port                     <-- Will be checked on input.
2951
     *   - packet_type                 <-- Assuming all packets are PT_ETH.
2952
     *   - dl_type                     <-- Need to match with.
2953
     *   - vlan_tci                    <-- Need to match with.
2954
     *   - and nw_frag for ip packets. <-- Need to match with.
2955
     */
2956
0
    WC_MASK_FIELD(minimal, recirc_id);
2957
0
    WC_MASK_FIELD(minimal, in_port);
2958
0
    WC_MASK_FIELD(minimal, packet_type);
2959
0
    WC_MASK_FIELD(minimal, dl_type);
2960
0
    WC_MASK_FIELD_MASK(minimal, vlans[0].tci, vlan_tci_mask);
2961
0
    WC_MASK_FIELD_MASK(minimal, nw_frag, FLOW_NW_FRAG_MASK);
2962
2963
0
    if (flow_wildcards_has_extra(minimal, wc)
2964
0
        || wc->masks.vlans[0].tci != vlan_tci_mask) {
2965
0
        free(minimal);
2966
0
        return false;
2967
0
    }
2968
0
    free(minimal);
2969
2970
0
    return true;
2971
0
}
2972
2973
static void
2974
offload_flow_put_resume(struct dp_netdev *dp, struct dp_netdev_flow *flow,
2975
                        struct dp_netdev_flow *previous_flow_reference,
2976
                        unsigned pmd_id, int error)
2977
0
{
2978
0
    if (error == EINPROGRESS) {
2979
0
        return;
2980
0
    }
2981
2982
0
    if (!error) {
2983
0
        flow->offloaded = true;
2984
0
    } else {
2985
        /* If the flow was already offloaded, the new action set can no
2986
         * longer be offloaded.  In theory, we should disassociate the
2987
         * offload from all PMDs that have this flow marked as offloaded.
2988
         * Unfortunately, there is no mechanism to inform other PMDs, so
2989
         * we cannot explicitly mark such flows.  This situation typically
2990
         * occurs when the revalidator modifies the flow, so it is safe to
2991
         * assume it will update all affected flows and that the offload
2992
         * will subsequently fail. */
2993
0
        flow->offloaded = false;
2994
2995
        /* On error, the flow reference was not stored by the offload provider,
2996
         * so we should decrease the reference. */
2997
0
        dp_netdev_flow_unref(flow);
2998
0
    }
2999
3000
0
    if (offload_queue_dec(flow) && flow->dead) {
3001
        /* If flows are processed asynchronously, modifications might
3002
         * still be queued up while the flow is being removed.  If this
3003
         * was the last flow in the queue on a dead flow, we try again
3004
         * to see if we need to remove this flow. */
3005
0
        offload_flow_del(dp, pmd_id, flow);
3006
0
    }
3007
3008
0
    if (previous_flow_reference) {
3009
0
        dp_netdev_flow_unref(previous_flow_reference);
3010
0
        if (previous_flow_reference != flow) {
3011
0
            VLOG_DBG("Updated flow reference was from outdated flow");
3012
0
        }
3013
0
    }
3014
0
}
3015
3016
static void
3017
offload_flow_put_resume_cb(void *aux, struct dpif_flow_stats *stats OVS_UNUSED,
3018
                           unsigned pmd_id, void *flow_reference_,
3019
                           void *old_flow_reference_,
3020
                           int error)
3021
0
{
3022
0
    struct dp_netdev *dp = aux;
3023
0
    struct dp_netdev_flow *flow_reference = flow_reference_;
3024
0
    struct dp_netdev_flow *old_flow_reference = old_flow_reference_;
3025
3026
0
    offload_flow_put_resume(dp, flow_reference, old_flow_reference,
3027
0
                            pmd_id, error);
3028
0
}
3029
3030
static void
3031
offload_flow_put(struct dp_netdev_pmd_thread *pmd, struct dp_netdev_flow *flow,
3032
                 struct match *match, const struct nlattr *actions,
3033
                 size_t actions_len)
3034
0
{
3035
0
    struct dpif_offload_flow_put put = {
3036
0
        .in_port = match->flow.in_port.odp_port,
3037
0
        .orig_in_port = flow->orig_in_port,
3038
0
        .pmd_id = pmd->core_id,
3039
0
        .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid),
3040
0
        .match = match,
3041
0
        .actions = actions,
3042
0
        .actions_len = actions_len,
3043
0
        .stats = NULL,
3044
0
        .flow_reference = flow,
3045
0
        .cb_data = {
3046
0
            .callback = offload_flow_put_resume_cb,
3047
0
            .callback_aux = pmd->dp,
3048
0
        },
3049
0
    };
3050
0
    void *previous_flow_reference = NULL;
3051
0
    int error;
3052
3053
0
    if (!dpif_offload_enabled() || flow->dead || !offload_queue_inc(flow)) {
3054
0
        return;
3055
0
    }
3056
3057
0
    dp_netdev_flow_ref(flow);
3058
3059
0
    error = dpif_offload_datapath_flow_put(pmd->dp->full_name, &put,
3060
0
                                           &previous_flow_reference);
3061
0
    offload_flow_put_resume(pmd->dp, put.flow_reference,
3062
0
                            previous_flow_reference,
3063
0
                            pmd->core_id, error);
3064
0
}
3065
3066
static struct dp_netdev_flow *
3067
dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3068
                   struct match *match, const ovs_u128 *ufid,
3069
                   const struct nlattr *actions, size_t actions_len,
3070
                   odp_port_t orig_in_port)
3071
    OVS_REQUIRES(pmd->flow_mutex)
3072
0
{
3073
0
    struct ds extra_info = DS_EMPTY_INITIALIZER;
3074
0
    struct dp_netdev_flow *flow;
3075
0
    struct netdev_flow_key mask;
3076
0
    struct dpcls *cls;
3077
0
    size_t unit;
3078
3079
    /* Make sure in_port is exact matched before we read it. */
3080
0
    ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3081
0
    odp_port_t in_port = match->flow.in_port.odp_port;
3082
3083
    /* As we select the dpcls based on the port number, each netdev flow
3084
     * belonging to the same dpcls will have the same odp_port value.
3085
     * For performance reasons we wildcard odp_port here in the mask.  In the
3086
     * typical case dp_hash is also wildcarded, and the resulting 8-byte
3087
     * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3088
     * will not be part of the subtable mask.
3089
     * This will speed up the hash computation during dpcls_lookup() because
3090
     * there is one less call to hash_add64() in this case. */
3091
0
    match->wc.masks.in_port.odp_port = 0;
3092
0
    netdev_flow_mask_init(&mask, match);
3093
0
    match->wc.masks.in_port.odp_port = ODPP_NONE;
3094
3095
    /* Make sure wc does not have metadata. */
3096
0
    ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3097
0
               && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3098
3099
    /* Do not allocate extra space. */
3100
0
    flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3101
0
    memset(&flow->stats, 0, sizeof flow->stats);
3102
0
    flow->dead = false;
3103
0
    flow->offloaded = false;
3104
0
    atomic_init(&flow->offload_queue_depth, 0);
3105
0
    flow->batch = NULL;
3106
0
    flow->orig_in_port = orig_in_port;
3107
0
    *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3108
0
    *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3109
0
    *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3110
0
    ovs_refcount_init(&flow->ref_cnt);
3111
0
    ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3112
3113
0
    dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3114
0
    netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3115
3116
    /* Select dpcls for in_port. Relies on in_port to be exact match. */
3117
0
    cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3118
0
    dpcls_insert(cls, &flow->cr, &mask);
3119
3120
0
    ds_put_cstr(&extra_info, "miniflow_bits(");
3121
0
    FLOWMAP_FOR_EACH_UNIT (unit) {
3122
0
        if (unit) {
3123
0
            ds_put_char(&extra_info, ',');
3124
0
        }
3125
0
        ds_put_format(&extra_info, "%d",
3126
0
                      count_1bits(flow->cr.mask->mf.map.bits[unit]));
3127
0
    }
3128
0
    ds_put_char(&extra_info, ')');
3129
0
    flow->dp_extra_info = ds_steal_cstr(&extra_info);
3130
0
    ds_destroy(&extra_info);
3131
3132
0
    cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3133
0
                dp_netdev_flow_hash(&flow->ufid));
3134
0
    ccmap_inc(&pmd->n_flows, odp_to_u32(in_port));
3135
3136
0
    if (dp_netdev_flow_is_simple_match(match)) {
3137
0
        dp_netdev_simple_match_insert(pmd, flow);
3138
0
    }
3139
3140
0
    offload_flow_put(pmd, flow, match, actions, actions_len);
3141
0
    log_netdev_flow_change(flow, match, NULL, actions, actions_len);
3142
3143
0
    return flow;
3144
0
}
3145
3146
static int
3147
flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3148
                struct netdev_flow_key *key,
3149
                struct match *match,
3150
                ovs_u128 *ufid,
3151
                const struct dpif_flow_put *put,
3152
                struct dpif_flow_stats *stats)
3153
0
{
3154
0
    struct dp_netdev_flow *netdev_flow = NULL;
3155
0
    int error = 0;
3156
3157
0
    if (stats) {
3158
0
        memset(stats, 0, sizeof *stats);
3159
0
    }
3160
3161
0
    ovs_mutex_lock(&pmd->flow_mutex);
3162
0
    if (put->ufid) {
3163
0
        netdev_flow = dp_netdev_pmd_find_flow(pmd, put->ufid,
3164
0
                                              put->key, put->key_len);
3165
0
    } else {
3166
        /* Use key instead of the locally generated ufid
3167
         * to search netdev_flow. */
3168
0
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3169
0
    }
3170
3171
0
    if (put->flags & DPIF_FP_CREATE) {
3172
0
        if (!netdev_flow) {
3173
0
            dp_netdev_flow_add(pmd, match, ufid,
3174
0
                               put->actions, put->actions_len, ODPP_NONE);
3175
0
        } else {
3176
0
            error = EEXIST;
3177
0
        }
3178
0
        goto exit;
3179
0
    }
3180
3181
0
    if (put->flags & DPIF_FP_MODIFY) {
3182
0
        if (!netdev_flow) {
3183
0
            error = ENOENT;
3184
0
        } else {
3185
0
            if (!put->ufid && !flow_equal(&match->flow, &netdev_flow->flow)) {
3186
                /* Overlapping flow. */
3187
0
                error = EINVAL;
3188
0
                goto exit;
3189
0
            }
3190
3191
0
            struct dp_netdev_actions *new_actions;
3192
0
            struct dp_netdev_actions *old_actions;
3193
3194
0
            new_actions = dp_netdev_actions_create(put->actions,
3195
0
                                                   put->actions_len);
3196
3197
0
            old_actions = dp_netdev_flow_get_actions(netdev_flow);
3198
0
            ovsrcu_set(&netdev_flow->actions, new_actions);
3199
3200
0
            offload_flow_put(pmd, netdev_flow, match, put->actions,
3201
0
                             put->actions_len);
3202
0
            log_netdev_flow_change(netdev_flow, match, old_actions,
3203
0
                                   put->actions, put->actions_len);
3204
3205
0
            if (stats) {
3206
0
                get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3207
0
            }
3208
0
            if (put->flags & DPIF_FP_ZERO_STATS) {
3209
                /* XXX: The userspace datapath uses thread local statistics
3210
                 * (for flows), which should be updated only by the owning
3211
                 * thread.  Since we cannot write on stats memory here,
3212
                 * we choose not to support this flag.  Please note:
3213
                 * - This feature is currently used only by dpctl commands with
3214
                 *   option --clear.
3215
                 * - Should the need arise, this operation can be implemented
3216
                 *   by keeping a base value (to be update here) for each
3217
                 *   counter, and subtracting it before outputting the stats */
3218
0
                error = EOPNOTSUPP;
3219
0
            }
3220
0
            ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3221
0
        }
3222
0
    }
3223
3224
0
exit:
3225
0
    ovs_mutex_unlock(&pmd->flow_mutex);
3226
0
    return error;
3227
0
}
3228
3229
static int
3230
dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3231
0
{
3232
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3233
0
    struct netdev_flow_key key;
3234
0
    struct dp_netdev_pmd_thread *pmd;
3235
0
    struct match match;
3236
0
    ovs_u128 ufid;
3237
0
    int error;
3238
0
    bool probe = put->flags & DPIF_FP_PROBE;
3239
3240
0
    if (put->stats) {
3241
0
        memset(put->stats, 0, sizeof *put->stats);
3242
0
    }
3243
0
    error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3244
0
                                          probe);
3245
0
    if (error) {
3246
0
        return error;
3247
0
    }
3248
0
    error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3249
0
                                          put->mask, put->mask_len,
3250
0
                                          &match.flow, &match.wc, probe);
3251
0
    if (error) {
3252
0
        return error;
3253
0
    }
3254
3255
0
    if (match.wc.masks.in_port.odp_port != ODPP_NONE) {
3256
0
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3257
3258
0
        VLOG_ERR_RL(&rl, "failed to put%s flow: in_port is not an exact match",
3259
0
                    (put->flags & DPIF_FP_CREATE) ? "[create]"
3260
0
                    : (put->flags & DPIF_FP_MODIFY) ? "[modify]" : "[zero]");
3261
0
        return EINVAL;
3262
0
    }
3263
3264
0
    if (put->ufid) {
3265
0
        ufid = *put->ufid;
3266
0
    } else {
3267
0
        odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
3268
0
    }
3269
3270
    /* The Netlink encoding of datapath flow keys cannot express
3271
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3272
     * tag is interpreted as exact match on the fact that there is no
3273
     * VLAN.  Unless we refactor a lot of code that translates between
3274
     * Netlink and struct flow representations, we have to do the same
3275
     * here.  This must be in sync with 'match' in handle_packet_upcall(). */
3276
0
    if (!match.wc.masks.vlans[0].tci) {
3277
0
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
3278
0
    }
3279
3280
    /* Must produce a netdev_flow_key for lookup.
3281
     * Use the same method as employed to create the key when adding
3282
     * the flow to the dplcs to make sure they match.
3283
     * We need to put in the unmasked key as flow_put_on_pmd() will first try
3284
     * to see if an entry exists doing a packet type lookup. As masked-out
3285
     * fields are interpreted as zeros, they could falsely match a wider IP
3286
     * address mask. Installation of the flow will use the match variable. */
3287
0
    netdev_flow_key_init(&key, &match.flow);
3288
3289
0
    if (put->pmd_id == PMD_ID_NULL) {
3290
0
        if (cmap_count(&dp->poll_threads) == 0) {
3291
0
            return EINVAL;
3292
0
        }
3293
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3294
0
            struct dpif_flow_stats pmd_stats;
3295
0
            int pmd_error;
3296
3297
0
            pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3298
0
                                        &pmd_stats);
3299
0
            if (pmd_error) {
3300
0
                error = pmd_error;
3301
0
            } else if (put->stats) {
3302
0
                put->stats->n_packets += pmd_stats.n_packets;
3303
0
                put->stats->n_bytes += pmd_stats.n_bytes;
3304
0
                put->stats->used = MAX(put->stats->used, pmd_stats.used);
3305
0
                put->stats->tcp_flags |= pmd_stats.tcp_flags;
3306
0
            }
3307
0
        }
3308
0
    } else {
3309
0
        pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3310
0
        if (!pmd) {
3311
0
            return EINVAL;
3312
0
        }
3313
0
        error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3314
0
        dp_netdev_pmd_unref(pmd);
3315
0
    }
3316
3317
0
    return error;
3318
0
}
3319
3320
static int
3321
flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3322
                struct dpif_flow_stats *stats,
3323
                const struct dpif_flow_del *del)
3324
0
{
3325
0
    struct dp_netdev_flow *netdev_flow;
3326
0
    int error = 0;
3327
3328
0
    ovs_mutex_lock(&pmd->flow_mutex);
3329
0
    netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3330
0
                                          del->key_len);
3331
0
    if (netdev_flow) {
3332
0
        if (stats) {
3333
0
            get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3334
0
        }
3335
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3336
0
    } else {
3337
0
        error = ENOENT;
3338
0
    }
3339
0
    ovs_mutex_unlock(&pmd->flow_mutex);
3340
3341
0
    return error;
3342
0
}
3343
3344
static int
3345
dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3346
0
{
3347
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3348
0
    struct dp_netdev_pmd_thread *pmd;
3349
0
    int error = 0;
3350
3351
0
    if (del->stats) {
3352
0
        memset(del->stats, 0, sizeof *del->stats);
3353
0
    }
3354
3355
0
    if (del->pmd_id == PMD_ID_NULL) {
3356
0
        if (cmap_count(&dp->poll_threads) == 0) {
3357
0
            return EINVAL;
3358
0
        }
3359
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3360
0
            struct dpif_flow_stats pmd_stats;
3361
0
            int pmd_error;
3362
3363
0
            pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3364
0
            if (pmd_error) {
3365
0
                error = pmd_error;
3366
0
            } else if (del->stats) {
3367
0
                del->stats->n_packets += pmd_stats.n_packets;
3368
0
                del->stats->n_bytes += pmd_stats.n_bytes;
3369
0
                del->stats->used = MAX(del->stats->used, pmd_stats.used);
3370
0
                del->stats->tcp_flags |= pmd_stats.tcp_flags;
3371
0
            }
3372
0
        }
3373
0
    } else {
3374
0
        pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3375
0
        if (!pmd) {
3376
0
            return EINVAL;
3377
0
        }
3378
0
        error = flow_del_on_pmd(pmd, del->stats, del);
3379
0
        dp_netdev_pmd_unref(pmd);
3380
0
    }
3381
3382
3383
0
    return error;
3384
0
}
3385
3386
struct dpif_netdev_flow_dump {
3387
    struct dpif_flow_dump up;
3388
    struct cmap_position poll_thread_pos;
3389
    struct cmap_position flow_pos;
3390
    struct dp_netdev_pmd_thread *cur_pmd;
3391
    int status;
3392
    struct ovs_mutex mutex;
3393
};
3394
3395
static struct dpif_netdev_flow_dump *
3396
dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3397
0
{
3398
0
    return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3399
0
}
3400
3401
static struct dpif_flow_dump *
3402
dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3403
                             struct dpif_flow_dump_types *types)
3404
0
{
3405
0
    struct dpif_netdev_flow_dump *dump;
3406
3407
0
    dump = xzalloc(sizeof *dump);
3408
0
    dpif_flow_dump_init(&dump->up, dpif_, terse, types);
3409
0
    ovs_mutex_init(&dump->mutex);
3410
3411
0
    return &dump->up;
3412
0
}
3413
3414
static int
3415
dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3416
0
{
3417
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3418
3419
0
    ovs_mutex_destroy(&dump->mutex);
3420
0
    free(dump);
3421
0
    return 0;
3422
0
}
3423
3424
struct dpif_netdev_flow_dump_thread {
3425
    struct dpif_flow_dump_thread up;
3426
    struct dpif_netdev_flow_dump *dump;
3427
    struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3428
    struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3429
};
3430
3431
static struct dpif_netdev_flow_dump_thread *
3432
dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3433
0
{
3434
0
    return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3435
0
}
3436
3437
static struct dpif_flow_dump_thread *
3438
dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3439
0
{
3440
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3441
0
    struct dpif_netdev_flow_dump_thread *thread;
3442
3443
0
    thread = xmalloc(sizeof *thread);
3444
0
    dpif_flow_dump_thread_init(&thread->up, &dump->up);
3445
0
    thread->dump = dump;
3446
0
    return &thread->up;
3447
0
}
3448
3449
/* Frees the per-thread dump state that embeds 'thread_'. */
static void
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
{
    free(dpif_netdev_flow_dump_thread_cast(thread_));
}
3457
3458
static int
3459
dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
3460
                           struct dpif_flow *flows, int max_flows)
3461
0
{
3462
0
    struct dpif_netdev_flow_dump_thread *thread
3463
0
        = dpif_netdev_flow_dump_thread_cast(thread_);
3464
0
    struct dpif_netdev_flow_dump *dump = thread->dump;
3465
0
    struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
3466
0
    struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dump->dpif);
3467
0
    struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3468
0
    int n_flows = 0;
3469
0
    int i;
3470
3471
0
    ovs_mutex_lock(&dump->mutex);
3472
0
    if (!dump->status) {
3473
0
        struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3474
0
        int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3475
3476
        /* First call to dump_next(), extracts the first pmd thread.
3477
         * If there is no pmd thread, returns immediately. */
3478
0
        if (!pmd) {
3479
0
            pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3480
0
            if (!pmd) {
3481
0
                ovs_mutex_unlock(&dump->mutex);
3482
0
                return n_flows;
3483
3484
0
            }
3485
0
        }
3486
3487
0
        do {
3488
0
            for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3489
0
                struct cmap_node *node;
3490
3491
0
                node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3492
0
                if (!node) {
3493
0
                    break;
3494
0
                }
3495
0
                netdev_flows[n_flows] = CONTAINER_OF(node,
3496
0
                                                     struct dp_netdev_flow,
3497
0
                                                     node);
3498
0
            }
3499
            /* When finishing dumping the current pmd thread, moves to
3500
             * the next. */
3501
0
            if (n_flows < flow_limit) {
3502
0
                memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3503
0
                dp_netdev_pmd_unref(pmd);
3504
0
                pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3505
0
                if (!pmd) {
3506
0
                    dump->status = EOF;
3507
0
                    break;
3508
0
                }
3509
0
            }
3510
            /* Keeps the reference to next caller. */
3511
0
            dump->cur_pmd = pmd;
3512
3513
            /* If the current dump is empty, do not exit the loop, since the
3514
             * remaining pmds could have flows to be dumped.  Just dumps again
3515
             * on the new 'pmd'. */
3516
0
        } while (!n_flows);
3517
0
    }
3518
0
    ovs_mutex_unlock(&dump->mutex);
3519
3520
0
    for (i = 0; i < n_flows; i++) {
3521
0
        struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3522
0
        struct odputil_keybuf *keybuf = &thread->keybuf[i];
3523
0
        struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3524
0
        struct dpif_flow *f = &flows[i];
3525
0
        struct ofpbuf key, mask;
3526
3527
0
        ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3528
0
        ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
3529
0
        dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
3530
0
                                    dump->up.terse);
3531
0
    }
3532
3533
0
    return n_flows;
3534
0
}
3535
3536
static int
3537
dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
3538
    OVS_NO_THREAD_SAFETY_ANALYSIS
3539
0
{
3540
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3541
0
    struct dp_netdev_pmd_thread *pmd;
3542
0
    struct dp_packet_batch pp;
3543
3544
0
    if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3545
0
        dp_packet_size(execute->packet) > UINT16_MAX) {
3546
0
        return EINVAL;
3547
0
    }
3548
3549
    /* Tries finding the 'pmd'.  If NULL is returned, that means
3550
     * the current thread is a non-pmd thread and should use
3551
     * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
3552
0
    pmd = ovsthread_getspecific(dp->per_pmd_key);
3553
0
    if (!pmd) {
3554
0
        pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3555
0
        if (!pmd) {
3556
0
            return EBUSY;
3557
0
        }
3558
0
    }
3559
3560
0
    if (execute->probe) {
3561
        /* If this is part of a probe, Drop the packet, since executing
3562
         * the action may actually cause spurious packets be sent into
3563
         * the network. */
3564
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
3565
0
            dp_netdev_pmd_unref(pmd);
3566
0
        }
3567
0
        return 0;
3568
0
    }
3569
3570
    /* If the current thread is non-pmd thread, acquires
3571
     * the 'non_pmd_mutex'. */
3572
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
3573
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
3574
0
    }
3575
3576
    /* Update current time in PMD context. We don't care about EMC insertion
3577
     * probability, because we are on a slow path. */
3578
0
    pmd_thread_ctx_time_update(pmd);
3579
3580
    /* The action processing expects the RSS hash to be valid, because
3581
     * it's always initialized at the beginning of datapath processing.
3582
     * In this case, though, 'execute->packet' may not have gone through
3583
     * the datapath at all, it may have been generated by the upper layer
3584
     * (OpenFlow packet-out, BFD frame, ...). */
3585
0
    if (!dp_packet_rss_valid(execute->packet)) {
3586
0
        dp_packet_set_rss_hash(execute->packet,
3587
0
                               flow_hash_5tuple(execute->flow, 0));
3588
0
    }
3589
3590
    /* Making a copy because the packet might be stolen during the execution
3591
     * and caller might still need it.  */
3592
0
    struct dp_packet *packet_clone = dp_packet_clone(execute->packet);
3593
0
    dp_packet_batch_init_packet(&pp, packet_clone);
3594
0
    dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
3595
0
                              execute->actions, execute->actions_len);
3596
0
    dp_netdev_pmd_flush_output_packets(pmd, true);
3597
3598
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
3599
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
3600
0
        dp_netdev_pmd_unref(pmd);
3601
0
    }
3602
3603
0
    if (dp_packet_batch_size(&pp) == 1) {
3604
        /* Packet wasn't dropped during the execution.  Swapping content with
3605
         * the original packet, because the caller might expect actions to
3606
         * modify it.  Uisng the packet from a batch instead of 'packet_clone'
3607
         * because it maybe stolen and replaced by other packet, e.g. by
3608
         * the fragmentation engine. */
3609
0
        dp_packet_swap(execute->packet, pp.packets[0]);
3610
0
        dp_packet_delete_batch(&pp, true);
3611
0
    } else if (dp_packet_batch_size(&pp)) {
3612
        /* FIXME: We have more packets than expected.  Likely, we got IP
3613
         * fragments of the reassembled packet.  Dropping them here as we have
3614
         * no way to get them to the caller.  It might be that all the required
3615
         * actions with them are already executed, but it also might not be a
3616
         * case, e.g. if dpif_netdev_execute() called to execute a single
3617
         * tunnel push. */
3618
0
        dp_packet_delete_batch(&pp, true);
3619
0
    }
3620
3621
0
    return 0;
3622
0
}
3623
3624
static void
3625
dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
3626
0
{
3627
0
    size_t i;
3628
3629
0
    for (i = 0; i < n_ops; i++) {
3630
0
        struct dpif_op *op = ops[i];
3631
3632
0
        switch (op->type) {
3633
0
        case DPIF_OP_FLOW_PUT:
3634
0
            op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
3635
0
            break;
3636
3637
0
        case DPIF_OP_FLOW_DEL:
3638
0
            op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
3639
0
            break;
3640
3641
0
        case DPIF_OP_EXECUTE:
3642
0
            op->error = dpif_netdev_execute(dpif, &op->execute);
3643
0
            break;
3644
3645
0
        case DPIF_OP_FLOW_GET:
3646
0
            op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
3647
0
            break;
3648
0
        }
3649
0
    }
3650
0
}
3651
3652
/* Enable or Disable PMD auto load balancing. */
3653
static void
3654
set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log)
3655
0
{
3656
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3657
3658
0
    if (pmd_alb->is_enabled != state || always_log) {
3659
0
        pmd_alb->is_enabled = state;
3660
0
        if (pmd_alb->is_enabled) {
3661
0
            uint8_t rebalance_load_thresh;
3662
3663
0
            atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
3664
0
                                &rebalance_load_thresh);
3665
0
            VLOG_INFO("PMD auto load balance is enabled, "
3666
0
                      "interval %"PRIu64" mins, "
3667
0
                      "pmd load threshold %"PRIu8"%%, "
3668
0
                      "improvement threshold %"PRIu8"%%.",
3669
0
                       pmd_alb->rebalance_intvl / MIN_TO_MSEC,
3670
0
                       rebalance_load_thresh,
3671
0
                       pmd_alb->rebalance_improve_thresh);
3672
0
        } else {
3673
0
            pmd_alb->rebalance_poll_timer = 0;
3674
0
            VLOG_INFO("PMD auto load balance is disabled.");
3675
0
        }
3676
0
    }
3677
0
}
3678
3679
static int
3680
parse_pmd_sleep_list(const char *max_sleep_list,
3681
                     struct pmd_sleep **pmd_sleeps)
3682
0
{
3683
0
    char *list, *copy, *key, *value;
3684
0
    int num_vals = 0;
3685
3686
0
    if (!max_sleep_list) {
3687
0
        return num_vals;
3688
0
    }
3689
3690
0
    list = copy = xstrdup(max_sleep_list);
3691
3692
0
    while (ofputil_parse_key_value(&list, &key, &value)) {
3693
0
        uint64_t temp, pmd_max_sleep;
3694
0
        char *error = NULL;
3695
0
        unsigned core;
3696
0
        int i;
3697
3698
0
        error = str_to_u64(key, &temp);
3699
0
        if (error) {
3700
0
            free(error);
3701
0
            continue;
3702
0
        }
3703
3704
0
        if (value[0] == '\0') {
3705
            /* No value specified. key is dp default. */
3706
0
            core = UINT_MAX;
3707
0
            pmd_max_sleep = temp;
3708
0
        } else {
3709
0
            error = str_to_u64(value, &pmd_max_sleep);
3710
0
            if (!error && temp < UINT_MAX) {
3711
                /* Key is pmd core id. */
3712
0
                core = (unsigned) temp;
3713
0
            } else {
3714
0
                free(error);
3715
0
                continue;
3716
0
            }
3717
0
        }
3718
3719
        /* Detect duplicate max sleep values. */
3720
0
        for (i = 0; i < num_vals; i++) {
3721
0
            if ((*pmd_sleeps)[i].core_id == core) {
3722
0
                break;
3723
0
            }
3724
0
        }
3725
0
        if (i == num_vals) {
3726
            /* Not duplicate, add a new entry. */
3727
0
            *pmd_sleeps = xrealloc(*pmd_sleeps,
3728
0
                                   (num_vals + 1) * sizeof **pmd_sleeps);
3729
0
            num_vals++;
3730
0
        }
3731
3732
0
        pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep);
3733
3734
0
        (*pmd_sleeps)[i].core_id = core;
3735
0
        (*pmd_sleeps)[i].max_sleep = pmd_max_sleep;
3736
0
    }
3737
3738
0
    free(copy);
3739
0
    return num_vals;
3740
0
}
3741
3742
static void
3743
log_pmd_sleep(unsigned core_id, int numa_id, uint64_t pmd_max_sleep)
3744
0
{
3745
0
    if (core_id == NON_PMD_CORE_ID) {
3746
0
        return;
3747
0
    }
3748
0
    VLOG_INFO("PMD thread on numa_id: %d, core id: %2d, "
3749
0
              "max sleep: %4"PRIu64" us.", numa_id, core_id, pmd_max_sleep);
3750
0
}
3751
3752
static void
3753
pmd_init_max_sleep(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
3754
0
{
3755
0
    uint64_t max_sleep = dp->pmd_max_sleep_default;
3756
0
    struct pmd_sleep *pmd_sleeps = NULL;
3757
0
    int num_vals;
3758
3759
0
    num_vals = parse_pmd_sleep_list(dp->max_sleep_list, &pmd_sleeps);
3760
3761
    /* Check if the user has set a specific value for this pmd. */
3762
0
    for (int i = 0; i < num_vals; i++) {
3763
0
        if (pmd_sleeps[i].core_id == pmd->core_id) {
3764
0
            max_sleep = pmd_sleeps[i].max_sleep;
3765
0
            break;
3766
0
        }
3767
0
    }
3768
0
    atomic_init(&pmd->max_sleep, max_sleep);
3769
0
    log_pmd_sleep(pmd->core_id, pmd->numa_id, max_sleep);
3770
0
    free(pmd_sleeps);
3771
0
}
3772
3773
static bool
3774
assign_sleep_values_to_pmds(struct dp_netdev *dp, int num_vals,
3775
                            struct pmd_sleep *pmd_sleeps)
3776
0
{
3777
0
    struct dp_netdev_pmd_thread *pmd;
3778
0
    bool value_changed = false;
3779
3780
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3781
0
        uint64_t new_max_sleep, cur_pmd_max_sleep;
3782
3783
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
3784
0
            continue;
3785
0
        }
3786
3787
        /* Default to global value. */
3788
0
        new_max_sleep = dp->pmd_max_sleep_default;
3789
3790
        /* Check for pmd specific value. */
3791
0
        for (int i = 0;  i < num_vals; i++) {
3792
0
            if (pmd->core_id == pmd_sleeps[i].core_id) {
3793
0
                new_max_sleep = pmd_sleeps[i].max_sleep;
3794
0
                break;
3795
0
            }
3796
0
        }
3797
0
        atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
3798
0
        if (new_max_sleep != cur_pmd_max_sleep) {
3799
0
            atomic_store_relaxed(&pmd->max_sleep, new_max_sleep);
3800
0
            value_changed = true;
3801
0
        }
3802
0
    }
3803
0
    return value_changed;
3804
0
}
3805
3806
static void
3807
log_all_pmd_sleeps(struct dp_netdev *dp)
3808
0
{
3809
0
    struct dp_netdev_pmd_thread **pmd_list = NULL;
3810
0
    struct dp_netdev_pmd_thread *pmd;
3811
0
    size_t n;
3812
3813
0
    VLOG_INFO("Default PMD thread max sleep: %4"PRIu64" us.",
3814
0
              dp->pmd_max_sleep_default);
3815
3816
0
    sorted_poll_thread_list(dp, &pmd_list, &n);
3817
3818
0
    for (size_t i = 0; i < n; i++) {
3819
0
        uint64_t cur_pmd_max_sleep;
3820
3821
0
        pmd = pmd_list[i];
3822
0
        atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
3823
0
        log_pmd_sleep(pmd->core_id, pmd->numa_id, cur_pmd_max_sleep);
3824
0
    }
3825
0
    free(pmd_list);
3826
0
}
3827
3828
static bool
3829
set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config)
3830
0
{
3831
0
    const char *max_sleep_list = smap_get(config, "pmd-sleep-max");
3832
0
    struct pmd_sleep *pmd_sleeps = NULL;
3833
0
    uint64_t default_max_sleep = 0;
3834
0
    bool default_changed = false;
3835
0
    bool pmd_changed = false;
3836
0
    uint64_t pmd_maxsleep;
3837
0
    int num_vals = 0;
3838
3839
    /* Check for deprecated 'pmd-maxsleep' value. */
3840
0
    pmd_maxsleep = smap_get_ullong(config, "pmd-maxsleep", UINT64_MAX);
3841
0
    if (pmd_maxsleep != UINT64_MAX && !max_sleep_list) {
3842
0
        VLOG_WARN_ONCE("pmd-maxsleep is deprecated. "
3843
0
                       "Please use pmd-sleep-max instead.");
3844
0
        default_max_sleep = pmd_maxsleep;
3845
0
    }
3846
3847
    /* Check if there is no change in string or value. */
3848
0
    if (!!dp->max_sleep_list == !!max_sleep_list) {
3849
0
        if (max_sleep_list
3850
0
            ? nullable_string_is_equal(max_sleep_list, dp->max_sleep_list)
3851
0
            : default_max_sleep == dp->pmd_max_sleep_default) {
3852
0
            return false;
3853
0
        }
3854
0
    }
3855
3856
    /* Free existing string and copy new one (if any). */
3857
0
    free(dp->max_sleep_list);
3858
0
    dp->max_sleep_list = nullable_xstrdup(max_sleep_list);
3859
3860
0
    if (max_sleep_list) {
3861
0
        num_vals = parse_pmd_sleep_list(max_sleep_list, &pmd_sleeps);
3862
3863
        /* Check if the user has set a global value. */
3864
0
        for (int i = 0; i < num_vals; i++) {
3865
0
            if (pmd_sleeps[i].core_id == UINT_MAX) {
3866
0
                default_max_sleep = pmd_sleeps[i].max_sleep;
3867
0
                break;
3868
0
            }
3869
0
        }
3870
0
    }
3871
3872
0
    if (dp->pmd_max_sleep_default != default_max_sleep) {
3873
0
        dp->pmd_max_sleep_default = default_max_sleep;
3874
0
        default_changed = true;
3875
0
    }
3876
0
    pmd_changed = assign_sleep_values_to_pmds(dp, num_vals, pmd_sleeps);
3877
3878
0
    free(pmd_sleeps);
3879
0
    return default_changed || pmd_changed;
3880
0
}
3881
3882
/* Applies datapath configuration from the database. Some of the changes are
3883
 * actually applied in dpif_netdev_run(). */
3884
static int
3885
dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
3886
0
{
3887
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3888
0
    const char *cmask = smap_get(other_config, "pmd-cpu-mask");
3889
0
    const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
3890
0
                                             "cycles");
3891
0
    unsigned long long insert_prob =
3892
0
        smap_get_ullong(other_config, "emc-insert-inv-prob",
3893
0
                        DEFAULT_EM_FLOW_INSERT_INV_PROB);
3894
0
    uint32_t insert_min, cur_min;
3895
0
    uint32_t tx_flush_interval, cur_tx_flush_interval;
3896
0
    uint64_t rebalance_intvl;
3897
0
    uint8_t cur_rebalance_load;
3898
0
    uint32_t rebalance_load, rebalance_improve;
3899
0
    bool log_autolb = false;
3900
0
    enum sched_assignment_type pmd_rxq_assign_type;
3901
3902
0
    tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
3903
0
                                     DEFAULT_TX_FLUSH_INTERVAL);
3904
0
    atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
3905
0
    if (tx_flush_interval != cur_tx_flush_interval) {
3906
0
        atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
3907
0
        VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
3908
0
                  tx_flush_interval);
3909
0
    }
3910
3911
0
    if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3912
0
        free(dp->pmd_cmask);
3913
0
        dp->pmd_cmask = nullable_xstrdup(cmask);
3914
0
        dp_netdev_request_reconfigure(dp);
3915
0
    }
3916
3917
0
    atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
3918
0
    if (insert_prob <= UINT32_MAX) {
3919
0
        insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
3920
0
    } else {
3921
0
        insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
3922
0
        insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
3923
0
    }
3924
3925
0
    if (insert_min != cur_min) {
3926
0
        atomic_store_relaxed(&dp->emc_insert_min, insert_min);
3927
0
        if (insert_min == 0) {
3928
0
            VLOG_INFO("EMC insertion probability changed to zero");
3929
0
        } else {
3930
0
            VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
3931
0
                      insert_prob, (100 / (float)insert_prob));
3932
0
        }
3933
0
    }
3934
3935
0
    bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
3936
0
    bool cur_perf_enabled;
3937
0
    atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
3938
0
    if (perf_enabled != cur_perf_enabled) {
3939
0
        atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
3940
0
        if (perf_enabled) {
3941
0
            VLOG_INFO("PMD performance metrics collection enabled");
3942
0
        } else {
3943
0
            VLOG_INFO("PMD performance metrics collection disabled");
3944
0
        }
3945
0
    }
3946
3947
0
    bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
3948
0
    bool cur_smc;
3949
0
    atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
3950
0
    if (smc_enable != cur_smc) {
3951
0
        atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
3952
0
        if (smc_enable) {
3953
0
            VLOG_INFO("SMC cache is enabled");
3954
0
        } else {
3955
0
            VLOG_INFO("SMC cache is disabled");
3956
0
        }
3957
0
    }
3958
3959
0
    if (!strcmp(pmd_rxq_assign, "roundrobin")) {
3960
0
        pmd_rxq_assign_type = SCHED_ROUNDROBIN;
3961
0
    } else if (!strcmp(pmd_rxq_assign, "cycles")) {
3962
0
        pmd_rxq_assign_type = SCHED_CYCLES;
3963
0
    } else if (!strcmp(pmd_rxq_assign, "group")) {
3964
0
        pmd_rxq_assign_type = SCHED_GROUP;
3965
0
    } else {
3966
        /* Default. */
3967
0
        VLOG_WARN("Unsupported rx queue to PMD assignment mode in "
3968
0
                  "pmd-rxq-assign. Defaulting to 'cycles'.");
3969
0
        pmd_rxq_assign_type = SCHED_CYCLES;
3970
0
        pmd_rxq_assign = "cycles";
3971
0
    }
3972
0
    if (dp->pmd_rxq_assign_type != pmd_rxq_assign_type) {
3973
0
        dp->pmd_rxq_assign_type = pmd_rxq_assign_type;
3974
0
        VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
3975
0
                  pmd_rxq_assign);
3976
0
        dp_netdev_request_reconfigure(dp);
3977
0
    }
3978
3979
0
    bool pmd_iso = smap_get_bool(other_config, "pmd-rxq-isolate", true);
3980
3981
0
    if (pmd_rxq_assign_type != SCHED_GROUP && pmd_iso == false) {
3982
        /* Invalid combination. */
3983
0
        VLOG_WARN("pmd-rxq-isolate can only be set false "
3984
0
                  "when using pmd-rxq-assign=group");
3985
0
        pmd_iso = true;
3986
0
    }
3987
0
    if (dp->pmd_iso != pmd_iso) {
3988
0
        dp->pmd_iso = pmd_iso;
3989
0
        if (pmd_iso) {
3990
0
            VLOG_INFO("pmd-rxq-affinity isolates PMD core");
3991
0
        } else {
3992
0
            VLOG_INFO("pmd-rxq-affinity does not isolate PMD core");
3993
0
        }
3994
0
        dp_netdev_request_reconfigure(dp);
3995
0
    }
3996
3997
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3998
3999
0
    rebalance_intvl = smap_get_ullong(other_config,
4000
0
                                      "pmd-auto-lb-rebal-interval",
4001
0
                                      ALB_REBALANCE_INTERVAL);
4002
0
    if (rebalance_intvl > MAX_ALB_REBALANCE_INTERVAL) {
4003
0
        rebalance_intvl = ALB_REBALANCE_INTERVAL;
4004
0
    }
4005
4006
    /* Input is in min, convert it to msec. */
4007
0
    rebalance_intvl =
4008
0
        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
4009
4010
0
    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
4011
0
        pmd_alb->rebalance_intvl = rebalance_intvl;
4012
0
        VLOG_INFO("PMD auto load balance interval set to "
4013
0
                  "%"PRIu64" mins\n", rebalance_intvl / MIN_TO_MSEC);
4014
0
        log_autolb = true;
4015
0
    }
4016
4017
0
    rebalance_improve = smap_get_uint(other_config,
4018
0
                                      "pmd-auto-lb-improvement-threshold",
4019
0
                                      ALB_IMPROVEMENT_THRESHOLD);
4020
0
    if (rebalance_improve > 100) {
4021
0
        rebalance_improve = ALB_IMPROVEMENT_THRESHOLD;
4022
0
    }
4023
0
    if (rebalance_improve != pmd_alb->rebalance_improve_thresh) {
4024
0
        pmd_alb->rebalance_improve_thresh = rebalance_improve;
4025
0
        VLOG_INFO("PMD auto load balance improvement threshold set to "
4026
0
                  "%"PRIu32"%%", rebalance_improve);
4027
0
        log_autolb = true;
4028
0
    }
4029
4030
0
    rebalance_load = smap_get_uint(other_config, "pmd-auto-lb-load-threshold",
4031
0
                                   ALB_LOAD_THRESHOLD);
4032
0
    if (rebalance_load > 100) {
4033
0
        rebalance_load = ALB_LOAD_THRESHOLD;
4034
0
    }
4035
0
    atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, &cur_rebalance_load);
4036
0
    if (rebalance_load != cur_rebalance_load) {
4037
0
        atomic_store_relaxed(&pmd_alb->rebalance_load_thresh,
4038
0
                             rebalance_load);
4039
0
        VLOG_INFO("PMD auto load balance load threshold set to %"PRIu32"%%",
4040
0
                  rebalance_load);
4041
0
        log_autolb = true;
4042
0
    }
4043
4044
0
    bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false);
4045
4046
0
    set_pmd_auto_lb(dp, autolb_state, log_autolb);
4047
4048
0
    bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config);
4049
4050
0
    if (ovsthread_once_start(&dp->once_set_config)) {
4051
0
        log_all_pmd_sleeps(dp);
4052
0
        dpif_offload_datapath_register_flow_unreference_cb(
4053
0
            dpif, offload_flow_reference_unreference_cb);
4054
4055
0
        ovsthread_once_done(&dp->once_set_config);
4056
0
    } else if (sleep_changed) {
4057
0
        log_all_pmd_sleeps(dp);
4058
0
    }
4059
4060
0
    return 0;
4061
0
}
4062
4063
static bool
4064
dpif_netdev_number_handlers_required(struct dpif *dpif_ OVS_UNUSED,
4065
                                     uint32_t *n_handlers)
4066
0
{
4067
0
    *n_handlers = 0;
4068
0
    return true;
4069
0
}
4070
4071
/* Parses affinity list and returns result in 'core_ids'. */
4072
static int
4073
parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
4074
0
{
4075
0
    unsigned i;
4076
0
    char *list, *copy, *key, *value;
4077
0
    int error = 0;
4078
4079
0
    for (i = 0; i < n_rxq; i++) {
4080
0
        core_ids[i] = OVS_CORE_UNSPEC;
4081
0
    }
4082
4083
0
    if (!affinity_list) {
4084
0
        return 0;
4085
0
    }
4086
4087
0
    list = copy = xstrdup(affinity_list);
4088
4089
0
    while (ofputil_parse_key_value(&list, &key, &value)) {
4090
0
        int rxq_id, core_id;
4091
4092
0
        if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
4093
0
            || !str_to_int(value, 0, &core_id) || core_id < 0) {
4094
0
            error = EINVAL;
4095
0
            break;
4096
0
        }
4097
4098
0
        if (rxq_id < n_rxq) {
4099
0
            core_ids[rxq_id] = core_id;
4100
0
        }
4101
0
    }
4102
4103
0
    free(copy);
4104
0
    return error;
4105
0
}
4106
4107
/* Parses 'affinity_list' and applies configuration if it is valid. */
4108
static int
4109
dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
4110
                                  const char *affinity_list)
4111
0
{
4112
0
    unsigned *core_ids, i;
4113
0
    int error = 0;
4114
4115
0
    core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4116
0
    if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4117
0
        error = EINVAL;
4118
0
        goto exit;
4119
0
    }
4120
4121
0
    for (i = 0; i < port->n_rxq; i++) {
4122
0
        port->rxqs[i].core_id = core_ids[i];
4123
0
    }
4124
4125
0
exit:
4126
0
    free(core_ids);
4127
0
    return error;
4128
0
}
4129
4130
/* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4131
 * of given PMD thread. */
4132
static bool
4133
dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4134
                           struct dp_netdev_port *port)
4135
    OVS_EXCLUDED(pmd->port_mutex)
4136
0
{
4137
0
    struct rxq_poll *poll;
4138
0
    bool found = false;
4139
4140
0
    ovs_mutex_lock(&pmd->port_mutex);
4141
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4142
0
        if (port == poll->rxq->port) {
4143
0
            found = true;
4144
0
            break;
4145
0
        }
4146
0
    }
4147
0
    ovs_mutex_unlock(&pmd->port_mutex);
4148
0
    return found;
4149
0
}
4150
4151
/* Updates port configuration from the database.  The changes are actually
4152
 * applied in dpif_netdev_run(). */
4153
static int
4154
dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4155
                            const struct smap *cfg)
4156
0
{
4157
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4158
0
    struct dp_netdev_port *port;
4159
0
    int error = 0;
4160
0
    const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
4161
0
    bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
4162
0
    const char *tx_steering_mode = smap_get(cfg, "tx-steering");
4163
0
    enum txq_req_mode txq_mode;
4164
4165
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
4166
0
    error = get_port_by_number(dp, port_no, &port);
4167
0
    if (error) {
4168
0
        goto unlock;
4169
0
    }
4170
4171
0
    if (emc_enabled != port->emc_enabled) {
4172
0
        struct dp_netdev_pmd_thread *pmd;
4173
0
        struct ds ds = DS_EMPTY_INITIALIZER;
4174
0
        uint32_t cur_min, insert_prob;
4175
4176
0
        port->emc_enabled = emc_enabled;
4177
        /* Mark for reload all the threads that polls this port and request
4178
         * for reconfiguration for the actual reloading of threads. */
4179
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4180
0
            if (dpif_netdev_pmd_polls_port(pmd, port)) {
4181
0
                pmd->need_reload = true;
4182
0
            }
4183
0
        }
4184
0
        dp_netdev_request_reconfigure(dp);
4185
4186
0
        ds_put_format(&ds, "%s: EMC has been %s.",
4187
0
                      netdev_get_name(port->netdev),
4188
0
                      (emc_enabled) ? "enabled" : "disabled");
4189
0
        if (emc_enabled) {
4190
0
            ds_put_cstr(&ds, " Current insertion probability is ");
4191
0
            atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4192
0
            if (!cur_min) {
4193
0
                ds_put_cstr(&ds, "zero.");
4194
0
            } else {
4195
0
                insert_prob = UINT32_MAX / cur_min;
4196
0
                ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4197
0
                              insert_prob, 100 / (float) insert_prob);
4198
0
            }
4199
0
        }
4200
0
        VLOG_INFO("%s", ds_cstr(&ds));
4201
0
        ds_destroy(&ds);
4202
0
    }
4203
4204
    /* Checking for RXq affinity changes. */
4205
0
    if (netdev_is_pmd(port->netdev)
4206
0
        && !nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4207
4208
0
        error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4209
0
        if (error) {
4210
0
            goto unlock;
4211
0
        }
4212
0
        free(port->rxq_affinity_list);
4213
0
        port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4214
4215
0
        dp_netdev_request_reconfigure(dp);
4216
0
    }
4217
4218
0
    if (nullable_string_is_equal(tx_steering_mode, "hash")) {
4219
0
        txq_mode = TXQ_REQ_MODE_HASH;
4220
0
    } else {
4221
0
        txq_mode = TXQ_REQ_MODE_THREAD;
4222
0
    }
4223
4224
0
    if (txq_mode != port->txq_requested_mode) {
4225
0
        port->txq_requested_mode = txq_mode;
4226
0
        VLOG_INFO("%s: Tx packet steering mode has been set to '%s'.",
4227
0
                  netdev_get_name(port->netdev),
4228
0
                  (txq_mode == TXQ_REQ_MODE_THREAD) ? "thread" : "hash");
4229
0
        dp_netdev_request_reconfigure(dp);
4230
0
    }
4231
4232
0
unlock:
4233
0
    ovs_rwlock_unlock(&dp->port_rwlock);
4234
0
    return error;
4235
0
}
4236
4237
static int
4238
dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4239
                              uint32_t queue_id, uint32_t *priority)
4240
0
{
4241
0
    *priority = queue_id;
4242
0
    return 0;
4243
0
}
4244
4245

4246
/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4247
 * a copy of the 'size' bytes of 'actions' input parameters. */
4248
struct dp_netdev_actions *
4249
dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4250
0
{
4251
0
    struct dp_netdev_actions *netdev_actions;
4252
4253
0
    netdev_actions = xmalloc(sizeof *netdev_actions + size);
4254
0
    netdev_actions->size = size;
4255
0
    if (size) {
4256
0
        memcpy(netdev_actions->actions, actions, size);
4257
0
    }
4258
4259
0
    return netdev_actions;
4260
0
}
4261
4262
struct dp_netdev_actions *
4263
dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4264
0
{
4265
0
    return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4266
0
}
4267
4268
/* Releases 'actions'.  The structure is a single allocation, so one free()
 * is all that is needed. */
static void
dp_netdev_actions_free(struct dp_netdev_actions *actions)
{
    free(actions);
}
4273

4274
static void
4275
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4276
                         enum rxq_cycles_counter_type type,
4277
                         unsigned long long cycles)
4278
0
{
4279
0
   atomic_store_relaxed(&rx->cycles[type], cycles);
4280
0
}
4281
4282
static void
4283
dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4284
                         enum rxq_cycles_counter_type type,
4285
                         unsigned long long cycles)
4286
0
{
4287
0
    non_atomic_ullong_add(&rx->cycles[type], cycles);
4288
0
}
4289
4290
static uint64_t
4291
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4292
                         enum rxq_cycles_counter_type type)
4293
0
{
4294
0
    unsigned long long processing_cycles;
4295
0
    atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4296
0
    return processing_cycles;
4297
0
}
4298
4299
static void
4300
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4301
                                unsigned long long cycles)
4302
0
{
4303
0
    unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX;
4304
0
    atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4305
0
}
4306
4307
static uint64_t
4308
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4309
0
{
4310
0
    unsigned long long processing_cycles;
4311
0
    atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4312
0
    return processing_cycles;
4313
0
}
4314
4315
#if ATOMIC_ALWAYS_LOCK_FREE_8B
4316
static inline bool
4317
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4318
0
{
4319
0
    bool pmd_perf_enabled;
4320
0
    atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4321
0
    return pmd_perf_enabled;
4322
0
}
4323
#else
4324
/* If stores and reads of 64-bit integers are not atomic, the full PMD
4325
 * performance metrics are not available as locked access to 64 bit
4326
 * integers would be prohibitively expensive. */
4327
static inline bool
4328
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4329
{
4330
    return false;
4331
}
4332
#endif
4333
4334
static int
4335
dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4336
                                   struct tx_port *p)
4337
0
{
4338
0
    int i;
4339
0
    int tx_qid;
4340
0
    int output_cnt;
4341
0
    bool concurrent_txqs;
4342
0
    struct cycle_timer timer;
4343
0
    uint64_t cycles;
4344
0
    uint32_t tx_flush_interval;
4345
4346
0
    cycle_timer_start(&pmd->perf_stats, &timer);
4347
4348
0
    output_cnt = dp_packet_batch_size(&p->output_pkts);
4349
0
    ovs_assert(output_cnt > 0);
4350
4351
0
    if (p->port->txq_mode == TXQ_MODE_XPS_HASH) {
4352
0
        int n_txq = netdev_n_txq(p->port->netdev);
4353
4354
        /* Re-batch per txq based on packet hash. */
4355
0
        struct dp_packet *packet;
4356
0
        DP_PACKET_BATCH_FOR_EACH (j, packet, &p->output_pkts) {
4357
0
            uint32_t hash;
4358
4359
0
            if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
4360
0
                hash = dp_packet_get_rss_hash(packet);
4361
0
            } else {
4362
0
                struct flow flow;
4363
4364
0
                flow_extract(packet, &flow);
4365
0
                hash = flow_hash_5tuple(&flow, 0);
4366
0
            }
4367
0
            dp_packet_batch_add(&p->txq_pkts[hash % n_txq], packet);
4368
0
        }
4369
4370
        /* Flush batches of each Tx queues. */
4371
0
        for (i = 0; i < n_txq; i++) {
4372
0
            if (dp_packet_batch_is_empty(&p->txq_pkts[i])) {
4373
0
                continue;
4374
0
            }
4375
0
            netdev_send(p->port->netdev, i, &p->txq_pkts[i], true);
4376
0
            dp_packet_batch_init(&p->txq_pkts[i]);
4377
0
        }
4378
0
    } else {
4379
0
        if (p->port->txq_mode == TXQ_MODE_XPS) {
4380
0
            tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4381
0
            concurrent_txqs = true;
4382
0
        } else {
4383
0
            tx_qid = pmd->static_tx_qid;
4384
0
            concurrent_txqs = false;
4385
0
        }
4386
0
        netdev_send(p->port->netdev, tx_qid, &p->output_pkts, concurrent_txqs);
4387
0
    }
4388
0
    dp_packet_batch_init(&p->output_pkts);
4389
4390
    /* Update time of the next flush. */
4391
0
    atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4392
0
    p->flush_time = pmd->ctx.now + tx_flush_interval;
4393
4394
0
    ovs_assert(pmd->n_output_batches > 0);
4395
0
    pmd->n_output_batches--;
4396
4397
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4398
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4399
4400
    /* Distribute send cycles evenly among transmitted packets and assign to
4401
     * their respective rx queues. */
4402
0
    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4403
0
    for (i = 0; i < output_cnt; i++) {
4404
0
        if (p->output_pkts_rxqs[i]) {
4405
0
            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4406
0
                                     RXQ_CYCLES_PROC_CURR, cycles);
4407
0
        }
4408
0
    }
4409
4410
0
    return output_cnt;
4411
0
}
4412
4413
static int
4414
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4415
                                   bool force)
4416
0
{
4417
0
    struct tx_port *p;
4418
0
    int output_cnt = 0;
4419
4420
0
    if (!pmd->n_output_batches) {
4421
0
        return 0;
4422
0
    }
4423
4424
0
    HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4425
0
        if (!dp_packet_batch_is_empty(&p->output_pkts)
4426
0
            && (force || pmd->ctx.now >= p->flush_time)) {
4427
0
            output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4428
0
        }
4429
0
    }
4430
0
    return output_cnt;
4431
0
}
4432
4433
static int
4434
dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
4435
                           struct dp_netdev_rxq *rxq,
4436
                           odp_port_t port_no)
4437
0
{
4438
0
    struct pmd_perf_stats *s = &pmd->perf_stats;
4439
0
    struct dp_packet_batch batch;
4440
0
    struct cycle_timer timer;
4441
0
    int error;
4442
0
    int batch_cnt = 0;
4443
0
    int rem_qlen = 0, *qlen_p = NULL;
4444
0
    uint64_t cycles;
4445
4446
    /* Measure duration for polling and processing rx burst. */
4447
0
    cycle_timer_start(&pmd->perf_stats, &timer);
4448
4449
0
    pmd->ctx.last_rxq = rxq;
4450
0
    dp_packet_batch_init(&batch);
4451
4452
    /* Fetch the rx queue length only for vhostuser ports. */
4453
0
    if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4454
0
        qlen_p = &rem_qlen;
4455
0
    }
4456
4457
0
    error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
4458
0
    if (!error) {
4459
        /* At least one packet received. */
4460
0
        *recirc_depth_get() = 0;
4461
0
        pmd_thread_ctx_time_update(pmd);
4462
0
        batch_cnt = dp_packet_batch_size(&batch);
4463
0
        if (pmd_perf_metrics_enabled(pmd)) {
4464
            /* Update batch histogram. */
4465
0
            s->current.batches++;
4466
0
            histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4467
            /* Update the maximum vhost rx queue fill level. */
4468
0
            if (rxq->is_vhost && rem_qlen >= 0) {
4469
0
                uint32_t qfill = batch_cnt + rem_qlen;
4470
0
                if (qfill > s->current.max_vhost_qfill) {
4471
0
                    s->current.max_vhost_qfill = qfill;
4472
0
                }
4473
0
            }
4474
0
        }
4475
4476
        /* Process packet batch. */
4477
0
        dp_netdev_input(pmd, &batch, port_no);
4478
4479
        /* Assign processing cycles to rx queue. */
4480
0
        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
4481
0
        dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4482
4483
0
        dp_netdev_pmd_flush_output_packets(pmd, false);
4484
0
    } else {
4485
        /* Discard cycles. */
4486
0
        cycle_timer_stop(&pmd->perf_stats, &timer);
4487
0
        if (error != EAGAIN && error != EOPNOTSUPP) {
4488
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4489
4490
0
            VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4491
0
                    netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4492
0
        }
4493
0
    }
4494
4495
0
    pmd->ctx.last_rxq = NULL;
4496
4497
0
    return batch_cnt;
4498
0
}
4499
4500
static struct tx_port *
4501
tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4502
0
{
4503
0
    struct tx_port *tx;
4504
4505
0
    HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4506
0
        if (tx->port->port_no == port_no) {
4507
0
            return tx;
4508
0
        }
4509
0
    }
4510
4511
0
    return NULL;
4512
0
}
4513
4514
static struct tx_bond *
4515
tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
4516
0
{
4517
0
    uint32_t hash = hash_bond_id(bond_id);
4518
0
    struct tx_bond *tx;
4519
4520
0
    CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
4521
0
        if (tx->bond_id == bond_id) {
4522
0
            return tx;
4523
0
        }
4524
0
    }
4525
0
    return NULL;
4526
0
}
4527
4528
static int
4529
port_reconfigure(struct dp_netdev_port *port)
4530
0
{
4531
0
    struct netdev *netdev = port->netdev;
4532
0
    int i, err;
4533
4534
    /* Closes the existing 'rxq's. */
4535
0
    for (i = 0; i < port->n_rxq; i++) {
4536
0
        netdev_rxq_close(port->rxqs[i].rx);
4537
0
        port->rxqs[i].rx = NULL;
4538
0
    }
4539
0
    unsigned last_nrxq = port->n_rxq;
4540
0
    port->n_rxq = 0;
4541
4542
    /* Allows 'netdev' to apply the pending configuration changes. */
4543
0
    if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
4544
0
        err = netdev_reconfigure(netdev);
4545
0
        if (err && (err != EOPNOTSUPP)) {
4546
0
            VLOG_ERR("Failed to set interface %s new configuration",
4547
0
                     netdev_get_name(netdev));
4548
0
            return err;
4549
0
        }
4550
0
    }
4551
    /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
4552
0
    port->rxqs = xrealloc(port->rxqs,
4553
0
                          sizeof *port->rxqs * netdev_n_rxq(netdev));
4554
    /* Realloc 'used' counters for tx queues. */
4555
0
    free(port->txq_used);
4556
0
    port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4557
4558
0
    for (i = 0; i < netdev_n_rxq(netdev); i++) {
4559
0
        bool new_queue = i >= last_nrxq;
4560
0
        if (new_queue) {
4561
0
            memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4562
0
        }
4563
4564
0
        port->rxqs[i].port = port;
4565
0
        port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
4566
4567
0
        err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
4568
0
        if (err) {
4569
0
            return err;
4570
0
        }
4571
0
        port->n_rxq++;
4572
0
    }
4573
4574
    /* Parse affinity list to apply configuration for new queues. */
4575
0
    dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4576
4577
    /* If reconfiguration was successful mark it as such, so we can use it */
4578
0
    port->need_reconfigure = false;
4579
4580
0
    return 0;
4581
0
}
4582
4583
/* Set of per-numa scheduling entries.  Entries are inserted into 'numas'
 * under hash_int(numa_id, 0) by sched_numa_list_populate() and released by
 * sched_numa_list_free_entries(). */
struct sched_numa_list {
4584
    struct hmap numas;  /* Contains 'struct sched_numa'. */
4585
};
4586
4587
/* Meta data for out-of-place pmd rxq assignments. */
4588
struct sched_pmd {
4589
    struct sched_numa *numa; /* Numa entry whose 'pmds' array holds this. */
4590
    /* Associated PMD thread. */
4591
    struct dp_netdev_pmd_thread *pmd;
4592
    uint64_t pmd_proc_cycles; /* Sum of 'cycles' passed to sched_pmd_add_rxq(). */
4593
    struct dp_netdev_rxq **rxqs; /* Rxqs assigned so far, 'n_rxq' entries. */
4594
    unsigned n_rxq; /* Number of entries in 'rxqs'. */
4595
    bool isolated; /* Copied to 'pmd->isolated' when put in place. */
4596
};
4597
4598
/* Scheduling state for one numa node and the PMDs that run on it. */
struct sched_numa {
4599
    struct hmap_node node; /* In 'struct sched_numa_list's 'numas'. */
4600
    int numa_id; /* Taken from the 'numa_id' of the PMDs on this node. */
4601
    /* PMDs on numa node. */
4602
    struct sched_pmd *pmds;
4603
    /* Num of PMDs on numa node. */
4604
    unsigned n_pmds;
4605
    /* Num of isolated PMDs on numa node. */
4606
    unsigned n_isolated;
4607
    int rr_cur_index; /* Index in 'pmds' that sched_pmd_next_rr() returns next. */
4608
    bool rr_idx_inc; /* True while sched_pmd_next_rr() walks upwards. */
4609
};
4610
4611
static size_t
4612
sched_numa_list_count(struct sched_numa_list *numa_list)
4613
0
{
4614
0
    return hmap_count(&numa_list->numas);
4615
0
}
4616
4617
static struct sched_numa *
4618
sched_numa_list_next(struct sched_numa_list *numa_list,
4619
                     const struct sched_numa *numa)
4620
0
{
4621
0
    struct hmap_node *node = NULL;
4622
4623
0
    if (numa) {
4624
0
        node = hmap_next(&numa_list->numas, &numa->node);
4625
0
    }
4626
0
    if (!node) {
4627
0
        node = hmap_first(&numa_list->numas);
4628
0
    }
4629
4630
0
    return (node) ? CONTAINER_OF(node, struct sched_numa, node) : NULL;
4631
0
}
4632
4633
static struct sched_numa *
4634
sched_numa_list_lookup(struct sched_numa_list *numa_list, int numa_id)
4635
0
{
4636
0
    struct sched_numa *numa;
4637
4638
0
    HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0),
4639
0
                             &numa_list->numas) {
4640
0
        if (numa->numa_id == numa_id) {
4641
0
            return numa;
4642
0
        }
4643
0
    }
4644
0
    return NULL;
4645
0
}
4646
4647
static int
4648
compare_sched_pmd_list(const void *a_, const void *b_)
4649
0
{
4650
0
    struct sched_pmd *a, *b;
4651
4652
0
    a = (struct sched_pmd *) a_;
4653
0
    b = (struct sched_pmd *) b_;
4654
4655
0
    return compare_poll_thread_list(&a->pmd, &b->pmd);
4656
0
}
4657
4658
static void
4659
sort_numa_list_pmds(struct sched_numa_list *numa_list)
4660
0
{
4661
0
    struct sched_numa *numa;
4662
4663
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
4664
0
        if (numa->n_pmds > 1) {
4665
0
            qsort(numa->pmds, numa->n_pmds, sizeof *numa->pmds,
4666
0
                  compare_sched_pmd_list);
4667
0
        }
4668
0
    }
4669
0
}
4670
4671
/* Populate numas and pmds on those numas. */
4672
static void
4673
sched_numa_list_populate(struct sched_numa_list *numa_list,
4674
                         struct dp_netdev *dp)
4675
0
{
4676
0
    struct dp_netdev_pmd_thread *pmd;
4677
4678
0
    hmap_init(&numa_list->numas);
4679
4680
    /* For each pmd on this datapath. */
4681
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4682
0
        struct sched_numa *numa;
4683
0
        struct sched_pmd *sched_pmd;
4684
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
4685
0
            continue;
4686
0
        }
4687
4688
        /* Get the numa of the PMD. */
4689
0
        numa = sched_numa_list_lookup(numa_list, pmd->numa_id);
4690
        /* Create a new numa node for it if not already created. */
4691
0
        if (!numa) {
4692
0
            numa = xzalloc(sizeof *numa);
4693
0
            numa->numa_id = pmd->numa_id;
4694
0
            hmap_insert(&numa_list->numas, &numa->node,
4695
0
                        hash_int(pmd->numa_id, 0));
4696
0
        }
4697
4698
        /* Create a sched_pmd on this numa for the pmd. */
4699
0
        numa->n_pmds++;
4700
0
        numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4701
0
        sched_pmd = &numa->pmds[numa->n_pmds - 1];
4702
0
        memset(sched_pmd, 0, sizeof *sched_pmd);
4703
0
        sched_pmd->numa = numa;
4704
0
        sched_pmd->pmd = pmd;
4705
        /* At least one pmd is present so initialize curr_idx and idx_inc. */
4706
0
        numa->rr_cur_index = 0;
4707
0
        numa->rr_idx_inc = true;
4708
0
    }
4709
0
    sort_numa_list_pmds(numa_list);
4710
0
}
4711
4712
static void
4713
sched_numa_list_free_entries(struct sched_numa_list *numa_list)
4714
0
{
4715
0
    struct sched_numa *numa;
4716
4717
0
    HMAP_FOR_EACH_POP (numa, node, &numa_list->numas) {
4718
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
4719
0
            struct sched_pmd *sched_pmd;
4720
4721
0
            sched_pmd = &numa->pmds[i];
4722
0
            sched_pmd->n_rxq = 0;
4723
0
            free(sched_pmd->rxqs);
4724
0
        }
4725
0
        numa->n_pmds = 0;
4726
0
        free(numa->pmds);
4727
0
        free(numa);
4728
0
    }
4729
0
    hmap_destroy(&numa_list->numas);
4730
0
}
4731
4732
static struct sched_pmd *
4733
sched_pmd_find_by_pmd(struct sched_numa_list *numa_list,
4734
                      struct dp_netdev_pmd_thread *pmd)
4735
0
{
4736
0
    struct sched_numa *numa;
4737
4738
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
4739
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
4740
0
            struct sched_pmd *sched_pmd;
4741
4742
0
            sched_pmd = &numa->pmds[i];
4743
0
            if (pmd == sched_pmd->pmd) {
4744
0
                return sched_pmd;
4745
0
            }
4746
0
        }
4747
0
    }
4748
0
    return NULL;
4749
0
}
4750
4751
static void
4752
sched_pmd_add_rxq(struct sched_pmd *sched_pmd, struct dp_netdev_rxq *rxq,
4753
                  uint64_t cycles)
4754
0
{
4755
    /* As sched_pmd is allocated outside this fn. better to not assume
4756
     * rxqs is initialized to NULL. */
4757
0
    if (sched_pmd->n_rxq == 0) {
4758
0
        sched_pmd->rxqs = xmalloc(sizeof *sched_pmd->rxqs);
4759
0
    } else {
4760
0
        sched_pmd->rxqs = xrealloc(sched_pmd->rxqs, (sched_pmd->n_rxq + 1) *
4761
0
                                                    sizeof *sched_pmd->rxqs);
4762
0
    }
4763
4764
0
    sched_pmd->rxqs[sched_pmd->n_rxq++] = rxq;
4765
0
    sched_pmd->pmd_proc_cycles += cycles;
4766
0
}
4767
4768
static void
4769
sched_numa_list_assignments(struct sched_numa_list *numa_list,
4770
                            struct dp_netdev *dp)
4771
    OVS_REQ_RDLOCK(dp->port_rwlock)
4772
0
{
4773
0
    struct dp_netdev_port *port;
4774
4775
    /* For each port. */
4776
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
4777
0
        if (!netdev_is_pmd(port->netdev)) {
4778
0
            continue;
4779
0
        }
4780
        /* For each rxq on the port. */
4781
0
        for (unsigned qid = 0; qid < port->n_rxq; qid++) {
4782
0
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
4783
0
            struct sched_pmd *sched_pmd;
4784
0
            uint64_t proc_cycles = 0;
4785
4786
0
            for (int i = 0; i < PMD_INTERVAL_MAX; i++) {
4787
0
                proc_cycles  += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
4788
0
            }
4789
4790
0
            sched_pmd = sched_pmd_find_by_pmd(numa_list, rxq->pmd);
4791
0
            if (sched_pmd) {
4792
0
                if (rxq->core_id != OVS_CORE_UNSPEC && dp->pmd_iso) {
4793
0
                    sched_pmd->isolated = true;
4794
0
                }
4795
0
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
4796
0
            }
4797
0
        }
4798
0
    }
4799
0
}
4800
4801
static void
4802
sched_numa_list_put_in_place(struct sched_numa_list *numa_list)
4803
0
{
4804
0
    struct sched_numa *numa;
4805
4806
    /* For each numa. */
4807
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
4808
        /* For each pmd. */
4809
0
        for (int i = 0; i < numa->n_pmds; i++) {
4810
0
            struct sched_pmd *sched_pmd;
4811
4812
0
            sched_pmd = &numa->pmds[i];
4813
0
            sched_pmd->pmd->isolated = sched_pmd->isolated;
4814
            /* For each rxq. */
4815
0
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
4816
                /* Store the new pmd from the out of place sched_numa_list
4817
                 * struct to the dp_netdev_rxq struct */
4818
0
                sched_pmd->rxqs[k]->pmd = sched_pmd->pmd;
4819
0
            }
4820
0
        }
4821
0
    }
4822
0
}
4823
4824
/* Returns 'true' if OVS rxq scheduling algorithm assigned any unpinned rxq to
4825
 * a PMD thread core on a non-local numa node. */
4826
static bool
4827
sched_numa_list_cross_numa_polling(struct sched_numa_list *numa_list)
4828
0
{
4829
0
    struct sched_numa *numa;
4830
4831
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
4832
0
        for (int i = 0; i < numa->n_pmds; i++) {
4833
0
            struct sched_pmd *sched_pmd;
4834
4835
0
            sched_pmd = &numa->pmds[i];
4836
0
            if (sched_pmd->isolated) {
4837
                /* All rxqs on this PMD thread core are pinned. */
4838
0
                continue;
4839
0
            }
4840
0
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
4841
0
                struct dp_netdev_rxq *rxq = sched_pmd->rxqs[k];
4842
                /* Check if the rxq is not pinned to a specific PMD thread core
4843
                 * by the user AND the PMD thread core that OVS assigned is
4844
                 * non-local to the rxq port. */
4845
0
                if (rxq->core_id == OVS_CORE_UNSPEC &&
4846
0
                    rxq->pmd->numa_id !=
4847
0
                        netdev_get_numa_id(rxq->port->netdev)) {
4848
0
                    return true;
4849
0
                }
4850
0
            }
4851
0
        }
4852
0
    }
4853
0
    return false;
4854
0
}
4855
4856
static unsigned
4857
sched_numa_noniso_pmd_count(struct sched_numa *numa)
4858
0
{
4859
0
    if (numa->n_pmds > numa->n_isolated) {
4860
0
        return numa->n_pmds - numa->n_isolated;
4861
0
    }
4862
0
    return 0;
4863
0
}
4864
4865
/* Sort Rx Queues by the processing cycles they are consuming. */
4866
static int
4867
compare_rxq_cycles(const void *a, const void *b)
4868
0
{
4869
0
    struct dp_netdev_rxq *qa;
4870
0
    struct dp_netdev_rxq *qb;
4871
0
    uint64_t cycles_qa, cycles_qb;
4872
4873
0
    qa = *(struct dp_netdev_rxq **) a;
4874
0
    qb = *(struct dp_netdev_rxq **) b;
4875
4876
0
    cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4877
0
    cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
4878
4879
0
    if (cycles_qa != cycles_qb) {
4880
0
        return (cycles_qa < cycles_qb) ? 1 : -1;
4881
0
    } else {
4882
        /* Cycles are the same so tiebreak on port/queue id.
4883
         * Tiebreaking (as opposed to return 0) ensures consistent
4884
         * sort results across multiple OS's. */
4885
0
        uint32_t port_qa = odp_to_u32(qa->port->port_no);
4886
0
        uint32_t port_qb = odp_to_u32(qb->port->port_no);
4887
0
        if (port_qa != port_qb) {
4888
0
            return port_qa > port_qb ? 1 : -1;
4889
0
        } else {
4890
0
            return netdev_rxq_get_queue_id(qa->rx)
4891
0
                    - netdev_rxq_get_queue_id(qb->rx);
4892
0
        }
4893
0
    }
4894
0
}
4895
4896
static bool
4897
sched_pmd_new_lowest(struct sched_pmd *current_lowest, struct sched_pmd *pmd,
4898
                     bool has_proc)
4899
0
{
4900
0
    uint64_t current_num, pmd_num;
4901
4902
0
    if (current_lowest == NULL) {
4903
0
        return true;
4904
0
    }
4905
4906
0
    if (has_proc) {
4907
0
        current_num = current_lowest->pmd_proc_cycles;
4908
0
        pmd_num = pmd->pmd_proc_cycles;
4909
0
    } else {
4910
0
        current_num = current_lowest->n_rxq;
4911
0
        pmd_num = pmd->n_rxq;
4912
0
    }
4913
4914
0
    if (pmd_num < current_num) {
4915
0
        return true;
4916
0
    }
4917
0
    return false;
4918
0
}
4919
4920
static struct sched_pmd *
4921
sched_pmd_get_lowest(struct sched_numa *numa, bool has_cyc)
4922
0
{
4923
0
    struct sched_pmd *lowest_sched_pmd = NULL;
4924
4925
0
    for (unsigned i = 0; i < numa->n_pmds; i++) {
4926
0
        struct sched_pmd *sched_pmd;
4927
4928
0
        sched_pmd = &numa->pmds[i];
4929
0
        if (sched_pmd->isolated) {
4930
0
            continue;
4931
0
        }
4932
0
        if (sched_pmd_new_lowest(lowest_sched_pmd, sched_pmd, has_cyc)) {
4933
0
            lowest_sched_pmd = sched_pmd;
4934
0
        }
4935
0
    }
4936
0
    return lowest_sched_pmd;
4937
0
}
4938
4939
/*
4940
 * Returns the next pmd from the numa node.
4941
 *
4942
 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4943
 * either an up or down walk, switching between up/down when the first or last
4944
 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4945
 *
4946
 * If 'updown' is 'false' it will select the next pmd wrapping around when
4947
 * last core reached. e.g. 1,2,3,1,2,3,1,2...
4948
 */
4949
static struct sched_pmd *
4950
sched_pmd_next_rr(struct sched_numa *numa, bool updown)
4951
0
{
4952
0
    int numa_idx = numa->rr_cur_index;
4953
4954
0
    if (numa->rr_idx_inc == true) {
4955
        /* Incrementing through list of pmds. */
4956
0
        if (numa->rr_cur_index == numa->n_pmds - 1) {
4957
            /* Reached the last pmd. */
4958
0
            if (updown) {
4959
0
                numa->rr_idx_inc = false;
4960
0
            } else {
4961
0
                numa->rr_cur_index = 0;
4962
0
            }
4963
0
        } else {
4964
0
            numa->rr_cur_index++;
4965
0
        }
4966
0
    } else {
4967
        /* Decrementing through list of pmds. */
4968
0
        if (numa->rr_cur_index == 0) {
4969
            /* Reached the first pmd. */
4970
0
            numa->rr_idx_inc = true;
4971
0
        } else {
4972
0
            numa->rr_cur_index--;
4973
0
        }
4974
0
    }
4975
0
    return &numa->pmds[numa_idx];
4976
0
}
4977
4978
static struct sched_pmd *
4979
sched_pmd_next_noniso_rr(struct sched_numa *numa, bool updown)
4980
0
{
4981
0
    struct sched_pmd *sched_pmd = NULL;
4982
4983
    /* sched_pmd_next_rr() may return duplicate PMDs before all PMDs have been
4984
     * returned depending on updown. Call it more than n_pmds to ensure all
4985
     * PMDs can be searched for the next non-isolated PMD. */
4986
0
    for (unsigned i = 0; i < numa->n_pmds * 2; i++) {
4987
0
        sched_pmd = sched_pmd_next_rr(numa, updown);
4988
0
        if (!sched_pmd->isolated) {
4989
0
            break;
4990
0
        }
4991
0
        sched_pmd = NULL;
4992
0
    }
4993
0
    return sched_pmd;
4994
0
}
4995
4996
static struct sched_pmd *
4997
sched_pmd_next(struct sched_numa *numa, enum sched_assignment_type algo,
4998
               bool has_proc)
4999
0
{
5000
0
    if (algo == SCHED_GROUP) {
5001
0
        return sched_pmd_get_lowest(numa, has_proc);
5002
0
    }
5003
5004
    /* By default RR the PMDs. */
5005
0
    return sched_pmd_next_noniso_rr(numa, algo == SCHED_CYCLES ? true : false);
5006
0
}
5007
5008
static const char *
5009
get_assignment_type_string(enum sched_assignment_type algo)
5010
0
{
5011
0
    switch (algo) {
5012
0
    case SCHED_ROUNDROBIN: return "roundrobin";
5013
0
    case SCHED_CYCLES: return "cycles";
5014
0
    case SCHED_GROUP: return "group";
5015
0
    default: return "Unknown";
5016
0
    }
5017
0
}
5018
5019
0
/* Bytes reserved for the fixed text that get_rxq_cyc_log() puts around the
 * cycle count. */
#define MAX_RXQ_CYC_TEXT 40
5020
0
/* Size of a buffer for get_rxq_cyc_log(): room for a printed uint64_t plus
 * the fixed text. */
#define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + MAX_RXQ_CYC_TEXT)
5021
5022
static char *
5023
get_rxq_cyc_log(char *a, enum sched_assignment_type algo, uint64_t cycles)
5024
0
{
5025
0
    int ret = 0;
5026
5027
0
    if (algo != SCHED_ROUNDROBIN) {
5028
0
        ret = snprintf(a, MAX_RXQ_CYC_STRLEN,
5029
0
                       " (measured processing cycles %"PRIu64")", cycles);
5030
0
    }
5031
5032
0
    if (algo == SCHED_ROUNDROBIN || ret <= 0) {
5033
0
        a[0] = '\0';
5034
0
    }
5035
0
    return a;
5036
0
}
5037
5038
static void
5039
sched_numa_list_schedule(struct sched_numa_list *numa_list,
5040
                         struct dp_netdev *dp,
5041
                         enum sched_assignment_type algo,
5042
                         enum vlog_level level)
5043
    OVS_REQ_RDLOCK(dp->port_rwlock)
5044
0
{
5045
0
    struct dp_netdev_port *port;
5046
0
    struct dp_netdev_rxq **rxqs = NULL;
5047
0
    struct sched_numa *last_cross_numa;
5048
0
    unsigned n_rxqs = 0;
5049
0
    bool start_logged = false;
5050
0
    size_t n_numa;
5051
5052
    /* For each port. */
5053
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5054
0
        if (!netdev_is_pmd(port->netdev)) {
5055
0
            continue;
5056
0
        }
5057
5058
        /* For each rxq on the port. */
5059
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
5060
0
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
5061
5062
0
            if (algo != SCHED_ROUNDROBIN) {
5063
0
                uint64_t cycle_hist = 0;
5064
5065
                /* Sum the queue intervals and store the cycle history. */
5066
0
                for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
5067
0
                    cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
5068
0
                }
5069
0
                dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST,
5070
0
                                         cycle_hist);
5071
0
            }
5072
5073
            /* Check if this rxq is pinned. */
5074
0
            if (rxq->core_id != OVS_CORE_UNSPEC) {
5075
0
                struct sched_pmd *sched_pmd;
5076
0
                struct dp_netdev_pmd_thread *pmd;
5077
0
                struct sched_numa *numa;
5078
0
                bool iso = dp->pmd_iso;
5079
0
                uint64_t proc_cycles;
5080
0
                char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
5081
5082
                /* This rxq should be pinned, pin it now. */
5083
0
                pmd = dp_netdev_get_pmd(dp, rxq->core_id);
5084
0
                sched_pmd = sched_pmd_find_by_pmd(numa_list, pmd);
5085
0
                dp_netdev_pmd_unref(pmd);
5086
0
                if (!sched_pmd) {
5087
                    /* Cannot find the PMD.  Cannot pin this rxq. */
5088
0
                    VLOG(level == VLL_DBG ? VLL_DBG : VLL_WARN,
5089
0
                            "Core %2u cannot be pinned with "
5090
0
                            "port \'%s\' rx queue %d. Use pmd-cpu-mask to "
5091
0
                            "enable a pmd on core %u. An alternative core "
5092
0
                            "will be assigned.",
5093
0
                            rxq->core_id,
5094
0
                            netdev_rxq_get_name(rxq->rx),
5095
0
                            netdev_rxq_get_queue_id(rxq->rx),
5096
0
                            rxq->core_id);
5097
0
                    rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
5098
0
                    rxqs[n_rxqs++] = rxq;
5099
0
                    continue;
5100
0
                }
5101
0
                if (iso) {
5102
                    /* Mark PMD as isolated if not done already. */
5103
0
                    if (sched_pmd->isolated == false) {
5104
0
                        sched_pmd->isolated = true;
5105
0
                        numa = sched_pmd->numa;
5106
0
                        numa->n_isolated++;
5107
0
                    }
5108
0
                }
5109
0
                proc_cycles = dp_netdev_rxq_get_cycles(rxq,
5110
0
                                                       RXQ_CYCLES_PROC_HIST);
5111
0
                VLOG(level, "Core %2u on numa node %d is pinned with "
5112
0
                            "port \'%s\' rx queue %d%s",
5113
0
                            sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
5114
0
                            netdev_rxq_get_name(rxq->rx),
5115
0
                            netdev_rxq_get_queue_id(rxq->rx),
5116
0
                            get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
5117
0
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
5118
0
            } else {
5119
0
                rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
5120
0
                rxqs[n_rxqs++] = rxq;
5121
0
            }
5122
0
        }
5123
0
    }
5124
5125
0
    if (n_rxqs > 1 && algo != SCHED_ROUNDROBIN) {
5126
        /* Sort the queues in order of the processing cycles
5127
         * they consumed during their last pmd interval. */
5128
0
        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5129
0
    }
5130
5131
0
    last_cross_numa = NULL;
5132
0
    n_numa = sched_numa_list_count(numa_list);
5133
0
    for (unsigned i = 0; i < n_rxqs; i++) {
5134
0
        struct dp_netdev_rxq *rxq = rxqs[i];
5135
0
        struct sched_pmd *sched_pmd = NULL;
5136
0
        struct sched_numa *numa;
5137
0
        int port_numa_id;
5138
0
        uint64_t proc_cycles;
5139
0
        char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
5140
5141
0
        if (start_logged == false && level != VLL_DBG) {
5142
0
            VLOG(level, "Performing pmd to rx queue assignment using %s "
5143
0
                        "algorithm.", get_assignment_type_string(algo));
5144
0
            start_logged = true;
5145
0
        }
5146
5147
        /* Store the cycles for this rxq as we will log these later. */
5148
0
        proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST);
5149
5150
0
        port_numa_id = netdev_get_numa_id(rxq->port->netdev);
5151
5152
        /* Select numa. */
5153
0
        numa = sched_numa_list_lookup(numa_list, port_numa_id);
5154
5155
        /* Check if numa has no PMDs or no non-isolated PMDs. */
5156
0
        if (!numa || !sched_numa_noniso_pmd_count(numa)) {
5157
            /* Unable to use this numa to find a PMD. */
5158
0
            numa = NULL;
5159
            /* Find any numa with available PMDs. */
5160
0
            for (int j = 0; j < n_numa; j++) {
5161
0
                numa = sched_numa_list_next(numa_list, last_cross_numa);
5162
0
                last_cross_numa = numa;
5163
0
                if (sched_numa_noniso_pmd_count(numa)) {
5164
0
                    break;
5165
0
                }
5166
0
                numa = NULL;
5167
0
            }
5168
0
        }
5169
5170
0
        if (numa) {
5171
            /* Select the PMD that should be used for this rxq. */
5172
0
            sched_pmd = sched_pmd_next(numa, algo,
5173
0
                                       proc_cycles ? true : false);
5174
0
        }
5175
5176
        /* Check that a pmd has been selected. */
5177
0
        if (sched_pmd) {
5178
0
            int pmd_numa_id;
5179
5180
0
            pmd_numa_id = sched_pmd->numa->numa_id;
5181
            /* Check if selected pmd numa matches port numa. */
5182
0
            if (pmd_numa_id != port_numa_id) {
5183
0
                VLOG(level, "There's no available (non-isolated) pmd thread "
5184
0
                            "on numa node %d. Port \'%s\' rx queue %d will "
5185
0
                            "be assigned to a pmd on numa node %d. "
5186
0
                            "This may lead to reduced performance.",
5187
0
                            port_numa_id, netdev_rxq_get_name(rxq->rx),
5188
0
                            netdev_rxq_get_queue_id(rxq->rx), pmd_numa_id);
5189
0
            }
5190
0
            VLOG(level, "Core %2u on numa node %d assigned port \'%s\' "
5191
0
                        "rx queue %d%s.",
5192
0
                        sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
5193
0
                        netdev_rxq_get_name(rxq->rx),
5194
0
                        netdev_rxq_get_queue_id(rxq->rx),
5195
0
                        get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
5196
0
            sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
5197
0
        } else  {
5198
0
            VLOG(level == VLL_DBG ? level : VLL_WARN,
5199
0
                 "No non-isolated pmd on any numa available for "
5200
0
                 "port \'%s\' rx queue %d%s. "
5201
0
                 "This rx queue will not be polled.",
5202
0
                 netdev_rxq_get_name(rxq->rx),
5203
0
                 netdev_rxq_get_queue_id(rxq->rx),
5204
0
                 get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
5205
0
        }
5206
0
    }
5207
0
    free(rxqs);
5208
0
}
5209
5210
static void
5211
rxq_scheduling(struct dp_netdev *dp)
5212
    OVS_REQ_RDLOCK(dp->port_rwlock)
5213
0
{
5214
0
    struct sched_numa_list numa_list;
5215
0
    enum sched_assignment_type algo = dp->pmd_rxq_assign_type;
5216
5217
0
    sched_numa_list_populate(&numa_list, dp);
5218
0
    sched_numa_list_schedule(&numa_list, dp, algo, VLL_INFO);
5219
0
    sched_numa_list_put_in_place(&numa_list);
5220
5221
0
    sched_numa_list_free_entries(&numa_list);
5222
0
}
5223
5224
/* Forward declaration: variance() is defined further down in this file but
 * is already needed by sched_numa_variance() below. */
static uint64_t variance(uint64_t a[], int n);
5225
5226
static uint64_t
5227
sched_numa_variance(struct sched_numa *numa)
5228
0
{
5229
0
    uint64_t *percent_busy = NULL;
5230
0
    int n_proc = 0;
5231
0
    uint64_t var;
5232
5233
0
    percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy);
5234
5235
0
    for (unsigned i = 0; i < numa->n_pmds; i++) {
5236
0
        struct sched_pmd *sched_pmd;
5237
0
        uint64_t total_cycles = 0;
5238
5239
0
        sched_pmd = &numa->pmds[i];
5240
        /* Exclude isolated PMDs from variance calculations. */
5241
0
        if (sched_pmd->isolated == true) {
5242
0
            continue;
5243
0
        }
5244
        /* Get the total pmd cycles for an interval. */
5245
0
        atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles);
5246
5247
0
        if (total_cycles) {
5248
            /* Estimate the cycles to cover all intervals. */
5249
0
            total_cycles *= PMD_INTERVAL_MAX;
5250
0
            percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
5251
0
                                            / total_cycles;
5252
0
        } else {
5253
0
            percent_busy[n_proc++] = 0;
5254
0
        }
5255
0
    }
5256
0
    var = variance(percent_busy, n_proc);
5257
0
    free(percent_busy);
5258
0
    return var;
5259
0
}
5260
5261
/*
5262
 * This function checks that some basic conditions needed for a rebalance to be
5263
 * effective are met. Such as Rxq scheduling assignment type, more than one
5264
 * PMD, more than one Rxq on a PMD. If there was no reconfiguration change
5265
 * since the last check, it reuses the last result.
5266
 *
5267
 * It is not intended to be an inclusive check of every condition that may make
5268
 * a rebalance ineffective. It is done as a quick check so a full
5269
 * pmd_rebalance_dry_run() can be avoided when it is not needed.
5270
 */
5271
static bool
5272
pmd_rebalance_dry_run_needed(struct dp_netdev *dp)
5273
    OVS_REQ_RDLOCK(dp->port_rwlock)
5274
0
{
5275
0
    struct dp_netdev_pmd_thread *pmd;
5276
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5277
0
    unsigned int cnt = 0;
5278
0
    bool multi_rxq = false;
5279
5280
    /* Check if there was no reconfiguration since last check. */
5281
0
    if (!pmd_alb->recheck_config) {
5282
0
        if (!pmd_alb->do_dry_run) {
5283
0
            VLOG_DBG("PMD auto load balance nothing to do, "
5284
0
                     "no configuration changes since last check.");
5285
0
            return false;
5286
0
        }
5287
0
        return true;
5288
0
    }
5289
0
    pmd_alb->recheck_config = false;
5290
5291
    /* Check for incompatible assignment type. */
5292
0
    if (dp->pmd_rxq_assign_type == SCHED_ROUNDROBIN) {
5293
0
        VLOG_DBG("PMD auto load balance nothing to do, "
5294
0
                 "pmd-rxq-assign=roundrobin assignment type configured.");
5295
0
        return pmd_alb->do_dry_run = false;
5296
0
    }
5297
5298
    /* Check that there is at least 2 non-isolated PMDs and
5299
     * one of them is polling more than one rxq. */
5300
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5301
0
        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
5302
0
            continue;
5303
0
        }
5304
5305
0
        if (hmap_count(&pmd->poll_list) > 1) {
5306
0
            multi_rxq = true;
5307
0
        }
5308
0
        if (cnt && multi_rxq) {
5309
0
            return pmd_alb->do_dry_run = true;
5310
0
        }
5311
0
        cnt++;
5312
0
    }
5313
5314
0
    VLOG_DBG("PMD auto load balance nothing to do, "
5315
0
             "not enough non-isolated PMDs or RxQs.");
5316
0
    return pmd_alb->do_dry_run = false;
5317
0
}
5318
5319
static bool
5320
pmd_rebalance_dry_run(struct dp_netdev *dp)
5321
    OVS_REQ_RDLOCK(dp->port_rwlock)
5322
0
{
5323
0
    struct sched_numa_list numa_list_cur;
5324
0
    struct sched_numa_list numa_list_est;
5325
0
    bool thresh_met = false;
5326
5327
0
    VLOG_DBG("PMD auto load balance performing dry run.");
5328
5329
    /* Populate current assignments. */
5330
0
    sched_numa_list_populate(&numa_list_cur, dp);
5331
0
    sched_numa_list_assignments(&numa_list_cur, dp);
5332
5333
    /* Populate estimated assignments. */
5334
0
    sched_numa_list_populate(&numa_list_est, dp);
5335
0
    sched_numa_list_schedule(&numa_list_est, dp,
5336
0
                             dp->pmd_rxq_assign_type, VLL_DBG);
5337
5338
    /* Check if cross-numa polling, there is only one numa with PMDs. */
5339
0
    if (!sched_numa_list_cross_numa_polling(&numa_list_est) ||
5340
0
            sched_numa_list_count(&numa_list_est) == 1) {
5341
0
        struct sched_numa *numa_cur;
5342
5343
        /* Calculate variances. */
5344
0
        HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) {
5345
0
            uint64_t current_var, estimate_var;
5346
0
            struct sched_numa *numa_est;
5347
0
            uint64_t improvement = 0;
5348
5349
0
            numa_est = sched_numa_list_lookup(&numa_list_est,
5350
0
                                              numa_cur->numa_id);
5351
0
            if (!numa_est) {
5352
0
                continue;
5353
0
            }
5354
0
            current_var = sched_numa_variance(numa_cur);
5355
0
            estimate_var = sched_numa_variance(numa_est);
5356
0
            if (estimate_var < current_var) {
5357
0
                improvement = ((current_var - estimate_var) * 100)
5358
0
                              / current_var;
5359
0
            }
5360
0
            VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated "
5361
0
                     "variance %"PRIu64". Variance improvement %"PRIu64"%%.",
5362
0
                     numa_cur->numa_id, current_var,
5363
0
                     estimate_var, improvement);
5364
0
            if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
5365
0
                thresh_met = true;
5366
0
            }
5367
0
        }
5368
0
        VLOG_DBG("PMD load variance improvement threshold %u%% is %s.",
5369
0
                 dp->pmd_alb.rebalance_improve_thresh,
5370
0
                 thresh_met ? "met" : "not met");
5371
0
    } else {
5372
0
        VLOG_DBG("PMD auto load balance detected cross-numa polling with "
5373
0
                 "multiple numa nodes. Unable to accurately estimate.");
5374
0
    }
5375
5376
0
    sched_numa_list_free_entries(&numa_list_cur);
5377
0
    sched_numa_list_free_entries(&numa_list_est);
5378
5379
0
    return thresh_met;
5380
0
}
5381
5382
static void
5383
reload_affected_pmds(struct dp_netdev *dp)
5384
0
{
5385
0
    struct dp_netdev_pmd_thread *pmd;
5386
5387
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5388
0
        if (pmd->need_reload) {
5389
0
            dp_netdev_reload_pmd__(pmd);
5390
0
        }
5391
0
    }
5392
5393
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5394
0
        if (pmd->need_reload) {
5395
0
            if (pmd->core_id != NON_PMD_CORE_ID) {
5396
0
                bool reload;
5397
5398
0
                do {
5399
0
                    atomic_read_explicit(&pmd->reload, &reload,
5400
0
                                         memory_order_acquire);
5401
0
                } while (reload);
5402
0
            }
5403
0
            pmd->need_reload = false;
5404
0
        }
5405
0
    }
5406
0
}
5407
5408
static void
5409
reconfigure_pmd_threads(struct dp_netdev *dp)
5410
    OVS_REQ_RDLOCK(dp->port_rwlock)
5411
0
{
5412
0
    struct dp_netdev_pmd_thread *pmd;
5413
0
    struct ovs_numa_dump *pmd_cores;
5414
0
    struct ovs_numa_info_core *core;
5415
0
    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
5416
0
    struct hmapx_node *node;
5417
0
    bool changed = false;
5418
0
    bool need_to_adjust_static_tx_qids = false;
5419
5420
    /* The pmd threads should be started only if there's a pmd port in the
5421
     * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
5422
     * NR_PMD_THREADS per numa node. */
5423
0
    if (!has_pmd_port(dp)) {
5424
0
        pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
5425
0
    } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
5426
0
        pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
5427
0
    } else {
5428
0
        pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
5429
0
    }
5430
5431
    /* We need to adjust 'static_tx_qid's only if we're reducing number of
5432
     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
5433
0
    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
5434
        /* Adjustment is required to keep 'static_tx_qid's sequential and
5435
         * avoid possible issues, for example, imbalanced tx queue usage
5436
         * and unnecessary locking caused by remapping on netdev level. */
5437
0
        need_to_adjust_static_tx_qids = true;
5438
0
    }
5439
5440
    /* Check for unwanted pmd threads */
5441
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5442
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
5443
0
            continue;
5444
0
        }
5445
0
        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
5446
0
                                                    pmd->core_id)) {
5447
0
            hmapx_add(&to_delete, pmd);
5448
0
        } else if (need_to_adjust_static_tx_qids) {
5449
0
            atomic_store_relaxed(&pmd->reload_tx_qid, true);
5450
0
            pmd->need_reload = true;
5451
0
        }
5452
0
    }
5453
5454
0
    HMAPX_FOR_EACH (node, &to_delete) {
5455
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
5456
0
        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
5457
0
                  pmd->numa_id, pmd->core_id);
5458
0
        dp_netdev_del_pmd(dp, pmd);
5459
0
    }
5460
0
    changed = !hmapx_is_empty(&to_delete);
5461
0
    hmapx_destroy(&to_delete);
5462
5463
0
    if (need_to_adjust_static_tx_qids) {
5464
        /* 'static_tx_qid's are not sequential now.
5465
         * Reload remaining threads to fix this. */
5466
0
        reload_affected_pmds(dp);
5467
0
    }
5468
5469
    /* Check for required new pmd threads */
5470
0
    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
5471
0
        pmd = dp_netdev_get_pmd(dp, core->core_id);
5472
0
        if (!pmd) {
5473
0
            struct ds name = DS_EMPTY_INITIALIZER;
5474
5475
0
            pmd = xzalloc(sizeof *pmd);
5476
0
            dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
5477
5478
0
            ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
5479
0
            pmd->thread = ovs_thread_create(ds_cstr(&name),
5480
0
                                            pmd_thread_main, pmd);
5481
0
            ds_destroy(&name);
5482
5483
0
            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
5484
0
                      pmd->numa_id, pmd->core_id);
5485
0
            changed = true;
5486
0
        } else {
5487
0
            dp_netdev_pmd_unref(pmd);
5488
0
        }
5489
0
    }
5490
5491
0
    if (changed) {
5492
0
        struct ovs_numa_info_numa *numa;
5493
5494
        /* Log the number of pmd threads per numa node. */
5495
0
        FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
5496
0
            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
5497
0
                      numa->n_cores, numa->numa_id);
5498
0
        }
5499
0
    }
5500
5501
0
    ovs_numa_dump_destroy(pmd_cores);
5502
0
}
5503
5504
static void
5505
pmd_remove_stale_ports(struct dp_netdev *dp,
5506
                       struct dp_netdev_pmd_thread *pmd)
5507
    OVS_EXCLUDED(pmd->port_mutex)
5508
    OVS_REQ_RDLOCK(dp->port_rwlock)
5509
0
{
5510
0
    struct rxq_poll *poll;
5511
0
    struct tx_port *tx;
5512
5513
0
    ovs_mutex_lock(&pmd->port_mutex);
5514
0
    HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
5515
0
        struct dp_netdev_port *port = poll->rxq->port;
5516
5517
0
        if (port->need_reconfigure
5518
0
            || !hmap_contains(&dp->ports, &port->node)) {
5519
0
            dp_netdev_del_rxq_from_pmd(pmd, poll);
5520
0
        }
5521
0
    }
5522
0
    HMAP_FOR_EACH_SAFE (tx, node, &pmd->tx_ports) {
5523
0
        struct dp_netdev_port *port = tx->port;
5524
5525
0
        if (port->need_reconfigure
5526
0
            || !hmap_contains(&dp->ports, &port->node)) {
5527
0
            dp_netdev_del_port_tx_from_pmd(pmd, tx);
5528
0
        }
5529
0
    }
5530
0
    ovs_mutex_unlock(&pmd->port_mutex);
5531
0
}
5532
5533
/* Must be called each time a port is added/removed or the cmask changes.
5534
 * This creates and destroys pmd threads, reconfigures ports, opens their
5535
 * rxqs and assigns all rxqs/txqs to pmd threads. */
5536
static void
5537
reconfigure_datapath(struct dp_netdev *dp)
5538
    OVS_REQ_RDLOCK(dp->port_rwlock)
5539
0
{
5540
0
    struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
5541
0
    struct dp_netdev_pmd_thread *pmd;
5542
0
    struct dp_netdev_port *port;
5543
0
    int wanted_txqs;
5544
5545
0
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
5546
5547
    /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
5548
     * on the system and the user configuration. */
5549
0
    reconfigure_pmd_threads(dp);
5550
5551
0
    wanted_txqs = cmap_count(&dp->poll_threads);
5552
5553
    /* The number of pmd threads might have changed, or a port can be new:
5554
     * adjust the txqs. */
5555
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5556
0
        netdev_set_tx_multiq(port->netdev, wanted_txqs);
5557
0
    }
5558
5559
    /* Step 2: Remove from the pmd threads ports that have been removed or
5560
     * need reconfiguration. */
5561
5562
    /* Check for all the ports that need reconfiguration.  We cache this in
5563
     * 'port->need_reconfigure', because netdev_is_reconf_required() can
5564
     * change at any time.
5565
     * Also mark for reconfiguration all ports which will likely change their
5566
     * 'txq_mode' parameter.  It's required to stop using them before
5567
     * changing this setting and it's simpler to mark ports here and allow
5568
     * 'pmd_remove_stale_ports' to remove them from threads.  There will be
5569
     * no actual reconfiguration in 'port_reconfigure' because it's
5570
     * unnecessary.  */
5571
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5572
0
        if (netdev_is_reconf_required(port->netdev)
5573
0
            || ((port->txq_mode == TXQ_MODE_XPS)
5574
0
                != (netdev_n_txq(port->netdev) < wanted_txqs))
5575
0
            || ((port->txq_mode == TXQ_MODE_XPS_HASH)
5576
0
                != (port->txq_requested_mode == TXQ_REQ_MODE_HASH
5577
0
                    && netdev_n_txq(port->netdev) > 1))) {
5578
0
            port->need_reconfigure = true;
5579
0
        }
5580
0
    }
5581
5582
    /* Remove from the pmd threads all the ports that have been deleted or
5583
     * need reconfiguration. */
5584
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5585
0
        pmd_remove_stale_ports(dp, pmd);
5586
0
    }
5587
5588
    /* Reload affected pmd threads.  We must wait for the pmd threads before
5589
     * reconfiguring the ports, because a port cannot be reconfigured while
5590
     * it's being used. */
5591
0
    reload_affected_pmds(dp);
5592
5593
    /* Step 3: Reconfigure ports. */
5594
5595
    /* We only reconfigure the ports that we determined above, because they're
5596
     * not being used by any pmd thread at the moment.  If a port fails to
5597
     * reconfigure we remove it from the datapath. */
5598
0
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
5599
0
        int err;
5600
5601
0
        if (!port->need_reconfigure) {
5602
0
            continue;
5603
0
        }
5604
5605
0
        err = port_reconfigure(port);
5606
0
        if (err) {
5607
0
            hmap_remove(&dp->ports, &port->node);
5608
0
            seq_change(dp->port_seq);
5609
0
            port_destroy(port);
5610
0
        } else {
5611
            /* With a single queue, there is no point in using hash mode. */
5612
0
            if (port->txq_requested_mode == TXQ_REQ_MODE_HASH &&
5613
0
                netdev_n_txq(port->netdev) > 1) {
5614
0
                port->txq_mode = TXQ_MODE_XPS_HASH;
5615
0
            } else if (netdev_n_txq(port->netdev) < wanted_txqs) {
5616
0
                port->txq_mode = TXQ_MODE_XPS;
5617
0
            } else {
5618
0
                port->txq_mode = TXQ_MODE_STATIC;
5619
0
            }
5620
0
        }
5621
0
    }
5622
5623
    /* Step 4: Compute new rxq scheduling.  We don't touch the pmd threads
5624
     * for now, we just update the 'pmd' pointer in each rxq to point to the
5625
     * wanted thread according to the scheduling policy. */
5626
5627
    /* Reset all the pmd threads to non isolated. */
5628
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5629
0
        pmd->isolated = false;
5630
0
    }
5631
5632
    /* Reset all the queues to unassigned */
5633
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5634
0
        for (int i = 0; i < port->n_rxq; i++) {
5635
0
            port->rxqs[i].pmd = NULL;
5636
0
        }
5637
0
    }
5638
0
    rxq_scheduling(dp);
5639
5640
    /* Step 5: Remove queues not compliant with new scheduling. */
5641
5642
    /* Count all the threads that will have at least one queue to poll. */
5643
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5644
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
5645
0
            struct dp_netdev_rxq *q = &port->rxqs[qid];
5646
5647
0
            if (q->pmd) {
5648
0
                hmapx_add(&busy_threads, q->pmd);
5649
0
            }
5650
0
        }
5651
0
    }
5652
5653
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5654
0
        struct rxq_poll *poll;
5655
5656
0
        ovs_mutex_lock(&pmd->port_mutex);
5657
0
        HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
5658
0
            if (poll->rxq->pmd != pmd) {
5659
0
                dp_netdev_del_rxq_from_pmd(pmd, poll);
5660
5661
                /* This pmd might sleep after this step if it has no rxq
5662
                 * remaining. Tell it to busy wait for new assignment if it
5663
                 * has at least one scheduled queue. */
5664
0
                if (hmap_count(&pmd->poll_list) == 0 &&
5665
0
                    hmapx_contains(&busy_threads, pmd)) {
5666
0
                    atomic_store_relaxed(&pmd->wait_for_reload, true);
5667
0
                }
5668
0
            }
5669
0
        }
5670
0
        ovs_mutex_unlock(&pmd->port_mutex);
5671
0
    }
5672
5673
0
    hmapx_destroy(&busy_threads);
5674
5675
    /* Reload affected pmd threads.  We must wait for the pmd threads to remove
5676
     * the old queues before readding them, otherwise a queue can be polled by
5677
     * two threads at the same time. */
5678
0
    reload_affected_pmds(dp);
5679
5680
    /* Step 6: Add queues from scheduling, if they're not there already. */
5681
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5682
0
        if (!netdev_is_pmd(port->netdev)) {
5683
0
            continue;
5684
0
        }
5685
5686
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
5687
0
            struct dp_netdev_rxq *q = &port->rxqs[qid];
5688
5689
0
            if (q->pmd) {
5690
0
                ovs_mutex_lock(&q->pmd->port_mutex);
5691
0
                dp_netdev_add_rxq_to_pmd(q->pmd, q);
5692
0
                ovs_mutex_unlock(&q->pmd->port_mutex);
5693
0
            }
5694
0
        }
5695
0
    }
5696
5697
    /* Add every port and bond to the tx port and bond caches of
5698
     * every pmd thread, if it's not there already and if this pmd
5699
     * has at least one rxq to poll.
5700
     */
5701
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5702
0
        ovs_mutex_lock(&pmd->port_mutex);
5703
0
        if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
5704
0
            struct tx_bond *bond;
5705
5706
0
            HMAP_FOR_EACH (port, node, &dp->ports) {
5707
0
                dp_netdev_add_port_tx_to_pmd(pmd, port);
5708
0
            }
5709
5710
0
            CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
5711
0
                dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
5712
0
            }
5713
0
        }
5714
0
        ovs_mutex_unlock(&pmd->port_mutex);
5715
0
    }
5716
5717
    /* Reload affected pmd threads. */
5718
0
    reload_affected_pmds(dp);
5719
5720
    /* PMD ALB will need to recheck if dry run needed. */
5721
0
    dp->pmd_alb.recheck_config = true;
5722
0
}
5723
5724
/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
5725
static bool
5726
ports_require_restart(const struct dp_netdev *dp)
5727
    OVS_REQ_RDLOCK(dp->port_rwlock)
5728
0
{
5729
0
    struct dp_netdev_port *port;
5730
5731
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5732
0
        if (netdev_is_reconf_required(port->netdev)) {
5733
0
            return true;
5734
0
        }
5735
0
    }
5736
5737
0
    return false;
5738
0
}
5739
5740
/* Calculates variance in the values stored in array 'a'. 'n' is the number
5741
 * of elements in array to be considered for calculating variance.
5742
 * Usage example: data array 'a' contains the processing load of each pmd and
5743
 * 'n' is the number of PMDs. It returns the variance in processing load of
5744
 * PMDs. */
5745
static uint64_t
variance(uint64_t a[], int n)
{
    uint64_t total = 0;
    uint64_t sq_sum = 0;
    uint64_t avg;

    if (!n) {
        return 0;
    }

    for (int i = 0; i < n; i++) {
        total += a[i];
    }

    /* A zero sum leaves nothing to compute: the result is 0. */
    if (!total) {
        return 0;
    }

    /* Integer mean; the fractional part is truncated. */
    avg = total / n;

    /* Accumulate the squared distances to the mean.  For elements below the
     * mean the unsigned subtraction wraps around, but the square is still
     * the intended value modulo 2^64. */
    for (int i = 0; i < n; i++) {
        uint64_t delta = a[i] - avg;

        sq_sum += delta * delta;
    }

    return sq_sum / n;
}
5771
5772
/* Return true if needs to revalidate datapath flows. */
5773
static bool
5774
dpif_netdev_run(struct dpif *dpif)
5775
0
{
5776
0
    struct dp_netdev_port *port;
5777
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
5778
0
    struct dp_netdev_pmd_thread *non_pmd;
5779
0
    uint64_t new_tnl_seq;
5780
0
    bool need_to_flush = true;
5781
0
    bool pmd_rebalance = false;
5782
0
    long long int now = time_msec();
5783
0
    struct dp_netdev_pmd_thread *pmd;
5784
5785
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
5786
0
    non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5787
0
    if (non_pmd) {
5788
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
5789
5790
0
        atomic_read_relaxed(&dp->smc_enable_db, &non_pmd->ctx.smc_enable_db);
5791
5792
0
        HMAP_FOR_EACH (port, node, &dp->ports) {
5793
0
            if (!netdev_is_pmd(port->netdev)) {
5794
0
                int i;
5795
5796
0
                if (port->emc_enabled) {
5797
0
                    atomic_read_relaxed(&dp->emc_insert_min,
5798
0
                                        &non_pmd->ctx.emc_insert_min);
5799
0
                } else {
5800
0
                    non_pmd->ctx.emc_insert_min = 0;
5801
0
                }
5802
5803
0
                for (i = 0; i < port->n_rxq; i++) {
5804
5805
0
                    if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5806
0
                        continue;
5807
0
                    }
5808
5809
0
                    if (dp_netdev_process_rxq_port(non_pmd,
5810
0
                                                   &port->rxqs[i],
5811
0
                                                   port->port_no)) {
5812
0
                        need_to_flush = false;
5813
0
                    }
5814
0
                }
5815
0
            }
5816
0
        }
5817
0
        if (need_to_flush) {
5818
            /* We didn't receive anything in the process loop.
5819
             * Check if we need to send something.
5820
             * There was no time updates on current iteration. */
5821
0
            pmd_thread_ctx_time_update(non_pmd);
5822
0
            dp_netdev_pmd_flush_output_packets(non_pmd, false);
5823
0
        }
5824
5825
0
        dpif_netdev_xps_revalidate_pmd(non_pmd, false);
5826
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
5827
5828
0
        dp_netdev_pmd_unref(non_pmd);
5829
0
    }
5830
5831
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5832
0
    if (pmd_alb->is_enabled) {
5833
0
        if (!pmd_alb->rebalance_poll_timer) {
5834
0
            pmd_alb->rebalance_poll_timer = now;
5835
0
        } else if ((pmd_alb->rebalance_poll_timer +
5836
0
                   pmd_alb->rebalance_intvl) < now) {
5837
0
            pmd_alb->rebalance_poll_timer = now;
5838
0
            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5839
0
                if (atomic_count_get(&pmd->pmd_overloaded) >=
5840
0
                                    PMD_INTERVAL_MAX) {
5841
0
                    pmd_rebalance = true;
5842
0
                    break;
5843
0
                }
5844
0
            }
5845
5846
0
            if (pmd_rebalance &&
5847
0
                !dp_netdev_is_reconf_required(dp) &&
5848
0
                !ports_require_restart(dp) &&
5849
0
                pmd_rebalance_dry_run_needed(dp) &&
5850
0
                pmd_rebalance_dry_run(dp)) {
5851
0
                VLOG_INFO("PMD auto load balance dry run. "
5852
0
                          "Requesting datapath reconfigure.");
5853
0
                dp_netdev_request_reconfigure(dp);
5854
0
            }
5855
0
        }
5856
0
    }
5857
5858
0
    if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
5859
0
        reconfigure_datapath(dp);
5860
0
    }
5861
0
    ovs_rwlock_unlock(&dp->port_rwlock);
5862
5863
0
    tnl_neigh_cache_run();
5864
0
    tnl_port_map_run();
5865
0
    new_tnl_seq = seq_read(tnl_conf_seq);
5866
5867
0
    if (dp->last_tnl_conf_seq != new_tnl_seq) {
5868
0
        dp->last_tnl_conf_seq = new_tnl_seq;
5869
0
        return true;
5870
0
    }
5871
0
    return false;
5872
0
}
5873
5874
static void
5875
dpif_netdev_wait(struct dpif *dpif)
5876
0
{
5877
0
    struct dp_netdev_port *port;
5878
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
5879
5880
0
    ovs_mutex_lock(&dp_netdev_mutex);
5881
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
5882
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5883
0
        netdev_wait_reconf_required(port->netdev);
5884
0
        if (!netdev_is_pmd(port->netdev)) {
5885
0
            int i;
5886
5887
0
            for (i = 0; i < port->n_rxq; i++) {
5888
0
                netdev_rxq_wait(port->rxqs[i].rx);
5889
0
            }
5890
0
        }
5891
0
    }
5892
0
    ovs_rwlock_unlock(&dp->port_rwlock);
5893
0
    ovs_mutex_unlock(&dp_netdev_mutex);
5894
0
    seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
5895
0
}
5896
5897
static void
5898
pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5899
0
{
5900
0
    struct tx_port *tx_port_cached;
5901
5902
    /* Flush all the queued packets. */
5903
0
    dp_netdev_pmd_flush_output_packets(pmd, true);
5904
    /* Free all used tx queue ids. */
5905
0
    dpif_netdev_xps_revalidate_pmd(pmd, true);
5906
5907
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5908
0
        free(tx_port_cached->txq_pkts);
5909
0
        free(tx_port_cached);
5910
0
    }
5911
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
5912
0
        free(tx_port_cached->txq_pkts);
5913
0
        free(tx_port_cached);
5914
0
    }
5915
0
}
5916
5917
/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
5918
 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
5919
 * device, otherwise to 'pmd->send_port_cache' if the port has at least
5920
 * one txq. */
5921
static void
5922
pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5923
    OVS_REQUIRES(pmd->port_mutex)
5924
0
{
5925
0
    struct tx_port *tx_port, *tx_port_cached;
5926
5927
0
    pmd_free_cached_ports(pmd);
5928
0
    hmap_shrink(&pmd->send_port_cache);
5929
0
    hmap_shrink(&pmd->tnl_port_cache);
5930
5931
0
    HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
5932
0
        int n_txq = netdev_n_txq(tx_port->port->netdev);
5933
0
        struct dp_packet_batch *txq_pkts_cached;
5934
5935
0
        if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5936
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5937
0
            if (tx_port->txq_pkts) {
5938
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
5939
0
                                          n_txq * sizeof *tx_port->txq_pkts);
5940
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
5941
0
            }
5942
0
            hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5943
0
                        hash_port_no(tx_port_cached->port->port_no));
5944
0
        }
5945
5946
0
        if (n_txq) {
5947
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5948
0
            if (tx_port->txq_pkts) {
5949
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
5950
0
                                          n_txq * sizeof *tx_port->txq_pkts);
5951
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
5952
0
            }
5953
0
            hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5954
0
                        hash_port_no(tx_port_cached->port->port_no));
5955
0
        }
5956
0
    }
5957
0
}
5958
5959
static void
5960
pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5961
0
{
5962
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5963
0
    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5964
0
        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5965
0
                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
5966
0
    }
5967
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5968
5969
0
    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5970
0
             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5971
0
}
5972
5973
static void
5974
pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5975
0
{
5976
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5977
0
    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5978
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5979
0
}
5980
5981
static int
5982
pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
5983
                          struct polled_queue **ppoll_list)
5984
0
{
5985
0
    struct polled_queue *poll_list = *ppoll_list;
5986
0
    struct rxq_poll *poll;
5987
0
    int i;
5988
5989
0
    ovs_mutex_lock(&pmd->port_mutex);
5990
0
    poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5991
0
                                    * sizeof *poll_list);
5992
5993
0
    i = 0;
5994
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5995
0
        poll_list[i].rxq = poll->rxq;
5996
0
        poll_list[i].port_no = poll->rxq->port->port_no;
5997
0
        poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
5998
0
        poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5999
0
        poll_list[i].change_seq =
6000
0
                     netdev_get_change_seq(poll->rxq->port->netdev);
6001
0
        i++;
6002
0
    }
6003
6004
0
    pmd_load_cached_ports(pmd);
6005
6006
0
    ovs_mutex_unlock(&pmd->port_mutex);
6007
6008
0
    *ppoll_list = poll_list;
6009
0
    return i;
6010
0
}
6011
6012
static void *
6013
pmd_thread_main(void *f_)
6014
0
{
6015
0
    struct dp_netdev_pmd_thread *pmd = f_;
6016
0
    struct pmd_perf_stats *s = &pmd->perf_stats;
6017
0
    unsigned int lc = 0;
6018
0
    struct polled_queue *poll_list;
6019
0
    bool wait_for_reload = false;
6020
0
    bool dpdk_attached;
6021
0
    bool reload_tx_qid;
6022
0
    bool exiting;
6023
0
    bool reload;
6024
0
    int poll_cnt;
6025
0
    int i;
6026
0
    int process_packets = 0;
6027
0
    uint64_t sleep_time = 0;
6028
6029
0
    poll_list = NULL;
6030
6031
    /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
6032
0
    ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
6033
0
    ovs_numa_thread_setaffinity_core(pmd->core_id);
6034
0
    dpdk_attached = dpdk_attach_thread(pmd->core_id);
6035
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6036
0
    dfc_cache_init(&pmd->flow_cache);
6037
0
    pmd_alloc_static_tx_qid(pmd);
6038
0
    set_timer_resolution(PMD_TIMER_RES_NS);
6039
6040
0
reload:
6041
0
    atomic_count_init(&pmd->pmd_overloaded, 0);
6042
6043
0
    pmd->intrvl_tsc_prev = 0;
6044
0
    atomic_store_relaxed(&pmd->intrvl_cycles, 0);
6045
6046
0
    if (!dpdk_attached) {
6047
0
        dpdk_attached = dpdk_attach_thread(pmd->core_id);
6048
0
    }
6049
6050
    /* List port/core affinity */
6051
0
    for (i = 0; i < poll_cnt; i++) {
6052
0
       VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
6053
0
                pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
6054
0
                netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
6055
       /* Reset the rxq current cycles counter. */
6056
0
       dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
6057
0
       for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
6058
0
           dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0);
6059
0
       }
6060
0
    }
6061
6062
0
    if (!poll_cnt) {
6063
0
        if (wait_for_reload) {
6064
            /* Don't sleep, control thread will ask for a reload shortly. */
6065
0
            do {
6066
0
                atomic_read_explicit(&pmd->reload, &reload,
6067
0
                                     memory_order_acquire);
6068
0
            } while (!reload);
6069
0
        } else {
6070
0
            while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
6071
0
                seq_wait(pmd->reload_seq, pmd->last_reload_seq);
6072
0
                poll_block();
6073
0
            }
6074
0
        }
6075
0
    }
6076
6077
0
    for (i = 0; i < PMD_INTERVAL_MAX; i++) {
6078
0
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0);
6079
0
    }
6080
0
    atomic_count_set(&pmd->intrvl_idx, 0);
6081
0
    cycles_counter_update(s);
6082
6083
0
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6084
6085
    /* Protect pmd stats from external clearing while polling. */
6086
0
    ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
6087
0
    for (;;) {
6088
0
        uint64_t rx_packets = 0, tx_packets = 0;
6089
0
        uint64_t time_slept = 0;
6090
0
        uint64_t max_sleep;
6091
6092
0
        pmd_perf_start_iteration(s);
6093
6094
0
        atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db);
6095
0
        atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
6096
6097
0
        for (i = 0; i < poll_cnt; i++) {
6098
6099
0
            if (!poll_list[i].rxq_enabled) {
6100
0
                continue;
6101
0
            }
6102
6103
0
            if (poll_list[i].emc_enabled) {
6104
0
                atomic_read_relaxed(&pmd->dp->emc_insert_min,
6105
0
                                    &pmd->ctx.emc_insert_min);
6106
0
            } else {
6107
0
                pmd->ctx.emc_insert_min = 0;
6108
0
            }
6109
6110
0
            process_packets =
6111
0
                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
6112
0
                                           poll_list[i].port_no);
6113
0
            rx_packets += process_packets;
6114
0
            if (process_packets >= PMD_SLEEP_THRESH) {
6115
0
                sleep_time = 0;
6116
0
            }
6117
0
        }
6118
6119
0
        if (!rx_packets) {
6120
            /* We didn't receive anything in the process loop.
6121
             * Check if we need to send something.
6122
             * There was no time updates on current iteration. */
6123
0
            pmd_thread_ctx_time_update(pmd);
6124
0
            tx_packets = dp_netdev_pmd_flush_output_packets(pmd,
6125
0
                                                   max_sleep && sleep_time
6126
0
                                                   ? true : false);
6127
0
        }
6128
6129
0
        if (max_sleep) {
6130
            /* Check if a sleep should happen on this iteration. */
6131
0
            if (sleep_time) {
6132
0
                struct cycle_timer sleep_timer;
6133
6134
0
                cycle_timer_start(&pmd->perf_stats, &sleep_timer);
6135
0
                xnanosleep_no_quiesce(sleep_time * 1000);
6136
0
                time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer);
6137
0
                pmd_thread_ctx_time_update(pmd);
6138
0
            }
6139
0
            if (sleep_time < max_sleep) {
6140
                /* Increase sleep time for next iteration. */
6141
0
                sleep_time += PMD_SLEEP_INC_US;
6142
0
            } else {
6143
0
                sleep_time = max_sleep;
6144
0
            }
6145
0
        } else {
6146
            /* Reset sleep time as max sleep policy may have been changed. */
6147
0
            sleep_time = 0;
6148
0
        }
6149
6150
        /* Do RCU synchronization at fixed interval.  This ensures that
6151
         * synchronization would not be delayed long even at high load of
6152
         * packet processing. */
6153
0
        if (pmd->ctx.now > pmd->next_rcu_quiesce) {
6154
0
            if (!ovsrcu_try_quiesce()) {
6155
0
                pmd->next_rcu_quiesce =
6156
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6157
0
            }
6158
0
        }
6159
6160
0
        if (lc++ > 1024) {
6161
0
            lc = 0;
6162
6163
0
            coverage_try_clear();
6164
0
            dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
6165
0
            if (!ovsrcu_try_quiesce()) {
6166
0
                emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
6167
0
                pmd->next_rcu_quiesce =
6168
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6169
0
            }
6170
6171
0
            for (i = 0; i < poll_cnt; i++) {
6172
0
                uint64_t current_seq =
6173
0
                         netdev_get_change_seq(poll_list[i].rxq->port->netdev);
6174
0
                if (poll_list[i].change_seq != current_seq) {
6175
0
                    poll_list[i].change_seq = current_seq;
6176
0
                    poll_list[i].rxq_enabled =
6177
0
                                 netdev_rxq_enabled(poll_list[i].rxq->rx);
6178
0
                }
6179
0
            }
6180
0
        }
6181
6182
0
        atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
6183
0
        if (OVS_UNLIKELY(reload)) {
6184
0
            break;
6185
0
        }
6186
6187
0
        pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept,
6188
0
                               pmd_perf_metrics_enabled(pmd));
6189
0
    }
6190
0
    ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
6191
6192
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6193
0
    atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
6194
0
    atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
6195
0
    atomic_read_relaxed(&pmd->exit, &exiting);
6196
    /* Signal here to make sure the pmd finishes
6197
     * reloading the updated configuration. */
6198
0
    dp_netdev_pmd_reload_done(pmd);
6199
6200
0
    if (reload_tx_qid) {
6201
0
        pmd_free_static_tx_qid(pmd);
6202
0
        pmd_alloc_static_tx_qid(pmd);
6203
0
    }
6204
6205
0
    if (!exiting) {
6206
0
        goto reload;
6207
0
    }
6208
6209
0
    pmd_free_static_tx_qid(pmd);
6210
0
    dfc_cache_uninit(&pmd->flow_cache);
6211
0
    free(poll_list);
6212
0
    pmd_free_cached_ports(pmd);
6213
0
    if (dpdk_attached) {
6214
0
        dpdk_detach_thread();
6215
0
    }
6216
0
    return NULL;
6217
0
}
6218
6219
static void
6220
dp_netdev_disable_upcall(struct dp_netdev *dp)
6221
    OVS_ACQUIRES(dp->upcall_rwlock)
6222
0
{
6223
0
    fat_rwlock_wrlock(&dp->upcall_rwlock);
6224
0
}
6225
6226

6227
/* Meters */
6228
static void
6229
dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
6230
                               struct ofputil_meter_features *features)
6231
0
{
6232
0
    features->max_meters = MAX_METERS;
6233
0
    features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
6234
0
    features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
6235
0
    features->max_bands = MAX_BANDS;
6236
0
    features->max_color = 0;
6237
0
}
6238
6239
/* Tries to atomically add 'n' to 'value' in terms of saturation arithmetic,
6240
 * i.e., if the result will be larger than 'max_value', will store 'max_value'
6241
 * instead. */
6242
static void
6243
atomic_sat_add(atomic_uint64_t *value, uint64_t n, uint64_t max_value)
6244
0
{
6245
0
    uint64_t current, new_value;
6246
6247
0
    atomic_read_relaxed(value, &current);
6248
0
    do {
6249
0
        new_value = current + n;
6250
0
        new_value = MIN(new_value, max_value);
6251
0
    } while (!atomic_compare_exchange_weak_relaxed(value, &current,
6252
0
                                                   new_value));
6253
0
}
6254
6255
/* Tries to atomically subtract 'n' from 'value'.  Does not perform the
6256
 * operation and returns 'false' if the result will be less than 'min_value'.
6257
 * Otherwise, stores the result and returns 'true'. */
6258
static bool
6259
atomic_bound_sub(atomic_uint64_t *value, uint64_t n, uint64_t min_value)
6260
0
{
6261
0
    uint64_t current;
6262
6263
0
    atomic_read_relaxed(value, &current);
6264
0
    do {
6265
0
        if (current < min_value + n) {
6266
0
            return false;
6267
0
        }
6268
0
    } while (!atomic_compare_exchange_weak_relaxed(value, &current,
6269
0
                                                   current - n));
6270
0
    return true;
6271
0
}
6272
6273
/* Applies the meter identified by 'meter_id' to 'packets_'.  Packets
6274
 * that exceed a band are dropped in-place. */
6275
static void
6276
dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
6277
                    uint32_t meter_id, long long int now_ms)
6278
0
{
6279
0
    const size_t cnt = dp_packet_batch_size(packets_);
6280
0
    uint32_t exceeded_rate[NETDEV_MAX_BURST];
6281
0
    uint32_t exceeded_band[NETDEV_MAX_BURST];
6282
0
    uint64_t bytes, volume, meter_used, old;
6283
0
    uint64_t band_packets[MAX_BANDS];
6284
0
    uint64_t band_bytes[MAX_BANDS];
6285
0
    struct dp_meter_band *band;
6286
0
    struct dp_packet *packet;
6287
0
    struct dp_meter *meter;
6288
0
    bool exceeded = false;
6289
6290
0
    if (meter_id >= MAX_METERS) {
6291
0
        return;
6292
0
    }
6293
6294
0
    meter = dp_meter_lookup(&dp->meters, meter_id);
6295
0
    if (!meter) {
6296
0
        return;
6297
0
    }
6298
6299
    /* Initialize as negative values. */
6300
0
    memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
6301
    /* Initialize as zeroes. */
6302
0
    memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
6303
6304
0
    atomic_read_relaxed(&meter->used, &meter_used);
6305
0
    do {
6306
0
        if (meter_used >= now_ms) {
6307
            /* The '>' condition means that we have several threads hitting the
6308
             * same meter, and the other one already advanced the time. */
6309
0
            meter_used = now_ms;
6310
0
            break;
6311
0
        }
6312
0
    } while (!atomic_compare_exchange_weak_relaxed(&meter->used,
6313
0
                                                   &meter_used, now_ms));
6314
6315
    /* Refill all buckets right away, since other threads may use them. */
6316
0
    if (meter_used < now_ms) {
6317
        /* All packets will hit the meter at the same time. */
6318
0
        uint64_t delta_t = now_ms - meter_used;
6319
6320
        /* Make sure delta_t will not be too large, so that bucket will not
6321
         * wrap around below. */
6322
0
        delta_t = MIN(delta_t, meter->max_delta_t);
6323
6324
0
        for (int m = 0; m < meter->n_bands; m++) {
6325
0
            band = &meter->bands[m];
6326
            /* Update band's bucket.  We can't just use atomic add here,
6327
             * because we should never add above the max capacity. */
6328
0
            atomic_sat_add(&band->bucket, delta_t * band->rate,
6329
0
                           band->burst_size * 1000ULL);
6330
0
        }
6331
0
    }
6332
6333
    /* Update meter stats. */
6334
0
    atomic_add_relaxed(&meter->packet_count, cnt, &old);
6335
0
    bytes = 0;
6336
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6337
0
        bytes += dp_packet_size(packet);
6338
0
    }
6339
0
    atomic_add_relaxed(&meter->byte_count, bytes, &old);
6340
6341
    /* Meters can operate in terms of packets per second or kilobits per
6342
     * second. */
6343
0
    if (meter->flags & OFPMF13_PKTPS) {
6344
        /* Rate in packets/second, bucket 1/1000 packets.
6345
         * msec * packets/sec = 1/1000 packets. */
6346
0
        volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
6347
0
    } else {
6348
        /* Rate in kbps, bucket in bits.
6349
         * msec * kbps = bits */
6350
0
        volume = bytes * 8;
6351
0
    }
6352
6353
    /* Find the band hit with the highest rate for each packet (if any). */
6354
0
    for (int m = 0; m < meter->n_bands; m++) {
6355
0
        band = &meter->bands[m];
6356
6357
        /* Drain the bucket for all the packets, if possible. */
6358
0
        if (atomic_bound_sub(&band->bucket, volume, 0)) {
6359
0
            continue;
6360
0
        }
6361
6362
        /* Band limit hit, must process packet-by-packet. */
6363
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6364
0
            uint64_t packet_volume = (meter->flags & OFPMF13_PKTPS)
6365
0
                                     ? 1000 : (dp_packet_size(packet) * 8);
6366
6367
0
            if (!atomic_bound_sub(&band->bucket, packet_volume, 0)) {
6368
                /* Update the exceeding band for the exceeding packet.
6369
                 * Only one band will be fired by a packet, and that can
6370
                 * be different for each packet. */
6371
0
                if (band->rate > exceeded_rate[i]) {
6372
0
                    exceeded_rate[i] = band->rate;
6373
0
                    exceeded_band[i] = m;
6374
0
                    exceeded = true;
6375
0
                }
6376
0
            }
6377
0
        }
6378
0
    }
6379
6380
    /* No need to iterate over packets if there are no drops. */
6381
0
    if (!exceeded) {
6382
0
        return;
6383
0
    }
6384
6385
    /* Fire the highest rate band exceeded by each packet, and drop
6386
     * packets if needed. */
6387
6388
0
    memset(band_packets, 0, sizeof band_packets);
6389
0
    memset(band_bytes,   0, sizeof band_bytes);
6390
6391
0
    size_t j;
6392
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
6393
0
        uint32_t m = exceeded_band[j];
6394
6395
0
        if (m != UINT32_MAX) {
6396
            /* Meter drop packet. */
6397
0
            band_packets[m]++;
6398
0
            band_bytes[m] += dp_packet_size(packet);
6399
0
            dp_packet_delete(packet);
6400
0
        } else {
6401
            /* Meter accepts packet. */
6402
0
            dp_packet_batch_refill(packets_, packet, j);
6403
0
        }
6404
0
    }
6405
6406
0
    for (int m = 0; m < meter->n_bands; m++) {
6407
0
        if (!band_packets[m]) {
6408
0
            continue;
6409
0
        }
6410
0
        band = &meter->bands[m];
6411
0
        atomic_add_relaxed(&band->packet_count, band_packets[m], &old);
6412
0
        atomic_add_relaxed(&band->byte_count,   band_bytes[m],   &old);
6413
0
        COVERAGE_ADD(datapath_drop_meter, band_packets[m]);
6414
0
    }
6415
0
}
6416
6417
/* Meter set/get/del processing is still single-threaded. */
6418
static int
6419
dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
6420
                      struct ofputil_meter_config *config)
6421
0
{
6422
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6423
0
    uint32_t mid = meter_id.uint32;
6424
0
    struct dp_meter *meter;
6425
0
    int i;
6426
6427
0
    if (mid >= MAX_METERS) {
6428
0
        return EFBIG; /* Meter_id out of range. */
6429
0
    }
6430
6431
0
    if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
6432
0
        return EBADF; /* Unsupported flags set */
6433
0
    }
6434
6435
0
    if (config->n_bands > MAX_BANDS) {
6436
0
        return EINVAL;
6437
0
    }
6438
6439
0
    for (i = 0; i < config->n_bands; ++i) {
6440
0
        switch (config->bands[i].type) {
6441
0
        case OFPMBT13_DROP:
6442
0
            break;
6443
0
        default:
6444
0
            return ENODEV; /* Unsupported band type */
6445
0
        }
6446
0
    }
6447
6448
    /* Allocate meter */
6449
0
    meter = xzalloc(sizeof *meter
6450
0
                    + config->n_bands * sizeof(struct dp_meter_band));
6451
6452
0
    meter->flags = config->flags;
6453
0
    meter->n_bands = config->n_bands;
6454
0
    meter->max_delta_t = 0;
6455
0
    meter->id = mid;
6456
0
    atomic_init(&meter->used, time_msec());
6457
6458
    /* set up bands */
6459
0
    for (i = 0; i < config->n_bands; ++i) {
6460
0
        uint32_t band_max_delta_t;
6461
0
        uint64_t bucket_size;
6462
6463
        /* Set burst size to a workable value if none specified. */
6464
0
        if (config->bands[i].burst_size == 0) {
6465
0
            config->bands[i].burst_size = config->bands[i].rate;
6466
0
        }
6467
6468
0
        meter->bands[i].rate = config->bands[i].rate;
6469
0
        meter->bands[i].burst_size = config->bands[i].burst_size;
6470
        /* Start with a full bucket. */
6471
0
        bucket_size = meter->bands[i].burst_size * 1000ULL;
6472
0
        atomic_init(&meter->bands[i].bucket, bucket_size);
6473
6474
        /* Figure out max delta_t that is enough to fill any bucket. */
6475
0
        band_max_delta_t = bucket_size / meter->bands[i].rate;
6476
0
        if (band_max_delta_t > meter->max_delta_t) {
6477
0
            meter->max_delta_t = band_max_delta_t;
6478
0
        }
6479
0
    }
6480
6481
0
    ovs_mutex_lock(&dp->meters_lock);
6482
6483
0
    dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any. */
6484
0
    dp_meter_attach(&dp->meters, meter);
6485
6486
0
    ovs_mutex_unlock(&dp->meters_lock);
6487
6488
0
    return 0;
6489
0
}
6490
6491
static int
6492
dpif_netdev_meter_get(const struct dpif *dpif,
6493
                      ofproto_meter_id meter_id_,
6494
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
6495
0
{
6496
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6497
0
    uint32_t meter_id = meter_id_.uint32;
6498
0
    struct dp_meter *meter;
6499
6500
0
    if (meter_id >= MAX_METERS) {
6501
0
        return EFBIG;
6502
0
    }
6503
6504
0
    meter = dp_meter_lookup(&dp->meters, meter_id);
6505
0
    if (!meter) {
6506
0
        return ENOENT;
6507
0
    }
6508
6509
0
    if (stats) {
6510
0
        int i = 0;
6511
6512
0
        atomic_read_relaxed(&meter->packet_count, &stats->packet_in_count);
6513
0
        atomic_read_relaxed(&meter->byte_count, &stats->byte_in_count);
6514
6515
0
        for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
6516
0
            atomic_read_relaxed(&meter->bands[i].packet_count,
6517
0
                                &stats->bands[i].packet_count);
6518
0
            atomic_read_relaxed(&meter->bands[i].byte_count,
6519
0
                                &stats->bands[i].byte_count);
6520
0
        }
6521
0
        stats->n_bands = i;
6522
0
    }
6523
6524
0
    return 0;
6525
0
}
6526
6527
static int
6528
dpif_netdev_meter_del(struct dpif *dpif,
6529
                      ofproto_meter_id meter_id_,
6530
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
6531
0
{
6532
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6533
0
    int error;
6534
6535
0
    error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
6536
0
    if (!error) {
6537
0
        uint32_t meter_id = meter_id_.uint32;
6538
6539
0
        ovs_mutex_lock(&dp->meters_lock);
6540
0
        dp_meter_detach_free(&dp->meters, meter_id);
6541
0
        ovs_mutex_unlock(&dp->meters_lock);
6542
0
    }
6543
0
    return error;
6544
0
}
6545
6546

6547
static void
6548
dpif_netdev_disable_upcall(struct dpif *dpif)
6549
    OVS_NO_THREAD_SAFETY_ANALYSIS
6550
0
{
6551
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6552
0
    dp_netdev_disable_upcall(dp);
6553
0
}
6554
6555
static void
6556
dp_netdev_enable_upcall(struct dp_netdev *dp)
6557
    OVS_RELEASES(dp->upcall_rwlock)
6558
0
{
6559
0
    fat_rwlock_unlock(&dp->upcall_rwlock);
6560
0
}
6561
6562
static void
6563
dpif_netdev_enable_upcall(struct dpif *dpif)
6564
    OVS_NO_THREAD_SAFETY_ANALYSIS
6565
0
{
6566
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6567
0
    dp_netdev_enable_upcall(dp);
6568
0
}
6569
6570
static void
6571
dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
6572
0
{
6573
0
    atomic_store_relaxed(&pmd->wait_for_reload, false);
6574
0
    atomic_store_relaxed(&pmd->reload_tx_qid, false);
6575
0
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
6576
0
    atomic_store_explicit(&pmd->reload, false, memory_order_release);
6577
0
}
6578
6579
/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
6580
 * the pointer if succeeds, otherwise, NULL (it can return NULL even if
6581
 * 'core_id' is NON_PMD_CORE_ID).
6582
 *
6583
 * Caller must unrefs the returned reference.  */
6584
static struct dp_netdev_pmd_thread *
6585
dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
6586
0
{
6587
0
    struct dp_netdev_pmd_thread *pmd;
6588
6589
0
    CMAP_FOR_EACH_WITH_HASH (pmd, node, hash_int(core_id, 0),
6590
0
                             &dp->poll_threads) {
6591
0
        if (pmd->core_id == core_id) {
6592
0
            return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
6593
0
        }
6594
0
    }
6595
6596
0
    return NULL;
6597
0
}
6598
6599
/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
6600
static void
6601
dp_netdev_set_nonpmd(struct dp_netdev *dp)
6602
    OVS_REQ_WRLOCK(dp->port_rwlock)
6603
0
{
6604
0
    struct dp_netdev_pmd_thread *non_pmd;
6605
6606
0
    non_pmd = xzalloc(sizeof *non_pmd);
6607
0
    dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
6608
0
}
6609
6610
/* Caller must have valid pointer to 'pmd'. */
6611
static bool
6612
dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
6613
0
{
6614
0
    return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
6615
0
}
6616
6617
static void
6618
dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
6619
0
{
6620
0
    if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
6621
0
        ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
6622
0
    }
6623
0
}
6624
6625
/* Given cmap position 'pos', tries to ref the next node.  If try_ref()
6626
 * fails, keeps checking for next node until reaching the end of cmap.
6627
 *
6628
 * Caller must unrefs the returned reference. */
6629
static struct dp_netdev_pmd_thread *
6630
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
6631
0
{
6632
0
    struct dp_netdev_pmd_thread *next;
6633
6634
0
    do {
6635
0
        struct cmap_node *node;
6636
6637
0
        node = cmap_next_position(&dp->poll_threads, pos);
6638
0
        next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
6639
0
            : NULL;
6640
0
    } while (next && !dp_netdev_pmd_try_ref(next));
6641
6642
0
    return next;
6643
0
}
6644
6645
/* Configures the 'pmd' based on the input argument. */
6646
static void
6647
dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
6648
                        unsigned core_id, int numa_id)
6649
    OVS_NO_THREAD_SAFETY_ANALYSIS
6650
0
{
6651
0
    pmd->dp = dp;
6652
0
    pmd->core_id = core_id;
6653
0
    pmd->numa_id = numa_id;
6654
0
    pmd->need_reload = false;
6655
0
    pmd->n_output_batches = 0;
6656
6657
0
    ovs_refcount_init(&pmd->ref_cnt);
6658
0
    atomic_init(&pmd->exit, false);
6659
0
    pmd->reload_seq = seq_create();
6660
0
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
6661
0
    atomic_init(&pmd->reload, false);
6662
0
    ovs_mutex_init(&pmd->flow_mutex);
6663
0
    ovs_mutex_init(&pmd->port_mutex);
6664
0
    ovs_mutex_init(&pmd->bond_mutex);
6665
0
    cmap_init(&pmd->flow_table);
6666
0
    cmap_init(&pmd->classifiers);
6667
0
    cmap_init(&pmd->simple_match_table);
6668
0
    ccmap_init(&pmd->n_flows);
6669
0
    ccmap_init(&pmd->n_simple_flows);
6670
0
    pmd->ctx.last_rxq = NULL;
6671
0
    pmd_thread_ctx_time_update(pmd);
6672
0
    pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6673
0
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6674
0
    pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
6675
0
    pmd->busy_cycles_intrvl = xzalloc(PMD_INTERVAL_MAX *
6676
0
                                      sizeof *pmd->busy_cycles_intrvl);
6677
0
    hmap_init(&pmd->poll_list);
6678
0
    hmap_init(&pmd->tx_ports);
6679
0
    hmap_init(&pmd->tnl_port_cache);
6680
0
    hmap_init(&pmd->send_port_cache);
6681
0
    cmap_init(&pmd->tx_bonds);
6682
6683
0
    pmd_init_max_sleep(dp, pmd);
6684
6685
    /* init the 'flow_cache' since there is no
6686
     * actual thread created for NON_PMD_CORE_ID. */
6687
0
    if (core_id == NON_PMD_CORE_ID) {
6688
0
        dfc_cache_init(&pmd->flow_cache);
6689
0
        pmd_alloc_static_tx_qid(pmd);
6690
0
    }
6691
0
    pmd_perf_stats_init(&pmd->perf_stats);
6692
0
    cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6693
0
                hash_int(core_id, 0));
6694
0
}
6695
6696
static void
6697
dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6698
    OVS_NO_THREAD_SAFETY_ANALYSIS
6699
0
{
6700
0
    struct dpcls *cls;
6701
6702
0
    dp_netdev_pmd_flow_flush(pmd);
6703
0
    hmap_destroy(&pmd->send_port_cache);
6704
0
    hmap_destroy(&pmd->tnl_port_cache);
6705
0
    hmap_destroy(&pmd->tx_ports);
6706
0
    cmap_destroy(&pmd->tx_bonds);
6707
0
    hmap_destroy(&pmd->poll_list);
6708
0
    free(pmd->busy_cycles_intrvl);
6709
    /* All flows (including their dpcls_rules) have been deleted already */
6710
0
    CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6711
0
        dpcls_destroy(cls);
6712
0
        ovsrcu_postpone(free, cls);
6713
0
    }
6714
0
    cmap_destroy(&pmd->classifiers);
6715
0
    cmap_destroy(&pmd->flow_table);
6716
0
    cmap_destroy(&pmd->simple_match_table);
6717
0
    ccmap_destroy(&pmd->n_flows);
6718
0
    ccmap_destroy(&pmd->n_simple_flows);
6719
0
    ovs_mutex_destroy(&pmd->flow_mutex);
6720
0
    seq_destroy(pmd->reload_seq);
6721
0
    ovs_mutex_destroy(&pmd->port_mutex);
6722
0
    ovs_mutex_destroy(&pmd->bond_mutex);
6723
0
    free(pmd);
6724
0
}
6725
6726
/* Stops the pmd thread, removes it from the 'dp->poll_threads',
6727
 * and unrefs the struct. */
6728
static void
6729
dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6730
0
{
6731
    /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6732
     * but extra cleanup is necessary */
6733
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
6734
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
6735
0
        dfc_cache_uninit(&pmd->flow_cache);
6736
0
        pmd_free_cached_ports(pmd);
6737
0
        pmd_free_static_tx_qid(pmd);
6738
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
6739
0
    } else {
6740
0
        atomic_store_relaxed(&pmd->exit, true);
6741
0
        dp_netdev_reload_pmd__(pmd);
6742
0
        xpthread_join(pmd->thread, NULL);
6743
0
    }
6744
6745
0
    dp_netdev_pmd_clear_ports(pmd);
6746
6747
    /* Purges the 'pmd''s flows after stopping the thread, but before
6748
     * destroying the flows, so that the flow stats can be collected. */
6749
0
    if (dp->dp_purge_cb) {
6750
0
        dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6751
0
    }
6752
0
    cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
6753
0
    dp_netdev_pmd_unref(pmd);
6754
0
}
6755
6756
/* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
6757
 * thread. */
6758
static void
6759
dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
6760
0
{
6761
0
    struct dp_netdev_pmd_thread *pmd;
6762
0
    struct dp_netdev_pmd_thread **pmd_list;
6763
0
    size_t k = 0, n_pmds;
6764
6765
0
    n_pmds = cmap_count(&dp->poll_threads);
6766
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
6767
6768
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6769
0
        if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
6770
0
            continue;
6771
0
        }
6772
        /* We cannot call dp_netdev_del_pmd(), since it alters
6773
         * 'dp->poll_threads' (while we're iterating it) and it
6774
         * might quiesce. */
6775
0
        ovs_assert(k < n_pmds);
6776
0
        pmd_list[k++] = pmd;
6777
0
    }
6778
6779
0
    for (size_t i = 0; i < k; i++) {
6780
0
        dp_netdev_del_pmd(dp, pmd_list[i]);
6781
0
    }
6782
0
    free(pmd_list);
6783
0
}
6784
6785
/* Deletes all rx queues from pmd->poll_list and all the ports from
6786
 * pmd->tx_ports. */
6787
static void
6788
dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
6789
0
{
6790
0
    struct rxq_poll *poll;
6791
0
    struct tx_port *port;
6792
0
    struct tx_bond *tx;
6793
6794
0
    ovs_mutex_lock(&pmd->port_mutex);
6795
0
    HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
6796
0
        free(poll);
6797
0
    }
6798
0
    HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6799
0
        free(port->txq_pkts);
6800
0
        free(port);
6801
0
    }
6802
0
    ovs_mutex_unlock(&pmd->port_mutex);
6803
6804
0
    ovs_mutex_lock(&pmd->bond_mutex);
6805
0
    CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
6806
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6807
0
        ovsrcu_postpone(free, tx);
6808
0
    }
6809
0
    ovs_mutex_unlock(&pmd->bond_mutex);
6810
0
}
6811
6812
/* Adds rx queue to poll_list of PMD thread, if it's not there already. */
6813
static void
6814
dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6815
                         struct dp_netdev_rxq *rxq)
6816
    OVS_REQUIRES(pmd->port_mutex)
6817
0
{
6818
0
    int qid = netdev_rxq_get_queue_id(rxq->rx);
6819
0
    uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6820
0
    struct rxq_poll *poll;
6821
6822
0
    HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6823
0
        if (poll->rxq == rxq) {
6824
            /* 'rxq' is already polled by this thread. Do nothing. */
6825
0
            return;
6826
0
        }
6827
0
    }
6828
6829
0
    poll = xmalloc(sizeof *poll);
6830
0
    poll->rxq = rxq;
6831
0
    hmap_insert(&pmd->poll_list, &poll->node, hash);
6832
6833
0
    pmd->need_reload = true;
6834
0
}
6835
6836
/* Delete 'poll' from poll_list of PMD thread. */
6837
static void
6838
dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6839
                           struct rxq_poll *poll)
6840
    OVS_REQUIRES(pmd->port_mutex)
6841
0
{
6842
0
    hmap_remove(&pmd->poll_list, &poll->node);
6843
0
    free(poll);
6844
6845
0
    pmd->need_reload = true;
6846
0
}
6847
6848
/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6849
 * changes to take effect. */
6850
static void
6851
dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6852
                             struct dp_netdev_port *port)
6853
    OVS_REQUIRES(pmd->port_mutex)
6854
0
{
6855
0
    struct tx_port *tx;
6856
6857
0
    tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6858
0
    if (tx) {
6859
        /* 'port' is already on this thread tx cache. Do nothing. */
6860
0
        return;
6861
0
    }
6862
6863
0
    tx = xzalloc(sizeof *tx);
6864
6865
0
    tx->port = port;
6866
0
    tx->qid = -1;
6867
0
    tx->flush_time = 0LL;
6868
0
    dp_packet_batch_init(&tx->output_pkts);
6869
6870
0
    if (tx->port->txq_mode == TXQ_MODE_XPS_HASH) {
6871
0
        int i, n_txq = netdev_n_txq(tx->port->netdev);
6872
6873
0
        tx->txq_pkts = xzalloc(n_txq * sizeof *tx->txq_pkts);
6874
0
        for (i = 0; i < n_txq; i++) {
6875
0
            dp_packet_batch_init(&tx->txq_pkts[i]);
6876
0
        }
6877
0
    }
6878
6879
0
    hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
6880
0
    pmd->need_reload = true;
6881
0
}
6882
6883
/* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6884
 * changes to take effect. */
6885
static void
6886
dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6887
                               struct tx_port *tx)
6888
    OVS_REQUIRES(pmd->port_mutex)
6889
0
{
6890
0
    hmap_remove(&pmd->tx_ports, &tx->node);
6891
0
    free(tx->txq_pkts);
6892
0
    free(tx);
6893
0
    pmd->need_reload = true;
6894
0
}
6895
6896
/* Add bond to the tx bond cmap of 'pmd'. */
6897
static void
6898
dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6899
                             struct tx_bond *bond, bool update)
6900
    OVS_EXCLUDED(pmd->bond_mutex)
6901
0
{
6902
0
    struct tx_bond *tx;
6903
6904
0
    ovs_mutex_lock(&pmd->bond_mutex);
6905
0
    tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
6906
6907
0
    if (tx && !update) {
6908
        /* It's not an update and the entry already exists.  Do nothing. */
6909
0
        goto unlock;
6910
0
    }
6911
6912
0
    if (tx) {
6913
0
        struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);
6914
6915
        /* Copy the stats for each bucket. */
6916
0
        for (int i = 0; i < BOND_BUCKETS; i++) {
6917
0
            uint64_t n_packets, n_bytes;
6918
6919
0
            atomic_read_relaxed(&tx->member_buckets[i].n_packets, &n_packets);
6920
0
            atomic_read_relaxed(&tx->member_buckets[i].n_bytes, &n_bytes);
6921
0
            atomic_init(&new_tx->member_buckets[i].n_packets, n_packets);
6922
0
            atomic_init(&new_tx->member_buckets[i].n_bytes, n_bytes);
6923
0
        }
6924
0
        cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
6925
0
                     hash_bond_id(bond->bond_id));
6926
0
        ovsrcu_postpone(free, tx);
6927
0
    } else {
6928
0
        tx = xmemdup(bond, sizeof *bond);
6929
0
        cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
6930
0
    }
6931
0
unlock:
6932
0
    ovs_mutex_unlock(&pmd->bond_mutex);
6933
0
}
6934
6935
/* Delete bond from the tx bond cmap of 'pmd'. */
6936
static void
6937
dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6938
                               uint32_t bond_id)
6939
    OVS_EXCLUDED(pmd->bond_mutex)
6940
0
{
6941
0
    struct tx_bond *tx;
6942
6943
0
    ovs_mutex_lock(&pmd->bond_mutex);
6944
0
    tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
6945
0
    if (tx) {
6946
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6947
0
        ovsrcu_postpone(free, tx);
6948
0
    }
6949
0
    ovs_mutex_unlock(&pmd->bond_mutex);
6950
0
}
6951

6952
/* Returns a newly allocated copy of the string "<built-in>".  The copy comes
 * from xstrdup(). */
static char *
dpif_netdev_get_datapath_version(void)
{
    char *version = xstrdup("<built-in>");

    return version;
}
6957
6958
static void
6959
dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
6960
                    uint16_t tcp_flags, long long now)
6961
0
{
6962
0
    uint16_t flags;
6963
6964
0
    atomic_store_relaxed(&netdev_flow->stats.used, now);
6965
0
    non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6966
0
    non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6967
0
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6968
0
    flags |= tcp_flags;
6969
0
    atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
6970
0
}
6971
6972
static int
6973
dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
6974
                 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
6975
                 enum dpif_upcall_type type, const struct nlattr *userdata,
6976
                 struct ofpbuf *actions, struct ofpbuf *put_actions)
6977
0
{
6978
0
    struct dp_netdev *dp = pmd->dp;
6979
6980
0
    if (OVS_UNLIKELY(!dp->upcall_cb)) {
6981
0
        return ENODEV;
6982
0
    }
6983
6984
0
    if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6985
0
        struct ds ds = DS_EMPTY_INITIALIZER;
6986
0
        char *packet_str;
6987
0
        struct ofpbuf key;
6988
0
        struct odp_flow_key_parms odp_parms = {
6989
0
            .flow = flow,
6990
0
            .mask = wc ? &wc->masks : NULL,
6991
0
            .support = dp_netdev_support,
6992
0
        };
6993
6994
0
        ofpbuf_init(&key, 0);
6995
0
        odp_flow_key_from_flow(&odp_parms, &key);
6996
0
        packet_str = ofp_dp_packet_to_string(packet_);
6997
6998
0
        odp_flow_key_format(key.data, key.size, &ds);
6999
7000
0
        VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
7001
0
                 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
7002
7003
0
        ofpbuf_uninit(&key);
7004
0
        free(packet_str);
7005
7006
0
        ds_destroy(&ds);
7007
0
    }
7008
7009
0
    if (type != DPIF_UC_MISS) {
7010
0
        dp_packet_ol_send_prepare(packet_, 0);
7011
0
    }
7012
7013
0
    return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
7014
0
                         actions, wc, put_actions, dp->upcall_aux);
7015
0
}
7016
7017
static inline uint32_t
7018
dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
7019
                                const struct miniflow *mf)
7020
0
{
7021
0
    uint32_t hash, recirc_depth;
7022
7023
0
    if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
7024
0
        hash = dp_packet_get_rss_hash(packet);
7025
0
    } else {
7026
0
        hash = miniflow_hash_5tuple(mf, 0);
7027
0
        dp_packet_set_rss_hash(packet, hash);
7028
0
    }
7029
7030
    /* The RSS hash must account for the recirculation depth to avoid
7031
     * collisions in the exact match cache */
7032
0
    recirc_depth = *recirc_depth_get_unsafe();
7033
0
    if (OVS_UNLIKELY(recirc_depth)) {
7034
0
        hash = hash_finish(hash, recirc_depth);
7035
0
    }
7036
0
    return hash;
7037
0
}
7038
7039
/* Packets of one received batch that matched the same datapath flow,
 * together with the totals accumulated while they were queued. */
struct packet_batch_per_flow {
7040
    unsigned int byte_count;        /* Sum of dp_packet_size() of 'array'. */
7041
    uint16_t tcp_flags;             /* OR of the queued packets' TCP flags. */
7042
    struct dp_netdev_flow *flow;    /* Flow that the packets matched. */
7043
7044
    struct dp_packet_batch array;   /* The queued packets. */
7045
};
7046
7047
static inline void
7048
packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
7049
                             struct dp_packet *packet,
7050
                             uint16_t tcp_flags)
7051
0
{
7052
0
    batch->byte_count += dp_packet_size(packet);
7053
0
    batch->tcp_flags |= tcp_flags;
7054
0
    dp_packet_batch_add(&batch->array, packet);
7055
0
}
7056
7057
static inline void
7058
packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
7059
                           struct dp_netdev_flow *flow)
7060
0
{
7061
0
    flow->batch = batch;
7062
7063
0
    batch->flow = flow;
7064
0
    dp_packet_batch_init(&batch->array);
7065
0
    batch->byte_count = 0;
7066
0
    batch->tcp_flags = 0;
7067
0
}
7068
7069
static inline void
7070
packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
7071
                              struct dp_netdev_pmd_thread *pmd)
7072
0
{
7073
0
    struct dp_netdev_actions *actions;
7074
0
    struct dp_netdev_flow *flow = batch->flow;
7075
7076
0
    dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
7077
0
                        batch->byte_count,
7078
0
                        batch->tcp_flags, pmd->ctx.now / 1000);
7079
7080
0
    actions = dp_netdev_flow_get_actions(flow);
7081
7082
0
    dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
7083
0
                              actions->actions, actions->size);
7084
0
}
7085
7086
static inline void
7087
dp_netdev_queue_batches(struct dp_packet *pkt,
7088
                        struct dp_netdev_flow *flow, uint16_t tcp_flags,
7089
                        struct packet_batch_per_flow *batches,
7090
                        size_t *n_batches)
7091
0
{
7092
0
    struct packet_batch_per_flow *batch = flow->batch;
7093
7094
0
    if (OVS_UNLIKELY(!batch)) {
7095
0
        batch = &batches[(*n_batches)++];
7096
0
        packet_batch_per_flow_init(batch, flow);
7097
0
    }
7098
7099
0
    packet_batch_per_flow_update(batch, pkt, tcp_flags);
7100
0
}
7101
7102
static inline void
7103
packet_enqueue_to_flow_map(struct dp_packet *packet,
7104
                           struct dp_netdev_flow *flow,
7105
                           uint16_t tcp_flags,
7106
                           struct dp_packet_flow_map *flow_map,
7107
                           size_t index)
7108
0
{
7109
0
    struct dp_packet_flow_map *map = &flow_map[index];
7110
0
    map->flow = flow;
7111
0
    map->packet = packet;
7112
0
    map->tcp_flags = tcp_flags;
7113
0
}
7114
7115
/* SMC lookup function for a batch of packets.
7116
 * By batching the SMC lookups, we can use prefetch
7117
 * to hide memory access latency.
 *
 * 'keys[0..cnt-1]' are the keys of the first 'cnt' packets of 'packets_',
 * and 'index_map[i]' is the 'flow_map' slot that belongs to packet 'i'.
 *
 * On a hit the packet is recorded in its 'flow_map' slot (and offered to the
 * EMC through emc_probabilistic_insert()).  On a miss the packet is put back
 * into 'packets_', its key pointer is stored in 'missed_keys' and its slot
 * is stored in 'index_map', both compacted to the front.  The number of hits
 * is added to the PMD_STAT_SMC_HIT counter.
7118
 */
7119
static inline void
7120
smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
7121
            struct netdev_flow_key *keys,
7122
            struct netdev_flow_key **missed_keys,
7123
            struct dp_packet_batch *packets_,
7124
            const int cnt,
7125
            struct dp_packet_flow_map *flow_map,
7126
            uint8_t *index_map)
7127
0
{
7128
0
    int i;
7129
0
    struct dp_packet *packet;
7130
0
    size_t n_smc_hit = 0, n_missed = 0;
7131
0
    struct dfc_cache *cache = &pmd->flow_cache;
7132
0
    struct smc_cache *smc_cache = &cache->smc_cache;
7133
0
    const struct cmap_node *flow_node;
7134
0
    int recv_idx;
7135
0
    uint16_t tcp_flags;
7136
7137
    /* Prefetch buckets for all packets. */
7138
0
    for (i = 0; i < cnt; i++) {
7139
0
        OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
7140
0
    }
7141
7142
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
7143
0
        struct dp_netdev_flow *flow = NULL;
7144
0
        flow_node = smc_entry_get(pmd, keys[i].hash);
7145
0
        /* Set when the inner loop finds a match; a 'continue' inside that
         * loop would not advance this outer loop. */
        bool hit = false;
7146
        /* Get the original order of this packet in received batch.  It is
         * read before 'index_map' may be overwritten below. */
7147
0
        recv_idx = index_map[i];
7148
7149
0
        if (OVS_LIKELY(flow_node != NULL)) {
7150
0
            CMAP_NODE_FOR_EACH (flow, node, flow_node) {
7151
                /* Since we don't have per-port megaflow to check the port
7152
                 * number, we need to verify that the input ports match. */
7153
0
                if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
7154
0
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
7155
0
                    tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
7156
7157
                    /* SMC hit and EMC miss, so we insert into the EMC. */
7158
0
                    keys[i].len =
7159
0
                        netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
7160
0
                    emc_probabilistic_insert(pmd, &keys[i], flow);
7161
                    /* Add these packets into the flow map in the same order
7162
                     * as received.
7163
                     */
7164
0
                    packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7165
0
                                               flow_map, recv_idx);
7166
0
                    n_smc_hit++;
7167
0
                    hit = true;
7168
0
                    break;
7169
0
                }
7170
0
            }
7171
0
            if (hit) {
7172
0
                continue;
7173
0
            }
7174
0
        }
7175
7176
        /* SMC missed. Group missed packets together at
7177
         * the beginning of the 'packets' array. */
7178
0
        dp_packet_batch_refill(packets_, packet, i);
7179
7180
        /* Preserve the order of packet for flow batching. */
7181
0
        index_map[n_missed] = recv_idx;
7182
7183
        /* Put missed keys into the pointer array returned to the caller. */
7184
0
        missed_keys[n_missed++] = &keys[i];
7185
0
    }
7186
7187
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
7188
0
}
7189
7190
struct dp_netdev_flow *
7191
smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
7192
                  struct dp_packet *packet,
7193
                  struct netdev_flow_key *key)
7194
0
{
7195
0
    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
7196
7197
0
    if (OVS_LIKELY(flow_node != NULL)) {
7198
0
        struct dp_netdev_flow *flow = NULL;
7199
7200
0
        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
7201
            /* Since we dont have per-port megaflow to check the port
7202
             * number, we need to verify that the input ports match. */
7203
0
            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
7204
0
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
7205
7206
0
                return (void *) flow;
7207
0
            }
7208
0
        }
7209
0
    }
7210
7211
0
    return NULL;
7212
0
}
7213
7214
static inline int
7215
dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
7216
                  struct dp_packet *packet,
7217
                  struct dp_netdev_flow **flow)
7218
0
{
7219
0
    struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq;
7220
0
    bool post_process_api_supported;
7221
0
    void *flow_reference = NULL;
7222
0
    int err;
7223
7224
0
    atomic_read_relaxed(&rxq->port->netdev->hw_info.post_process_api_supported,
7225
0
                        &post_process_api_supported);
7226
7227
0
    if (!post_process_api_supported) {
7228
0
        *flow = NULL;
7229
0
        return 0;
7230
0
    }
7231
7232
0
    err = dpif_offload_netdev_hw_post_process(rxq->port->netdev, pmd->core_id,
7233
0
                                              packet, &flow_reference);
7234
0
    if (err && err != EOPNOTSUPP) {
7235
0
        if (err != ECANCELED) {
7236
0
            COVERAGE_INC(datapath_drop_hw_post_process);
7237
0
        } else {
7238
0
            COVERAGE_INC(datapath_drop_hw_post_process_consumed);
7239
0
        }
7240
0
        return -1;
7241
0
    }
7242
7243
0
    *flow = flow_reference;
7244
0
    return 0;
7245
0
}
7246
7247
/* Enqueues an already classified 'packet': into the per-flow 'batches' while
 * 'batch_enable' is true, otherwise into the next free slot of 'flow_map'
 * (advancing '*map_cnt'). */
static inline void
dfc_processing_enqueue_classified_packet(struct dp_packet *packet,
                                         struct dp_netdev_flow *flow,
                                         uint16_t tcp_flags,
                                         bool batch_enable,
                                         struct packet_batch_per_flow *batches,
                                         size_t *n_batches,
                                         struct dp_packet_flow_map *flow_map,
                                         size_t *map_cnt)
{
    if (OVS_UNLIKELY(!batch_enable)) {
        /* Flow batching should be performed only after fast-path processing
         * is also completed for packets with emc miss, or else it will
         * result in reordering of packets with same datapath flows. */
        const size_t slot = (*map_cnt)++;

        packet_enqueue_to_flow_map(packet, flow, tcp_flags, flow_map, slot);
        return;
    }

    dp_netdev_queue_batches(packet, flow, tcp_flags, batches, n_batches);
}
7273
7274
/* Try to process all ('cnt') the 'packets' using only the datapath flow cache
7275
 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
7276
 * miniflow is copied into 'keys' and the packet pointer is moved at the
7277
 * beginning of the 'packets' array. The pointers of missed keys are put in the
7278
 * missed_keys pointer array for future processing.
7279
 *
7280
 * The function returns the number of packets that need to be processed in the
7281
 * 'packets' array (they have been moved to the beginning of the vector).
7282
 *
7283
 * For performance reasons a caller may choose not to initialize the metadata
7284
 * in 'packets_'.  If 'md_is_valid' is false, the metadata in 'packets'
7285
 * is not valid and must be initialized by this function using 'port_no'.
7286
 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
7287
 * will be ignored.
 *
 * Per packet, the lookups are tried in this order: hardware offload
 * post-processing (only when offload is enabled and the recirculation depth
 * is 0), the simple match table (only when 'md_is_valid' is false and simple
 * match is enabled for 'port_no'), then the EMC (skipped when 'cur_min' is
 * 0).  Packets that still miss go through smc_lookup_batch() afterwards when
 * the SMC is enabled.  Packets shorter than ETH_HEADER_LEN are deleted.
 *
 * Classified packets are queued in 'batches' until the first miss; from then
 * on they are recorded in 'flow_map' so that the received order is kept.
 * '*n_flows' receives the number of 'flow_map' slots used, and
 * 'index_map[k]' the 'flow_map' slot reserved for the k-th missed packet.
7288
 */
7289
static inline size_t
7290
dfc_processing(struct dp_netdev_pmd_thread *pmd,
7291
               struct dp_packet_batch *packets_,
7292
               struct netdev_flow_key *keys,
7293
               struct netdev_flow_key **missed_keys,
7294
               struct packet_batch_per_flow batches[], size_t *n_batches,
7295
               struct dp_packet_flow_map *flow_map,
7296
               size_t *n_flows, uint8_t *index_map,
7297
               bool md_is_valid, odp_port_t port_no)
7298
0
{
7299
0
    size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0, n_simple_hit = 0;
7300
0
    const bool offload_enabled = dpif_offload_enabled();
7301
0
    const uint32_t recirc_depth = *recirc_depth_get();
7302
0
    const size_t cnt = dp_packet_batch_size(packets_);
7303
0
    struct dfc_cache *cache = &pmd->flow_cache;
7304
0
    /* Always points at the next unused slot of 'keys'. */
    struct netdev_flow_key *key = &keys[0];
7305
0
    struct dp_packet *packet;
7306
0
    size_t map_cnt = 0;
7307
0
    bool batch_enable = true;
7308
7309
0
    const bool simple_match_enabled =
7310
0
        !md_is_valid && dp_netdev_simple_match_enabled(pmd, port_no);
7311
    /* 'simple_match_table' is a full flow table.  If the flow is not there,
7312
     * upcall is required, and there is no chance to find a match in caches. */
7313
0
    const bool smc_enable_db = !simple_match_enabled && pmd->ctx.smc_enable_db;
7314
0
    const uint32_t cur_min = simple_match_enabled
7315
0
                             ? 0 : pmd->ctx.emc_insert_min;
7316
7317
0
    pmd_perf_update_counter(&pmd->perf_stats,
7318
0
                            md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
7319
0
                            cnt);
7320
0
    int i;
7321
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
7322
0
        struct dp_netdev_flow *flow = NULL;
7323
0
        uint16_t tcp_flags;
7324
7325
0
        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
7326
0
            dp_packet_delete(packet);
7327
0
            COVERAGE_INC(datapath_drop_rx_invalid_packet);
7328
0
            continue;
7329
0
        }
7330
7331
0
        if (i != cnt - 1) {
7332
0
            struct dp_packet **packets = packets_->packets;
7333
            /* Prefetch next packet data and metadata. */
7334
0
            OVS_PREFETCH(dp_packet_data(packets[i+1]));
7335
0
            pkt_metadata_prefetch_init(&packets[i+1]->md);
7336
0
        }
7337
7338
0
        if (!md_is_valid) {
7339
0
            pkt_metadata_init(&packet->md, port_no);
7340
0
        }
7341
7342
0
        if (offload_enabled && recirc_depth == 0) {
7343
0
            if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &flow))) {
7344
                /* Packet restoration failed and it was dropped, do not
7345
                 * continue processing.
7346
                 */
7347
0
                continue;
7348
0
            }
7349
0
            if (OVS_LIKELY(flow)) {
7350
0
                tcp_flags = parse_tcp_flags(packet, NULL, NULL, NULL);
7351
0
                n_phwol_hit++;
7352
0
                dfc_processing_enqueue_classified_packet(
7353
0
                        packet, flow, tcp_flags, batch_enable,
7354
0
                        batches, n_batches, flow_map, &map_cnt);
7355
0
                continue;
7356
0
            }
7357
0
        }
7358
7359
0
        if (!flow && simple_match_enabled) {
7360
0
            ovs_be16 dl_type = 0, vlan_tci = 0;
7361
0
            uint8_t nw_frag = 0;
7362
7363
0
            tcp_flags = parse_tcp_flags(packet, &dl_type, &nw_frag, &vlan_tci);
7364
0
            flow = dp_netdev_simple_match_lookup(pmd, port_no, dl_type,
7365
0
                                                 nw_frag, vlan_tci);
7366
0
            if (OVS_LIKELY(flow)) {
7367
0
                n_simple_hit++;
7368
0
                dfc_processing_enqueue_classified_packet(
7369
0
                        packet, flow, tcp_flags, batch_enable,
7370
0
                        batches, n_batches, flow_map, &map_cnt);
7371
0
                continue;
7372
0
            }
7373
0
        }
7374
7375
0
        miniflow_extract(packet, &key->mf);
7376
0
        key->len = 0; /* Not computed yet. */
7377
0
        key->hash =
7378
0
                (md_is_valid == false)
7379
0
                ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
7380
0
                : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
7381
7382
        /* If EMC is disabled skip emc_lookup */
7383
0
        flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
7384
0
        if (OVS_LIKELY(flow)) {
7385
0
            tcp_flags = miniflow_get_tcp_flags(&key->mf);
7386
0
            n_emc_hit++;
7387
0
            dfc_processing_enqueue_classified_packet(
7388
0
                    packet, flow, tcp_flags, batch_enable,
7389
0
                    batches, n_batches, flow_map, &map_cnt);
7390
0
        } else {
7391
            /* Exact match cache missed. Group missed packets together at
7392
             * the beginning of the 'packets' array. */
7393
0
            dp_packet_batch_refill(packets_, packet, i);
7394
7395
            /* Preserve the order of packet for flow batching: reserve a
             * 'flow_map' slot now and remember it in 'index_map'. */
7396
0
            index_map[n_missed] = map_cnt;
7397
0
            flow_map[map_cnt++].flow = NULL;
7398
7399
            /* 'key[n_missed]' contains the key of the current packet and it
7400
             * will be passed to SMC lookup. The next key should be extracted
7401
             * to 'keys[n_missed + 1]'.
7402
             * We also maintain a pointer array to keys missed both SMC and EMC
7403
             * which will be returned to the caller for future processing. */
7404
0
            missed_keys[n_missed] = key;
7405
0
            key = &keys[++n_missed];
7406
7407
            /* Skip batching for subsequent packets to avoid reordering. */
7408
0
            batch_enable = false;
7409
0
        }
7410
0
    }
7411
    /* Count of packets which are not flow batched. */
7412
0
    *n_flows = map_cnt;
7413
7414
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, n_phwol_hit);
7415
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_HIT,
7416
0
                            n_simple_hit);
7417
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
7418
7419
0
    if (!smc_enable_db) {
7420
0
        return dp_packet_batch_size(packets_);
7421
0
    }
7422
7423
    /* Packets that missed the EMC do a batch lookup in the SMC if enabled. */
7424
0
    smc_lookup_batch(pmd, keys, missed_keys, packets_,
7425
0
                     n_missed, flow_map, index_map);
7426
7427
0
    return dp_packet_batch_size(packets_);
7428
0
}
7429
7430
static inline int
7431
handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
7432
                     struct dp_packet *packet,
7433
                     const struct netdev_flow_key *key,
7434
                     struct ofpbuf *actions, struct ofpbuf *put_actions)
7435
0
{
7436
0
    struct ofpbuf *add_actions;
7437
0
    struct dp_packet_batch b;
7438
0
    struct match match;
7439
0
    ovs_u128 ufid;
7440
0
    int error;
7441
0
    uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
7442
0
    odp_port_t orig_in_port = packet->md.orig_in_port;
7443
7444
0
    match.tun_md.valid = false;
7445
0
    miniflow_expand(&key->mf, &match.flow);
7446
0
    memset(&match.wc, 0, sizeof match.wc);
7447
7448
0
    ofpbuf_clear(actions);
7449
0
    ofpbuf_clear(put_actions);
7450
7451
0
    odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
7452
0
    error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
7453
0
                             &ufid, DPIF_UC_MISS, NULL, actions,
7454
0
                             put_actions);
7455
0
    if (OVS_UNLIKELY(error && error != ENOSPC)) {
7456
0
        dp_packet_delete(packet);
7457
0
        COVERAGE_INC(datapath_drop_upcall_error);
7458
0
        return error;
7459
0
    }
7460
7461
    /* The Netlink encoding of datapath flow keys cannot express
7462
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
7463
     * tag is interpreted as exact match on the fact that there is no
7464
     * VLAN.  Unless we refactor a lot of code that translates between
7465
     * Netlink and struct flow representations, we have to do the same
7466
     * here.  This must be in sync with 'match' in dpif_netdev_flow_put(). */
7467
0
    if (!match.wc.masks.vlans[0].tci) {
7468
0
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
7469
0
    }
7470
7471
    /* We can't allow the packet batching in the next loop to execute
7472
     * the actions.  Otherwise, if there are any slow path actions,
7473
     * we'll send the packet up twice. */
7474
0
    dp_packet_batch_init_packet(&b, packet);
7475
0
    dp_netdev_execute_actions(pmd, &b, true, &match.flow,
7476
0
                              actions->data, actions->size);
7477
7478
0
    add_actions = put_actions->size ? put_actions : actions;
7479
0
    if (OVS_LIKELY(error != ENOSPC)) {
7480
0
        struct dp_netdev_flow *netdev_flow;
7481
7482
        /* XXX: There's a race window where a flow covering this packet
7483
         * could have already been installed since we last did the flow
7484
         * lookup before upcall.  This could be solved by moving the
7485
         * mutex lock outside the loop, but that's an awful long time
7486
         * to be locking revalidators out of making flow modifications. */
7487
0
        ovs_mutex_lock(&pmd->flow_mutex);
7488
0
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
7489
0
        if (OVS_LIKELY(!netdev_flow)) {
7490
0
            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
7491
0
                                             add_actions->data,
7492
0
                                             add_actions->size, orig_in_port);
7493
0
        }
7494
0
        ovs_mutex_unlock(&pmd->flow_mutex);
7495
0
        uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
7496
0
        smc_insert(pmd, key, hash);
7497
0
        emc_probabilistic_insert(pmd, key, netdev_flow);
7498
0
    }
7499
0
    if (pmd_perf_metrics_enabled(pmd)) {
7500
        /* Update upcall stats. */
7501
0
        cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
7502
0
        struct pmd_perf_stats *s = &pmd->perf_stats;
7503
0
        s->current.upcalls++;
7504
0
        s->current.upcall_cycles += cycles;
7505
0
        histogram_add_sample(&s->cycles_per_upcall, cycles);
7506
0
    }
7507
0
    return error;
7508
0
}
7509
7510
static inline void
7511
fast_path_processing(struct dp_netdev_pmd_thread *pmd,
7512
                     struct dp_packet_batch *packets_,
7513
                     struct netdev_flow_key **keys,
7514
                     struct dp_packet_flow_map *flow_map,
7515
                     uint8_t *index_map,
7516
                     odp_port_t in_port)
7517
0
{
7518
0
    const size_t cnt = dp_packet_batch_size(packets_);
7519
0
#ifndef __CHECKER__
7520
0
    const size_t PKT_ARRAY_SIZE = cnt;
7521
#else
7522
    /* Sparse doesn't like variable length array. */
7523
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7524
#endif
7525
0
    struct dp_packet *packet;
7526
0
    struct dpcls *cls;
7527
0
    struct dpcls_rule *rules[PKT_ARRAY_SIZE];
7528
0
    struct dp_netdev *dp = pmd->dp;
7529
0
    int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
7530
0
    int lookup_cnt = 0, add_lookup_cnt;
7531
0
    bool any_miss;
7532
7533
0
    for (size_t i = 0; i < cnt; i++) {
7534
        /* Key length is needed in all the cases, hash computed on demand. */
7535
0
        keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
7536
0
    }
7537
    /* Get the classifier for the in_port */
7538
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
7539
0
    if (OVS_LIKELY(cls)) {
7540
0
        any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
7541
0
                                rules, cnt, &lookup_cnt);
7542
0
    } else {
7543
0
        any_miss = true;
7544
0
        memset(rules, 0, sizeof(rules));
7545
0
    }
7546
0
    if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7547
0
        uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
7548
0
        struct ofpbuf actions, put_actions;
7549
7550
0
        ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
7551
0
        ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
7552
7553
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7554
0
            struct dp_netdev_flow *netdev_flow;
7555
7556
0
            if (OVS_LIKELY(rules[i])) {
7557
0
                continue;
7558
0
            }
7559
7560
            /* It's possible that an earlier slow path execution installed
7561
             * a rule covering this flow.  In this case, it's a lot cheaper
7562
             * to catch it here than execute a miss. */
7563
0
            netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
7564
0
                                                    &add_lookup_cnt);
7565
0
            if (netdev_flow) {
7566
0
                lookup_cnt += add_lookup_cnt;
7567
0
                rules[i] = &netdev_flow->cr;
7568
0
                continue;
7569
0
            }
7570
7571
0
            int error = handle_packet_upcall(pmd, packet, keys[i],
7572
0
                                             &actions, &put_actions);
7573
7574
0
            if (OVS_UNLIKELY(error)) {
7575
0
                upcall_fail_cnt++;
7576
0
            } else {
7577
0
                upcall_ok_cnt++;
7578
0
            }
7579
0
        }
7580
7581
0
        ofpbuf_uninit(&actions);
7582
0
        ofpbuf_uninit(&put_actions);
7583
0
        fat_rwlock_unlock(&dp->upcall_rwlock);
7584
0
    } else if (OVS_UNLIKELY(any_miss)) {
7585
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7586
0
            if (OVS_UNLIKELY(!rules[i])) {
7587
0
                dp_packet_delete(packet);
7588
0
                COVERAGE_INC(datapath_drop_lock_error);
7589
0
                upcall_fail_cnt++;
7590
0
            }
7591
0
        }
7592
0
    }
7593
7594
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7595
0
        struct dp_netdev_flow *flow;
7596
        /* Get the original order of this packet in received batch. */
7597
0
        int recv_idx = index_map[i];
7598
0
        uint16_t tcp_flags;
7599
7600
0
        if (OVS_UNLIKELY(!rules[i])) {
7601
0
            continue;
7602
0
        }
7603
7604
0
        flow = dp_netdev_flow_cast(rules[i]);
7605
0
        uint32_t hash =  dp_netdev_flow_hash(&flow->ufid);
7606
0
        smc_insert(pmd, keys[i], hash);
7607
7608
0
        emc_probabilistic_insert(pmd, keys[i], flow);
7609
        /* Add these packets into the flow map in the same order
7610
         * as received.
7611
         */
7612
0
        tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
7613
0
        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7614
0
                                   flow_map, recv_idx);
7615
0
    }
7616
7617
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
7618
0
                            cnt - upcall_ok_cnt - upcall_fail_cnt);
7619
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
7620
0
                            lookup_cnt);
7621
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
7622
0
                            upcall_ok_cnt);
7623
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
7624
0
                            upcall_fail_cnt);
7625
0
}
7626
7627
/* Packets enter the datapath from a port (or from recirculation) here.
7628
 *
7629
 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
7630
 * When false the metadata in 'packets' need to be initialized. */
7631
static void
7632
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
7633
                  struct dp_packet_batch *packets,
7634
                  bool md_is_valid, odp_port_t port_no)
7635
0
{
7636
0
#ifndef __CHECKER__
7637
0
    const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
7638
#else
7639
    /* Sparse doesn't like variable length array. */
7640
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7641
#endif
7642
0
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
7643
0
        struct netdev_flow_key keys[PKT_ARRAY_SIZE];
7644
0
    struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
7645
0
    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
7646
0
    size_t n_batches;
7647
0
    struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
7648
0
    uint8_t index_map[PKT_ARRAY_SIZE];
7649
0
    size_t n_flows, i;
7650
7651
0
    odp_port_t in_port;
7652
7653
0
    n_batches = 0;
7654
0
    dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
7655
0
                   flow_map, &n_flows, index_map, md_is_valid, port_no);
7656
7657
0
    if (!dp_packet_batch_is_empty(packets)) {
7658
        /* Get ingress port from first packet's metadata. */
7659
0
        in_port = packets->packets[0]->md.in_port.odp_port;
7660
0
        fast_path_processing(pmd, packets, missed_keys,
7661
0
                             flow_map, index_map, in_port);
7662
0
    }
7663
7664
    /* Batch rest of packets which are in flow map. */
7665
0
    for (i = 0; i < n_flows; i++) {
7666
0
        struct dp_packet_flow_map *map = &flow_map[i];
7667
7668
0
        if (OVS_UNLIKELY(!map->flow)) {
7669
0
            continue;
7670
0
        }
7671
0
        dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
7672
0
                                batches, &n_batches);
7673
0
     }
7674
7675
    /* All the flow batches need to be reset before any call to
7676
     * packet_batch_per_flow_execute() as it could potentially trigger
7677
     * recirculation. When a packet matching flow 'j' happens to be
7678
     * recirculated, the nested call to dp_netdev_input__() could potentially
7679
     * classify the packet as matching another flow - say 'k'. It could happen
7680
     * that in the previous call to dp_netdev_input__() that same flow 'k' had
7681
     * already its own batches[k] still waiting to be served.  So if its
7682
     * 'batch' member is not reset, the recirculated packet would be wrongly
7683
     * appended to batches[k] of the 1st call to dp_netdev_input__(). */
7684
0
    for (i = 0; i < n_batches; i++) {
7685
0
        batches[i].flow->batch = NULL;
7686
0
    }
7687
7688
0
    for (i = 0; i < n_batches; i++) {
7689
0
        packet_batch_per_flow_execute(&batches[i], pmd);
7690
0
    }
7691
0
}
7692
7693
static void
7694
dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
7695
                struct dp_packet_batch *packets,
7696
                odp_port_t port_no)
7697
0
{
7698
0
    dp_netdev_input__(pmd, packets, false, port_no);
7699
0
}
7700
7701
/* Re-injects 'packets' into the datapath via dp_netdev_input__() with
 * 'md_is_valid' true, i.e. the existing packet metadata is kept.  The port
 * number argument is passed as 0. */
static void
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet_batch *packets)
{
    const bool md_is_valid = true;

    dp_netdev_input__(pmd, packets, md_is_valid, 0);
}
7707
7708
/* Auxiliary state that dp_execute_cb() receives through its opaque 'aux_'
 * argument. */
struct dp_netdev_execute_aux {
7709
    struct dp_netdev_pmd_thread *pmd;   /* PMD thread dp_execute_cb() works on. */
7710
    const struct flow *flow;            /* NOTE(review): presumably the flow of
                                         * the packets being executed; no use
                                         * is visible nearby -- confirm. */
7711
};
7712
7713
static void
7714
dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
7715
                                 void *aux)
7716
0
{
7717
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7718
0
    dp->dp_purge_aux = aux;
7719
0
    dp->dp_purge_cb = cb;
7720
0
}
7721
7722
static void
7723
dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
7724
                               void *aux)
7725
0
{
7726
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7727
0
    dp->upcall_aux = aux;
7728
0
    dp->upcall_cb = cb;
7729
0
}
7730
7731
static void
7732
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
7733
                               bool purge)
7734
0
{
7735
0
    struct tx_port *tx;
7736
0
    struct dp_netdev_port *port;
7737
0
    long long interval;
7738
7739
0
    HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
7740
0
        if (tx->port->txq_mode != TXQ_MODE_XPS) {
7741
0
            continue;
7742
0
        }
7743
0
        interval = pmd->ctx.now - tx->last_used;
7744
0
        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
7745
0
            port = tx->port;
7746
0
            ovs_mutex_lock(&port->txq_used_mutex);
7747
0
            port->txq_used[tx->qid]--;
7748
0
            ovs_mutex_unlock(&port->txq_used_mutex);
7749
0
            tx->qid = -1;
7750
0
        }
7751
0
    }
7752
0
}
7753
7754
static int
7755
dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
7756
                           struct tx_port *tx)
7757
0
{
7758
0
    struct dp_netdev_port *port;
7759
0
    long long interval;
7760
0
    int i, min_cnt, min_qid;
7761
7762
0
    interval = pmd->ctx.now - tx->last_used;
7763
0
    tx->last_used = pmd->ctx.now;
7764
7765
0
    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
7766
0
        return tx->qid;
7767
0
    }
7768
7769
0
    port = tx->port;
7770
7771
0
    ovs_mutex_lock(&port->txq_used_mutex);
7772
0
    if (tx->qid >= 0) {
7773
0
        port->txq_used[tx->qid]--;
7774
0
        tx->qid = -1;
7775
0
    }
7776
7777
0
    min_cnt = -1;
7778
0
    min_qid = 0;
7779
0
    for (i = 0; i < netdev_n_txq(port->netdev); i++) {
7780
0
        if (port->txq_used[i] < min_cnt || min_cnt == -1) {
7781
0
            min_cnt = port->txq_used[i];
7782
0
            min_qid = i;
7783
0
        }
7784
0
    }
7785
7786
0
    port->txq_used[min_qid]++;
7787
0
    tx->qid = min_qid;
7788
7789
0
    ovs_mutex_unlock(&port->txq_used_mutex);
7790
7791
0
    dpif_netdev_xps_revalidate_pmd(pmd, false);
7792
7793
0
    VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
7794
0
             pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
7795
0
    return min_qid;
7796
0
}
7797
7798
static struct tx_port *
7799
pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7800
                          odp_port_t port_no)
7801
0
{
7802
0
    return tx_port_lookup(&pmd->tnl_port_cache, port_no);
7803
0
}
7804
7805
static struct tx_port *
7806
pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7807
                           odp_port_t port_no)
7808
0
{
7809
0
    return tx_port_lookup(&pmd->send_port_cache, port_no);
7810
0
}
7811
7812
static int
7813
push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
7814
                const struct nlattr *attr,
7815
                struct dp_packet_batch *batch)
7816
0
{
7817
0
    const struct netdev *ingress_netdev = NULL;
7818
0
    const struct ovs_action_push_tnl *data;
7819
0
    struct tx_port *tun_port;
7820
0
    int err;
7821
7822
0
    data = nl_attr_get(attr);
7823
7824
0
    tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
7825
0
    if (!tun_port) {
7826
0
        err = -EINVAL;
7827
0
        goto error;
7828
0
    }
7829
7830
0
    if (dpif_offload_enabled() && !dp_packet_batch_is_empty(batch)) {
7831
        /* To avoid multiple port lookups per batch, assume that all packets
7832
         * in the batch originate from the same flow and therefore share the
7833
         * same original input port. */
7834
0
        struct tx_port *in_port = pmd_send_port_cache_lookup(
7835
0
                                      pmd, batch->packets[0]->md.orig_in_port);
7836
0
        if (in_port) {
7837
0
            ingress_netdev = in_port->port->netdev;
7838
0
        }
7839
0
    }
7840
7841
0
    err = netdev_push_header(tun_port->port->netdev, ingress_netdev, batch,
7842
0
                             data);
7843
0
    if (!err) {
7844
0
        return 0;
7845
0
    }
7846
0
error:
7847
0
    dp_packet_delete_batch(batch, true);
7848
0
    return err;
7849
0
}
7850
7851
static void
7852
dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7853
                            struct dp_packet *packet, bool should_steal,
7854
                            struct flow *flow, ovs_u128 *ufid,
7855
                            struct ofpbuf *actions,
7856
                            const struct nlattr *userdata)
7857
0
{
7858
0
    struct dp_packet_batch b;
7859
0
    int error;
7860
7861
0
    ofpbuf_clear(actions);
7862
7863
0
    error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7864
0
                             DPIF_UC_ACTION, userdata, actions,
7865
0
                             NULL);
7866
0
    if (!error || error == ENOSPC) {
7867
0
        dp_packet_batch_init_packet(&b, packet);
7868
0
        dp_netdev_execute_actions(pmd, &b, should_steal, flow,
7869
0
                                  actions->data, actions->size);
7870
0
    } else if (should_steal) {
7871
0
        dp_packet_delete(packet);
7872
0
        COVERAGE_INC(datapath_drop_userspace_action_error);
7873
0
    }
7874
0
}
7875
7876
static bool
7877
dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
7878
                         struct dp_packet_batch *packets_,
7879
                         bool should_steal, odp_port_t port_no)
7880
0
{
7881
0
    struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
7882
0
    struct dp_packet_batch out;
7883
7884
0
    if (!OVS_LIKELY(p)) {
7885
0
        COVERAGE_ADD(datapath_drop_invalid_port,
7886
0
                     dp_packet_batch_size(packets_));
7887
0
        dp_packet_delete_batch(packets_, should_steal);
7888
0
        return false;
7889
0
    }
7890
0
    if (!should_steal) {
7891
0
        dp_packet_batch_clone(&out, packets_);
7892
0
        dp_packet_batch_reset_cutlen(packets_);
7893
0
        packets_ = &out;
7894
0
    }
7895
0
    dp_packet_batch_apply_cutlen(packets_);
7896
0
    if (dp_packet_batch_size(&p->output_pkts)
7897
0
        + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7898
        /* Flush here to avoid overflow. */
7899
0
        dp_netdev_pmd_flush_output_on_port(pmd, p);
7900
0
    }
7901
0
    if (dp_packet_batch_is_empty(&p->output_pkts)) {
7902
0
        pmd->n_output_batches++;
7903
0
    }
7904
7905
0
    struct dp_packet *packet;
7906
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7907
0
        p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7908
0
            pmd->ctx.last_rxq;
7909
0
        dp_packet_batch_add(&p->output_pkts, packet);
7910
0
    }
7911
0
    return true;
7912
0
}
7913
7914
static void
7915
dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
7916
                            struct dp_packet_batch *packets_,
7917
                            bool should_steal, uint32_t bond)
7918
0
{
7919
0
    struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
7920
0
    struct dp_packet_batch out;
7921
0
    struct dp_packet *packet;
7922
7923
0
    if (!p_bond) {
7924
0
        COVERAGE_ADD(datapath_drop_invalid_bond,
7925
0
                     dp_packet_batch_size(packets_));
7926
0
        dp_packet_delete_batch(packets_, should_steal);
7927
0
        return;
7928
0
    }
7929
0
    if (!should_steal) {
7930
0
        dp_packet_batch_clone(&out, packets_);
7931
0
        dp_packet_batch_reset_cutlen(packets_);
7932
0
        packets_ = &out;
7933
0
    }
7934
0
    dp_packet_batch_apply_cutlen(packets_);
7935
7936
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7937
        /*
7938
         * Lookup the bond-hash table using hash to get the member.
7939
         */
7940
0
        uint32_t hash = dp_packet_get_rss_hash(packet);
7941
0
        struct member_entry *s_entry
7942
0
            = &p_bond->member_buckets[hash & BOND_MASK];
7943
0
        odp_port_t bond_member = s_entry->member_id;
7944
0
        uint32_t size = dp_packet_size(packet);
7945
0
        struct dp_packet_batch output_pkt;
7946
7947
0
        dp_packet_batch_init_packet(&output_pkt, packet);
7948
0
        if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
7949
0
                                                bond_member))) {
7950
            /* Update member stats. */
7951
0
            non_atomic_ullong_add(&s_entry->n_packets, 1);
7952
0
            non_atomic_ullong_add(&s_entry->n_bytes, size);
7953
0
        }
7954
0
    }
7955
0
}
7956
7957
static void
7958
dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7959
              const struct nlattr *a, bool should_steal)
7960
    OVS_NO_THREAD_SAFETY_ANALYSIS
7961
0
{
7962
0
    struct dp_netdev_execute_aux *aux = aux_;
7963
0
    uint32_t *depth = recirc_depth_get();
7964
0
    struct dp_netdev_pmd_thread *pmd = aux->pmd;
7965
0
    struct dp_netdev *dp = pmd->dp;
7966
0
    int type = nl_attr_type(a);
7967
0
    struct tx_port *p;
7968
0
    uint32_t packet_count, packets_dropped;
7969
7970
0
    switch ((enum ovs_action_attr)type) {
7971
0
    case OVS_ACTION_ATTR_OUTPUT:
7972
0
        dp_execute_output_action(pmd, packets_, should_steal,
7973
0
                                 nl_attr_get_odp_port(a));
7974
0
        return;
7975
7976
0
    case OVS_ACTION_ATTR_LB_OUTPUT:
7977
0
        dp_execute_lb_output_action(pmd, packets_, should_steal,
7978
0
                                    nl_attr_get_u32(a));
7979
0
        return;
7980
7981
0
    case OVS_ACTION_ATTR_TUNNEL_PUSH:
7982
0
        if (should_steal) {
7983
            /* We're requested to push tunnel header, but also we need to take
7984
             * the ownership of these packets. Thus, we can avoid performing
7985
             * the action, because the caller will not use the result anyway.
7986
             * Just break to free the batch. */
7987
0
            break;
7988
0
        }
7989
0
        dp_packet_batch_apply_cutlen(packets_);
7990
0
        packet_count = dp_packet_batch_size(packets_);
7991
0
        if (push_tnl_action(pmd, a, packets_)) {
7992
0
            COVERAGE_ADD(datapath_drop_tunnel_push_error,
7993
0
                         packet_count);
7994
0
        }
7995
0
        return;
7996
7997
0
    case OVS_ACTION_ATTR_TUNNEL_POP:
7998
0
        if (*depth < MAX_RECIRC_DEPTH) {
7999
0
            struct dp_packet_batch *orig_packets_ = packets_;
8000
0
            odp_port_t portno = nl_attr_get_odp_port(a);
8001
8002
0
            p = pmd_tnl_port_cache_lookup(pmd, portno);
8003
0
            if (p) {
8004
0
                struct dp_packet_batch tnl_pkt;
8005
8006
0
                if (!should_steal) {
8007
0
                    dp_packet_batch_clone(&tnl_pkt, packets_);
8008
0
                    packets_ = &tnl_pkt;
8009
0
                    dp_packet_batch_reset_cutlen(orig_packets_);
8010
0
                }
8011
8012
0
                dp_packet_batch_apply_cutlen(packets_);
8013
8014
0
                packet_count = dp_packet_batch_size(packets_);
8015
0
                netdev_pop_header(p->port->netdev, packets_);
8016
0
                packets_dropped =
8017
0
                   packet_count - dp_packet_batch_size(packets_);
8018
0
                if (packets_dropped) {
8019
0
                    COVERAGE_ADD(datapath_drop_tunnel_pop_error,
8020
0
                                 packets_dropped);
8021
0
                }
8022
0
                if (dp_packet_batch_is_empty(packets_)) {
8023
0
                    return;
8024
0
                }
8025
8026
0
                struct dp_packet *packet;
8027
0
                DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8028
0
                    packet->md.in_port.odp_port = portno;
8029
0
                }
8030
8031
0
                (*depth)++;
8032
0
                dp_netdev_recirculate(pmd, packets_);
8033
0
                (*depth)--;
8034
0
                return;
8035
0
            }
8036
0
            COVERAGE_ADD(datapath_drop_invalid_tnl_port,
8037
0
                         dp_packet_batch_size(packets_));
8038
0
        } else {
8039
0
            COVERAGE_ADD(datapath_drop_recirc_error,
8040
0
                         dp_packet_batch_size(packets_));
8041
0
        }
8042
0
        break;
8043
8044
0
    case OVS_ACTION_ATTR_USERSPACE:
8045
0
        if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
8046
0
            struct dp_packet_batch *orig_packets_ = packets_;
8047
0
            const struct nlattr *userdata;
8048
0
            struct dp_packet_batch usr_pkt;
8049
0
            struct ofpbuf actions;
8050
0
            struct flow flow;
8051
0
            ovs_u128 ufid;
8052
0
            bool clone = false;
8053
8054
0
            userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
8055
0
            ofpbuf_init(&actions, 0);
8056
8057
0
            if (packets_->trunc) {
8058
0
                if (!should_steal) {
8059
0
                    dp_packet_batch_clone(&usr_pkt, packets_);
8060
0
                    packets_ = &usr_pkt;
8061
0
                    clone = true;
8062
0
                    dp_packet_batch_reset_cutlen(orig_packets_);
8063
0
                }
8064
8065
0
                dp_packet_batch_apply_cutlen(packets_);
8066
0
            }
8067
8068
0
            struct dp_packet *packet;
8069
0
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8070
0
                flow_extract(packet, &flow);
8071
0
                odp_flow_key_hash(&flow, sizeof flow, &ufid);
8072
0
                dp_execute_userspace_action(pmd, packet, should_steal, &flow,
8073
0
                                            &ufid, &actions, userdata);
8074
0
            }
8075
8076
0
            if (clone) {
8077
0
                dp_packet_delete_batch(packets_, true);
8078
0
            }
8079
8080
0
            ofpbuf_uninit(&actions);
8081
0
            fat_rwlock_unlock(&dp->upcall_rwlock);
8082
8083
0
            return;
8084
0
        }
8085
0
        COVERAGE_ADD(datapath_drop_lock_error,
8086
0
                     dp_packet_batch_size(packets_));
8087
0
        break;
8088
8089
0
    case OVS_ACTION_ATTR_RECIRC:
8090
0
        if (*depth < MAX_RECIRC_DEPTH) {
8091
0
            struct dp_packet_batch recirc_pkts;
8092
8093
0
            if (!should_steal) {
8094
0
               dp_packet_batch_clone(&recirc_pkts, packets_);
8095
0
               packets_ = &recirc_pkts;
8096
0
            }
8097
8098
0
            struct dp_packet *packet;
8099
0
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8100
0
                packet->md.recirc_id = nl_attr_get_u32(a);
8101
0
            }
8102
8103
0
            (*depth)++;
8104
0
            dp_netdev_recirculate(pmd, packets_);
8105
0
            (*depth)--;
8106
8107
0
            return;
8108
0
        }
8109
8110
0
        COVERAGE_ADD(datapath_drop_recirc_error,
8111
0
                     dp_packet_batch_size(packets_));
8112
0
        VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
8113
0
        break;
8114
8115
0
    case OVS_ACTION_ATTR_CT: {
8116
0
        const struct nlattr *b;
8117
0
        bool force = false;
8118
0
        bool commit = false;
8119
0
        unsigned int left;
8120
0
        uint16_t zone = 0;
8121
0
        uint32_t tp_id = 0;
8122
0
        const char *helper = NULL;
8123
0
        const uint32_t *setmark = NULL;
8124
0
        const struct ovs_key_ct_labels *setlabel = NULL;
8125
0
        struct nat_action_info_t nat_action_info;
8126
0
        struct nat_action_info_t *nat_action_info_ref = NULL;
8127
0
        bool nat_config = false;
8128
8129
0
        NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
8130
0
                                 nl_attr_get_size(a)) {
8131
0
            enum ovs_ct_attr sub_type = nl_attr_type(b);
8132
8133
0
            switch(sub_type) {
8134
0
            case OVS_CT_ATTR_FORCE_COMMIT:
8135
0
                force = true;
8136
                /* fall through. */
8137
0
            case OVS_CT_ATTR_COMMIT:
8138
0
                commit = true;
8139
0
                break;
8140
0
            case OVS_CT_ATTR_ZONE:
8141
0
                zone = nl_attr_get_u16(b);
8142
0
                break;
8143
0
            case OVS_CT_ATTR_HELPER:
8144
0
                helper = nl_attr_get_string(b);
8145
0
                break;
8146
0
            case OVS_CT_ATTR_MARK:
8147
0
                setmark = nl_attr_get(b);
8148
0
                break;
8149
0
            case OVS_CT_ATTR_LABELS:
8150
0
                setlabel = nl_attr_get(b);
8151
0
                break;
8152
0
            case OVS_CT_ATTR_EVENTMASK:
8153
                /* Silently ignored, as userspace datapath does not generate
8154
                 * netlink events. */
8155
0
                break;
8156
0
            case OVS_CT_ATTR_TIMEOUT:
8157
0
                if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) {
8158
0
                    VLOG_WARN("Invalid Timeout Policy ID: %s.",
8159
0
                              nl_attr_get_string(b));
8160
0
                    tp_id = DEFAULT_TP_ID;
8161
0
                }
8162
0
                break;
8163
0
            case OVS_CT_ATTR_NAT: {
8164
0
                const struct nlattr *b_nest;
8165
0
                unsigned int left_nest;
8166
0
                bool ip_min_specified = false;
8167
0
                bool proto_num_min_specified = false;
8168
0
                bool ip_max_specified = false;
8169
0
                bool proto_num_max_specified = false;
8170
0
                memset(&nat_action_info, 0, sizeof nat_action_info);
8171
0
                nat_action_info_ref = &nat_action_info;
8172
8173
0
                NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
8174
0
                    enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
8175
8176
0
                    switch (sub_type_nest) {
8177
0
                    case OVS_NAT_ATTR_SRC:
8178
0
                    case OVS_NAT_ATTR_DST:
8179
0
                        nat_config = true;
8180
0
                        nat_action_info.nat_action |=
8181
0
                            ((sub_type_nest == OVS_NAT_ATTR_SRC)
8182
0
                                ? NAT_ACTION_SRC : NAT_ACTION_DST);
8183
0
                        break;
8184
0
                    case OVS_NAT_ATTR_IP_MIN:
8185
0
                        memcpy(&nat_action_info.min_addr,
8186
0
                               nl_attr_get(b_nest),
8187
0
                               nl_attr_get_size(b_nest));
8188
0
                        ip_min_specified = true;
8189
0
                        break;
8190
0
                    case OVS_NAT_ATTR_IP_MAX:
8191
0
                        memcpy(&nat_action_info.max_addr,
8192
0
                               nl_attr_get(b_nest),
8193
0
                               nl_attr_get_size(b_nest));
8194
0
                        ip_max_specified = true;
8195
0
                        break;
8196
0
                    case OVS_NAT_ATTR_PROTO_MIN:
8197
0
                        nat_action_info.min_port =
8198
0
                            nl_attr_get_u16(b_nest);
8199
0
                        proto_num_min_specified = true;
8200
0
                        break;
8201
0
                    case OVS_NAT_ATTR_PROTO_MAX:
8202
0
                        nat_action_info.max_port =
8203
0
                            nl_attr_get_u16(b_nest);
8204
0
                        proto_num_max_specified = true;
8205
0
                        break;
8206
0
                    case OVS_NAT_ATTR_PROTO_RANDOM:
8207
0
                        nat_action_info.nat_flags |= NAT_RANGE_RANDOM;
8208
0
                        break;
8209
0
                    case OVS_NAT_ATTR_PERSISTENT:
8210
0
                        nat_action_info.nat_flags |= NAT_PERSISTENT;
8211
0
                        break;
8212
0
                    case OVS_NAT_ATTR_PROTO_HASH:
8213
0
                        break;
8214
0
                    case OVS_NAT_ATTR_UNSPEC:
8215
0
                    case __OVS_NAT_ATTR_MAX:
8216
0
                        OVS_NOT_REACHED();
8217
0
                    }
8218
0
                }
8219
8220
0
                if (ip_min_specified && !ip_max_specified) {
8221
0
                    nat_action_info.max_addr = nat_action_info.min_addr;
8222
0
                }
8223
0
                if (proto_num_min_specified && !proto_num_max_specified) {
8224
0
                    nat_action_info.max_port = nat_action_info.min_port;
8225
0
                }
8226
0
                if (proto_num_min_specified || proto_num_max_specified) {
8227
0
                    if (nat_action_info.nat_action & NAT_ACTION_SRC) {
8228
0
                        nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
8229
0
                    } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
8230
0
                        nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
8231
0
                    }
8232
0
                }
8233
0
                break;
8234
0
            }
8235
0
            case OVS_CT_ATTR_UNSPEC:
8236
0
            case __OVS_CT_ATTR_MAX:
8237
0
                OVS_NOT_REACHED();
8238
0
            }
8239
0
        }
8240
8241
        /* We won't be able to function properly in this case, hence
8242
         * complain loudly. */
8243
0
        if (nat_config && !commit) {
8244
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
8245
0
            VLOG_WARN_RL(&rl, "NAT specified without commit.");
8246
0
        }
8247
8248
0
        conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
8249
0
                          commit, zone, setmark, setlabel, helper,
8250
0
                          nat_action_info_ref, pmd->ctx.now / 1000, tp_id);
8251
0
        break;
8252
0
    }
8253
8254
0
    case OVS_ACTION_ATTR_METER:
8255
0
        dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
8256
0
                            pmd->ctx.now / 1000);
8257
0
        break;
8258
8259
0
    case OVS_ACTION_ATTR_PUSH_VLAN:
8260
0
    case OVS_ACTION_ATTR_POP_VLAN:
8261
0
    case OVS_ACTION_ATTR_PUSH_MPLS:
8262
0
    case OVS_ACTION_ATTR_POP_MPLS:
8263
0
    case OVS_ACTION_ATTR_SET:
8264
0
    case OVS_ACTION_ATTR_SET_MASKED:
8265
0
    case OVS_ACTION_ATTR_SAMPLE:
8266
0
    case OVS_ACTION_ATTR_HASH:
8267
0
    case OVS_ACTION_ATTR_UNSPEC:
8268
0
    case OVS_ACTION_ATTR_TRUNC:
8269
0
    case OVS_ACTION_ATTR_PUSH_ETH:
8270
0
    case OVS_ACTION_ATTR_POP_ETH:
8271
0
    case OVS_ACTION_ATTR_CLONE:
8272
0
    case OVS_ACTION_ATTR_PUSH_NSH:
8273
0
    case OVS_ACTION_ATTR_POP_NSH:
8274
0
    case OVS_ACTION_ATTR_CT_CLEAR:
8275
0
    case OVS_ACTION_ATTR_CHECK_PKT_LEN:
8276
0
    case OVS_ACTION_ATTR_DROP:
8277
0
    case OVS_ACTION_ATTR_ADD_MPLS:
8278
0
    case OVS_ACTION_ATTR_DEC_TTL:
8279
0
    case OVS_ACTION_ATTR_PSAMPLE:
8280
0
    case __OVS_ACTION_ATTR_MAX:
8281
0
        OVS_NOT_REACHED();
8282
0
    }
8283
8284
0
    dp_packet_delete_batch(packets_, should_steal);
8285
0
}
8286
8287
static void
8288
dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
8289
                          struct dp_packet_batch *packets,
8290
                          bool should_steal, const struct flow *flow,
8291
                          const struct nlattr *actions, size_t actions_len)
8292
0
{
8293
0
    struct dp_netdev_execute_aux aux = { pmd, flow };
8294
8295
0
    odp_execute_actions(&aux, packets, should_steal, actions,
8296
0
                        actions_len, dp_execute_cb);
8297
0
}
8298
8299
/* Per-dump state shared by the conntrack and conntrack-expectation dump
 * callbacks below.  It is allocated by the *_dump_start() functions and freed
 * by the matching *_dump_done() functions. */
struct dp_netdev_ct_dump {
8300
    /* Generic dump state; its address is what *_dump_start() hands back to
     * the caller, and INIT_CONTAINER() recovers this wrapper from it. */
    struct ct_dpif_dump_state up;
8301
    /* Iterator state passed to the conntrack_*dump_*() functions. */
    struct conntrack_dump dump;
8302
    /* Set to the datapath's 'conntrack' member when the dump starts. */
    struct conntrack *ct;
8303
    /* Datapath on which the dump was started. */
    struct dp_netdev *dp;
8304
};
8305
8306
static int
8307
dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
8308
                          const uint16_t *pzone, int *ptot_bkts)
8309
0
{
8310
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8311
0
    struct dp_netdev_ct_dump *dump;
8312
8313
0
    dump = xzalloc(sizeof *dump);
8314
0
    dump->dp = dp;
8315
0
    dump->ct = dp->conntrack;
8316
8317
0
    conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
8318
8319
0
    *dump_ = &dump->up;
8320
8321
0
    return 0;
8322
0
}
8323
8324
static int
8325
dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
8326
                         struct ct_dpif_dump_state *dump_,
8327
                         struct ct_dpif_entry *entry)
8328
0
{
8329
0
    struct dp_netdev_ct_dump *dump;
8330
8331
0
    INIT_CONTAINER(dump, dump_, up);
8332
8333
0
    return conntrack_dump_next(&dump->dump, entry);
8334
0
}
8335
8336
static int
8337
dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
8338
                         struct ct_dpif_dump_state *dump_)
8339
0
{
8340
0
    struct dp_netdev_ct_dump *dump;
8341
0
    int err;
8342
8343
0
    INIT_CONTAINER(dump, dump_, up);
8344
8345
0
    err = conntrack_dump_done(&dump->dump);
8346
8347
0
    free(dump);
8348
8349
0
    return err;
8350
0
}
8351
8352
static int
8353
dpif_netdev_ct_exp_dump_start(struct dpif *dpif,
8354
                              struct ct_dpif_dump_state **dump_,
8355
                              const uint16_t *pzone)
8356
0
{
8357
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8358
0
    struct dp_netdev_ct_dump *dump;
8359
8360
0
    dump = xzalloc(sizeof *dump);
8361
0
    dump->dp = dp;
8362
0
    dump->ct = dp->conntrack;
8363
8364
0
    conntrack_exp_dump_start(dp->conntrack, &dump->dump, pzone);
8365
8366
0
    *dump_ = &dump->up;
8367
8368
0
    return 0;
8369
0
}
8370
8371
static int
8372
dpif_netdev_ct_exp_dump_next(struct dpif *dpif OVS_UNUSED,
8373
                             struct ct_dpif_dump_state *dump_,
8374
                             struct ct_dpif_exp *entry)
8375
0
{
8376
0
    struct dp_netdev_ct_dump *dump;
8377
8378
0
    INIT_CONTAINER(dump, dump_, up);
8379
8380
0
    return conntrack_exp_dump_next(&dump->dump, entry);
8381
0
}
8382
8383
static int
8384
dpif_netdev_ct_exp_dump_done(struct dpif *dpif OVS_UNUSED,
8385
                             struct ct_dpif_dump_state *dump_)
8386
0
{
8387
0
    struct dp_netdev_ct_dump *dump;
8388
0
    int err;
8389
8390
0
    INIT_CONTAINER(dump, dump_, up);
8391
8392
0
    err = conntrack_exp_dump_done(&dump->dump);
8393
8394
0
    free(dump);
8395
8396
0
    return err;
8397
0
}
8398
8399
static int
8400
dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
8401
                     const struct ct_dpif_tuple *tuple)
8402
0
{
8403
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8404
8405
0
    if (tuple) {
8406
0
        return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
8407
0
    }
8408
0
    return conntrack_flush(dp->conntrack, zone);
8409
0
}
8410
8411
static int
8412
dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
8413
0
{
8414
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8415
8416
0
    return conntrack_set_maxconns(dp->conntrack, maxconns);
8417
0
}
8418
8419
static int
8420
dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
8421
0
{
8422
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8423
8424
0
    return conntrack_get_maxconns(dp->conntrack, maxconns);
8425
0
}
8426
8427
static int
8428
dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
8429
0
{
8430
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8431
8432
0
    return conntrack_get_nconns(dp->conntrack, nconns);
8433
0
}
8434
8435
static int
8436
dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
8437
0
{
8438
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8439
8440
0
    return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
8441
0
}
8442
8443
static int
8444
dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
8445
0
{
8446
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8447
0
    *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
8448
0
    return 0;
8449
0
}
8450
8451
static int
8452
dpif_netdev_ct_set_sweep_interval(struct dpif *dpif, uint32_t ms)
8453
0
{
8454
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8455
0
    return conntrack_set_sweep_interval(dp->conntrack, ms);
8456
0
}
8457
8458
static int
8459
dpif_netdev_ct_get_sweep_interval(struct dpif *dpif, uint32_t *ms)
8460
0
{
8461
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8462
0
    *ms = conntrack_get_sweep_interval(dp->conntrack);
8463
0
    return 0;
8464
0
}
8465
8466
static int
8467
dpif_netdev_ct_set_limits(struct dpif *dpif,
8468
                           const struct ovs_list *zone_limits)
8469
0
{
8470
0
    int err = 0;
8471
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8472
8473
0
    struct ct_dpif_zone_limit *zone_limit;
8474
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
8475
0
        err = zone_limit_update(dp->conntrack, zone_limit->zone,
8476
0
                                zone_limit->limit);
8477
0
        if (err != 0) {
8478
0
            break;
8479
0
        }
8480
0
    }
8481
0
    return err;
8482
0
}
8483
8484
static int
8485
dpif_netdev_ct_get_limits(struct dpif *dpif,
8486
                           const struct ovs_list *zone_limits_request,
8487
                           struct ovs_list *zone_limits_reply)
8488
0
{
8489
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8490
0
    struct conntrack_zone_info czl;
8491
8492
0
    if (!ovs_list_is_empty(zone_limits_request)) {
8493
0
        struct ct_dpif_zone_limit *zone_limit;
8494
0
        LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
8495
0
            czl = zone_limit_get(dp->conntrack, zone_limit->zone);
8496
0
            if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
8497
0
                ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
8498
0
                                        czl.limit,
8499
0
                                        czl.count);
8500
0
            } else {
8501
0
                return EINVAL;
8502
0
            }
8503
0
        }
8504
0
    } else {
8505
0
        czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
8506
0
        if (czl.zone == DEFAULT_ZONE) {
8507
0
            ct_dpif_push_zone_limit(zone_limits_reply, DEFAULT_ZONE,
8508
0
                                    czl.limit, 0);
8509
0
        }
8510
8511
0
        for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
8512
0
            czl = zone_limit_get(dp->conntrack, z);
8513
0
            if (czl.zone == z) {
8514
0
                ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
8515
0
                                        czl.count);
8516
0
            }
8517
0
        }
8518
0
    }
8519
8520
0
    return 0;
8521
0
}
8522
8523
static int
8524
dpif_netdev_ct_del_limits(struct dpif *dpif,
8525
                           const struct ovs_list *zone_limits)
8526
0
{
8527
0
    int err = 0;
8528
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8529
0
    struct ct_dpif_zone_limit *zone_limit;
8530
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
8531
0
        err = zone_limit_delete(dp->conntrack, zone_limit->zone);
8532
0
        if (err != 0) {
8533
0
            break;
8534
0
        }
8535
0
    }
8536
8537
0
    return err;
8538
0
}
8539
8540
static int
8541
dpif_netdev_ct_get_features(struct dpif *dpif OVS_UNUSED,
8542
                            enum ct_features *features)
8543
0
{
8544
0
    if (features != NULL) {
8545
0
        *features = CONNTRACK_F_ZERO_SNAT;
8546
0
    }
8547
0
    return 0;
8548
0
}
8549
8550
static int
8551
dpif_netdev_ct_set_timeout_policy(struct dpif *dpif,
8552
                                  const struct ct_dpif_timeout_policy *dpif_tp)
8553
0
{
8554
0
    struct timeout_policy tp;
8555
0
    struct dp_netdev *dp;
8556
8557
0
    dp = get_dp_netdev(dpif);
8558
0
    memcpy(&tp.policy, dpif_tp, sizeof tp.policy);
8559
0
    return timeout_policy_update(dp->conntrack, &tp);
8560
0
}
8561
8562
static int
8563
dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id,
8564
                                  struct ct_dpif_timeout_policy *dpif_tp)
8565
0
{
8566
0
    struct timeout_policy *tp;
8567
0
    struct dp_netdev *dp;
8568
0
    int err = 0;
8569
8570
0
    dp = get_dp_netdev(dpif);
8571
0
    tp = timeout_policy_get(dp->conntrack, tp_id);
8572
0
    if (!tp) {
8573
0
        return ENOENT;
8574
0
    }
8575
0
    memcpy(dpif_tp, &tp->policy, sizeof tp->policy);
8576
0
    return err;
8577
0
}
8578
8579
static int
8580
dpif_netdev_ct_del_timeout_policy(struct dpif *dpif,
8581
                                  uint32_t tp_id)
8582
0
{
8583
0
    struct dp_netdev *dp;
8584
0
    int err = 0;
8585
8586
0
    dp = get_dp_netdev(dpif);
8587
0
    err = timeout_policy_delete(dp->conntrack, tp_id);
8588
0
    return err;
8589
0
}
8590
8591
static int
8592
dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED,
8593
                                       uint32_t tp_id,
8594
                                       uint16_t dl_type OVS_UNUSED,
8595
                                       uint8_t nw_proto OVS_UNUSED,
8596
                                       char **tp_name, bool *is_generic)
8597
0
{
8598
0
    struct ds ds = DS_EMPTY_INITIALIZER;
8599
8600
0
    ds_put_format(&ds, "%"PRIu32, tp_id);
8601
0
    *tp_name = ds_steal_cstr(&ds);
8602
0
    *is_generic = true;
8603
0
    return 0;
8604
0
}
8605
8606
static int
8607
dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
8608
0
{
8609
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8610
0
    return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
8611
0
}
8612
8613
static int
8614
dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
8615
0
{
8616
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8617
0
    return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
8618
0
}
8619
8620
static int
8621
dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
8622
0
{
8623
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8624
0
    return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
8625
0
}
8626
8627
/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
8628
 * diverge. */
8629
static int
8630
dpif_netdev_ipf_get_status(struct dpif *dpif,
8631
                           struct dpif_ipf_status *dpif_ipf_status)
8632
0
{
8633
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8634
0
    ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
8635
0
                   (struct ipf_status *) dpif_ipf_status);
8636
0
    return 0;
8637
0
}
8638
8639
static int
8640
dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
8641
                           struct ipf_dump_ctx **ipf_dump_ctx)
8642
0
{
8643
0
    return ipf_dump_start(ipf_dump_ctx);
8644
0
}
8645
8646
static int
8647
dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
8648
0
{
8649
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8650
0
    return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
8651
0
                         dump);
8652
0
}
8653
8654
static int
8655
dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
8656
0
{
8657
0
    return ipf_dump_done(ipf_dump_ctx);
8658
8659
0
}
8660
8661
static int
8662
dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
8663
                     odp_port_t *member_map)
8664
0
{
8665
0
    struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
8666
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8667
0
    struct dp_netdev_pmd_thread *pmd;
8668
8669
    /* Prepare new bond mapping. */
8670
0
    new_tx->bond_id = bond_id;
8671
0
    for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
8672
0
        new_tx->member_buckets[bucket].member_id = member_map[bucket];
8673
0
    }
8674
8675
0
    ovs_mutex_lock(&dp->bond_mutex);
8676
    /* Check if bond already existed. */
8677
0
    struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8678
0
    if (old_tx) {
8679
0
        cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
8680
0
                     hash_bond_id(bond_id));
8681
0
        ovsrcu_postpone(free, old_tx);
8682
0
    } else {
8683
0
        cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
8684
0
    }
8685
0
    ovs_mutex_unlock(&dp->bond_mutex);
8686
8687
    /* Update all PMDs with new bond mapping. */
8688
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8689
0
        dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
8690
0
    }
8691
0
    return 0;
8692
0
}
8693
8694
static int
8695
dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
8696
0
{
8697
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8698
0
    struct dp_netdev_pmd_thread *pmd;
8699
0
    struct tx_bond *tx;
8700
8701
0
    ovs_mutex_lock(&dp->bond_mutex);
8702
    /* Check if bond existed. */
8703
0
    tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8704
0
    if (tx) {
8705
0
        cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
8706
0
        ovsrcu_postpone(free, tx);
8707
0
    } else {
8708
        /* Bond is not present. */
8709
0
        ovs_mutex_unlock(&dp->bond_mutex);
8710
0
        return ENOENT;
8711
0
    }
8712
0
    ovs_mutex_unlock(&dp->bond_mutex);
8713
8714
    /* Remove the bond map in all pmds. */
8715
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8716
0
        dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
8717
0
    }
8718
0
    return 0;
8719
0
}
8720
8721
static int
8722
dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
8723
                           uint64_t *n_bytes)
8724
0
{
8725
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8726
0
    struct dp_netdev_pmd_thread *pmd;
8727
8728
0
    if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
8729
0
        return ENOENT;
8730
0
    }
8731
8732
    /* Search the bond in all PMDs. */
8733
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8734
0
        struct tx_bond *pmd_bond_entry
8735
0
            = tx_bond_lookup(&pmd->tx_bonds, bond_id);
8736
8737
0
        if (!pmd_bond_entry) {
8738
0
            continue;
8739
0
        }
8740
8741
        /* Read bond stats. */
8742
0
        for (int i = 0; i < BOND_BUCKETS; i++) {
8743
0
            uint64_t pmd_n_bytes;
8744
8745
0
            atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes,
8746
0
                                &pmd_n_bytes);
8747
0
            n_bytes[i] += pmd_n_bytes;
8748
0
        }
8749
0
    }
8750
0
    return 0;
8751
0
}
8752
8753
/* Provider operations table for the "netdev" datapath type.
 *
 * The initializer is positional, so every entry must stay in the order in
 * which 'struct dpif_class' declares its members (the struct definition is
 * not part of this file section -- check it before adding, removing or moving
 * an entry).  Slots set to NULL are not provided by this datapath type; the
 * trailing comment on each NULL names the slot.
 *
 * dpif_dummy_register__() copies this table and overrides the 'type' member
 * to register the dummy datapath classes. */
const struct dpif_class dpif_netdev_class = {
8754
    "netdev",
8755
    true,                       /* cleanup_required */
8756
    dpif_netdev_init,
8757
    dpif_netdev_enumerate,
8758
    dpif_netdev_port_open_type,
8759
    dpif_netdev_open,
8760
    dpif_netdev_close,
8761
    dpif_netdev_destroy,
8762
    dpif_netdev_run,
8763
    dpif_netdev_wait,
8764
    dpif_netdev_get_stats,
8765
    NULL,                      /* set_features */
8766
    NULL,                      /* get_features */
8767
    dpif_netdev_port_add,
8768
    dpif_netdev_port_del,
8769
    dpif_netdev_port_set_config,
8770
    dpif_netdev_port_query_by_number,
8771
    dpif_netdev_port_query_by_name,
8772
    NULL,                       /* port_get_pid */
8773
    dpif_netdev_port_dump_start,
8774
    dpif_netdev_port_dump_next,
8775
    dpif_netdev_port_dump_done,
8776
    dpif_netdev_port_poll,
8777
    dpif_netdev_port_poll_wait,
8778
    dpif_netdev_flow_flush,
8779
    dpif_netdev_flow_dump_create,
8780
    dpif_netdev_flow_dump_destroy,
8781
    dpif_netdev_flow_dump_thread_create,
8782
    dpif_netdev_flow_dump_thread_destroy,
8783
    dpif_netdev_flow_dump_next,
8784
    dpif_netdev_operate,
8785
    NULL,                       /* recv_set */
8786
    NULL,                       /* handlers_set */
8787
    dpif_netdev_number_handlers_required,
8788
    dpif_netdev_set_config,
8789
    dpif_netdev_queue_to_priority,
8790
    NULL,                       /* recv */
8791
    NULL,                       /* recv_wait */
8792
    NULL,                       /* recv_purge */
8793
    dpif_netdev_register_dp_purge_cb,
8794
    dpif_netdev_register_upcall_cb,
8795
    dpif_netdev_enable_upcall,
8796
    dpif_netdev_disable_upcall,
8797
    dpif_netdev_get_datapath_version,
8798
    dpif_netdev_ct_dump_start,
8799
    dpif_netdev_ct_dump_next,
8800
    dpif_netdev_ct_dump_done,
8801
    dpif_netdev_ct_exp_dump_start,
8802
    dpif_netdev_ct_exp_dump_next,
8803
    dpif_netdev_ct_exp_dump_done,
8804
    dpif_netdev_ct_flush,
8805
    dpif_netdev_ct_set_maxconns,
8806
    dpif_netdev_ct_get_maxconns,
8807
    dpif_netdev_ct_get_nconns,
8808
    dpif_netdev_ct_set_tcp_seq_chk,
8809
    dpif_netdev_ct_get_tcp_seq_chk,
8810
    dpif_netdev_ct_set_sweep_interval,
8811
    dpif_netdev_ct_get_sweep_interval,
8812
    dpif_netdev_ct_set_limits,
8813
    dpif_netdev_ct_get_limits,
8814
    dpif_netdev_ct_del_limits,
8815
    dpif_netdev_ct_set_timeout_policy,
8816
    dpif_netdev_ct_get_timeout_policy,
8817
    dpif_netdev_ct_del_timeout_policy,
8818
    NULL,                       /* ct_timeout_policy_dump_start */
8819
    NULL,                       /* ct_timeout_policy_dump_next */
8820
    NULL,                       /* ct_timeout_policy_dump_done */
8821
    dpif_netdev_ct_get_timeout_policy_name,
8822
    dpif_netdev_ct_get_features,
8823
    dpif_netdev_ipf_set_enabled,
8824
    dpif_netdev_ipf_set_min_frag,
8825
    dpif_netdev_ipf_set_max_nfrags,
8826
    dpif_netdev_ipf_get_status,
8827
    dpif_netdev_ipf_dump_start,
8828
    dpif_netdev_ipf_dump_next,
8829
    dpif_netdev_ipf_dump_done,
8830
    dpif_netdev_meter_get_features,
8831
    dpif_netdev_meter_set,
8832
    dpif_netdev_meter_get,
8833
    dpif_netdev_meter_del,
8834
    dpif_netdev_bond_add,
8835
    dpif_netdev_bond_del,
8836
    dpif_netdev_bond_stats_get,
8837
    NULL,                       /* cache_get_supported_levels */
8838
    NULL,                       /* cache_get_name */
8839
    NULL,                       /* cache_get_size */
8840
    NULL,                       /* cache_set_size */
8841
};
8842
8843
static void
8844
dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
8845
                              const char *argv[], void *aux OVS_UNUSED)
8846
0
{
8847
0
    struct dp_netdev_port *port;
8848
0
    struct dp_netdev *dp;
8849
0
    odp_port_t port_no;
8850
8851
0
    ovs_mutex_lock(&dp_netdev_mutex);
8852
0
    dp = shash_find_data(&dp_netdevs, argv[1]);
8853
0
    if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8854
0
        ovs_mutex_unlock(&dp_netdev_mutex);
8855
0
        unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
8856
0
        return;
8857
0
    }
8858
0
    ovs_refcount_ref(&dp->ref_cnt);
8859
0
    ovs_mutex_unlock(&dp_netdev_mutex);
8860
8861
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
8862
0
    if (get_port_by_name(dp, argv[2], &port)) {
8863
0
        unixctl_command_reply_error(conn, "unknown port");
8864
0
        goto exit;
8865
0
    }
8866
8867
0
    port_no = u32_to_odp(atoi(argv[3]));
8868
0
    if (!port_no || port_no == ODPP_NONE) {
8869
0
        unixctl_command_reply_error(conn, "bad port number");
8870
0
        goto exit;
8871
0
    }
8872
0
    if (dp_netdev_lookup_port(dp, port_no)) {
8873
0
        unixctl_command_reply_error(conn, "port number already in use");
8874
0
        goto exit;
8875
0
    }
8876
8877
    /* Remove port. */
8878
0
    hmap_remove(&dp->ports, &port->node);
8879
0
    reconfigure_datapath(dp);
8880
8881
    /* Reinsert with new port number. */
8882
0
    port->port_no = port_no;
8883
0
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
8884
0
    reconfigure_datapath(dp);
8885
8886
0
    seq_change(dp->port_seq);
8887
0
    unixctl_command_reply(conn, NULL);
8888
8889
0
exit:
8890
0
    ovs_rwlock_unlock(&dp->port_rwlock);
8891
0
    dp_netdev_unref(dp);
8892
0
}
8893
8894
static void
8895
dpif_dummy_register__(const char *type)
8896
0
{
8897
0
    struct dpif_class *class;
8898
8899
0
    class = xmalloc(sizeof *class);
8900
0
    *class = dpif_netdev_class;
8901
0
    class->type = xstrdup(type);
8902
0
    dp_register_provider(class);
8903
0
}
8904
8905
/* Replaces the registered datapath provider 'type' with a dummy one.
 *
 * The existing provider is unregistered first; the dummy replacement is
 * registered only if that succeeded or failed with EAFNOSUPPORT. */
static void
dpif_dummy_override(const char *type)
{
    int error = dp_unregister_provider(type);

    /*
     * EAFNOSUPPORT is tolerated to allow --enable-dummy=system with
     * a userland-only build.  It's useful for testsuite.
     */
    if (error && error != EAFNOSUPPORT) {
        return;
    }

    dpif_dummy_register__(type);
}
8919
8920
void
8921
dpif_dummy_register(enum dummy_level level)
8922
0
{
8923
0
    if (level == DUMMY_OVERRIDE_ALL) {
8924
0
        struct sset types;
8925
0
        const char *type;
8926
8927
0
        sset_init(&types);
8928
0
        dp_enumerate_types(&types);
8929
0
        SSET_FOR_EACH (type, &types) {
8930
0
            dpif_dummy_override(type);
8931
0
        }
8932
0
        sset_destroy(&types);
8933
0
    } else if (level == DUMMY_OVERRIDE_SYSTEM) {
8934
0
        dpif_dummy_override("system");
8935
0
    }
8936
8937
0
    dpif_dummy_register__("dummy");
8938
8939
0
    unixctl_command_register("dpif-dummy/change-port-number",
8940
0
                             "dp port new-number",
8941
0
                             3, 3, dpif_dummy_change_port_number, NULL);
8942
0
}
8943

8944
/* Datapath Classifier. */
8945
8946
static void
8947
dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
8948
0
{
8949
0
    cmap_destroy(&subtable->rules);
8950
0
    ovsrcu_postpone(free, subtable->mf_masks);
8951
0
    ovsrcu_postpone(free, subtable);
8952
0
}
8953
8954
/* Initializes 'cls' as a classifier that initially contains no classification
8955
 * rules. */
8956
static void
8957
dpcls_init(struct dpcls *cls)
8958
0
{
8959
0
    cmap_init(&cls->subtables_map);
8960
0
    pvector_init(&cls->subtables);
8961
0
}
8962
8963
static void
8964
dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
8965
0
{
8966
0
    VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
8967
0
    pvector_remove(&cls->subtables, subtable);
8968
0
    cmap_remove(&cls->subtables_map, &subtable->cmap_node,
8969
0
                subtable->mask.hash);
8970
0
    ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
8971
0
}
8972
8973
/* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
8974
 * caller's responsibility.
8975
 * May only be called after all the readers have been terminated. */
8976
static void
8977
dpcls_destroy(struct dpcls *cls)
8978
0
{
8979
0
    if (cls) {
8980
0
        struct dpcls_subtable *subtable;
8981
8982
0
        CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
8983
0
            ovs_assert(cmap_count(&subtable->rules) == 0);
8984
0
            dpcls_destroy_subtable(cls, subtable);
8985
0
        }
8986
0
        cmap_destroy(&cls->subtables_map);
8987
0
        pvector_destroy(&cls->subtables);
8988
0
    }
8989
0
}
8990
8991
static struct dpcls_subtable *
8992
dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8993
0
{
8994
0
    struct dpcls_subtable *subtable;
8995
8996
    /* Need to add one. */
8997
0
    subtable = xmalloc(sizeof *subtable
8998
0
                       - sizeof subtable->mask.mf + mask->len);
8999
0
    cmap_init(&subtable->rules);
9000
0
    subtable->hit_cnt = 0;
9001
0
    netdev_flow_key_clone(&subtable->mask, mask);
9002
9003
    /* The count of bits in the mask defines the space required for masks.
9004
     * Then call gen_masks() to create the appropriate masks, avoiding the cost
9005
     * of doing runtime calculations. */
9006
0
    uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
9007
0
    uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
9008
0
    subtable->mf_bits_set_unit0 = unit0;
9009
0
    subtable->mf_bits_set_unit1 = unit1;
9010
0
    subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
9011
0
    dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
9012
9013
    /* Get the preferred subtable search function for this (u0,u1) subtable.
9014
     * The function is guaranteed to always return a valid implementation, and
9015
     * possibly a specialized implementation. */
9016
0
    subtable->lookup_func = dpcls_subtable_lookup_probe(unit0, unit1);
9017
9018
0
    cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
9019
    /* Add the new subtable at the end of the pvector (with no hits yet) */
9020
0
    pvector_insert(&cls->subtables, subtable, 0);
9021
0
    VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
9022
0
             cmap_count(&cls->subtables_map), subtable, cls->in_port);
9023
0
    pvector_publish(&cls->subtables);
9024
9025
0
    return subtable;
9026
0
}
9027
9028
static inline struct dpcls_subtable *
9029
dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
9030
0
{
9031
0
    struct dpcls_subtable *subtable;
9032
9033
0
    CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
9034
0
                             &cls->subtables_map) {
9035
0
        if (netdev_flow_key_equal(&subtable->mask, mask)) {
9036
0
            return subtable;
9037
0
        }
9038
0
    }
9039
0
    return dpcls_create_subtable(cls, mask);
9040
0
}
9041
9042
/* Periodically sort the dpcls subtable vectors according to hit counts */
9043
static void
9044
dpcls_sort_subtable_vector(struct dpcls *cls)
9045
0
{
9046
0
    struct pvector *pvec = &cls->subtables;
9047
0
    struct dpcls_subtable *subtable;
9048
9049
0
    PVECTOR_FOR_EACH (subtable, pvec) {
9050
0
        pvector_change_priority(pvec, subtable, subtable->hit_cnt);
9051
0
        subtable->hit_cnt = 0;
9052
0
    }
9053
0
    pvector_publish(pvec);
9054
0
}
9055
9056
static inline void
9057
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
9058
                           struct polled_queue *poll_list, int poll_cnt)
9059
0
{
9060
0
    struct dpcls *cls;
9061
0
    uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0;
9062
0
    unsigned int pmd_load = 0;
9063
9064
0
    if (pmd->ctx.now > pmd->next_cycle_store) {
9065
0
        uint64_t curr_tsc;
9066
0
        uint8_t rebalance_load_trigger;
9067
0
        struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
9068
0
        unsigned int idx;
9069
9070
0
        if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
9071
0
                pmd->prev_stats[PMD_CYCLES_ITER_IDLE] &&
9072
0
            pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
9073
0
                pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) {
9074
0
            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
9075
0
                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
9076
0
            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
9077
0
                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
9078
0
            tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] -
9079
0
                        pmd->prev_stats[PMD_CYCLES_SLEEP];
9080
9081
0
            if (pmd_alb->is_enabled && !pmd->isolated) {
9082
0
                if (tot_proc) {
9083
0
                    pmd_load = ((tot_proc * 100) /
9084
0
                                    (tot_idle + tot_proc + tot_sleep));
9085
0
                }
9086
9087
0
                atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
9088
0
                                    &rebalance_load_trigger);
9089
0
                if (pmd_load >= rebalance_load_trigger) {
9090
0
                    atomic_count_inc(&pmd->pmd_overloaded);
9091
0
                } else {
9092
0
                    atomic_count_set(&pmd->pmd_overloaded, 0);
9093
0
                }
9094
0
            }
9095
0
        }
9096
9097
0
        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
9098
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
9099
0
        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
9100
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
9101
0
        pmd->prev_stats[PMD_CYCLES_SLEEP] =
9102
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP];
9103
9104
        /* Get the cycles that were used to process each queue and store. */
9105
0
        for (unsigned i = 0; i < poll_cnt; i++) {
9106
0
            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
9107
0
                                                        RXQ_CYCLES_PROC_CURR);
9108
0
            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
9109
0
            dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
9110
0
                                     0);
9111
0
        }
9112
0
        curr_tsc = cycles_counter_update(&pmd->perf_stats);
9113
0
        if (pmd->intrvl_tsc_prev) {
9114
            /* There is a prev timestamp, store a new intrvl cycle count. */
9115
0
            atomic_store_relaxed(&pmd->intrvl_cycles,
9116
0
                                 curr_tsc - pmd->intrvl_tsc_prev);
9117
0
        }
9118
0
        idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX;
9119
0
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc);
9120
0
        pmd->intrvl_tsc_prev = curr_tsc;
9121
        /* Start new measuring interval */
9122
0
        pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
9123
0
    }
9124
9125
0
    if (pmd->ctx.now > pmd->next_optimization) {
9126
        /* Try to obtain the flow lock to block out revalidator threads.
9127
         * If not possible, just try next time. */
9128
0
        if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
9129
            /* Optimize each classifier */
9130
0
            CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
9131
0
                dpcls_sort_subtable_vector(cls);
9132
0
            }
9133
0
            ovs_mutex_unlock(&pmd->flow_mutex);
9134
            /* Start new measuring interval */
9135
0
            pmd->next_optimization = pmd->ctx.now
9136
0
                                     + DPCLS_OPTIMIZATION_INTERVAL;
9137
0
        }
9138
0
    }
9139
0
}
9140
9141
/* Returns the sum of a specified number of newest to
9142
 * oldest interval values. 'cur_idx' is where the next
9143
 * write will be and wrap around needs to be handled.
9144
 */
9145
static uint64_t
9146
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
9147
0
                    int num_to_read) {
9148
0
    unsigned int i;
9149
0
    uint64_t total = 0;
9150
9151
0
    i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX;
9152
0
    for (int read = 0; read < num_to_read; read++) {
9153
0
        uint64_t interval_value;
9154
9155
0
        i = i ? i - 1 : PMD_INTERVAL_MAX - 1;
9156
0
        atomic_read_relaxed(&source[i], &interval_value);
9157
0
        total += interval_value;
9158
0
    }
9159
0
    return total;
9160
0
}
9161
9162
/* Insert 'rule' into 'cls'. */
9163
static void
9164
dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
9165
             const struct netdev_flow_key *mask)
9166
0
{
9167
0
    struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
9168
9169
    /* Refer to subtable's mask, also for later removal. */
9170
0
    rule->mask = &subtable->mask;
9171
0
    cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
9172
0
}
9173
9174
/* Removes 'rule' from 'cls', also destructing the 'rule'. */
9175
static void
9176
dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
9177
0
{
9178
0
    struct dpcls_subtable *subtable;
9179
9180
0
    ovs_assert(rule->mask);
9181
9182
    /* Get subtable from reference in rule->mask. */
9183
0
    INIT_CONTAINER(subtable, rule->mask, mask);
9184
0
    if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
9185
0
        == 0) {
9186
        /* Delete empty subtable. */
9187
0
        dpcls_destroy_subtable(cls, subtable);
9188
0
        pvector_publish(&cls->subtables);
9189
0
    }
9190
0
}
9191
9192
/* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks.
 *
 * Walks the set bits of 'iter' from least to most significant.  For the i-th
 * set bit, 'mf_masks[i]' receives a mask with every bit below that bit set.
 * 'count' entries of 'mf_masks' are written, so the array must hold at least
 * that many.  'count' must cover every bit set in 'iter'; this is asserted. */
static inline void
dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count,
                             uint64_t *mf_masks)
{
    /* The index has the same unsigned 64-bit type as 'count', avoiding a
     * mixed signed/unsigned comparison. */
    for (uint64_t i = 0; i < count; i++) {
        /* Isolate the lowest set bit, then clear it from the bitmap. */
        uint64_t lowest_bit = (iter & -iter);
        iter &= ~lowest_bit;
        mf_masks[i] = (lowest_bit - 1);
    }
    /* Checks that count has covered all bits in the iter bitmap. */
    ovs_assert(iter == 0);
}
9206
9207
/* Generate a mask for each block in the miniflow, based on the bits set. This
9208
 * allows easily masking packets with the generated array here, without
9209
 * calculations. This replaces runtime-calculating the masks.
9210
 * @param key The table to generate the mf_masks for
9211
 * @param mf_masks Pointer to a u64 array of at least *mf_bits* in size
9212
 * @param mf_bits_total Number of bits set in the whole miniflow (both units)
9213
 * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow
9214
 */
9215
void
9216
dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
9217
                         uint64_t *mf_masks,
9218
                         const uint32_t mf_bits_u0,
9219
                         const uint32_t mf_bits_u1)
9220
0
{
9221
0
    uint64_t iter_u0 = tbl->mf.map.bits[0];
9222
0
    uint64_t iter_u1 = tbl->mf.map.bits[1];
9223
9224
0
    dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
9225
0
    dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
9226
0
}
9227
9228
/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
9229
 * in 'mask' the values in 'key' and 'target' are the same. */
9230
inline bool
9231
dpcls_rule_matches_key(const struct dpcls_rule *rule,
9232
                       const struct netdev_flow_key *target)
9233
0
{
9234
0
    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
9235
0
    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
9236
0
    uint64_t value;
9237
9238
0
    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
9239
0
        if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
9240
0
            return false;
9241
0
        }
9242
0
    }
9243
0
    return true;
9244
0
}
9245
9246
/* For each miniflow in 'keys' performs a classifier lookup writing the result
9247
 * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
9248
 * NULL it is skipped.
9249
 *
9250
 * This function is optimized for use in the userspace datapath and therefore
9251
 * does not implement a lot of features available in the standard
9252
 * classifier_lookup() function.  Specifically, it does not implement
9253
 * priorities, instead returning any rule which matches the flow.
9254
 *
9255
 * Returns true if all miniflows found a corresponding rule. */
9256
bool
9257
dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
9258
             struct dpcls_rule **rules, const size_t cnt,
9259
             int *num_lookups_p)
9260
0
{
9261
    /* The received 'cnt' miniflows are the search-keys that will be processed
9262
     * to find a matching entry into the available subtables.
9263
     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
9264
0
#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
9265
0
    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
9266
9267
0
    struct dpcls_subtable *subtable;
9268
0
    uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
9269
9270
0
    if (cnt != MAP_BITS) {
9271
0
        keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
9272
0
    }
9273
0
    memset(rules, 0, cnt * sizeof *rules);
9274
9275
0
    int lookups_match = 0, subtable_pos = 1;
9276
0
    uint32_t found_map;
9277
9278
    /* The Datapath classifier - aka dpcls - is composed of subtables.
9279
     * Subtables are dynamically created as needed when new rules are inserted.
9280
     * Each subtable collects rules with matches on a specific subset of packet
9281
     * fields as defined by the subtable's mask.  We proceed to process every
9282
     * search-key against each subtable, but when a match is found for a
9283
     * search-key, the search for that key can stop because the rules are
9284
     * non-overlapping. */
9285
0
    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
9286
        /* Call the subtable specific lookup function. */
9287
0
        found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
9288
9289
        /* Count the number of subtables searched for this packet match. This
9290
         * estimates the "spread" of subtables looked at per matched packet. */
9291
0
        uint32_t pkts_matched = count_1bits(found_map);
9292
0
        lookups_match += pkts_matched * subtable_pos;
9293
9294
        /* Clear the found rules, and return early if all packets are found. */
9295
0
        keys_map &= ~found_map;
9296
0
        if (!keys_map) {
9297
0
            if (num_lookups_p) {
9298
0
                *num_lookups_p = lookups_match;
9299
0
            }
9300
0
            return true;
9301
0
        }
9302
0
        subtable_pos++;
9303
0
    }
9304
9305
0
    if (num_lookups_p) {
9306
0
        *num_lookups_p = lookups_match;
9307
0
    }
9308
    return false;
9309
0
}