Coverage Report

Created: 2023-03-26 07:41

/src/openvswitch/lib/dpif-netdev.c
Line | Count | Source
1
/*
2
 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
#include "dpif-netdev.h"
19
#include "dpif-netdev-private.h"
20
#include "dpif-netdev-private-dfc.h"
21
22
#include <ctype.h>
23
#include <errno.h>
24
#include <fcntl.h>
25
#include <inttypes.h>
26
#include <net/if.h>
27
#include <sys/types.h>
28
#include <netinet/in.h>
29
#include <stdint.h>
30
#include <stdlib.h>
31
#include <string.h>
32
#include <sys/ioctl.h>
33
#include <sys/socket.h>
34
#include <sys/stat.h>
35
#include <unistd.h>
36
37
#include "bitmap.h"
38
#include "ccmap.h"
39
#include "cmap.h"
40
#include "conntrack.h"
41
#include "conntrack-tp.h"
42
#include "coverage.h"
43
#include "ct-dpif.h"
44
#include "csum.h"
45
#include "dp-packet.h"
46
#include "dpif.h"
47
#include "dpif-netdev-lookup.h"
48
#include "dpif-netdev-perf.h"
49
#include "dpif-netdev-private-extract.h"
50
#include "dpif-provider.h"
51
#include "dummy.h"
52
#include "fat-rwlock.h"
53
#include "flow.h"
54
#include "hmapx.h"
55
#include "id-fpool.h"
56
#include "id-pool.h"
57
#include "ipf.h"
58
#include "mov-avg.h"
59
#include "mpsc-queue.h"
60
#include "netdev.h"
61
#include "netdev-offload.h"
62
#include "netdev-provider.h"
63
#include "netdev-vport.h"
64
#include "netlink.h"
65
#include "odp-execute.h"
66
#include "odp-util.h"
67
#include "openvswitch/dynamic-string.h"
68
#include "openvswitch/list.h"
69
#include "openvswitch/match.h"
70
#include "openvswitch/ofp-parse.h"
71
#include "openvswitch/ofp-print.h"
72
#include "openvswitch/ofpbuf.h"
73
#include "openvswitch/shash.h"
74
#include "openvswitch/vlog.h"
75
#include "ovs-numa.h"
76
#include "ovs-rcu.h"
77
#include "packets.h"
78
#include "openvswitch/poll-loop.h"
79
#include "pvector.h"
80
#include "random.h"
81
#include "seq.h"
82
#include "smap.h"
83
#include "sset.h"
84
#include "timeval.h"
85
#include "tnl-neigh-cache.h"
86
#include "tnl-ports.h"
87
#include "unixctl.h"
88
#include "util.h"
89
#include "uuid.h"
90
91
VLOG_DEFINE_THIS_MODULE(dpif_netdev);
92
93
/* Auto Load Balancing Defaults */
94
0
#define ALB_IMPROVEMENT_THRESHOLD    25
95
0
#define ALB_LOAD_THRESHOLD           95
96
0
#define ALB_REBALANCE_INTERVAL       1     /* 1 Min */
97
0
#define MAX_ALB_REBALANCE_INTERVAL   20000 /* 20000 Min */
98
0
#define MIN_TO_MSEC                  60000
99
100
#define FLOW_DUMP_MAX_BATCH 50
101
/* Use a per-thread recirc_depth to prevent recirculation loops. */
102
0
#define MAX_RECIRC_DEPTH 6
103
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
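/* Illustrative sketch, not part of the instrumented source above: the
 * per-thread 'recirc_depth' is read through the recirc_depth_get() accessor
 * generated by DEFINE_STATIC_PER_THREAD_DATA and compared against
 * MAX_RECIRC_DEPTH before recirculating again.  The helper name below is
 * hypothetical; the real guard lives in the recirculation path of this file. */
static inline bool
recirc_depth_within_limit_example(void)
{
    uint32_t *depth = recirc_depth_get();   /* Per-thread counter. */

    return *depth < MAX_RECIRC_DEPTH;       /* Refuse to recirculate deeper. */
}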
104
105
/* Use instant packet send by default. */
106
0
#define DEFAULT_TX_FLUSH_INTERVAL 0
107
108
/* Configuration parameters. */
109
enum { MAX_METERS = 1 << 18 };  /* Maximum number of meters. */
110
enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
111
112
COVERAGE_DEFINE(datapath_drop_meter);
113
COVERAGE_DEFINE(datapath_drop_upcall_error);
114
COVERAGE_DEFINE(datapath_drop_lock_error);
115
COVERAGE_DEFINE(datapath_drop_userspace_action_error);
116
COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
117
COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
118
COVERAGE_DEFINE(datapath_drop_recirc_error);
119
COVERAGE_DEFINE(datapath_drop_invalid_port);
120
COVERAGE_DEFINE(datapath_drop_invalid_bond);
121
COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
122
COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
123
#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
124
COVERAGE_DEFINE(datapath_drop_hw_miss_recover);
125
#endif
126
127
/* Protects against changes to 'dp_netdevs'. */
128
struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
129
130
/* Contains all 'struct dp_netdev's. */
131
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
132
    = SHASH_INITIALIZER(&dp_netdevs);
133
134
static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
135
136
0
#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
137
0
                                     | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
138
0
                                     | CS_SRC_NAT | CS_DST_NAT)
139
0
#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
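/* Illustrative sketch, not part of the instrumented source above: the
 * supported/unsupported mask pair lets a requested ct_state be validated
 * with a single bit test, roughly as follows (helper name is hypothetical). */
static inline bool
ct_state_is_supported_example(uint32_t ct_state)
{
    return !(ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK);
}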
140
141
static struct odp_support dp_netdev_support = {
142
    .max_vlan_headers = SIZE_MAX,
143
    .max_mpls_depth = SIZE_MAX,
144
    .recirc = true,
145
    .ct_state = true,
146
    .ct_zone = true,
147
    .ct_mark = true,
148
    .ct_label = true,
149
    .ct_state_nat = true,
150
    .ct_orig_tuple = true,
151
    .ct_orig_tuple6 = true,
152
};
153
154

155
/* Simple non-wildcarding single-priority classifier. */
156
157
/* Time in microseconds between successive optimizations of the dpcls
158
 * subtable vector. */
159
0
#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
160
161
/* Time in microseconds of the interval over which the rxq processing cycles
162
 * used for rxq to pmd assignment are measured and stored. */
163
0
#define PMD_INTERVAL_LEN 5000000LL
164
/* For converting PMD_INTERVAL_LEN to secs. */
165
0
#define INTERVAL_USEC_TO_SEC 1000000LL
166
167
/* Number of intervals for which cycles are stored
168
 * and used during rxq to pmd assignment. */
169
0
#define PMD_INTERVAL_MAX 12
170
171
/* Time in microseconds to try RCU quiescing. */
172
0
#define PMD_RCU_QUIESCE_INTERVAL 10000LL
173
174
/* Timer resolution for PMD threads in nanoseconds. */
175
0
#define PMD_TIMER_RES_NS 1000
176
177
/* Number of Rx packets on an interface that stops a pmd thread sleeping. */
178
0
#define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2)
179
/* Time in microseconds by which a pmd thread's sleep time is incremented. */
180
0
#define PMD_SLEEP_INC_US 1
181
182
struct dpcls {
183
    struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
184
    odp_port_t in_port;
185
    struct cmap subtables_map;
186
    struct pvector subtables;
187
};
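/* Illustrative sketch, not part of the instrumented source above: the
 * classifier is "single priority" in the sense that a lookup walks
 * 'subtables' in pvector priority order and returns the first match.
 * 'struct dpcls_subtable', 'struct dpcls_rule' and 'struct netdev_flow_key'
 * come from the private headers included above; the per-subtable lookup is
 * passed in as a callback here because the real code performs a batched
 * lookup instead. */
typedef struct dpcls_rule *(*subtable_lookup_one_fn)(
    struct dpcls_subtable *, const struct netdev_flow_key *);

static inline struct dpcls_rule *
dpcls_lookup_single_example(struct dpcls *cls,
                            const struct netdev_flow_key *key,
                            subtable_lookup_one_fn lookup_one)
{
    struct dpcls_subtable *subtable;

    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
        struct dpcls_rule *rule = lookup_one(subtable, key);

        if (rule) {
            return rule;
        }
    }
    return NULL;
}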
188
189
/* Data structure to preserve packet order until fast-path processing. */
190
struct dp_packet_flow_map {
191
    struct dp_packet *packet;
192
    struct dp_netdev_flow *flow;
193
    uint16_t tcp_flags;
194
};
195
196
static void dpcls_init(struct dpcls *);
197
static void dpcls_destroy(struct dpcls *);
198
static void dpcls_sort_subtable_vector(struct dpcls *);
199
static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls);
200
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
201
                         const struct netdev_flow_key *mask);
202
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
203
204
/* Set of supported meter flags */
205
#define DP_SUPPORTED_METER_FLAGS_MASK \
206
0
    (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
207
208
/* Set of supported meter band types */
209
#define DP_SUPPORTED_METER_BAND_TYPES           \
210
0
    ( 1 << OFPMBT13_DROP )
211
212
struct dp_meter_band {
213
    uint32_t rate;
214
    uint32_t burst_size;
215
    uint64_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
216
    uint64_t packet_count;
217
    uint64_t byte_count;
218
};
219
220
struct dp_meter {
221
    struct cmap_node node;
222
    struct ovs_mutex lock;
223
    uint32_t id;
224
    uint16_t flags;
225
    uint16_t n_bands;
226
    uint32_t max_delta_t;
227
    uint64_t used;
228
    uint64_t packet_count;
229
    uint64_t byte_count;
230
    struct dp_meter_band bands[];
231
};
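/* Illustrative sketch, not part of the instrumented source above: given the
 * bucket units documented in 'struct dp_meter_band', a band behaves as a
 * token bucket refilled from elapsed time before packets are charged to it.
 * With 'delta_t_ms' in milliseconds, rate * delta_t_ms yields 1/1000 packets
 * for PKTPS meters and bits for KBPS meters.  The burst-size conversion
 * below is an assumption for illustration only; the real refill logic
 * appears later in this file and may convert the burst differently. */
static void
dp_meter_band_refill_example(struct dp_meter_band *band, uint64_t delta_t_ms)
{
    /* Burst size in packets (PKTPS) or kbits (KBPS); * 1000 converts it to
     * the bucket's units in both cases (assumption for illustration). */
    uint64_t max_bucket = (uint64_t) band->burst_size * 1000;

    band->bucket = MIN(band->bucket + delta_t_ms * band->rate, max_bucket);
}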
232
233
struct pmd_auto_lb {
234
    bool do_dry_run;
235
    bool recheck_config;
236
    bool is_enabled;            /* Current status of Auto load balancing. */
237
    uint64_t rebalance_intvl;
238
    uint64_t rebalance_poll_timer;
239
    uint8_t rebalance_improve_thresh;
240
    atomic_uint8_t rebalance_load_thresh;
241
};
242
243
enum sched_assignment_type {
244
    SCHED_ROUNDROBIN,
245
    SCHED_CYCLES, /* Default. */
246
    SCHED_GROUP
247
};
248
249
/* Datapath based on the network device interface from netdev.h.
250
 *
251
 *
252
 * Thread-safety
253
 * =============
254
 *
255
 * Some members, marked 'const', are immutable.  Accessing other members
256
 * requires synchronization, as noted in more detail below.
257
 *
258
 * Acquisition order is, from outermost to innermost:
259
 *
260
 *    dp_netdev_mutex (global)
261
 *    port_rwlock
262
 *    bond_mutex
263
 *    non_pmd_mutex
264
 */
265
struct dp_netdev {
266
    const struct dpif_class *const class;
267
    const char *const name;
268
    struct ovs_refcount ref_cnt;
269
    atomic_flag destroyed;
270
271
    /* Ports.
272
     *
273
     * Any lookup into 'ports' or any access to the dp_netdev_ports found
274
     * through 'ports' requires taking 'port_rwlock'. */
275
    struct ovs_rwlock port_rwlock;
276
    struct hmap ports;
277
    struct seq *port_seq;       /* Incremented whenever a port changes. */
278
279
    /* The time that a packet can wait in output batch for sending. */
280
    atomic_uint32_t tx_flush_interval;
281
282
    /* Meters. */
283
    struct ovs_mutex meters_lock;
284
    struct cmap meters OVS_GUARDED;
285
286
    /* Probability of EMC insertions is a factor of 'emc_insert_min'. */
287
    atomic_uint32_t emc_insert_min;
288
    /* Enable collection of PMD performance metrics. */
289
    atomic_bool pmd_perf_metrics;
290
    /* Max load based sleep request. */
291
    atomic_uint64_t pmd_max_sleep;
292
    /* Enable the SMC cache from ovsdb config. */
293
    atomic_bool smc_enable_db;
294
295
    /* Protects access to ofproto-dpif-upcall interface during revalidator
296
     * thread synchronization. */
297
    struct fat_rwlock upcall_rwlock;
298
    upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
299
    void *upcall_aux;
300
301
    /* Callback function for notifying the purging of dp flows (during
302
     * pmd reset or deletion). */
303
    dp_purge_callback *dp_purge_cb;
304
    void *dp_purge_aux;
305
306
    /* Stores all 'struct dp_netdev_pmd_thread's. */
307
    struct cmap poll_threads;
308
    /* id pool for per thread static_tx_qid. */
309
    struct id_pool *tx_qid_pool;
310
    struct ovs_mutex tx_qid_pool_mutex;
311
    /* Rxq to pmd assignment type. */
312
    enum sched_assignment_type pmd_rxq_assign_type;
313
    bool pmd_iso;
314
315
    /* Protects the access of the 'struct dp_netdev_pmd_thread'
316
     * instance for non-pmd thread. */
317
    struct ovs_mutex non_pmd_mutex;
318
319
    /* Each pmd thread will store its pointer to
320
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
321
    ovsthread_key_t per_pmd_key;
322
323
    struct seq *reconfigure_seq;
324
    uint64_t last_reconfigure_seq;
325
326
    /* Cpu mask for pin of pmd threads. */
327
    char *pmd_cmask;
328
329
    uint64_t last_tnl_conf_seq;
330
331
    struct conntrack *conntrack;
332
    struct pmd_auto_lb pmd_alb;
333
334
    /* Bonds. */
335
    struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
336
    struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
337
};
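/* Illustrative sketch, not part of the instrumented source above: a code
 * path that needs several of the locks listed in the acquisition order
 * documented before 'struct dp_netdev' takes them outermost first and
 * releases them in reverse.  The function is hypothetical and only shows
 * the ordering. */
static void
dp_netdev_lock_order_example(struct dp_netdev *dp)
    OVS_EXCLUDED(dp_netdev_mutex)
{
    ovs_mutex_lock(&dp_netdev_mutex);       /* 1. Global datapath map. */
    ovs_rwlock_rdlock(&dp->port_rwlock);    /* 2. Ports. */
    ovs_mutex_lock(&dp->bond_mutex);        /* 3. Bonds. */
    ovs_mutex_lock(&dp->non_pmd_mutex);     /* 4. Non-pmd thread instance. */

    /* ... work that touches ports, tx_bonds and the non-pmd thread ... */

    ovs_mutex_unlock(&dp->non_pmd_mutex);
    ovs_mutex_unlock(&dp->bond_mutex);
    ovs_rwlock_unlock(&dp->port_rwlock);
    ovs_mutex_unlock(&dp_netdev_mutex);
}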
338
339
static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
340
                                                    odp_port_t)
341
    OVS_REQ_RDLOCK(dp->port_rwlock);
342
343
enum rxq_cycles_counter_type {
344
    RXQ_CYCLES_PROC_CURR,       /* Cycles spent successfully polling and
345
                                   processing packets during the current
346
                                   interval. */
347
    RXQ_CYCLES_PROC_HIST,       /* Total cycles of all intervals that are used
348
                                   during rxq to pmd assignment. */
349
    RXQ_N_CYCLES
350
};
351
352
enum dp_offload_type {
353
    DP_OFFLOAD_FLOW,
354
    DP_OFFLOAD_FLUSH,
355
};
356
357
enum {
358
    DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
359
    DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
360
    DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
361
};
362
363
struct dp_offload_flow_item {
364
    struct dp_netdev_flow *flow;
365
    int op;
366
    struct match match;
367
    struct nlattr *actions;
368
    size_t actions_len;
369
    odp_port_t orig_in_port; /* Originating in_port for tnl flows. */
370
};
371
372
struct dp_offload_flush_item {
373
    struct netdev *netdev;
374
    struct ovs_barrier *barrier;
375
};
376
377
union dp_offload_thread_data {
378
    struct dp_offload_flow_item flow;
379
    struct dp_offload_flush_item flush;
380
};
381
382
struct dp_offload_thread_item {
383
    struct mpsc_queue_node node;
384
    enum dp_offload_type type;
385
    long long int timestamp;
386
    struct dp_netdev *dp;
387
    union dp_offload_thread_data data[0];
388
};
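/* Illustrative sketch, not part of the instrumented source above: 'data[0]'
 * is a flexible array member, so an offload item is allocated as one block,
 * the fixed header plus space for whichever union alternative it carries.
 * The allocator name below is hypothetical and simplified. */
static struct dp_offload_thread_item *
dp_offload_item_alloc_example(struct dp_netdev *dp, enum dp_offload_type type)
{
    struct dp_offload_thread_item *item;

    item = xzalloc(sizeof *item + sizeof item->data[0]);
    item->type = type;
    item->dp = dp;
    item->timestamp = time_usec();

    return item;
}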
389
390
struct dp_offload_thread {
391
    PADDED_MEMBERS(CACHE_LINE_SIZE,
392
        struct mpsc_queue queue;
393
        atomic_uint64_t enqueued_item;
394
        struct cmap megaflow_to_mark;
395
        struct cmap mark_to_flow;
396
        struct mov_avg_cma cma;
397
        struct mov_avg_ema ema;
398
    );
399
};
400
static struct dp_offload_thread *dp_offload_threads;
401
static void *dp_netdev_flow_offload_main(void *arg);
402
403
static void
404
dp_netdev_offload_init(void)
405
0
{
406
0
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
407
0
    unsigned int nb_offload_thread = netdev_offload_thread_nb();
408
0
    unsigned int tid;
409
410
0
    if (!ovsthread_once_start(&once)) {
411
0
        return;
412
0
    }
413
414
0
    dp_offload_threads = xcalloc(nb_offload_thread,
415
0
                                 sizeof *dp_offload_threads);
416
417
0
    for (tid = 0; tid < nb_offload_thread; tid++) {
418
0
        struct dp_offload_thread *thread;
419
420
0
        thread = &dp_offload_threads[tid];
421
0
        mpsc_queue_init(&thread->queue);
422
0
        cmap_init(&thread->megaflow_to_mark);
423
0
        cmap_init(&thread->mark_to_flow);
424
0
        atomic_init(&thread->enqueued_item, 0);
425
0
        mov_avg_cma_init(&thread->cma);
426
0
        mov_avg_ema_init(&thread->ema, 100);
427
0
        ovs_thread_create("hw_offload", dp_netdev_flow_offload_main, thread);
428
0
    }
429
430
0
    ovsthread_once_done(&once);
431
0
}
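/* Illustrative sketch, not part of the instrumented source above: once the
 * offload threads created in dp_netdev_offload_init() are running, producers
 * hand work to one of them by pushing onto that thread's MPSC queue and
 * bumping its counter.  How 'tid' is chosen is outside this sketch; it
 * assumes mpsc_queue_insert() from mpsc-queue.h and atomic_add() from
 * ovs-atomic.h, and the function name is hypothetical. */
static void
dp_netdev_offload_enqueue_example(unsigned int tid,
                                  struct dp_offload_thread_item *item)
{
    struct dp_offload_thread *thread = &dp_offload_threads[tid];
    uint64_t orig;

    mpsc_queue_insert(&thread->queue, &item->node);
    atomic_add(&thread->enqueued_item, 1, &orig);
}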
432
433
0
#define XPS_TIMEOUT 500000LL    /* In microseconds. */
434
435
/* Contained by struct dp_netdev_port's 'rxqs' member.  */
436
struct dp_netdev_rxq {
437
    struct dp_netdev_port *port;
438
    struct netdev_rxq *rx;
439
    unsigned core_id;                  /* Core to which this queue should be
440
                                          pinned. OVS_CORE_UNSPEC if the
441
                                          queue doesn't need to be pinned to a
442
                                          particular core. */
443
    atomic_count intrvl_idx;           /* Write index for 'cycles_intrvl'. */
444
    struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this queue. */
445
    bool is_vhost;                     /* Is rxq of a vhost port. */
446
447
    /* Counters of cycles spent successfully polling and processing pkts. */
448
    atomic_ullong cycles[RXQ_N_CYCLES];
449
    /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then
450
       sum them to yield the cycles used for an rxq. */
451
    atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX];
452
};
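/* Illustrative sketch, not part of the instrumented source above: the
 * 'cycles_intrvl' ring is what rxq-to-pmd assignment consumes; the cost of
 * an rxq is the sum of its stored intervals.  A minimal version of that sum,
 * assuming atomic_read_relaxed() from ovs-atomic.h (the helper name is
 * hypothetical): */
static uint64_t
dp_netdev_rxq_sum_intrvl_cycles_example(struct dp_netdev_rxq *rxq)
{
    uint64_t total = 0;

    for (int i = 0; i < PMD_INTERVAL_MAX; i++) {
        uint64_t cycles;

        atomic_read_relaxed(&rxq->cycles_intrvl[i], &cycles);
        total += cycles;
    }
    return total;
}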
453
454
enum txq_req_mode {
455
    TXQ_REQ_MODE_THREAD,
456
    TXQ_REQ_MODE_HASH,
457
};
458
459
enum txq_mode {
460
    TXQ_MODE_STATIC,
461
    TXQ_MODE_XPS,
462
    TXQ_MODE_XPS_HASH,
463
};
464
465
/* A port in a netdev-based datapath. */
466
struct dp_netdev_port {
467
    odp_port_t port_no;
468
    enum txq_mode txq_mode;     /* static, XPS, XPS_HASH. */
469
    bool need_reconfigure;      /* True if we should reconfigure netdev. */
470
    struct netdev *netdev;
471
    struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
472
    struct netdev_saved_flags *sf;
473
    struct dp_netdev_rxq *rxqs;
474
    unsigned n_rxq;             /* Number of elements in 'rxqs' */
475
    unsigned *txq_used;         /* Number of threads that use each tx queue. */
476
    struct ovs_mutex txq_used_mutex;
477
    bool emc_enabled;           /* If true EMC will be used. */
478
    char *type;                 /* Port type as requested by user. */
479
    char *rxq_affinity_list;    /* Requested affinity of rx queues. */
480
    enum txq_req_mode txq_requested_mode;
481
};
482
483
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
484
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
485
                                         struct flow *, bool);
486
487
struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
488
                                                   size_t);
489
struct dp_netdev_actions *dp_netdev_flow_get_actions(
490
    const struct dp_netdev_flow *);
491
static void dp_netdev_actions_free(struct dp_netdev_actions *);
492
493
struct polled_queue {
494
    struct dp_netdev_rxq *rxq;
495
    odp_port_t port_no;
496
    bool emc_enabled;
497
    bool rxq_enabled;
498
    uint64_t change_seq;
499
};
500
501
/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
502
struct rxq_poll {
503
    struct dp_netdev_rxq *rxq;
504
    struct hmap_node node;
505
};
506
507
/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
508
 * 'tnl_port_cache' or 'tx_ports'. */
509
struct tx_port {
510
    struct dp_netdev_port *port;
511
    int qid;
512
    long long last_used;
513
    struct hmap_node node;
514
    long long flush_time;
515
    struct dp_packet_batch output_pkts;
516
    struct dp_packet_batch *txq_pkts; /* Only for hash mode. */
517
    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
518
};
519
520
/* Contained by struct tx_bond 'member_buckets'. */
521
struct member_entry {
522
    odp_port_t member_id;
523
    atomic_ullong n_packets;
524
    atomic_ullong n_bytes;
525
};
526
527
/* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */
528
struct tx_bond {
529
    struct cmap_node node;
530
    uint32_t bond_id;
531
    struct member_entry member_buckets[BOND_BUCKETS];
532
};
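/* Illustrative sketch, not part of the instrumented source above: balanced
 * bond output picks one of the BOND_BUCKETS member buckets from the packet
 * hash and charges the packet statistics to it.  The helper name and the
 * modulo bucket selection are assumptions; the exact accounting in the real
 * code may differ. */
static void
tx_bond_account_example(struct tx_bond *bond, uint32_t hash,
                        uint64_t n_packets, uint64_t n_bytes)
{
    struct member_entry *entry = &bond->member_buckets[hash % BOND_BUCKETS];
    uint64_t orig;

    atomic_read_relaxed(&entry->n_packets, &orig);
    atomic_store_relaxed(&entry->n_packets, orig + n_packets);
    atomic_read_relaxed(&entry->n_bytes, &orig);
    atomic_store_relaxed(&entry->n_bytes, orig + n_bytes);
}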
533
534
/* Interface to netdev-based datapath. */
535
struct dpif_netdev {
536
    struct dpif dpif;
537
    struct dp_netdev *dp;
538
    uint64_t last_port_seq;
539
};
540
541
static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
542
                              struct dp_netdev_port **portp)
543
    OVS_REQ_RDLOCK(dp->port_rwlock);
544
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
545
                            struct dp_netdev_port **portp)
546
    OVS_REQ_RDLOCK(dp->port_rwlock);
547
static void dp_netdev_free(struct dp_netdev *)
548
    OVS_REQUIRES(dp_netdev_mutex);
549
static int do_add_port(struct dp_netdev *dp, const char *devname,
550
                       const char *type, odp_port_t port_no)
551
    OVS_REQ_WRLOCK(dp->port_rwlock);
552
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
553
    OVS_REQ_WRLOCK(dp->port_rwlock);
554
static int dpif_netdev_open(const struct dpif_class *, const char *name,
555
                            bool create, struct dpif **);
556
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
557
                                      struct dp_packet_batch *,
558
                                      bool should_steal,
559
                                      const struct flow *flow,
560
                                      const struct nlattr *actions,
561
                                      size_t actions_len);
562
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
563
                                  struct dp_packet_batch *);
564
565
static void dp_netdev_disable_upcall(struct dp_netdev *);
566
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
567
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
568
                                    struct dp_netdev *dp, unsigned core_id,
569
                                    int numa_id);
570
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
571
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
572
    OVS_REQ_WRLOCK(dp->port_rwlock);
573
574
static void *pmd_thread_main(void *);
575
static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
576
                                                      unsigned core_id);
577
static struct dp_netdev_pmd_thread *
578
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
579
static void dp_netdev_del_pmd(struct dp_netdev *dp,
580
                              struct dp_netdev_pmd_thread *pmd);
581
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
582
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
583
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
584
                                         struct dp_netdev_port *port)
585
    OVS_REQUIRES(pmd->port_mutex);
586
static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
587
                                           struct tx_port *tx)
588
    OVS_REQUIRES(pmd->port_mutex);
589
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
590
                                     struct dp_netdev_rxq *rxq)
591
    OVS_REQUIRES(pmd->port_mutex);
592
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
593
                                       struct rxq_poll *poll)
594
    OVS_REQUIRES(pmd->port_mutex);
595
static int
596
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
597
                                   bool force);
598
static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
599
                                         struct tx_bond *bond, bool update)
600
    OVS_EXCLUDED(pmd->bond_mutex);
601
static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
602
                                           uint32_t bond_id)
603
    OVS_EXCLUDED(pmd->bond_mutex);
604
605
static void dp_netdev_offload_flush(struct dp_netdev *dp,
606
                                    struct dp_netdev_port *port);
607
608
static void reconfigure_datapath(struct dp_netdev *dp)
609
    OVS_REQ_RDLOCK(dp->port_rwlock);
610
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
611
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
612
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
613
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
614
    OVS_REQUIRES(pmd->port_mutex);
615
static inline void
616
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
617
                           struct polled_queue *poll_list, int poll_cnt);
618
static void
619
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
620
                         enum rxq_cycles_counter_type type,
621
                         unsigned long long cycles);
622
static uint64_t
623
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
624
                         enum rxq_cycles_counter_type type);
625
static void
626
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
627
                           unsigned long long cycles);
628
static uint64_t
629
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
630
static uint64_t
631
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
632
                    int num_to_read);
633
static void
634
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
635
                               bool purge);
636
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
637
                                      struct tx_port *tx);
638
inline struct dpcls *
639
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
640
                           odp_port_t in_port);
641
642
static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
643
static inline bool
644
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
645
static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
646
                                  struct dp_netdev_flow *flow);
647
648
static void dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
649
                                          struct dp_netdev_flow *flow)
650
    OVS_REQUIRES(pmd->flow_mutex);
651
static void dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
652
                                          struct dp_netdev_flow *flow)
653
    OVS_REQUIRES(pmd->flow_mutex);
654
655
static bool dp_netdev_flow_is_simple_match(const struct match *);
656
657
/* Updates the time in a PMD thread's context.  Call it in three cases:
658
 *
659
 *     1. PMD structure initialization:
660
 *         - dp_netdev_configure_pmd()
661
 *
662
 *     2. Before processing of the new packet batch:
663
 *         - dpif_netdev_execute()
664
 *         - dp_netdev_process_rxq_port()
665
 *
666
 *     3. At least once per polling iteration in main polling threads if no
667
 *        packets received on current iteration:
668
 *         - dpif_netdev_run()
669
 *         - pmd_thread_main()
670
 *
671
 * 'pmd->ctx.now' should be used without update in all other cases if possible.
672
 */
673
static inline void
674
pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
675
0
{
676
0
    pmd->ctx.now = time_usec();
677
0
}
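/* Illustrative sketch, not part of the instrumented source above: callers
 * refresh the cached time once per batch and then read 'pmd->ctx.now'
 * cheaply for every packet instead of calling time_usec() per packet.  The
 * function is hypothetical and only shows the calling pattern. */
static void
pmd_ctx_time_usage_example(struct dp_netdev_pmd_thread *pmd)
{
    pmd_thread_ctx_time_update(pmd);

    /* ... process one batch; per-packet code reads pmd->ctx.now ... */
}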
678
679
/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
680
bool
681
dpif_is_netdev(const struct dpif *dpif)
682
0
{
683
0
    return dpif->dpif_class->open == dpif_netdev_open;
684
0
}
685
686
static struct dpif_netdev *
687
dpif_netdev_cast(const struct dpif *dpif)
688
0
{
689
0
    ovs_assert(dpif_is_netdev(dpif));
690
0
    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
691
0
}
692
693
static struct dp_netdev *
694
get_dp_netdev(const struct dpif *dpif)
695
0
{
696
0
    return dpif_netdev_cast(dpif)->dp;
697
0
}
698

699
enum pmd_info_type {
700
    PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
701
    PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
702
    PMD_INFO_SHOW_RXQ,    /* Show poll lists of pmd threads. */
703
    PMD_INFO_PERF_SHOW,   /* Show pmd performance details. */
704
};
705
706
static void
707
format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
708
0
{
709
0
    ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
710
0
                        ? "main thread" : "pmd thread");
711
0
    if (pmd->numa_id != OVS_NUMA_UNSPEC) {
712
0
        ds_put_format(reply, " numa_id %d", pmd->numa_id);
713
0
    }
714
0
    if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
715
0
        ds_put_format(reply, " core_id %u", pmd->core_id);
716
0
    }
717
0
    ds_put_cstr(reply, ":\n");
718
0
}
719
720
static void
721
pmd_info_show_stats(struct ds *reply,
722
                    struct dp_netdev_pmd_thread *pmd)
723
0
{
724
0
    uint64_t stats[PMD_N_STATS];
725
0
    uint64_t total_cycles, total_packets;
726
0
    double passes_per_pkt = 0;
727
0
    double lookups_per_hit = 0;
728
0
    double packets_per_batch = 0;
729
730
0
    pmd_perf_read_counters(&pmd->perf_stats, stats);
731
0
    total_cycles = stats[PMD_CYCLES_ITER_IDLE]
732
0
                         + stats[PMD_CYCLES_ITER_BUSY];
733
0
    total_packets = stats[PMD_STAT_RECV];
734
735
0
    format_pmd_thread(reply, pmd);
736
737
0
    if (total_packets > 0) {
738
0
        passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
739
0
                            / (double) total_packets;
740
0
    }
741
0
    if (stats[PMD_STAT_MASKED_HIT] > 0) {
742
0
        lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
743
0
                            / (double) stats[PMD_STAT_MASKED_HIT];
744
0
    }
745
0
    if (stats[PMD_STAT_SENT_BATCHES] > 0) {
746
0
        packets_per_batch = stats[PMD_STAT_SENT_PKTS]
747
0
                            / (double) stats[PMD_STAT_SENT_BATCHES];
748
0
    }
749
750
0
    ds_put_format(reply,
751
0
                  "  packets received: %"PRIu64"\n"
752
0
                  "  packet recirculations: %"PRIu64"\n"
753
0
                  "  avg. datapath passes per packet: %.02f\n"
754
0
                  "  phwol hits: %"PRIu64"\n"
755
0
                  "  mfex opt hits: %"PRIu64"\n"
756
0
                  "  simple match hits: %"PRIu64"\n"
757
0
                  "  emc hits: %"PRIu64"\n"
758
0
                  "  smc hits: %"PRIu64"\n"
759
0
                  "  megaflow hits: %"PRIu64"\n"
760
0
                  "  avg. subtable lookups per megaflow hit: %.02f\n"
761
0
                  "  miss with success upcall: %"PRIu64"\n"
762
0
                  "  miss with failed upcall: %"PRIu64"\n"
763
0
                  "  avg. packets per output batch: %.02f\n",
764
0
                  total_packets, stats[PMD_STAT_RECIRC],
765
0
                  passes_per_pkt, stats[PMD_STAT_PHWOL_HIT],
766
0
                  stats[PMD_STAT_MFEX_OPT_HIT],
767
0
                  stats[PMD_STAT_SIMPLE_HIT],
768
0
                  stats[PMD_STAT_EXACT_HIT],
769
0
                  stats[PMD_STAT_SMC_HIT],
770
0
                  stats[PMD_STAT_MASKED_HIT],
771
0
                  lookups_per_hit, stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
772
0
                  packets_per_batch);
773
774
0
    if (total_cycles == 0) {
775
0
        return;
776
0
    }
777
778
0
    ds_put_format(reply,
779
0
                  "  idle cycles: %"PRIu64" (%.02f%%)\n"
780
0
                  "  processing cycles: %"PRIu64" (%.02f%%)\n",
781
0
                  stats[PMD_CYCLES_ITER_IDLE],
782
0
                  stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
783
0
                  stats[PMD_CYCLES_ITER_BUSY],
784
0
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
785
786
0
    if (total_packets == 0) {
787
0
        return;
788
0
    }
789
790
0
    ds_put_format(reply,
791
0
                  "  avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
792
0
                  total_cycles / (double) total_packets,
793
0
                  total_cycles, total_packets);
794
795
0
    ds_put_format(reply,
796
0
                  "  avg processing cycles per packet: "
797
0
                  "%.02f (%"PRIu64"/%"PRIu64")\n",
798
0
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
799
0
                  stats[PMD_CYCLES_ITER_BUSY], total_packets);
800
0
}
801
802
static void
803
pmd_info_show_perf(struct ds *reply,
804
                   struct dp_netdev_pmd_thread *pmd,
805
                   struct pmd_perf_params *par)
806
0
{
807
0
    if (pmd->core_id != NON_PMD_CORE_ID) {
808
0
        char *time_str =
809
0
                xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
810
0
        long long now = time_msec();
811
0
        double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
812
813
0
        ds_put_cstr(reply, "\n");
814
0
        ds_put_format(reply, "Time: %s\n", time_str);
815
0
        ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
816
0
        ds_put_cstr(reply, "\n");
817
0
        format_pmd_thread(reply, pmd);
818
0
        ds_put_cstr(reply, "\n");
819
0
        pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
820
0
        if (pmd_perf_metrics_enabled(pmd)) {
821
            /* Prevent parallel clearing of perf metrics. */
822
0
            ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
823
0
            if (par->histograms) {
824
0
                ds_put_cstr(reply, "\n");
825
0
                pmd_perf_format_histograms(reply, &pmd->perf_stats);
826
0
            }
827
0
            if (par->iter_hist_len > 0) {
828
0
                ds_put_cstr(reply, "\n");
829
0
                pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
830
0
                        par->iter_hist_len);
831
0
            }
832
0
            if (par->ms_hist_len > 0) {
833
0
                ds_put_cstr(reply, "\n");
834
0
                pmd_perf_format_ms_history(reply, &pmd->perf_stats,
835
0
                        par->ms_hist_len);
836
0
            }
837
0
            ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
838
0
        }
839
0
        free(time_str);
840
0
    }
841
0
}
842
843
static int
844
compare_poll_list(const void *a_, const void *b_)
845
0
{
846
0
    const struct rxq_poll *a = a_;
847
0
    const struct rxq_poll *b = b_;
848
849
0
    const char *namea = netdev_rxq_get_name(a->rxq->rx);
850
0
    const char *nameb = netdev_rxq_get_name(b->rxq->rx);
851
852
0
    int cmp = strcmp(namea, nameb);
853
0
    if (!cmp) {
854
0
        return netdev_rxq_get_queue_id(a->rxq->rx)
855
0
               - netdev_rxq_get_queue_id(b->rxq->rx);
856
0
    } else {
857
0
        return cmp;
858
0
    }
859
0
}
860
861
static void
862
sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
863
                 size_t *n)
864
    OVS_REQUIRES(pmd->port_mutex)
865
0
{
866
0
    struct rxq_poll *ret, *poll;
867
0
    size_t i;
868
869
0
    *n = hmap_count(&pmd->poll_list);
870
0
    if (!*n) {
871
0
        ret = NULL;
872
0
    } else {
873
0
        ret = xcalloc(*n, sizeof *ret);
874
0
        i = 0;
875
0
        HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
876
0
            ret[i] = *poll;
877
0
            i++;
878
0
        }
879
0
        ovs_assert(i == *n);
880
0
        qsort(ret, *n, sizeof *ret, compare_poll_list);
881
0
    }
882
883
0
    *list = ret;
884
0
}
885
886
static void
887
pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd,
888
                  int secs)
889
0
{
890
0
    if (pmd->core_id != NON_PMD_CORE_ID) {
891
0
        struct rxq_poll *list;
892
0
        size_t n_rxq;
893
0
        uint64_t total_pmd_cycles = 0;
894
0
        uint64_t busy_pmd_cycles = 0;
895
0
        uint64_t total_rxq_proc_cycles = 0;
896
0
        unsigned int intervals;
897
898
0
        ds_put_format(reply,
899
0
                      "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
900
0
                      pmd->numa_id, pmd->core_id, (pmd->isolated)
901
0
                                                  ? "true" : "false");
902
903
0
        ovs_mutex_lock(&pmd->port_mutex);
904
0
        sorted_poll_list(pmd, &list, &n_rxq);
905
906
        /* Get the total pmd cycles for an interval. */
907
0
        atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles);
908
        /* Calculate how many intervals are to be used. */
909
0
        intervals = DIV_ROUND_UP(secs,
910
0
                                 PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
911
        /* Estimate the cycles to cover all intervals. */
912
0
        total_pmd_cycles *= intervals;
913
0
        busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl,
914
0
                                              &pmd->intrvl_idx,
915
0
                                              intervals);
916
0
        if (busy_pmd_cycles > total_pmd_cycles) {
917
0
            busy_pmd_cycles = total_pmd_cycles;
918
0
        }
919
920
0
        for (int i = 0; i < n_rxq; i++) {
921
0
            struct dp_netdev_rxq *rxq = list[i].rxq;
922
0
            const char *name = netdev_rxq_get_name(rxq->rx);
923
0
            uint64_t rxq_proc_cycles = 0;
924
925
0
            rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl,
926
0
                                                  &rxq->intrvl_idx,
927
0
                                                  intervals);
928
0
            total_rxq_proc_cycles += rxq_proc_cycles;
929
0
            ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
930
0
                          netdev_rxq_get_queue_id(list[i].rxq->rx));
931
0
            ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
932
0
                                        ? "(enabled) " : "(disabled)");
933
0
            ds_put_format(reply, "  pmd usage: ");
934
0
            if (total_pmd_cycles) {
935
0
                ds_put_format(reply, "%2"PRIu64"",
936
0
                              rxq_proc_cycles * 100 / total_pmd_cycles);
937
0
                ds_put_cstr(reply, " %");
938
0
            } else {
939
0
                ds_put_format(reply, "%s", "NOT AVAIL");
940
0
            }
941
0
            ds_put_cstr(reply, "\n");
942
0
        }
943
944
0
        if (n_rxq > 0) {
945
0
            ds_put_cstr(reply, "  overhead: ");
946
0
            if (total_pmd_cycles) {
947
0
                uint64_t overhead_cycles = 0;
948
949
0
                if (total_rxq_proc_cycles < busy_pmd_cycles) {
950
0
                    overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles;
951
0
                }
952
0
                ds_put_format(reply, "%2"PRIu64" %%",
953
0
                              overhead_cycles * 100 / total_pmd_cycles);
954
0
            } else {
955
0
                ds_put_cstr(reply, "NOT AVAIL");
956
0
            }
957
0
            ds_put_cstr(reply, "\n");
958
0
        }
959
960
0
        ovs_mutex_unlock(&pmd->port_mutex);
961
0
        free(list);
962
0
    }
963
0
}
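/* Illustrative worked example, not part of the instrumented source above:
 * with PMD_INTERVAL_LEN = 5,000,000 us (5 s per stored interval) and
 * PMD_INTERVAL_MAX = 12, at most 60 s of history is available.  A request
 * of "-secs 17" is rounded up to 20 s by the caller and therefore reads
 * 20 / 5 = 4 stored intervals. */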
964
965
static int
966
compare_poll_thread_list(const void *a_, const void *b_)
967
0
{
968
0
    const struct dp_netdev_pmd_thread *a, *b;
969
970
0
    a = *(struct dp_netdev_pmd_thread **)a_;
971
0
    b = *(struct dp_netdev_pmd_thread **)b_;
972
973
0
    if (a->core_id < b->core_id) {
974
0
        return -1;
975
0
    }
976
0
    if (a->core_id > b->core_id) {
977
0
        return 1;
978
0
    }
979
0
    return 0;
980
0
}
981
982
/* Create a sorted list of pmds from the dp->poll_threads cmap. We can use
983
 * this list, as long as we do not go to quiescent state. */
984
static void
985
sorted_poll_thread_list(struct dp_netdev *dp,
986
                        struct dp_netdev_pmd_thread ***list,
987
                        size_t *n)
988
0
{
989
0
    struct dp_netdev_pmd_thread *pmd;
990
0
    struct dp_netdev_pmd_thread **pmd_list;
991
0
    size_t k = 0, n_pmds;
992
993
0
    n_pmds = cmap_count(&dp->poll_threads);
994
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
995
996
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
997
0
        if (k >= n_pmds) {
998
0
            break;
999
0
        }
1000
0
        pmd_list[k++] = pmd;
1001
0
    }
1002
1003
0
    qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1004
1005
0
    *list = pmd_list;
1006
0
    *n = k;
1007
0
}
1008
1009
static void
1010
dpif_netdev_subtable_lookup_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1011
                                const char *argv[] OVS_UNUSED,
1012
                                void *aux OVS_UNUSED)
1013
0
{
1014
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1015
1016
0
    dpcls_impl_print_stats(&reply);
1017
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1018
0
    ds_destroy(&reply);
1019
0
}
1020
1021
static void
1022
dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc OVS_UNUSED,
1023
                                const char *argv[], void *aux OVS_UNUSED)
1024
0
{
1025
    /* This function requires 2 parameters (argv[1] and argv[2]) to execute.
1026
     *   argv[1] is the subtable lookup function name
1027
     *   argv[2] is the priority
1028
     */
1029
0
    const char *func_name = argv[1];
1030
1031
0
    errno = 0;
1032
0
    char *err_char;
1033
0
    uint32_t new_prio = strtoul(argv[2], &err_char, 10);
1034
0
    uint32_t lookup_dpcls_changed = 0;
1035
0
    uint32_t lookup_subtable_changed = 0;
1036
0
    struct shash_node *node;
1037
0
    if (errno != 0 || new_prio > UINT8_MAX) {
1038
0
        unixctl_command_reply_error(conn,
1039
0
            "error converting priority, use integer in range 0-255\n");
1040
0
        return;
1041
0
    }
1042
1043
0
    int32_t err = dpcls_subtable_set_prio(func_name, new_prio);
1044
0
    if (err) {
1045
0
        unixctl_command_reply_error(conn,
1046
0
            "error, subtable lookup function not found\n");
1047
0
        return;
1048
0
    }
1049
1050
0
    ovs_mutex_lock(&dp_netdev_mutex);
1051
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1052
0
        struct dp_netdev *dp = node->data;
1053
1054
        /* Get PMD threads list, required to get DPCLS instances. */
1055
0
        size_t n;
1056
0
        struct dp_netdev_pmd_thread **pmd_list;
1057
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1058
1059
        /* Take the port rwlock, as the HMAP of ports is iterated below. */
1060
0
        ovs_rwlock_rdlock(&dp->port_rwlock);
1061
1062
0
        for (size_t i = 0; i < n; i++) {
1063
0
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1064
0
            if (pmd->core_id == NON_PMD_CORE_ID) {
1065
0
                continue;
1066
0
            }
1067
1068
0
            struct dp_netdev_port *port = NULL;
1069
0
            HMAP_FOR_EACH (port, node, &dp->ports) {
1070
0
                odp_port_t in_port = port->port_no;
1071
0
                struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1072
0
                if (!cls) {
1073
0
                    continue;
1074
0
                }
1075
0
                ovs_mutex_lock(&pmd->flow_mutex);
1076
0
                uint32_t subtbl_changes = dpcls_subtable_lookup_reprobe(cls);
1077
0
                ovs_mutex_unlock(&pmd->flow_mutex);
1078
0
                if (subtbl_changes) {
1079
0
                    lookup_dpcls_changed++;
1080
0
                    lookup_subtable_changed += subtbl_changes;
1081
0
                }
1082
0
            }
1083
0
        }
1084
1085
        /* Release the port rwlock before releasing the dp_netdev mutex. */
1086
0
        ovs_rwlock_unlock(&dp->port_rwlock);
1087
0
        free(pmd_list);
1088
0
    }
1089
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1090
1091
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1092
0
    ds_put_format(&reply,
1093
0
        "Lookup priority change affected %d dpcls ports and %d subtables.\n",
1094
0
        lookup_dpcls_changed, lookup_subtable_changed);
1095
0
    const char *reply_str = ds_cstr(&reply);
1096
0
    unixctl_command_reply(conn, reply_str);
1097
0
    VLOG_INFO("%s", reply_str);
1098
0
    ds_destroy(&reply);
1099
0
}
1100
1101
static void
1102
dpif_netdev_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1103
                     const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
1104
0
{
1105
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1106
0
    struct shash_node *node;
1107
1108
0
    ovs_mutex_lock(&dp_netdev_mutex);
1109
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1110
0
        struct dp_netdev_pmd_thread **pmd_list;
1111
0
        struct dp_netdev *dp = node->data;
1112
0
        size_t n;
1113
1114
        /* Get PMD threads list, required to get the DPIF impl used by each PMD
1115
         * thread. */
1116
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1117
0
        dp_netdev_impl_get(&reply, pmd_list, n);
1118
0
        free(pmd_list);
1119
0
    }
1120
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1121
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1122
0
    ds_destroy(&reply);
1123
0
}
1124
1125
static void
1126
dpif_netdev_impl_set(struct unixctl_conn *conn, int argc OVS_UNUSED,
1127
                     const char *argv[], void *aux OVS_UNUSED)
1128
0
{
1129
    /* This function requires just one parameter, the DPIF name. */
1130
0
    const char *dpif_name = argv[1];
1131
0
    struct shash_node *node;
1132
1133
0
    static const char *error_description[2] = {
1134
0
        "Unknown DPIF implementation",
1135
0
        "CPU doesn't support the required instruction for",
1136
0
    };
1137
1138
0
    ovs_mutex_lock(&dp_netdev_mutex);
1139
0
    int32_t err = dp_netdev_impl_set_default_by_name(dpif_name);
1140
1141
0
    if (err) {
1142
0
        struct ds reply = DS_EMPTY_INITIALIZER;
1143
0
        ds_put_format(&reply, "DPIF implementation not available: %s %s.\n",
1144
0
                      error_description[ (err == -ENOTSUP) ], dpif_name);
1145
0
        const char *reply_str = ds_cstr(&reply);
1146
0
        unixctl_command_reply_error(conn, reply_str);
1147
0
        VLOG_ERR("%s", reply_str);
1148
0
        ds_destroy(&reply);
1149
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1150
0
        return;
1151
0
    }
1152
1153
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1154
0
        struct dp_netdev *dp = node->data;
1155
1156
        /* Get PMD threads list, required to get DPCLS instances. */
1157
0
        size_t n;
1158
0
        struct dp_netdev_pmd_thread **pmd_list;
1159
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1160
1161
0
        for (size_t i = 0; i < n; i++) {
1162
0
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1163
0
            if (pmd->core_id == NON_PMD_CORE_ID) {
1164
0
                continue;
1165
0
            }
1166
1167
            /* Initialize DPIF function pointer to the newly configured
1168
             * default. */
1169
0
            atomic_store_relaxed(&pmd->netdev_input_func,
1170
0
                                 dp_netdev_impl_get_default());
1171
0
        };
1172
1173
0
        free(pmd_list);
1174
0
    }
1175
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1176
1177
    /* Reply with success to command. */
1178
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1179
0
    ds_put_format(&reply, "DPIF implementation set to %s.\n", dpif_name);
1180
0
    const char *reply_str = ds_cstr(&reply);
1181
0
    unixctl_command_reply(conn, reply_str);
1182
0
    VLOG_INFO("%s", reply_str);
1183
0
    ds_destroy(&reply);
1184
0
}
1185
1186
static void
1187
dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1188
                               const char *argv[] OVS_UNUSED,
1189
                               void *aux OVS_UNUSED)
1190
0
{
1191
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1192
0
    struct shash_node *node;
1193
1194
0
    ovs_mutex_lock(&dp_netdev_mutex);
1195
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1196
0
        struct dp_netdev_pmd_thread **pmd_list;
1197
0
        struct dp_netdev *dp = node->data;
1198
0
        size_t n;
1199
1200
        /* Get PMD threads list, required to get the DPIF impl used by each PMD
1201
         * thread. */
1202
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1203
0
        dp_mfex_impl_get(&reply, pmd_list, n);
1204
0
        free(pmd_list);
1205
0
    }
1206
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1207
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1208
0
    ds_destroy(&reply);
1209
0
}
1210
1211
static void
1212
dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, int argc,
1213
                               const char *argv[], void *aux OVS_UNUSED)
1214
0
{
1215
    /* This command takes some optional and mandatory arguments. The function
1216
     * here first parses all of the options, saving results in local variables.
1217
     * Then the parsed values are acted on.
1218
     */
1219
0
    unsigned int pmd_thread_to_change = NON_PMD_CORE_ID;
1220
0
    unsigned int study_count = MFEX_MAX_PKT_COUNT;
1221
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1222
0
    bool pmd_thread_update_done = false;
1223
0
    bool mfex_name_is_study = false;
1224
0
    const char *mfex_name = NULL;
1225
0
    const char *reply_str = NULL;
1226
0
    struct shash_node *node;
1227
0
    int err;
1228
1229
0
    while (argc > 1) {
1230
        /* Optional argument "-pmd" limits the command's actions to just this
1231
         * PMD thread.
1232
         */
1233
0
        if ((!strcmp(argv[1], "-pmd") && !mfex_name)) {
1234
0
            if (argc < 3) {
1235
0
                ds_put_format(&reply,
1236
0
                              "Error: -pmd option requires a thread id"
1237
0
                              " argument.\n");
1238
0
                goto error;
1239
0
            }
1240
1241
            /* Ensure argument can be parsed to an integer. */
1242
0
            if (!str_to_uint(argv[2], 10, &pmd_thread_to_change) ||
1243
0
                (pmd_thread_to_change == NON_PMD_CORE_ID)) {
1244
0
                ds_put_format(&reply,
1245
0
                              "Error: miniflow extract parser not changed,"
1246
0
                              " PMD thread passed is not valid: '%s'."
1247
0
                              " Pass a valid pmd thread ID.\n",
1248
0
                              argv[2]);
1249
0
                goto error;
1250
0
            }
1251
1252
0
            argc -= 2;
1253
0
            argv += 2;
1254
1255
0
        } else if (!mfex_name) {
1256
            /* Name of MFEX impl requested by user. */
1257
0
            mfex_name = argv[1];
1258
0
            mfex_name_is_study = strcmp("study", mfex_name) == 0;
1259
0
            argc -= 1;
1260
0
            argv += 1;
1261
1262
        /* If name is study and more args exist, parse study_count value. */
1263
0
        } else if (mfex_name && mfex_name_is_study) {
1264
0
            if (!str_to_uint(argv[1], 10, &study_count) ||
1265
0
                (study_count == 0)) {
1266
0
                ds_put_format(&reply,
1267
0
                              "Error: invalid study_pkt_cnt value: %s.\n",
1268
0
                              argv[1]);
1269
0
                goto error;
1270
0
            }
1271
1272
0
            argc -= 1;
1273
0
            argv += 1;
1274
0
        } else {
1275
0
            ds_put_format(&reply, "Error: unknown argument %s.\n", argv[1]);
1276
0
            goto error;
1277
0
        }
1278
0
    }
1279
1280
    /* Ensure user passed an MFEX name. */
1281
0
    if (!mfex_name) {
1282
0
        ds_put_format(&reply, "Error: no miniflow extract name provided."
1283
0
                      " Output of miniflow-parser-get shows implementation"
1284
0
                      " list.\n");
1285
0
        goto error;
1286
0
    }
1287
1288
    /* If the MFEX name is "study", set the study packet count. */
1289
0
    if (mfex_name_is_study) {
1290
0
        err = mfex_set_study_pkt_cnt(study_count, mfex_name);
1291
0
        if (err) {
1292
0
            ds_put_format(&reply, "Error: failed to set study count %d for"
1293
0
                          " miniflow extract implementation %s.\n",
1294
0
                          study_count, mfex_name);
1295
0
            goto error;
1296
0
        }
1297
0
    }
1298
1299
    /* Set the default MFEX impl only if the command was applied to all PMD
1300
     * threads. If a PMD thread was selected, do NOT update the default.
1301
     */
1302
0
    if (pmd_thread_to_change == NON_PMD_CORE_ID) {
1303
0
        err = dp_mfex_impl_set_default_by_name(mfex_name);
1304
0
        if (err == -ENODEV) {
1305
0
            ds_put_format(&reply,
1306
0
                          "Error: miniflow extract not available due to CPU"
1307
0
                          " ISA requirements: %s",
1308
0
                          mfex_name);
1309
0
            goto error;
1310
0
        } else if (err) {
1311
0
            ds_put_format(&reply,
1312
0
                          "Error: unknown miniflow extract implementation %s.",
1313
0
                          mfex_name);
1314
0
            goto error;
1315
0
        }
1316
0
    }
1317
1318
    /* Get the desired MFEX function pointer and error check its usage. */
1319
0
    miniflow_extract_func mfex_func = NULL;
1320
0
    err = dp_mfex_impl_get_by_name(mfex_name, &mfex_func);
1321
0
    if (err) {
1322
0
        if (err == -ENODEV) {
1323
0
            ds_put_format(&reply,
1324
0
                          "Error: miniflow extract not available due to CPU"
1325
0
                          " ISA requirements: %s", mfex_name);
1326
0
        } else {
1327
0
            ds_put_format(&reply,
1328
0
                          "Error: unknown miniflow extract implementation %s.",
1329
0
                          mfex_name);
1330
0
        }
1331
0
        goto error;
1332
0
    }
1333
1334
    /* Apply the MFEX pointer to each pmd thread in each netdev, filtering
1335
     * by the user's "-pmd" argument if required.
1336
     */
1337
0
    ovs_mutex_lock(&dp_netdev_mutex);
1338
1339
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1340
0
        struct dp_netdev_pmd_thread **pmd_list;
1341
0
        struct dp_netdev *dp = node->data;
1342
0
        size_t n;
1343
1344
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1345
1346
0
        for (size_t i = 0; i < n; i++) {
1347
0
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1348
0
            if (pmd->core_id == NON_PMD_CORE_ID) {
1349
0
                continue;
1350
0
            }
1351
1352
            /* If -pmd specified, skip all other pmd threads. */
1353
0
            if ((pmd_thread_to_change != NON_PMD_CORE_ID) &&
1354
0
                (pmd->core_id != pmd_thread_to_change)) {
1355
0
                continue;
1356
0
            }
1357
1358
0
            pmd_thread_update_done = true;
1359
0
            atomic_store_relaxed(&pmd->miniflow_extract_opt, mfex_func);
1360
0
        };
1361
1362
0
        free(pmd_list);
1363
0
    }
1364
1365
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1366
1367
    /* If PMD thread was specified, but it wasn't found, return error. */
1368
0
    if (pmd_thread_to_change != NON_PMD_CORE_ID && !pmd_thread_update_done) {
1369
0
        ds_put_format(&reply,
1370
0
                      "Error: miniflow extract parser not changed, "
1371
0
                      "PMD thread %d not in use, pass a valid pmd"
1372
0
                      " thread ID.\n", pmd_thread_to_change);
1373
0
        goto error;
1374
0
    }
1375
1376
    /* Reply with success to command. */
1377
0
    ds_put_format(&reply, "Miniflow extract implementation set to %s",
1378
0
                  mfex_name);
1379
0
    if (pmd_thread_to_change != NON_PMD_CORE_ID) {
1380
0
        ds_put_format(&reply, ", on pmd thread %d", pmd_thread_to_change);
1381
0
    }
1382
0
    if (mfex_name_is_study) {
1383
0
        ds_put_format(&reply, ", studying %d packets", study_count);
1384
0
    }
1385
0
    ds_put_format(&reply, ".\n");
1386
1387
0
    reply_str = ds_cstr(&reply);
1388
0
    VLOG_INFO("%s", reply_str);
1389
0
    unixctl_command_reply(conn, reply_str);
1390
0
    ds_destroy(&reply);
1391
0
    return;
1392
1393
0
error:
1394
0
    reply_str = ds_cstr(&reply);
1395
0
    VLOG_ERR("%s", reply_str);
1396
0
    unixctl_command_reply_error(conn, reply_str);
1397
0
    ds_destroy(&reply);
1398
0
}
1399
1400
static void
1401
dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1402
                          const char *argv[], void *aux OVS_UNUSED)
1403
0
{
1404
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1405
0
    struct dp_netdev *dp = NULL;
1406
1407
0
    ovs_mutex_lock(&dp_netdev_mutex);
1408
1409
0
    if (argc == 2) {
1410
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
1411
0
    } else if (shash_count(&dp_netdevs) == 1) {
1412
        /* There's only one datapath */
1413
0
        dp = shash_first(&dp_netdevs)->data;
1414
0
    }
1415
1416
0
    if (!dp) {
1417
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1418
0
        unixctl_command_reply_error(conn,
1419
0
                                    "please specify an existing datapath");
1420
0
        return;
1421
0
    }
1422
1423
0
    dp_netdev_request_reconfigure(dp);
1424
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1425
0
    ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1426
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1427
0
    ds_destroy(&reply);
1428
0
}
1429
1430
static void
1431
dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1432
                     void *aux)
1433
0
{
1434
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1435
0
    struct dp_netdev_pmd_thread **pmd_list;
1436
0
    struct dp_netdev *dp = NULL;
1437
0
    enum pmd_info_type type = *(enum pmd_info_type *) aux;
1438
0
    unsigned int core_id;
1439
0
    bool filter_on_pmd = false;
1440
0
    size_t n;
1441
0
    unsigned int secs = 0;
1442
0
    unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX)
1443
0
                                      / INTERVAL_USEC_TO_SEC;
1444
0
    bool first_show_rxq = true;
1445
1446
0
    ovs_mutex_lock(&dp_netdev_mutex);
1447
1448
0
    while (argc > 1) {
1449
0
        if (!strcmp(argv[1], "-pmd") && argc > 2) {
1450
0
            if (str_to_uint(argv[2], 10, &core_id)) {
1451
0
                filter_on_pmd = true;
1452
0
            }
1453
0
            argc -= 2;
1454
0
            argv += 2;
1455
0
        } else if (type == PMD_INFO_SHOW_RXQ &&
1456
0
                       !strcmp(argv[1], "-secs") &&
1457
0
                       argc > 2) {
1458
0
            if (!str_to_uint(argv[2], 10, &secs)) {
1459
0
                secs = max_secs;
1460
0
            }
1461
0
            argc -= 2;
1462
0
            argv += 2;
1463
0
        } else {
1464
0
            dp = shash_find_data(&dp_netdevs, argv[1]);
1465
0
            argc -= 1;
1466
0
            argv += 1;
1467
0
        }
1468
0
    }
1469
1470
0
    if (!dp) {
1471
0
        if (shash_count(&dp_netdevs) == 1) {
1472
            /* There's only one datapath. */
1473
0
            dp = shash_first(&dp_netdevs)->data;
1474
0
        } else {
1475
0
            ovs_mutex_unlock(&dp_netdev_mutex);
1476
0
            unixctl_command_reply_error(conn,
1477
0
                                        "please specify an existing datapath");
1478
0
            return;
1479
0
        }
1480
0
    }
1481
1482
0
    sorted_poll_thread_list(dp, &pmd_list, &n);
1483
0
    for (size_t i = 0; i < n; i++) {
1484
0
        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1485
0
        if (!pmd) {
1486
0
            break;
1487
0
        }
1488
0
        if (filter_on_pmd && pmd->core_id != core_id) {
1489
0
            continue;
1490
0
        }
1491
0
        if (type == PMD_INFO_SHOW_RXQ) {
1492
0
            if (first_show_rxq) {
1493
0
                if (!secs || secs > max_secs) {
1494
0
                    secs = max_secs;
1495
0
                } else {
1496
0
                    secs = ROUND_UP(secs,
1497
0
                                    PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
1498
0
                }
1499
0
                ds_put_format(&reply, "Displaying last %u seconds "
1500
0
                              "pmd usage %%\n", secs);
1501
0
                first_show_rxq = false;
1502
0
            }
1503
0
            pmd_info_show_rxq(&reply, pmd, secs);
1504
0
        } else if (type == PMD_INFO_CLEAR_STATS) {
1505
0
            pmd_perf_stats_clear(&pmd->perf_stats);
1506
0
        } else if (type == PMD_INFO_SHOW_STATS) {
1507
0
            pmd_info_show_stats(&reply, pmd);
1508
0
        } else if (type == PMD_INFO_PERF_SHOW) {
1509
0
            pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1510
0
        }
1511
0
    }
1512
0
    free(pmd_list);
1513
1514
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1515
1516
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1517
0
    ds_destroy(&reply);
1518
0
}
1519
1520
static void
1521
pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1522
                          const char *argv[],
1523
                          void *aux OVS_UNUSED)
1524
0
{
1525
0
    struct pmd_perf_params par;
1526
0
    long int it_hist = 0, ms_hist = 0;
1527
0
    par.histograms = true;
1528
1529
0
    while (argc > 1) {
1530
0
        if (!strcmp(argv[1], "-nh")) {
1531
0
            par.histograms = false;
1532
0
            argc -= 1;
1533
0
            argv += 1;
1534
0
        } else if (!strcmp(argv[1], "-it") && argc > 2) {
1535
0
            it_hist = strtol(argv[2], NULL, 10);
1536
0
            if (it_hist < 0) {
1537
0
                it_hist = 0;
1538
0
            } else if (it_hist > HISTORY_LEN) {
1539
0
                it_hist = HISTORY_LEN;
1540
0
            }
1541
0
            argc -= 2;
1542
0
            argv += 2;
1543
0
        } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1544
0
            ms_hist = strtol(argv[2], NULL, 10);
1545
0
            if (ms_hist < 0) {
1546
0
                ms_hist = 0;
1547
0
            } else if (ms_hist > HISTORY_LEN) {
1548
0
                ms_hist = HISTORY_LEN;
1549
0
            }
1550
0
            argc -= 2;
1551
0
            argv += 2;
1552
0
        } else {
1553
0
            break;
1554
0
        }
1555
0
    }
1556
0
    par.iter_hist_len = it_hist;
1557
0
    par.ms_hist_len = ms_hist;
1558
0
    par.command_type = PMD_INFO_PERF_SHOW;
1559
0
    dpif_netdev_pmd_info(conn, argc, argv, &par);
1560
0
}
1561
1562
static void
1563
dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
1564
                      const char *argv[], void *aux OVS_UNUSED)
1565
0
{
1566
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1567
0
    struct dp_netdev *dp = NULL;
1568
1569
0
    ovs_mutex_lock(&dp_netdev_mutex);
1570
0
    if (argc == 2) {
1571
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
1572
0
    } else if (shash_count(&dp_netdevs) == 1) {
1573
        /* There's only one datapath. */
1574
0
        dp = shash_first(&dp_netdevs)->data;
1575
0
    }
1576
0
    if (!dp) {
1577
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1578
0
        unixctl_command_reply_error(conn,
1579
0
                                    "please specify an existing datapath");
1580
0
        return;
1581
0
    }
1582
1583
0
    if (cmap_count(&dp->tx_bonds) > 0) {
1584
0
        struct tx_bond *dp_bond_entry;
1585
1586
0
        ds_put_cstr(&reply, "Bonds:\n");
1587
0
        CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
1588
0
            ds_put_format(&reply, "  bond-id %"PRIu32":\n",
1589
0
                          dp_bond_entry->bond_id);
1590
0
            for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
1591
0
                uint32_t member_id = odp_to_u32(
1592
0
                    dp_bond_entry->member_buckets[bucket].member_id);
1593
0
                ds_put_format(&reply,
1594
0
                              "    bucket %d - member %"PRIu32"\n",
1595
0
                              bucket, member_id);
1596
0
            }
1597
0
        }
1598
0
    }
1599
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1600
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1601
0
    ds_destroy(&reply);
1602
0
}
1603
1604

1605
static int
1606
dpif_netdev_init(void)
1607
0
{
1608
0
    static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1609
0
                              clear_aux = PMD_INFO_CLEAR_STATS,
1610
0
                              poll_aux = PMD_INFO_SHOW_RXQ;
1611
1612
0
    unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1613
0
                             0, 3, dpif_netdev_pmd_info,
1614
0
                             (void *)&show_aux);
1615
0
    unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1616
0
                             0, 3, dpif_netdev_pmd_info,
1617
0
                             (void *)&clear_aux);
1618
0
    unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] "
1619
0
                             "[-secs secs] [dp]",
1620
0
                             0, 5, dpif_netdev_pmd_info,
1621
0
                             (void *)&poll_aux);
1622
0
    unixctl_command_register("dpif-netdev/pmd-perf-show",
1623
0
                             "[-nh] [-it iter-history-len]"
1624
0
                             " [-ms ms-history-len]"
1625
0
                             " [-pmd core] [dp]",
1626
0
                             0, 8, pmd_perf_show_cmd,
1627
0
                             NULL);
1628
0
    unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1629
0
                             0, 1, dpif_netdev_pmd_rebalance,
1630
0
                             NULL);
1631
0
    unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1632
0
                             "on|off [-b before] [-a after] [-e|-ne] "
1633
0
                             "[-us usec] [-q qlen]",
1634
0
                             0, 10, pmd_perf_log_set_cmd,
1635
0
                             NULL);
1636
0
    unixctl_command_register("dpif-netdev/bond-show", "[dp]",
1637
0
                             0, 1, dpif_netdev_bond_show,
1638
0
                             NULL);
1639
0
    unixctl_command_register("dpif-netdev/subtable-lookup-prio-set",
1640
0
                             "[lookup_func] [prio]",
1641
0
                             2, 2, dpif_netdev_subtable_lookup_set,
1642
0
                             NULL);
1643
0
    unixctl_command_register("dpif-netdev/subtable-lookup-info-get", "",
1644
0
                             0, 0, dpif_netdev_subtable_lookup_get,
1645
0
                             NULL);
1646
0
    unixctl_command_register("dpif-netdev/subtable-lookup-prio-get", NULL,
1647
0
                             0, 0, dpif_netdev_subtable_lookup_get,
1648
0
                             NULL);
1649
0
    unixctl_command_register("dpif-netdev/dpif-impl-set",
1650
0
                             "dpif_implementation_name",
1651
0
                             1, 1, dpif_netdev_impl_set,
1652
0
                             NULL);
1653
0
    unixctl_command_register("dpif-netdev/dpif-impl-get", "",
1654
0
                             0, 0, dpif_netdev_impl_get,
1655
0
                             NULL);
1656
0
    unixctl_command_register("dpif-netdev/miniflow-parser-set",
1657
0
                             "[-pmd core] miniflow_implementation_name"
1658
0
                             " [study_pkt_cnt]",
1659
0
                             1, 5, dpif_miniflow_extract_impl_set,
1660
0
                             NULL);
1661
0
    unixctl_command_register("dpif-netdev/miniflow-parser-get", "",
1662
0
                             0, 0, dpif_miniflow_extract_impl_get,
1663
0
                             NULL);
1664
0
    return 0;
1665
0
}
1666
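
The handlers registered above are reached through 'ovs-appctl' on a running switch, and they all share the same callback contract. As a hedged illustration of that contract, the sketch below registers a hypothetical "dpif-netdev/example-echo" command; the command name and handler are made up for illustration and are not part of OVS, but the unixctl and dynamic-string calls mirror the ones used in dpif_netdev_init() above.

/* Illustrative sketch only: a hypothetical "dpif-netdev/example-echo"
 * command.  Nothing here is an existing OVS command. */
#include "openvswitch/dynamic-string.h"
#include "unixctl.h"
#include "util.h"

static void
example_echo_cmd(struct unixctl_conn *conn, int argc,
                 const char *argv[], void *aux OVS_UNUSED)
{
    struct ds reply = DS_EMPTY_INITIALIZER;

    /* Echo back the argument count and the first argument, if any. */
    ds_put_format(&reply, "argc=%d first_arg=%s\n",
                  argc, argc > 1 ? argv[1] : "(none)");
    unixctl_command_reply(conn, ds_cstr(&reply));
    ds_destroy(&reply);
}

static void
example_register(void)
{
    /* Usage string "[arg]", 0 to 1 arguments accepted, no aux pointer. */
    unixctl_command_register("dpif-netdev/example-echo", "[arg]",
                             0, 1, example_echo_cmd, NULL);
}
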
1667
static int
1668
dpif_netdev_enumerate(struct sset *all_dps,
1669
                      const struct dpif_class *dpif_class)
1670
0
{
1671
0
    struct shash_node *node;
1672
1673
0
    ovs_mutex_lock(&dp_netdev_mutex);
1674
0
    SHASH_FOR_EACH(node, &dp_netdevs) {
1675
0
        struct dp_netdev *dp = node->data;
1676
0
        if (dpif_class != dp->class) {
1677
            /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1678
             * If the class doesn't match, skip this dpif. */
1679
0
             continue;
1680
0
        }
1681
0
        sset_add(all_dps, node->name);
1682
0
    }
1683
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1684
1685
0
    return 0;
1686
0
}
1687
1688
static bool
1689
dpif_netdev_class_is_dummy(const struct dpif_class *class)
1690
0
{
1691
0
    return class != &dpif_netdev_class;
1692
0
}
1693
1694
static const char *
1695
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1696
0
{
1697
0
    return strcmp(type, "internal") ? type
1698
0
                  : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1699
0
                  : "tap";
1700
0
}
1701
1702
static struct dpif *
1703
create_dpif_netdev(struct dp_netdev *dp)
1704
0
{
1705
0
    uint16_t netflow_id = hash_string(dp->name, 0);
1706
0
    struct dpif_netdev *dpif;
1707
1708
0
    ovs_refcount_ref(&dp->ref_cnt);
1709
1710
0
    dpif = xmalloc(sizeof *dpif);
1711
0
    dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1712
0
    dpif->dp = dp;
1713
0
    dpif->last_port_seq = seq_read(dp->port_seq);
1714
1715
0
    return &dpif->dpif;
1716
0
}
1717
1718
/* Choose an unused, non-zero port number and return it on success.
1719
 * Return ODPP_NONE on failure. */
1720
static odp_port_t
1721
choose_port(struct dp_netdev *dp, const char *name)
1722
    OVS_REQ_RDLOCK(dp->port_rwlock)
1723
0
{
1724
0
    uint32_t port_no;
1725
1726
0
    if (dp->class != &dpif_netdev_class) {
1727
0
        const char *p;
1728
0
        int start_no = 0;
1729
1730
        /* If the port name begins with "br", start the number search at
1731
         * 100 to make writing tests easier. */
1732
0
        if (!strncmp(name, "br", 2)) {
1733
0
            start_no = 100;
1734
0
        }
1735
1736
        /* If the port name contains a number, try to assign that port number.
1737
         * This can make writing unit tests easier because port numbers are
1738
         * predictable. */
1739
0
        for (p = name; *p != '\0'; p++) {
1740
0
            if (isdigit((unsigned char) *p)) {
1741
0
                port_no = start_no + strtol(p, NULL, 10);
1742
0
                if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1743
0
                    && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1744
0
                    return u32_to_odp(port_no);
1745
0
                }
1746
0
                break;
1747
0
            }
1748
0
        }
1749
0
    }
1750
1751
0
    for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1752
0
        if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1753
0
            return u32_to_odp(port_no);
1754
0
        }
1755
0
    }
1756
1757
0
    return ODPP_NONE;
1758
0
}
1759
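
As a hedged illustration of the name-to-number heuristic in choose_port() above, the standalone sketch below shows which numbers the digit scan would propose. It is a simplified assumption: there is no datapath lookup or collision handling, and the helper name is made up.

/* Standalone illustration of the digit-scanning step in choose_port():
 * "br10" -> 110 (names starting with "br" add a base of 100),
 * "dpdk3" -> 3, "vhost" -> 0 (no digits: fall back to a sequential scan). */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long
guess_port_no(const char *name)
{
    long base = !strncmp(name, "br", 2) ? 100 : 0;
    const char *p;

    for (p = name; *p != '\0'; p++) {
        if (isdigit((unsigned char) *p)) {
            return base + strtol(p, NULL, 10);
        }
    }
    return 0;   /* No digits: the caller would scan for a free number. */
}

int main(void)
{
    printf("%ld %ld %ld\n",
           guess_port_no("br10"), guess_port_no("dpdk3"),
           guess_port_no("vhost"));
    return 0;
}
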
1760
static uint32_t
1761
dp_meter_hash(uint32_t meter_id)
1762
0
{
1763
    /* The ofproto-dpif layer allocates meter ids from an id-pool in
1764
     * order (e.g. 1, 2, ..., N), which already gives a good hash
1765
     * distribution.  Use the ids directly instead of a hash_xxx()
1766
     * function to avoid the extra cost. */
1767
0
    return meter_id;
1768
0
}
1769
1770
static void
1771
dp_netdev_meter_destroy(struct dp_netdev *dp)
1772
0
{
1773
0
    struct dp_meter *m;
1774
1775
0
    ovs_mutex_lock(&dp->meters_lock);
1776
0
    CMAP_FOR_EACH (m, node, &dp->meters) {
1777
0
        cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id));
1778
0
        ovsrcu_postpone(free, m);
1779
0
    }
1780
1781
0
    cmap_destroy(&dp->meters);
1782
0
    ovs_mutex_unlock(&dp->meters_lock);
1783
0
    ovs_mutex_destroy(&dp->meters_lock);
1784
0
}
1785
1786
static struct dp_meter *
1787
dp_meter_lookup(struct cmap *meters, uint32_t meter_id)
1788
0
{
1789
0
    uint32_t hash = dp_meter_hash(meter_id);
1790
0
    struct dp_meter *m;
1791
1792
0
    CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) {
1793
0
        if (m->id == meter_id) {
1794
0
            return m;
1795
0
        }
1796
0
    }
1797
1798
0
    return NULL;
1799
0
}
1800
1801
static void
1802
dp_meter_detach_free(struct cmap *meters, uint32_t meter_id)
1803
0
{
1804
0
    struct dp_meter *m = dp_meter_lookup(meters, meter_id);
1805
1806
0
    if (m) {
1807
0
        cmap_remove(meters, &m->node, dp_meter_hash(meter_id));
1808
0
        ovsrcu_postpone(free, m);
1809
0
    }
1810
0
}
1811
1812
static void
1813
dp_meter_attach(struct cmap *meters, struct dp_meter *meter)
1814
0
{
1815
0
    cmap_insert(meters, &meter->node, dp_meter_hash(meter->id));
1816
0
}
1817
1818
static int
1819
create_dp_netdev(const char *name, const struct dpif_class *class,
1820
                 struct dp_netdev **dpp)
1821
    OVS_REQUIRES(dp_netdev_mutex)
1822
0
{
1823
0
    static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
1824
0
    struct dp_netdev *dp;
1825
0
    int error;
1826
1827
    /* Avoid estimating TSC frequency for dummy datapath to not slow down
1828
     * unit tests. */
1829
0
    if (!dpif_netdev_class_is_dummy(class)
1830
0
        && ovsthread_once_start(&tsc_freq_check)) {
1831
0
        pmd_perf_estimate_tsc_frequency();
1832
0
        ovsthread_once_done(&tsc_freq_check);
1833
0
    }
1834
1835
0
    dp = xzalloc(sizeof *dp);
1836
0
    shash_add(&dp_netdevs, name, dp);
1837
1838
0
    *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1839
0
    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1840
0
    ovs_refcount_init(&dp->ref_cnt);
1841
0
    atomic_flag_clear(&dp->destroyed);
1842
1843
0
    ovs_rwlock_init(&dp->port_rwlock);
1844
0
    hmap_init(&dp->ports);
1845
0
    dp->port_seq = seq_create();
1846
0
    ovs_mutex_init(&dp->bond_mutex);
1847
0
    cmap_init(&dp->tx_bonds);
1848
1849
0
    fat_rwlock_init(&dp->upcall_rwlock);
1850
1851
0
    dp->reconfigure_seq = seq_create();
1852
0
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1853
1854
    /* Init meter resources. */
1855
0
    cmap_init(&dp->meters);
1856
0
    ovs_mutex_init(&dp->meters_lock);
1857
1858
    /* Disable upcalls by default. */
1859
0
    dp_netdev_disable_upcall(dp);
1860
0
    dp->upcall_aux = NULL;
1861
0
    dp->upcall_cb = NULL;
1862
1863
0
    dp->conntrack = conntrack_init();
1864
1865
0
    dpif_miniflow_extract_init();
1866
1867
0
    atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1868
0
    atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1869
1870
0
    cmap_init(&dp->poll_threads);
1871
0
    dp->pmd_rxq_assign_type = SCHED_CYCLES;
1872
1873
0
    ovs_mutex_init(&dp->tx_qid_pool_mutex);
1874
    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1875
0
    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1876
1877
0
    ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1878
0
    ovsthread_key_create(&dp->per_pmd_key, NULL);
1879
1880
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1881
    /* The non-PMD thread will be created before all other threads and will
1882
     * allocate static_tx_qid = 0. */
1883
0
    dp_netdev_set_nonpmd(dp);
1884
1885
0
    error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1886
0
                                                             "internal"),
1887
0
                        ODPP_LOCAL);
1888
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1889
0
    if (error) {
1890
0
        dp_netdev_free(dp);
1891
0
        return error;
1892
0
    }
1893
1894
0
    dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1895
0
    *dpp = dp;
1896
0
    return 0;
1897
0
}
1898
1899
static void
1900
dp_netdev_request_reconfigure(struct dp_netdev *dp)
1901
0
{
1902
0
    seq_change(dp->reconfigure_seq);
1903
0
}
1904
1905
static bool
1906
dp_netdev_is_reconf_required(struct dp_netdev *dp)
1907
0
{
1908
0
    return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1909
0
}
1910
1911
static int
1912
dpif_netdev_open(const struct dpif_class *class, const char *name,
1913
                 bool create, struct dpif **dpifp)
1914
0
{
1915
0
    struct dp_netdev *dp;
1916
0
    int error;
1917
1918
0
    ovs_mutex_lock(&dp_netdev_mutex);
1919
0
    dp = shash_find_data(&dp_netdevs, name);
1920
0
    if (!dp) {
1921
0
        error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1922
0
    } else {
1923
0
        error = (dp->class != class ? EINVAL
1924
0
                 : create ? EEXIST
1925
0
                 : 0);
1926
0
    }
1927
0
    if (!error) {
1928
0
        *dpifp = create_dpif_netdev(dp);
1929
0
    }
1930
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1931
1932
0
    return error;
1933
0
}
1934
1935
static void
1936
dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1937
    OVS_NO_THREAD_SAFETY_ANALYSIS
1938
0
{
1939
    /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1940
0
    ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1941
1942
    /* Before freeing a lock we should release it */
1943
0
    fat_rwlock_unlock(&dp->upcall_rwlock);
1944
0
    fat_rwlock_destroy(&dp->upcall_rwlock);
1945
0
}
1946
1947
static uint32_t
1948
hash_bond_id(uint32_t bond_id)
1949
0
{
1950
0
    return hash_int(bond_id, 0);
1951
0
}
1952
1953
/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1954
 * through the 'dp_netdevs' shash while freeing 'dp'. */
1955
static void
1956
dp_netdev_free(struct dp_netdev *dp)
1957
    OVS_REQUIRES(dp_netdev_mutex)
1958
0
{
1959
0
    struct dp_netdev_port *port;
1960
0
    struct tx_bond *bond;
1961
1962
0
    shash_find_and_delete(&dp_netdevs, dp->name);
1963
1964
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1965
0
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
1966
0
        do_del_port(dp, port);
1967
0
    }
1968
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1969
1970
0
    ovs_mutex_lock(&dp->bond_mutex);
1971
0
    CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
1972
0
        cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
1973
0
        ovsrcu_postpone(free, bond);
1974
0
    }
1975
0
    ovs_mutex_unlock(&dp->bond_mutex);
1976
1977
0
    dp_netdev_destroy_all_pmds(dp, true);
1978
0
    cmap_destroy(&dp->poll_threads);
1979
1980
0
    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1981
0
    id_pool_destroy(dp->tx_qid_pool);
1982
1983
0
    ovs_mutex_destroy(&dp->non_pmd_mutex);
1984
0
    ovsthread_key_delete(dp->per_pmd_key);
1985
1986
0
    conntrack_destroy(dp->conntrack);
1987
1988
1989
0
    seq_destroy(dp->reconfigure_seq);
1990
1991
0
    seq_destroy(dp->port_seq);
1992
0
    hmap_destroy(&dp->ports);
1993
0
    ovs_rwlock_destroy(&dp->port_rwlock);
1994
1995
0
    cmap_destroy(&dp->tx_bonds);
1996
0
    ovs_mutex_destroy(&dp->bond_mutex);
1997
1998
    /* Upcalls must be disabled at this point */
1999
0
    dp_netdev_destroy_upcall_lock(dp);
2000
2001
0
    dp_netdev_meter_destroy(dp);
2002
2003
0
    free(dp->pmd_cmask);
2004
0
    free(CONST_CAST(char *, dp->name));
2005
0
    free(dp);
2006
0
}
2007
2008
static void
2009
dp_netdev_unref(struct dp_netdev *dp)
2010
0
{
2011
0
    if (dp) {
2012
        /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
2013
         * get a new reference to 'dp' through the 'dp_netdevs' shash. */
2014
0
        ovs_mutex_lock(&dp_netdev_mutex);
2015
0
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
2016
0
            dp_netdev_free(dp);
2017
0
        }
2018
0
        ovs_mutex_unlock(&dp_netdev_mutex);
2019
0
    }
2020
0
}
2021
2022
static void
2023
dpif_netdev_close(struct dpif *dpif)
2024
0
{
2025
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2026
2027
0
    dp_netdev_unref(dp);
2028
0
    free(dpif);
2029
0
}
2030
2031
static int
2032
dpif_netdev_destroy(struct dpif *dpif)
2033
0
{
2034
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2035
2036
0
    if (!atomic_flag_test_and_set(&dp->destroyed)) {
2037
0
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
2038
            /* Can't happen: 'dpif' still owns a reference to 'dp'. */
2039
0
            OVS_NOT_REACHED();
2040
0
        }
2041
0
    }
2042
2043
0
    return 0;
2044
0
}
2045
2046
/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
2047
 * load/store semantics.  While the increment is not atomic, the load and
2048
 * store operations are, making it impossible to read inconsistent values.
2049
 *
2050
 * This is used to update thread-local stats counters. */
2051
static void
2052
non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
2053
0
{
2054
0
    unsigned long long tmp;
2055
2056
0
    atomic_read_relaxed(var, &tmp);
2057
0
    tmp += n;
2058
0
    atomic_store_relaxed(var, tmp);
2059
0
}
2060
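
The comment and helper above rely on a single-writer pattern: only the owning thread updates a given counter, so the read-modify-write need not be atomic as long as the load and store individually are, and readers never observe a torn value. Below is a minimal standalone sketch of the same idea; it assumes plain C11 stdatomic rather than the OVS atomic wrappers.

/* Standalone sketch of the single-writer counter pattern described above:
 * the writer does a relaxed load, a plain add, and a relaxed store;
 * readers do a relaxed load.  Safe only because exactly one thread ever
 * writes a given counter. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ullong counter;   /* Zero-initialized at static storage. */

static void
counter_add(unsigned long long n)
{
    unsigned long long tmp = atomic_load_explicit(&counter,
                                                  memory_order_relaxed);
    atomic_store_explicit(&counter, tmp + n, memory_order_relaxed);
}

int main(void)
{
    counter_add(5);
    counter_add(7);
    printf("%llu\n",
           atomic_load_explicit(&counter, memory_order_relaxed));
    return 0;
}
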
2061
static int
2062
dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
2063
0
{
2064
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2065
0
    struct dp_netdev_pmd_thread *pmd;
2066
0
    uint64_t pmd_stats[PMD_N_STATS];
2067
2068
0
    stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
2069
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2070
0
        stats->n_flows += cmap_count(&pmd->flow_table);
2071
0
        pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
2072
0
        stats->n_hit += pmd_stats[PMD_STAT_PHWOL_HIT];
2073
0
        stats->n_hit += pmd_stats[PMD_STAT_SIMPLE_HIT];
2074
0
        stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
2075
0
        stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
2076
0
        stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
2077
0
        stats->n_missed += pmd_stats[PMD_STAT_MISS];
2078
0
        stats->n_lost += pmd_stats[PMD_STAT_LOST];
2079
0
    }
2080
0
    stats->n_masks = UINT32_MAX;
2081
0
    stats->n_mask_hit = UINT64_MAX;
2082
0
    stats->n_cache_hit = UINT64_MAX;
2083
2084
0
    return 0;
2085
0
}
2086
2087
static void
2088
dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
2089
0
{
2090
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
2091
0
        ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
2092
0
        ovs_mutex_lock(&pmd->port_mutex);
2093
0
        pmd_load_cached_ports(pmd);
2094
0
        ovs_mutex_unlock(&pmd->port_mutex);
2095
0
        ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
2096
0
        return;
2097
0
    }
2098
2099
0
    seq_change(pmd->reload_seq);
2100
0
    atomic_store_explicit(&pmd->reload, true, memory_order_release);
2101
0
}
2102
2103
static uint32_t
2104
hash_port_no(odp_port_t port_no)
2105
0
{
2106
0
    return hash_int(odp_to_u32(port_no), 0);
2107
0
}
2108
2109
static int
2110
port_create(const char *devname, const char *type,
2111
            odp_port_t port_no, struct dp_netdev_port **portp)
2112
0
{
2113
0
    struct dp_netdev_port *port;
2114
0
    enum netdev_flags flags;
2115
0
    struct netdev *netdev;
2116
0
    int error;
2117
2118
0
    *portp = NULL;
2119
2120
    /* Open and validate network device. */
2121
0
    error = netdev_open(devname, type, &netdev);
2122
0
    if (error) {
2123
0
        return error;
2124
0
    }
2125
    /* XXX reject non-Ethernet devices */
2126
2127
0
    netdev_get_flags(netdev, &flags);
2128
0
    if (flags & NETDEV_LOOPBACK) {
2129
0
        VLOG_ERR("%s: cannot add a loopback device", devname);
2130
0
        error = EINVAL;
2131
0
        goto out;
2132
0
    }
2133
2134
0
    port = xzalloc(sizeof *port);
2135
0
    port->port_no = port_no;
2136
0
    port->netdev = netdev;
2137
0
    port->type = xstrdup(type);
2138
0
    port->sf = NULL;
2139
0
    port->emc_enabled = true;
2140
0
    port->need_reconfigure = true;
2141
0
    ovs_mutex_init(&port->txq_used_mutex);
2142
2143
0
    *portp = port;
2144
2145
0
    return 0;
2146
2147
0
out:
2148
0
    netdev_close(netdev);
2149
0
    return error;
2150
0
}
2151
2152
static int
2153
do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
2154
            odp_port_t port_no)
2155
    OVS_REQ_WRLOCK(dp->port_rwlock)
2156
0
{
2157
0
    struct netdev_saved_flags *sf;
2158
0
    struct dp_netdev_port *port;
2159
0
    int error;
2160
2161
    /* Reject devices already in 'dp'. */
2162
0
    if (!get_port_by_name(dp, devname, &port)) {
2163
0
        return EEXIST;
2164
0
    }
2165
2166
0
    error = port_create(devname, type, port_no, &port);
2167
0
    if (error) {
2168
0
        return error;
2169
0
    }
2170
2171
0
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
2172
0
    seq_change(dp->port_seq);
2173
2174
0
    reconfigure_datapath(dp);
2175
2176
    /* Check that port was successfully configured. */
2177
0
    if (!dp_netdev_lookup_port(dp, port_no)) {
2178
0
        return EINVAL;
2179
0
    }
2180
2181
    /* Updating device flags triggers an if_notifier, which triggers a bridge
2182
     * reconfiguration and another attempt to add this port, leading to an
2183
     * infinite loop if the device is configured incorrectly and cannot be
2184
     * added.  Set the promisc mode only after a successful reconfiguration,
2185
     * since at that point we know the device is properly configured. */
2186
0
    error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
2187
0
    if (error) {
2188
0
        VLOG_ERR("%s: cannot set promisc flag", devname);
2189
0
        do_del_port(dp, port);
2190
0
        return error;
2191
0
    }
2192
0
    port->sf = sf;
2193
2194
0
    return 0;
2195
0
}
2196
2197
static int
2198
dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
2199
                     odp_port_t *port_nop)
2200
0
{
2201
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2202
0
    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
2203
0
    const char *dpif_port;
2204
0
    odp_port_t port_no;
2205
0
    int error;
2206
2207
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
2208
0
    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
2209
0
    if (*port_nop != ODPP_NONE) {
2210
0
        port_no = *port_nop;
2211
0
        error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
2212
0
    } else {
2213
0
        port_no = choose_port(dp, dpif_port);
2214
0
        error = port_no == ODPP_NONE ? EFBIG : 0;
2215
0
    }
2216
0
    if (!error) {
2217
0
        *port_nop = port_no;
2218
0
        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
2219
0
    }
2220
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2221
2222
0
    return error;
2223
0
}
2224
2225
static int
2226
dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
2227
0
{
2228
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2229
0
    int error;
2230
2231
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
2232
0
    if (port_no == ODPP_LOCAL) {
2233
0
        error = EINVAL;
2234
0
    } else {
2235
0
        struct dp_netdev_port *port;
2236
2237
0
        error = get_port_by_number(dp, port_no, &port);
2238
0
        if (!error) {
2239
0
            do_del_port(dp, port);
2240
0
        }
2241
0
    }
2242
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2243
2244
0
    return error;
2245
0
}
2246
2247
static bool
2248
is_valid_port_number(odp_port_t port_no)
2249
0
{
2250
0
    return port_no != ODPP_NONE;
2251
0
}
2252
2253
static struct dp_netdev_port *
2254
dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
2255
    OVS_REQ_RDLOCK(dp->port_rwlock)
2256
0
{
2257
0
    struct dp_netdev_port *port;
2258
2259
0
    HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
2260
0
        if (port->port_no == port_no) {
2261
0
            return port;
2262
0
        }
2263
0
    }
2264
0
    return NULL;
2265
0
}
2266
2267
static int
2268
get_port_by_number(struct dp_netdev *dp,
2269
                   odp_port_t port_no, struct dp_netdev_port **portp)
2270
    OVS_REQ_RDLOCK(dp->port_rwlock)
2271
0
{
2272
0
    if (!is_valid_port_number(port_no)) {
2273
0
        *portp = NULL;
2274
0
        return EINVAL;
2275
0
    } else {
2276
0
        *portp = dp_netdev_lookup_port(dp, port_no);
2277
0
        return *portp ? 0 : ENODEV;
2278
0
    }
2279
0
}
2280
2281
static void
2282
port_destroy(struct dp_netdev_port *port)
2283
0
{
2284
0
    if (!port) {
2285
0
        return;
2286
0
    }
2287
2288
0
    netdev_close(port->netdev);
2289
0
    netdev_restore_flags(port->sf);
2290
2291
0
    for (unsigned i = 0; i < port->n_rxq; i++) {
2292
0
        netdev_rxq_close(port->rxqs[i].rx);
2293
0
    }
2294
0
    ovs_mutex_destroy(&port->txq_used_mutex);
2295
0
    free(port->rxq_affinity_list);
2296
0
    free(port->txq_used);
2297
0
    free(port->rxqs);
2298
0
    free(port->type);
2299
0
    free(port);
2300
0
}
2301
2302
static int
2303
get_port_by_name(struct dp_netdev *dp,
2304
                 const char *devname, struct dp_netdev_port **portp)
2305
    OVS_REQ_RDLOCK(dp->port_rwlock)
2306
0
{
2307
0
    struct dp_netdev_port *port;
2308
2309
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
2310
0
        if (!strcmp(netdev_get_name(port->netdev), devname)) {
2311
0
            *portp = port;
2312
0
            return 0;
2313
0
        }
2314
0
    }
2315
2316
    /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2317
     * nonexistent port. */
2318
0
    return ENODEV;
2319
0
}
2320
2321
/* Returns 'true' if there is a port with pmd netdev. */
2322
static bool
2323
has_pmd_port(struct dp_netdev *dp)
2324
    OVS_REQ_RDLOCK(dp->port_rwlock)
2325
0
{
2326
0
    struct dp_netdev_port *port;
2327
2328
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
2329
0
        if (netdev_is_pmd(port->netdev)) {
2330
0
            return true;
2331
0
        }
2332
0
    }
2333
2334
0
    return false;
2335
0
}
2336
2337
static void
2338
do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
2339
    OVS_REQ_WRLOCK(dp->port_rwlock)
2340
0
{
2341
0
    hmap_remove(&dp->ports, &port->node);
2342
0
    seq_change(dp->port_seq);
2343
2344
0
    reconfigure_datapath(dp);
2345
2346
    /* Flush and disable offloads only after 'port' has been made
2347
     * inaccessible through datapath reconfiguration.
2348
     * This prevents having PMDs enqueuing offload requests after
2349
     * the flush.
2350
     * When only this port is deleted instead of the whole datapath,
2351
     * revalidator threads are still active and can still enqueue
2352
     * offload modification or deletion. Managing those stray requests
2353
     * is done in the offload threads. */
2354
0
    dp_netdev_offload_flush(dp, port);
2355
0
    netdev_uninit_flow_api(port->netdev);
2356
2357
0
    port_destroy(port);
2358
0
}
2359
2360
static void
2361
answer_port_query(const struct dp_netdev_port *port,
2362
                  struct dpif_port *dpif_port)
2363
0
{
2364
0
    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
2365
0
    dpif_port->type = xstrdup(port->type);
2366
0
    dpif_port->port_no = port->port_no;
2367
0
}
2368
2369
static int
2370
dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
2371
                                 struct dpif_port *dpif_port)
2372
0
{
2373
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2374
0
    struct dp_netdev_port *port;
2375
0
    int error;
2376
2377
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
2378
0
    error = get_port_by_number(dp, port_no, &port);
2379
0
    if (!error && dpif_port) {
2380
0
        answer_port_query(port, dpif_port);
2381
0
    }
2382
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2383
2384
0
    return error;
2385
0
}
2386
2387
static int
2388
dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
2389
                               struct dpif_port *dpif_port)
2390
0
{
2391
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2392
0
    struct dp_netdev_port *port;
2393
0
    int error;
2394
2395
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
2396
0
    error = get_port_by_name(dp, devname, &port);
2397
0
    if (!error && dpif_port) {
2398
0
        answer_port_query(port, dpif_port);
2399
0
    }
2400
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2401
2402
0
    return error;
2403
0
}
2404
2405
static void
2406
dp_netdev_flow_free(struct dp_netdev_flow *flow)
2407
0
{
2408
0
    dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
2409
0
    free(flow->dp_extra_info);
2410
0
    free(flow);
2411
0
}
2412
2413
void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2414
0
{
2415
0
    if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2416
0
        ovsrcu_postpone(dp_netdev_flow_free, flow);
2417
0
    }
2418
0
}
2419
2420
inline struct dpcls *
2421
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2422
                           odp_port_t in_port)
2423
0
{
2424
0
    struct dpcls *cls;
2425
0
    uint32_t hash = hash_port_no(in_port);
2426
0
    CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2427
0
        if (cls->in_port == in_port) {
2428
            /* Port classifier exists already */
2429
0
            return cls;
2430
0
        }
2431
0
    }
2432
0
    return NULL;
2433
0
}
2434
2435
static inline struct dpcls *
2436
dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2437
                         odp_port_t in_port)
2438
    OVS_REQUIRES(pmd->flow_mutex)
2439
0
{
2440
0
    struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2441
2442
0
    if (!cls) {
2443
0
        uint32_t hash = hash_port_no(in_port);
2444
2445
        /* Create new classifier for in_port */
2446
0
        cls = xmalloc(sizeof(*cls));
2447
0
        dpcls_init(cls);
2448
0
        cls->in_port = in_port;
2449
0
        cmap_insert(&pmd->classifiers, &cls->node, hash);
2450
0
        VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2451
0
    }
2452
0
    return cls;
2453
0
}
2454
2455
0
#define MAX_FLOW_MARK       (UINT32_MAX - 1)
2456
0
#define INVALID_FLOW_MARK   0
2457
/* A zero flow mark tells the HW to remove the mark.  A packet marked
2458
 * with a zero mark is received in SW without any mark at all, so zero
2459
 * cannot be used as a valid mark.
2460
 */
2461
2462
struct megaflow_to_mark_data {
2463
    const struct cmap_node node;
2464
    ovs_u128 mega_ufid;
2465
    uint32_t mark;
2466
};
2467
2468
static struct id_fpool *flow_mark_pool;
2469
2470
static uint32_t
2471
flow_mark_alloc(void)
2472
0
{
2473
0
    static struct ovsthread_once init_once = OVSTHREAD_ONCE_INITIALIZER;
2474
0
    unsigned int tid = netdev_offload_thread_id();
2475
0
    uint32_t mark;
2476
2477
0
    if (ovsthread_once_start(&init_once)) {
2478
        /* Not initialized yet, do it here. */
2479
0
        flow_mark_pool = id_fpool_create(netdev_offload_thread_nb(),
2480
0
                                         1, MAX_FLOW_MARK);
2481
0
        ovsthread_once_done(&init_once);
2482
0
    }
2483
2484
0
    if (id_fpool_new_id(flow_mark_pool, tid, &mark)) {
2485
0
        return mark;
2486
0
    }
2487
2488
0
    return INVALID_FLOW_MARK;
2489
0
}
2490
2491
static void
2492
flow_mark_free(uint32_t mark)
2493
0
{
2494
0
    unsigned int tid = netdev_offload_thread_id();
2495
2496
0
    id_fpool_free_id(flow_mark_pool, tid, mark);
2497
0
}
2498
2499
/* associate megaflow with a mark, which is a 1:1 mapping */
2500
static void
2501
megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2502
0
{
2503
0
    size_t hash = dp_netdev_flow_hash(mega_ufid);
2504
0
    struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2505
0
    unsigned int tid = netdev_offload_thread_id();
2506
2507
0
    data->mega_ufid = *mega_ufid;
2508
0
    data->mark = mark;
2509
2510
0
    cmap_insert(&dp_offload_threads[tid].megaflow_to_mark,
2511
0
                CONST_CAST(struct cmap_node *, &data->node), hash);
2512
0
}
2513
2514
/* disassociate megaflow from a mark */
2515
static void
2516
megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2517
0
{
2518
0
    size_t hash = dp_netdev_flow_hash(mega_ufid);
2519
0
    struct megaflow_to_mark_data *data;
2520
0
    unsigned int tid = netdev_offload_thread_id();
2521
2522
0
    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
2523
0
                             &dp_offload_threads[tid].megaflow_to_mark) {
2524
0
        if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2525
0
            cmap_remove(&dp_offload_threads[tid].megaflow_to_mark,
2526
0
                        CONST_CAST(struct cmap_node *, &data->node), hash);
2527
0
            ovsrcu_postpone(free, data);
2528
0
            return;
2529
0
        }
2530
0
    }
2531
2532
0
    VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2533
0
              UUID_ARGS((struct uuid *)mega_ufid));
2534
0
}
2535
2536
static inline uint32_t
2537
megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2538
0
{
2539
0
    size_t hash = dp_netdev_flow_hash(mega_ufid);
2540
0
    struct megaflow_to_mark_data *data;
2541
0
    unsigned int tid = netdev_offload_thread_id();
2542
2543
0
    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
2544
0
                             &dp_offload_threads[tid].megaflow_to_mark) {
2545
0
        if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2546
0
            return data->mark;
2547
0
        }
2548
0
    }
2549
2550
0
    VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2551
0
             UUID_ARGS((struct uuid *)mega_ufid));
2552
0
    return INVALID_FLOW_MARK;
2553
0
}
2554
2555
/* associate mark with a flow, which is a 1:N mapping */
2556
static void
2557
mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2558
0
{
2559
0
    unsigned int tid = netdev_offload_thread_id();
2560
0
    dp_netdev_flow_ref(flow);
2561
2562
0
    cmap_insert(&dp_offload_threads[tid].mark_to_flow,
2563
0
                CONST_CAST(struct cmap_node *, &flow->mark_node),
2564
0
                hash_int(mark, 0));
2565
0
    flow->mark = mark;
2566
2567
0
    VLOG_DBG("Associated dp_netdev flow %p with mark %u mega_ufid "UUID_FMT,
2568
0
             flow, mark, UUID_ARGS((struct uuid *) &flow->mega_ufid));
2569
0
}
2570
2571
static bool
2572
flow_mark_has_no_ref(uint32_t mark)
2573
0
{
2574
0
    unsigned int tid = netdev_offload_thread_id();
2575
0
    struct dp_netdev_flow *flow;
2576
2577
0
    CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2578
0
                             &dp_offload_threads[tid].mark_to_flow) {
2579
0
        if (flow->mark == mark) {
2580
0
            return false;
2581
0
        }
2582
0
    }
2583
2584
0
    return true;
2585
0
}
2586
2587
static int
2588
mark_to_flow_disassociate(struct dp_netdev *dp,
2589
                          struct dp_netdev_flow *flow)
2590
0
{
2591
0
    const char *dpif_type_str = dpif_normalize_type(dp->class->type);
2592
0
    struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2593
0
                                             &flow->mark_node);
2594
0
    unsigned int tid = netdev_offload_thread_id();
2595
0
    uint32_t mark = flow->mark;
2596
0
    int ret = 0;
2597
2598
    /* INVALID_FLOW_MARK may mean that the flow has been disassociated or
2599
     * never associated. */
2600
0
    if (OVS_UNLIKELY(mark == INVALID_FLOW_MARK)) {
2601
0
        return EINVAL;
2602
0
    }
2603
2604
0
    cmap_remove(&dp_offload_threads[tid].mark_to_flow,
2605
0
                mark_node, hash_int(mark, 0));
2606
0
    flow->mark = INVALID_FLOW_MARK;
2607
2608
    /*
2609
     * If no flow is referencing the mark any more, remove the flow
2610
     * from hardware and free the mark.
2611
     */
2612
0
    if (flow_mark_has_no_ref(mark)) {
2613
0
        struct netdev *port;
2614
0
        odp_port_t in_port = flow->flow.in_port.odp_port;
2615
2616
0
        port = netdev_ports_get(in_port, dpif_type_str);
2617
0
        if (port) {
2618
            /* Taking a global 'port_rwlock' to fulfill thread safety
2619
             * restrictions regarding netdev port mapping. */
2620
0
            ovs_rwlock_rdlock(&dp->port_rwlock);
2621
0
            ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
2622
0
            ovs_rwlock_unlock(&dp->port_rwlock);
2623
0
            netdev_close(port);
2624
0
        }
2625
2626
0
        flow_mark_free(mark);
2627
0
        VLOG_DBG("Freed flow mark %u mega_ufid "UUID_FMT, mark,
2628
0
                 UUID_ARGS((struct uuid *) &flow->mega_ufid));
2629
2630
0
        megaflow_to_mark_disassociate(&flow->mega_ufid);
2631
0
    }
2632
0
    dp_netdev_flow_unref(flow);
2633
2634
0
    return ret;
2635
0
}
2636
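
Taken together, the helpers above maintain a 1:1 megaflow-to-mark map and a 1:N mark-to-flow map, and a mark (together with its hardware rule) is released only when the last referencing flow goes away. The standalone sketch below reduces that to a plain reference count purely for illustration; it is a simplified assumption, since the real code uses per-thread cmaps, an id-fpool, and RCU-deferred frees.

/* Standalone illustration of the mark bookkeeping: a mark stays
 * allocated while at least one flow (possibly from several PMDs) still
 * references it; the last disassociation frees the mark and would also
 * delete the hardware rule. */
#include <stdbool.h>
#include <stdio.h>

struct mark {
    unsigned int id;
    unsigned int n_flows;   /* 1:N mark-to-flow references. */
};

static void
mark_associate(struct mark *m)
{
    m->n_flows++;
}

static bool
mark_disassociate(struct mark *m)
{
    if (m->n_flows > 0 && --m->n_flows == 0) {
        printf("mark %u: last flow gone, free mark + delete HW rule\n",
               m->id);
        return true;
    }
    return false;
}

int main(void)
{
    struct mark m = { .id = 42, .n_flows = 0 };

    mark_associate(&m);     /* First flow installs the megaflow. */
    mark_associate(&m);     /* A second PMD reuses the same mark. */
    mark_disassociate(&m);  /* Still referenced; mark kept. */
    mark_disassociate(&m);  /* Now freed. */
    return 0;
}
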
2637
static struct dp_netdev_flow *
2638
mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2639
                  const uint32_t mark)
2640
0
{
2641
0
    struct dp_netdev_flow *flow;
2642
0
    unsigned int tid;
2643
0
    size_t hash;
2644
2645
0
    if (dp_offload_threads == NULL) {
2646
0
        return NULL;
2647
0
    }
2648
2649
0
    hash = hash_int(mark, 0);
2650
0
    for (tid = 0; tid < netdev_offload_thread_nb(); tid++) {
2651
0
        CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash,
2652
0
                                 &dp_offload_threads[tid].mark_to_flow) {
2653
0
            if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2654
0
                flow->dead == false) {
2655
0
                return flow;
2656
0
            }
2657
0
        }
2658
0
    }
2659
2660
0
    return NULL;
2661
0
}
2662
2663
static struct dp_offload_thread_item *
2664
dp_netdev_alloc_flow_offload(struct dp_netdev *dp,
2665
                             struct dp_netdev_flow *flow,
2666
                             int op)
2667
0
{
2668
0
    struct dp_offload_thread_item *item;
2669
0
    struct dp_offload_flow_item *flow_offload;
2670
2671
0
    item = xzalloc(sizeof *item + sizeof *flow_offload);
2672
0
    flow_offload = &item->data->flow;
2673
2674
0
    item->type = DP_OFFLOAD_FLOW;
2675
0
    item->dp = dp;
2676
2677
0
    flow_offload->flow = flow;
2678
0
    flow_offload->op = op;
2679
2680
0
    dp_netdev_flow_ref(flow);
2681
2682
0
    return item;
2683
0
}
2684
2685
static void
2686
dp_netdev_free_flow_offload__(struct dp_offload_thread_item *offload)
2687
0
{
2688
0
    struct dp_offload_flow_item *flow_offload = &offload->data->flow;
2689
2690
0
    free(flow_offload->actions);
2691
0
    free(offload);
2692
0
}
2693
2694
static void
2695
dp_netdev_free_flow_offload(struct dp_offload_thread_item *offload)
2696
0
{
2697
0
    struct dp_offload_flow_item *flow_offload = &offload->data->flow;
2698
2699
0
    dp_netdev_flow_unref(flow_offload->flow);
2700
0
    ovsrcu_postpone(dp_netdev_free_flow_offload__, offload);
2701
0
}
2702
2703
static void
2704
dp_netdev_free_offload(struct dp_offload_thread_item *offload)
2705
0
{
2706
0
    switch (offload->type) {
2707
0
    case DP_OFFLOAD_FLOW:
2708
0
        dp_netdev_free_flow_offload(offload);
2709
0
        break;
2710
0
    case DP_OFFLOAD_FLUSH:
2711
0
        free(offload);
2712
0
        break;
2713
0
    default:
2714
0
        OVS_NOT_REACHED();
2715
0
    };
2716
0
}
2717
2718
static void
2719
dp_netdev_append_offload(struct dp_offload_thread_item *offload,
2720
                         unsigned int tid)
2721
0
{
2722
0
    dp_netdev_offload_init();
2723
2724
0
    mpsc_queue_insert(&dp_offload_threads[tid].queue, &offload->node);
2725
0
    atomic_count_inc64(&dp_offload_threads[tid].enqueued_item);
2726
0
}
2727
2728
static void
2729
dp_netdev_offload_flow_enqueue(struct dp_offload_thread_item *item)
2730
0
{
2731
0
    struct dp_offload_flow_item *flow_offload = &item->data->flow;
2732
0
    unsigned int tid;
2733
2734
0
    ovs_assert(item->type == DP_OFFLOAD_FLOW);
2735
2736
0
    tid = netdev_offload_ufid_to_thread_id(flow_offload->flow->mega_ufid);
2737
0
    dp_netdev_append_offload(item, tid);
2738
0
}
2739
2740
static int
2741
dp_netdev_flow_offload_del(struct dp_offload_thread_item *item)
2742
0
{
2743
0
    return mark_to_flow_disassociate(item->dp, item->data->flow.flow);
2744
0
}
2745
2746
/*
2747
 * There are two flow offload operations here: addition and modification.
2748
 *
2749
 * For flow addition, this function does:
2750
 * - allocate a new flow mark id
2751
 * - perform hardware flow offload
2752
 * - associate the flow mark with flow and mega flow
2753
 *
2754
 * For flow modification, both flow mark and the associations are still
2755
 * valid, thus only item 2 is needed.
2756
 */
2757
static int
2758
dp_netdev_flow_offload_put(struct dp_offload_thread_item *item)
2759
0
{
2760
0
    struct dp_offload_flow_item *offload = &item->data->flow;
2761
0
    struct dp_netdev *dp = item->dp;
2762
0
    struct dp_netdev_flow *flow = offload->flow;
2763
0
    odp_port_t in_port = flow->flow.in_port.odp_port;
2764
0
    const char *dpif_type_str = dpif_normalize_type(dp->class->type);
2765
0
    bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD
2766
0
                        && flow->mark != INVALID_FLOW_MARK;
2767
0
    struct offload_info info;
2768
0
    struct netdev *port;
2769
0
    uint32_t mark;
2770
0
    int ret;
2771
2772
0
    if (flow->dead) {
2773
0
        return -1;
2774
0
    }
2775
2776
0
    if (modification) {
2777
0
        mark = flow->mark;
2778
0
    } else {
2779
        /*
2780
         * If a mega flow has already been offloaded (from other PMD
2781
         * instances), do not offload it again.
2782
         */
2783
0
        mark = megaflow_to_mark_find(&flow->mega_ufid);
2784
0
        if (mark != INVALID_FLOW_MARK) {
2785
0
            VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2786
0
            if (flow->mark != INVALID_FLOW_MARK) {
2787
0
                ovs_assert(flow->mark == mark);
2788
0
            } else {
2789
0
                mark_to_flow_associate(mark, flow);
2790
0
            }
2791
0
            return 0;
2792
0
        }
2793
2794
0
        mark = flow_mark_alloc();
2795
0
        if (mark == INVALID_FLOW_MARK) {
2796
0
            VLOG_ERR("Failed to allocate flow mark!\n");
2797
0
            return -1;
2798
0
        }
2799
0
    }
2800
0
    info.flow_mark = mark;
2801
0
    info.orig_in_port = offload->orig_in_port;
2802
2803
0
    port = netdev_ports_get(in_port, dpif_type_str);
2804
0
    if (!port) {
2805
0
        goto err_free;
2806
0
    }
2807
2808
    /* Taking a global 'port_rwlock' to fulfill thread safety
2809
     * restrictions regarding the netdev port mapping. */
2810
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
2811
0
    ret = netdev_flow_put(port, &offload->match,
2812
0
                          CONST_CAST(struct nlattr *, offload->actions),
2813
0
                          offload->actions_len, &flow->mega_ufid, &info,
2814
0
                          NULL);
2815
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2816
0
    netdev_close(port);
2817
2818
0
    if (ret) {
2819
0
        goto err_free;
2820
0
    }
2821
2822
0
    if (!modification) {
2823
0
        megaflow_to_mark_associate(&flow->mega_ufid, mark);
2824
0
        mark_to_flow_associate(mark, flow);
2825
0
    }
2826
0
    return 0;
2827
2828
0
err_free:
2829
0
    if (!modification) {
2830
0
        flow_mark_free(mark);
2831
0
    } else {
2832
0
        mark_to_flow_disassociate(item->dp, flow);
2833
0
    }
2834
0
    return -1;
2835
0
}
2836
2837
static void
2838
dp_offload_flow(struct dp_offload_thread_item *item)
2839
0
{
2840
0
    struct dp_offload_flow_item *flow_offload = &item->data->flow;
2841
0
    const char *op;
2842
0
    int ret;
2843
2844
0
    switch (flow_offload->op) {
2845
0
    case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2846
0
        op = "add";
2847
0
        ret = dp_netdev_flow_offload_put(item);
2848
0
        break;
2849
0
    case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2850
0
        op = "modify";
2851
0
        ret = dp_netdev_flow_offload_put(item);
2852
0
        break;
2853
0
    case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2854
0
        op = "delete";
2855
0
        ret = dp_netdev_flow_offload_del(item);
2856
0
        break;
2857
0
    default:
2858
0
        OVS_NOT_REACHED();
2859
0
    }
2860
2861
0
    VLOG_DBG("%s to %s netdev flow "UUID_FMT,
2862
0
             ret == 0 ? "succeed" : "failed", op,
2863
0
             UUID_ARGS((struct uuid *) &flow_offload->flow->mega_ufid));
2864
0
}
2865
2866
static void
2867
dp_offload_flush(struct dp_offload_thread_item *item)
2868
0
{
2869
0
    struct dp_offload_flush_item *flush = &item->data->flush;
2870
2871
0
    ovs_rwlock_rdlock(&item->dp->port_rwlock);
2872
0
    netdev_flow_flush(flush->netdev);
2873
0
    ovs_rwlock_unlock(&item->dp->port_rwlock);
2874
2875
0
    ovs_barrier_block(flush->barrier);
2876
2877
    /* Allow the initiator thread to take the port lock again
2878
     * before continuing offload operations in this thread.
2879
     */
2880
0
    ovs_barrier_block(flush->barrier);
2881
0
}
2882
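
dp_offload_flush() above blocks on the shared barrier twice: the first block tells the initiator that every offload thread has finished flushing, and the second keeps the offload threads parked until the initiator has retaken the port lock. Below is a minimal standalone sketch of that two-phase handshake; it assumes POSIX barriers and a single worker thread instead of ovs_barrier and one worker per offload thread.

/* Standalone sketch of the two-phase barrier handshake used by the
 * flush path above. */
#include <pthread.h>
#include <stdio.h>

static pthread_barrier_t barrier;

static void *
worker(void *arg)
{
    (void) arg;
    printf("worker: flush done\n");
    pthread_barrier_wait(&barrier);   /* Phase 1: report completion. */
    pthread_barrier_wait(&barrier);   /* Phase 2: wait for the initiator. */
    printf("worker: resuming normal offload work\n");
    return NULL;
}

int main(void)
{
    pthread_t tid;

    pthread_barrier_init(&barrier, NULL, 2);   /* Initiator + 1 worker. */
    pthread_create(&tid, NULL, worker, NULL);

    pthread_barrier_wait(&barrier);   /* Phase 1: all flushes finished. */
    printf("initiator: retaking the port lock\n");
    pthread_barrier_wait(&barrier);   /* Phase 2: release the worker. */

    pthread_join(tid, NULL);
    pthread_barrier_destroy(&barrier);
    return 0;
}
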
2883
0
#define DP_NETDEV_OFFLOAD_BACKOFF_MIN 1
2884
0
#define DP_NETDEV_OFFLOAD_BACKOFF_MAX 64
2885
0
#define DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US (10 * 1000) /* 10 ms */
2886
2887
static void *
2888
dp_netdev_flow_offload_main(void *arg)
2889
0
{
2890
0
    struct dp_offload_thread *ofl_thread = arg;
2891
0
    struct dp_offload_thread_item *offload;
2892
0
    struct mpsc_queue_node *node;
2893
0
    struct mpsc_queue *queue;
2894
0
    long long int latency_us;
2895
0
    long long int next_rcu;
2896
0
    long long int now;
2897
0
    uint64_t backoff;
2898
2899
0
    queue = &ofl_thread->queue;
2900
0
    mpsc_queue_acquire(queue);
2901
2902
0
    while (true) {
2903
0
        backoff = DP_NETDEV_OFFLOAD_BACKOFF_MIN;
2904
0
        while (mpsc_queue_tail(queue) == NULL) {
2905
0
            xnanosleep(backoff * 1E6);
2906
0
            if (backoff < DP_NETDEV_OFFLOAD_BACKOFF_MAX) {
2907
0
                backoff <<= 1;
2908
0
            }
2909
0
        }
2910
2911
0
        next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
2912
0
        MPSC_QUEUE_FOR_EACH_POP (node, queue) {
2913
0
            offload = CONTAINER_OF(node, struct dp_offload_thread_item, node);
2914
0
            atomic_count_dec64(&ofl_thread->enqueued_item);
2915
2916
0
            switch (offload->type) {
2917
0
            case DP_OFFLOAD_FLOW:
2918
0
                dp_offload_flow(offload);
2919
0
                break;
2920
0
            case DP_OFFLOAD_FLUSH:
2921
0
                dp_offload_flush(offload);
2922
0
                break;
2923
0
            default:
2924
0
                OVS_NOT_REACHED();
2925
0
            }
2926
2927
0
            now = time_usec();
2928
2929
0
            latency_us = now - offload->timestamp;
2930
0
            mov_avg_cma_update(&ofl_thread->cma, latency_us);
2931
0
            mov_avg_ema_update(&ofl_thread->ema, latency_us);
2932
2933
0
            dp_netdev_free_offload(offload);
2934
2935
            /* Do RCU synchronization at a fixed interval. */
2936
0
            if (now > next_rcu) {
2937
0
                ovsrcu_quiesce();
2938
0
                next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
2939
0
            }
2940
0
        }
2941
0
    }
2942
2943
0
    OVS_NOT_REACHED();
2944
0
    mpsc_queue_release(queue);
2945
2946
0
    return NULL;
2947
0
}
2948
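
In the idle loop above, 'backoff' is expressed in milliseconds while xnanosleep() takes nanoseconds, hence the '* 1E6' conversion; the sleep doubles from 1 ms up to the 64 ms cap and then stays there while the queue remains empty. A small standalone illustration of the resulting sequence (plain C, values mirror the two #defines above):

/* Prints the idle-poll sleep sequence implied by the backoff constants:
 * 1, 2, 4, ... 64 ms, each converted to nanoseconds as in the
 * xnanosleep(backoff * 1E6) call; after 64 ms the value stays capped. */
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint64_t backoff_ms;

    for (backoff_ms = 1; backoff_ms <= 64; backoff_ms <<= 1) {
        printf("sleep %2" PRIu64 " ms = %10.0f ns\n",
               backoff_ms, backoff_ms * 1E6);
    }
    return 0;
}
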
2949
static void
2950
queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2951
                      struct dp_netdev_flow *flow)
2952
0
{
2953
0
    struct dp_offload_thread_item *offload;
2954
2955
0
    if (!netdev_is_flow_api_enabled()) {
2956
0
        return;
2957
0
    }
2958
2959
0
    offload = dp_netdev_alloc_flow_offload(pmd->dp, flow,
2960
0
                                           DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2961
0
    offload->timestamp = pmd->ctx.now;
2962
0
    dp_netdev_offload_flow_enqueue(offload);
2963
0
}
2964
2965
static void
2966
log_netdev_flow_change(const struct dp_netdev_flow *flow,
2967
                       const struct match *match,
2968
                       const struct dp_netdev_actions *old_actions,
2969
                       const struct nlattr *actions,
2970
                       size_t actions_len)
2971
0
{
2972
0
    struct ds ds = DS_EMPTY_INITIALIZER;
2973
0
    struct ofpbuf key_buf, mask_buf;
2974
0
    struct odp_flow_key_parms odp_parms = {
2975
0
        .flow = &match->flow,
2976
0
        .mask = &match->wc.masks,
2977
0
        .support = dp_netdev_support,
2978
0
    };
2979
2980
0
    if (OVS_LIKELY(VLOG_DROP_DBG((&upcall_rl)))) {
2981
0
        return;
2982
0
    }
2983
2984
0
    ofpbuf_init(&key_buf, 0);
2985
0
    ofpbuf_init(&mask_buf, 0);
2986
2987
0
    odp_flow_key_from_flow(&odp_parms, &key_buf);
2988
0
    odp_parms.key_buf = &key_buf;
2989
0
    odp_flow_key_from_mask(&odp_parms, &mask_buf);
2990
2991
0
    if (old_actions) {
2992
0
        ds_put_cstr(&ds, "flow_mod: ");
2993
0
    } else {
2994
0
        ds_put_cstr(&ds, "flow_add: ");
2995
0
    }
2996
0
    odp_format_ufid(&flow->ufid, &ds);
2997
0
    ds_put_cstr(&ds, " mega_");
2998
0
    odp_format_ufid(&flow->mega_ufid, &ds);
2999
0
    ds_put_cstr(&ds, " ");
3000
0
    odp_flow_format(key_buf.data, key_buf.size,
3001
0
                    mask_buf.data, mask_buf.size,
3002
0
                    NULL, &ds, false);
3003
0
    if (old_actions) {
3004
0
        ds_put_cstr(&ds, ", old_actions:");
3005
0
        format_odp_actions(&ds, old_actions->actions, old_actions->size,
3006
0
                           NULL);
3007
0
    }
3008
0
    ds_put_cstr(&ds, ", actions:");
3009
0
    format_odp_actions(&ds, actions, actions_len, NULL);
3010
3011
0
    VLOG_DBG("%s", ds_cstr(&ds));
3012
3013
0
    ofpbuf_uninit(&key_buf);
3014
0
    ofpbuf_uninit(&mask_buf);
3015
3016
    /* Add a printout of the actual match installed. */
3017
0
    struct match m;
3018
0
    ds_clear(&ds);
3019
0
    ds_put_cstr(&ds, "flow match: ");
3020
0
    miniflow_expand(&flow->cr.flow.mf, &m.flow);
3021
0
    miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
3022
0
    memset(&m.tun_md, 0, sizeof m.tun_md);
3023
0
    match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3024
3025
0
    VLOG_DBG("%s", ds_cstr(&ds));
3026
3027
0
    ds_destroy(&ds);
3028
0
}
3029
3030
static void
3031
queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
3032
                      struct dp_netdev_flow *flow, struct match *match,
3033
                      const struct nlattr *actions, size_t actions_len,
3034
                      int op)
3035
0
{
3036
0
    struct dp_offload_thread_item *item;
3037
0
    struct dp_offload_flow_item *flow_offload;
3038
3039
0
    if (!netdev_is_flow_api_enabled()) {
3040
0
        return;
3041
0
    }
3042
3043
0
    item = dp_netdev_alloc_flow_offload(pmd->dp, flow, op);
3044
0
    flow_offload = &item->data->flow;
3045
0
    flow_offload->match = *match;
3046
0
    flow_offload->actions = xmalloc(actions_len);
3047
0
    memcpy(flow_offload->actions, actions, actions_len);
3048
0
    flow_offload->actions_len = actions_len;
3049
0
    flow_offload->orig_in_port = flow->orig_in_port;
3050
3051
0
    item->timestamp = pmd->ctx.now;
3052
0
    dp_netdev_offload_flow_enqueue(item);
3053
0
}
3054
3055
static void
3056
dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
3057
                          struct dp_netdev_flow *flow)
3058
    OVS_REQUIRES(pmd->flow_mutex)
3059
0
{
3060
0
    struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
3061
0
    struct dpcls *cls;
3062
0
    odp_port_t in_port = flow->flow.in_port.odp_port;
3063
3064
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
3065
0
    ovs_assert(cls != NULL);
3066
0
    dpcls_remove(cls, &flow->cr);
3067
0
    dp_netdev_simple_match_remove(pmd, flow);
3068
0
    cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
3069
0
    ccmap_dec(&pmd->n_flows, odp_to_u32(in_port));
3070
0
    queue_netdev_flow_del(pmd, flow);
3071
0
    flow->dead = true;
3072
3073
0
    dp_netdev_flow_unref(flow);
3074
0
}
3075
3076
static void
3077
dp_netdev_offload_flush_enqueue(struct dp_netdev *dp,
3078
                                struct netdev *netdev,
3079
                                struct ovs_barrier *barrier)
3080
0
{
3081
0
    unsigned int tid;
3082
0
    long long int now_us = time_usec();
3083
3084
0
    for (tid = 0; tid < netdev_offload_thread_nb(); tid++) {
3085
0
        struct dp_offload_thread_item *item;
3086
0
        struct dp_offload_flush_item *flush;
3087
3088
0
        item = xmalloc(sizeof *item + sizeof *flush);
3089
0
        item->type = DP_OFFLOAD_FLUSH;
3090
0
        item->dp = dp;
3091
0
        item->timestamp = now_us;
3092
3093
0
        flush = &item->data->flush;
3094
0
        flush->netdev = netdev;
3095
0
        flush->barrier = barrier;
3096
3097
0
        dp_netdev_append_offload(item, tid);
3098
0
    }
3099
0
}
3100
3101
/* Blocking call that will wait on the offload thread to
3102
 * complete its work.  As the flush order will only be
3103
 * enqueued after existing offload requests, those previous
3104
 * offload requests must be processed, which requires being
3105
 * able to lock the 'port_rwlock' from the offload thread.
3106
 *
3107
 * Flow offload flush is done when a port is being deleted.
3108
 * Right after this call executes, the offload API is disabled
3109
 * for the port. This call must be made blocking until the
3110
 * offload provider has completed its job.
3111
 */
3112
static void
3113
dp_netdev_offload_flush(struct dp_netdev *dp,
3114
                        struct dp_netdev_port *port)
3115
    OVS_REQ_WRLOCK(dp->port_rwlock)
3116
0
{
3117
    /* The flush mutex serves to serialize access to the static
3118
     * barrier, and to prevent multiple flush orders to several threads.
3119
     *
3120
     * The memory barrier needs to go beyond the function scope as
3121
     * the other threads can resume from blocking after this function
3122
     * already finished.
3123
     *
3124
     * Additionally, because the flush operation is blocking, it would
3125
     * deadlock if multiple offload threads were blocking on several
3126
     * different barriers. Only allow a single flush order in the offload
3127
     * queue at a time.
3128
     */
3129
0
    static struct ovs_mutex flush_mutex = OVS_MUTEX_INITIALIZER;
3130
0
    static struct ovs_barrier barrier OVS_GUARDED_BY(flush_mutex);
3131
0
    struct netdev *netdev;
3132
3133
0
    if (!netdev_is_flow_api_enabled()) {
3134
0
        return;
3135
0
    }
3136
3137
0
    ovs_rwlock_unlock(&dp->port_rwlock);
3138
0
    ovs_mutex_lock(&flush_mutex);
3139
3140
    /* This thread and the offload threads. */
3141
0
    ovs_barrier_init(&barrier, 1 + netdev_offload_thread_nb());
3142
3143
0
    netdev = netdev_ref(port->netdev);
3144
0
    dp_netdev_offload_flush_enqueue(dp, netdev, &barrier);
3145
0
    ovs_barrier_block(&barrier);
3146
0
    netdev_close(netdev);
3147
3148
    /* Take back the datapath port lock before allowing the offload
3149
     * threads to proceed further. The port deletion must complete first,
3150
     * to ensure no further offloads are inserted after the flush.
3151
     *
3152
     * Some offload providers (e.g. DPDK) keep a netdev reference with
3153
     * the offload data. If this reference is not closed, the netdev is
3154
     * kept indefinitely. */
3155
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
3156
3157
0
    ovs_barrier_block(&barrier);
3158
0
    ovs_barrier_destroy(&barrier);
3159
3160
0
    ovs_mutex_unlock(&flush_mutex);
3161
0
}
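
The two comment blocks above describe a double rendezvous on a shared barrier: the caller enqueues one flush order per offload thread, blocks until every offload thread has drained its queue, re-takes the port lock, and only then meets the offload threads at the barrier a second time before tearing it down. The following standalone sketch is an editor's illustration of that handshake using plain POSIX barriers; it is not OVS code, and N_WORKERS and worker_main() are made-up names.

/* Editor's standalone sketch, not OVS code: the same "1 + N participants,
 * two rendezvous" barrier handshake as above, using POSIX barriers. */
#include <pthread.h>
#include <stdio.h>

#define N_WORKERS 2

static pthread_barrier_t barrier;

static void *
worker_main(void *arg)
{
    int id = *(int *) arg;

    printf("worker %d: queue drained\n", id);
    pthread_barrier_wait(&barrier);   /* Rendezvous 1: all queues drained. */
    pthread_barrier_wait(&barrier);   /* Rendezvous 2: coordinator re-locked. */
    printf("worker %d: resuming\n", id);
    return NULL;
}

int
main(void)
{
    pthread_t threads[N_WORKERS];
    int ids[N_WORKERS];
    int i;

    /* Coordinator plus N workers, mirroring "1 + netdev_offload_thread_nb()"
     * in the code above. */
    pthread_barrier_init(&barrier, NULL, 1 + N_WORKERS);
    for (i = 0; i < N_WORKERS; i++) {
        ids[i] = i;
        pthread_create(&threads[i], NULL, worker_main, &ids[i]);
    }

    pthread_barrier_wait(&barrier);   /* Wait for every worker to drain. */
    printf("coordinator: re-taking the port lock equivalent\n");
    pthread_barrier_wait(&barrier);   /* Let the workers continue. */

    for (i = 0; i < N_WORKERS; i++) {
        pthread_join(threads[i], NULL);
    }
    pthread_barrier_destroy(&barrier);
    return 0;
}
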
3162
3163
static void
3164
dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
3165
0
{
3166
0
    struct dp_netdev_flow *netdev_flow;
3167
3168
0
    ovs_mutex_lock(&pmd->flow_mutex);
3169
0
    CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
3170
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3171
0
    }
3172
0
    ovs_mutex_unlock(&pmd->flow_mutex);
3173
0
}
3174
3175
static int
3176
dpif_netdev_flow_flush(struct dpif *dpif)
3177
0
{
3178
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3179
0
    struct dp_netdev_pmd_thread *pmd;
3180
3181
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3182
0
        dp_netdev_pmd_flow_flush(pmd);
3183
0
    }
3184
3185
0
    return 0;
3186
0
}
3187
3188
struct dp_netdev_port_state {
3189
    struct hmap_position position;
3190
    char *name;
3191
};
3192
3193
static int
3194
dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
3195
0
{
3196
0
    *statep = xzalloc(sizeof(struct dp_netdev_port_state));
3197
0
    return 0;
3198
0
}
3199
3200
static int
3201
dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
3202
                           struct dpif_port *dpif_port)
3203
0
{
3204
0
    struct dp_netdev_port_state *state = state_;
3205
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3206
0
    struct hmap_node *node;
3207
0
    int retval;
3208
3209
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
3210
0
    node = hmap_at_position(&dp->ports, &state->position);
3211
0
    if (node) {
3212
0
        struct dp_netdev_port *port;
3213
3214
0
        port = CONTAINER_OF(node, struct dp_netdev_port, node);
3215
3216
0
        free(state->name);
3217
0
        state->name = xstrdup(netdev_get_name(port->netdev));
3218
0
        dpif_port->name = state->name;
3219
0
        dpif_port->type = port->type;
3220
0
        dpif_port->port_no = port->port_no;
3221
3222
0
        retval = 0;
3223
0
    } else {
3224
0
        retval = EOF;
3225
0
    }
3226
0
    ovs_rwlock_unlock(&dp->port_rwlock);
3227
3228
0
    return retval;
3229
0
}
3230
3231
static int
3232
dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
3233
0
{
3234
0
    struct dp_netdev_port_state *state = state_;
3235
0
    free(state->name);
3236
0
    free(state);
3237
0
    return 0;
3238
0
}
3239
3240
static int
3241
dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
3242
0
{
3243
0
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
3244
0
    uint64_t new_port_seq;
3245
0
    int error;
3246
3247
0
    new_port_seq = seq_read(dpif->dp->port_seq);
3248
0
    if (dpif->last_port_seq != new_port_seq) {
3249
0
        dpif->last_port_seq = new_port_seq;
3250
0
        error = ENOBUFS;
3251
0
    } else {
3252
0
        error = EAGAIN;
3253
0
    }
3254
3255
0
    return error;
3256
0
}
3257
3258
static void
3259
dpif_netdev_port_poll_wait(const struct dpif *dpif_)
3260
0
{
3261
0
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
3262
3263
0
    seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
3264
0
}
3265
3266
static struct dp_netdev_flow *
3267
dp_netdev_flow_cast(const struct dpcls_rule *cr)
3268
0
{
3269
0
    return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
3270
0
}
3271
3272
static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
3273
0
{
3274
0
    return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
3275
0
}
3276
3277
/* netdev_flow_key utilities.
3278
 *
3279
 * netdev_flow_key is basically a miniflow.  We use these functions
3280
 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
3281
 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
3282
 *
3283
 * - Since we are dealing exclusively with miniflows created by
3284
 *   miniflow_extract(), if the map is different the miniflow is different.
3285
 *   Therefore we can be faster by comparing the map and the miniflow in a
3286
 *   single memcmp().
3287
 * - These functions can be inlined by the compiler. */
3288
3289
static inline bool
3290
netdev_flow_key_equal(const struct netdev_flow_key *a,
3291
                      const struct netdev_flow_key *b)
3292
0
{
3293
    /* 'b->len' may be not set yet. */
3294
0
    return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
3295
0
}
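
As the comment block above notes, identical flowmaps imply an identical value layout, so a key comparison can be a hash check followed by a single memcmp covering the map and the packed values. The toy program below is an editor's illustration of that strategy only; 'struct toy_key' is hypothetical and far smaller than the real netdev_flow_key.

/* Editor's toy illustration, not OVS code: compare the precomputed hash
 * first, then do one memcmp that covers the bitmap and the packed values. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_key {
    uint32_t hash;     /* Precomputed hash of map + data. */
    uint32_t len;      /* Valid bytes in data[]. */
    uint64_t map;      /* Bitmap saying which fields are present. */
    uint64_t data[4];  /* Packed field values, in map order. */
};

static bool
toy_key_equal(const struct toy_key *a, const struct toy_key *b)
{
    /* One memcmp starting at 'map' compares the bitmap and the packed
     * values together: equal maps imply an identical value layout. */
    return a->hash == b->hash
           && !memcmp(&a->map, &b->map, sizeof a->map + a->len);
}

int
main(void)
{
    struct toy_key a = { .hash = 7, .len = 8, .map = 0x1, .data = { 42 } };
    struct toy_key b = a;

    printf("equal: %d\n", toy_key_equal(&a, &b));
    return 0;
}
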
3296
3297
static inline void
3298
netdev_flow_key_clone(struct netdev_flow_key *dst,
3299
                      const struct netdev_flow_key *src)
3300
0
{
3301
0
    memcpy(dst, src,
3302
0
           offsetof(struct netdev_flow_key, mf) + src->len);
3303
0
}
3304
3305
/* Initialize a netdev_flow_key 'mask' from 'match'. */
3306
static inline void
3307
netdev_flow_mask_init(struct netdev_flow_key *mask,
3308
                      const struct match *match)
3309
0
{
3310
0
    uint64_t *dst = miniflow_values(&mask->mf);
3311
0
    struct flowmap fmap;
3312
0
    uint32_t hash = 0;
3313
0
    size_t idx;
3314
3315
    /* Only check masks that make sense for the flow. */
3316
0
    flow_wc_map(&match->flow, &fmap);
3317
0
    flowmap_init(&mask->mf.map);
3318
3319
0
    FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
3320
0
        uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
3321
3322
0
        if (mask_u64) {
3323
0
            flowmap_set(&mask->mf.map, idx, 1);
3324
0
            *dst++ = mask_u64;
3325
0
            hash = hash_add64(hash, mask_u64);
3326
0
        }
3327
0
    }
3328
3329
0
    map_t map;
3330
3331
0
    FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
3332
0
        hash = hash_add64(hash, map);
3333
0
    }
3334
3335
0
    size_t n = dst - miniflow_get_values(&mask->mf);
3336
3337
0
    mask->hash = hash_finish(hash, n * 8);
3338
0
    mask->len = netdev_flow_key_size(n);
3339
0
}
3340
3341
/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
3342
static inline void
3343
netdev_flow_key_init_masked(struct netdev_flow_key *dst,
3344
                            const struct flow *flow,
3345
                            const struct netdev_flow_key *mask)
3346
0
{
3347
0
    uint64_t *dst_u64 = miniflow_values(&dst->mf);
3348
0
    const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
3349
0
    uint32_t hash = 0;
3350
0
    uint64_t value;
3351
3352
0
    dst->len = mask->len;
3353
0
    dst->mf = mask->mf;   /* Copy maps. */
3354
3355
0
    FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
3356
0
        *dst_u64 = value & *mask_u64++;
3357
0
        hash = hash_add64(hash, *dst_u64++);
3358
0
    }
3359
0
    dst->hash = hash_finish(hash,
3360
0
                            (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
3361
0
}
3362
3363
/* Initializes 'key' as a copy of 'flow'. */
3364
static inline void
3365
netdev_flow_key_init(struct netdev_flow_key *key,
3366
                     const struct flow *flow)
3367
0
{
3368
0
    uint64_t *dst = miniflow_values(&key->mf);
3369
0
    uint32_t hash = 0;
3370
0
    uint64_t value;
3371
3372
0
    miniflow_map_init(&key->mf, flow);
3373
0
    miniflow_init(&key->mf, flow);
3374
3375
0
    size_t n = dst - miniflow_get_values(&key->mf);
3376
3377
0
    FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) {
3378
0
        hash = hash_add64(hash, value);
3379
0
    }
3380
3381
0
    key->hash = hash_finish(hash, n * 8);
3382
0
    key->len = netdev_flow_key_size(n);
3383
0
}
3384
3385
static inline void
3386
emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
3387
                 const struct netdev_flow_key *key)
3388
0
{
3389
0
    if (ce->flow != flow) {
3390
0
        if (ce->flow) {
3391
0
            dp_netdev_flow_unref(ce->flow);
3392
0
        }
3393
3394
0
        if (dp_netdev_flow_ref(flow)) {
3395
0
            ce->flow = flow;
3396
0
        } else {
3397
0
            ce->flow = NULL;
3398
0
        }
3399
0
    }
3400
0
    if (key) {
3401
0
        netdev_flow_key_clone(&ce->key, key);
3402
0
    }
3403
0
}
3404
3405
static inline void
3406
emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
3407
           struct dp_netdev_flow *flow)
3408
0
{
3409
0
    struct emc_entry *to_be_replaced = NULL;
3410
0
    struct emc_entry *current_entry;
3411
3412
0
    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
3413
0
        if (netdev_flow_key_equal(&current_entry->key, key)) {
3414
            /* We found the entry with the 'mf' miniflow */
3415
0
            emc_change_entry(current_entry, flow, NULL);
3416
0
            return;
3417
0
        }
3418
3419
        /* Replacement policy: put the flow in an empty (not alive) entry, or
3420
         * in the probed entry with the smallest key hash. */
3421
0
        if (!to_be_replaced
3422
0
            || (emc_entry_alive(to_be_replaced)
3423
0
                && !emc_entry_alive(current_entry))
3424
0
            || current_entry->key.hash < to_be_replaced->key.hash) {
3425
0
            to_be_replaced = current_entry;
3426
0
        }
3427
0
    }
3428
    /* We didn't find the miniflow in the cache.
3429
     * The 'to_be_replaced' entry is where the new flow will be stored */
3430
3431
0
    emc_change_entry(to_be_replaced, flow, key);
3432
0
}
3433
3434
static inline void
3435
emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
3436
                         const struct netdev_flow_key *key,
3437
                         struct dp_netdev_flow *flow)
3438
0
{
3439
    /* Insert an entry into the EMC based on probability value 'min'. By
3440
     * default the value is UINT32_MAX / 100 which yields an insertion
3441
     * probability of 1/100, i.e. 1%. */
3442
3443
0
    uint32_t min = pmd->ctx.emc_insert_min;
3444
3445
0
    if (min && random_uint32() <= min) {
3446
0
        emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
3447
0
    }
3448
0
}
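
To make the probability arithmetic above concrete: with the default threshold of UINT32_MAX / 100, the test 'random_uint32() <= min' accepts roughly one packet in a hundred. The sketch below is an editor's illustration, not OVS code; the helper name emc_insert_min_from_inv_prob() is made up, and it simply maps an inverse insertion probability to the threshold and prints the resulting acceptance rate.

/* Editor's standalone sketch, not OVS code: how an inverse insertion
 * probability (default 100, i.e. 1%) maps to the uint32 threshold that is
 * compared against a uniformly random 32-bit value. */
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
emc_insert_min_from_inv_prob(uint32_t inv_prob)
{
    /* 0 disables insertion entirely, mirroring the "if (min && ...)" test
     * in emc_probabilistic_insert() above. */
    return inv_prob ? UINT32_MAX / inv_prob : 0;
}

int
main(void)
{
    uint32_t inv_probs[] = { 0, 1, 100 };
    size_t i;

    for (i = 0; i < sizeof inv_probs / sizeof inv_probs[0]; i++) {
        uint32_t min = emc_insert_min_from_inv_prob(inv_probs[i]);
        /* Probability that a uniformly random uint32 is <= min. */
        double p = min ? (min + 1.0) / 4294967296.0 : 0.0;

        printf("inv_prob=%3" PRIu32 "  threshold=%10" PRIu32
               "  insert probability=%.4f\n", inv_probs[i], min, p);
    }
    return 0;
}
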
3449
3450
static inline const struct cmap_node *
3451
smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
3452
0
{
3453
0
    struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
3454
0
    struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
3455
0
    uint16_t sig = hash >> 16;
3456
0
    uint16_t index = UINT16_MAX;
3457
3458
0
    for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3459
0
        if (bucket->sig[i] == sig) {
3460
0
            index = bucket->flow_idx[i];
3461
0
            break;
3462
0
        }
3463
0
    }
3464
0
    if (index != UINT16_MAX) {
3465
0
        return cmap_find_by_index(&pmd->flow_table, index);
3466
0
    }
3467
0
    return NULL;
3468
0
}
3469
3470
/* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
3471
 * turned off, 2) the flow_table index is larger than uint16_t can handle.
3472
 * If an SMC entry with the same signature already exists, the index will be
3473
 * updated. If there is no existing entry, but an empty entry is available,
3474
 * the empty entry will be taken. If no empty entry or existing same signature,
3475
 * a random entry from the hashed bucket will be picked. */
3476
static inline void
3477
smc_insert(struct dp_netdev_pmd_thread *pmd,
3478
           const struct netdev_flow_key *key,
3479
           uint32_t hash)
3480
0
{
3481
0
    struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
3482
0
    struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
3483
0
    uint16_t index;
3484
0
    uint32_t cmap_index;
3485
0
    int i;
3486
3487
0
    if (!pmd->ctx.smc_enable_db) {
3488
0
        return;
3489
0
    }
3490
3491
0
    cmap_index = cmap_find_index(&pmd->flow_table, hash);
3492
0
    index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
3493
3494
    /* If the index is larger than SMC can handle (uint16_t), we don't
3495
     * insert */
3496
0
    if (index == UINT16_MAX) {
3497
0
        return;
3498
0
    }
3499
3500
    /* If an entry with the same signature already exists, update the index. */
3501
0
    uint16_t sig = key->hash >> 16;
3502
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3503
0
        if (bucket->sig[i] == sig) {
3504
0
            bucket->flow_idx[i] = index;
3505
0
            return;
3506
0
        }
3507
0
    }
3508
    /* If there is an empty entry, occupy it. */
3509
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3510
0
        if (bucket->flow_idx[i] == UINT16_MAX) {
3511
0
            bucket->sig[i] = sig;
3512
0
            bucket->flow_idx[i] = index;
3513
0
            return;
3514
0
        }
3515
0
    }
3516
    /* Otherwise, pick a random entry. */
3517
0
    i = random_uint32() % SMC_ENTRY_PER_BUCKET;
3518
0
    bucket->sig[i] = sig;
3519
0
    bucket->flow_idx[i] = index;
3520
0
}
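
The insertion policy spelled out in the comment above (reuse the slot holding the same 16-bit signature, otherwise take an empty slot, otherwise evict a random one) can be seen in isolation in the toy bucket below. This is an editor's sketch, not OVS code; the entry count and names are illustrative only.

/* Editor's standalone sketch, not OVS code, of the bucket policy above:
 * same signature -> update, empty slot -> occupy, otherwise random evict. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_ENTRIES_PER_BUCKET 4
#define TOY_EMPTY UINT16_MAX

struct toy_bucket {
    uint16_t sig[TOY_ENTRIES_PER_BUCKET];
    uint16_t flow_idx[TOY_ENTRIES_PER_BUCKET];
};

static void
toy_bucket_insert(struct toy_bucket *b, uint32_t hash, uint16_t index)
{
    uint16_t sig = hash >> 16;
    int i;

    for (i = 0; i < TOY_ENTRIES_PER_BUCKET; i++) {   /* Same signature. */
        if (b->sig[i] == sig) {
            b->flow_idx[i] = index;
            return;
        }
    }
    for (i = 0; i < TOY_ENTRIES_PER_BUCKET; i++) {   /* Empty slot. */
        if (b->flow_idx[i] == TOY_EMPTY) {
            b->sig[i] = sig;
            b->flow_idx[i] = index;
            return;
        }
    }
    i = rand() % TOY_ENTRIES_PER_BUCKET;             /* Random eviction. */
    b->sig[i] = sig;
    b->flow_idx[i] = index;
}

int
main(void)
{
    struct toy_bucket b;
    int i;

    for (i = 0; i < TOY_ENTRIES_PER_BUCKET; i++) {
        b.sig[i] = 0;
        b.flow_idx[i] = TOY_EMPTY;
    }
    toy_bucket_insert(&b, 0xabcd0001, 7);
    toy_bucket_insert(&b, 0xabcd0002, 9);   /* Same signature: index updated. */
    printf("slot 0: sig=0x%04x idx=%u\n", b.sig[0], b.flow_idx[0]);
    return 0;
}
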
3521
3522
inline void
3523
emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
3524
                               const struct netdev_flow_key *keys,
3525
                               struct dpcls_rule **rules,
3526
                               uint32_t emc_insert_mask)
3527
0
{
3528
0
    while (emc_insert_mask) {
3529
0
        uint32_t i = raw_ctz(emc_insert_mask);
3530
0
        emc_insert_mask &= emc_insert_mask - 1;
3531
        /* Get the required parameters for EMC/SMC from the rule. */
3532
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
3533
        /* Insert the key into EMC/SMC. */
3534
0
        emc_probabilistic_insert(pmd, &keys[i], flow);
3535
0
    }
3536
0
}
3537
3538
inline void
3539
smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
3540
                 const struct netdev_flow_key *keys,
3541
                 struct dpcls_rule **rules,
3542
                 uint32_t smc_insert_mask)
3543
0
{
3544
0
    while (smc_insert_mask) {
3545
0
        uint32_t i = raw_ctz(smc_insert_mask);
3546
0
        smc_insert_mask &= smc_insert_mask - 1;
3547
        /* Get the required parameters for EMC/SMC from the rule. */
3548
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
3549
0
        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
3550
        /* Insert the key into EMC/SMC. */
3551
0
        smc_insert(pmd, &keys[i], hash);
3552
0
    }
3553
0
}
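
Both batch helpers above walk a 32-bit mask one set bit at a time: raw_ctz() finds the lowest set bit and 'mask &= mask - 1' clears it. A minimal standalone demonstration of that idiom follows; it is an editor's illustration that uses the GCC/Clang __builtin_ctz in place of OVS's raw_ctz().

/* Editor's standalone sketch, not OVS code: iterate the set bits of a mask. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint32_t mask = 0x0000002d;                       /* Bits 0, 2, 3, 5 set. */

    while (mask) {
        uint32_t i = (uint32_t) __builtin_ctz(mask);  /* GCC/Clang builtin. */
        mask &= mask - 1;                             /* Clear lowest set bit. */
        printf("processing index %u\n", i);
    }
    return 0;
}
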
3554
3555
static struct dp_netdev_flow *
3556
dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
3557
                          const struct netdev_flow_key *key,
3558
                          int *lookup_num_p)
3559
0
{
3560
0
    struct dpcls *cls;
3561
0
    struct dpcls_rule *rule = NULL;
3562
0
    odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
3563
0
                                                     in_port.odp_port));
3564
0
    struct dp_netdev_flow *netdev_flow = NULL;
3565
3566
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
3567
0
    if (OVS_LIKELY(cls)) {
3568
0
        dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3569
0
        netdev_flow = dp_netdev_flow_cast(rule);
3570
0
    }
3571
0
    return netdev_flow;
3572
0
}
3573
3574
static struct dp_netdev_flow *
3575
dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3576
                        const ovs_u128 *ufidp, const struct nlattr *key,
3577
                        size_t key_len)
3578
0
{
3579
0
    struct dp_netdev_flow *netdev_flow;
3580
0
    struct flow flow;
3581
0
    ovs_u128 ufid;
3582
3583
    /* If a UFID is not provided, determine one based on the key. */
3584
0
    if (!ufidp && key && key_len
3585
0
        && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
3586
0
        odp_flow_key_hash(&flow, sizeof flow, &ufid);
3587
0
        ufidp = &ufid;
3588
0
    }
3589
3590
0
    if (ufidp) {
3591
0
        CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3592
0
                                 &pmd->flow_table) {
3593
0
            if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3594
0
                return netdev_flow;
3595
0
            }
3596
0
        }
3597
0
    }
3598
3599
0
    return NULL;
3600
0
}
3601
3602
static void
3603
dp_netdev_flow_set_last_stats_attrs(struct dp_netdev_flow *netdev_flow,
3604
                                    const struct dpif_flow_stats *stats,
3605
                                    const struct dpif_flow_attrs *attrs,
3606
                                    int result)
3607
0
{
3608
0
    struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats;
3609
0
    struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs;
3610
3611
0
    atomic_store_relaxed(&netdev_flow->netdev_flow_get_result, result);
3612
0
    if (result) {
3613
0
        return;
3614
0
    }
3615
3616
0
    atomic_store_relaxed(&last_stats->used,         stats->used);
3617
0
    atomic_store_relaxed(&last_stats->packet_count, stats->n_packets);
3618
0
    atomic_store_relaxed(&last_stats->byte_count,   stats->n_bytes);
3619
0
    atomic_store_relaxed(&last_stats->tcp_flags,    stats->tcp_flags);
3620
3621
0
    atomic_store_relaxed(&last_attrs->offloaded,    attrs->offloaded);
3622
0
    atomic_store_relaxed(&last_attrs->dp_layer,     attrs->dp_layer);
3623
3624
0
}
3625
3626
static void
3627
dp_netdev_flow_get_last_stats_attrs(struct dp_netdev_flow *netdev_flow,
3628
                                    struct dpif_flow_stats *stats,
3629
                                    struct dpif_flow_attrs *attrs,
3630
                                    int *result)
3631
0
{
3632
0
    struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats;
3633
0
    struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs;
3634
3635
0
    atomic_read_relaxed(&netdev_flow->netdev_flow_get_result, result);
3636
0
    if (*result) {
3637
0
        return;
3638
0
    }
3639
3640
0
    atomic_read_relaxed(&last_stats->used,         &stats->used);
3641
0
    atomic_read_relaxed(&last_stats->packet_count, &stats->n_packets);
3642
0
    atomic_read_relaxed(&last_stats->byte_count,   &stats->n_bytes);
3643
0
    atomic_read_relaxed(&last_stats->tcp_flags,    &stats->tcp_flags);
3644
3645
0
    atomic_read_relaxed(&last_attrs->offloaded,    &attrs->offloaded);
3646
0
    atomic_read_relaxed(&last_attrs->dp_layer,     &attrs->dp_layer);
3647
0
}
3648
3649
static bool
3650
dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp,
3651
                                    struct dp_netdev_flow *netdev_flow,
3652
                                    struct dpif_flow_stats *stats,
3653
                                    struct dpif_flow_attrs *attrs)
3654
0
{
3655
0
    uint64_t act_buf[1024 / 8];
3656
0
    struct nlattr *actions;
3657
0
    struct netdev *netdev;
3658
0
    struct match match;
3659
0
    struct ofpbuf buf;
3660
3661
0
    int ret = 0;
3662
3663
0
    if (!netdev_is_flow_api_enabled()) {
3664
0
        return false;
3665
0
    }
3666
3667
0
    netdev = netdev_ports_get(netdev_flow->flow.in_port.odp_port,
3668
0
                              dpif_normalize_type(dp->class->type));
3669
0
    if (!netdev) {
3670
0
        return false;
3671
0
    }
3672
0
    ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf);
3673
    /* Taking a global 'port_rwlock' to fulfill thread safety
3674
     * restrictions regarding netdev port mapping.
3675
     *
3676
     * XXX: Main thread will try to pause/stop all revalidators during datapath
3677
     *      reconfiguration via datapath purge callback (dp_purge_cb) while
3678
     *      rw-holding 'dp->port_rwlock'.  So we're not waiting for lock here.
3679
     *      Otherwise, deadlock is possible, because revalidators might sleep
3680
     *      waiting for the main thread to release the lock and main thread
3681
     *      will wait for them to stop processing.
3682
     *      This workaround might make statistics less accurate. Especially
3683
     *      for the flow deletion case, since there will be no other attempt. */
3684
0
    if (!ovs_rwlock_tryrdlock(&dp->port_rwlock)) {
3685
0
        ret = netdev_flow_get(netdev, &match, &actions,
3686
0
                              &netdev_flow->mega_ufid, stats, attrs, &buf);
3687
        /* Storing statistics and attributes from the last request for
3688
         * later use on mutex contention. */
3689
0
        dp_netdev_flow_set_last_stats_attrs(netdev_flow, stats, attrs, ret);
3690
0
        ovs_rwlock_unlock(&dp->port_rwlock);
3691
0
    } else {
3692
0
        dp_netdev_flow_get_last_stats_attrs(netdev_flow, stats, attrs, &ret);
3693
0
        if (!ret && !attrs->dp_layer) {
3694
            /* Flow was never reported as 'offloaded' so it's harmless
3695
             * to continue to think so. */
3696
0
            ret = EAGAIN;
3697
0
        }
3698
0
    }
3699
0
    netdev_close(netdev);
3700
0
    if (ret) {
3701
0
        return false;
3702
0
    }
3703
3704
0
    return true;
3705
0
}
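
The XXX comment above documents a try-lock-or-fall-back pattern: take 'port_rwlock' only if it is immediately available, otherwise serve the statistics cached by the last successful query. The standalone sketch below shows the shape of that pattern; it is an editor's illustration, not OVS code, uses a plain pthread rwlock, and deliberately omits the relaxed atomics the real code needs for the cached copy.

/* Editor's standalone sketch, not OVS code: try the lock, fall back to the
 * last cached value instead of blocking. */
#include <pthread.h>
#include <stdio.h>

struct toy_stats {
    unsigned long long packets;
};

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static struct toy_stats live = { 1000 };    /* Protected by 'lock'. */
static struct toy_stats cached = { 900 };   /* Last successfully read value. */

static struct toy_stats
toy_get_stats(void)
{
    struct toy_stats out;

    if (!pthread_rwlock_tryrdlock(&lock)) {
        out = live;              /* Fresh read; refresh the cache. */
        cached = out;
        pthread_rwlock_unlock(&lock);
    } else {
        out = cached;            /* Contended: possibly stale, never blocks. */
    }
    return out;
}

int
main(void)
{
    printf("packets=%llu\n", toy_get_stats().packets);
    return 0;
}
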
3706
3707
static void
3708
get_dpif_flow_status(const struct dp_netdev *dp,
3709
                     const struct dp_netdev_flow *netdev_flow_,
3710
                     struct dpif_flow_stats *stats,
3711
                     struct dpif_flow_attrs *attrs)
3712
0
{
3713
0
    struct dpif_flow_stats offload_stats;
3714
0
    struct dpif_flow_attrs offload_attrs;
3715
0
    struct dp_netdev_flow *netdev_flow;
3716
0
    unsigned long long n;
3717
0
    long long used;
3718
0
    uint16_t flags;
3719
3720
0
    netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3721
3722
0
    atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3723
0
    stats->n_packets = n;
3724
0
    atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3725
0
    stats->n_bytes = n;
3726
0
    atomic_read_relaxed(&netdev_flow->stats.used, &used);
3727
0
    stats->used = used;
3728
0
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3729
0
    stats->tcp_flags = flags;
3730
3731
0
    if (dpif_netdev_get_flow_offload_status(dp, netdev_flow,
3732
0
                                            &offload_stats, &offload_attrs)) {
3733
0
        stats->n_packets += offload_stats.n_packets;
3734
0
        stats->n_bytes += offload_stats.n_bytes;
3735
0
        stats->used = MAX(stats->used, offload_stats.used);
3736
0
        stats->tcp_flags |= offload_stats.tcp_flags;
3737
0
        if (attrs) {
3738
0
            attrs->offloaded = offload_attrs.offloaded;
3739
0
            attrs->dp_layer = offload_attrs.dp_layer;
3740
0
        }
3741
0
    } else if (attrs) {
3742
0
        attrs->offloaded = false;
3743
0
        attrs->dp_layer = "ovs";
3744
0
    }
3745
0
}
3746
3747
/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3748
 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3749
 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3750
 * protect them. */
3751
static void
3752
dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
3753
                            const struct dp_netdev_flow *netdev_flow,
3754
                            struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
3755
                            struct dpif_flow *flow, bool terse)
3756
0
{
3757
0
    if (terse) {
3758
0
        memset(flow, 0, sizeof *flow);
3759
0
    } else {
3760
0
        struct flow_wildcards wc;
3761
0
        struct dp_netdev_actions *actions;
3762
0
        size_t offset;
3763
0
        struct odp_flow_key_parms odp_parms = {
3764
0
            .flow = &netdev_flow->flow,
3765
0
            .mask = &wc.masks,
3766
0
            .support = dp_netdev_support,
3767
0
        };
3768
3769
0
        miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3770
        /* in_port is exact matched, but we have left it out from the mask for
3771
         * optimization reasons. Add in_port back to the mask. */
3772
0
        wc.masks.in_port.odp_port = ODPP_NONE;
3773
3774
        /* Key */
3775
0
        offset = key_buf->size;
3776
0
        flow->key = ofpbuf_tail(key_buf);
3777
0
        odp_flow_key_from_flow(&odp_parms, key_buf);
3778
0
        flow->key_len = key_buf->size - offset;
3779
3780
        /* Mask */
3781
0
        offset = mask_buf->size;
3782
0
        flow->mask = ofpbuf_tail(mask_buf);
3783
0
        odp_parms.key_buf = key_buf;
3784
0
        odp_flow_key_from_mask(&odp_parms, mask_buf);
3785
0
        flow->mask_len = mask_buf->size - offset;
3786
3787
        /* Actions */
3788
0
        actions = dp_netdev_flow_get_actions(netdev_flow);
3789
0
        flow->actions = actions->actions;
3790
0
        flow->actions_len = actions->size;
3791
0
    }
3792
3793
0
    flow->ufid = netdev_flow->ufid;
3794
0
    flow->ufid_present = true;
3795
0
    flow->pmd_id = netdev_flow->pmd_id;
3796
3797
0
    get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
3798
0
    flow->attrs.dp_extra_info = netdev_flow->dp_extra_info;
3799
0
}
3800
3801
static int
3802
dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3803
                              const struct nlattr *mask_key,
3804
                              uint32_t mask_key_len, const struct flow *flow,
3805
                              struct flow_wildcards *wc, bool probe)
3806
0
{
3807
0
    enum odp_key_fitness fitness;
3808
3809
0
    fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3810
0
    if (fitness) {
3811
0
        if (!probe) {
3812
            /* This should not happen: it indicates that
3813
             * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3814
             * disagree on the acceptable form of a mask.  Log the problem
3815
             * as an error, with enough details to enable debugging. */
3816
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3817
3818
0
            if (!VLOG_DROP_ERR(&rl)) {
3819
0
                struct ds s;
3820
3821
0
                ds_init(&s);
3822
0
                odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3823
0
                                true);
3824
0
                VLOG_ERR("internal error parsing flow mask %s (%s)",
3825
0
                ds_cstr(&s), odp_key_fitness_to_string(fitness));
3826
0
                ds_destroy(&s);
3827
0
            }
3828
0
        }
3829
3830
0
        return EINVAL;
3831
0
    }
3832
3833
0
    return 0;
3834
0
}
3835
3836
static int
3837
dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3838
                              struct flow *flow, bool probe)
3839
0
{
3840
0
    if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3841
0
        if (!probe) {
3842
            /* This should not happen: it indicates that
3843
             * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3844
             * the acceptable form of a flow.  Log the problem as an error,
3845
             * with enough details to enable debugging. */
3846
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3847
3848
0
            if (!VLOG_DROP_ERR(&rl)) {
3849
0
                struct ds s;
3850
3851
0
                ds_init(&s);
3852
0
                odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3853
0
                VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3854
0
                ds_destroy(&s);
3855
0
            }
3856
0
        }
3857
3858
0
        return EINVAL;
3859
0
    }
3860
3861
0
    if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3862
0
        return EINVAL;
3863
0
    }
3864
3865
0
    return 0;
3866
0
}
3867
3868
static int
3869
dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3870
0
{
3871
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3872
0
    struct dp_netdev_flow *netdev_flow;
3873
0
    struct dp_netdev_pmd_thread *pmd;
3874
0
    struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3875
0
    struct hmapx_node *node;
3876
0
    int error = EINVAL;
3877
3878
0
    if (get->pmd_id == PMD_ID_NULL) {
3879
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3880
0
            if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3881
0
                dp_netdev_pmd_unref(pmd);
3882
0
            }
3883
0
        }
3884
0
    } else {
3885
0
        pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3886
0
        if (!pmd) {
3887
0
            goto out;
3888
0
        }
3889
0
        hmapx_add(&to_find, pmd);
3890
0
    }
3891
3892
0
    if (!hmapx_count(&to_find)) {
3893
0
        goto out;
3894
0
    }
3895
3896
0
    HMAPX_FOR_EACH (node, &to_find) {
3897
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
3898
0
        netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3899
0
                                              get->key_len);
3900
0
        if (netdev_flow) {
3901
0
            dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
3902
0
                                        get->buffer, get->flow, false);
3903
0
            error = 0;
3904
0
            break;
3905
0
        } else {
3906
0
            error = ENOENT;
3907
0
        }
3908
0
    }
3909
3910
0
    HMAPX_FOR_EACH (node, &to_find) {
3911
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
3912
0
        dp_netdev_pmd_unref(pmd);
3913
0
    }
3914
0
out:
3915
0
    hmapx_destroy(&to_find);
3916
0
    return error;
3917
0
}
3918
3919
static void
3920
dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3921
0
{
3922
0
    struct flow masked_flow;
3923
0
    size_t i;
3924
3925
0
    for (i = 0; i < sizeof(struct flow); i++) {
3926
0
        ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3927
0
                                       ((uint8_t *)&match->wc)[i];
3928
0
    }
3929
0
    odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
3930
0
}
3931
3932
uint64_t
3933
dp_netdev_simple_match_mark(odp_port_t in_port, ovs_be16 dl_type,
3934
                            uint8_t nw_frag, ovs_be16 vlan_tci)
3935
0
{
3936
    /* Simple Match Mark:
3937
     *
3938
     * BE:
3939
     * +-----------------+-------------++---------+---+-----------+
3940
     * |     in_port     |   dl_type   || nw_frag |CFI|  VID(12)  |
3941
     * +-----------------+-------------++---------+---+-----------+
3942
     * 0                 32          47 49         51  52     63
3943
     *
3944
     * LE:
3945
     * +-----------------+-------------+------++-------+---+------+
3946
     * |     in_port     |   dl_type   |VID(8)||nw_frag|CFI|VID(4)|
3947
     * +-----------------+-------------+------++-------+---+------+
3948
     * 0                 32          47 48  55  57   59 60  61   63
3949
     *
3950
     *         Big Endian              Little Endian
3951
     * in_port : 32 bits [ 0..31]  in_port : 32 bits [ 0..31]
3952
     * dl_type : 16 bits [32..47]  dl_type : 16 bits [32..47]
3953
     * <empty> :  1 bit  [48..48]  vlan VID:  8 bits [48..55]
3954
     * nw_frag :  2 bits [49..50]  <empty> :  1 bit  [56..56]
3955
     * vlan CFI:  1 bit  [51..51]  nw_frag :  2 bits [57..59]
3956
     * vlan VID: 12 bits [52..63]  vlan CFI:  1 bit  [60..60]
3957
     *                             vlan VID:  4 bits [61..63]
3958
     *
3959
     * Layout is different for LE and BE in order to save a couple of
3960
     * network to host translations.
3961
     * */
3962
0
    return ((uint64_t) odp_to_u32(in_port) << 32)
3963
0
           | ((OVS_FORCE uint32_t) dl_type << 16)
3964
#if WORDS_BIGENDIAN
3965
           | (((uint16_t) nw_frag & FLOW_NW_FRAG_MASK) << VLAN_PCP_SHIFT)
3966
#else
3967
0
           | ((nw_frag & FLOW_NW_FRAG_MASK) << (VLAN_PCP_SHIFT - 8))
3968
0
#endif
3969
0
           | (OVS_FORCE uint16_t) (vlan_tci & htons(VLAN_VID_MASK | VLAN_CFI));
3970
0
}
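
A worked example helps read the bit layout above: packing in_port 5, dl_type 0x0800, nw_frag 0 and VLAN 100 with CFI set yields 0x0000000508001064 in the simplified host-order layout used by the sketch below. This is an editor's illustration only; the real function keeps dl_type and vlan_tci in network byte order and shifts nw_frag differently on big- and little-endian hosts precisely to avoid byte swaps.

/* Editor's standalone sketch, not OVS code: pack the same four header
 * fields into one 64-bit key, in plain host byte order for readability. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t
toy_simple_match_mark(uint32_t in_port, uint16_t dl_type,
                      uint8_t nw_frag, uint16_t vlan_vid_cfi)
{
    return ((uint64_t) in_port << 32)
           | ((uint64_t) dl_type << 16)
           | ((uint64_t) (nw_frag & 0x3) << 13)
           | (vlan_vid_cfi & 0x1fff);        /* CFI + 12-bit VID. */
}

int
main(void)
{
    /* Port 5, IPv4 ethertype, not a fragment, VLAN 100 with CFI set. */
    uint64_t mark = toy_simple_match_mark(5, 0x0800, 0, 0x1000 | 100);

    printf("mark = 0x%016" PRIx64 "\n", mark);
    return 0;
}
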
3971
3972
struct dp_netdev_flow *
3973
dp_netdev_simple_match_lookup(const struct dp_netdev_pmd_thread *pmd,
3974
                              odp_port_t in_port, ovs_be16 dl_type,
3975
                              uint8_t nw_frag, ovs_be16 vlan_tci)
3976
0
{
3977
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
3978
0
                                                nw_frag, vlan_tci);
3979
0
    uint32_t hash = hash_uint64(mark);
3980
0
    struct dp_netdev_flow *flow;
3981
0
    bool found = false;
3982
3983
0
    CMAP_FOR_EACH_WITH_HASH (flow, simple_match_node,
3984
0
                             hash, &pmd->simple_match_table) {
3985
0
        if (flow->simple_match_mark == mark) {
3986
0
            found = true;
3987
0
            break;
3988
0
        }
3989
0
    }
3990
0
    return found ? flow : NULL;
3991
0
}
3992
3993
bool
3994
dp_netdev_simple_match_enabled(const struct dp_netdev_pmd_thread *pmd,
3995
                               odp_port_t in_port)
3996
0
{
3997
0
    return ccmap_find(&pmd->n_flows, odp_to_u32(in_port))
3998
0
           == ccmap_find(&pmd->n_simple_flows, odp_to_u32(in_port));
3999
0
}
4000
4001
static void
4002
dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
4003
                              struct dp_netdev_flow *dp_flow)
4004
    OVS_REQUIRES(pmd->flow_mutex)
4005
0
{
4006
0
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
4007
0
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
4008
0
    ovs_be16 dl_type = dp_flow->flow.dl_type;
4009
0
    uint8_t nw_frag = dp_flow->flow.nw_frag;
4010
4011
0
    if (!dp_netdev_flow_ref(dp_flow)) {
4012
0
        return;
4013
0
    }
4014
4015
    /* Avoid double insertion.  Should not happen in practice. */
4016
0
    dp_netdev_simple_match_remove(pmd, dp_flow);
4017
4018
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
4019
0
                                                nw_frag, vlan_tci);
4020
0
    uint32_t hash = hash_uint64(mark);
4021
4022
0
    dp_flow->simple_match_mark = mark;
4023
0
    cmap_insert(&pmd->simple_match_table,
4024
0
                CONST_CAST(struct cmap_node *, &dp_flow->simple_match_node),
4025
0
                hash);
4026
0
    ccmap_inc(&pmd->n_simple_flows, odp_to_u32(in_port));
4027
4028
0
    VLOG_DBG("Simple match insert: "
4029
0
             "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
4030
0
             pmd->core_id, in_port, mark);
4031
0
}
4032
4033
static void
4034
dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
4035
                               struct dp_netdev_flow *dp_flow)
4036
    OVS_REQUIRES(pmd->flow_mutex)
4037
0
{
4038
0
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
4039
0
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
4040
0
    ovs_be16 dl_type = dp_flow->flow.dl_type;
4041
0
    uint8_t nw_frag = dp_flow->flow.nw_frag;
4042
0
    struct dp_netdev_flow *flow;
4043
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
4044
0
                                                nw_frag, vlan_tci);
4045
0
    uint32_t hash = hash_uint64(mark);
4046
4047
0
    flow = dp_netdev_simple_match_lookup(pmd, in_port, dl_type,
4048
0
                                         nw_frag, vlan_tci);
4049
0
    if (flow == dp_flow) {
4050
0
        VLOG_DBG("Simple match remove: "
4051
0
                 "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
4052
0
                 pmd->core_id, in_port, mark);
4053
0
        cmap_remove(&pmd->simple_match_table,
4054
0
                    CONST_CAST(struct cmap_node *, &flow->simple_match_node),
4055
0
                    hash);
4056
0
        ccmap_dec(&pmd->n_simple_flows, odp_to_u32(in_port));
4057
0
        dp_netdev_flow_unref(flow);
4058
0
    }
4059
0
}
4060
4061
static bool
4062
dp_netdev_flow_is_simple_match(const struct match *match)
4063
0
{
4064
0
    const struct flow *flow = &match->flow;
4065
0
    const struct flow_wildcards *wc = &match->wc;
4066
4067
0
    if (flow->recirc_id || flow->packet_type != htonl(PT_ETH)) {
4068
0
        return false;
4069
0
    }
4070
4071
    /* Check that the flow matches only a minimal set of always-set fields.
4072
     * Also checking that VLAN VID+CFI is an exact match, because these
4073
     * are not mandatory and could be masked. */
4074
0
    struct flow_wildcards *minimal = xmalloc(sizeof *minimal);
4075
0
    ovs_be16 vlan_tci_mask = htons(VLAN_VID_MASK | VLAN_CFI);
4076
4077
0
    flow_wildcards_init_catchall(minimal);
4078
    /* 'dpif-netdev' always has the following in exact match:
4079
     *   - recirc_id                   <-- recirc_id == 0 checked on input.
4080
     *   - in_port                     <-- Will be checked on input.
4081
     *   - packet_type                 <-- Assuming all packets are PT_ETH.
4082
     *   - dl_type                     <-- Need to match with.
4083
     *   - vlan_tci                    <-- Need to match with.
4084
     *   - and nw_frag for ip packets. <-- Need to match with.
4085
     */
4086
0
    WC_MASK_FIELD(minimal, recirc_id);
4087
0
    WC_MASK_FIELD(minimal, in_port);
4088
0
    WC_MASK_FIELD(minimal, packet_type);
4089
0
    WC_MASK_FIELD(minimal, dl_type);
4090
0
    WC_MASK_FIELD_MASK(minimal, vlans[0].tci, vlan_tci_mask);
4091
0
    WC_MASK_FIELD_MASK(minimal, nw_frag, FLOW_NW_FRAG_MASK);
4092
4093
0
    if (flow_wildcards_has_extra(minimal, wc)
4094
0
        || wc->masks.vlans[0].tci != vlan_tci_mask) {
4095
0
        free(minimal);
4096
0
        return false;
4097
0
    }
4098
0
    free(minimal);
4099
4100
0
    return true;
4101
0
}
4102
4103
static struct dp_netdev_flow *
4104
dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
4105
                   struct match *match, const ovs_u128 *ufid,
4106
                   const struct nlattr *actions, size_t actions_len,
4107
                   odp_port_t orig_in_port)
4108
    OVS_REQUIRES(pmd->flow_mutex)
4109
0
{
4110
0
    struct ds extra_info = DS_EMPTY_INITIALIZER;
4111
0
    struct dp_netdev_flow *flow;
4112
0
    struct netdev_flow_key mask;
4113
0
    struct dpcls *cls;
4114
0
    size_t unit;
4115
4116
    /* Make sure in_port is exact matched before we read it. */
4117
0
    ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
4118
0
    odp_port_t in_port = match->flow.in_port.odp_port;
4119
4120
    /* As we select the dpcls based on the port number, each netdev flow
4121
     * belonging to the same dpcls will have the same odp_port value.
4122
     * For performance reasons we wildcard odp_port here in the mask.  In the
4123
     * typical case dp_hash is also wildcarded, and the resulting 8-byte
4124
     * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
4125
     * will not be part of the subtable mask.
4126
     * This will speed up the hash computation during dpcls_lookup() because
4127
     * there is one less call to hash_add64() in this case. */
4128
0
    match->wc.masks.in_port.odp_port = 0;
4129
0
    netdev_flow_mask_init(&mask, match);
4130
0
    match->wc.masks.in_port.odp_port = ODPP_NONE;
4131
4132
    /* Make sure wc does not have metadata. */
4133
0
    ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
4134
0
               && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
4135
4136
    /* Do not allocate extra space. */
4137
0
    flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
4138
0
    memset(&flow->stats, 0, sizeof flow->stats);
4139
0
    atomic_init(&flow->netdev_flow_get_result, 0);
4140
0
    memset(&flow->last_stats, 0, sizeof flow->last_stats);
4141
0
    memset(&flow->last_attrs, 0, sizeof flow->last_attrs);
4142
0
    flow->dead = false;
4143
0
    flow->batch = NULL;
4144
0
    flow->mark = INVALID_FLOW_MARK;
4145
0
    flow->orig_in_port = orig_in_port;
4146
0
    *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
4147
0
    *CONST_CAST(struct flow *, &flow->flow) = match->flow;
4148
0
    *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
4149
0
    ovs_refcount_init(&flow->ref_cnt);
4150
0
    ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
4151
4152
0
    dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
4153
0
    netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
4154
4155
    /* Select dpcls for in_port. Relies on in_port to be exact match. */
4156
0
    cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
4157
0
    dpcls_insert(cls, &flow->cr, &mask);
4158
4159
0
    ds_put_cstr(&extra_info, "miniflow_bits(");
4160
0
    FLOWMAP_FOR_EACH_UNIT (unit) {
4161
0
        if (unit) {
4162
0
            ds_put_char(&extra_info, ',');
4163
0
        }
4164
0
        ds_put_format(&extra_info, "%d",
4165
0
                      count_1bits(flow->cr.mask->mf.map.bits[unit]));
4166
0
    }
4167
0
    ds_put_char(&extra_info, ')');
4168
0
    flow->dp_extra_info = ds_steal_cstr(&extra_info);
4169
0
    ds_destroy(&extra_info);
4170
4171
0
    cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
4172
0
                dp_netdev_flow_hash(&flow->ufid));
4173
0
    ccmap_inc(&pmd->n_flows, odp_to_u32(in_port));
4174
4175
0
    if (dp_netdev_flow_is_simple_match(match)) {
4176
0
        dp_netdev_simple_match_insert(pmd, flow);
4177
0
    }
4178
4179
0
    queue_netdev_flow_put(pmd, flow, match, actions, actions_len,
4180
0
                          DP_NETDEV_FLOW_OFFLOAD_OP_ADD);
4181
0
    log_netdev_flow_change(flow, match, NULL, actions, actions_len);
4182
4183
0
    return flow;
4184
0
}
4185
4186
static int
4187
flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
4188
                struct netdev_flow_key *key,
4189
                struct match *match,
4190
                ovs_u128 *ufid,
4191
                const struct dpif_flow_put *put,
4192
                struct dpif_flow_stats *stats)
4193
0
{
4194
0
    struct dp_netdev_flow *netdev_flow;
4195
0
    int error = 0;
4196
4197
0
    if (stats) {
4198
0
        memset(stats, 0, sizeof *stats);
4199
0
    }
4200
4201
0
    ovs_mutex_lock(&pmd->flow_mutex);
4202
0
    netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
4203
0
    if (!netdev_flow) {
4204
0
        if (put->flags & DPIF_FP_CREATE) {
4205
0
            dp_netdev_flow_add(pmd, match, ufid, put->actions,
4206
0
                               put->actions_len, ODPP_NONE);
4207
0
        } else {
4208
0
            error = ENOENT;
4209
0
        }
4210
0
    } else {
4211
0
        if (put->flags & DPIF_FP_MODIFY) {
4212
0
            struct dp_netdev_actions *new_actions;
4213
0
            struct dp_netdev_actions *old_actions;
4214
4215
0
            new_actions = dp_netdev_actions_create(put->actions,
4216
0
                                                   put->actions_len);
4217
4218
0
            old_actions = dp_netdev_flow_get_actions(netdev_flow);
4219
0
            ovsrcu_set(&netdev_flow->actions, new_actions);
4220
4221
0
            queue_netdev_flow_put(pmd, netdev_flow, match,
4222
0
                                  put->actions, put->actions_len,
4223
0
                                  DP_NETDEV_FLOW_OFFLOAD_OP_MOD);
4224
0
            log_netdev_flow_change(netdev_flow, match, old_actions,
4225
0
                                   put->actions, put->actions_len);
4226
4227
0
            if (stats) {
4228
0
                get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
4229
0
            }
4230
0
            if (put->flags & DPIF_FP_ZERO_STATS) {
4231
                /* XXX: The userspace datapath uses thread local statistics
4232
                 * (for flows), which should be updated only by the owning
4233
                 * thread.  Since we cannot write on stats memory here,
4234
                 * we choose not to support this flag.  Please note:
4235
                 * - This feature is currently used only by dpctl commands with
4236
                 *   option --clear.
4237
                 * - Should the need arise, this operation can be implemented
4238
                 *   by keeping a base value (to be updated here) for each
4239
                 *   counter, and subtracting it before outputting the stats */
4240
0
                error = EOPNOTSUPP;
4241
0
            }
4242
4243
0
            ovsrcu_postpone(dp_netdev_actions_free, old_actions);
4244
0
        } else if (put->flags & DPIF_FP_CREATE) {
4245
0
            error = EEXIST;
4246
0
        } else {
4247
            /* Overlapping flow. */
4248
0
            error = EINVAL;
4249
0
        }
4250
0
    }
4251
0
    ovs_mutex_unlock(&pmd->flow_mutex);
4252
0
    return error;
4253
0
}
4254
4255
static int
4256
dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
4257
0
{
4258
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4259
0
    struct netdev_flow_key key;
4260
0
    struct dp_netdev_pmd_thread *pmd;
4261
0
    struct match match;
4262
0
    ovs_u128 ufid;
4263
0
    int error;
4264
0
    bool probe = put->flags & DPIF_FP_PROBE;
4265
4266
0
    if (put->stats) {
4267
0
        memset(put->stats, 0, sizeof *put->stats);
4268
0
    }
4269
0
    error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
4270
0
                                          probe);
4271
0
    if (error) {
4272
0
        return error;
4273
0
    }
4274
0
    error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
4275
0
                                          put->mask, put->mask_len,
4276
0
                                          &match.flow, &match.wc, probe);
4277
0
    if (error) {
4278
0
        return error;
4279
0
    }
4280
4281
0
    if (match.wc.masks.in_port.odp_port != ODPP_NONE) {
4282
0
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4283
4284
0
        VLOG_ERR_RL(&rl, "failed to put%s flow: in_port is not an exact match",
4285
0
                    (put->flags & DPIF_FP_CREATE) ? "[create]"
4286
0
                    : (put->flags & DPIF_FP_MODIFY) ? "[modify]" : "[zero]");
4287
0
        return EINVAL;
4288
0
    }
4289
4290
0
    if (put->ufid) {
4291
0
        ufid = *put->ufid;
4292
0
    } else {
4293
0
        odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
4294
0
    }
4295
4296
    /* The Netlink encoding of datapath flow keys cannot express
4297
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
4298
     * tag is interpreted as exact match on the fact that there is no
4299
     * VLAN.  Unless we refactor a lot of code that translates between
4300
     * Netlink and struct flow representations, we have to do the same
4301
     * here.  This must be in sync with 'match' in handle_packet_upcall(). */
4302
0
    if (!match.wc.masks.vlans[0].tci) {
4303
0
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
4304
0
    }
4305
4306
    /* Must produce a netdev_flow_key for lookup.
4307
     * Use the same method as employed to create the key when adding
4308
     * the flow to the dpcls to make sure they match.
4309
     * We need to put in the unmasked key as flow_put_on_pmd() will first try
4310
     * to see if an entry exists doing a packet type lookup. As masked-out
4311
     * fields are interpreted as zeros, they could falsely match a wider IP
4312
     * address mask. Installation of the flow will use the match variable. */
4313
0
    netdev_flow_key_init(&key, &match.flow);
4314
4315
0
    if (put->pmd_id == PMD_ID_NULL) {
4316
0
        if (cmap_count(&dp->poll_threads) == 0) {
4317
0
            return EINVAL;
4318
0
        }
4319
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4320
0
            struct dpif_flow_stats pmd_stats;
4321
0
            int pmd_error;
4322
4323
0
            pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
4324
0
                                        &pmd_stats);
4325
0
            if (pmd_error) {
4326
0
                error = pmd_error;
4327
0
            } else if (put->stats) {
4328
0
                put->stats->n_packets += pmd_stats.n_packets;
4329
0
                put->stats->n_bytes += pmd_stats.n_bytes;
4330
0
                put->stats->used = MAX(put->stats->used, pmd_stats.used);
4331
0
                put->stats->tcp_flags |= pmd_stats.tcp_flags;
4332
0
            }
4333
0
        }
4334
0
    } else {
4335
0
        pmd = dp_netdev_get_pmd(dp, put->pmd_id);
4336
0
        if (!pmd) {
4337
0
            return EINVAL;
4338
0
        }
4339
0
        error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
4340
0
        dp_netdev_pmd_unref(pmd);
4341
0
    }
4342
4343
0
    return error;
4344
0
}
4345
4346
static int
4347
flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
4348
                struct dpif_flow_stats *stats,
4349
                const struct dpif_flow_del *del)
4350
0
{
4351
0
    struct dp_netdev_flow *netdev_flow;
4352
0
    int error = 0;
4353
4354
0
    ovs_mutex_lock(&pmd->flow_mutex);
4355
0
    netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
4356
0
                                          del->key_len);
4357
0
    if (netdev_flow) {
4358
0
        if (stats) {
4359
0
            get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
4360
0
        }
4361
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
4362
0
    } else {
4363
0
        error = ENOENT;
4364
0
    }
4365
0
    ovs_mutex_unlock(&pmd->flow_mutex);
4366
4367
0
    return error;
4368
0
}
4369
4370
static int
4371
dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
4372
0
{
4373
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4374
0
    struct dp_netdev_pmd_thread *pmd;
4375
0
    int error = 0;
4376
4377
0
    if (del->stats) {
4378
0
        memset(del->stats, 0, sizeof *del->stats);
4379
0
    }
4380
4381
0
    if (del->pmd_id == PMD_ID_NULL) {
4382
0
        if (cmap_count(&dp->poll_threads) == 0) {
4383
0
            return EINVAL;
4384
0
        }
4385
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4386
0
            struct dpif_flow_stats pmd_stats;
4387
0
            int pmd_error;
4388
4389
0
            pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
4390
0
            if (pmd_error) {
4391
0
                error = pmd_error;
4392
0
            } else if (del->stats) {
4393
0
                del->stats->n_packets += pmd_stats.n_packets;
4394
0
                del->stats->n_bytes += pmd_stats.n_bytes;
4395
0
                del->stats->used = MAX(del->stats->used, pmd_stats.used);
4396
0
                del->stats->tcp_flags |= pmd_stats.tcp_flags;
4397
0
            }
4398
0
        }
4399
0
    } else {
4400
0
        pmd = dp_netdev_get_pmd(dp, del->pmd_id);
4401
0
        if (!pmd) {
4402
0
            return EINVAL;
4403
0
        }
4404
0
        error = flow_del_on_pmd(pmd, del->stats, del);
4405
0
        dp_netdev_pmd_unref(pmd);
4406
0
    }
4407
4408
4409
0
    return error;
4410
0
}
4411
4412
struct dpif_netdev_flow_dump {
4413
    struct dpif_flow_dump up;
4414
    struct cmap_position poll_thread_pos;
4415
    struct cmap_position flow_pos;
4416
    struct dp_netdev_pmd_thread *cur_pmd;
4417
    int status;
4418
    struct ovs_mutex mutex;
4419
};
4420
4421
static struct dpif_netdev_flow_dump *
4422
dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
4423
0
{
4424
0
    return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
4425
0
}
4426
4427
static struct dpif_flow_dump *
4428
dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
4429
                             struct dpif_flow_dump_types *types OVS_UNUSED)
4430
0
{
4431
0
    struct dpif_netdev_flow_dump *dump;
4432
4433
0
    dump = xzalloc(sizeof *dump);
4434
0
    dpif_flow_dump_init(&dump->up, dpif_);
4435
0
    dump->up.terse = terse;
4436
0
    ovs_mutex_init(&dump->mutex);
4437
4438
0
    return &dump->up;
4439
0
}
4440
4441
static int
4442
dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
4443
0
{
4444
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
4445
4446
0
    ovs_mutex_destroy(&dump->mutex);
4447
0
    free(dump);
4448
0
    return 0;
4449
0
}
4450
4451
struct dpif_netdev_flow_dump_thread {
4452
    struct dpif_flow_dump_thread up;
4453
    struct dpif_netdev_flow_dump *dump;
4454
    struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
4455
    struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
4456
};
4457
4458
static struct dpif_netdev_flow_dump_thread *
4459
dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
4460
0
{
4461
0
    return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
4462
0
}
4463
4464
static struct dpif_flow_dump_thread *
4465
dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
4466
0
{
4467
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
4468
0
    struct dpif_netdev_flow_dump_thread *thread;
4469
4470
0
    thread = xmalloc(sizeof *thread);
4471
0
    dpif_flow_dump_thread_init(&thread->up, &dump->up);
4472
0
    thread->dump = dump;
4473
0
    return &thread->up;
4474
0
}
4475
4476
static void
4477
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
4478
0
{
4479
0
    struct dpif_netdev_flow_dump_thread *thread
4480
0
        = dpif_netdev_flow_dump_thread_cast(thread_);
4481
4482
0
    free(thread);
4483
0
}
4484
4485
static int
4486
dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
4487
                           struct dpif_flow *flows, int max_flows)
4488
0
{
4489
0
    struct dpif_netdev_flow_dump_thread *thread
4490
0
        = dpif_netdev_flow_dump_thread_cast(thread_);
4491
0
    struct dpif_netdev_flow_dump *dump = thread->dump;
4492
0
    struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
4493
0
    struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
4494
0
    struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
4495
0
    int n_flows = 0;
4496
0
    int i;
4497
4498
0
    ovs_mutex_lock(&dump->mutex);
4499
0
    if (!dump->status) {
4500
0
        struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
4501
0
        int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
4502
4503
        /* First call to dump_next(), extracts the first pmd thread.
4504
         * If there is no pmd thread, returns immediately. */
4505
0
        if (!pmd) {
4506
0
            pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
4507
0
            if (!pmd) {
4508
0
                ovs_mutex_unlock(&dump->mutex);
4509
0
                return n_flows;
4510
4511
0
            }
4512
0
        }
4513
4514
0
        do {
4515
0
            for (n_flows = 0; n_flows < flow_limit; n_flows++) {
4516
0
                struct cmap_node *node;
4517
4518
0
                node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
4519
0
                if (!node) {
4520
0
                    break;
4521
0
                }
4522
0
                netdev_flows[n_flows] = CONTAINER_OF(node,
4523
0
                                                     struct dp_netdev_flow,
4524
0
                                                     node);
4525
0
            }
4526
            /* When finishing dumping the current pmd thread, moves to
4527
             * the next. */
4528
0
            if (n_flows < flow_limit) {
4529
0
                memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
4530
0
                dp_netdev_pmd_unref(pmd);
4531
0
                pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
4532
0
                if (!pmd) {
4533
0
                    dump->status = EOF;
4534
0
                    break;
4535
0
                }
4536
0
            }
4537
            /* Keeps the reference for the next caller. */
4538
0
            dump->cur_pmd = pmd;
4539
4540
            /* If the current dump is empty, do not exit the loop, since the
4541
             * remaining pmds could have flows to be dumped.  Just dump again
4542
             * on the new 'pmd'. */
4543
0
        } while (!n_flows);
4544
0
    }
4545
0
    ovs_mutex_unlock(&dump->mutex);
4546
4547
0
    for (i = 0; i < n_flows; i++) {
4548
0
        struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
4549
0
        struct odputil_keybuf *keybuf = &thread->keybuf[i];
4550
0
        struct dp_netdev_flow *netdev_flow = netdev_flows[i];
4551
0
        struct dpif_flow *f = &flows[i];
4552
0
        struct ofpbuf key, mask;
4553
4554
0
        ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
4555
0
        ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
4556
0
        dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
4557
0
                                    dump->up.terse);
4558
0
    }
4559
4560
0
    return n_flows;
4561
0
}
4562
4563
static int
4564
dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
4565
    OVS_NO_THREAD_SAFETY_ANALYSIS
4566
0
{
4567
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4568
0
    struct dp_netdev_pmd_thread *pmd;
4569
0
    struct dp_packet_batch pp;
4570
4571
0
    if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
4572
0
        dp_packet_size(execute->packet) > UINT16_MAX) {
4573
0
        return EINVAL;
4574
0
    }
4575
4576
    /* Tries finding the 'pmd'.  If NULL is returned, that means
4577
     * the current thread is a non-pmd thread and should use
4578
     * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
4579
0
    pmd = ovsthread_getspecific(dp->per_pmd_key);
4580
0
    if (!pmd) {
4581
0
        pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
4582
0
        if (!pmd) {
4583
0
            return EBUSY;
4584
0
        }
4585
0
    }
4586
4587
0
    if (execute->probe) {
4588
        /* If this is part of a probe, drop the packet, since executing
4589
         * the action may actually cause spurious packets to be sent into
4590
         * the network. */
4591
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
4592
0
            dp_netdev_pmd_unref(pmd);
4593
0
        }
4594
0
        return 0;
4595
0
    }
4596
4597
    /* If the current thread is a non-pmd thread, acquires
4598
     * the 'non_pmd_mutex'. */
4599
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
4600
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
4601
0
    }
4602
4603
    /* Update current time in PMD context. We don't care about EMC insertion
4604
     * probability, because we are on a slow path. */
4605
0
    pmd_thread_ctx_time_update(pmd);
4606
4607
    /* The action processing expects the RSS hash to be valid, because
4608
     * it's always initialized at the beginning of datapath processing.
4609
     * In this case, though, 'execute->packet' may not have gone through
4610
     * the datapath at all, it may have been generated by the upper layer
4611
     * (OpenFlow packet-out, BFD frame, ...). */
4612
0
    if (!dp_packet_rss_valid(execute->packet)) {
4613
0
        dp_packet_set_rss_hash(execute->packet,
4614
0
                               flow_hash_5tuple(execute->flow, 0));
4615
0
    }
4616
4617
    /* Making a copy because the packet might be stolen during the execution
4618
     * and caller might still need it.  */
4619
0
    struct dp_packet *packet_clone = dp_packet_clone(execute->packet);
4620
0
    dp_packet_batch_init_packet(&pp, packet_clone);
4621
0
    dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
4622
0
                              execute->actions, execute->actions_len);
4623
0
    dp_netdev_pmd_flush_output_packets(pmd, true);
4624
4625
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
4626
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
4627
0
        dp_netdev_pmd_unref(pmd);
4628
0
    }
4629
4630
0
    if (dp_packet_batch_size(&pp) == 1) {
4631
        /* Packet wasn't dropped during the execution.  Swapping content with
4632
         * the original packet, because the caller might expect actions to
4633
         * modify it.  Uisng the packet from a batch instead of 'packet_clone'
4634
         * because it maybe stolen and replaced by other packet, e.g. by
4635
         * the fragmentation engine. */
4636
0
        dp_packet_swap(execute->packet, pp.packets[0]);
4637
0
        dp_packet_delete_batch(&pp, true);
4638
0
    } else if (dp_packet_batch_size(&pp)) {
4639
        /* FIXME: We have more packets than expected.  Likely, we got IP
4640
         * fragments of the reassembled packet.  Dropping them here as we have
4641
         * no way to get them to the caller.  It might be that all the required
4642
         * actions with them are already executed, but it also might not be a
4643
         * case, e.g. if dpif_netdev_execute() called to execute a single
4644
         * tunnel push. */
4645
0
        dp_packet_delete_batch(&pp, true);
4646
0
    }
4647
4648
0
    return 0;
4649
0
}
4650
4651
static void
4652
dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
4653
                    enum dpif_offload_type offload_type OVS_UNUSED)
4654
0
{
4655
0
    size_t i;
4656
4657
0
    for (i = 0; i < n_ops; i++) {
4658
0
        struct dpif_op *op = ops[i];
4659
4660
0
        switch (op->type) {
4661
0
        case DPIF_OP_FLOW_PUT:
4662
0
            op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
4663
0
            break;
4664
4665
0
        case DPIF_OP_FLOW_DEL:
4666
0
            op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
4667
0
            break;
4668
4669
0
        case DPIF_OP_EXECUTE:
4670
0
            op->error = dpif_netdev_execute(dpif, &op->execute);
4671
0
            break;
4672
4673
0
        case DPIF_OP_FLOW_GET:
4674
0
            op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
4675
0
            break;
4676
0
        }
4677
0
    }
4678
0
}
4679
4680
static int
4681
dpif_netdev_offload_stats_get(struct dpif *dpif,
4682
                              struct netdev_custom_stats *stats)
4683
0
{
4684
0
    enum {
4685
0
        DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED,
4686
0
        DP_NETDEV_HW_OFFLOADS_STATS_INSERTED,
4687
0
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN,
4688
0
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV,
4689
0
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN,
4690
0
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV,
4691
0
    };
4692
0
    struct {
4693
0
        const char *name;
4694
0
        uint64_t total;
4695
0
    } hwol_stats[] = {
4696
0
        [DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED] =
4697
0
            { "                Enqueued offloads", 0 },
4698
0
        [DP_NETDEV_HW_OFFLOADS_STATS_INSERTED] =
4699
0
            { "                Inserted offloads", 0 },
4700
0
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN] =
4701
0
            { "  Cumulative Average latency (us)", 0 },
4702
0
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV] =
4703
0
            { "   Cumulative Latency stddev (us)", 0 },
4704
0
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN] =
4705
0
            { " Exponential Average latency (us)", 0 },
4706
0
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV] =
4707
0
            { "  Exponential Latency stddev (us)", 0 },
4708
0
    };
4709
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4710
0
    struct dp_netdev_port *port;
4711
0
    unsigned int nb_thread;
4712
0
    uint64_t *port_nb_offloads;
4713
0
    uint64_t *nb_offloads;
4714
0
    unsigned int tid;
4715
0
    size_t i;
4716
4717
0
    if (!netdev_is_flow_api_enabled()) {
4718
0
        return EINVAL;
4719
0
    }
4720
4721
0
    nb_thread = netdev_offload_thread_nb();
4722
    /* One set of counters per offload thread, plus one extra set for
     * the overall total. */
4723
0
    stats->size = ARRAY_SIZE(hwol_stats) * (nb_thread + 1);
4724
0
    stats->counters = xcalloc(stats->size, sizeof *stats->counters);
4725
4726
0
    nb_offloads = xcalloc(nb_thread, sizeof *nb_offloads);
4727
0
    port_nb_offloads = xcalloc(nb_thread, sizeof *port_nb_offloads);
4728
4729
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
4730
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
4731
0
        memset(port_nb_offloads, 0, nb_thread * sizeof *port_nb_offloads);
4732
        /* Do not abort on read error from a port, just report 0. */
4733
0
        if (!netdev_flow_get_n_flows(port->netdev, port_nb_offloads)) {
4734
0
            for (i = 0; i < nb_thread; i++) {
4735
0
                nb_offloads[i] += port_nb_offloads[i];
4736
0
            }
4737
0
        }
4738
0
    }
4739
0
    ovs_rwlock_unlock(&dp->port_rwlock);
4740
4741
0
    free(port_nb_offloads);
4742
4743
0
    for (tid = 0; tid < nb_thread; tid++) {
4744
0
        uint64_t counts[ARRAY_SIZE(hwol_stats)];
4745
0
        size_t idx = ((tid + 1) * ARRAY_SIZE(hwol_stats));
4746
4747
0
        memset(counts, 0, sizeof counts);
4748
0
        counts[DP_NETDEV_HW_OFFLOADS_STATS_INSERTED] = nb_offloads[tid];
4749
0
        if (dp_offload_threads != NULL) {
4750
0
            atomic_read_relaxed(&dp_offload_threads[tid].enqueued_item,
4751
0
                                &counts[DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED]);
4752
4753
0
            counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN] =
4754
0
                mov_avg_cma(&dp_offload_threads[tid].cma);
4755
0
            counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV] =
4756
0
                mov_avg_cma_std_dev(&dp_offload_threads[tid].cma);
4757
4758
0
            counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN] =
4759
0
                mov_avg_ema(&dp_offload_threads[tid].ema);
4760
0
            counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV] =
4761
0
                mov_avg_ema_std_dev(&dp_offload_threads[tid].ema);
4762
0
        }
4763
4764
0
        for (i = 0; i < ARRAY_SIZE(hwol_stats); i++) {
4765
0
            snprintf(stats->counters[idx + i].name,
4766
0
                     sizeof(stats->counters[idx + i].name),
4767
0
                     "  [%3u] %s", tid, hwol_stats[i].name);
4768
0
            stats->counters[idx + i].value = counts[i];
4769
0
            hwol_stats[i].total += counts[i];
4770
0
        }
4771
0
    }
4772
4773
0
    free(nb_offloads);
4774
4775
    /* Average the per-thread averages for the aggregate. */
4776
0
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN].total /= nb_thread;
4777
0
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV].total /= nb_thread;
4778
0
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN].total /= nb_thread;
4779
0
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV].total /= nb_thread;
4780
4781
0
    for (i = 0; i < ARRAY_SIZE(hwol_stats); i++) {
4782
0
        snprintf(stats->counters[i].name, sizeof(stats->counters[i].name),
4783
0
                 "  Total %s", hwol_stats[i].name);
4784
0
        stats->counters[i].value = hwol_stats[i].total;
4785
0
    }
4786
4787
0
    return 0;
4788
0
}
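
The counters array assembled above is laid out as ARRAY_SIZE(hwol_stats) aggregate entries at the start, followed by one block of the same size per offload thread starting at index (tid + 1) * ARRAY_SIZE(hwol_stats). A standalone sketch of that indexing; N_HWOL_STATS is a hypothetical constant standing in for ARRAY_SIZE(hwol_stats):

#include <stdio.h>

#define N_HWOL_STATS 6   /* Stand-in for ARRAY_SIZE(hwol_stats). */

/* Index of counter 'i' for offload thread 'tid'; totals live at tid == -1. */
static int
hwol_counter_index(int tid, int i)
{
    return (tid + 1) * N_HWOL_STATS + i;
}

int
main(void)
{
    int nb_thread = 2;

    for (int tid = -1; tid < nb_thread; tid++) {
        for (int i = 0; i < N_HWOL_STATS; i++) {
            printf("%s tid=%d stat=%d -> idx=%d\n",
                   tid < 0 ? "total " : "thread", tid, i,
                   hwol_counter_index(tid, i));
        }
    }
    return 0;
}
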
4789
4790
/* Enable or disable PMD auto load balancing. */
4791
static void
4792
set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log)
4793
0
{
4794
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4795
4796
0
    if (pmd_alb->is_enabled != state || always_log) {
4797
0
        pmd_alb->is_enabled = state;
4798
0
        if (pmd_alb->is_enabled) {
4799
0
            uint8_t rebalance_load_thresh;
4800
4801
0
            atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
4802
0
                                &rebalance_load_thresh);
4803
0
            VLOG_INFO("PMD auto load balance is enabled, "
4804
0
                      "interval %"PRIu64" mins, "
4805
0
                      "pmd load threshold %"PRIu8"%%, "
4806
0
                      "improvement threshold %"PRIu8"%%.",
4807
0
                       pmd_alb->rebalance_intvl / MIN_TO_MSEC,
4808
0
                       rebalance_load_thresh,
4809
0
                       pmd_alb->rebalance_improve_thresh);
4810
0
        } else {
4811
0
            pmd_alb->rebalance_poll_timer = 0;
4812
0
            VLOG_INFO("PMD auto load balance is disabled.");
4813
0
        }
4814
0
    }
4815
0
}
4816
4817
/* Applies datapath configuration from the database. Some of the changes are
4818
 * actually applied in dpif_netdev_run(). */
4819
static int
4820
dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
4821
0
{
4822
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4823
0
    const char *cmask = smap_get(other_config, "pmd-cpu-mask");
4824
0
    const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
4825
0
                                             "cycles");
4826
0
    unsigned long long insert_prob =
4827
0
        smap_get_ullong(other_config, "emc-insert-inv-prob",
4828
0
                        DEFAULT_EM_FLOW_INSERT_INV_PROB);
4829
0
    uint32_t insert_min, cur_min;
4830
0
    uint32_t tx_flush_interval, cur_tx_flush_interval;
4831
0
    uint64_t rebalance_intvl;
4832
0
    uint8_t cur_rebalance_load;
4833
0
    uint32_t rebalance_load, rebalance_improve;
4834
0
    uint64_t  pmd_max_sleep, cur_pmd_max_sleep;
4835
0
    bool log_autolb = false;
4836
0
    enum sched_assignment_type pmd_rxq_assign_type;
4837
0
    static bool first_set_config = true;
4838
4839
0
    tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
4840
0
                                     DEFAULT_TX_FLUSH_INTERVAL);
4841
0
    atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
4842
0
    if (tx_flush_interval != cur_tx_flush_interval) {
4843
0
        atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
4844
0
        VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
4845
0
                  tx_flush_interval);
4846
0
    }
4847
4848
0
    if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
4849
0
        free(dp->pmd_cmask);
4850
0
        dp->pmd_cmask = nullable_xstrdup(cmask);
4851
0
        dp_netdev_request_reconfigure(dp);
4852
0
    }
4853
4854
0
    atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4855
0
    if (insert_prob <= UINT32_MAX) {
4856
0
        insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
4857
0
    } else {
4858
0
        insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
4859
0
        insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
4860
0
    }
4861
4862
0
    if (insert_min != cur_min) {
4863
0
        atomic_store_relaxed(&dp->emc_insert_min, insert_min);
4864
0
        if (insert_min == 0) {
4865
0
            VLOG_INFO("EMC insertion probability changed to zero");
4866
0
        } else {
4867
0
            VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
4868
0
                      insert_prob, (100 / (float)insert_prob));
4869
0
        }
4870
0
    }
4871
4872
0
    bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
4873
0
    bool cur_perf_enabled;
4874
0
    atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
4875
0
    if (perf_enabled != cur_perf_enabled) {
4876
0
        atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
4877
0
        if (perf_enabled) {
4878
0
            VLOG_INFO("PMD performance metrics collection enabled");
4879
0
        } else {
4880
0
            VLOG_INFO("PMD performance metrics collection disabled");
4881
0
        }
4882
0
    }
4883
4884
0
    bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
4885
0
    bool cur_smc;
4886
0
    atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
4887
0
    if (smc_enable != cur_smc) {
4888
0
        atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
4889
0
        if (smc_enable) {
4890
0
            VLOG_INFO("SMC cache is enabled");
4891
0
        } else {
4892
0
            VLOG_INFO("SMC cache is disabled");
4893
0
        }
4894
0
    }
4895
4896
0
    if (!strcmp(pmd_rxq_assign, "roundrobin")) {
4897
0
        pmd_rxq_assign_type = SCHED_ROUNDROBIN;
4898
0
    } else if (!strcmp(pmd_rxq_assign, "cycles")) {
4899
0
        pmd_rxq_assign_type = SCHED_CYCLES;
4900
0
    } else if (!strcmp(pmd_rxq_assign, "group")) {
4901
0
        pmd_rxq_assign_type = SCHED_GROUP;
4902
0
    } else {
4903
        /* Default. */
4904
0
        VLOG_WARN("Unsupported rx queue to PMD assignment mode in "
4905
0
                  "pmd-rxq-assign. Defaulting to 'cycles'.");
4906
0
        pmd_rxq_assign_type = SCHED_CYCLES;
4907
0
        pmd_rxq_assign = "cycles";
4908
0
    }
4909
0
    if (dp->pmd_rxq_assign_type != pmd_rxq_assign_type) {
4910
0
        dp->pmd_rxq_assign_type = pmd_rxq_assign_type;
4911
0
        VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
4912
0
                  pmd_rxq_assign);
4913
0
        dp_netdev_request_reconfigure(dp);
4914
0
    }
4915
4916
0
    bool pmd_iso = smap_get_bool(other_config, "pmd-rxq-isolate", true);
4917
4918
0
    if (pmd_rxq_assign_type != SCHED_GROUP && pmd_iso == false) {
4919
        /* Invalid combination. */
4920
0
        VLOG_WARN("pmd-rxq-isolate can only be set false "
4921
0
                  "when using pmd-rxq-assign=group");
4922
0
        pmd_iso = true;
4923
0
    }
4924
0
    if (dp->pmd_iso != pmd_iso) {
4925
0
        dp->pmd_iso = pmd_iso;
4926
0
        if (pmd_iso) {
4927
0
            VLOG_INFO("pmd-rxq-affinity isolates PMD core");
4928
0
        } else {
4929
0
            VLOG_INFO("pmd-rxq-affinity does not isolate PMD core");
4930
0
        }
4931
0
        dp_netdev_request_reconfigure(dp);
4932
0
    }
4933
4934
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4935
4936
0
    rebalance_intvl = smap_get_ullong(other_config,
4937
0
                                      "pmd-auto-lb-rebal-interval",
4938
0
                                      ALB_REBALANCE_INTERVAL);
4939
0
    if (rebalance_intvl > MAX_ALB_REBALANCE_INTERVAL) {
4940
0
        rebalance_intvl = ALB_REBALANCE_INTERVAL;
4941
0
    }
4942
4943
    /* Input is in minutes; convert it to msec. */
4944
0
    rebalance_intvl =
4945
0
        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
4946
4947
0
    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
4948
0
        pmd_alb->rebalance_intvl = rebalance_intvl;
4949
0
        VLOG_INFO("PMD auto load balance interval set to "
4950
0
                  "%"PRIu64" mins\n", rebalance_intvl / MIN_TO_MSEC);
4951
0
        log_autolb = true;
4952
0
    }
4953
4954
0
    rebalance_improve = smap_get_uint(other_config,
4955
0
                                      "pmd-auto-lb-improvement-threshold",
4956
0
                                      ALB_IMPROVEMENT_THRESHOLD);
4957
0
    if (rebalance_improve > 100) {
4958
0
        rebalance_improve = ALB_IMPROVEMENT_THRESHOLD;
4959
0
    }
4960
0
    if (rebalance_improve != pmd_alb->rebalance_improve_thresh) {
4961
0
        pmd_alb->rebalance_improve_thresh = rebalance_improve;
4962
0
        VLOG_INFO("PMD auto load balance improvement threshold set to "
4963
0
                  "%"PRIu32"%%", rebalance_improve);
4964
0
        log_autolb = true;
4965
0
    }
4966
4967
0
    rebalance_load = smap_get_uint(other_config, "pmd-auto-lb-load-threshold",
4968
0
                                   ALB_LOAD_THRESHOLD);
4969
0
    if (rebalance_load > 100) {
4970
0
        rebalance_load = ALB_LOAD_THRESHOLD;
4971
0
    }
4972
0
    atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, &cur_rebalance_load);
4973
0
    if (rebalance_load != cur_rebalance_load) {
4974
0
        atomic_store_relaxed(&pmd_alb->rebalance_load_thresh,
4975
0
                             rebalance_load);
4976
0
        VLOG_INFO("PMD auto load balance load threshold set to %"PRIu32"%%",
4977
0
                  rebalance_load);
4978
0
        log_autolb = true;
4979
0
    }
4980
4981
0
    bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false);
4982
4983
0
    set_pmd_auto_lb(dp, autolb_state, log_autolb);
4984
4985
0
    pmd_max_sleep = smap_get_ullong(other_config, "pmd-maxsleep", 0);
4986
0
    pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep);
4987
0
    atomic_read_relaxed(&dp->pmd_max_sleep, &cur_pmd_max_sleep);
4988
0
    if (first_set_config || pmd_max_sleep != cur_pmd_max_sleep) {
4989
0
        atomic_store_relaxed(&dp->pmd_max_sleep, pmd_max_sleep);
4990
0
        VLOG_INFO("PMD max sleep request is %"PRIu64" usecs.", pmd_max_sleep);
4991
0
        VLOG_INFO("PMD load based sleeps are %s.",
4992
0
                  pmd_max_sleep ? "enabled" : "disabled" );
4993
0
    }
4994
4995
0
    first_set_config  = false;
4996
0
    return 0;
4997
0
}
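
Two conversions above are worth spelling out: the EMC inverse probability becomes a 32-bit threshold (insert_min = UINT32_MAX / insert_prob, so an insertion happens when a random hash falls below it), and the auto load balance interval is configured in minutes but stored in milliseconds, falling back to one minute when zero. A standalone sketch of that arithmetic, assuming an example inverse probability of 100 and an interval of 5 minutes:

#include <stdint.h>
#include <stdio.h>

#define MIN_TO_MSEC 60000

int
main(void)
{
    /* EMC insertion: an inverse probability of N means "insert roughly one
     * flow in N".  It is stored as a threshold against a 32-bit hash. */
    unsigned long long insert_prob = 100;   /* e.g. emc-insert-inv-prob=100. */
    uint32_t insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;

    printf("threshold=%u (~%.2f%% of the 32-bit range)\n",
           insert_min, 100.0 * insert_min / UINT32_MAX);

    /* Auto load balance interval: configured in minutes, stored in msec;
     * zero falls back to one minute. */
    uint64_t rebalance_intvl = 5;           /* Minutes. */
    rebalance_intvl = rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC
                                      : MIN_TO_MSEC;
    printf("rebalance interval = %llu msec\n",
           (unsigned long long) rebalance_intvl);
    return 0;
}
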
4998
4999
/* Parses affinity list and returns result in 'core_ids'. */
5000
static int
5001
parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
5002
0
{
5003
0
    unsigned i;
5004
0
    char *list, *copy, *key, *value;
5005
0
    int error = 0;
5006
5007
0
    for (i = 0; i < n_rxq; i++) {
5008
0
        core_ids[i] = OVS_CORE_UNSPEC;
5009
0
    }
5010
5011
0
    if (!affinity_list) {
5012
0
        return 0;
5013
0
    }
5014
5015
0
    list = copy = xstrdup(affinity_list);
5016
5017
0
    while (ofputil_parse_key_value(&list, &key, &value)) {
5018
0
        int rxq_id, core_id;
5019
5020
0
        if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
5021
0
            || !str_to_int(value, 0, &core_id) || core_id < 0) {
5022
0
            error = EINVAL;
5023
0
            break;
5024
0
        }
5025
5026
0
        if (rxq_id < n_rxq) {
5027
0
            core_ids[rxq_id] = core_id;
5028
0
        }
5029
0
    }
5030
5031
0
    free(copy);
5032
0
    return error;
5033
0
}
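
For context, the affinity list parsed above is the comma-separated '<rxq-id>:<core-id>' format used by pmd-rxq-affinity (for example "0:3,1:7"); queues that are not listed stay at OVS_CORE_UNSPEC. A standalone sketch with the same semantics, using plain POSIX strtok_r/sscanf instead of the OVS ofputil_parse_key_value helper:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CORE_UNSPEC -1   /* Stand-in for OVS_CORE_UNSPEC. */

int
main(void)
{
    const char *affinity_list = "0:3,1:7";   /* <rxq-id>:<core-id> pairs. */
    int n_rxq = 4;
    int core_ids[4];
    char *copy, *tok, *save = NULL;

    /* Default every queue to "not pinned". */
    for (int i = 0; i < n_rxq; i++) {
        core_ids[i] = CORE_UNSPEC;
    }

    copy = strdup(affinity_list);
    for (tok = strtok_r(copy, ",", &save); tok;
         tok = strtok_r(NULL, ",", &save)) {
        int rxq, core;

        if (sscanf(tok, "%d:%d", &rxq, &core) == 2
            && rxq >= 0 && rxq < n_rxq && core >= 0) {
            core_ids[rxq] = core;
        }
    }
    free(copy);

    for (int i = 0; i < n_rxq; i++) {
        printf("rxq %d -> core %d\n", i, core_ids[i]);
    }
    return 0;
}
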
5034
5035
/* Parses 'affinity_list' and applies configuration if it is valid. */
5036
static int
5037
dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
5038
                                  const char *affinity_list)
5039
0
{
5040
0
    unsigned *core_ids, i;
5041
0
    int error = 0;
5042
5043
0
    core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
5044
0
    if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
5045
0
        error = EINVAL;
5046
0
        goto exit;
5047
0
    }
5048
5049
0
    for (i = 0; i < port->n_rxq; i++) {
5050
0
        port->rxqs[i].core_id = core_ids[i];
5051
0
    }
5052
5053
0
exit:
5054
0
    free(core_ids);
5055
0
    return error;
5056
0
}
5057
5058
/* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
5059
 * of the given PMD thread. */
5060
static bool
5061
dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
5062
                           struct dp_netdev_port *port)
5063
    OVS_EXCLUDED(pmd->port_mutex)
5064
0
{
5065
0
    struct rxq_poll *poll;
5066
0
    bool found = false;
5067
5068
0
    ovs_mutex_lock(&pmd->port_mutex);
5069
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5070
0
        if (port == poll->rxq->port) {
5071
0
            found = true;
5072
0
            break;
5073
0
        }
5074
0
    }
5075
0
    ovs_mutex_unlock(&pmd->port_mutex);
5076
0
    return found;
5077
0
}
5078
5079
/* Updates port configuration from the database.  The changes are actually
5080
 * applied in dpif_netdev_run(). */
5081
static int
5082
dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
5083
                            const struct smap *cfg)
5084
0
{
5085
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
5086
0
    struct dp_netdev_port *port;
5087
0
    int error = 0;
5088
0
    const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
5089
0
    bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
5090
0
    const char *tx_steering_mode = smap_get(cfg, "tx-steering");
5091
0
    enum txq_req_mode txq_mode;
5092
5093
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
5094
0
    error = get_port_by_number(dp, port_no, &port);
5095
0
    if (error) {
5096
0
        goto unlock;
5097
0
    }
5098
5099
0
    if (emc_enabled != port->emc_enabled) {
5100
0
        struct dp_netdev_pmd_thread *pmd;
5101
0
        struct ds ds = DS_EMPTY_INITIALIZER;
5102
0
        uint32_t cur_min, insert_prob;
5103
5104
0
        port->emc_enabled = emc_enabled;
5105
        /* Mark for reload all the threads that poll this port and request
5106
         * reconfiguration for the actual reloading of threads. */
5107
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5108
0
            if (dpif_netdev_pmd_polls_port(pmd, port)) {
5109
0
                pmd->need_reload = true;
5110
0
            }
5111
0
        }
5112
0
        dp_netdev_request_reconfigure(dp);
5113
5114
0
        ds_put_format(&ds, "%s: EMC has been %s.",
5115
0
                      netdev_get_name(port->netdev),
5116
0
                      (emc_enabled) ? "enabled" : "disabled");
5117
0
        if (emc_enabled) {
5118
0
            ds_put_cstr(&ds, " Current insertion probability is ");
5119
0
            atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
5120
0
            if (!cur_min) {
5121
0
                ds_put_cstr(&ds, "zero.");
5122
0
            } else {
5123
0
                insert_prob = UINT32_MAX / cur_min;
5124
0
                ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
5125
0
                              insert_prob, 100 / (float) insert_prob);
5126
0
            }
5127
0
        }
5128
0
        VLOG_INFO("%s", ds_cstr(&ds));
5129
0
        ds_destroy(&ds);
5130
0
    }
5131
5132
    /* Checking for RXq affinity changes. */
5133
0
    if (netdev_is_pmd(port->netdev)
5134
0
        && !nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
5135
5136
0
        error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
5137
0
        if (error) {
5138
0
            goto unlock;
5139
0
        }
5140
0
        free(port->rxq_affinity_list);
5141
0
        port->rxq_affinity_list = nullable_xstrdup(affinity_list);
5142
5143
0
        dp_netdev_request_reconfigure(dp);
5144
0
    }
5145
5146
0
    if (nullable_string_is_equal(tx_steering_mode, "hash")) {
5147
0
        txq_mode = TXQ_REQ_MODE_HASH;
5148
0
    } else {
5149
0
        txq_mode = TXQ_REQ_MODE_THREAD;
5150
0
    }
5151
5152
0
    if (txq_mode != port->txq_requested_mode) {
5153
0
        port->txq_requested_mode = txq_mode;
5154
0
        VLOG_INFO("%s: Tx packet steering mode has been set to '%s'.",
5155
0
                  netdev_get_name(port->netdev),
5156
0
                  (txq_mode == TXQ_REQ_MODE_THREAD) ? "thread" : "hash");
5157
0
        dp_netdev_request_reconfigure(dp);
5158
0
    }
5159
5160
0
unlock:
5161
0
    ovs_rwlock_unlock(&dp->port_rwlock);
5162
0
    return error;
5163
0
}
5164
5165
static int
5166
dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
5167
                              uint32_t queue_id, uint32_t *priority)
5168
0
{
5169
0
    *priority = queue_id;
5170
0
    return 0;
5171
0
}
5172
5173

5174
/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
5175
 * a copy of the 'size' bytes of the 'actions' input parameter. */
5176
struct dp_netdev_actions *
5177
dp_netdev_actions_create(const struct nlattr *actions, size_t size)
5178
0
{
5179
0
    struct dp_netdev_actions *netdev_actions;
5180
5181
0
    netdev_actions = xmalloc(sizeof *netdev_actions + size);
5182
0
    netdev_actions->size = size;
5183
0
    if (size) {
5184
0
        memcpy(netdev_actions->actions, actions, size);
5185
0
    }
5186
5187
0
    return netdev_actions;
5188
0
}
5189
5190
struct dp_netdev_actions *
5191
dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
5192
0
{
5193
0
    return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
5194
0
}
5195
5196
static void
5197
dp_netdev_actions_free(struct dp_netdev_actions *actions)
5198
0
{
5199
0
    free(actions);
5200
0
}
5201

5202
static void
5203
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
5204
                         enum rxq_cycles_counter_type type,
5205
                         unsigned long long cycles)
5206
0
{
5207
0
    atomic_store_relaxed(&rx->cycles[type], cycles);
5208
0
}
5209
5210
static void
5211
dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
5212
                         enum rxq_cycles_counter_type type,
5213
                         unsigned long long cycles)
5214
0
{
5215
0
    non_atomic_ullong_add(&rx->cycles[type], cycles);
5216
0
}
5217
5218
static uint64_t
5219
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
5220
                         enum rxq_cycles_counter_type type)
5221
0
{
5222
0
    unsigned long long processing_cycles;
5223
0
    atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
5224
0
    return processing_cycles;
5225
0
}
5226
5227
static void
5228
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
5229
                                unsigned long long cycles)
5230
0
{
5231
0
    unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX;
5232
0
    atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
5233
0
}
5234
5235
static uint64_t
5236
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
5237
0
{
5238
0
    unsigned long long processing_cycles;
5239
0
    atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
5240
0
    return processing_cycles;
5241
0
}
5242
5243
#if ATOMIC_ALWAYS_LOCK_FREE_8B
5244
static inline bool
5245
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
5246
0
{
5247
0
    bool pmd_perf_enabled;
5248
0
    atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
5249
0
    return pmd_perf_enabled;
5250
0
}
5251
#else
5252
/* If stores and reads of 64-bit integers are not atomic, the full PMD
5253
 * performance metrics are not available as locked access to 64-bit
5254
 * integers would be prohibitively expensive. */
5255
static inline bool
5256
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
5257
{
5258
    return false;
5259
}
5260
#endif
5261
5262
static int
5263
dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
5264
                                   struct tx_port *p)
5265
0
{
5266
0
    int i;
5267
0
    int tx_qid;
5268
0
    int output_cnt;
5269
0
    bool concurrent_txqs;
5270
0
    struct cycle_timer timer;
5271
0
    uint64_t cycles;
5272
0
    uint32_t tx_flush_interval;
5273
5274
0
    cycle_timer_start(&pmd->perf_stats, &timer);
5275
5276
0
    output_cnt = dp_packet_batch_size(&p->output_pkts);
5277
0
    ovs_assert(output_cnt > 0);
5278
5279
0
    if (p->port->txq_mode == TXQ_MODE_XPS_HASH) {
5280
0
        int n_txq = netdev_n_txq(p->port->netdev);
5281
5282
        /* Re-batch per txq based on packet hash. */
5283
0
        struct dp_packet *packet;
5284
0
        DP_PACKET_BATCH_FOR_EACH (j, packet, &p->output_pkts) {
5285
0
            uint32_t hash;
5286
5287
0
            if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
5288
0
                hash = dp_packet_get_rss_hash(packet);
5289
0
            } else {
5290
0
                struct flow flow;
5291
5292
0
                flow_extract(packet, &flow);
5293
0
                hash = flow_hash_5tuple(&flow, 0);
5294
0
            }
5295
0
            dp_packet_batch_add(&p->txq_pkts[hash % n_txq], packet);
5296
0
        }
5297
5298
        /* Flush the batch of each Tx queue. */
5299
0
        for (i = 0; i < n_txq; i++) {
5300
0
            if (dp_packet_batch_is_empty(&p->txq_pkts[i])) {
5301
0
                continue;
5302
0
            }
5303
0
            netdev_send(p->port->netdev, i, &p->txq_pkts[i], true);
5304
0
            dp_packet_batch_init(&p->txq_pkts[i]);
5305
0
        }
5306
0
    } else {
5307
0
        if (p->port->txq_mode == TXQ_MODE_XPS) {
5308
0
            tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
5309
0
            concurrent_txqs = true;
5310
0
        } else {
5311
0
            tx_qid = pmd->static_tx_qid;
5312
0
            concurrent_txqs = false;
5313
0
        }
5314
0
        netdev_send(p->port->netdev, tx_qid, &p->output_pkts, concurrent_txqs);
5315
0
    }
5316
0
    dp_packet_batch_init(&p->output_pkts);
5317
5318
    /* Update time of the next flush. */
5319
0
    atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
5320
0
    p->flush_time = pmd->ctx.now + tx_flush_interval;
5321
5322
0
    ovs_assert(pmd->n_output_batches > 0);
5323
0
    pmd->n_output_batches--;
5324
5325
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
5326
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
5327
5328
    /* Distribute send cycles evenly among transmitted packets and assign to
5329
     * their respective rx queues. */
5330
0
    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
5331
0
    for (i = 0; i < output_cnt; i++) {
5332
0
        if (p->output_pkts_rxqs[i]) {
5333
0
            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
5334
0
                                     RXQ_CYCLES_PROC_CURR, cycles);
5335
0
        }
5336
0
    }
5337
5338
0
    return output_cnt;
5339
0
}
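
In the hash Tx steering branch above, each packet is simply bucketed into a queue by 'hash % n_txq', with a 5-tuple hash computed as a fallback when no RSS hash is present. A toy standalone sketch of that bucketing with made-up hash values instead of real packets:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* Pretend RSS hashes taken from four packets. */
    uint32_t hashes[] = { 0x1a2b3c4d, 0x00000005, 0xdeadbeef, 0x12345678 };
    int n_txq = 3;
    int per_txq[3] = { 0, 0, 0 };

    for (size_t i = 0; i < sizeof hashes / sizeof hashes[0]; i++) {
        int txq = hashes[i] % n_txq;   /* Same selection as the hash mode. */

        per_txq[txq]++;
        printf("packet %zu (hash 0x%08x) -> txq %d\n",
               i, (unsigned) hashes[i], txq);
    }
    for (int q = 0; q < n_txq; q++) {
        printf("txq %d: %d packet(s)\n", q, per_txq[q]);
    }
    return 0;
}
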
5340
5341
static int
5342
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
5343
                                   bool force)
5344
0
{
5345
0
    struct tx_port *p;
5346
0
    int output_cnt = 0;
5347
5348
0
    if (!pmd->n_output_batches) {
5349
0
        return 0;
5350
0
    }
5351
5352
0
    HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
5353
0
        if (!dp_packet_batch_is_empty(&p->output_pkts)
5354
0
            && (force || pmd->ctx.now >= p->flush_time)) {
5355
0
            output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
5356
0
        }
5357
0
    }
5358
0
    return output_cnt;
5359
0
}
5360
5361
static int
5362
dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
5363
                           struct dp_netdev_rxq *rxq,
5364
                           odp_port_t port_no)
5365
0
{
5366
0
    struct pmd_perf_stats *s = &pmd->perf_stats;
5367
0
    struct dp_packet_batch batch;
5368
0
    struct cycle_timer timer;
5369
0
    int error;
5370
0
    int batch_cnt = 0;
5371
0
    int rem_qlen = 0, *qlen_p = NULL;
5372
0
    uint64_t cycles;
5373
5374
    /* Measure duration for polling and processing rx burst. */
5375
0
    cycle_timer_start(&pmd->perf_stats, &timer);
5376
5377
0
    pmd->ctx.last_rxq = rxq;
5378
0
    dp_packet_batch_init(&batch);
5379
5380
    /* Fetch the rx queue length only for vhostuser ports. */
5381
0
    if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
5382
0
        qlen_p = &rem_qlen;
5383
0
    }
5384
5385
0
    error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
5386
0
    if (!error) {
5387
        /* At least one packet received. */
5388
0
        *recirc_depth_get() = 0;
5389
0
        pmd_thread_ctx_time_update(pmd);
5390
0
        batch_cnt = dp_packet_batch_size(&batch);
5391
0
        if (pmd_perf_metrics_enabled(pmd)) {
5392
            /* Update batch histogram. */
5393
0
            s->current.batches++;
5394
0
            histogram_add_sample(&s->pkts_per_batch, batch_cnt);
5395
            /* Update the maximum vhost rx queue fill level. */
5396
0
            if (rxq->is_vhost && rem_qlen >= 0) {
5397
0
                uint32_t qfill = batch_cnt + rem_qlen;
5398
0
                if (qfill > s->current.max_vhost_qfill) {
5399
0
                    s->current.max_vhost_qfill = qfill;
5400
0
                }
5401
0
            }
5402
0
        }
5403
5404
        /* Process packet batch. */
5405
0
        int ret = pmd->netdev_input_func(pmd, &batch, port_no);
5406
0
        if (ret) {
5407
0
            dp_netdev_input(pmd, &batch, port_no);
5408
0
        }
5409
5410
        /* Assign processing cycles to rx queue. */
5411
0
        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
5412
0
        dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
5413
5414
0
        dp_netdev_pmd_flush_output_packets(pmd, false);
5415
0
    } else {
5416
        /* Discard cycles. */
5417
0
        cycle_timer_stop(&pmd->perf_stats, &timer);
5418
0
        if (error != EAGAIN && error != EOPNOTSUPP) {
5419
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
5420
5421
0
            VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
5422
0
                    netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
5423
0
        }
5424
0
    }
5425
5426
0
    pmd->ctx.last_rxq = NULL;
5427
5428
0
    return batch_cnt;
5429
0
}
5430
5431
static struct tx_port *
5432
tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
5433
0
{
5434
0
    struct tx_port *tx;
5435
5436
0
    HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
5437
0
        if (tx->port->port_no == port_no) {
5438
0
            return tx;
5439
0
        }
5440
0
    }
5441
5442
0
    return NULL;
5443
0
}
5444
5445
static struct tx_bond *
5446
tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
5447
0
{
5448
0
    uint32_t hash = hash_bond_id(bond_id);
5449
0
    struct tx_bond *tx;
5450
5451
0
    CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
5452
0
        if (tx->bond_id == bond_id) {
5453
0
            return tx;
5454
0
        }
5455
0
    }
5456
0
    return NULL;
5457
0
}
5458
5459
static int
5460
port_reconfigure(struct dp_netdev_port *port)
5461
0
{
5462
0
    struct netdev *netdev = port->netdev;
5463
0
    int i, err;
5464
5465
    /* Closes the existing 'rxq's. */
5466
0
    for (i = 0; i < port->n_rxq; i++) {
5467
0
        netdev_rxq_close(port->rxqs[i].rx);
5468
0
        port->rxqs[i].rx = NULL;
5469
0
    }
5470
0
    unsigned last_nrxq = port->n_rxq;
5471
0
    port->n_rxq = 0;
5472
5473
    /* Allows 'netdev' to apply the pending configuration changes. */
5474
0
    if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
5475
0
        err = netdev_reconfigure(netdev);
5476
0
        if (err && (err != EOPNOTSUPP)) {
5477
0
            VLOG_ERR("Failed to set interface %s new configuration",
5478
0
                     netdev_get_name(netdev));
5479
0
            return err;
5480
0
        }
5481
0
    }
5482
    /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
5483
0
    port->rxqs = xrealloc(port->rxqs,
5484
0
                          sizeof *port->rxqs * netdev_n_rxq(netdev));
5485
    /* Realloc 'used' counters for tx queues. */
5486
0
    free(port->txq_used);
5487
0
    port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
5488
5489
0
    for (i = 0; i < netdev_n_rxq(netdev); i++) {
5490
0
        bool new_queue = i >= last_nrxq;
5491
0
        if (new_queue) {
5492
0
            memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
5493
0
        }
5494
5495
0
        port->rxqs[i].port = port;
5496
0
        port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
5497
5498
0
        err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
5499
0
        if (err) {
5500
0
            return err;
5501
0
        }
5502
0
        port->n_rxq++;
5503
0
    }
5504
5505
    /* Parse affinity list to apply configuration for new queues. */
5506
0
    dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
5507
5508
    /* If reconfiguration was successful, mark it as such so we can use it. */
5509
0
    port->need_reconfigure = false;
5510
5511
0
    return 0;
5512
0
}
5513
5514
struct sched_numa_list {
5515
    struct hmap numas;  /* Contains 'struct sched_numa'. */
5516
};
5517
5518
/* Metadata for out-of-place pmd rxq assignments. */
5519
struct sched_pmd {
5520
    struct sched_numa *numa;
5521
    /* Associated PMD thread. */
5522
    struct dp_netdev_pmd_thread *pmd;
5523
    uint64_t pmd_proc_cycles;
5524
    struct dp_netdev_rxq **rxqs;
5525
    unsigned n_rxq;
5526
    bool isolated;
5527
};
5528
5529
struct sched_numa {
5530
    struct hmap_node node;
5531
    int numa_id;
5532
    /* PMDs on numa node. */
5533
    struct sched_pmd *pmds;
5534
    /* Num of PMDs on numa node. */
5535
    unsigned n_pmds;
5536
    /* Num of isolated PMDs on numa node. */
5537
    unsigned n_isolated;
5538
    int rr_cur_index;
5539
    bool rr_idx_inc;
5540
};
5541
5542
static size_t
5543
sched_numa_list_count(struct sched_numa_list *numa_list)
5544
0
{
5545
0
    return hmap_count(&numa_list->numas);
5546
0
}
5547
5548
static struct sched_numa *
5549
sched_numa_list_next(struct sched_numa_list *numa_list,
5550
                     const struct sched_numa *numa)
5551
0
{
5552
0
    struct hmap_node *node = NULL;
5553
5554
0
    if (numa) {
5555
0
        node = hmap_next(&numa_list->numas, &numa->node);
5556
0
    }
5557
0
    if (!node) {
5558
0
        node = hmap_first(&numa_list->numas);
5559
0
    }
5560
5561
0
    return (node) ? CONTAINER_OF(node, struct sched_numa, node) : NULL;
5562
0
}
5563
5564
static struct sched_numa *
5565
sched_numa_list_lookup(struct sched_numa_list *numa_list, int numa_id)
5566
0
{
5567
0
    struct sched_numa *numa;
5568
5569
0
    HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0),
5570
0
                             &numa_list->numas) {
5571
0
        if (numa->numa_id == numa_id) {
5572
0
            return numa;
5573
0
        }
5574
0
    }
5575
0
    return NULL;
5576
0
}
5577
5578
static int
5579
compare_sched_pmd_list(const void *a_, const void *b_)
5580
0
{
5581
0
    struct sched_pmd *a, *b;
5582
5583
0
    a = (struct sched_pmd *) a_;
5584
0
    b = (struct sched_pmd *) b_;
5585
5586
0
    return compare_poll_thread_list(&a->pmd, &b->pmd);
5587
0
}
5588
5589
static void
5590
sort_numa_list_pmds(struct sched_numa_list *numa_list)
5591
0
{
5592
0
    struct sched_numa *numa;
5593
5594
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5595
0
        if (numa->n_pmds > 1) {
5596
0
            qsort(numa->pmds, numa->n_pmds, sizeof *numa->pmds,
5597
0
                  compare_sched_pmd_list);
5598
0
        }
5599
0
    }
5600
0
}
5601
5602
/* Populate numas and pmds on those numas. */
5603
static void
5604
sched_numa_list_populate(struct sched_numa_list *numa_list,
5605
                         struct dp_netdev *dp)
5606
0
{
5607
0
    struct dp_netdev_pmd_thread *pmd;
5608
5609
0
    hmap_init(&numa_list->numas);
5610
5611
    /* For each pmd on this datapath. */
5612
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5613
0
        struct sched_numa *numa;
5614
0
        struct sched_pmd *sched_pmd;
5615
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
5616
0
            continue;
5617
0
        }
5618
5619
        /* Get the numa of the PMD. */
5620
0
        numa = sched_numa_list_lookup(numa_list, pmd->numa_id);
5621
        /* Create a new numa node for it if not already created. */
5622
0
        if (!numa) {
5623
0
            numa = xzalloc(sizeof *numa);
5624
0
            numa->numa_id = pmd->numa_id;
5625
0
            hmap_insert(&numa_list->numas, &numa->node,
5626
0
                        hash_int(pmd->numa_id, 0));
5627
0
        }
5628
5629
        /* Create a sched_pmd on this numa for the pmd. */
5630
0
        numa->n_pmds++;
5631
0
        numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
5632
0
        sched_pmd = &numa->pmds[numa->n_pmds - 1];
5633
0
        memset(sched_pmd, 0, sizeof *sched_pmd);
5634
0
        sched_pmd->numa = numa;
5635
0
        sched_pmd->pmd = pmd;
5636
        /* At least one pmd is present so initialize curr_idx and idx_inc. */
5637
0
        numa->rr_cur_index = 0;
5638
0
        numa->rr_idx_inc = true;
5639
0
    }
5640
0
    sort_numa_list_pmds(numa_list);
5641
0
}
5642
5643
static void
5644
sched_numa_list_free_entries(struct sched_numa_list *numa_list)
5645
0
{
5646
0
    struct sched_numa *numa;
5647
5648
0
    HMAP_FOR_EACH_POP (numa, node, &numa_list->numas) {
5649
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
5650
0
            struct sched_pmd *sched_pmd;
5651
5652
0
            sched_pmd = &numa->pmds[i];
5653
0
            sched_pmd->n_rxq = 0;
5654
0
            free(sched_pmd->rxqs);
5655
0
        }
5656
0
        numa->n_pmds = 0;
5657
0
        free(numa->pmds);
5658
0
        free(numa);
5659
0
    }
5660
0
    hmap_destroy(&numa_list->numas);
5661
0
}
5662
5663
static struct sched_pmd *
5664
sched_pmd_find_by_pmd(struct sched_numa_list *numa_list,
5665
                      struct dp_netdev_pmd_thread *pmd)
5666
0
{
5667
0
    struct sched_numa *numa;
5668
5669
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5670
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
5671
0
            struct sched_pmd *sched_pmd;
5672
5673
0
            sched_pmd = &numa->pmds[i];
5674
0
            if (pmd == sched_pmd->pmd) {
5675
0
                return sched_pmd;
5676
0
            }
5677
0
        }
5678
0
    }
5679
0
    return NULL;
5680
0
}
5681
5682
static void
5683
sched_pmd_add_rxq(struct sched_pmd *sched_pmd, struct dp_netdev_rxq *rxq,
5684
                  uint64_t cycles)
5685
0
{
5686
    /* As sched_pmd is allocated outside this function, better not to assume
5687
     * rxqs is initialized to NULL. */
5688
0
    if (sched_pmd->n_rxq == 0) {
5689
0
        sched_pmd->rxqs = xmalloc(sizeof *sched_pmd->rxqs);
5690
0
    } else {
5691
0
        sched_pmd->rxqs = xrealloc(sched_pmd->rxqs, (sched_pmd->n_rxq + 1) *
5692
0
                                                    sizeof *sched_pmd->rxqs);
5693
0
    }
5694
5695
0
    sched_pmd->rxqs[sched_pmd->n_rxq++] = rxq;
5696
0
    sched_pmd->pmd_proc_cycles += cycles;
5697
0
}
5698
5699
static void
5700
sched_numa_list_assignments(struct sched_numa_list *numa_list,
5701
                            struct dp_netdev *dp)
5702
    OVS_REQ_RDLOCK(dp->port_rwlock)
5703
0
{
5704
0
    struct dp_netdev_port *port;
5705
5706
    /* For each port. */
5707
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5708
0
        if (!netdev_is_pmd(port->netdev)) {
5709
0
            continue;
5710
0
        }
5711
        /* For each rxq on the port. */
5712
0
        for (unsigned qid = 0; qid < port->n_rxq; qid++) {
5713
0
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
5714
0
            struct sched_pmd *sched_pmd;
5715
0
            uint64_t proc_cycles = 0;
5716
5717
0
            for (int i = 0; i < PMD_INTERVAL_MAX; i++) {
5718
0
                proc_cycles  += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
5719
0
            }
5720
5721
0
            sched_pmd = sched_pmd_find_by_pmd(numa_list, rxq->pmd);
5722
0
            if (sched_pmd) {
5723
0
                if (rxq->core_id != OVS_CORE_UNSPEC && dp->pmd_iso) {
5724
0
                    sched_pmd->isolated = true;
5725
0
                }
5726
0
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
5727
0
            }
5728
0
        }
5729
0
    }
5730
0
}
5731
5732
static void
5733
sched_numa_list_put_in_place(struct sched_numa_list *numa_list)
5734
0
{
5735
0
    struct sched_numa *numa;
5736
5737
    /* For each numa. */
5738
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5739
        /* For each pmd. */
5740
0
        for (int i = 0; i < numa->n_pmds; i++) {
5741
0
            struct sched_pmd *sched_pmd;
5742
5743
0
            sched_pmd = &numa->pmds[i];
5744
0
            sched_pmd->pmd->isolated = sched_pmd->isolated;
5745
            /* For each rxq. */
5746
0
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
5747
                /* Store the new pmd from the out-of-place sched_numa_list
5748
                 * struct in the dp_netdev_rxq struct. */
5749
0
                sched_pmd->rxqs[k]->pmd = sched_pmd->pmd;
5750
0
            }
5751
0
        }
5752
0
    }
5753
0
}
5754
5755
/* Returns 'true' if OVS rxq scheduling algorithm assigned any unpinned rxq to
5756
 * a PMD thread core on a non-local numa node. */
5757
static bool
5758
sched_numa_list_cross_numa_polling(struct sched_numa_list *numa_list)
5759
0
{
5760
0
    struct sched_numa *numa;
5761
5762
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5763
0
        for (int i = 0; i < numa->n_pmds; i++) {
5764
0
            struct sched_pmd *sched_pmd;
5765
5766
0
            sched_pmd = &numa->pmds[i];
5767
0
            if (sched_pmd->isolated) {
5768
                /* All rxqs on this PMD thread core are pinned. */
5769
0
                continue;
5770
0
            }
5771
0
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
5772
0
                struct dp_netdev_rxq *rxq = sched_pmd->rxqs[k];
5773
                /* Check if the rxq is not pinned to a specific PMD thread core
5774
                 * by the user AND the PMD thread core that OVS assigned is
5775
                 * non-local to the rxq port. */
5776
0
                if (rxq->core_id == OVS_CORE_UNSPEC &&
5777
0
                    rxq->pmd->numa_id !=
5778
0
                        netdev_get_numa_id(rxq->port->netdev)) {
5779
0
                    return true;
5780
0
                }
5781
0
            }
5782
0
        }
5783
0
    }
5784
0
    return false;
5785
0
}
5786
5787
static unsigned
5788
sched_numa_noniso_pmd_count(struct sched_numa *numa)
5789
0
{
5790
0
    if (numa->n_pmds > numa->n_isolated) {
5791
0
        return numa->n_pmds - numa->n_isolated;
5792
0
    }
5793
0
    return 0;
5794
0
}
5795
5796
/* Sort Rx Queues by the processing cycles they are consuming. */
5797
static int
5798
compare_rxq_cycles(const void *a, const void *b)
5799
0
{
5800
0
    struct dp_netdev_rxq *qa;
5801
0
    struct dp_netdev_rxq *qb;
5802
0
    uint64_t cycles_qa, cycles_qb;
5803
5804
0
    qa = *(struct dp_netdev_rxq **) a;
5805
0
    qb = *(struct dp_netdev_rxq **) b;
5806
5807
0
    cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
5808
0
    cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
5809
5810
0
    if (cycles_qa != cycles_qb) {
5811
0
        return (cycles_qa < cycles_qb) ? 1 : -1;
5812
0
    } else {
5813
        /* Cycles are the same so tiebreak on port/queue id.
5814
         * Tiebreaking (as opposed to returning 0) ensures consistent
5815
         * sort results across multiple OS's. */
5816
0
        uint32_t port_qa = odp_to_u32(qa->port->port_no);
5817
0
        uint32_t port_qb = odp_to_u32(qb->port->port_no);
5818
0
        if (port_qa != port_qb) {
5819
0
            return port_qa > port_qb ? 1 : -1;
5820
0
        } else {
5821
0
            return netdev_rxq_get_queue_id(qa->rx)
5822
0
                    - netdev_rxq_get_queue_id(qb->rx);
5823
0
        }
5824
0
    }
5825
0
}
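
compare_rxq_cycles() sorts rx queues in descending order of measured processing cycles and tiebreaks on port and queue id so the order is stable across operating systems. A standalone sketch of a comparator with the same shape, using a simplified toy_rxq struct and qsort() rather than the real dp_netdev_rxq:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_rxq {
    uint64_t cycles;
    int port;
    int queue;
};

/* Descending by cycles; tiebreak on port then queue id, as above. */
static int
compare_toy_rxq(const void *a_, const void *b_)
{
    const struct toy_rxq *a = a_;
    const struct toy_rxq *b = b_;

    if (a->cycles != b->cycles) {
        return a->cycles < b->cycles ? 1 : -1;
    } else if (a->port != b->port) {
        return a->port > b->port ? 1 : -1;
    } else {
        return a->queue - b->queue;
    }
}

int
main(void)
{
    struct toy_rxq rxqs[] = {
        { 500, 2, 0 }, { 900, 1, 1 }, { 500, 1, 0 }, { 100, 3, 2 },
    };
    size_t n = sizeof rxqs / sizeof rxqs[0];

    qsort(rxqs, n, sizeof rxqs[0], compare_toy_rxq);
    for (size_t i = 0; i < n; i++) {
        printf("port %d queue %d: %llu cycles\n", rxqs[i].port, rxqs[i].queue,
               (unsigned long long) rxqs[i].cycles);
    }
    return 0;
}
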
5826
5827
static bool
5828
sched_pmd_new_lowest(struct sched_pmd *current_lowest, struct sched_pmd *pmd,
5829
                     bool has_proc)
5830
0
{
5831
0
    uint64_t current_num, pmd_num;
5832
5833
0
    if (current_lowest == NULL) {
5834
0
        return true;
5835
0
    }
5836
5837
0
    if (has_proc) {
5838
0
        current_num = current_lowest->pmd_proc_cycles;
5839
0
        pmd_num = pmd->pmd_proc_cycles;
5840
0
    } else {
5841
0
        current_num = current_lowest->n_rxq;
5842
0
        pmd_num = pmd->n_rxq;
5843
0
    }
5844
5845
0
    if (pmd_num < current_num) {
5846
0
        return true;
5847
0
    }
5848
0
    return false;
5849
0
}
5850
5851
static struct sched_pmd *
5852
sched_pmd_get_lowest(struct sched_numa *numa, bool has_cyc)
5853
0
{
5854
0
    struct sched_pmd *lowest_sched_pmd = NULL;
5855
5856
0
    for (unsigned i = 0; i < numa->n_pmds; i++) {
5857
0
        struct sched_pmd *sched_pmd;
5858
5859
0
        sched_pmd = &numa->pmds[i];
5860
0
        if (sched_pmd->isolated) {
5861
0
            continue;
5862
0
        }
5863
0
        if (sched_pmd_new_lowest(lowest_sched_pmd, sched_pmd, has_cyc)) {
5864
0
            lowest_sched_pmd = sched_pmd;
5865
0
        }
5866
0
    }
5867
0
    return lowest_sched_pmd;
5868
0
}
5869
5870
/*
5871
 * Returns the next pmd from the numa node.
5872
 *
5873
 * If 'updown' is 'true' it will alternate between selecting the next pmd in
5874
 * either an up or down walk, switching between up/down when the first or last
5875
 * core is reached. e.g. 1,2,3,3,2,1,1,2...
5876
 *
5877
 * If 'updown' is 'false' it will select the next pmd wrapping around when
5878
 * last core reached. e.g. 1,2,3,1,2,3,1,2...
5879
 */
5880
static struct sched_pmd *
5881
sched_pmd_next_rr(struct sched_numa *numa, bool updown)
5882
0
{
5883
0
    int numa_idx = numa->rr_cur_index;
5884
5885
0
    if (numa->rr_idx_inc == true) {
5886
        /* Incrementing through list of pmds. */
5887
0
        if (numa->rr_cur_index == numa->n_pmds - 1) {
5888
            /* Reached the last pmd. */
5889
0
            if (updown) {
5890
0
                numa->rr_idx_inc = false;
5891
0
            } else {
5892
0
                numa->rr_cur_index = 0;
5893
0
            }
5894
0
        } else {
5895
0
            numa->rr_cur_index++;
5896
0
        }
5897
0
    } else {
5898
        /* Decrementing through list of pmds. */
5899
0
        if (numa->rr_cur_index == 0) {
5900
            /* Reached the first pmd. */
5901
0
            numa->rr_idx_inc = true;
5902
0
        } else {
5903
0
            numa->rr_cur_index--;
5904
0
        }
5905
0
    }
5906
0
    return &numa->pmds[numa_idx];
5907
0
}
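
To make the walk orders in the comment above concrete, the sketch below reimplements just the index logic of sched_pmd_next_rr() and prints both sequences for three PMDs: the up/down walk (0,1,2,2,1,0,...) used when updown is true and the plain wraparound (0,1,2,0,1,2,...) otherwise. It is an illustration only, not code from this file.

#include <stdbool.h>
#include <stdio.h>

/* Returns the next index over 'n' slots, mirroring the rr walk above. */
static int
next_rr(int *cur, bool *inc, int n, bool updown)
{
    int idx = *cur;

    if (*inc) {
        if (*cur == n - 1) {
            if (updown) {
                *inc = false;
            } else {
                *cur = 0;
            }
        } else {
            (*cur)++;
        }
    } else {
        if (*cur == 0) {
            *inc = true;
        } else {
            (*cur)--;
        }
    }
    return idx;
}

int
main(void)
{
    for (int updown = 0; updown <= 1; updown++) {
        int cur = 0;
        bool inc = true;

        printf("updown=%d:", updown);
        for (int i = 0; i < 8; i++) {
            printf(" %d", next_rr(&cur, &inc, 3, updown));
        }
        printf("\n");
    }
    return 0;
}
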
5908
5909
static struct sched_pmd *
5910
sched_pmd_next_noniso_rr(struct sched_numa *numa, bool updown)
5911
0
{
5912
0
    struct sched_pmd *sched_pmd = NULL;
5913
5914
    /* sched_pmd_next_rr() may return duplicate PMDs before all PMDs have been
5915
     * returned depending on updown. Call it more than n_pmds times to ensure
     * all PMDs can be searched for the next non-isolated PMD. */
5916
5917
0
    for (unsigned i = 0; i < numa->n_pmds * 2; i++) {
5918
0
        sched_pmd = sched_pmd_next_rr(numa, updown);
5919
0
        if (!sched_pmd->isolated) {
5920
0
            break;
5921
0
        }
5922
0
        sched_pmd = NULL;
5923
0
    }
5924
0
    return sched_pmd;
5925
0
}
5926
5927
static struct sched_pmd *
5928
sched_pmd_next(struct sched_numa *numa, enum sched_assignment_type algo,
5929
               bool has_proc)
5930
0
{
5931
0
    if (algo == SCHED_GROUP) {
5932
0
        return sched_pmd_get_lowest(numa, has_proc);
5933
0
    }
5934
5935
    /* By default RR the PMDs. */
5936
0
    return sched_pmd_next_noniso_rr(numa, algo == SCHED_CYCLES ? true : false);
5937
0
}
5938
5939
static const char *
5940
get_assignment_type_string(enum sched_assignment_type algo)
5941
0
{
5942
0
    switch (algo) {
5943
0
    case SCHED_ROUNDROBIN: return "roundrobin";
5944
0
    case SCHED_CYCLES: return "cycles";
5945
0
    case SCHED_GROUP: return "group";
5946
0
    default: return "Unknown";
5947
0
    }
5948
0
}
5949
5950
0
#define MAX_RXQ_CYC_TEXT 40
5951
0
#define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + MAX_RXQ_CYC_TEXT)
5952
5953
static char *
5954
get_rxq_cyc_log(char *a, enum sched_assignment_type algo, uint64_t cycles)
5955
0
{
5956
0
    int ret = 0;
5957
5958
0
    if (algo != SCHED_ROUNDROBIN) {
5959
0
        ret = snprintf(a, MAX_RXQ_CYC_STRLEN,
5960
0
                       " (measured processing cycles %"PRIu64")", cycles);
5961
0
    }
5962
5963
0
    if (algo == SCHED_ROUNDROBIN || ret <= 0) {
5964
0
        a[0] = '\0';
5965
0
    }
5966
0
    return a;
5967
0
}
5968
5969
static void
5970
sched_numa_list_schedule(struct sched_numa_list *numa_list,
5971
                         struct dp_netdev *dp,
5972
                         enum sched_assignment_type algo,
5973
                         enum vlog_level level)
5974
    OVS_REQ_RDLOCK(dp->port_rwlock)
5975
0
{
5976
0
    struct dp_netdev_port *port;
5977
0
    struct dp_netdev_rxq **rxqs = NULL;
5978
0
    struct sched_numa *last_cross_numa;
5979
0
    unsigned n_rxqs = 0;
5980
0
    bool start_logged = false;
5981
0
    size_t n_numa;
5982
5983
    /* For each port. */
5984
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5985
0
        if (!netdev_is_pmd(port->netdev)) {
5986
0
            continue;
5987
0
        }
5988
5989
        /* For each rxq on the port. */
5990
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
5991
0
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
5992
5993
0
            if (algo != SCHED_ROUNDROBIN) {
5994
0
                uint64_t cycle_hist = 0;
5995
5996
                /* Sum the queue intervals and store the cycle history. */
5997
0
                for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
5998
0
                    cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
5999
0
                }
6000
0
                dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST,
6001
0
                                         cycle_hist);
6002
0
            }
6003
6004
            /* Check if this rxq is pinned. */
6005
0
            if (rxq->core_id != OVS_CORE_UNSPEC) {
6006
0
                struct sched_pmd *sched_pmd;
6007
0
                struct dp_netdev_pmd_thread *pmd;
6008
0
                struct sched_numa *numa;
6009
0
                bool iso = dp->pmd_iso;
6010
0
                uint64_t proc_cycles;
6011
0
                char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
6012
6013
                /* This rxq should be pinned, pin it now. */
6014
0
                pmd = dp_netdev_get_pmd(dp, rxq->core_id);
6015
0
                sched_pmd = sched_pmd_find_by_pmd(numa_list, pmd);
6016
0
                dp_netdev_pmd_unref(pmd);
6017
0
                if (!sched_pmd) {
6018
                    /* Cannot find the PMD.  Cannot pin this rxq. */
6019
0
                    VLOG(level == VLL_DBG ? VLL_DBG : VLL_WARN,
6020
0
                            "Core %2u cannot be pinned with "
6021
0
                            "port \'%s\' rx queue %d. Use pmd-cpu-mask to "
6022
0
                            "enable a pmd on core %u. An alternative core "
6023
0
                            "will be assigned.",
6024
0
                            rxq->core_id,
6025
0
                            netdev_rxq_get_name(rxq->rx),
6026
0
                            netdev_rxq_get_queue_id(rxq->rx),
6027
0
                            rxq->core_id);
6028
0
                    rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
6029
0
                    rxqs[n_rxqs++] = rxq;
6030
0
                    continue;
6031
0
                }
6032
0
                if (iso) {
6033
                    /* Mark PMD as isolated if not done already. */
6034
0
                    if (sched_pmd->isolated == false) {
6035
0
                        sched_pmd->isolated = true;
6036
0
                        numa = sched_pmd->numa;
6037
0
                        numa->n_isolated++;
6038
0
                    }
6039
0
                }
6040
0
                proc_cycles = dp_netdev_rxq_get_cycles(rxq,
6041
0
                                                       RXQ_CYCLES_PROC_HIST);
6042
0
                VLOG(level, "Core %2u on numa node %d is pinned with "
6043
0
                            "port \'%s\' rx queue %d%s",
6044
0
                            sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
6045
0
                            netdev_rxq_get_name(rxq->rx),
6046
0
                            netdev_rxq_get_queue_id(rxq->rx),
6047
0
                            get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
6048
0
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
6049
0
            } else {
6050
0
                rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
6051
0
                rxqs[n_rxqs++] = rxq;
6052
0
            }
6053
0
        }
6054
0
    }
6055
6056
0
    if (n_rxqs > 1 && algo != SCHED_ROUNDROBIN) {
6057
        /* Sort the queues in order of the processing cycles
6058
         * they consumed during their last pmd interval. */
6059
0
        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
6060
0
    }
6061
6062
0
    last_cross_numa = NULL;
6063
0
    n_numa = sched_numa_list_count(numa_list);
6064
0
    for (unsigned i = 0; i < n_rxqs; i++) {
6065
0
        struct dp_netdev_rxq *rxq = rxqs[i];
6066
0
        struct sched_pmd *sched_pmd = NULL;
6067
0
        struct sched_numa *numa;
6068
0
        int port_numa_id;
6069
0
        uint64_t proc_cycles;
6070
0
        char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
6071
6072
0
        if (start_logged == false && level != VLL_DBG) {
6073
0
            VLOG(level, "Performing pmd to rx queue assignment using %s "
6074
0
                        "algorithm.", get_assignment_type_string(algo));
6075
0
            start_logged = true;
6076
0
        }
6077
6078
        /* Store the cycles for this rxq as we will log these later. */
6079
0
        proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST);
6080
6081
0
        port_numa_id = netdev_get_numa_id(rxq->port->netdev);
6082
6083
        /* Select numa. */
6084
0
        numa = sched_numa_list_lookup(numa_list, port_numa_id);
6085
6086
        /* Check if numa has no PMDs or no non-isolated PMDs. */
6087
0
        if (!numa || !sched_numa_noniso_pmd_count(numa)) {
6088
            /* Unable to use this numa to find a PMD. */
6089
0
            numa = NULL;
6090
            /* Find any numa with available PMDs. */
6091
0
            for (int j = 0; j < n_numa; j++) {
6092
0
                numa = sched_numa_list_next(numa_list, last_cross_numa);
6093
0
                last_cross_numa = numa;
6094
0
                if (sched_numa_noniso_pmd_count(numa)) {
6095
0
                    break;
6096
0
                }
6097
0
                numa = NULL;
6098
0
            }
6099
0
        }
6100
6101
0
        if (numa) {
6102
            /* Select the PMD that should be used for this rxq. */
6103
0
            sched_pmd = sched_pmd_next(numa, algo,
6104
0
                                       proc_cycles ? true : false);
6105
0
        }
6106
6107
        /* Check that a pmd has been selected. */
6108
0
        if (sched_pmd) {
6109
0
            int pmd_numa_id;
6110
6111
0
            pmd_numa_id = sched_pmd->numa->numa_id;
6112
            /* Check if selected pmd numa matches port numa. */
6113
0
            if (pmd_numa_id != port_numa_id) {
6114
0
                VLOG(level, "There's no available (non-isolated) pmd thread "
6115
0
                            "on numa node %d. Port \'%s\' rx queue %d will "
6116
0
                            "be assigned to a pmd on numa node %d. "
6117
0
                            "This may lead to reduced performance.",
6118
0
                            port_numa_id, netdev_rxq_get_name(rxq->rx),
6119
0
                            netdev_rxq_get_queue_id(rxq->rx), pmd_numa_id);
6120
0
            }
6121
0
            VLOG(level, "Core %2u on numa node %d assigned port \'%s\' "
6122
0
                        "rx queue %d%s.",
6123
0
                        sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
6124
0
                        netdev_rxq_get_name(rxq->rx),
6125
0
                        netdev_rxq_get_queue_id(rxq->rx),
6126
0
                        get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
6127
0
            sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
6128
0
        } else  {
6129
0
            VLOG(level == VLL_DBG ? level : VLL_WARN,
6130
0
                 "No non-isolated pmd on any numa available for "
6131
0
                 "port \'%s\' rx queue %d%s. "
6132
0
                 "This rx queue will not be polled.",
6133
0
                 netdev_rxq_get_name(rxq->rx),
6134
0
                 netdev_rxq_get_queue_id(rxq->rx),
6135
0
                 get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
6136
0
        }
6137
0
    }
6138
0
    free(rxqs);
6139
0
}
6140
6141
static void
6142
rxq_scheduling(struct dp_netdev *dp)
6143
    OVS_REQ_RDLOCK(dp->port_rwlock)
6144
0
{
6145
0
    struct sched_numa_list numa_list;
6146
0
    enum sched_assignment_type algo = dp->pmd_rxq_assign_type;
6147
6148
0
    sched_numa_list_populate(&numa_list, dp);
6149
0
    sched_numa_list_schedule(&numa_list, dp, algo, VLL_INFO);
6150
0
    sched_numa_list_put_in_place(&numa_list);
6151
6152
0
    sched_numa_list_free_entries(&numa_list);
6153
0
}
6154
6155
static uint64_t variance(uint64_t a[], int n);
6156
6157
static uint64_t
6158
sched_numa_variance(struct sched_numa *numa)
6159
0
{
6160
0
    uint64_t *percent_busy = NULL;
6161
0
    int n_proc = 0;
6162
0
    uint64_t var;
6163
6164
0
    percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy);
6165
6166
0
    for (unsigned i = 0; i < numa->n_pmds; i++) {
6167
0
        struct sched_pmd *sched_pmd;
6168
0
        uint64_t total_cycles = 0;
6169
6170
0
        sched_pmd = &numa->pmds[i];
6171
        /* Exclude isolated PMDs from variance calculations. */
6172
0
        if (sched_pmd->isolated == true) {
6173
0
            continue;
6174
0
        }
6175
        /* Get the total pmd cycles for an interval. */
6176
0
        atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles);
6177
6178
0
        if (total_cycles) {
6179
            /* Estimate the cycles to cover all intervals. */
6180
0
            total_cycles *= PMD_INTERVAL_MAX;
6181
0
            percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
6182
0
                                            / total_cycles;
6183
0
        } else {
6184
0
            percent_busy[n_proc++] = 0;
6185
0
        }
6186
0
    }
6187
0
    var = variance(percent_busy, n_proc);
6188
0
    free(percent_busy);
6189
0
    return var;
6190
0
}
6191
6192
/*
6193
 * This function checks that some basic conditions needed for a rebalance to be
6194
 * effective are met. Such as Rxq scheduling assignment type, more than one
6195
 * PMD, more than 2 Rxqs on a PMD. If there was no reconfiguration change
6196
 * since the last check, it reuses the last result.
6197
 *
6198
 * It is not intended to be an inclusive check of every condition that may make
6199
 * a rebalance ineffective. It is done as a quick check so a full
6200
 * pmd_rebalance_dry_run() can be avoided when it is not needed.
6201
 */
6202
static bool
6203
pmd_rebalance_dry_run_needed(struct dp_netdev *dp)
6204
    OVS_REQ_RDLOCK(dp->port_rwlock)
6205
0
{
6206
0
    struct dp_netdev_pmd_thread *pmd;
6207
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
6208
0
    unsigned int cnt = 0;
6209
0
    bool multi_rxq = false;
6210
6211
    /* Check if there was no reconfiguration since last check. */
6212
0
    if (!pmd_alb->recheck_config) {
6213
0
        if (!pmd_alb->do_dry_run) {
6214
0
            VLOG_DBG("PMD auto load balance nothing to do, "
6215
0
                     "no configuration changes since last check.");
6216
0
            return false;
6217
0
        }
6218
0
        return true;
6219
0
    }
6220
0
    pmd_alb->recheck_config = false;
6221
6222
    /* Check for incompatible assignment type. */
6223
0
    if (dp->pmd_rxq_assign_type == SCHED_ROUNDROBIN) {
6224
0
        VLOG_DBG("PMD auto load balance nothing to do, "
6225
0
                 "pmd-rxq-assign=roundrobin assignment type configured.");
6226
0
        return pmd_alb->do_dry_run = false;
6227
0
    }
6228
6229
    /* Check that there are at least 2 non-isolated PMDs and
6230
     * one of them is polling more than one rxq. */
6231
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6232
0
        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
6233
0
            continue;
6234
0
        }
6235
6236
0
        if (hmap_count(&pmd->poll_list) > 1) {
6237
0
            multi_rxq = true;
6238
0
        }
6239
0
        if (cnt && multi_rxq) {
6240
0
            return pmd_alb->do_dry_run = true;
6241
0
        }
6242
0
        cnt++;
6243
0
    }
6244
6245
0
    VLOG_DBG("PMD auto load balance nothing to do, "
6246
0
             "not enough non-isolated PMDs or RxQs.");
6247
0
    return pmd_alb->do_dry_run = false;
6248
0
}
6249
6250
static bool
6251
pmd_rebalance_dry_run(struct dp_netdev *dp)
6252
    OVS_REQ_RDLOCK(dp->port_rwlock)
6253
0
{
6254
0
    struct sched_numa_list numa_list_cur;
6255
0
    struct sched_numa_list numa_list_est;
6256
0
    bool thresh_met = false;
6257
0
    uint64_t current_var, estimate_var;
6258
0
    struct sched_numa *numa_cur, *numa_est;
6259
0
    uint64_t improvement = 0;
6260
6261
0
    VLOG_DBG("PMD auto load balance performing dry run.");
6262
6263
    /* Populate current assignments. */
6264
0
    sched_numa_list_populate(&numa_list_cur, dp);
6265
0
    sched_numa_list_assignments(&numa_list_cur, dp);
6266
6267
    /* Populate estimated assignments. */
6268
0
    sched_numa_list_populate(&numa_list_est, dp);
6269
0
    sched_numa_list_schedule(&numa_list_est, dp,
6270
0
                             dp->pmd_rxq_assign_type, VLL_DBG);
6271
6272
    /* Estimate only if not cross-numa polling or only one numa has PMDs. */
6273
0
    if (!sched_numa_list_cross_numa_polling(&numa_list_est) ||
6274
0
            sched_numa_list_count(&numa_list_est) == 1) {
6275
6276
        /* Calculate variances. */
6277
0
        HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) {
6278
0
            numa_est = sched_numa_list_lookup(&numa_list_est,
6279
0
                                              numa_cur->numa_id);
6280
0
            if (!numa_est) {
6281
0
                continue;
6282
0
            }
6283
0
            current_var = sched_numa_variance(numa_cur);
6284
0
            estimate_var = sched_numa_variance(numa_est);
6285
0
            if (estimate_var < current_var) {
6286
0
                improvement = ((current_var - estimate_var) * 100)
6287
0
                              / current_var;
6288
0
            }
6289
0
            VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated "
6290
0
                     "variance %"PRIu64". Variance improvement %"PRIu64"%%.",
6291
0
                     numa_cur->numa_id, current_var,
6292
0
                     estimate_var, improvement);
6293
0
            if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
6294
0
                thresh_met = true;
6295
0
            }
6296
0
        }
6297
0
        VLOG_DBG("PMD load variance improvement threshold %u%% is %s.",
6298
0
                 dp->pmd_alb.rebalance_improve_thresh,
6299
0
                 thresh_met ? "met" : "not met");
6300
0
    } else {
6301
0
        VLOG_DBG("PMD auto load balance detected cross-numa polling with "
6302
0
                 "multiple numa nodes. Unable to accurately estimate.");
6303
0
    }
6304
6305
0
    sched_numa_list_free_entries(&numa_list_cur);
6306
0
    sched_numa_list_free_entries(&numa_list_est);
6307
6308
0
    return thresh_met;
6309
0
}
6310
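/* Editor's note: illustrative sketch added for clarity; not part of
 * dpif-netdev.c or of the coverage data.  It shows the percentage
 * improvement test used by the dry run above with made-up variance values;
 * the threshold value here is hypothetical. */
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    uint64_t current_var = 400;   /* Variance of current PMD loads. */
    uint64_t estimate_var = 100;  /* Variance of estimated reassignment. */
    uint64_t thresh = 25;         /* Hypothetical improvement threshold, %. */
    uint64_t improvement = 0;

    if (estimate_var < current_var) {
        improvement = (current_var - estimate_var) * 100 / current_var;
    }
    printf("improvement=%"PRIu64"%%, threshold %s\n", improvement,
           improvement >= thresh ? "met" : "not met");
    /* Prints: improvement=75%, threshold met */
    return 0;
}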
6311
static void
6312
reload_affected_pmds(struct dp_netdev *dp)
6313
0
{
6314
0
    struct dp_netdev_pmd_thread *pmd;
6315
6316
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6317
0
        if (pmd->need_reload) {
6318
0
            dp_netdev_reload_pmd__(pmd);
6319
0
        }
6320
0
    }
6321
6322
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6323
0
        if (pmd->need_reload) {
6324
0
            if (pmd->core_id != NON_PMD_CORE_ID) {
6325
0
                bool reload;
6326
6327
0
                do {
6328
0
                    atomic_read_explicit(&pmd->reload, &reload,
6329
0
                                         memory_order_acquire);
6330
0
                } while (reload);
6331
0
            }
6332
0
            pmd->need_reload = false;
6333
0
        }
6334
0
    }
6335
0
}
6336
6337
static void
6338
reconfigure_pmd_threads(struct dp_netdev *dp)
6339
    OVS_REQ_RDLOCK(dp->port_rwlock)
6340
0
{
6341
0
    struct dp_netdev_pmd_thread *pmd;
6342
0
    struct ovs_numa_dump *pmd_cores;
6343
0
    struct ovs_numa_info_core *core;
6344
0
    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
6345
0
    struct hmapx_node *node;
6346
0
    bool changed = false;
6347
0
    bool need_to_adjust_static_tx_qids = false;
6348
6349
    /* The pmd threads should be started only if there's a pmd port in the
6350
     * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
6351
     * NR_PMD_THREADS per numa node. */
6352
0
    if (!has_pmd_port(dp)) {
6353
0
        pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
6354
0
    } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
6355
0
        pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
6356
0
    } else {
6357
0
        pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
6358
0
    }
6359
6360
    /* We need to adjust 'static_tx_qid's only if we're reducing number of
6361
     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
6362
0
    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
6363
        /* Adjustment is required to keep 'static_tx_qid's sequential and
6364
         * avoid possible issues, for example, imbalanced tx queue usage
6365
         * and unnecessary locking caused by remapping on netdev level. */
6366
0
        need_to_adjust_static_tx_qids = true;
6367
0
    }
6368
6369
    /* Check for unwanted pmd threads */
6370
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6371
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
6372
0
            continue;
6373
0
        }
6374
0
        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
6375
0
                                                    pmd->core_id)) {
6376
0
            hmapx_add(&to_delete, pmd);
6377
0
        } else if (need_to_adjust_static_tx_qids) {
6378
0
            atomic_store_relaxed(&pmd->reload_tx_qid, true);
6379
0
            pmd->need_reload = true;
6380
0
        }
6381
0
    }
6382
6383
0
    HMAPX_FOR_EACH (node, &to_delete) {
6384
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
6385
0
        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
6386
0
                  pmd->numa_id, pmd->core_id);
6387
0
        dp_netdev_del_pmd(dp, pmd);
6388
0
    }
6389
0
    changed = !hmapx_is_empty(&to_delete);
6390
0
    hmapx_destroy(&to_delete);
6391
6392
0
    if (need_to_adjust_static_tx_qids) {
6393
        /* 'static_tx_qid's are not sequential now.
6394
         * Reload remaining threads to fix this. */
6395
0
        reload_affected_pmds(dp);
6396
0
    }
6397
6398
    /* Check for required new pmd threads */
6399
0
    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
6400
0
        pmd = dp_netdev_get_pmd(dp, core->core_id);
6401
0
        if (!pmd) {
6402
0
            struct ds name = DS_EMPTY_INITIALIZER;
6403
6404
0
            pmd = xzalloc(sizeof *pmd);
6405
0
            dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
6406
6407
0
            ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
6408
0
            pmd->thread = ovs_thread_create(ds_cstr(&name),
6409
0
                                            pmd_thread_main, pmd);
6410
0
            ds_destroy(&name);
6411
6412
0
            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
6413
0
                      pmd->numa_id, pmd->core_id);
6414
0
            changed = true;
6415
0
        } else {
6416
0
            dp_netdev_pmd_unref(pmd);
6417
0
        }
6418
0
    }
6419
6420
0
    if (changed) {
6421
0
        struct ovs_numa_info_numa *numa;
6422
6423
        /* Log the number of pmd threads per numa node. */
6424
0
        FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
6425
0
            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
6426
0
                      numa->n_cores, numa->numa_id);
6427
0
        }
6428
0
    }
6429
6430
0
    ovs_numa_dump_destroy(pmd_cores);
6431
0
}
6432
6433
static void
6434
pmd_remove_stale_ports(struct dp_netdev *dp,
6435
                       struct dp_netdev_pmd_thread *pmd)
6436
    OVS_EXCLUDED(pmd->port_mutex)
6437
    OVS_REQ_RDLOCK(dp->port_rwlock)
6438
0
{
6439
0
    struct rxq_poll *poll;
6440
0
    struct tx_port *tx;
6441
6442
0
    ovs_mutex_lock(&pmd->port_mutex);
6443
0
    HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
6444
0
        struct dp_netdev_port *port = poll->rxq->port;
6445
6446
0
        if (port->need_reconfigure
6447
0
            || !hmap_contains(&dp->ports, &port->node)) {
6448
0
            dp_netdev_del_rxq_from_pmd(pmd, poll);
6449
0
        }
6450
0
    }
6451
0
    HMAP_FOR_EACH_SAFE (tx, node, &pmd->tx_ports) {
6452
0
        struct dp_netdev_port *port = tx->port;
6453
6454
0
        if (port->need_reconfigure
6455
0
            || !hmap_contains(&dp->ports, &port->node)) {
6456
0
            dp_netdev_del_port_tx_from_pmd(pmd, tx);
6457
0
        }
6458
0
    }
6459
0
    ovs_mutex_unlock(&pmd->port_mutex);
6460
0
}
6461
6462
/* Must be called each time a port is added/removed or the cmask changes.
6463
 * This creates and destroys pmd threads, reconfigures ports, opens their
6464
 * rxqs and assigns all rxqs/txqs to pmd threads. */
6465
static void
6466
reconfigure_datapath(struct dp_netdev *dp)
6467
    OVS_REQ_RDLOCK(dp->port_rwlock)
6468
0
{
6469
0
    struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
6470
0
    struct dp_netdev_pmd_thread *pmd;
6471
0
    struct dp_netdev_port *port;
6472
0
    int wanted_txqs;
6473
6474
0
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
6475
6476
    /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
6477
     * on the system and the user configuration. */
6478
0
    reconfigure_pmd_threads(dp);
6479
6480
0
    wanted_txqs = cmap_count(&dp->poll_threads);
6481
6482
    /* The number of pmd threads might have changed, or a port can be new:
6483
     * adjust the txqs. */
6484
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6485
0
        netdev_set_tx_multiq(port->netdev, wanted_txqs);
6486
0
    }
6487
6488
    /* Step 2: Remove from the pmd threads ports that have been removed or
6489
     * need reconfiguration. */
6490
6491
    /* Check for all the ports that need reconfiguration.  We cache this in
6492
     * 'port->need_reconfigure', because netdev_is_reconf_required() can
6493
     * change at any time.
6494
     * Also mark for reconfiguration all ports which will likely change their
6495
     * 'txq_mode' parameter.  It's required to stop using them before
6496
     * changing this setting and it's simpler to mark ports here and allow
6497
     * 'pmd_remove_stale_ports' to remove them from threads.  There will be
6498
     * no actual reconfiguration in 'port_reconfigure' because it's
6499
     * unnecessary.  */
6500
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6501
0
        if (netdev_is_reconf_required(port->netdev)
6502
0
            || ((port->txq_mode == TXQ_MODE_XPS)
6503
0
                != (netdev_n_txq(port->netdev) < wanted_txqs))
6504
0
            || ((port->txq_mode == TXQ_MODE_XPS_HASH)
6505
0
                != (port->txq_requested_mode == TXQ_REQ_MODE_HASH
6506
0
                    && netdev_n_txq(port->netdev) > 1))) {
6507
0
            port->need_reconfigure = true;
6508
0
        }
6509
0
    }
6510
6511
    /* Remove from the pmd threads all the ports that have been deleted or
6512
     * need reconfiguration. */
6513
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6514
0
        pmd_remove_stale_ports(dp, pmd);
6515
0
    }
6516
6517
    /* Reload affected pmd threads.  We must wait for the pmd threads before
6518
     * reconfiguring the ports, because a port cannot be reconfigured while
6519
     * it's being used. */
6520
0
    reload_affected_pmds(dp);
6521
6522
    /* Step 3: Reconfigure ports. */
6523
6524
    /* We only reconfigure the ports that we determined above, because they're
6525
     * not being used by any pmd thread at the moment.  If a port fails to
6526
     * reconfigure we remove it from the datapath. */
6527
0
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
6528
0
        int err;
6529
6530
0
        if (!port->need_reconfigure) {
6531
0
            continue;
6532
0
        }
6533
6534
0
        err = port_reconfigure(port);
6535
0
        if (err) {
6536
0
            hmap_remove(&dp->ports, &port->node);
6537
0
            seq_change(dp->port_seq);
6538
0
            port_destroy(port);
6539
0
        } else {
6540
            /* With a single queue, there is no point in using hash mode. */
6541
0
            if (port->txq_requested_mode == TXQ_REQ_MODE_HASH &&
6542
0
                netdev_n_txq(port->netdev) > 1) {
6543
0
                port->txq_mode = TXQ_MODE_XPS_HASH;
6544
0
            } else if (netdev_n_txq(port->netdev) < wanted_txqs) {
6545
0
                port->txq_mode = TXQ_MODE_XPS;
6546
0
            } else {
6547
0
                port->txq_mode = TXQ_MODE_STATIC;
6548
0
            }
6549
0
        }
6550
0
    }
6551
6552
    /* Step 4: Compute new rxq scheduling.  We don't touch the pmd threads
6553
     * for now, we just update the 'pmd' pointer in each rxq to point to the
6554
     * wanted thread according to the scheduling policy. */
6555
6556
    /* Reset all the pmd threads to non-isolated. */
6557
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6558
0
        pmd->isolated = false;
6559
0
    }
6560
6561
    /* Reset all the queues to unassigned */
6562
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6563
0
        for (int i = 0; i < port->n_rxq; i++) {
6564
0
            port->rxqs[i].pmd = NULL;
6565
0
        }
6566
0
    }
6567
0
    rxq_scheduling(dp);
6568
6569
    /* Step 5: Remove queues not compliant with new scheduling. */
6570
6571
    /* Count all the threads that will have at least one queue to poll. */
6572
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6573
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
6574
0
            struct dp_netdev_rxq *q = &port->rxqs[qid];
6575
6576
0
            if (q->pmd) {
6577
0
                hmapx_add(&busy_threads, q->pmd);
6578
0
            }
6579
0
        }
6580
0
    }
6581
6582
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6583
0
        struct rxq_poll *poll;
6584
6585
0
        ovs_mutex_lock(&pmd->port_mutex);
6586
0
        HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
6587
0
            if (poll->rxq->pmd != pmd) {
6588
0
                dp_netdev_del_rxq_from_pmd(pmd, poll);
6589
6590
                /* This pmd might sleep after this step if it has no rxq
6591
                 * remaining. Tell it to busy wait for new assignment if it
6592
                 * has at least one scheduled queue. */
6593
0
                if (hmap_count(&pmd->poll_list) == 0 &&
6594
0
                    hmapx_contains(&busy_threads, pmd)) {
6595
0
                    atomic_store_relaxed(&pmd->wait_for_reload, true);
6596
0
                }
6597
0
            }
6598
0
        }
6599
0
        ovs_mutex_unlock(&pmd->port_mutex);
6600
0
    }
6601
6602
0
    hmapx_destroy(&busy_threads);
6603
6604
    /* Reload affected pmd threads.  We must wait for the pmd threads to remove
6605
     * the old queues before re-adding them, otherwise a queue can be polled by
6606
     * two threads at the same time. */
6607
0
    reload_affected_pmds(dp);
6608
6609
    /* Step 6: Add queues from scheduling, if they're not there already. */
6610
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6611
0
        if (!netdev_is_pmd(port->netdev)) {
6612
0
            continue;
6613
0
        }
6614
6615
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
6616
0
            struct dp_netdev_rxq *q = &port->rxqs[qid];
6617
6618
0
            if (q->pmd) {
6619
0
                ovs_mutex_lock(&q->pmd->port_mutex);
6620
0
                dp_netdev_add_rxq_to_pmd(q->pmd, q);
6621
0
                ovs_mutex_unlock(&q->pmd->port_mutex);
6622
0
            }
6623
0
        }
6624
0
    }
6625
6626
    /* Add every port and bond to the tx port and bond caches of
6627
     * every pmd thread, if it's not there already and if this pmd
6628
     * has at least one rxq to poll.
6629
     */
6630
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6631
0
        ovs_mutex_lock(&pmd->port_mutex);
6632
0
        if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
6633
0
            struct tx_bond *bond;
6634
6635
0
            HMAP_FOR_EACH (port, node, &dp->ports) {
6636
0
                dp_netdev_add_port_tx_to_pmd(pmd, port);
6637
0
            }
6638
6639
0
            CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
6640
0
                dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
6641
0
            }
6642
0
        }
6643
0
        ovs_mutex_unlock(&pmd->port_mutex);
6644
0
    }
6645
6646
    /* Reload affected pmd threads. */
6647
0
    reload_affected_pmds(dp);
6648
6649
    /* PMD ALB will need to recheck if a dry run is needed. */
6650
0
    dp->pmd_alb.recheck_config = true;
6651
0
}
6652
6653
/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
6654
static bool
6655
ports_require_restart(const struct dp_netdev *dp)
6656
    OVS_REQ_RDLOCK(dp->port_rwlock)
6657
0
{
6658
0
    struct dp_netdev_port *port;
6659
6660
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6661
0
        if (netdev_is_reconf_required(port->netdev)) {
6662
0
            return true;
6663
0
        }
6664
0
    }
6665
6666
0
    return false;
6667
0
}
6668
6669
/* Calculates variance in the values stored in array 'a'. 'n' is the number
6670
 * of elements in the array to be considered for calculating variance.
6671
 * Usage example: data array 'a' contains the processing load of each pmd and
6672
 * 'n' is the number of PMDs. It returns the variance in processing load of
6673
 * PMDs. */
6674
static uint64_t
6675
variance(uint64_t a[], int n)
6676
0
{
6677
    /* Compute mean (average of elements). */
6678
0
    uint64_t sum = 0;
6679
0
    uint64_t mean = 0;
6680
0
    uint64_t sqDiff = 0;
6681
6682
0
    if (!n) {
6683
0
        return 0;
6684
0
    }
6685
6686
0
    for (int i = 0; i < n; i++) {
6687
0
        sum += a[i];
6688
0
    }
6689
6690
0
    if (sum) {
6691
0
        mean = sum / n;
6692
6693
        /* Compute the sum of squared differences from the mean. */
6694
0
        for (int i = 0; i < n; i++) {
6695
0
            sqDiff += (a[i] - mean)*(a[i] - mean);
6696
0
        }
6697
0
    }
6698
0
    return (sqDiff ? (sqDiff / n) : 0);
6699
0
}
6700
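/* Editor's note: illustrative sketch added for clarity; not part of
 * dpif-netdev.c or of the coverage data.  It works the variance formula
 * above through a hypothetical set of per-PMD busy percentages. */
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    uint64_t busy[] = { 90, 50, 10, 50 };   /* Percent busy per PMD. */
    int n = 4;
    uint64_t sum = 0, mean, sq_diff = 0;

    for (int i = 0; i < n; i++) {
        sum += busy[i];
    }
    mean = sum / n;                         /* (90+50+10+50) / 4 = 50 */

    for (int i = 0; i < n; i++) {
        /* Unsigned wrap-around cancels out when squaring, as in
         * variance() above. */
        sq_diff += (busy[i] - mean) * (busy[i] - mean);
    }
    /* (1600 + 0 + 1600 + 0) / 4 = 800 */
    printf("variance = %"PRIu64"\n", sq_diff / n);
    return 0;
}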
6701
/* Return true if needs to revalidate datapath flows. */
6702
static bool
6703
dpif_netdev_run(struct dpif *dpif)
6704
0
{
6705
0
    struct dp_netdev_port *port;
6706
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6707
0
    struct dp_netdev_pmd_thread *non_pmd;
6708
0
    uint64_t new_tnl_seq;
6709
0
    bool need_to_flush = true;
6710
0
    bool pmd_rebalance = false;
6711
0
    long long int now = time_msec();
6712
0
    struct dp_netdev_pmd_thread *pmd;
6713
6714
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
6715
0
    non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
6716
0
    if (non_pmd) {
6717
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
6718
6719
0
        atomic_read_relaxed(&dp->smc_enable_db, &non_pmd->ctx.smc_enable_db);
6720
6721
0
        HMAP_FOR_EACH (port, node, &dp->ports) {
6722
0
            if (!netdev_is_pmd(port->netdev)) {
6723
0
                int i;
6724
6725
0
                if (port->emc_enabled) {
6726
0
                    atomic_read_relaxed(&dp->emc_insert_min,
6727
0
                                        &non_pmd->ctx.emc_insert_min);
6728
0
                } else {
6729
0
                    non_pmd->ctx.emc_insert_min = 0;
6730
0
                }
6731
6732
0
                for (i = 0; i < port->n_rxq; i++) {
6733
6734
0
                    if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
6735
0
                        continue;
6736
0
                    }
6737
6738
0
                    if (dp_netdev_process_rxq_port(non_pmd,
6739
0
                                                   &port->rxqs[i],
6740
0
                                                   port->port_no)) {
6741
0
                        need_to_flush = false;
6742
0
                    }
6743
0
                }
6744
0
            }
6745
0
        }
6746
0
        if (need_to_flush) {
6747
            /* We didn't receive anything in the process loop.
6748
             * Check if we need to send something.
6749
             * There were no time updates on the current iteration. */
6750
0
            pmd_thread_ctx_time_update(non_pmd);
6751
0
            dp_netdev_pmd_flush_output_packets(non_pmd, false);
6752
0
        }
6753
6754
0
        dpif_netdev_xps_revalidate_pmd(non_pmd, false);
6755
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
6756
6757
0
        dp_netdev_pmd_unref(non_pmd);
6758
0
    }
6759
6760
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
6761
0
    if (pmd_alb->is_enabled) {
6762
0
        if (!pmd_alb->rebalance_poll_timer) {
6763
0
            pmd_alb->rebalance_poll_timer = now;
6764
0
        } else if ((pmd_alb->rebalance_poll_timer +
6765
0
                   pmd_alb->rebalance_intvl) < now) {
6766
0
            pmd_alb->rebalance_poll_timer = now;
6767
0
            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6768
0
                if (atomic_count_get(&pmd->pmd_overloaded) >=
6769
0
                                    PMD_INTERVAL_MAX) {
6770
0
                    pmd_rebalance = true;
6771
0
                    break;
6772
0
                }
6773
0
            }
6774
6775
0
            if (pmd_rebalance &&
6776
0
                !dp_netdev_is_reconf_required(dp) &&
6777
0
                !ports_require_restart(dp) &&
6778
0
                pmd_rebalance_dry_run_needed(dp) &&
6779
0
                pmd_rebalance_dry_run(dp)) {
6780
0
                VLOG_INFO("PMD auto load balance dry run. "
6781
0
                          "Requesting datapath reconfigure.");
6782
0
                dp_netdev_request_reconfigure(dp);
6783
0
            }
6784
0
        }
6785
0
    }
6786
6787
0
    if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
6788
0
        reconfigure_datapath(dp);
6789
0
    }
6790
0
    ovs_rwlock_unlock(&dp->port_rwlock);
6791
6792
0
    tnl_neigh_cache_run();
6793
0
    tnl_port_map_run();
6794
0
    new_tnl_seq = seq_read(tnl_conf_seq);
6795
6796
0
    if (dp->last_tnl_conf_seq != new_tnl_seq) {
6797
0
        dp->last_tnl_conf_seq = new_tnl_seq;
6798
0
        return true;
6799
0
    }
6800
0
    return false;
6801
0
}
6802
6803
static void
6804
dpif_netdev_wait(struct dpif *dpif)
6805
0
{
6806
0
    struct dp_netdev_port *port;
6807
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6808
6809
0
    ovs_mutex_lock(&dp_netdev_mutex);
6810
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
6811
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6812
0
        netdev_wait_reconf_required(port->netdev);
6813
0
        if (!netdev_is_pmd(port->netdev)) {
6814
0
            int i;
6815
6816
0
            for (i = 0; i < port->n_rxq; i++) {
6817
0
                netdev_rxq_wait(port->rxqs[i].rx);
6818
0
            }
6819
0
        }
6820
0
    }
6821
0
    ovs_rwlock_unlock(&dp->port_rwlock);
6822
0
    ovs_mutex_unlock(&dp_netdev_mutex);
6823
0
    seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
6824
0
}
6825
6826
static void
6827
pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
6828
0
{
6829
0
    struct tx_port *tx_port_cached;
6830
6831
    /* Flush all the queued packets. */
6832
0
    dp_netdev_pmd_flush_output_packets(pmd, true);
6833
    /* Free all used tx queue ids. */
6834
0
    dpif_netdev_xps_revalidate_pmd(pmd, true);
6835
6836
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
6837
0
        free(tx_port_cached->txq_pkts);
6838
0
        free(tx_port_cached);
6839
0
    }
6840
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
6841
0
        free(tx_port_cached->txq_pkts);
6842
0
        free(tx_port_cached);
6843
0
    }
6844
0
}
6845
6846
/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
6847
 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
6848
 * device, otherwise to 'pmd->send_port_cache' if the port has at least
6849
 * one txq. */
6850
static void
6851
pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
6852
    OVS_REQUIRES(pmd->port_mutex)
6853
0
{
6854
0
    struct tx_port *tx_port, *tx_port_cached;
6855
6856
0
    pmd_free_cached_ports(pmd);
6857
0
    hmap_shrink(&pmd->send_port_cache);
6858
0
    hmap_shrink(&pmd->tnl_port_cache);
6859
6860
0
    HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
6861
0
        int n_txq = netdev_n_txq(tx_port->port->netdev);
6862
0
        struct dp_packet_batch *txq_pkts_cached;
6863
6864
0
        if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
6865
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
6866
0
            if (tx_port->txq_pkts) {
6867
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
6868
0
                                          n_txq * sizeof *tx_port->txq_pkts);
6869
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
6870
0
            }
6871
0
            hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
6872
0
                        hash_port_no(tx_port_cached->port->port_no));
6873
0
        }
6874
6875
0
        if (n_txq) {
6876
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
6877
0
            if (tx_port->txq_pkts) {
6878
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
6879
0
                                          n_txq * sizeof *tx_port->txq_pkts);
6880
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
6881
0
            }
6882
0
            hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
6883
0
                        hash_port_no(tx_port_cached->port->port_no));
6884
0
        }
6885
0
    }
6886
0
}
6887
6888
static void
6889
pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
6890
0
{
6891
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
6892
0
    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
6893
0
        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
6894
0
                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
6895
0
    }
6896
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
6897
6898
0
    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
6899
0
             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
6900
0
}
6901
6902
static void
6903
pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
6904
0
{
6905
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
6906
0
    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
6907
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
6908
0
}
6909
6910
static int
6911
pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
6912
                          struct polled_queue **ppoll_list)
6913
0
{
6914
0
    struct polled_queue *poll_list = *ppoll_list;
6915
0
    struct rxq_poll *poll;
6916
0
    int i;
6917
6918
0
    ovs_mutex_lock(&pmd->port_mutex);
6919
0
    poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
6920
0
                                    * sizeof *poll_list);
6921
6922
0
    i = 0;
6923
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
6924
0
        poll_list[i].rxq = poll->rxq;
6925
0
        poll_list[i].port_no = poll->rxq->port->port_no;
6926
0
        poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
6927
0
        poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
6928
0
        poll_list[i].change_seq =
6929
0
                     netdev_get_change_seq(poll->rxq->port->netdev);
6930
0
        i++;
6931
0
    }
6932
6933
0
    pmd_load_cached_ports(pmd);
6934
6935
0
    ovs_mutex_unlock(&pmd->port_mutex);
6936
6937
0
    *ppoll_list = poll_list;
6938
0
    return i;
6939
0
}
6940
6941
static void *
6942
pmd_thread_main(void *f_)
6943
0
{
6944
0
    struct dp_netdev_pmd_thread *pmd = f_;
6945
0
    struct pmd_perf_stats *s = &pmd->perf_stats;
6946
0
    unsigned int lc = 0;
6947
0
    struct polled_queue *poll_list;
6948
0
    bool wait_for_reload = false;
6949
0
    bool dpdk_attached;
6950
0
    bool reload_tx_qid;
6951
0
    bool exiting;
6952
0
    bool reload;
6953
0
    int poll_cnt;
6954
0
    int i;
6955
0
    int process_packets = 0;
6956
0
    uint64_t sleep_time = 0;
6957
6958
0
    poll_list = NULL;
6959
6960
    /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
6961
0
    ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
6962
0
    ovs_numa_thread_setaffinity_core(pmd->core_id);
6963
0
    dpdk_attached = dpdk_attach_thread(pmd->core_id);
6964
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6965
0
    dfc_cache_init(&pmd->flow_cache);
6966
0
    pmd_alloc_static_tx_qid(pmd);
6967
0
    set_timer_resolution(PMD_TIMER_RES_NS);
6968
6969
0
reload:
6970
0
    atomic_count_init(&pmd->pmd_overloaded, 0);
6971
6972
0
    pmd->intrvl_tsc_prev = 0;
6973
0
    atomic_store_relaxed(&pmd->intrvl_cycles, 0);
6974
6975
0
    if (!dpdk_attached) {
6976
0
        dpdk_attached = dpdk_attach_thread(pmd->core_id);
6977
0
    }
6978
6979
    /* List port/core affinity */
6980
0
    for (i = 0; i < poll_cnt; i++) {
6981
0
       VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
6982
0
                pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
6983
0
                netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
6984
       /* Reset the rxq current cycles counter. */
6985
0
       dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
6986
0
       for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
6987
0
           dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0);
6988
0
       }
6989
0
    }
6990
6991
0
    if (!poll_cnt) {
6992
0
        if (wait_for_reload) {
6993
            /* Don't sleep, control thread will ask for a reload shortly. */
6994
0
            do {
6995
0
                atomic_read_explicit(&pmd->reload, &reload,
6996
0
                                     memory_order_acquire);
6997
0
            } while (!reload);
6998
0
        } else {
6999
0
            while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
7000
0
                seq_wait(pmd->reload_seq, pmd->last_reload_seq);
7001
0
                poll_block();
7002
0
            }
7003
0
        }
7004
0
    }
7005
7006
0
    for (i = 0; i < PMD_INTERVAL_MAX; i++) {
7007
0
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0);
7008
0
    }
7009
0
    atomic_count_set(&pmd->intrvl_idx, 0);
7010
0
    cycles_counter_update(s);
7011
7012
0
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
7013
7014
    /* Protect pmd stats from external clearing while polling. */
7015
0
    ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
7016
0
    for (;;) {
7017
0
        uint64_t rx_packets = 0, tx_packets = 0;
7018
0
        uint64_t time_slept = 0;
7019
0
        uint64_t max_sleep;
7020
7021
0
        pmd_perf_start_iteration(s);
7022
7023
0
        atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db);
7024
0
        atomic_read_relaxed(&pmd->dp->pmd_max_sleep, &max_sleep);
7025
7026
0
        for (i = 0; i < poll_cnt; i++) {
7027
7028
0
            if (!poll_list[i].rxq_enabled) {
7029
0
                continue;
7030
0
            }
7031
7032
0
            if (poll_list[i].emc_enabled) {
7033
0
                atomic_read_relaxed(&pmd->dp->emc_insert_min,
7034
0
                                    &pmd->ctx.emc_insert_min);
7035
0
            } else {
7036
0
                pmd->ctx.emc_insert_min = 0;
7037
0
            }
7038
7039
0
            process_packets =
7040
0
                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
7041
0
                                           poll_list[i].port_no);
7042
0
            rx_packets += process_packets;
7043
0
            if (process_packets >= PMD_SLEEP_THRESH) {
7044
0
                sleep_time = 0;
7045
0
            }
7046
0
        }
7047
7048
0
        if (!rx_packets) {
7049
            /* We didn't receive anything in the process loop.
7050
             * Check if we need to send something.
7051
             * There were no time updates on the current iteration. */
7052
0
            pmd_thread_ctx_time_update(pmd);
7053
0
            tx_packets = dp_netdev_pmd_flush_output_packets(pmd,
7054
0
                                                   max_sleep && sleep_time
7055
0
                                                   ? true : false);
7056
0
        }
7057
7058
0
        if (max_sleep) {
7059
            /* Check if a sleep should happen on this iteration. */
7060
0
            if (sleep_time) {
7061
0
                struct cycle_timer sleep_timer;
7062
7063
0
                cycle_timer_start(&pmd->perf_stats, &sleep_timer);
7064
0
                xnanosleep_no_quiesce(sleep_time * 1000);
7065
0
                time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer);
7066
0
                pmd_thread_ctx_time_update(pmd);
7067
0
            }
7068
0
            if (sleep_time < max_sleep) {
7069
                /* Increase sleep time for next iteration. */
7070
0
                sleep_time += PMD_SLEEP_INC_US;
7071
0
            } else {
7072
0
                sleep_time = max_sleep;
7073
0
            }
7074
0
        } else {
7075
            /* Reset sleep time as max sleep policy may have been changed. */
7076
0
            sleep_time = 0;
7077
0
        }
7078
7079
        /* Do RCU synchronization at fixed interval.  This ensures that
7080
         * synchronization would not be delayed long even at high load of
7081
         * packet processing. */
7082
0
        if (pmd->ctx.now > pmd->next_rcu_quiesce) {
7083
0
            if (!ovsrcu_try_quiesce()) {
7084
0
                pmd->next_rcu_quiesce =
7085
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
7086
0
            }
7087
0
        }
7088
7089
0
        if (lc++ > 1024) {
7090
0
            lc = 0;
7091
7092
0
            coverage_try_clear();
7093
0
            dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
7094
0
            if (!ovsrcu_try_quiesce()) {
7095
0
                emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
7096
0
                pmd->next_rcu_quiesce =
7097
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
7098
0
            }
7099
7100
0
            for (i = 0; i < poll_cnt; i++) {
7101
0
                uint64_t current_seq =
7102
0
                         netdev_get_change_seq(poll_list[i].rxq->port->netdev);
7103
0
                if (poll_list[i].change_seq != current_seq) {
7104
0
                    poll_list[i].change_seq = current_seq;
7105
0
                    poll_list[i].rxq_enabled =
7106
0
                                 netdev_rxq_enabled(poll_list[i].rxq->rx);
7107
0
                }
7108
0
            }
7109
0
        }
7110
7111
0
        atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
7112
0
        if (OVS_UNLIKELY(reload)) {
7113
0
            break;
7114
0
        }
7115
7116
0
        pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept,
7117
0
                               pmd_perf_metrics_enabled(pmd));
7118
0
    }
7119
0
    ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
7120
7121
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
7122
0
    atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
7123
0
    atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
7124
0
    atomic_read_relaxed(&pmd->exit, &exiting);
7125
    /* Signal here to make sure the pmd finishes
7126
     * reloading the updated configuration. */
7127
0
    dp_netdev_pmd_reload_done(pmd);
7128
7129
0
    if (reload_tx_qid) {
7130
0
        pmd_free_static_tx_qid(pmd);
7131
0
        pmd_alloc_static_tx_qid(pmd);
7132
0
    }
7133
7134
0
    if (!exiting) {
7135
0
        goto reload;
7136
0
    }
7137
7138
0
    pmd_free_static_tx_qid(pmd);
7139
0
    dfc_cache_uninit(&pmd->flow_cache);
7140
0
    free(poll_list);
7141
0
    pmd_free_cached_ports(pmd);
7142
0
    if (dpdk_attached) {
7143
0
        dpdk_detach_thread();
7144
0
    }
7145
0
    return NULL;
7146
0
}
7147
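/* Editor's note: illustrative sketch added for clarity; not part of
 * dpif-netdev.c or of the coverage data.  It models the incremental sleep
 * back-off used in the polling loop above: the requested sleep grows by a
 * fixed step on idle iterations, is capped at the configured maximum, and
 * resets when enough packets arrive.  All constants and names here are
 * hypothetical. */
#include <stdint.h>
#include <stdio.h>

#define SLEEP_INC_US 1
#define SLEEP_THRESH 16

static uint64_t
next_sleep_us(uint64_t sleep_us, uint64_t max_sleep_us, int rx_packets)
{
    if (!max_sleep_us || rx_packets >= SLEEP_THRESH) {
        return 0;                       /* Busy, or sleeping disabled. */
    }
    return sleep_us + SLEEP_INC_US < max_sleep_us
           ? sleep_us + SLEEP_INC_US : max_sleep_us;
}

int
main(void)
{
    uint64_t sleep_us = 0;
    int rx[] = { 0, 0, 0, 32, 0 };      /* Packets received per iteration. */

    for (int i = 0; i < 5; i++) {
        sleep_us = next_sleep_us(sleep_us, 3, rx[i]);
        printf("%d ", (int) sleep_us);  /* Prints: 1 2 3 0 1 */
    }
    putchar('\n');
    return 0;
}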
7148
static void
7149
dp_netdev_disable_upcall(struct dp_netdev *dp)
7150
    OVS_ACQUIRES(dp->upcall_rwlock)
7151
0
{
7152
0
    fat_rwlock_wrlock(&dp->upcall_rwlock);
7153
0
}
7154
7155

7156
/* Meters */
7157
static void
7158
dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
7159
                               struct ofputil_meter_features *features)
7160
0
{
7161
0
    features->max_meters = MAX_METERS;
7162
0
    features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
7163
0
    features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
7164
0
    features->max_bands = MAX_BANDS;
7165
0
    features->max_color = 0;
7166
0
}
7167
7168
/* Applies the meter identified by 'meter_id' to 'packets_'.  Packets
7169
 * that exceed a band are dropped in-place. */
7170
static void
7171
dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
7172
                    uint32_t meter_id, long long int now)
7173
0
{
7174
0
    struct dp_meter *meter;
7175
0
    struct dp_meter_band *band;
7176
0
    struct dp_packet *packet;
7177
0
    long long int long_delta_t; /* msec */
7178
0
    uint32_t delta_t; /* msec */
7179
0
    const size_t cnt = dp_packet_batch_size(packets_);
7180
0
    uint32_t bytes, volume;
7181
0
    int exceeded_band[NETDEV_MAX_BURST];
7182
0
    uint32_t exceeded_rate[NETDEV_MAX_BURST];
7183
0
    int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
7184
7185
0
    if (meter_id >= MAX_METERS) {
7186
0
        return;
7187
0
    }
7188
7189
0
    meter = dp_meter_lookup(&dp->meters, meter_id);
7190
0
    if (!meter) {
7191
0
        return;
7192
0
    }
7193
7194
    /* Initialize as negative values. */
7195
0
    memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
7196
    /* Initialize as zeroes. */
7197
0
    memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
7198
7199
0
    ovs_mutex_lock(&meter->lock);
7200
    /* All packets will hit the meter at the same time. */
7201
0
    long_delta_t = now / 1000 - meter->used / 1000; /* msec */
7202
7203
0
    if (long_delta_t < 0) {
7204
        /* This condition means that we have several threads fighting for a
7205
           meter lock, and the one that received the packets a bit later wins.
7206
           We assume that all racing threads received packets at the same time
7207
           to avoid overflow. */
7208
0
        long_delta_t = 0;
7209
0
    }
7210
7211
    /* Make sure delta_t will not be too large, so that bucket will not
7212
     * wrap around below. */
7213
0
    delta_t = (long_delta_t > (long long int)meter->max_delta_t)
7214
0
        ? meter->max_delta_t : (uint32_t)long_delta_t;
7215
7216
    /* Update meter stats. */
7217
0
    meter->used = now;
7218
0
    meter->packet_count += cnt;
7219
0
    bytes = 0;
7220
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7221
0
        bytes += dp_packet_size(packet);
7222
0
    }
7223
0
    meter->byte_count += bytes;
7224
7225
    /* Meters can operate in terms of packets per second or kilobits per
7226
     * second. */
7227
0
    if (meter->flags & OFPMF13_PKTPS) {
7228
        /* Rate in packets/second, bucket 1/1000 packets. */
7229
        /* msec * packets/sec = 1/1000 packets. */
7230
0
        volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
7231
0
    } else {
7232
        /* Rate in kbps, bucket in bits. */
7233
        /* msec * kbps = bits */
7234
0
        volume = bytes * 8;
7235
0
    }
7236
7237
    /* Update all bands and find the one hit with the highest rate for each
7238
     * packet (if any). */
7239
0
    for (int m = 0; m < meter->n_bands; ++m) {
7240
0
        uint64_t max_bucket_size;
7241
7242
0
        band = &meter->bands[m];
7243
0
        max_bucket_size = band->burst_size * 1000ULL;
7244
        /* Update band's bucket. */
7245
0
        band->bucket += (uint64_t) delta_t * band->rate;
7246
0
        if (band->bucket > max_bucket_size) {
7247
0
            band->bucket = max_bucket_size;
7248
0
        }
7249
7250
        /* Drain the bucket for all the packets, if possible. */
7251
0
        if (band->bucket >= volume) {
7252
0
            band->bucket -= volume;
7253
0
        } else {
7254
0
            int band_exceeded_pkt;
7255
7256
            /* Band limit hit, must process packet-by-packet. */
7257
0
            if (meter->flags & OFPMF13_PKTPS) {
7258
0
                band_exceeded_pkt = band->bucket / 1000;
7259
0
                band->bucket %= 1000; /* Remainder stays in bucket. */
7260
7261
                /* Update the exceeding band for each exceeding packet.
7262
                 * (Only one band will be fired by a packet, and that
7263
                 * can be different for each packet.) */
7264
0
                for (int i = band_exceeded_pkt; i < cnt; i++) {
7265
0
                    if (band->rate > exceeded_rate[i]) {
7266
0
                        exceeded_rate[i] = band->rate;
7267
0
                        exceeded_band[i] = m;
7268
0
                    }
7269
0
                }
7270
0
            } else {
7271
                /* Packet sizes differ, must process one-by-one. */
7272
0
                band_exceeded_pkt = cnt;
7273
0
                DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7274
0
                    uint32_t bits = dp_packet_size(packet) * 8;
7275
7276
0
                    if (band->bucket >= bits) {
7277
0
                        band->bucket -= bits;
7278
0
                    } else {
7279
0
                        if (i < band_exceeded_pkt) {
7280
0
                            band_exceeded_pkt = i;
7281
0
                        }
7282
                        /* Update the exceeding band for the exceeding packet.
7283
                         * (Only one band will be fired by a packet, and that
7284
                         * can be different for each packet.) */
7285
0
                        if (band->rate > exceeded_rate[i]) {
7286
0
                            exceeded_rate[i] = band->rate;
7287
0
                            exceeded_band[i] = m;
7288
0
                        }
7289
0
                    }
7290
0
                }
7291
0
            }
7292
            /* Remember the first exceeding packet. */
7293
0
            if (exceeded_pkt > band_exceeded_pkt) {
7294
0
                exceeded_pkt = band_exceeded_pkt;
7295
0
            }
7296
0
        }
7297
0
    }
7298
7299
    /* Fire the highest rate band exceeded by each packet, and drop
7300
     * packets if needed. */
7301
0
    size_t j;
7302
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
7303
0
        if (exceeded_band[j] >= 0) {
7304
            /* Meter drop packet. */
7305
0
            band = &meter->bands[exceeded_band[j]];
7306
0
            band->packet_count += 1;
7307
0
            band->byte_count += dp_packet_size(packet);
7308
0
            COVERAGE_INC(datapath_drop_meter);
7309
0
            dp_packet_delete(packet);
7310
0
        } else {
7311
            /* Meter accepts packet. */
7312
0
            dp_packet_batch_refill(packets_, packet, j);
7313
0
        }
7314
0
    }
7315
7316
0
    ovs_mutex_unlock(&meter->lock);
7317
0
}
7318
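/* Editor's note: illustrative sketch added for clarity; not part of
 * dpif-netdev.c or of the coverage data.  It shows a simplified single-band
 * token bucket using the same units as above: for a kbps band the bucket is
 * kept in bits and refilled by delta_t (msec) * rate (kbps), since
 * msec * kbits/sec = bits.  All values and names are made up. */
#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

struct simple_band {
    uint64_t rate;        /* kbps. */
    uint64_t burst_size;  /* kbits; max bucket is burst_size * 1000 bits. */
    uint64_t bucket;      /* Bits currently available. */
};

/* Refills the bucket for 'delta_ms' milliseconds, then tries to admit a
 * burst of 'bytes'.  Returns true if the whole burst fits in the bucket. */
static bool
band_admit(struct simple_band *b, uint32_t delta_ms, uint32_t bytes)
{
    uint64_t max_bucket = b->burst_size * 1000ULL;
    uint64_t bits = (uint64_t) bytes * 8;

    b->bucket += (uint64_t) delta_ms * b->rate;
    if (b->bucket > max_bucket) {
        b->bucket = max_bucket;
    }
    if (b->bucket >= bits) {
        b->bucket -= bits;
        return true;
    }
    return false;
}

int
main(void)
{
    struct simple_band b = { .rate = 1000, .burst_size = 100, .bucket = 0 };

    /* 10 ms at 1000 kbps refills 10000 bits; one 1000-byte (8000-bit)
     * packet is admitted, a second one in the same instant is not. */
    bool first = band_admit(&b, 10, 1000);
    bool second = band_admit(&b, 0, 1000);

    printf("%d %d\n", first, second);   /* Prints: 1 0 */
    return 0;
}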
7319
/* Meter set/get/del processing is still single-threaded. */
7320
static int
7321
dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
7322
                      struct ofputil_meter_config *config)
7323
0
{
7324
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7325
0
    uint32_t mid = meter_id.uint32;
7326
0
    struct dp_meter *meter;
7327
0
    int i;
7328
7329
0
    if (mid >= MAX_METERS) {
7330
0
        return EFBIG; /* Meter_id out of range. */
7331
0
    }
7332
7333
0
    if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
7334
0
        return EBADF; /* Unsupported flags set */
7335
0
    }
7336
7337
0
    if (config->n_bands > MAX_BANDS) {
7338
0
        return EINVAL;
7339
0
    }
7340
7341
0
    for (i = 0; i < config->n_bands; ++i) {
7342
0
        switch (config->bands[i].type) {
7343
0
        case OFPMBT13_DROP:
7344
0
            break;
7345
0
        default:
7346
0
            return ENODEV; /* Unsupported band type */
7347
0
        }
7348
0
    }
7349
7350
    /* Allocate meter */
7351
0
    meter = xzalloc(sizeof *meter
7352
0
                    + config->n_bands * sizeof(struct dp_meter_band));
7353
7354
0
    meter->flags = config->flags;
7355
0
    meter->n_bands = config->n_bands;
7356
0
    meter->max_delta_t = 0;
7357
0
    meter->used = time_usec();
7358
0
    meter->id = mid;
7359
0
    ovs_mutex_init_adaptive(&meter->lock);
7360
7361
    /* Set up the bands. */
7362
0
    for (i = 0; i < config->n_bands; ++i) {
7363
0
        uint32_t band_max_delta_t;
7364
7365
        /* Set burst size to a workable value if none specified. */
7366
0
        if (config->bands[i].burst_size == 0) {
7367
0
            config->bands[i].burst_size = config->bands[i].rate;
7368
0
        }
7369
7370
0
        meter->bands[i].rate = config->bands[i].rate;
7371
0
        meter->bands[i].burst_size = config->bands[i].burst_size;
7372
        /* Start with a full bucket. */
7373
0
        meter->bands[i].bucket = meter->bands[i].burst_size * 1000ULL;
7374
7375
        /* Figure out max delta_t that is enough to fill any bucket. */
7376
0
        band_max_delta_t
7377
0
            = meter->bands[i].bucket / meter->bands[i].rate;
7378
0
        if (band_max_delta_t > meter->max_delta_t) {
7379
0
            meter->max_delta_t = band_max_delta_t;
7380
0
        }
7381
0
    }
7382
7383
0
    ovs_mutex_lock(&dp->meters_lock);
7384
7385
0
    dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any. */
7386
0
    dp_meter_attach(&dp->meters, meter);
7387
7388
0
    ovs_mutex_unlock(&dp->meters_lock);
7389
7390
0
    return 0;
7391
0
}
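For reference, a hedged sketch of how a caller might populate the configuration consumed above. It only touches the fields this function actually reads (flags, n_bands, and each band's type, rate, and burst_size); dpif_netdev_meter_set() is static to this file, so real callers go through the dpif layer, and the 'dpif' handle here is assumed to come from the surrounding context.

/* Sketch: one-band drop meter, rate-limited in packets per second. */
struct ofputil_meter_band band = {
    .type = OFPMBT13_DROP,
    .rate = 1000,        /* packets/s, because OFPMF13_PKTPS is set below. */
    .burst_size = 0,     /* 0: defaulted to 'rate' by the code above. */
};
struct ofputil_meter_config config = {
    .flags = OFPMF13_PKTPS,
    .n_bands = 1,
    .bands = &band,
};
ofproto_meter_id mid = { .uint32 = 42 };
int error = dpif_netdev_meter_set(dpif, mid, &config);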
7392
7393
static int
7394
dpif_netdev_meter_get(const struct dpif *dpif,
7395
                      ofproto_meter_id meter_id_,
7396
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
7397
0
{
7398
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7399
0
    uint32_t meter_id = meter_id_.uint32;
7400
0
    const struct dp_meter *meter;
7401
7402
0
    if (meter_id >= MAX_METERS) {
7403
0
        return EFBIG;
7404
0
    }
7405
7406
0
    meter = dp_meter_lookup(&dp->meters, meter_id);
7407
0
    if (!meter) {
7408
0
        return ENOENT;
7409
0
    }
7410
7411
0
    if (stats) {
7412
0
        int i = 0;
7413
7414
0
        ovs_mutex_lock(&meter->lock);
7415
7416
0
        stats->packet_in_count = meter->packet_count;
7417
0
        stats->byte_in_count = meter->byte_count;
7418
7419
0
        for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
7420
0
            stats->bands[i].packet_count = meter->bands[i].packet_count;
7421
0
            stats->bands[i].byte_count = meter->bands[i].byte_count;
7422
0
        }
7423
7424
0
        ovs_mutex_unlock(&meter->lock);
7425
0
        stats->n_bands = i;
7426
0
    }
7427
7428
0
    return 0;
7429
0
}
7430
7431
static int
7432
dpif_netdev_meter_del(struct dpif *dpif,
7433
                      ofproto_meter_id meter_id_,
7434
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
7435
0
{
7436
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7437
0
    int error;
7438
7439
0
    error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
7440
0
    if (!error) {
7441
0
        uint32_t meter_id = meter_id_.uint32;
7442
7443
0
        ovs_mutex_lock(&dp->meters_lock);
7444
0
        dp_meter_detach_free(&dp->meters, meter_id);
7445
0
        ovs_mutex_unlock(&dp->meters_lock);
7446
0
    }
7447
0
    return error;
7448
0
}
7449
7450

7451
static void
7452
dpif_netdev_disable_upcall(struct dpif *dpif)
7453
    OVS_NO_THREAD_SAFETY_ANALYSIS
7454
0
{
7455
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7456
0
    dp_netdev_disable_upcall(dp);
7457
0
}
7458
7459
static void
7460
dp_netdev_enable_upcall(struct dp_netdev *dp)
7461
    OVS_RELEASES(dp->upcall_rwlock)
7462
0
{
7463
0
    fat_rwlock_unlock(&dp->upcall_rwlock);
7464
0
}
7465
7466
static void
7467
dpif_netdev_enable_upcall(struct dpif *dpif)
7468
    OVS_NO_THREAD_SAFETY_ANALYSIS
7469
0
{
7470
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7471
0
    dp_netdev_enable_upcall(dp);
7472
0
}
7473
7474
static void
7475
dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
7476
0
{
7477
0
    atomic_store_relaxed(&pmd->wait_for_reload, false);
7478
0
    atomic_store_relaxed(&pmd->reload_tx_qid, false);
7479
0
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
7480
0
    atomic_store_explicit(&pmd->reload, false, memory_order_release);
7481
0
}
7482
7483
/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
7484
 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
7485
 * 'core_id' is NON_PMD_CORE_ID).
7486
 *
7487
 * Caller must unref the returned reference.  */
7488
static struct dp_netdev_pmd_thread *
7489
dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
7490
0
{
7491
0
    struct dp_netdev_pmd_thread *pmd;
7492
7493
0
    CMAP_FOR_EACH_WITH_HASH (pmd, node, hash_int(core_id, 0),
7494
0
                             &dp->poll_threads) {
7495
0
        if (pmd->core_id == core_id) {
7496
0
            return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
7497
0
        }
7498
0
    }
7499
7500
0
    return NULL;
7501
0
}
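The comment above defines the reference contract: a successful lookup takes a reference that the caller must drop. A minimal usage sketch, assuming a valid 'dp' and 'core_id' from the surrounding context:

struct dp_netdev_pmd_thread *pmd = dp_netdev_get_pmd(dp, core_id);
if (pmd) {
    /* ... inspect or update per-PMD state here ... */
    dp_netdev_pmd_unref(pmd);   /* Drop the reference taken by the lookup. */
}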
7502
7503
/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
7504
static void
7505
dp_netdev_set_nonpmd(struct dp_netdev *dp)
7506
    OVS_REQ_WRLOCK(dp->port_rwlock)
7507
0
{
7508
0
    struct dp_netdev_pmd_thread *non_pmd;
7509
7510
0
    non_pmd = xzalloc(sizeof *non_pmd);
7511
0
    dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
7512
0
}
7513
7514
/* Caller must have valid pointer to 'pmd'. */
7515
static bool
7516
dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
7517
0
{
7518
0
    return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
7519
0
}
7520
7521
static void
7522
dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
7523
0
{
7524
0
    if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
7525
0
        ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
7526
0
    }
7527
0
}
7528
7529
/* Given cmap position 'pos', tries to ref the next node.  If try_ref()
7530
 * fails, keeps checking for next node until reaching the end of cmap.
7531
 *
7532
 * Caller must unref the returned reference. */
7533
static struct dp_netdev_pmd_thread *
7534
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
7535
0
{
7536
0
    struct dp_netdev_pmd_thread *next;
7537
7538
0
    do {
7539
0
        struct cmap_node *node;
7540
7541
0
        node = cmap_next_position(&dp->poll_threads, pos);
7542
0
        next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
7543
0
            : NULL;
7544
0
    } while (next && !dp_netdev_pmd_try_ref(next));
7545
7546
0
    return next;
7547
0
}
7548
7549
/* Configures the 'pmd' based on the input argument. */
7550
static void
7551
dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
7552
                        unsigned core_id, int numa_id)
7553
0
{
7554
0
    pmd->dp = dp;
7555
0
    pmd->core_id = core_id;
7556
0
    pmd->numa_id = numa_id;
7557
0
    pmd->need_reload = false;
7558
0
    pmd->n_output_batches = 0;
7559
7560
0
    ovs_refcount_init(&pmd->ref_cnt);
7561
0
    atomic_init(&pmd->exit, false);
7562
0
    pmd->reload_seq = seq_create();
7563
0
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
7564
0
    atomic_init(&pmd->reload, false);
7565
0
    ovs_mutex_init(&pmd->flow_mutex);
7566
0
    ovs_mutex_init(&pmd->port_mutex);
7567
0
    ovs_mutex_init(&pmd->bond_mutex);
7568
0
    cmap_init(&pmd->flow_table);
7569
0
    cmap_init(&pmd->classifiers);
7570
0
    cmap_init(&pmd->simple_match_table);
7571
0
    ccmap_init(&pmd->n_flows);
7572
0
    ccmap_init(&pmd->n_simple_flows);
7573
0
    pmd->ctx.last_rxq = NULL;
7574
0
    pmd_thread_ctx_time_update(pmd);
7575
0
    pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
7576
0
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
7577
0
    pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
7578
0
    pmd->busy_cycles_intrvl = xzalloc(PMD_INTERVAL_MAX *
7579
0
                                      sizeof *pmd->busy_cycles_intrvl);
7580
0
    hmap_init(&pmd->poll_list);
7581
0
    hmap_init(&pmd->tx_ports);
7582
0
    hmap_init(&pmd->tnl_port_cache);
7583
0
    hmap_init(&pmd->send_port_cache);
7584
0
    cmap_init(&pmd->tx_bonds);
7585
7586
    /* Initialize DPIF function pointer to the default configured version. */
7587
0
    atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default());
7588
7589
    /* Init default miniflow_extract function */
7590
0
    atomic_init(&pmd->miniflow_extract_opt, dp_mfex_impl_get_default());
7591
7592
    /* init the 'flow_cache' since there is no
7593
     * actual thread created for NON_PMD_CORE_ID. */
7594
0
    if (core_id == NON_PMD_CORE_ID) {
7595
0
        dfc_cache_init(&pmd->flow_cache);
7596
0
        pmd_alloc_static_tx_qid(pmd);
7597
0
    }
7598
0
    pmd_perf_stats_init(&pmd->perf_stats);
7599
0
    cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
7600
0
                hash_int(core_id, 0));
7601
0
}
7602
7603
static void
7604
dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
7605
0
{
7606
0
    struct dpcls *cls;
7607
7608
0
    dp_netdev_pmd_flow_flush(pmd);
7609
0
    hmap_destroy(&pmd->send_port_cache);
7610
0
    hmap_destroy(&pmd->tnl_port_cache);
7611
0
    hmap_destroy(&pmd->tx_ports);
7612
0
    cmap_destroy(&pmd->tx_bonds);
7613
0
    hmap_destroy(&pmd->poll_list);
7614
0
    free(pmd->busy_cycles_intrvl);
7615
    /* All flows (including their dpcls_rules) have been deleted already */
7616
0
    CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
7617
0
        dpcls_destroy(cls);
7618
0
        ovsrcu_postpone(free, cls);
7619
0
    }
7620
0
    cmap_destroy(&pmd->classifiers);
7621
0
    cmap_destroy(&pmd->flow_table);
7622
0
    cmap_destroy(&pmd->simple_match_table);
7623
0
    ccmap_destroy(&pmd->n_flows);
7624
0
    ccmap_destroy(&pmd->n_simple_flows);
7625
0
    ovs_mutex_destroy(&pmd->flow_mutex);
7626
0
    seq_destroy(pmd->reload_seq);
7627
0
    ovs_mutex_destroy(&pmd->port_mutex);
7628
0
    ovs_mutex_destroy(&pmd->bond_mutex);
7629
0
    free(pmd->netdev_input_func_userdata);
7630
0
    free(pmd);
7631
0
}
7632
7633
/* Stops the pmd thread, removes it from the 'dp->poll_threads',
7634
 * and unrefs the struct. */
7635
static void
7636
dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
7637
0
{
7638
    /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
7639
     * but extra cleanup is necessary */
7640
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
7641
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
7642
0
        dfc_cache_uninit(&pmd->flow_cache);
7643
0
        pmd_free_cached_ports(pmd);
7644
0
        pmd_free_static_tx_qid(pmd);
7645
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
7646
0
    } else {
7647
0
        atomic_store_relaxed(&pmd->exit, true);
7648
0
        dp_netdev_reload_pmd__(pmd);
7649
0
        xpthread_join(pmd->thread, NULL);
7650
0
    }
7651
7652
0
    dp_netdev_pmd_clear_ports(pmd);
7653
7654
    /* Purges the 'pmd''s flows after stopping the thread, but before
7655
     * destroying the flows, so that the flow stats can be collected. */
7656
0
    if (dp->dp_purge_cb) {
7657
0
        dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
7658
0
    }
7659
0
    cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
7660
0
    dp_netdev_pmd_unref(pmd);
7661
0
}
7662
7663
/* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
7664
 * thread. */
7665
static void
7666
dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
7667
0
{
7668
0
    struct dp_netdev_pmd_thread *pmd;
7669
0
    struct dp_netdev_pmd_thread **pmd_list;
7670
0
    size_t k = 0, n_pmds;
7671
7672
0
    n_pmds = cmap_count(&dp->poll_threads);
7673
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
7674
7675
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
7676
0
        if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
7677
0
            continue;
7678
0
        }
7679
        /* We cannot call dp_netdev_del_pmd(), since it alters
7680
         * 'dp->poll_threads' (while we're iterating it) and it
7681
         * might quiesce. */
7682
0
        ovs_assert(k < n_pmds);
7683
0
        pmd_list[k++] = pmd;
7684
0
    }
7685
7686
0
    for (size_t i = 0; i < k; i++) {
7687
0
        dp_netdev_del_pmd(dp, pmd_list[i]);
7688
0
    }
7689
0
    free(pmd_list);
7690
0
}
7691
7692
/* Deletes all rx queues from pmd->poll_list and all the ports from
7693
 * pmd->tx_ports. */
7694
static void
7695
dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
7696
0
{
7697
0
    struct rxq_poll *poll;
7698
0
    struct tx_port *port;
7699
0
    struct tx_bond *tx;
7700
7701
0
    ovs_mutex_lock(&pmd->port_mutex);
7702
0
    HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
7703
0
        free(poll);
7704
0
    }
7705
0
    HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
7706
0
        free(port->txq_pkts);
7707
0
        free(port);
7708
0
    }
7709
0
    ovs_mutex_unlock(&pmd->port_mutex);
7710
7711
0
    ovs_mutex_lock(&pmd->bond_mutex);
7712
0
    CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
7713
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
7714
0
        ovsrcu_postpone(free, tx);
7715
0
    }
7716
0
    ovs_mutex_unlock(&pmd->bond_mutex);
7717
0
}
7718
7719
/* Adds rx queue to poll_list of PMD thread, if it's not there already. */
7720
static void
7721
dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
7722
                         struct dp_netdev_rxq *rxq)
7723
    OVS_REQUIRES(pmd->port_mutex)
7724
0
{
7725
0
    int qid = netdev_rxq_get_queue_id(rxq->rx);
7726
0
    uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
7727
0
    struct rxq_poll *poll;
7728
7729
0
    HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
7730
0
        if (poll->rxq == rxq) {
7731
            /* 'rxq' is already polled by this thread. Do nothing. */
7732
0
            return;
7733
0
        }
7734
0
    }
7735
7736
0
    poll = xmalloc(sizeof *poll);
7737
0
    poll->rxq = rxq;
7738
0
    hmap_insert(&pmd->poll_list, &poll->node, hash);
7739
7740
0
    pmd->need_reload = true;
7741
0
}
7742
7743
/* Delete 'poll' from poll_list of PMD thread. */
7744
static void
7745
dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
7746
                           struct rxq_poll *poll)
7747
    OVS_REQUIRES(pmd->port_mutex)
7748
0
{
7749
0
    hmap_remove(&pmd->poll_list, &poll->node);
7750
0
    free(poll);
7751
7752
0
    pmd->need_reload = true;
7753
0
}
7754
7755
/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
7756
 * changes to take effect. */
7757
static void
7758
dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
7759
                             struct dp_netdev_port *port)
7760
    OVS_REQUIRES(pmd->port_mutex)
7761
0
{
7762
0
    struct tx_port *tx;
7763
7764
0
    tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
7765
0
    if (tx) {
7766
        /* 'port' is already in this thread's tx cache. Do nothing. */
7767
0
        return;
7768
0
    }
7769
7770
0
    tx = xzalloc(sizeof *tx);
7771
7772
0
    tx->port = port;
7773
0
    tx->qid = -1;
7774
0
    tx->flush_time = 0LL;
7775
0
    dp_packet_batch_init(&tx->output_pkts);
7776
7777
0
    if (tx->port->txq_mode == TXQ_MODE_XPS_HASH) {
7778
0
        int i, n_txq = netdev_n_txq(tx->port->netdev);
7779
7780
0
        tx->txq_pkts = xzalloc(n_txq * sizeof *tx->txq_pkts);
7781
0
        for (i = 0; i < n_txq; i++) {
7782
0
            dp_packet_batch_init(&tx->txq_pkts[i]);
7783
0
        }
7784
0
    }
7785
7786
0
    hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
7787
0
    pmd->need_reload = true;
7788
0
}
7789
7790
/* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
7791
 * changes to take effect. */
7792
static void
7793
dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
7794
                               struct tx_port *tx)
7795
    OVS_REQUIRES(pmd->port_mutex)
7796
0
{
7797
0
    hmap_remove(&pmd->tx_ports, &tx->node);
7798
0
    free(tx->txq_pkts);
7799
0
    free(tx);
7800
0
    pmd->need_reload = true;
7801
0
}
7802
7803
/* Add bond to the tx bond cmap of 'pmd'. */
7804
static void
7805
dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
7806
                             struct tx_bond *bond, bool update)
7807
    OVS_EXCLUDED(pmd->bond_mutex)
7808
0
{
7809
0
    struct tx_bond *tx;
7810
7811
0
    ovs_mutex_lock(&pmd->bond_mutex);
7812
0
    tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
7813
7814
0
    if (tx && !update) {
7815
        /* It's not an update and the entry already exists.  Do nothing. */
7816
0
        goto unlock;
7817
0
    }
7818
7819
0
    if (tx) {
7820
0
        struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);
7821
7822
        /* Copy the stats for each bucket. */
7823
0
        for (int i = 0; i < BOND_BUCKETS; i++) {
7824
0
            uint64_t n_packets, n_bytes;
7825
7826
0
            atomic_read_relaxed(&tx->member_buckets[i].n_packets, &n_packets);
7827
0
            atomic_read_relaxed(&tx->member_buckets[i].n_bytes, &n_bytes);
7828
0
            atomic_init(&new_tx->member_buckets[i].n_packets, n_packets);
7829
0
            atomic_init(&new_tx->member_buckets[i].n_bytes, n_bytes);
7830
0
        }
7831
0
        cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
7832
0
                     hash_bond_id(bond->bond_id));
7833
0
        ovsrcu_postpone(free, tx);
7834
0
    } else {
7835
0
        tx = xmemdup(bond, sizeof *bond);
7836
0
        cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
7837
0
    }
7838
0
unlock:
7839
0
    ovs_mutex_unlock(&pmd->bond_mutex);
7840
0
}
7841
7842
/* Delete bond from the tx bond cmap of 'pmd'. */
7843
static void
7844
dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
7845
                               uint32_t bond_id)
7846
    OVS_EXCLUDED(pmd->bond_mutex)
7847
0
{
7848
0
    struct tx_bond *tx;
7849
7850
0
    ovs_mutex_lock(&pmd->bond_mutex);
7851
0
    tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
7852
0
    if (tx) {
7853
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
7854
0
        ovsrcu_postpone(free, tx);
7855
0
    }
7856
0
    ovs_mutex_unlock(&pmd->bond_mutex);
7857
0
}
7858

7859
static char *
7860
dpif_netdev_get_datapath_version(void)
7861
0
{
7862
0
     return xstrdup("<built-in>");
7863
0
}
7864
7865
static void
7866
dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
7867
                    uint16_t tcp_flags, long long now)
7868
0
{
7869
0
    uint16_t flags;
7870
7871
0
    atomic_store_relaxed(&netdev_flow->stats.used, now);
7872
0
    non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
7873
0
    non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
7874
0
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
7875
0
    flags |= tcp_flags;
7876
0
    atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
7877
0
}
7878
7879
static int
7880
dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
7881
                 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
7882
                 enum dpif_upcall_type type, const struct nlattr *userdata,
7883
                 struct ofpbuf *actions, struct ofpbuf *put_actions)
7884
0
{
7885
0
    struct dp_netdev *dp = pmd->dp;
7886
7887
0
    if (OVS_UNLIKELY(!dp->upcall_cb)) {
7888
0
        return ENODEV;
7889
0
    }
7890
7891
0
    if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
7892
0
        struct ds ds = DS_EMPTY_INITIALIZER;
7893
0
        char *packet_str;
7894
0
        struct ofpbuf key;
7895
0
        struct odp_flow_key_parms odp_parms = {
7896
0
            .flow = flow,
7897
0
            .mask = wc ? &wc->masks : NULL,
7898
0
            .support = dp_netdev_support,
7899
0
        };
7900
7901
0
        ofpbuf_init(&key, 0);
7902
0
        odp_flow_key_from_flow(&odp_parms, &key);
7903
0
        packet_str = ofp_dp_packet_to_string(packet_);
7904
7905
0
        odp_flow_key_format(key.data, key.size, &ds);
7906
7907
0
        VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
7908
0
                 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
7909
7910
0
        ofpbuf_uninit(&key);
7911
0
        free(packet_str);
7912
7913
0
        ds_destroy(&ds);
7914
0
    }
7915
7916
0
    return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
7917
0
                         actions, wc, put_actions, dp->upcall_aux);
7918
0
}
7919
7920
static inline uint32_t
7921
dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
7922
                                const struct miniflow *mf)
7923
0
{
7924
0
    uint32_t hash, recirc_depth;
7925
7926
0
    if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
7927
0
        hash = dp_packet_get_rss_hash(packet);
7928
0
    } else {
7929
0
        hash = miniflow_hash_5tuple(mf, 0);
7930
0
        dp_packet_set_rss_hash(packet, hash);
7931
0
    }
7932
7933
    /* The RSS hash must account for the recirculation depth to avoid
7934
     * collisions in the exact match cache. */
7935
0
    recirc_depth = *recirc_depth_get_unsafe();
7936
0
    if (OVS_UNLIKELY(recirc_depth)) {
7937
0
        hash = hash_finish(hash, recirc_depth);
7938
0
    }
7939
0
    return hash;
7940
0
}
7941
7942
struct packet_batch_per_flow {
7943
    unsigned int byte_count;
7944
    uint16_t tcp_flags;
7945
    struct dp_netdev_flow *flow;
7946
7947
    struct dp_packet_batch array;
7948
};
7949
7950
static inline void
7951
packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
7952
                             struct dp_packet *packet,
7953
                             uint16_t tcp_flags)
7954
0
{
7955
0
    batch->byte_count += dp_packet_size(packet);
7956
0
    batch->tcp_flags |= tcp_flags;
7957
0
    dp_packet_batch_add(&batch->array, packet);
7958
0
}
7959
7960
static inline void
7961
packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
7962
                           struct dp_netdev_flow *flow)
7963
0
{
7964
0
    flow->batch = batch;
7965
7966
0
    batch->flow = flow;
7967
0
    dp_packet_batch_init(&batch->array);
7968
0
    batch->byte_count = 0;
7969
0
    batch->tcp_flags = 0;
7970
0
}
7971
7972
static inline void
7973
packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
7974
                              struct dp_netdev_pmd_thread *pmd)
7975
0
{
7976
0
    struct dp_netdev_actions *actions;
7977
0
    struct dp_netdev_flow *flow = batch->flow;
7978
7979
0
    dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
7980
0
                        batch->byte_count,
7981
0
                        batch->tcp_flags, pmd->ctx.now / 1000);
7982
7983
0
    actions = dp_netdev_flow_get_actions(flow);
7984
7985
0
    dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
7986
0
                              actions->actions, actions->size);
7987
0
}
7988
7989
void
7990
dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
7991
                        struct dp_packet_batch *packets,
7992
                        struct dpcls_rule *rule,
7993
                        uint32_t bytes,
7994
                        uint16_t tcp_flags)
7995
0
{
7996
    /* Gets action* from the rule. */
7997
0
    struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule);
7998
0
    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
7999
8000
0
    dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes,
8001
0
                        tcp_flags, pmd->ctx.now / 1000);
8002
0
    const uint32_t steal = 1;
8003
0
    dp_netdev_execute_actions(pmd, packets, steal, &flow->flow,
8004
0
                              actions->actions, actions->size);
8005
0
}
8006
8007
static inline void
8008
dp_netdev_queue_batches(struct dp_packet *pkt,
8009
                        struct dp_netdev_flow *flow, uint16_t tcp_flags,
8010
                        struct packet_batch_per_flow *batches,
8011
                        size_t *n_batches)
8012
0
{
8013
0
    struct packet_batch_per_flow *batch = flow->batch;
8014
8015
0
    if (OVS_UNLIKELY(!batch)) {
8016
0
        batch = &batches[(*n_batches)++];
8017
0
        packet_batch_per_flow_init(batch, flow);
8018
0
    }
8019
8020
0
    packet_batch_per_flow_update(batch, pkt, tcp_flags);
8021
0
}
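dp_netdev_queue_batches() and the packet_batch_per_flow_*() helpers implement per-flow batching: the first packet of a flow in the current input batch claims a batch slot (cached in flow->batch), later packets of the same flow append to it, and the flow's actions then execute once per batch rather than once per packet. A compact, self-contained sketch of the grouping step with hypothetical toy_* types; as in dp_netdev_input__(), the caller is expected to clear flow->batch again before executing the batches.

#include <stddef.h>

#define TOY_MAX_PKTS 32

struct toy_batch;
struct toy_flow  { struct toy_batch *batch; /* NULL when not yet batched. */ };
struct toy_batch { struct toy_flow *flow; void *pkts[TOY_MAX_PKTS]; size_t n; };

static void
toy_queue_batches(void *pkt, struct toy_flow *flow,
                  struct toy_batch *batches, size_t *n_batches)
{
    struct toy_batch *b = flow->batch;

    if (!b) {                         /* First packet of this flow. */
        b = &batches[(*n_batches)++];
        b->flow = flow;
        b->n = 0;
        flow->batch = b;              /* Remember the slot for later packets. */
    }
    b->pkts[b->n++] = pkt;            /* Append; actions run once per batch. */
}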
8022
8023
static inline void
8024
packet_enqueue_to_flow_map(struct dp_packet *packet,
8025
                           struct dp_netdev_flow *flow,
8026
                           uint16_t tcp_flags,
8027
                           struct dp_packet_flow_map *flow_map,
8028
                           size_t index)
8029
0
{
8030
0
    struct dp_packet_flow_map *map = &flow_map[index];
8031
0
    map->flow = flow;
8032
0
    map->packet = packet;
8033
0
    map->tcp_flags = tcp_flags;
8034
0
}
8035
8036
/* SMC lookup function for a batch of packets.
8037
 * By batching the SMC lookups, we can use prefetch
8038
 * to hide memory access latency.
8039
 */
8040
static inline void
8041
smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
8042
            struct netdev_flow_key *keys,
8043
            struct netdev_flow_key **missed_keys,
8044
            struct dp_packet_batch *packets_,
8045
            const int cnt,
8046
            struct dp_packet_flow_map *flow_map,
8047
            uint8_t *index_map)
8048
0
{
8049
0
    int i;
8050
0
    struct dp_packet *packet;
8051
0
    size_t n_smc_hit = 0, n_missed = 0;
8052
0
    struct dfc_cache *cache = &pmd->flow_cache;
8053
0
    struct smc_cache *smc_cache = &cache->smc_cache;
8054
0
    const struct cmap_node *flow_node;
8055
0
    int recv_idx;
8056
0
    uint16_t tcp_flags;
8057
8058
    /* Prefetch buckets for all packets */
8059
0
    for (i = 0; i < cnt; i++) {
8060
0
        OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
8061
0
    }
8062
8063
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
8064
0
        struct dp_netdev_flow *flow = NULL;
8065
0
        flow_node = smc_entry_get(pmd, keys[i].hash);
8066
0
        bool hit = false;
8067
        /* Get the original order of this packet in received batch. */
8068
0
        recv_idx = index_map[i];
8069
8070
0
        if (OVS_LIKELY(flow_node != NULL)) {
8071
0
            CMAP_NODE_FOR_EACH (flow, node, flow_node) {
8072
                /* Since we don't have per-port megaflow to check the port
8073
                 * number, we need to verify that the input ports match. */
8074
0
                if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
8075
0
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
8076
0
                    tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
8077
8078
                    /* SMC hit and EMC miss, so insert into the EMC. */
8079
0
                    keys[i].len =
8080
0
                        netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
8081
0
                    emc_probabilistic_insert(pmd, &keys[i], flow);
8082
                    /* Add these packets into the flow map in the same order
8083
                     * as received.
8084
                     */
8085
0
                    packet_enqueue_to_flow_map(packet, flow, tcp_flags,
8086
0
                                               flow_map, recv_idx);
8087
0
                    n_smc_hit++;
8088
0
                    hit = true;
8089
0
                    break;
8090
0
                }
8091
0
            }
8092
0
            if (hit) {
8093
0
                continue;
8094
0
            }
8095
0
        }
8096
8097
        /* SMC missed. Group missed packets together at
8098
         * the beginning of the 'packets' array. */
8099
0
        dp_packet_batch_refill(packets_, packet, i);
8100
8101
        /* Preserve the order of packet for flow batching. */
8102
0
        index_map[n_missed] = recv_idx;
8103
8104
        /* Put missed keys into the pointer array returned to the caller. */
8105
0
        missed_keys[n_missed++] = &keys[i];
8106
0
    }
8107
8108
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
8109
0
}
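smc_lookup_batch() first issues OVS_PREFETCH() for every bucket it is about to read and only then walks the packets, so the memory latency for later packets overlaps with processing of earlier ones. The same prefetch-then-process shape as a self-contained sketch; __builtin_prefetch is assumed to be available (GCC/Clang), and the bucket layout here is illustrative, not the real SMC layout.

#include <stddef.h>
#include <stdint.h>

struct toy_bucket { uint32_t sig[4]; uint16_t idx[4]; };

static int
toy_lookup_batch(const struct toy_bucket *buckets, uint32_t mask,
                 const uint32_t *hashes, int *out_idx, size_t cnt)
{
    size_t i;
    int hits = 0;

    /* Pass 1: issue prefetches for every bucket we will touch. */
    for (i = 0; i < cnt; i++) {
        __builtin_prefetch(&buckets[hashes[i] & mask]);
    }

    /* Pass 2: by now most buckets should already be in cache. */
    for (i = 0; i < cnt; i++) {
        const struct toy_bucket *b = &buckets[hashes[i] & mask];
        uint32_t sig = hashes[i] >> 16;
        size_t j;

        out_idx[i] = -1;
        for (j = 0; j < 4; j++) {
            if (b->sig[j] == sig) {
                out_idx[i] = b->idx[j];
                hits++;
                break;
            }
        }
    }
    return hits;
}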
8110
8111
struct dp_netdev_flow *
8112
smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
8113
                  struct dp_packet *packet,
8114
                  struct netdev_flow_key *key)
8115
0
{
8116
0
    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
8117
8118
0
    if (OVS_LIKELY(flow_node != NULL)) {
8119
0
        struct dp_netdev_flow *flow = NULL;
8120
8121
0
        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
8122
            /* Since we don't have per-port megaflow to check the port
8123
             * number, we need to verify that the input ports match. */
8124
0
            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
8125
0
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
8126
8127
0
                return (void *) flow;
8128
0
            }
8129
0
        }
8130
0
    }
8131
8132
0
    return NULL;
8133
0
}
8134
8135
inline int
8136
dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
8137
                  struct dp_packet *packet,
8138
                  struct dp_netdev_flow **flow)
8139
0
{
8140
0
    uint32_t mark;
8141
8142
#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
8143
    /* Restore the packet if HW processing was terminated before completion. */
8144
    struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq;
8145
    bool miss_api_supported;
8146
8147
    atomic_read_relaxed(&rxq->port->netdev->hw_info.miss_api_supported,
8148
                        &miss_api_supported);
8149
    if (miss_api_supported) {
8150
        int err = netdev_hw_miss_packet_recover(rxq->port->netdev, packet);
8151
        if (err && err != EOPNOTSUPP) {
8152
            COVERAGE_INC(datapath_drop_hw_miss_recover);
8153
            return -1;
8154
        }
8155
    }
8156
#endif
8157
8158
    /* If no mark, no flow to find. */
8159
0
    if (!dp_packet_has_flow_mark(packet, &mark)) {
8160
0
        *flow = NULL;
8161
0
        return 0;
8162
0
    }
8163
8164
0
    *flow = mark_to_flow_find(pmd, mark);
8165
0
    return 0;
8166
0
}
8167
8168
/* Enqueues already classified packet into per-flow batches or the flow map,
8169
 * depending on whether batching is enabled. */
8170
static inline void
8171
dfc_processing_enqueue_classified_packet(struct dp_packet *packet,
8172
                                         struct dp_netdev_flow *flow,
8173
                                         uint16_t tcp_flags,
8174
                                         bool batch_enable,
8175
                                         struct packet_batch_per_flow *batches,
8176
                                         size_t *n_batches,
8177
                                         struct dp_packet_flow_map *flow_map,
8178
                                         size_t *map_cnt)
8179
8180
0
{
8181
0
    if (OVS_LIKELY(batch_enable)) {
8182
0
        dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
8183
0
                                n_batches);
8184
0
    } else {
8185
        /* Flow batching should be performed only after fast-path
8186
         * processing is also completed for packets with an EMC miss,
8187
         * or else it will result in reordering of packets with
8188
         * the same datapath flow. */
8189
0
        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
8190
0
                                   flow_map, (*map_cnt)++);
8191
0
    }
8192
8193
0
}
8194
8195
/* Try to process all ('cnt') the 'packets' using only the datapath flow cache
8196
 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
8197
 * miniflow is copied into 'keys' and the packet pointer is moved to the
8198
 * beginning of the 'packets' array. The pointers of missed keys are put in the
8199
 * missed_keys pointer array for future processing.
8200
 *
8201
 * The function returns the number of packets that need to be processed in the
8202
 * 'packets' array (they have been moved to the beginning of the vector).
8203
 *
8204
 * For performance reasons a caller may choose not to initialize the metadata
8205
 * in 'packets_'.  If 'md_is_valid' is false, the metadata in 'packets'
8206
 * is not valid and must be initialized by this function using 'port_no'.
8207
 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
8208
 * will be ignored.
8209
 */
8210
static inline size_t
8211
dfc_processing(struct dp_netdev_pmd_thread *pmd,
8212
               struct dp_packet_batch *packets_,
8213
               struct netdev_flow_key *keys,
8214
               struct netdev_flow_key **missed_keys,
8215
               struct packet_batch_per_flow batches[], size_t *n_batches,
8216
               struct dp_packet_flow_map *flow_map,
8217
               size_t *n_flows, uint8_t *index_map,
8218
               bool md_is_valid, odp_port_t port_no)
8219
0
{
8220
0
    const bool netdev_flow_api = netdev_is_flow_api_enabled();
8221
0
    const uint32_t recirc_depth = *recirc_depth_get();
8222
0
    const size_t cnt = dp_packet_batch_size(packets_);
8223
0
    size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0;
8224
0
    size_t n_mfex_opt_hit = 0, n_simple_hit = 0;
8225
0
    struct dfc_cache *cache = &pmd->flow_cache;
8226
0
    struct netdev_flow_key *key = &keys[0];
8227
0
    struct dp_packet *packet;
8228
0
    size_t map_cnt = 0;
8229
0
    bool batch_enable = true;
8230
8231
0
    const bool simple_match_enabled =
8232
0
        !md_is_valid && dp_netdev_simple_match_enabled(pmd, port_no);
8233
    /* 'simple_match_table' is a full flow table.  If the flow is not there,
8234
     * upcall is required, and there is no chance to find a match in caches. */
8235
0
    const bool smc_enable_db = !simple_match_enabled && pmd->ctx.smc_enable_db;
8236
0
    const uint32_t cur_min = simple_match_enabled
8237
0
                             ? 0 : pmd->ctx.emc_insert_min;
8238
8239
0
    pmd_perf_update_counter(&pmd->perf_stats,
8240
0
                            md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
8241
0
                            cnt);
8242
0
    int i;
8243
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
8244
0
        struct dp_netdev_flow *flow = NULL;
8245
0
        uint16_t tcp_flags;
8246
8247
0
        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
8248
0
            dp_packet_delete(packet);
8249
0
            COVERAGE_INC(datapath_drop_rx_invalid_packet);
8250
0
            continue;
8251
0
        }
8252
8253
0
        if (i != cnt - 1) {
8254
0
            struct dp_packet **packets = packets_->packets;
8255
            /* Prefetch next packet data and metadata. */
8256
0
            OVS_PREFETCH(dp_packet_data(packets[i+1]));
8257
0
            pkt_metadata_prefetch_init(&packets[i+1]->md);
8258
0
        }
8259
8260
0
        if (!md_is_valid) {
8261
0
            pkt_metadata_init(&packet->md, port_no);
8262
0
        }
8263
8264
0
        if (netdev_flow_api && recirc_depth == 0) {
8265
0
            if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &flow))) {
8266
                /* Packet restoration failed and it was dropped, do not
8267
                 * continue processing.
8268
                 */
8269
0
                continue;
8270
0
            }
8271
0
            if (OVS_LIKELY(flow)) {
8272
0
                tcp_flags = parse_tcp_flags(packet, NULL, NULL, NULL);
8273
0
                n_phwol_hit++;
8274
0
                dfc_processing_enqueue_classified_packet(
8275
0
                        packet, flow, tcp_flags, batch_enable,
8276
0
                        batches, n_batches, flow_map, &map_cnt);
8277
0
                continue;
8278
0
            }
8279
0
        }
8280
8281
0
        if (!flow && simple_match_enabled) {
8282
0
            ovs_be16 dl_type = 0, vlan_tci = 0;
8283
0
            uint8_t nw_frag = 0;
8284
8285
0
            tcp_flags = parse_tcp_flags(packet, &dl_type, &nw_frag, &vlan_tci);
8286
0
            flow = dp_netdev_simple_match_lookup(pmd, port_no, dl_type,
8287
0
                                                 nw_frag, vlan_tci);
8288
0
            if (OVS_LIKELY(flow)) {
8289
0
                n_simple_hit++;
8290
0
                dfc_processing_enqueue_classified_packet(
8291
0
                        packet, flow, tcp_flags, batch_enable,
8292
0
                        batches, n_batches, flow_map, &map_cnt);
8293
0
                continue;
8294
0
            }
8295
0
        }
8296
8297
0
        miniflow_extract(packet, &key->mf);
8298
0
        key->len = 0; /* Not computed yet. */
8299
0
        key->hash =
8300
0
                (md_is_valid == false)
8301
0
                ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
8302
0
                : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
8303
8304
        /* If the EMC is disabled, skip emc_lookup. */
8305
0
        flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
8306
0
        if (OVS_LIKELY(flow)) {
8307
0
            tcp_flags = miniflow_get_tcp_flags(&key->mf);
8308
0
            n_emc_hit++;
8309
0
            dfc_processing_enqueue_classified_packet(
8310
0
                    packet, flow, tcp_flags, batch_enable,
8311
0
                    batches, n_batches, flow_map, &map_cnt);
8312
0
        } else {
8313
            /* Exact match cache missed. Group missed packets together at
8314
             * the beginning of the 'packets' array. */
8315
0
            dp_packet_batch_refill(packets_, packet, i);
8316
8317
            /* Preserve the order of packet for flow batching. */
8318
0
            index_map[n_missed] = map_cnt;
8319
0
            flow_map[map_cnt++].flow = NULL;
8320
8321
            /* 'key[n_missed]' contains the key of the current packet and it
8322
             * will be passed to SMC lookup. The next key should be extracted
8323
             * to 'keys[n_missed + 1]'.
8324
             * We also maintain a pointer array to keys that missed both SMC and EMC,
8325
             * which will be returned to the caller for future processing. */
8326
0
            missed_keys[n_missed] = key;
8327
0
            key = &keys[++n_missed];
8328
8329
            /* Skip batching for subsequent packets to avoid reordering. */
8330
0
            batch_enable = false;
8331
0
        }
8332
0
    }
8333
    /* Count of packets which are not flow batched. */
8334
0
    *n_flows = map_cnt;
8335
8336
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, n_phwol_hit);
8337
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MFEX_OPT_HIT,
8338
0
                            n_mfex_opt_hit);
8339
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_HIT,
8340
0
                            n_simple_hit);
8341
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
8342
8343
0
    if (!smc_enable_db) {
8344
0
        return dp_packet_batch_size(packets_);
8345
0
    }
8346
8347
    /* Packets that missed the EMC do a batch lookup in the SMC, if enabled. */
8348
0
    smc_lookup_batch(pmd, keys, missed_keys, packets_,
8349
0
                     n_missed, flow_map, index_map);
8350
8351
0
    return dp_packet_batch_size(packets_);
8352
0
}
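dfc_processing() is a tiered lookup: the cheap caches run first (the optional simple-match table, then the EMC), misses are compacted to the front of the batch, the SMC pass optionally retries them, and whatever remains goes to the megaflow classifier in fast_path_processing(). A stripped-down sketch of that tiering with hypothetical toy_* helpers (declared but intentionally left undefined); note that the real code runs the SMC pass as a separate batched loop so it can prefetch.

#include <stddef.h>

struct toy_pkt;
struct toy_flow;

/* Assumed lookup helpers, cheapest first; each returns NULL on a miss. */
struct toy_flow *toy_emc_lookup(const struct toy_pkt *);
struct toy_flow *toy_smc_lookup(const struct toy_pkt *);
void toy_fast_path(struct toy_pkt **missed, size_t n_missed);
void toy_enqueue(struct toy_pkt *, struct toy_flow *);

static void
toy_classify_batch(struct toy_pkt **pkts, size_t cnt)
{
    struct toy_pkt *missed[32];   /* Assumes cnt stays within a burst-sized bound. */
    size_t n_missed = 0, i;

    for (i = 0; i < cnt; i++) {
        struct toy_flow *flow = toy_emc_lookup(pkts[i]);   /* Tier 1. */

        if (!flow) {
            flow = toy_smc_lookup(pkts[i]);                /* Tier 2. */
        }
        if (flow) {
            toy_enqueue(pkts[i], flow);
        } else {
            missed[n_missed++] = pkts[i];  /* Compact misses together. */
        }
    }
    if (n_missed) {
        toy_fast_path(missed, n_missed);   /* Tier 3: megaflow classifier,
                                            * possibly followed by an upcall. */
    }
}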
8353
8354
static inline int
8355
handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
8356
                     struct dp_packet *packet,
8357
                     const struct netdev_flow_key *key,
8358
                     struct ofpbuf *actions, struct ofpbuf *put_actions)
8359
0
{
8360
0
    struct ofpbuf *add_actions;
8361
0
    struct dp_packet_batch b;
8362
0
    struct match match;
8363
0
    ovs_u128 ufid;
8364
0
    int error;
8365
0
    uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
8366
0
    odp_port_t orig_in_port = packet->md.orig_in_port;
8367
8368
0
    match.tun_md.valid = false;
8369
0
    miniflow_expand(&key->mf, &match.flow);
8370
0
    memset(&match.wc, 0, sizeof match.wc);
8371
8372
0
    ofpbuf_clear(actions);
8373
0
    ofpbuf_clear(put_actions);
8374
8375
0
    odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
8376
0
    error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
8377
0
                             &ufid, DPIF_UC_MISS, NULL, actions,
8378
0
                             put_actions);
8379
0
    if (OVS_UNLIKELY(error && error != ENOSPC)) {
8380
0
        dp_packet_delete(packet);
8381
0
        COVERAGE_INC(datapath_drop_upcall_error);
8382
0
        return error;
8383
0
    }
8384
8385
    /* The Netlink encoding of datapath flow keys cannot express
8386
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
8387
     * tag is interpreted as exact match on the fact that there is no
8388
     * VLAN.  Unless we refactor a lot of code that translates between
8389
     * Netlink and struct flow representations, we have to do the same
8390
     * here.  This must be in sync with 'match' in dpif_netdev_flow_put(). */
8391
0
    if (!match.wc.masks.vlans[0].tci) {
8392
0
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
8393
0
    }
8394
8395
    /* We can't allow the packet batching in the next loop to execute
8396
     * the actions.  Otherwise, if there are any slow path actions,
8397
     * we'll send the packet up twice. */
8398
0
    dp_packet_batch_init_packet(&b, packet);
8399
0
    dp_netdev_execute_actions(pmd, &b, true, &match.flow,
8400
0
                              actions->data, actions->size);
8401
8402
0
    add_actions = put_actions->size ? put_actions : actions;
8403
0
    if (OVS_LIKELY(error != ENOSPC)) {
8404
0
        struct dp_netdev_flow *netdev_flow;
8405
8406
        /* XXX: There's a race window where a flow covering this packet
8407
         * could have already been installed since we last did the flow
8408
         * lookup before upcall.  This could be solved by moving the
8409
         * mutex lock outside the loop, but that's an awful long time
8410
         * to be locking revalidators out of making flow modifications. */
8411
0
        ovs_mutex_lock(&pmd->flow_mutex);
8412
0
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
8413
0
        if (OVS_LIKELY(!netdev_flow)) {
8414
0
            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
8415
0
                                             add_actions->data,
8416
0
                                             add_actions->size, orig_in_port);
8417
0
        }
8418
0
        ovs_mutex_unlock(&pmd->flow_mutex);
8419
0
        uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
8420
0
        smc_insert(pmd, key, hash);
8421
0
        emc_probabilistic_insert(pmd, key, netdev_flow);
8422
0
    }
8423
0
    if (pmd_perf_metrics_enabled(pmd)) {
8424
        /* Update upcall stats. */
8425
0
        cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
8426
0
        struct pmd_perf_stats *s = &pmd->perf_stats;
8427
0
        s->current.upcalls++;
8428
0
        s->current.upcall_cycles += cycles;
8429
0
        histogram_add_sample(&s->cycles_per_upcall, cycles);
8430
0
    }
8431
0
    return error;
8432
0
}
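The XXX comment above describes the race this code closes: another thread may install a flow for the same key between the failed lookup and the upcall, so the insert re-checks under pmd->flow_mutex and only adds when the lookup still misses. That lock, re-check, and insert shape in a generic form, with hypothetical toy_* names:

#include <pthread.h>
#include <stddef.h>

struct toy_table;
struct toy_flow;

/* Assumed helpers: a lookup and an insert over the same table. */
struct toy_flow *toy_lookup(struct toy_table *, const void *key);
struct toy_flow *toy_insert(struct toy_table *, const void *key);

static struct toy_flow *
toy_find_or_add(struct toy_table *tbl, pthread_mutex_t *mutex, const void *key)
{
    struct toy_flow *flow;

    pthread_mutex_lock(mutex);
    flow = toy_lookup(tbl, key);   /* Re-check: another thread may have won. */
    if (!flow) {
        flow = toy_insert(tbl, key);
    }
    pthread_mutex_unlock(mutex);
    return flow;
}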
8433
8434
static inline void
8435
fast_path_processing(struct dp_netdev_pmd_thread *pmd,
8436
                     struct dp_packet_batch *packets_,
8437
                     struct netdev_flow_key **keys,
8438
                     struct dp_packet_flow_map *flow_map,
8439
                     uint8_t *index_map,
8440
                     odp_port_t in_port)
8441
0
{
8442
0
    const size_t cnt = dp_packet_batch_size(packets_);
8443
0
#if !defined(__CHECKER__) && !defined(_WIN32)
8444
0
    const size_t PKT_ARRAY_SIZE = cnt;
8445
#else
8446
    /* Sparse or MSVC doesn't like variable length array. */
8447
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
8448
#endif
8449
0
    struct dp_packet *packet;
8450
0
    struct dpcls *cls;
8451
0
    struct dpcls_rule *rules[PKT_ARRAY_SIZE];
8452
0
    struct dp_netdev *dp = pmd->dp;
8453
0
    int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
8454
0
    int lookup_cnt = 0, add_lookup_cnt;
8455
0
    bool any_miss;
8456
8457
0
    for (size_t i = 0; i < cnt; i++) {
8458
        /* Key length is needed in all cases; the hash is computed on demand. */
8459
0
        keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
8460
0
    }
8461
    /* Get the classifier for the in_port */
8462
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
8463
0
    if (OVS_LIKELY(cls)) {
8464
0
        any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
8465
0
                                rules, cnt, &lookup_cnt);
8466
0
    } else {
8467
0
        any_miss = true;
8468
0
        memset(rules, 0, sizeof(rules));
8469
0
    }
8470
0
    if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
8471
0
        uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
8472
0
        struct ofpbuf actions, put_actions;
8473
8474
0
        ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
8475
0
        ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
8476
8477
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8478
0
            struct dp_netdev_flow *netdev_flow;
8479
8480
0
            if (OVS_LIKELY(rules[i])) {
8481
0
                continue;
8482
0
            }
8483
8484
            /* It's possible that an earlier slow path execution installed
8485
             * a rule covering this flow.  In this case, it's a lot cheaper
8486
             * to catch it here than execute a miss. */
8487
0
            netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
8488
0
                                                    &add_lookup_cnt);
8489
0
            if (netdev_flow) {
8490
0
                lookup_cnt += add_lookup_cnt;
8491
0
                rules[i] = &netdev_flow->cr;
8492
0
                continue;
8493
0
            }
8494
8495
0
            int error = handle_packet_upcall(pmd, packet, keys[i],
8496
0
                                             &actions, &put_actions);
8497
8498
0
            if (OVS_UNLIKELY(error)) {
8499
0
                upcall_fail_cnt++;
8500
0
            } else {
8501
0
                upcall_ok_cnt++;
8502
0
            }
8503
0
        }
8504
8505
0
        ofpbuf_uninit(&actions);
8506
0
        ofpbuf_uninit(&put_actions);
8507
0
        fat_rwlock_unlock(&dp->upcall_rwlock);
8508
0
    } else if (OVS_UNLIKELY(any_miss)) {
8509
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8510
0
            if (OVS_UNLIKELY(!rules[i])) {
8511
0
                dp_packet_delete(packet);
8512
0
                COVERAGE_INC(datapath_drop_lock_error);
8513
0
                upcall_fail_cnt++;
8514
0
            }
8515
0
        }
8516
0
    }
8517
8518
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8519
0
        struct dp_netdev_flow *flow;
8520
        /* Get the original order of this packet in received batch. */
8521
0
        int recv_idx = index_map[i];
8522
0
        uint16_t tcp_flags;
8523
8524
0
        if (OVS_UNLIKELY(!rules[i])) {
8525
0
            continue;
8526
0
        }
8527
8528
0
        flow = dp_netdev_flow_cast(rules[i]);
8529
0
        uint32_t hash =  dp_netdev_flow_hash(&flow->ufid);
8530
0
        smc_insert(pmd, keys[i], hash);
8531
8532
0
        emc_probabilistic_insert(pmd, keys[i], flow);
8533
        /* Add these packets into the flow map in the same order
8534
         * as received.
8535
         */
8536
0
        tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
8537
0
        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
8538
0
                                   flow_map, recv_idx);
8539
0
    }
8540
8541
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
8542
0
                            cnt - upcall_ok_cnt - upcall_fail_cnt);
8543
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
8544
0
                            lookup_cnt);
8545
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
8546
0
                            upcall_ok_cnt);
8547
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
8548
0
                            upcall_fail_cnt);
8549
0
}
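Note the guard on the slow path above: fat_rwlock_tryrdlock() never blocks, so when the upcall lock is unavailable (for instance while upcalls are disabled) the missed packets are dropped and counted rather than stalling the PMD. A minimal sketch of the same non-blocking guard using POSIX rwlocks, which stand in here for OVS's fat_rwlock:

#include <pthread.h>
#include <stdbool.h>

static bool
toy_try_slow_path(pthread_rwlock_t *upcall_lock,
                  void (*do_upcalls)(void *aux), void *aux,
                  unsigned long *drop_counter)
{
    if (pthread_rwlock_tryrdlock(upcall_lock) == 0) {
        do_upcalls(aux);                    /* Safe: we hold a read lock. */
        pthread_rwlock_unlock(upcall_lock);
        return true;
    }
    /* Lock unavailable: account the drop instead of blocking the fast path. */
    ++*drop_counter;
    return false;
}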
8550
8551
/* Packets enter the datapath from a port (or from recirculation) here.
8552
 *
8553
 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
8554
 * When false the metadata in 'packets' need to be initialized. */
8555
static void
8556
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
8557
                  struct dp_packet_batch *packets,
8558
                  bool md_is_valid, odp_port_t port_no)
8559
0
{
8560
0
#if !defined(__CHECKER__) && !defined(_WIN32)
8561
0
    const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
8562
#else
8563
    /* Sparse or MSVC doesn't like variable length array. */
8564
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
8565
#endif
8566
0
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
8567
0
        struct netdev_flow_key keys[PKT_ARRAY_SIZE];
8568
0
    struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
8569
0
    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
8570
0
    size_t n_batches;
8571
0
    struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
8572
0
    uint8_t index_map[PKT_ARRAY_SIZE];
8573
0
    size_t n_flows, i;
8574
8575
0
    odp_port_t in_port;
8576
8577
0
    n_batches = 0;
8578
0
    dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
8579
0
                   flow_map, &n_flows, index_map, md_is_valid, port_no);
8580
8581
0
    if (!dp_packet_batch_is_empty(packets)) {
8582
        /* Get ingress port from first packet's metadata. */
8583
0
        in_port = packets->packets[0]->md.in_port.odp_port;
8584
0
        fast_path_processing(pmd, packets, missed_keys,
8585
0
                             flow_map, index_map, in_port);
8586
0
    }
8587
8588
    /* Batch rest of packets which are in flow map. */
8589
0
    for (i = 0; i < n_flows; i++) {
8590
0
        struct dp_packet_flow_map *map = &flow_map[i];
8591
8592
0
        if (OVS_UNLIKELY(!map->flow)) {
8593
0
            continue;
8594
0
        }
8595
0
        dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
8596
0
                                batches, &n_batches);
8597
0
     }
8598
8599
    /* All the flow batches need to be reset before any call to
8600
     * packet_batch_per_flow_execute() as it could potentially trigger
8601
     * recirculation. When a packet matching flow 'j' happens to be
8602
     * recirculated, the nested call to dp_netdev_input__() could potentially
8603
     * classify the packet as matching another flow - say 'k'. It could happen
8604
     * that in the previous call to dp_netdev_input__() that same flow 'k' had
8605
     * already its own batches[k] still waiting to be served.  So if its
8606
     * 'batch' member is not reset, the recirculated packet would be wrongly
8607
     * appended to batches[k] of the 1st call to dp_netdev_input__(). */
8608
0
    for (i = 0; i < n_batches; i++) {
8609
0
        batches[i].flow->batch = NULL;
8610
0
    }
8611
8612
0
    for (i = 0; i < n_batches; i++) {
8613
0
        packet_batch_per_flow_execute(&batches[i], pmd);
8614
0
    }
8615
0
}
8616
8617
int32_t
8618
dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
8619
                struct dp_packet_batch *packets,
8620
                odp_port_t port_no)
8621
0
{
8622
0
    dp_netdev_input__(pmd, packets, false, port_no);
8623
0
    return 0;
8624
0
}
8625
8626
static void
8627
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
8628
                      struct dp_packet_batch *packets)
8629
0
{
8630
0
    dp_netdev_input__(pmd, packets, true, 0);
8631
0
}
8632
8633
struct dp_netdev_execute_aux {
8634
    struct dp_netdev_pmd_thread *pmd;
8635
    const struct flow *flow;
8636
};
8637
8638
static void
8639
dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
8640
                                 void *aux)
8641
0
{
8642
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8643
0
    dp->dp_purge_aux = aux;
8644
0
    dp->dp_purge_cb = cb;
8645
0
}
8646
8647
static void
8648
dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
8649
                               void *aux)
8650
0
{
8651
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8652
0
    dp->upcall_aux = aux;
8653
0
    dp->upcall_cb = cb;
8654
0
}
8655
8656
static void
8657
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
8658
                               bool purge)
8659
0
{
8660
0
    struct tx_port *tx;
8661
0
    struct dp_netdev_port *port;
8662
0
    long long interval;
8663
8664
0
    HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
8665
0
        if (tx->port->txq_mode != TXQ_MODE_XPS) {
8666
0
            continue;
8667
0
        }
8668
0
        interval = pmd->ctx.now - tx->last_used;
8669
0
        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
8670
0
            port = tx->port;
8671
0
            ovs_mutex_lock(&port->txq_used_mutex);
8672
0
            port->txq_used[tx->qid]--;
8673
0
            ovs_mutex_unlock(&port->txq_used_mutex);
8674
0
            tx->qid = -1;
8675
0
        }
8676
0
    }
8677
0
}
8678
8679
static int
8680
dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
8681
                           struct tx_port *tx)
8682
0
{
8683
0
    struct dp_netdev_port *port;
8684
0
    long long interval;
8685
0
    int i, min_cnt, min_qid;
8686
8687
0
    interval = pmd->ctx.now - tx->last_used;
8688
0
    tx->last_used = pmd->ctx.now;
8689
8690
0
    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
8691
0
        return tx->qid;
8692
0
    }
8693
8694
0
    port = tx->port;
8695
8696
0
    ovs_mutex_lock(&port->txq_used_mutex);
8697
0
    if (tx->qid >= 0) {
8698
0
        port->txq_used[tx->qid]--;
8699
0
        tx->qid = -1;
8700
0
    }
8701
8702
0
    min_cnt = -1;
8703
0
    min_qid = 0;
8704
0
    for (i = 0; i < netdev_n_txq(port->netdev); i++) {
8705
0
        if (port->txq_used[i] < min_cnt || min_cnt == -1) {
8706
0
            min_cnt = port->txq_used[i];
8707
0
            min_qid = i;
8708
0
        }
8709
0
    }
8710
8711
0
    port->txq_used[min_qid]++;
8712
0
    tx->qid = min_qid;
8713
8714
0
    ovs_mutex_unlock(&port->txq_used_mutex);
8715
8716
0
    dpif_netdev_xps_revalidate_pmd(pmd, false);
8717
8718
0
    VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
8719
0
             pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
8720
0
    return min_qid;
8721
0
}
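dpif_netdev_xps_get_tx_qid() caches a queue id per tx_port and, once the cache goes stale, releases its usage count and claims the least-used tx queue while holding port->txq_used_mutex. The selection step in isolation, as a self-contained sketch:

#include <stddef.h>

/* Pick the least-used tx queue and claim it; the caller is assumed to hold
 * the per-port mutex protecting 'txq_used'. */
static int
toy_pick_least_used_txq(int *txq_used, size_t n_txq)
{
    size_t i;
    int min_cnt = -1, min_qid = 0;

    for (i = 0; i < n_txq; i++) {
        if (min_cnt == -1 || txq_used[i] < min_cnt) {
            min_cnt = txq_used[i];
            min_qid = (int) i;
        }
    }
    txq_used[min_qid]++;   /* Claim the queue. */
    return min_qid;
}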
8722
8723
static struct tx_port *
8724
pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
8725
                          odp_port_t port_no)
8726
0
{
8727
0
    return tx_port_lookup(&pmd->tnl_port_cache, port_no);
8728
0
}
8729
8730
static struct tx_port *
8731
pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
8732
                           odp_port_t port_no)
8733
0
{
8734
0
    return tx_port_lookup(&pmd->send_port_cache, port_no);
8735
0
}
8736
8737
static int
8738
push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
8739
                const struct nlattr *attr,
8740
                struct dp_packet_batch *batch)
8741
0
{
8742
0
    struct tx_port *tun_port;
8743
0
    const struct ovs_action_push_tnl *data;
8744
0
    int err;
8745
8746
0
    data = nl_attr_get(attr);
8747
8748
0
    tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
8749
0
    if (!tun_port) {
8750
0
        err = -EINVAL;
8751
0
        goto error;
8752
0
    }
8753
0
    err = netdev_push_header(tun_port->port->netdev, batch, data);
8754
0
    if (!err) {
8755
0
        return 0;
8756
0
    }
8757
0
error:
8758
0
    dp_packet_delete_batch(batch, true);
8759
0
    return err;
8760
0
}
8761
8762
static void
8763
dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
8764
                            struct dp_packet *packet, bool should_steal,
8765
                            struct flow *flow, ovs_u128 *ufid,
8766
                            struct ofpbuf *actions,
8767
                            const struct nlattr *userdata)
8768
0
{
8769
0
    struct dp_packet_batch b;
8770
0
    int error;
8771
8772
0
    ofpbuf_clear(actions);
8773
8774
0
    error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
8775
0
                             DPIF_UC_ACTION, userdata, actions,
8776
0
                             NULL);
8777
0
    if (!error || error == ENOSPC) {
8778
0
        dp_packet_batch_init_packet(&b, packet);
8779
0
        dp_netdev_execute_actions(pmd, &b, should_steal, flow,
8780
0
                                  actions->data, actions->size);
8781
0
    } else if (should_steal) {
8782
0
        dp_packet_delete(packet);
8783
0
        COVERAGE_INC(datapath_drop_userspace_action_error);
8784
0
    }
8785
0
}
8786
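/* Appends 'packets_' to the per-port output batch of 'port_no', cloning the
 * batch first if 'should_steal' is false and flushing the output batch when
 * it would overflow.  Returns false and drops the packets if the port is not
 * in this PMD's send port cache. */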
8787
static bool
8788
dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
8789
                         struct dp_packet_batch *packets_,
8790
                         bool should_steal, odp_port_t port_no)
8791
0
{
8792
0
    struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
8793
0
    struct dp_packet_batch out;
8794
8795
0
    if (!OVS_LIKELY(p)) {
8796
0
        COVERAGE_ADD(datapath_drop_invalid_port,
8797
0
                     dp_packet_batch_size(packets_));
8798
0
        dp_packet_delete_batch(packets_, should_steal);
8799
0
        return false;
8800
0
    }
8801
0
    if (!should_steal) {
8802
0
        dp_packet_batch_clone(&out, packets_);
8803
0
        dp_packet_batch_reset_cutlen(packets_);
8804
0
        packets_ = &out;
8805
0
    }
8806
0
    dp_packet_batch_apply_cutlen(packets_);
8807
#ifdef DPDK_NETDEV
8808
    if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
8809
                     && packets_->packets[0]->source
8810
                        != p->output_pkts.packets[0]->source)) {
8811
        /* XXX: netdev-dpdk assumes that all packets in a single
8812
         *      output batch have the same source. Flush here to
8813
         *      avoid memory access issues. */
8814
        dp_netdev_pmd_flush_output_on_port(pmd, p);
8815
    }
8816
#endif
8817
0
    if (dp_packet_batch_size(&p->output_pkts)
8818
0
        + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
8819
        /* Flush here to avoid overflow. */
8820
0
        dp_netdev_pmd_flush_output_on_port(pmd, p);
8821
0
    }
8822
0
    if (dp_packet_batch_is_empty(&p->output_pkts)) {
8823
0
        pmd->n_output_batches++;
8824
0
    }
8825
8826
0
    struct dp_packet *packet;
8827
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8828
0
        p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
8829
0
            pmd->ctx.last_rxq;
8830
0
        dp_packet_batch_add(&p->output_pkts, packet);
8831
0
    }
8832
0
    return true;
8833
0
}
8834
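/* Selects a bond member bucket for each packet from its RSS hash, outputs the
 * packet on that member and updates the bucket's packet and byte counters.
 * The whole batch is dropped if 'bond' is not known to this PMD. */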
8835
static void
8836
dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
8837
                            struct dp_packet_batch *packets_,
8838
                            bool should_steal, uint32_t bond)
8839
0
{
8840
0
    struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
8841
0
    struct dp_packet_batch out;
8842
0
    struct dp_packet *packet;
8843
8844
0
    if (!p_bond) {
8845
0
        COVERAGE_ADD(datapath_drop_invalid_bond,
8846
0
                     dp_packet_batch_size(packets_));
8847
0
        dp_packet_delete_batch(packets_, should_steal);
8848
0
        return;
8849
0
    }
8850
0
    if (!should_steal) {
8851
0
        dp_packet_batch_clone(&out, packets_);
8852
0
        dp_packet_batch_reset_cutlen(packets_);
8853
0
        packets_ = &out;
8854
0
    }
8855
0
    dp_packet_batch_apply_cutlen(packets_);
8856
8857
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8858
        /*
8859
         * Look up the bond hash table using the hash to get the member.
8860
         */
8861
0
        uint32_t hash = dp_packet_get_rss_hash(packet);
8862
0
        struct member_entry *s_entry
8863
0
            = &p_bond->member_buckets[hash & BOND_MASK];
8864
0
        odp_port_t bond_member = s_entry->member_id;
8865
0
        uint32_t size = dp_packet_size(packet);
8866
0
        struct dp_packet_batch output_pkt;
8867
8868
0
        dp_packet_batch_init_packet(&output_pkt, packet);
8869
0
        if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
8870
0
                                                bond_member))) {
8871
            /* Update member stats. */
8872
0
            non_atomic_ullong_add(&s_entry->n_packets, 1);
8873
0
            non_atomic_ullong_add(&s_entry->n_bytes, size);
8874
0
        }
8875
0
    }
8876
0
}
8877
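/* Callback for odp_execute_actions(): executes a single datapath action 'a'
 * on 'packets_' in the context of the PMD thread stored in 'aux_'. */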
8878
static void
8879
dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
8880
              const struct nlattr *a, bool should_steal)
8881
    OVS_NO_THREAD_SAFETY_ANALYSIS
8882
0
{
8883
0
    struct dp_netdev_execute_aux *aux = aux_;
8884
0
    uint32_t *depth = recirc_depth_get();
8885
0
    struct dp_netdev_pmd_thread *pmd = aux->pmd;
8886
0
    struct dp_netdev *dp = pmd->dp;
8887
0
    int type = nl_attr_type(a);
8888
0
    struct tx_port *p;
8889
0
    uint32_t packet_count, packets_dropped;
8890
8891
0
    switch ((enum ovs_action_attr)type) {
8892
0
    case OVS_ACTION_ATTR_OUTPUT:
8893
0
        dp_execute_output_action(pmd, packets_, should_steal,
8894
0
                                 nl_attr_get_odp_port(a));
8895
0
        return;
8896
8897
0
    case OVS_ACTION_ATTR_LB_OUTPUT:
8898
0
        dp_execute_lb_output_action(pmd, packets_, should_steal,
8899
0
                                    nl_attr_get_u32(a));
8900
0
        return;
8901
8902
0
    case OVS_ACTION_ATTR_TUNNEL_PUSH:
8903
0
        if (should_steal) {
8904
            /* We're requested to push a tunnel header, but we also need to
8905
             * take ownership of these packets. Thus, we can avoid performing
8906
             * the action, because the caller will not use the result anyway.
8907
             * Just break to free the batch. */
8908
0
            break;
8909
0
        }
8910
0
        dp_packet_batch_apply_cutlen(packets_);
8911
0
        packet_count = dp_packet_batch_size(packets_);
8912
0
        if (push_tnl_action(pmd, a, packets_)) {
8913
0
            COVERAGE_ADD(datapath_drop_tunnel_push_error,
8914
0
                         packet_count);
8915
0
        }
8916
0
        return;
8917
8918
0
    case OVS_ACTION_ATTR_TUNNEL_POP:
8919
0
        if (*depth < MAX_RECIRC_DEPTH) {
8920
0
            struct dp_packet_batch *orig_packets_ = packets_;
8921
0
            odp_port_t portno = nl_attr_get_odp_port(a);
8922
8923
0
            p = pmd_tnl_port_cache_lookup(pmd, portno);
8924
0
            if (p) {
8925
0
                struct dp_packet_batch tnl_pkt;
8926
8927
0
                if (!should_steal) {
8928
0
                    dp_packet_batch_clone(&tnl_pkt, packets_);
8929
0
                    packets_ = &tnl_pkt;
8930
0
                    dp_packet_batch_reset_cutlen(orig_packets_);
8931
0
                }
8932
8933
0
                dp_packet_batch_apply_cutlen(packets_);
8934
8935
0
                packet_count = dp_packet_batch_size(packets_);
8936
0
                netdev_pop_header(p->port->netdev, packets_);
8937
0
                packets_dropped =
8938
0
                   packet_count - dp_packet_batch_size(packets_);
8939
0
                if (packets_dropped) {
8940
0
                    COVERAGE_ADD(datapath_drop_tunnel_pop_error,
8941
0
                                 packets_dropped);
8942
0
                }
8943
0
                if (dp_packet_batch_is_empty(packets_)) {
8944
0
                    return;
8945
0
                }
8946
8947
0
                struct dp_packet *packet;
8948
0
                DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8949
0
                    packet->md.in_port.odp_port = portno;
8950
0
                }
8951
8952
0
                (*depth)++;
8953
0
                dp_netdev_recirculate(pmd, packets_);
8954
0
                (*depth)--;
8955
0
                return;
8956
0
            }
8957
0
            COVERAGE_ADD(datapath_drop_invalid_tnl_port,
8958
0
                         dp_packet_batch_size(packets_));
8959
0
        } else {
8960
0
            COVERAGE_ADD(datapath_drop_recirc_error,
8961
0
                         dp_packet_batch_size(packets_));
8962
0
        }
8963
0
        break;
8964
8965
0
    case OVS_ACTION_ATTR_USERSPACE:
8966
0
        if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
8967
0
            struct dp_packet_batch *orig_packets_ = packets_;
8968
0
            const struct nlattr *userdata;
8969
0
            struct dp_packet_batch usr_pkt;
8970
0
            struct ofpbuf actions;
8971
0
            struct flow flow;
8972
0
            ovs_u128 ufid;
8973
0
            bool clone = false;
8974
8975
0
            userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
8976
0
            ofpbuf_init(&actions, 0);
8977
8978
0
            if (packets_->trunc) {
8979
0
                if (!should_steal) {
8980
0
                    dp_packet_batch_clone(&usr_pkt, packets_);
8981
0
                    packets_ = &usr_pkt;
8982
0
                    clone = true;
8983
0
                    dp_packet_batch_reset_cutlen(orig_packets_);
8984
0
                }
8985
8986
0
                dp_packet_batch_apply_cutlen(packets_);
8987
0
            }
8988
8989
0
            struct dp_packet *packet;
8990
0
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8991
0
                flow_extract(packet, &flow);
8992
0
                odp_flow_key_hash(&flow, sizeof flow, &ufid);
8993
0
                dp_execute_userspace_action(pmd, packet, should_steal, &flow,
8994
0
                                            &ufid, &actions, userdata);
8995
0
            }
8996
8997
0
            if (clone) {
8998
0
                dp_packet_delete_batch(packets_, true);
8999
0
            }
9000
9001
0
            ofpbuf_uninit(&actions);
9002
0
            fat_rwlock_unlock(&dp->upcall_rwlock);
9003
9004
0
            return;
9005
0
        }
9006
0
        COVERAGE_ADD(datapath_drop_lock_error,
9007
0
                     dp_packet_batch_size(packets_));
9008
0
        break;
9009
9010
0
    case OVS_ACTION_ATTR_RECIRC:
9011
0
        if (*depth < MAX_RECIRC_DEPTH) {
9012
0
            struct dp_packet_batch recirc_pkts;
9013
9014
0
            if (!should_steal) {
9015
0
               dp_packet_batch_clone(&recirc_pkts, packets_);
9016
0
               packets_ = &recirc_pkts;
9017
0
            }
9018
9019
0
            struct dp_packet *packet;
9020
0
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
9021
0
                packet->md.recirc_id = nl_attr_get_u32(a);
9022
0
            }
9023
9024
0
            (*depth)++;
9025
0
            dp_netdev_recirculate(pmd, packets_);
9026
0
            (*depth)--;
9027
9028
0
            return;
9029
0
        }
9030
9031
0
        COVERAGE_ADD(datapath_drop_recirc_error,
9032
0
                     dp_packet_batch_size(packets_));
9033
0
        VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
9034
0
        break;
9035
9036
0
    case OVS_ACTION_ATTR_CT: {
9037
0
        const struct nlattr *b;
9038
0
        bool force = false;
9039
0
        bool commit = false;
9040
0
        unsigned int left;
9041
0
        uint16_t zone = 0;
9042
0
        uint32_t tp_id = 0;
9043
0
        const char *helper = NULL;
9044
0
        const uint32_t *setmark = NULL;
9045
0
        const struct ovs_key_ct_labels *setlabel = NULL;
9046
0
        struct nat_action_info_t nat_action_info;
9047
0
        struct nat_action_info_t *nat_action_info_ref = NULL;
9048
0
        bool nat_config = false;
9049
9050
0
        NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
9051
0
                                 nl_attr_get_size(a)) {
9052
0
            enum ovs_ct_attr sub_type = nl_attr_type(b);
9053
9054
0
            switch(sub_type) {
9055
0
            case OVS_CT_ATTR_FORCE_COMMIT:
9056
0
                force = true;
9057
                /* fall through. */
9058
0
            case OVS_CT_ATTR_COMMIT:
9059
0
                commit = true;
9060
0
                break;
9061
0
            case OVS_CT_ATTR_ZONE:
9062
0
                zone = nl_attr_get_u16(b);
9063
0
                break;
9064
0
            case OVS_CT_ATTR_HELPER:
9065
0
                helper = nl_attr_get_string(b);
9066
0
                break;
9067
0
            case OVS_CT_ATTR_MARK:
9068
0
                setmark = nl_attr_get(b);
9069
0
                break;
9070
0
            case OVS_CT_ATTR_LABELS:
9071
0
                setlabel = nl_attr_get(b);
9072
0
                break;
9073
0
            case OVS_CT_ATTR_EVENTMASK:
9074
                /* Silently ignored, as userspace datapath does not generate
9075
                 * netlink events. */
9076
0
                break;
9077
0
            case OVS_CT_ATTR_TIMEOUT:
9078
0
                if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) {
9079
0
                    VLOG_WARN("Invalid Timeout Policy ID: %s.",
9080
0
                              nl_attr_get_string(b));
9081
0
                    tp_id = DEFAULT_TP_ID;
9082
0
                }
9083
0
                break;
9084
0
            case OVS_CT_ATTR_NAT: {
9085
0
                const struct nlattr *b_nest;
9086
0
                unsigned int left_nest;
9087
0
                bool ip_min_specified = false;
9088
0
                bool proto_num_min_specified = false;
9089
0
                bool ip_max_specified = false;
9090
0
                bool proto_num_max_specified = false;
9091
0
                memset(&nat_action_info, 0, sizeof nat_action_info);
9092
0
                nat_action_info_ref = &nat_action_info;
9093
9094
0
                NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
9095
0
                    enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
9096
9097
0
                    switch (sub_type_nest) {
9098
0
                    case OVS_NAT_ATTR_SRC:
9099
0
                    case OVS_NAT_ATTR_DST:
9100
0
                        nat_config = true;
9101
0
                        nat_action_info.nat_action |=
9102
0
                            ((sub_type_nest == OVS_NAT_ATTR_SRC)
9103
0
                                ? NAT_ACTION_SRC : NAT_ACTION_DST);
9104
0
                        break;
9105
0
                    case OVS_NAT_ATTR_IP_MIN:
9106
0
                        memcpy(&nat_action_info.min_addr,
9107
0
                               nl_attr_get(b_nest),
9108
0
                               nl_attr_get_size(b_nest));
9109
0
                        ip_min_specified = true;
9110
0
                        break;
9111
0
                    case OVS_NAT_ATTR_IP_MAX:
9112
0
                        memcpy(&nat_action_info.max_addr,
9113
0
                               nl_attr_get(b_nest),
9114
0
                               nl_attr_get_size(b_nest));
9115
0
                        ip_max_specified = true;
9116
0
                        break;
9117
0
                    case OVS_NAT_ATTR_PROTO_MIN:
9118
0
                        nat_action_info.min_port =
9119
0
                            nl_attr_get_u16(b_nest);
9120
0
                        proto_num_min_specified = true;
9121
0
                        break;
9122
0
                    case OVS_NAT_ATTR_PROTO_MAX:
9123
0
                        nat_action_info.max_port =
9124
0
                            nl_attr_get_u16(b_nest);
9125
0
                        proto_num_max_specified = true;
9126
0
                        break;
9127
0
                    case OVS_NAT_ATTR_PERSISTENT:
9128
0
                    case OVS_NAT_ATTR_PROTO_HASH:
9129
0
                    case OVS_NAT_ATTR_PROTO_RANDOM:
9130
0
                        break;
9131
0
                    case OVS_NAT_ATTR_UNSPEC:
9132
0
                    case __OVS_NAT_ATTR_MAX:
9133
0
                        OVS_NOT_REACHED();
9134
0
                    }
9135
0
                }
9136
9137
0
                if (ip_min_specified && !ip_max_specified) {
9138
0
                    nat_action_info.max_addr = nat_action_info.min_addr;
9139
0
                }
9140
0
                if (proto_num_min_specified && !proto_num_max_specified) {
9141
0
                    nat_action_info.max_port = nat_action_info.min_port;
9142
0
                }
9143
0
                if (proto_num_min_specified || proto_num_max_specified) {
9144
0
                    if (nat_action_info.nat_action & NAT_ACTION_SRC) {
9145
0
                        nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
9146
0
                    } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
9147
0
                        nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
9148
0
                    }
9149
0
                }
9150
0
                break;
9151
0
            }
9152
0
            case OVS_CT_ATTR_UNSPEC:
9153
0
            case __OVS_CT_ATTR_MAX:
9154
0
                OVS_NOT_REACHED();
9155
0
            }
9156
0
        }
9157
9158
        /* We won't be able to function properly in this case, hence
9159
         * complain loudly. */
9160
0
        if (nat_config && !commit) {
9161
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
9162
0
            VLOG_WARN_RL(&rl, "NAT specified without commit.");
9163
0
        }
9164
9165
0
        conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
9166
0
                          commit, zone, setmark, setlabel, aux->flow->tp_src,
9167
0
                          aux->flow->tp_dst, helper, nat_action_info_ref,
9168
0
                          pmd->ctx.now / 1000, tp_id);
9169
0
        break;
9170
0
    }
9171
9172
0
    case OVS_ACTION_ATTR_METER:
9173
0
        dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
9174
0
                            pmd->ctx.now);
9175
0
        break;
9176
9177
0
    case OVS_ACTION_ATTR_PUSH_VLAN:
9178
0
    case OVS_ACTION_ATTR_POP_VLAN:
9179
0
    case OVS_ACTION_ATTR_PUSH_MPLS:
9180
0
    case OVS_ACTION_ATTR_POP_MPLS:
9181
0
    case OVS_ACTION_ATTR_SET:
9182
0
    case OVS_ACTION_ATTR_SET_MASKED:
9183
0
    case OVS_ACTION_ATTR_SAMPLE:
9184
0
    case OVS_ACTION_ATTR_HASH:
9185
0
    case OVS_ACTION_ATTR_UNSPEC:
9186
0
    case OVS_ACTION_ATTR_TRUNC:
9187
0
    case OVS_ACTION_ATTR_PUSH_ETH:
9188
0
    case OVS_ACTION_ATTR_POP_ETH:
9189
0
    case OVS_ACTION_ATTR_CLONE:
9190
0
    case OVS_ACTION_ATTR_PUSH_NSH:
9191
0
    case OVS_ACTION_ATTR_POP_NSH:
9192
0
    case OVS_ACTION_ATTR_CT_CLEAR:
9193
0
    case OVS_ACTION_ATTR_CHECK_PKT_LEN:
9194
0
    case OVS_ACTION_ATTR_DROP:
9195
0
    case OVS_ACTION_ATTR_ADD_MPLS:
9196
0
    case __OVS_ACTION_ATTR_MAX:
9197
0
        OVS_NOT_REACHED();
9198
0
    }
9199
9200
0
    dp_packet_delete_batch(packets_, should_steal);
9201
0
}
9202
9203
static void
9204
dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
9205
                          struct dp_packet_batch *packets,
9206
                          bool should_steal, const struct flow *flow,
9207
                          const struct nlattr *actions, size_t actions_len)
9208
0
{
9209
0
    struct dp_netdev_execute_aux aux = { pmd, flow };
9210
9211
0
    odp_execute_actions(&aux, packets, should_steal, actions,
9212
0
                        actions_len, dp_execute_cb);
9213
0
}
9214
9215
struct dp_netdev_ct_dump {
9216
    struct ct_dpif_dump_state up;
9217
    struct conntrack_dump dump;
9218
    struct conntrack *ct;
9219
    struct dp_netdev *dp;
9220
};
9221
9222
static int
9223
dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
9224
                          const uint16_t *pzone, int *ptot_bkts)
9225
0
{
9226
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9227
0
    struct dp_netdev_ct_dump *dump;
9228
9229
0
    dump = xzalloc(sizeof *dump);
9230
0
    dump->dp = dp;
9231
0
    dump->ct = dp->conntrack;
9232
9233
0
    conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
9234
9235
0
    *dump_ = &dump->up;
9236
9237
0
    return 0;
9238
0
}
9239
9240
static int
9241
dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
9242
                         struct ct_dpif_dump_state *dump_,
9243
                         struct ct_dpif_entry *entry)
9244
0
{
9245
0
    struct dp_netdev_ct_dump *dump;
9246
9247
0
    INIT_CONTAINER(dump, dump_, up);
9248
9249
0
    return conntrack_dump_next(&dump->dump, entry);
9250
0
}
9251
9252
static int
9253
dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
9254
                         struct ct_dpif_dump_state *dump_)
9255
0
{
9256
0
    struct dp_netdev_ct_dump *dump;
9257
0
    int err;
9258
9259
0
    INIT_CONTAINER(dump, dump_, up);
9260
9261
0
    err = conntrack_dump_done(&dump->dump);
9262
9263
0
    free(dump);
9264
9265
0
    return err;
9266
0
}
9267
9268
static int
9269
dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
9270
                     const struct ct_dpif_tuple *tuple)
9271
0
{
9272
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9273
9274
0
    if (tuple) {
9275
0
        return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
9276
0
    }
9277
0
    return conntrack_flush(dp->conntrack, zone);
9278
0
}
9279
9280
static int
9281
dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
9282
0
{
9283
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9284
9285
0
    return conntrack_set_maxconns(dp->conntrack, maxconns);
9286
0
}
9287
9288
static int
9289
dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
9290
0
{
9291
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9292
9293
0
    return conntrack_get_maxconns(dp->conntrack, maxconns);
9294
0
}
9295
9296
static int
9297
dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
9298
0
{
9299
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9300
9301
0
    return conntrack_get_nconns(dp->conntrack, nconns);
9302
0
}
9303
9304
static int
9305
dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
9306
0
{
9307
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9308
9309
0
    return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
9310
0
}
9311
9312
static int
9313
dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
9314
0
{
9315
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9316
0
    *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
9317
0
    return 0;
9318
0
}
9319
9320
static int
9321
dpif_netdev_ct_set_limits(struct dpif *dpif,
9322
                           const uint32_t *default_limits,
9323
                           const struct ovs_list *zone_limits)
9324
0
{
9325
0
    int err = 0;
9326
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9327
0
    if (default_limits) {
9328
0
        err = zone_limit_update(dp->conntrack, DEFAULT_ZONE, *default_limits);
9329
0
        if (err != 0) {
9330
0
            return err;
9331
0
        }
9332
0
    }
9333
9334
0
    struct ct_dpif_zone_limit *zone_limit;
9335
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
9336
0
        err = zone_limit_update(dp->conntrack, zone_limit->zone,
9337
0
                                zone_limit->limit);
9338
0
        if (err != 0) {
9339
0
            break;
9340
0
        }
9341
0
    }
9342
0
    return err;
9343
0
}
9344
9345
static int
9346
dpif_netdev_ct_get_limits(struct dpif *dpif,
9347
                           uint32_t *default_limit,
9348
                           const struct ovs_list *zone_limits_request,
9349
                           struct ovs_list *zone_limits_reply)
9350
0
{
9351
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9352
0
    struct conntrack_zone_limit czl;
9353
9354
0
    czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
9355
0
    if (czl.zone == DEFAULT_ZONE) {
9356
0
        *default_limit = czl.limit;
9357
0
    } else {
9358
0
        return EINVAL;
9359
0
    }
9360
9361
0
    if (!ovs_list_is_empty(zone_limits_request)) {
9362
0
        struct ct_dpif_zone_limit *zone_limit;
9363
0
        LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
9364
0
            czl = zone_limit_get(dp->conntrack, zone_limit->zone);
9365
0
            if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
9366
0
                ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
9367
0
                                        czl.limit,
9368
0
                                        atomic_count_get(&czl.count));
9369
0
            } else {
9370
0
                return EINVAL;
9371
0
            }
9372
0
        }
9373
0
    } else {
9374
0
        for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
9375
0
            czl = zone_limit_get(dp->conntrack, z);
9376
0
            if (czl.zone == z) {
9377
0
                ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
9378
0
                                        atomic_count_get(&czl.count));
9379
0
            }
9380
0
        }
9381
0
    }
9382
9383
0
    return 0;
9384
0
}
9385
9386
static int
9387
dpif_netdev_ct_del_limits(struct dpif *dpif,
9388
                           const struct ovs_list *zone_limits)
9389
0
{
9390
0
    int err = 0;
9391
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9392
0
    struct ct_dpif_zone_limit *zone_limit;
9393
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
9394
0
        err = zone_limit_delete(dp->conntrack, zone_limit->zone);
9395
0
        if (err != 0) {
9396
0
            break;
9397
0
        }
9398
0
    }
9399
9400
0
    return err;
9401
0
}
9402
9403
static int
9404
dpif_netdev_ct_get_features(struct dpif *dpif OVS_UNUSED,
9405
                            enum ct_features *features)
9406
0
{
9407
0
    if (features != NULL) {
9408
0
        *features = CONNTRACK_F_ZERO_SNAT;
9409
0
    }
9410
0
    return 0;
9411
0
}
9412
9413
static int
9414
dpif_netdev_ct_set_timeout_policy(struct dpif *dpif,
9415
                                  const struct ct_dpif_timeout_policy *dpif_tp)
9416
0
{
9417
0
    struct timeout_policy tp;
9418
0
    struct dp_netdev *dp;
9419
9420
0
    dp = get_dp_netdev(dpif);
9421
0
    memcpy(&tp.policy, dpif_tp, sizeof tp.policy);
9422
0
    return timeout_policy_update(dp->conntrack, &tp);
9423
0
}
9424
9425
static int
9426
dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id,
9427
                                  struct ct_dpif_timeout_policy *dpif_tp)
9428
0
{
9429
0
    struct timeout_policy *tp;
9430
0
    struct dp_netdev *dp;
9431
0
    int err = 0;
9432
9433
0
    dp = get_dp_netdev(dpif);
9434
0
    tp = timeout_policy_get(dp->conntrack, tp_id);
9435
0
    if (!tp) {
9436
0
        return ENOENT;
9437
0
    }
9438
0
    memcpy(dpif_tp, &tp->policy, sizeof tp->policy);
9439
0
    return err;
9440
0
}
9441
9442
static int
9443
dpif_netdev_ct_del_timeout_policy(struct dpif *dpif,
9444
                                  uint32_t tp_id)
9445
0
{
9446
0
    struct dp_netdev *dp;
9447
0
    int err = 0;
9448
9449
0
    dp = get_dp_netdev(dpif);
9450
0
    err = timeout_policy_delete(dp->conntrack, tp_id);
9451
0
    return err;
9452
0
}
9453
9454
static int
9455
dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED,
9456
                                       uint32_t tp_id,
9457
                                       uint16_t dl_type OVS_UNUSED,
9458
                                       uint8_t nw_proto OVS_UNUSED,
9459
                                       char **tp_name, bool *is_generic)
9460
0
{
9461
0
    struct ds ds = DS_EMPTY_INITIALIZER;
9462
9463
0
    ds_put_format(&ds, "%"PRIu32, tp_id);
9464
0
    *tp_name = ds_steal_cstr(&ds);
9465
0
    *is_generic = true;
9466
0
    return 0;
9467
0
}
9468
9469
static int
9470
dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
9471
0
{
9472
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9473
0
    return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
9474
0
}
9475
9476
static int
9477
dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
9478
0
{
9479
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9480
0
    return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
9481
0
}
9482
9483
static int
9484
dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
9485
0
{
9486
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9487
0
    return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
9488
0
}
9489
9490
/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
9491
 * diverge. */
9492
static int
9493
dpif_netdev_ipf_get_status(struct dpif *dpif,
9494
                           struct dpif_ipf_status *dpif_ipf_status)
9495
0
{
9496
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9497
0
    ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
9498
0
                   (struct ipf_status *) dpif_ipf_status);
9499
0
    return 0;
9500
0
}
9501
9502
static int
9503
dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
9504
                           struct ipf_dump_ctx **ipf_dump_ctx)
9505
0
{
9506
0
    return ipf_dump_start(ipf_dump_ctx);
9507
0
}
9508
9509
static int
9510
dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
9511
0
{
9512
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9513
0
    return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
9514
0
                         dump);
9515
0
}
9516
9517
static int
9518
dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
9519
0
{
9520
0
    return ipf_dump_done(ipf_dump_ctx);
9521
9522
0
}
9523
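/* Inserts or replaces the tx bond mapping for 'bond_id' in the datapath and
 * propagates the new mapping to every PMD thread. */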
9524
static int
9525
dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
9526
                     odp_port_t *member_map)
9527
0
{
9528
0
    struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
9529
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9530
0
    struct dp_netdev_pmd_thread *pmd;
9531
9532
    /* Prepare new bond mapping. */
9533
0
    new_tx->bond_id = bond_id;
9534
0
    for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
9535
0
        new_tx->member_buckets[bucket].member_id = member_map[bucket];
9536
0
    }
9537
9538
0
    ovs_mutex_lock(&dp->bond_mutex);
9539
    /* Check whether the bond already exists. */
9540
0
    struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
9541
0
    if (old_tx) {
9542
0
        cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
9543
0
                     hash_bond_id(bond_id));
9544
0
        ovsrcu_postpone(free, old_tx);
9545
0
    } else {
9546
0
        cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
9547
0
    }
9548
0
    ovs_mutex_unlock(&dp->bond_mutex);
9549
9550
    /* Update all PMDs with new bond mapping. */
9551
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
9552
0
        dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
9553
0
    }
9554
0
    return 0;
9555
0
}
9556
9557
static int
9558
dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
9559
0
{
9560
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9561
0
    struct dp_netdev_pmd_thread *pmd;
9562
0
    struct tx_bond *tx;
9563
9564
0
    ovs_mutex_lock(&dp->bond_mutex);
9565
    /* Check whether the bond exists. */
9566
0
    tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
9567
0
    if (tx) {
9568
0
        cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
9569
0
        ovsrcu_postpone(free, tx);
9570
0
    } else {
9571
        /* Bond is not present. */
9572
0
        ovs_mutex_unlock(&dp->bond_mutex);
9573
0
        return ENOENT;
9574
0
    }
9575
0
    ovs_mutex_unlock(&dp->bond_mutex);
9576
9577
    /* Remove the bond map in all pmds. */
9578
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
9579
0
        dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
9580
0
    }
9581
0
    return 0;
9582
0
}
9583
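/* Accumulates the per-bucket byte counters of bond 'bond_id' from all PMD
 * threads into the caller-provided 'n_bytes' array, one entry per bucket. */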
9584
static int
9585
dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
9586
                           uint64_t *n_bytes)
9587
0
{
9588
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9589
0
    struct dp_netdev_pmd_thread *pmd;
9590
9591
0
    if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
9592
0
        return ENOENT;
9593
0
    }
9594
9595
    /* Search the bond in all PMDs. */
9596
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
9597
0
        struct tx_bond *pmd_bond_entry
9598
0
            = tx_bond_lookup(&pmd->tx_bonds, bond_id);
9599
9600
0
        if (!pmd_bond_entry) {
9601
0
            continue;
9602
0
        }
9603
9604
        /* Read bond stats. */
9605
0
        for (int i = 0; i < BOND_BUCKETS; i++) {
9606
0
            uint64_t pmd_n_bytes;
9607
9608
0
            atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes,
9609
0
                                &pmd_n_bytes);
9610
0
            n_bytes[i] += pmd_n_bytes;
9611
0
        }
9612
0
    }
9613
0
    return 0;
9614
0
}
9615
9616
const struct dpif_class dpif_netdev_class = {
9617
    "netdev",
9618
    true,                       /* cleanup_required */
9619
    true,                       /* synced_dp_layers */
9620
    dpif_netdev_init,
9621
    dpif_netdev_enumerate,
9622
    dpif_netdev_port_open_type,
9623
    dpif_netdev_open,
9624
    dpif_netdev_close,
9625
    dpif_netdev_destroy,
9626
    dpif_netdev_run,
9627
    dpif_netdev_wait,
9628
    dpif_netdev_get_stats,
9629
    NULL,                      /* set_features */
9630
    dpif_netdev_port_add,
9631
    dpif_netdev_port_del,
9632
    dpif_netdev_port_set_config,
9633
    dpif_netdev_port_query_by_number,
9634
    dpif_netdev_port_query_by_name,
9635
    NULL,                       /* port_get_pid */
9636
    dpif_netdev_port_dump_start,
9637
    dpif_netdev_port_dump_next,
9638
    dpif_netdev_port_dump_done,
9639
    dpif_netdev_port_poll,
9640
    dpif_netdev_port_poll_wait,
9641
    dpif_netdev_flow_flush,
9642
    dpif_netdev_flow_dump_create,
9643
    dpif_netdev_flow_dump_destroy,
9644
    dpif_netdev_flow_dump_thread_create,
9645
    dpif_netdev_flow_dump_thread_destroy,
9646
    dpif_netdev_flow_dump_next,
9647
    dpif_netdev_operate,
9648
    dpif_netdev_offload_stats_get,
9649
    NULL,                       /* recv_set */
9650
    NULL,                       /* handlers_set */
9651
    NULL,                       /* number_handlers_required */
9652
    dpif_netdev_set_config,
9653
    dpif_netdev_queue_to_priority,
9654
    NULL,                       /* recv */
9655
    NULL,                       /* recv_wait */
9656
    NULL,                       /* recv_purge */
9657
    dpif_netdev_register_dp_purge_cb,
9658
    dpif_netdev_register_upcall_cb,
9659
    dpif_netdev_enable_upcall,
9660
    dpif_netdev_disable_upcall,
9661
    dpif_netdev_get_datapath_version,
9662
    dpif_netdev_ct_dump_start,
9663
    dpif_netdev_ct_dump_next,
9664
    dpif_netdev_ct_dump_done,
9665
    dpif_netdev_ct_flush,
9666
    dpif_netdev_ct_set_maxconns,
9667
    dpif_netdev_ct_get_maxconns,
9668
    dpif_netdev_ct_get_nconns,
9669
    dpif_netdev_ct_set_tcp_seq_chk,
9670
    dpif_netdev_ct_get_tcp_seq_chk,
9671
    dpif_netdev_ct_set_limits,
9672
    dpif_netdev_ct_get_limits,
9673
    dpif_netdev_ct_del_limits,
9674
    dpif_netdev_ct_set_timeout_policy,
9675
    dpif_netdev_ct_get_timeout_policy,
9676
    dpif_netdev_ct_del_timeout_policy,
9677
    NULL,                       /* ct_timeout_policy_dump_start */
9678
    NULL,                       /* ct_timeout_policy_dump_next */
9679
    NULL,                       /* ct_timeout_policy_dump_done */
9680
    dpif_netdev_ct_get_timeout_policy_name,
9681
    dpif_netdev_ct_get_features,
9682
    dpif_netdev_ipf_set_enabled,
9683
    dpif_netdev_ipf_set_min_frag,
9684
    dpif_netdev_ipf_set_max_nfrags,
9685
    dpif_netdev_ipf_get_status,
9686
    dpif_netdev_ipf_dump_start,
9687
    dpif_netdev_ipf_dump_next,
9688
    dpif_netdev_ipf_dump_done,
9689
    dpif_netdev_meter_get_features,
9690
    dpif_netdev_meter_set,
9691
    dpif_netdev_meter_get,
9692
    dpif_netdev_meter_del,
9693
    dpif_netdev_bond_add,
9694
    dpif_netdev_bond_del,
9695
    dpif_netdev_bond_stats_get,
9696
    NULL,                       /* cache_get_supported_levels */
9697
    NULL,                       /* cache_get_name */
9698
    NULL,                       /* cache_get_size */
9699
    NULL,                       /* cache_set_size */
9700
};
9701
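/* Handler for the "dpif-dummy/change-port-number" unixctl command: moves a
 * port of a dummy datapath to a new port number and reconfigures the
 * datapath. */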
9702
static void
9703
dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
9704
                              const char *argv[], void *aux OVS_UNUSED)
9705
0
{
9706
0
    struct dp_netdev_port *port;
9707
0
    struct dp_netdev *dp;
9708
0
    odp_port_t port_no;
9709
9710
0
    ovs_mutex_lock(&dp_netdev_mutex);
9711
0
    dp = shash_find_data(&dp_netdevs, argv[1]);
9712
0
    if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
9713
0
        ovs_mutex_unlock(&dp_netdev_mutex);
9714
0
        unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
9715
0
        return;
9716
0
    }
9717
0
    ovs_refcount_ref(&dp->ref_cnt);
9718
0
    ovs_mutex_unlock(&dp_netdev_mutex);
9719
9720
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
9721
0
    if (get_port_by_name(dp, argv[2], &port)) {
9722
0
        unixctl_command_reply_error(conn, "unknown port");
9723
0
        goto exit;
9724
0
    }
9725
9726
0
    port_no = u32_to_odp(atoi(argv[3]));
9727
0
    if (!port_no || port_no == ODPP_NONE) {
9728
0
        unixctl_command_reply_error(conn, "bad port number");
9729
0
        goto exit;
9730
0
    }
9731
0
    if (dp_netdev_lookup_port(dp, port_no)) {
9732
0
        unixctl_command_reply_error(conn, "port number already in use");
9733
0
        goto exit;
9734
0
    }
9735
9736
    /* Remove port. */
9737
0
    hmap_remove(&dp->ports, &port->node);
9738
0
    reconfigure_datapath(dp);
9739
9740
    /* Reinsert with new port number. */
9741
0
    port->port_no = port_no;
9742
0
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
9743
0
    reconfigure_datapath(dp);
9744
9745
0
    seq_change(dp->port_seq);
9746
0
    unixctl_command_reply(conn, NULL);
9747
9748
0
exit:
9749
0
    ovs_rwlock_unlock(&dp->port_rwlock);
9750
0
    dp_netdev_unref(dp);
9751
0
}
9752
9753
static void
9754
dpif_dummy_register__(const char *type)
9755
0
{
9756
0
    struct dpif_class *class;
9757
9758
0
    class = xmalloc(sizeof *class);
9759
0
    *class = dpif_netdev_class;
9760
0
    class->type = xstrdup(type);
9761
0
    dp_register_provider(class);
9762
0
}
9763
9764
static void
9765
dpif_dummy_override(const char *type)
9766
0
{
9767
0
    int error;
9768
9769
    /*
9770
     * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
9771
     * a userland-only build.  It's useful for the testsuite.
9772
     */
9773
0
    error = dp_unregister_provider(type);
9774
0
    if (error == 0 || error == EAFNOSUPPORT) {
9775
0
        dpif_dummy_register__(type);
9776
0
    }
9777
0
}
9778
9779
void
9780
dpif_dummy_register(enum dummy_level level)
9781
0
{
9782
0
    if (level == DUMMY_OVERRIDE_ALL) {
9783
0
        struct sset types;
9784
0
        const char *type;
9785
9786
0
        sset_init(&types);
9787
0
        dp_enumerate_types(&types);
9788
0
        SSET_FOR_EACH (type, &types) {
9789
0
            dpif_dummy_override(type);
9790
0
        }
9791
0
        sset_destroy(&types);
9792
0
    } else if (level == DUMMY_OVERRIDE_SYSTEM) {
9793
0
        dpif_dummy_override("system");
9794
0
    }
9795
9796
0
    dpif_dummy_register__("dummy");
9797
9798
0
    unixctl_command_register("dpif-dummy/change-port-number",
9799
0
                             "dp port new-number",
9800
0
                             3, 3, dpif_dummy_change_port_number, NULL);
9801
0
}
9802

9803
/* Datapath Classifier. */
9804
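/* Frees a subtable and its generated masks.  Invoked via ovsrcu_postpone()
 * once no PMD thread can still be reading the subtable. */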
9805
static void
9806
dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
9807
0
{
9808
0
    cmap_destroy(&subtable->rules);
9809
0
    ovsrcu_postpone(free, subtable->mf_masks);
9810
0
    ovsrcu_postpone(free, subtable);
9811
0
}
9812
9813
/* Initializes 'cls' as a classifier that initially contains no classification
9814
 * rules. */
9815
static void
9816
dpcls_init(struct dpcls *cls)
9817
0
{
9818
0
    cmap_init(&cls->subtables_map);
9819
0
    pvector_init(&cls->subtables);
9820
0
}
9821
9822
static void
9823
dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
9824
0
{
9825
0
    VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
9826
0
    pvector_remove(&cls->subtables, subtable);
9827
0
    cmap_remove(&cls->subtables_map, &subtable->cmap_node,
9828
0
                subtable->mask.hash);
9829
0
    dpcls_info_dec_usage(subtable->lookup_func_info);
9830
0
    ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
9831
0
}
9832
9833
/* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
9834
 * caller's responsibility.
9835
 * May only be called after all the readers have been terminated. */
9836
static void
9837
dpcls_destroy(struct dpcls *cls)
9838
0
{
9839
0
    if (cls) {
9840
0
        struct dpcls_subtable *subtable;
9841
9842
0
        CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
9843
0
            ovs_assert(cmap_count(&subtable->rules) == 0);
9844
0
            dpcls_destroy_subtable(cls, subtable);
9845
0
        }
9846
0
        cmap_destroy(&cls->subtables_map);
9847
0
        pvector_destroy(&cls->subtables);
9848
0
    }
9849
0
}
9850
9851
static struct dpcls_subtable *
9852
dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
9853
0
{
9854
0
    struct dpcls_subtable *subtable;
9855
9856
    /* Need to add one. */
9857
0
    subtable = xmalloc(sizeof *subtable
9858
0
                       - sizeof subtable->mask.mf + mask->len);
9859
0
    cmap_init(&subtable->rules);
9860
0
    subtable->hit_cnt = 0;
9861
0
    netdev_flow_key_clone(&subtable->mask, mask);
9862
9863
    /* The count of bits in the mask defines the space required for masks.
9864
     * Then call gen_masks() to create the appropriate masks, avoiding the cost
9865
     * of doing runtime calculations. */
9866
0
    uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
9867
0
    uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
9868
0
    subtable->mf_bits_set_unit0 = unit0;
9869
0
    subtable->mf_bits_set_unit1 = unit1;
9870
0
    subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
9871
0
    dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
9872
9873
    /* Get the preferred subtable search function for this (u0,u1) subtable.
9874
     * The function is guaranteed to always return a valid implementation,
9875
     * possibly one that is ISA-optimized and/or specialized. Initialize
9876
     * the subtable search function atomically to avoid garbage data being read
9877
     * by the PMD thread.
9878
     */
9879
0
    atomic_init(&subtable->lookup_func,
9880
0
                dpcls_subtable_get_best_impl(unit0, unit1,
9881
0
                                             &subtable->lookup_func_info));
9882
0
    dpcls_info_inc_usage(subtable->lookup_func_info);
9883
9884
0
    cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
9885
    /* Add the new subtable at the end of the pvector (with no hits yet) */
9886
0
    pvector_insert(&cls->subtables, subtable, 0);
9887
0
    VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
9888
0
             cmap_count(&cls->subtables_map), subtable, cls->in_port);
9889
0
    pvector_publish(&cls->subtables);
9890
9891
0
    return subtable;
9892
0
}
9893
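/* Returns the subtable of 'cls' that uses 'mask', creating it if it does not
 * exist yet. */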
9894
static inline struct dpcls_subtable *
9895
dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
9896
0
{
9897
0
    struct dpcls_subtable *subtable;
9898
9899
0
    CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
9900
0
                             &cls->subtables_map) {
9901
0
        if (netdev_flow_key_equal(&subtable->mask, mask)) {
9902
0
            return subtable;
9903
0
        }
9904
0
    }
9905
0
    return dpcls_create_subtable(cls, mask);
9906
0
}
9907
9908
/* Checks for the best available implementation for each subtable lookup
9909
 * function, and assigns it as the lookup function pointer for each subtable.
9910
 * Returns the number of subtables that have changed lookup implementation.
9911
 * This function requires holding a flow_mutex when called. This is to make
9912
 * sure modifications done by this function are not overwritten. This could
9913
 * happen if dpcls_sort_subtable_vector() is called at the same time as this
9914
 * function.
9915
 */
9916
static uint32_t
9917
dpcls_subtable_lookup_reprobe(struct dpcls *cls)
9918
0
{
9919
0
    struct pvector *pvec = &cls->subtables;
9920
0
    uint32_t subtables_changed = 0;
9921
0
    struct dpcls_subtable *subtable = NULL;
9922
9923
0
    PVECTOR_FOR_EACH (subtable, pvec) {
9924
0
        uint32_t u0_bits = subtable->mf_bits_set_unit0;
9925
0
        uint32_t u1_bits = subtable->mf_bits_set_unit1;
9926
0
        void *old_func = subtable->lookup_func;
9927
0
        struct dpcls_subtable_lookup_info_t *old_info;
9928
0
        old_info = subtable->lookup_func_info;
9929
        /* Set the subtable lookup function atomically to avoid garbage data
9930
         * being read by the PMD thread. */
9931
0
        atomic_store_relaxed(&subtable->lookup_func,
9932
0
                dpcls_subtable_get_best_impl(u0_bits, u1_bits,
9933
0
                                             &subtable->lookup_func_info));
9934
0
        if (old_func != subtable->lookup_func) {
9935
0
            subtables_changed += 1;
9936
0
        }
9937
9938
0
        if (old_info != subtable->lookup_func_info) {
9939
            /* In theory, functions can be shared between implementations, so
9940
             * do an explicit check on the function info structures. */
9941
0
            dpcls_info_dec_usage(old_info);
9942
0
            dpcls_info_inc_usage(subtable->lookup_func_info);
9943
0
        }
9944
0
    }
9945
9946
0
    return subtables_changed;
9947
0
}
9948
9949
/* Periodically sort the dpcls subtable vectors according to hit counts */
9950
static void
9951
dpcls_sort_subtable_vector(struct dpcls *cls)
9952
0
{
9953
0
    struct pvector *pvec = &cls->subtables;
9954
0
    struct dpcls_subtable *subtable;
9955
9956
0
    PVECTOR_FOR_EACH (subtable, pvec) {
9957
0
        pvector_change_priority(pvec, subtable, subtable->hit_cnt);
9958
0
        subtable->hit_cnt = 0;
9959
0
    }
9960
0
    pvector_publish(pvec);
9961
0
}
9962
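/* Once per interval, records the PMD's busy/idle cycle statistics and the
 * per-rxq cycle counts for the auto load balancer, and periodically re-sorts
 * each classifier's subtables by hit count when 'flow_mutex' can be taken. */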
9963
static inline void
9964
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
9965
                           struct polled_queue *poll_list, int poll_cnt)
9966
0
{
9967
0
    struct dpcls *cls;
9968
0
    uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0;
9969
0
    unsigned int pmd_load = 0;
9970
9971
0
    if (pmd->ctx.now > pmd->next_cycle_store) {
9972
0
        uint64_t curr_tsc;
9973
0
        uint8_t rebalance_load_trigger;
9974
0
        struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
9975
0
        unsigned int idx;
9976
9977
0
        if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
9978
0
                pmd->prev_stats[PMD_CYCLES_ITER_IDLE] &&
9979
0
            pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
9980
0
                pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) {
9981
0
            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
9982
0
                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
9983
0
            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
9984
0
                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
9985
0
            tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] -
9986
0
                        pmd->prev_stats[PMD_CYCLES_SLEEP];
9987
9988
0
            if (pmd_alb->is_enabled && !pmd->isolated) {
9989
0
                if (tot_proc) {
9990
0
                    pmd_load = ((tot_proc * 100) /
9991
0
                                    (tot_idle + tot_proc + tot_sleep));
9992
0
                }
9993
9994
0
                atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
9995
0
                                    &rebalance_load_trigger);
9996
0
                if (pmd_load >= rebalance_load_trigger) {
9997
0
                    atomic_count_inc(&pmd->pmd_overloaded);
9998
0
                } else {
9999
0
                    atomic_count_set(&pmd->pmd_overloaded, 0);
10000
0
                }
10001
0
            }
10002
0
        }
10003
10004
0
        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
10005
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
10006
0
        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
10007
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
10008
0
        pmd->prev_stats[PMD_CYCLES_SLEEP] =
10009
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP];
10010
10011
        /* Get the cycles that were used to process each queue and store. */
10012
0
        for (unsigned i = 0; i < poll_cnt; i++) {
10013
0
            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
10014
0
                                                        RXQ_CYCLES_PROC_CURR);
10015
0
            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
10016
0
            dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
10017
0
                                     0);
10018
0
        }
10019
0
        curr_tsc = cycles_counter_update(&pmd->perf_stats);
10020
0
        if (pmd->intrvl_tsc_prev) {
10021
            /* There is a prev timestamp, store a new intrvl cycle count. */
10022
0
            atomic_store_relaxed(&pmd->intrvl_cycles,
10023
0
                                 curr_tsc - pmd->intrvl_tsc_prev);
10024
0
        }
10025
0
        idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX;
10026
0
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc);
10027
0
        pmd->intrvl_tsc_prev = curr_tsc;
10028
        /* Start new measuring interval */
10029
0
        pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
10030
0
    }
10031
10032
0
    if (pmd->ctx.now > pmd->next_optimization) {
10033
        /* Try to obtain the flow lock to block out revalidator threads.
10034
         * If not possible, just try next time. */
10035
0
        if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
10036
            /* Optimize each classifier */
10037
0
            CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
10038
0
                dpcls_sort_subtable_vector(cls);
10039
0
            }
10040
0
            ovs_mutex_unlock(&pmd->flow_mutex);
10041
            /* Start new measuring interval */
10042
0
            pmd->next_optimization = pmd->ctx.now
10043
0
                                     + DPCLS_OPTIMIZATION_INTERVAL;
10044
0
        }
10045
0
    }
10046
0
}
10047
10048
/* Returns the sum of a specified number of newest to
10049
 * oldest interval values. 'cur_idx' is where the next
10050
 * write will be and wrap around needs to be handled.
10051
 */
10052
static uint64_t
10053
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
10054
0
                    int num_to_read) {
10055
0
    unsigned int i;
10056
0
    uint64_t total = 0;
10057
10058
0
    i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX;
10059
0
    for (int read = 0; read < num_to_read; read++) {
10060
0
        uint64_t interval_value;
10061
10062
0
        i = i ? i - 1 : PMD_INTERVAL_MAX - 1;
10063
0
        atomic_read_relaxed(&source[i], &interval_value);
10064
0
        total += interval_value;
10065
0
    }
10066
0
    return total;
10067
0
}
10068
10069
/* Insert 'rule' into 'cls'. */
10070
static void
10071
dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
10072
             const struct netdev_flow_key *mask)
10073
0
{
10074
0
    struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
10075
10076
    /* Refer to subtable's mask, also for later removal. */
10077
0
    rule->mask = &subtable->mask;
10078
0
    cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
10079
0
}
10080
10081
/* Removes 'rule' from 'cls', also destructing the 'rule'. */
10082
static void
10083
dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
10084
0
{
10085
0
    struct dpcls_subtable *subtable;
10086
10087
0
    ovs_assert(rule->mask);
10088
10089
    /* Get subtable from reference in rule->mask. */
10090
0
    INIT_CONTAINER(subtable, rule->mask, mask);
10091
0
    if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
10092
0
        == 0) {
10093
        /* Delete empty subtable. */
10094
0
        dpcls_destroy_subtable(cls, subtable);
10095
0
        pvector_publish(&cls->subtables);
10096
0
    }
10097
0
}
10098
10099
/* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks. */
10100
static inline void
10101
dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count,
10102
                             uint64_t *mf_masks)
10103
0
{
10104
0
    int i;
10105
0
    for (i = 0; i < count; i++) {
10106
0
        uint64_t lowest_bit = (iter & -iter);
10107
0
        iter &= ~lowest_bit;
10108
0
        mf_masks[i] = (lowest_bit - 1);
10109
0
    }
10110
    /* Checks that count has covered all bits in the iter bitmap. */
10111
0
    ovs_assert(iter == 0);
10112
0
}
10113
10114
/* Generate a mask for each block in the miniflow, based on the bits set. This
10115
 * allows easily masking packets with the generated array here, without
10116
 * calculations. This avoids calculating the masks at runtime.
10117
 * @param tbl The subtable mask to generate the mf_masks for
10118
 * @param mf_masks Pointer to a u64 array of at least *mf_bits* in size
10119
 * @param mf_bits_u0 Number of bits set in unit0 of the miniflow
10120
 * @param mf_bits_u1 Number of bits set in unit1 of the miniflow
10121
 */
10122
void
10123
dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
10124
                         uint64_t *mf_masks,
10125
                         const uint32_t mf_bits_u0,
10126
                         const uint32_t mf_bits_u1)
10127
0
{
10128
0
    uint64_t iter_u0 = tbl->mf.map.bits[0];
10129
0
    uint64_t iter_u1 = tbl->mf.map.bits[1];
10130
10131
0
    dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
10132
0
    dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
10133
0
}
10134
10135
/* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit in the
10136
 * rule's mask the values in the rule's flow and in 'target' are the same. */
10137
inline bool
10138
dpcls_rule_matches_key(const struct dpcls_rule *rule,
10139
                       const struct netdev_flow_key *target)
10140
0
{
10141
0
    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
10142
0
    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
10143
0
    uint64_t value;
10144
10145
0
    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
10146
0
        if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
10147
0
            return false;
10148
0
        }
10149
0
    }
10150
0
    return true;
10151
0
}
10152
10153
/* For each miniflow in 'keys' performs a classifier lookup writing the result
10154
 * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
10155
 * NULL it is skipped.
10156
 *
10157
 * This function is optimized for use in the userspace datapath and therefore
10158
 * does not implement a lot of features available in the standard
10159
 * classifier_lookup() function.  Specifically, it does not implement
10160
 * priorities, instead returning any rule which matches the flow.
10161
 *
10162
 * Returns true if all miniflows found a corresponding rule. */
10163
bool
10164
dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
10165
             struct dpcls_rule **rules, const size_t cnt,
10166
             int *num_lookups_p)
10167
0
{
10168
    /* The received 'cnt' miniflows are the search-keys that will be processed
10169
     * to find a matching entry in the available subtables.
10170
     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
10171
0
#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
10172
0
    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
10173
10174
0
    struct dpcls_subtable *subtable;
10175
0
    uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
10176
10177
0
    if (cnt != MAP_BITS) {
10178
0
        keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
10179
0
    }
10180
0
    memset(rules, 0, cnt * sizeof *rules);
10181
10182
0
    int lookups_match = 0, subtable_pos = 1;
10183
0
    uint32_t found_map;
10184
10185
    /* The Datapath classifier - aka dpcls - is composed of subtables.
10186
     * Subtables are dynamically created as needed when new rules are inserted.
10187
     * Each subtable collects rules with matches on a specific subset of packet
10188
     * fields as defined by the subtable's mask.  We proceed to process every
10189
     * search-key against each subtable, but when a match is found for a
10190
     * search-key, the search for that key can stop because the rules are
10191
     * non-overlapping. */
10192
0
    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
10193
        /* Call the subtable specific lookup function. */
10194
0
        found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
10195
10196
        /* Count the number of subtables searched for this packet match. This
10197
         * estimates the "spread" of subtables looked at per matched packet. */
10198
0
        uint32_t pkts_matched = count_1bits(found_map);
10199
0
        lookups_match += pkts_matched * subtable_pos;
10200
10201
        /* Clear the found rules, and return early if all packets are found. */
10202
0
        keys_map &= ~found_map;
10203
0
        if (!keys_map) {
10204
0
            if (num_lookups_p) {
10205
0
                *num_lookups_p = lookups_match;
10206
0
            }
10207
0
            return true;
10208
0
        }
10209
0
        subtable_pos++;
10210
0
    }
10211
10212
0
    if (num_lookups_p) {
10213
0
        *num_lookups_p = lookups_match;
10214
0
    }
10215
0
    return false;
10216
0
}