Coverage Report

Created: 2025-07-11 06:12

/src/openvswitch/lib/dpif-netdev.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
#include "dpif-netdev.h"
19
#include "dpif-netdev-private.h"
20
#include "dpif-netdev-private-dfc.h"
21
22
#include <ctype.h>
23
#include <errno.h>
24
#include <fcntl.h>
25
#include <inttypes.h>
26
#include <net/if.h>
27
#include <sys/types.h>
28
#include <netinet/in.h>
29
#include <stdint.h>
30
#include <stdlib.h>
31
#include <string.h>
32
#include <sys/ioctl.h>
33
#include <sys/socket.h>
34
#include <sys/stat.h>
35
#include <unistd.h>
36
37
#include "bitmap.h"
38
#include "ccmap.h"
39
#include "cmap.h"
40
#include "conntrack.h"
41
#include "conntrack-tp.h"
42
#include "coverage.h"
43
#include "ct-dpif.h"
44
#include "csum.h"
45
#include "dp-packet.h"
46
#include "dpif.h"
47
#include "dpif-netdev-lookup.h"
48
#include "dpif-netdev-perf.h"
49
#include "dpif-netdev-private-extract.h"
50
#include "dpif-provider.h"
51
#include "dummy.h"
52
#include "fat-rwlock.h"
53
#include "flow.h"
54
#include "hmapx.h"
55
#include "id-fpool.h"
56
#include "id-pool.h"
57
#include "ipf.h"
58
#include "mov-avg.h"
59
#include "mpsc-queue.h"
60
#include "netdev.h"
61
#include "netdev-offload.h"
62
#include "netdev-provider.h"
63
#include "netdev-vport.h"
64
#include "netlink.h"
65
#include "odp-execute.h"
66
#include "odp-util.h"
67
#include "openvswitch/dynamic-string.h"
68
#include "openvswitch/list.h"
69
#include "openvswitch/match.h"
70
#include "openvswitch/ofp-parse.h"
71
#include "openvswitch/ofp-print.h"
72
#include "openvswitch/ofpbuf.h"
73
#include "openvswitch/shash.h"
74
#include "openvswitch/vlog.h"
75
#include "ovs-numa.h"
76
#include "ovs-rcu.h"
77
#include "packets.h"
78
#include "openvswitch/poll-loop.h"
79
#include "pvector.h"
80
#include "random.h"
81
#include "seq.h"
82
#include "smap.h"
83
#include "sset.h"
84
#include "timeval.h"
85
#include "tnl-neigh-cache.h"
86
#include "tnl-ports.h"
87
#include "unixctl.h"
88
#include "util.h"
89
#include "uuid.h"
90
91
VLOG_DEFINE_THIS_MODULE(dpif_netdev);
92
93
/* Auto Load Balancing Defaults */
94
0
#define ALB_IMPROVEMENT_THRESHOLD    25
95
0
#define ALB_LOAD_THRESHOLD           95
96
0
#define ALB_REBALANCE_INTERVAL       1     /* 1 Min */
97
0
#define MAX_ALB_REBALANCE_INTERVAL   20000 /* 20000 Min */
98
0
#define MIN_TO_MSEC                  60000
99
100
#define FLOW_DUMP_MAX_BATCH 50
101
/* Use a per-thread recirc_depth to prevent recirculation loops. */
102
0
#define MAX_RECIRC_DEPTH 8
103
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
104
105
/* Use instant packet send by default. */
106
0
#define DEFAULT_TX_FLUSH_INTERVAL 0
107
108
/* Configuration parameters. */
109
enum { MAX_METERS = 1 << 18 };  /* Maximum number of meters. */
110
enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
111
112
COVERAGE_DEFINE(datapath_drop_meter);
113
COVERAGE_DEFINE(datapath_drop_upcall_error);
114
COVERAGE_DEFINE(datapath_drop_lock_error);
115
COVERAGE_DEFINE(datapath_drop_userspace_action_error);
116
COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
117
COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
118
COVERAGE_DEFINE(datapath_drop_recirc_error);
119
COVERAGE_DEFINE(datapath_drop_invalid_port);
120
COVERAGE_DEFINE(datapath_drop_invalid_bond);
121
COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
122
COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
123
#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
124
COVERAGE_DEFINE(datapath_drop_hw_miss_recover);
125
#endif
126
127
/* Protects against changes to 'dp_netdevs'. */
128
struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
129
130
/* Contains all 'struct dp_netdev's. */
131
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
132
    = SHASH_INITIALIZER(&dp_netdevs);
133
134
static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
135
136
0
#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
137
0
                                     | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
138
0
                                     | CS_SRC_NAT | CS_DST_NAT)
139
0
#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
140
141
static struct odp_support dp_netdev_support = {
142
    .max_vlan_headers = SIZE_MAX,
143
    .max_mpls_depth = SIZE_MAX,
144
    .recirc = true,
145
    .ct_state = true,
146
    .ct_zone = true,
147
    .ct_mark = true,
148
    .ct_label = true,
149
    .ct_state_nat = true,
150
    .ct_orig_tuple = true,
151
    .ct_orig_tuple6 = true,
152
};
153
154

155
/* Simple non-wildcarding single-priority classifier. */
156
157
/* Time in microseconds between successive optimizations of the dpcls
158
 * subtable vector. */
159
0
#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
160
161
/* Time in microseconds of the interval in which rxq processing cycles used
162
 * in rxq to pmd assignments are measured and stored. */
163
0
#define PMD_INTERVAL_LEN 5000000LL
164
/* For converting PMD_INTERVAL_LEN to secs. */
165
0
#define INTERVAL_USEC_TO_SEC 1000000LL
166
167
/* Number of intervals for which cycles are stored
168
 * and used during rxq to pmd assignment. */
169
0
#define PMD_INTERVAL_MAX 12
170
171
/* Time in microseconds to try RCU quiescing. */
172
0
#define PMD_RCU_QUIESCE_INTERVAL 10000LL
173
174
/* Timer resolution for PMD threads in nanoseconds. */
175
0
#define PMD_TIMER_RES_NS 1000
176
177
/* Number of packets Rx on an interface that will stop pmd thread sleeping. */
178
0
#define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2)
179
/* Time in microseconds to increment a pmd thread sleep time. */
180
0
#define PMD_SLEEP_INC_US 1
181
182
struct pmd_sleep {
183
    unsigned core_id;
184
    uint64_t max_sleep;
185
};
186
187
struct dpcls {
188
    struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
189
    odp_port_t in_port;
190
    struct cmap subtables_map;
191
    struct pvector subtables;
192
};
193
194
/* Data structure to keep packet order until fastpath processing. */
195
struct dp_packet_flow_map {
196
    struct dp_packet *packet;
197
    struct dp_netdev_flow *flow;
198
    uint16_t tcp_flags;
199
};
200
201
static void dpcls_init(struct dpcls *);
202
static void dpcls_destroy(struct dpcls *);
203
static void dpcls_sort_subtable_vector(struct dpcls *);
204
static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls);
205
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
206
                         const struct netdev_flow_key *mask);
207
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
208
209
/* Set of supported meter flags */
210
#define DP_SUPPORTED_METER_FLAGS_MASK \
211
0
    (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
212
213
/* Set of supported meter band types */
214
#define DP_SUPPORTED_METER_BAND_TYPES           \
215
0
    ( 1 << OFPMBT13_DROP )
216
217
struct dp_meter_band {
218
    uint32_t rate;
219
    uint32_t burst_size;
220
    atomic_uint64_t bucket;          /* In 1/1000 packets for PKTPS,
221
                                      * or in bits for KBPS. */
222
    atomic_uint64_t packet_count;
223
    atomic_uint64_t byte_count;
224
};
225
226
struct dp_meter {
227
    struct cmap_node node;
228
    uint32_t id;
229
    uint16_t flags;
230
    uint16_t n_bands;
231
    uint32_t max_delta_t;
232
    atomic_uint64_t used;  /* Time of the last use in milliseconds. */
233
    atomic_uint64_t packet_count;
234
    atomic_uint64_t byte_count;
235
    struct dp_meter_band bands[];
236
};
237
238
struct pmd_auto_lb {
239
    bool do_dry_run;
240
    bool recheck_config;
241
    bool is_enabled;            /* Current status of Auto load balancing. */
242
    uint64_t rebalance_intvl;
243
    uint64_t rebalance_poll_timer;
244
    uint8_t rebalance_improve_thresh;
245
    atomic_uint8_t rebalance_load_thresh;
246
};
247
248
enum sched_assignment_type {
249
    SCHED_ROUNDROBIN,
250
    SCHED_CYCLES, /* Default. */
251
    SCHED_GROUP
252
};
253
254
/* Datapath based on the network device interface from netdev.h.
255
 *
256
 *
257
 * Thread-safety
258
 * =============
259
 *
260
 * Some members, marked 'const', are immutable.  Accessing other members
261
 * requires synchronization, as noted in more detail below.
262
 *
263
 * Acquisition order is, from outermost to innermost:
264
 *
265
 *    dp_netdev_mutex (global)
266
 *    port_rwlock
267
 *    bond_mutex
268
 *    non_pmd_mutex
269
 */
270
struct dp_netdev {
271
    const struct dpif_class *const class;
272
    const char *const name;
273
    struct ovs_refcount ref_cnt;
274
    atomic_flag destroyed;
275
276
    /* Ports.
277
     *
278
     * Any lookup into 'ports' or any access to the dp_netdev_ports found
279
     * through 'ports' requires taking 'port_rwlock'. */
280
    struct ovs_rwlock port_rwlock;
281
    struct hmap ports;
282
    struct seq *port_seq;       /* Incremented whenever a port changes. */
283
284
    /* The time that a packet can wait in output batch for sending. */
285
    atomic_uint32_t tx_flush_interval;
286
287
    /* Meters. */
288
    struct ovs_mutex meters_lock;
289
    struct cmap meters OVS_GUARDED;
290
291
    /* Probability of EMC insertions is a factor of 'emc_insert_min'. */
292
    atomic_uint32_t emc_insert_min;
293
    /* Enable collection of PMD performance metrics. */
294
    atomic_bool pmd_perf_metrics;
295
    /* Default max load based sleep request. */
296
    uint64_t pmd_max_sleep_default;
297
    /* Enable the SMC cache from the ovsdb config. */
298
    atomic_bool smc_enable_db;
299
300
    /* Protects access to ofproto-dpif-upcall interface during revalidator
301
     * thread synchronization. */
302
    struct fat_rwlock upcall_rwlock;
303
    upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
304
    void *upcall_aux;
305
306
    /* Callback function for notifying the purging of dp flows (during
307
     * pmd reset or deletion). */
308
    dp_purge_callback *dp_purge_cb;
309
    void *dp_purge_aux;
310
311
    /* Stores all 'struct dp_netdev_pmd_thread's. */
312
    struct cmap poll_threads;
313
    /* ID pool for per-thread static_tx_qid. */
314
    struct id_pool *tx_qid_pool;
315
    struct ovs_mutex tx_qid_pool_mutex;
316
    /* Rxq to pmd assignment type. */
317
    enum sched_assignment_type pmd_rxq_assign_type;
318
    bool pmd_iso;
319
320
    /* Protects access to the 'struct dp_netdev_pmd_thread'
321
     * instance for the non-pmd thread. */
322
    struct ovs_mutex non_pmd_mutex;
323
324
    /* Each pmd thread will store its pointer to
325
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
326
    ovsthread_key_t per_pmd_key;
327
328
    struct seq *reconfigure_seq;
329
    uint64_t last_reconfigure_seq;
330
331
    /* Cpu mask for pin of pmd threads. */
332
    char *pmd_cmask;
333
334
    /* PMD max load based sleep request user string. */
335
    char *max_sleep_list;
336
337
    uint64_t last_tnl_conf_seq;
338
339
    struct conntrack *conntrack;
340
    struct pmd_auto_lb pmd_alb;
341
342
    /* Bonds. */
343
    struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
344
    struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
345
};
346
347
static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
348
                                                    odp_port_t)
349
    OVS_REQ_RDLOCK(dp->port_rwlock);
350
351
enum rxq_cycles_counter_type {
352
    RXQ_CYCLES_PROC_CURR,       /* Cycles spent successfully polling and
353
                                   processing packets during the current
354
                                   interval. */
355
    RXQ_CYCLES_PROC_HIST,       /* Total cycles of all intervals that are used
356
                                   during rxq to pmd assignment. */
357
    RXQ_N_CYCLES
358
};
359
360
enum dp_offload_type {
361
    DP_OFFLOAD_FLOW,
362
    DP_OFFLOAD_FLUSH,
363
};
364
365
enum {
366
    DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
367
    DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
368
    DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
369
};
370
371
struct dp_offload_flow_item {
372
    struct dp_netdev_flow *flow;
373
    int op;
374
    struct match match;
375
    struct nlattr *actions;
376
    size_t actions_len;
377
    odp_port_t orig_in_port; /* Originating in_port for tnl flows. */
378
};
379
380
struct dp_offload_flush_item {
381
    struct netdev *netdev;
382
    struct ovs_barrier *barrier;
383
};
384
385
union dp_offload_thread_data {
386
    struct dp_offload_flow_item flow;
387
    struct dp_offload_flush_item flush;
388
};
389
390
struct dp_offload_thread_item {
391
    struct mpsc_queue_node node;
392
    enum dp_offload_type type;
393
    long long int timestamp;
394
    struct dp_netdev *dp;
395
    union dp_offload_thread_data data[0];
396
};
397
398
struct dp_offload_thread {
399
    PADDED_MEMBERS(CACHE_LINE_SIZE,
400
        struct mpsc_queue queue;
401
        atomic_uint64_t enqueued_item;
402
        struct cmap megaflow_to_mark;
403
        struct cmap mark_to_flow;
404
        struct mov_avg_cma cma;
405
        struct mov_avg_ema ema;
406
    );
407
};
408
static struct dp_offload_thread *dp_offload_threads;
409
static void *dp_netdev_flow_offload_main(void *arg);
410
411
static void
412
dp_netdev_offload_init(void)
413
0
{
414
0
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
415
0
    unsigned int nb_offload_thread = netdev_offload_thread_nb();
416
0
    unsigned int tid;
417
418
0
    if (!ovsthread_once_start(&once)) {
419
0
        return;
420
0
    }
421
422
0
    dp_offload_threads = xcalloc(nb_offload_thread,
423
0
                                 sizeof *dp_offload_threads);
424
425
0
    for (tid = 0; tid < nb_offload_thread; tid++) {
426
0
        struct dp_offload_thread *thread;
427
428
0
        thread = &dp_offload_threads[tid];
429
0
        mpsc_queue_init(&thread->queue);
430
0
        cmap_init(&thread->megaflow_to_mark);
431
0
        cmap_init(&thread->mark_to_flow);
432
0
        atomic_init(&thread->enqueued_item, 0);
433
0
        mov_avg_cma_init(&thread->cma);
434
0
        mov_avg_ema_init(&thread->ema, 100);
435
0
        ovs_thread_create("hw_offload", dp_netdev_flow_offload_main, thread);
436
0
    }
437
438
0
    ovsthread_once_done(&once);
439
0
}
440
441
0
#define XPS_TIMEOUT 500000LL    /* In microseconds. */
442
443
/* Contained by struct dp_netdev_port's 'rxqs' member.  */
444
struct dp_netdev_rxq {
445
    struct dp_netdev_port *port;
446
    struct netdev_rxq *rx;
447
    unsigned core_id;                  /* Core to which this queue should be
448
                                          pinned. OVS_CORE_UNSPEC if the
449
                                          queue doesn't need to be pinned to a
450
                                          particular core. */
451
    atomic_count intrvl_idx;           /* Write index for 'cycles_intrvl'. */
452
    struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this queue. */
453
    bool is_vhost;                     /* Is rxq of a vhost port. */
454
455
    /* Counters of cycles spent successfully polling and processing pkts. */
456
    atomic_ullong cycles[RXQ_N_CYCLES];
457
    /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then
458
       sum them to yield the cycles used for an rxq. */
459
    atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX];
460
};
461
462
enum txq_req_mode {
463
    TXQ_REQ_MODE_THREAD,
464
    TXQ_REQ_MODE_HASH,
465
};
466
467
enum txq_mode {
468
    TXQ_MODE_STATIC,
469
    TXQ_MODE_XPS,
470
    TXQ_MODE_XPS_HASH,
471
};
472
473
/* A port in a netdev-based datapath. */
474
struct dp_netdev_port {
475
    odp_port_t port_no;
476
    enum txq_mode txq_mode;     /* static, XPS, XPS_HASH. */
477
    bool need_reconfigure;      /* True if we should reconfigure netdev. */
478
    struct netdev *netdev;
479
    struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
480
    struct netdev_saved_flags *sf;
481
    struct dp_netdev_rxq *rxqs;
482
    unsigned n_rxq;             /* Number of elements in 'rxqs' */
483
    unsigned *txq_used;         /* Number of threads that use each tx queue. */
484
    struct ovs_mutex txq_used_mutex;
485
    bool emc_enabled;           /* If true EMC will be used. */
486
    char *type;                 /* Port type as requested by user. */
487
    char *rxq_affinity_list;    /* Requested affinity of rx queues. */
488
    enum txq_req_mode txq_requested_mode;
489
};
490
491
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
492
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
493
                                         struct flow *, bool);
494
495
struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
496
                                                   size_t);
497
struct dp_netdev_actions *dp_netdev_flow_get_actions(
498
    const struct dp_netdev_flow *);
499
static void dp_netdev_actions_free(struct dp_netdev_actions *);
500
501
struct polled_queue {
502
    struct dp_netdev_rxq *rxq;
503
    odp_port_t port_no;
504
    bool emc_enabled;
505
    bool rxq_enabled;
506
    uint64_t change_seq;
507
};
508
509
/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
510
struct rxq_poll {
511
    struct dp_netdev_rxq *rxq;
512
    struct hmap_node node;
513
};
514
515
/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
516
 * 'tnl_port_cache' or 'tx_ports'. */
517
struct tx_port {
518
    struct dp_netdev_port *port;
519
    int qid;
520
    long long last_used;
521
    struct hmap_node node;
522
    long long flush_time;
523
    struct dp_packet_batch output_pkts;
524
    struct dp_packet_batch *txq_pkts; /* Only for hash mode. */
525
    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
526
};
527
528
/* Contained by struct tx_bond 'member_buckets'. */
529
struct member_entry {
530
    odp_port_t member_id;
531
    atomic_ullong n_packets;
532
    atomic_ullong n_bytes;
533
};
534
535
/* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */
536
struct tx_bond {
537
    struct cmap_node node;
538
    uint32_t bond_id;
539
    struct member_entry member_buckets[BOND_BUCKETS];
540
};
541
542
/* Interface to netdev-based datapath. */
543
struct dpif_netdev {
544
    struct dpif dpif;
545
    struct dp_netdev *dp;
546
    uint64_t last_port_seq;
547
};
548
549
static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
550
                              struct dp_netdev_port **portp)
551
    OVS_REQ_RDLOCK(dp->port_rwlock);
552
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
553
                            struct dp_netdev_port **portp)
554
    OVS_REQ_RDLOCK(dp->port_rwlock);
555
static void dp_netdev_free(struct dp_netdev *)
556
    OVS_REQUIRES(dp_netdev_mutex);
557
static int do_add_port(struct dp_netdev *dp, const char *devname,
558
                       const char *type, odp_port_t port_no)
559
    OVS_REQ_WRLOCK(dp->port_rwlock);
560
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
561
    OVS_REQ_WRLOCK(dp->port_rwlock);
562
static int dpif_netdev_open(const struct dpif_class *, const char *name,
563
                            bool create, struct dpif **);
564
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
565
                                      struct dp_packet_batch *,
566
                                      bool should_steal,
567
                                      const struct flow *flow,
568
                                      const struct nlattr *actions,
569
                                      size_t actions_len);
570
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
571
                                  struct dp_packet_batch *);
572
573
static void dp_netdev_disable_upcall(struct dp_netdev *);
574
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
575
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
576
                                    struct dp_netdev *dp, unsigned core_id,
577
                                    int numa_id);
578
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
579
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
580
    OVS_REQ_WRLOCK(dp->port_rwlock);
581
582
static void *pmd_thread_main(void *);
583
static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
584
                                                      unsigned core_id);
585
static struct dp_netdev_pmd_thread *
586
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
587
static void dp_netdev_del_pmd(struct dp_netdev *dp,
588
                              struct dp_netdev_pmd_thread *pmd);
589
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
590
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
591
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
592
                                         struct dp_netdev_port *port)
593
    OVS_REQUIRES(pmd->port_mutex);
594
static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
595
                                           struct tx_port *tx)
596
    OVS_REQUIRES(pmd->port_mutex);
597
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
598
                                     struct dp_netdev_rxq *rxq)
599
    OVS_REQUIRES(pmd->port_mutex);
600
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
601
                                       struct rxq_poll *poll)
602
    OVS_REQUIRES(pmd->port_mutex);
603
static int
604
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
605
                                   bool force);
606
static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
607
                                         struct tx_bond *bond, bool update)
608
    OVS_EXCLUDED(pmd->bond_mutex);
609
static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
610
                                           uint32_t bond_id)
611
    OVS_EXCLUDED(pmd->bond_mutex);
612
613
static void dp_netdev_offload_flush(struct dp_netdev *dp,
614
                                    struct dp_netdev_port *port);
615
616
static void reconfigure_datapath(struct dp_netdev *dp)
617
    OVS_REQ_RDLOCK(dp->port_rwlock);
618
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
619
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
620
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
621
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
622
    OVS_REQUIRES(pmd->port_mutex);
623
static inline void
624
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
625
                           struct polled_queue *poll_list, int poll_cnt);
626
static void
627
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
628
                         enum rxq_cycles_counter_type type,
629
                         unsigned long long cycles);
630
static uint64_t
631
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
632
                         enum rxq_cycles_counter_type type);
633
static void
634
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
635
                           unsigned long long cycles);
636
static uint64_t
637
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
638
static uint64_t
639
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
640
                    int num_to_read);
641
static void
642
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
643
                               bool purge);
644
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
645
                                      struct tx_port *tx);
646
inline struct dpcls *
647
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
648
                           odp_port_t in_port);
649
650
static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
651
static inline bool
652
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
653
static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
654
                                  struct dp_netdev_flow *flow);
655
656
static void dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
657
                                          struct dp_netdev_flow *flow)
658
    OVS_REQUIRES(pmd->flow_mutex);
659
static void dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
660
                                          struct dp_netdev_flow *flow)
661
    OVS_REQUIRES(pmd->flow_mutex);
662
663
static bool dp_netdev_flow_is_simple_match(const struct match *);
664
665
/* Updates the time in the PMD thread's context and should be called in three cases:
666
 *
667
 *     1. PMD structure initialization:
668
 *         - dp_netdev_configure_pmd()
669
 *
670
 *     2. Before processing of the new packet batch:
671
 *         - dpif_netdev_execute()
672
 *         - dp_netdev_process_rxq_port()
673
 *
674
 *     3. At least once per polling iteration in main polling threads if no
675
 *        packets received on current iteration:
676
 *         - dpif_netdev_run()
677
 *         - pmd_thread_main()
678
 *
679
 * 'pmd->ctx.now' should be used without update in all other cases if possible.
680
 */
681
static inline void
682
pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
683
0
{
684
0
    pmd->ctx.now = time_usec();
685
0
}
686
687
/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
688
bool
689
dpif_is_netdev(const struct dpif *dpif)
690
0
{
691
0
    return dpif->dpif_class->open == dpif_netdev_open;
692
0
}
693
694
static struct dpif_netdev *
695
dpif_netdev_cast(const struct dpif *dpif)
696
0
{
697
0
    ovs_assert(dpif_is_netdev(dpif));
698
0
    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
699
0
}
700
701
static struct dp_netdev *
702
get_dp_netdev(const struct dpif *dpif)
703
0
{
704
0
    return dpif_netdev_cast(dpif)->dp;
705
0
}
706

707
enum pmd_info_type {
708
    PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
709
    PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
710
    PMD_INFO_SHOW_RXQ,    /* Show poll lists of pmd threads. */
711
    PMD_INFO_PERF_SHOW,   /* Show pmd performance details. */
712
    PMD_INFO_SLEEP_SHOW,  /* Show max sleep configuration details. */
713
};
714
715
static void
716
format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
717
0
{
718
0
    ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
719
0
                        ? "main thread" : "pmd thread");
720
0
    if (pmd->numa_id != OVS_NUMA_UNSPEC) {
721
0
        ds_put_format(reply, " numa_id %d", pmd->numa_id);
722
0
    }
723
0
    if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
724
0
        ds_put_format(reply, " core_id %u", pmd->core_id);
725
0
    }
726
0
    ds_put_cstr(reply, ":\n");
727
0
}
728
729
static void
730
pmd_info_show_stats(struct ds *reply,
731
                    struct dp_netdev_pmd_thread *pmd)
732
0
{
733
0
    uint64_t stats[PMD_N_STATS];
734
0
    uint64_t total_cycles, total_packets;
735
0
    double passes_per_pkt = 0;
736
0
    double lookups_per_hit = 0;
737
0
    double packets_per_batch = 0;
738
739
0
    pmd_perf_read_counters(&pmd->perf_stats, stats);
740
0
    total_cycles = stats[PMD_CYCLES_ITER_IDLE]
741
0
                         + stats[PMD_CYCLES_ITER_BUSY];
742
0
    total_packets = stats[PMD_STAT_RECV];
743
744
0
    format_pmd_thread(reply, pmd);
745
746
0
    if (total_packets > 0) {
747
0
        passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
748
0
                            / (double) total_packets;
749
0
    }
750
0
    if (stats[PMD_STAT_MASKED_HIT] > 0) {
751
0
        lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
752
0
                            / (double) stats[PMD_STAT_MASKED_HIT];
753
0
    }
754
0
    if (stats[PMD_STAT_SENT_BATCHES] > 0) {
755
0
        packets_per_batch = stats[PMD_STAT_SENT_PKTS]
756
0
                            / (double) stats[PMD_STAT_SENT_BATCHES];
757
0
    }
758
759
0
    ds_put_format(reply,
760
0
                  "  packets received: %"PRIu64"\n"
761
0
                  "  packet recirculations: %"PRIu64"\n"
762
0
                  "  avg. datapath passes per packet: %.02f\n"
763
0
                  "  phwol hits: %"PRIu64"\n"
764
0
                  "  mfex opt hits: %"PRIu64"\n"
765
0
                  "  simple match hits: %"PRIu64"\n"
766
0
                  "  emc hits: %"PRIu64"\n"
767
0
                  "  smc hits: %"PRIu64"\n"
768
0
                  "  megaflow hits: %"PRIu64"\n"
769
0
                  "  avg. subtable lookups per megaflow hit: %.02f\n"
770
0
                  "  miss with success upcall: %"PRIu64"\n"
771
0
                  "  miss with failed upcall: %"PRIu64"\n"
772
0
                  "  avg. packets per output batch: %.02f\n",
773
0
                  total_packets, stats[PMD_STAT_RECIRC],
774
0
                  passes_per_pkt, stats[PMD_STAT_PHWOL_HIT],
775
0
                  stats[PMD_STAT_MFEX_OPT_HIT],
776
0
                  stats[PMD_STAT_SIMPLE_HIT],
777
0
                  stats[PMD_STAT_EXACT_HIT],
778
0
                  stats[PMD_STAT_SMC_HIT],
779
0
                  stats[PMD_STAT_MASKED_HIT],
780
0
                  lookups_per_hit, stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
781
0
                  packets_per_batch);
782
783
0
    if (total_cycles == 0) {
784
0
        return;
785
0
    }
786
787
0
    ds_put_format(reply,
788
0
                  "  idle cycles: %"PRIu64" (%.02f%%)\n"
789
0
                  "  processing cycles: %"PRIu64" (%.02f%%)\n",
790
0
                  stats[PMD_CYCLES_ITER_IDLE],
791
0
                  stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
792
0
                  stats[PMD_CYCLES_ITER_BUSY],
793
0
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
794
795
0
    if (total_packets == 0) {
796
0
        return;
797
0
    }
798
799
0
    ds_put_format(reply,
800
0
                  "  avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
801
0
                  total_cycles / (double) total_packets,
802
0
                  total_cycles, total_packets);
803
804
0
    ds_put_format(reply,
805
0
                  "  avg processing cycles per packet: "
806
0
                  "%.02f (%"PRIu64"/%"PRIu64")\n",
807
0
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
808
0
                  stats[PMD_CYCLES_ITER_BUSY], total_packets);
809
0
}
810
811
static void
812
pmd_info_show_perf(struct ds *reply,
813
                   struct dp_netdev_pmd_thread *pmd,
814
                   struct pmd_perf_params *par)
815
0
{
816
0
    if (pmd->core_id != NON_PMD_CORE_ID) {
817
0
        char *time_str =
818
0
                xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
819
0
        long long now = time_msec();
820
0
        double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
821
822
0
        ds_put_cstr(reply, "\n");
823
0
        ds_put_format(reply, "Time: %s\n", time_str);
824
0
        ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
825
0
        ds_put_cstr(reply, "\n");
826
0
        format_pmd_thread(reply, pmd);
827
0
        ds_put_cstr(reply, "\n");
828
0
        pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
829
0
        if (pmd_perf_metrics_enabled(pmd)) {
830
            /* Prevent parallel clearing of perf metrics. */
831
0
            ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
832
0
            if (par->histograms) {
833
0
                ds_put_cstr(reply, "\n");
834
0
                pmd_perf_format_histograms(reply, &pmd->perf_stats);
835
0
            }
836
0
            if (par->iter_hist_len > 0) {
837
0
                ds_put_cstr(reply, "\n");
838
0
                pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
839
0
                        par->iter_hist_len);
840
0
            }
841
0
            if (par->ms_hist_len > 0) {
842
0
                ds_put_cstr(reply, "\n");
843
0
                pmd_perf_format_ms_history(reply, &pmd->perf_stats,
844
0
                        par->ms_hist_len);
845
0
            }
846
0
            ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
847
0
        }
848
0
        free(time_str);
849
0
    }
850
0
}
851
852
static int
853
compare_poll_list(const void *a_, const void *b_)
854
0
{
855
0
    const struct rxq_poll *a = a_;
856
0
    const struct rxq_poll *b = b_;
857
858
0
    const char *namea = netdev_rxq_get_name(a->rxq->rx);
859
0
    const char *nameb = netdev_rxq_get_name(b->rxq->rx);
860
861
0
    int cmp = strcmp(namea, nameb);
862
0
    if (!cmp) {
863
0
        return netdev_rxq_get_queue_id(a->rxq->rx)
864
0
               - netdev_rxq_get_queue_id(b->rxq->rx);
865
0
    } else {
866
0
        return cmp;
867
0
    }
868
0
}
869
870
static void
871
sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
872
                 size_t *n)
873
    OVS_REQUIRES(pmd->port_mutex)
874
0
{
875
0
    struct rxq_poll *ret, *poll;
876
0
    size_t i;
877
878
0
    *n = hmap_count(&pmd->poll_list);
879
0
    if (!*n) {
880
0
        ret = NULL;
881
0
    } else {
882
0
        ret = xcalloc(*n, sizeof *ret);
883
0
        i = 0;
884
0
        HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
885
0
            ret[i] = *poll;
886
0
            i++;
887
0
        }
888
0
        ovs_assert(i == *n);
889
0
        qsort(ret, *n, sizeof *ret, compare_poll_list);
890
0
    }
891
892
0
    *list = ret;
893
0
}
894
895
static void
896
pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd,
897
                  int secs)
898
0
{
899
0
    if (pmd->core_id != NON_PMD_CORE_ID) {
900
0
        struct rxq_poll *list;
901
0
        size_t n_rxq;
902
0
        uint64_t total_pmd_cycles = 0;
903
0
        uint64_t busy_pmd_cycles = 0;
904
0
        uint64_t total_rxq_proc_cycles = 0;
905
0
        unsigned int intervals;
906
907
0
        ds_put_format(reply,
908
0
                      "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
909
0
                      pmd->numa_id, pmd->core_id, (pmd->isolated)
910
0
                                                  ? "true" : "false");
911
912
0
        ovs_mutex_lock(&pmd->port_mutex);
913
0
        sorted_poll_list(pmd, &list, &n_rxq);
914
915
        /* Get the total pmd cycles for an interval. */
916
0
        atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles);
917
        /* Calculate how many intervals are to be used. */
918
0
        intervals = DIV_ROUND_UP(secs,
919
0
                                 PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
920
        /* Estimate the cycles to cover all intervals. */
921
0
        total_pmd_cycles *= intervals;
922
0
        busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl,
923
0
                                              &pmd->intrvl_idx,
924
0
                                              intervals);
925
0
        if (busy_pmd_cycles > total_pmd_cycles) {
926
0
            busy_pmd_cycles = total_pmd_cycles;
927
0
        }
928
929
0
        for (int i = 0; i < n_rxq; i++) {
930
0
            struct dp_netdev_rxq *rxq = list[i].rxq;
931
0
            const char *name = netdev_rxq_get_name(rxq->rx);
932
0
            uint64_t rxq_proc_cycles = 0;
933
934
0
            rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl,
935
0
                                                  &rxq->intrvl_idx,
936
0
                                                  intervals);
937
0
            total_rxq_proc_cycles += rxq_proc_cycles;
938
0
            ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
939
0
                          netdev_rxq_get_queue_id(list[i].rxq->rx));
940
0
            ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
941
0
                                        ? "(enabled) " : "(disabled)");
942
0
            ds_put_format(reply, "  pmd usage: ");
943
0
            if (total_pmd_cycles) {
944
0
                ds_put_format(reply, "%2.0f %%",
945
0
                              (double) (rxq_proc_cycles * 100) /
946
0
                              total_pmd_cycles);
947
0
            } else {
948
0
                ds_put_format(reply, "%s", "NOT AVAIL");
949
0
            }
950
0
            ds_put_cstr(reply, "\n");
951
0
        }
952
953
0
        if (n_rxq > 0) {
954
0
            ds_put_cstr(reply, "  overhead: ");
955
0
            if (total_pmd_cycles) {
956
0
                uint64_t overhead_cycles = 0;
957
958
0
                if (total_rxq_proc_cycles < busy_pmd_cycles) {
959
0
                    overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles;
960
0
                }
961
962
0
                ds_put_format(reply, "%2.0f %%",
963
0
                              (double) (overhead_cycles * 100) /
964
0
                              total_pmd_cycles);
965
0
            } else {
966
0
                ds_put_cstr(reply, "NOT AVAIL");
967
0
            }
968
0
            ds_put_cstr(reply, "\n");
969
0
        }
970
971
0
        ovs_mutex_unlock(&pmd->port_mutex);
972
0
        free(list);
973
0
    }
974
0
}
975
976
static int
977
compare_poll_thread_list(const void *a_, const void *b_)
978
0
{
979
0
    const struct dp_netdev_pmd_thread *a, *b;
980
981
0
    a = *(struct dp_netdev_pmd_thread **)a_;
982
0
    b = *(struct dp_netdev_pmd_thread **)b_;
983
984
0
    if (a->core_id < b->core_id) {
985
0
        return -1;
986
0
    }
987
0
    if (a->core_id > b->core_id) {
988
0
        return 1;
989
0
    }
990
0
    return 0;
991
0
}
992
993
/* Create a sorted list of pmds from the dp->poll_threads cmap. We can use
994
 * this list, as long as we do not go to quiescent state. */
995
static void
996
sorted_poll_thread_list(struct dp_netdev *dp,
997
                        struct dp_netdev_pmd_thread ***list,
998
                        size_t *n)
999
0
{
1000
0
    struct dp_netdev_pmd_thread *pmd;
1001
0
    struct dp_netdev_pmd_thread **pmd_list;
1002
0
    size_t k = 0, n_pmds;
1003
1004
0
    n_pmds = cmap_count(&dp->poll_threads);
1005
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1006
1007
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1008
0
        if (k >= n_pmds) {
1009
0
            break;
1010
0
        }
1011
0
        pmd_list[k++] = pmd;
1012
0
    }
1013
1014
0
    qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1015
1016
0
    *list = pmd_list;
1017
0
    *n = k;
1018
0
}
1019
1020
static void
1021
dpif_netdev_subtable_lookup_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1022
                                const char *argv[] OVS_UNUSED,
1023
                                void *aux OVS_UNUSED)
1024
0
{
1025
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1026
1027
0
    dpcls_impl_print_stats(&reply);
1028
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1029
0
    ds_destroy(&reply);
1030
0
}
1031
1032
static void
1033
dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc OVS_UNUSED,
1034
                                const char *argv[], void *aux OVS_UNUSED)
1035
0
{
1036
    /* This function requires 2 parameters (argv[1] and argv[2]) to execute.
1037
     *   argv[1] is subtable name
1038
     *   argv[2] is priority
1039
     */
1040
0
    const char *func_name = argv[1];
1041
1042
0
    errno = 0;
1043
0
    char *err_char;
1044
0
    uint32_t new_prio = strtoul(argv[2], &err_char, 10);
1045
0
    uint32_t lookup_dpcls_changed = 0;
1046
0
    uint32_t lookup_subtable_changed = 0;
1047
0
    struct shash_node *node;
1048
0
    if (errno != 0 || new_prio > UINT8_MAX) {
1049
0
        unixctl_command_reply_error(conn,
1050
0
            "error converting priority, use integer in range 0-255\n");
1051
0
        return;
1052
0
    }
1053
1054
0
    int32_t err = dpcls_subtable_set_prio(func_name, new_prio);
1055
0
    if (err) {
1056
0
        unixctl_command_reply_error(conn,
1057
0
            "error, subtable lookup function not found\n");
1058
0
        return;
1059
0
    }
1060
1061
0
    ovs_mutex_lock(&dp_netdev_mutex);
1062
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1063
0
        struct dp_netdev *dp = node->data;
1064
1065
        /* Get PMD threads list, required to get DPCLS instances. */
1066
0
        size_t n;
1067
0
        struct dp_netdev_pmd_thread **pmd_list;
1068
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1069
1070
        /* Take the port rwlock, as HMAP_FOR_EACH iterates over the ports. */
1071
0
        ovs_rwlock_rdlock(&dp->port_rwlock);
1072
1073
0
        for (size_t i = 0; i < n; i++) {
1074
0
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1075
0
            if (pmd->core_id == NON_PMD_CORE_ID) {
1076
0
                continue;
1077
0
            }
1078
1079
0
            struct dp_netdev_port *port = NULL;
1080
0
            HMAP_FOR_EACH (port, node, &dp->ports) {
1081
0
                odp_port_t in_port = port->port_no;
1082
0
                struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1083
0
                if (!cls) {
1084
0
                    continue;
1085
0
                }
1086
0
                ovs_mutex_lock(&pmd->flow_mutex);
1087
0
                uint32_t subtbl_changes = dpcls_subtable_lookup_reprobe(cls);
1088
0
                ovs_mutex_unlock(&pmd->flow_mutex);
1089
0
                if (subtbl_changes) {
1090
0
                    lookup_dpcls_changed++;
1091
0
                    lookup_subtable_changed += subtbl_changes;
1092
0
                }
1093
0
            }
1094
0
        }
1095
1096
        /* Release the port rwlock before the dp_netdev mutex. */
1097
0
        ovs_rwlock_unlock(&dp->port_rwlock);
1098
0
        free(pmd_list);
1099
0
    }
1100
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1101
1102
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1103
0
    ds_put_format(&reply,
1104
0
        "Lookup priority change affected %d dpcls ports and %d subtables.\n",
1105
0
        lookup_dpcls_changed, lookup_subtable_changed);
1106
0
    const char *reply_str = ds_cstr(&reply);
1107
0
    unixctl_command_reply(conn, reply_str);
1108
0
    VLOG_INFO("%s", reply_str);
1109
0
    ds_destroy(&reply);
1110
0
}
1111
1112
static void
1113
dpif_netdev_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1114
                     const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
1115
0
{
1116
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1117
0
    struct shash_node *node;
1118
1119
0
    ovs_mutex_lock(&dp_netdev_mutex);
1120
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1121
0
        struct dp_netdev_pmd_thread **pmd_list;
1122
0
        struct dp_netdev *dp = node->data;
1123
0
        size_t n;
1124
1125
        /* Get PMD threads list, required to get the DPIF impl used by each PMD
1126
         * thread. */
1127
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1128
0
        dp_netdev_impl_get(&reply, pmd_list, n);
1129
0
        free(pmd_list);
1130
0
    }
1131
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1132
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1133
0
    ds_destroy(&reply);
1134
0
}
1135
1136
static void
1137
dpif_netdev_impl_set(struct unixctl_conn *conn, int argc OVS_UNUSED,
1138
                     const char *argv[], void *aux OVS_UNUSED)
1139
0
{
1140
    /* This function requires just one parameter, the DPIF name. */
1141
0
    const char *dpif_name = argv[1];
1142
0
    struct shash_node *node;
1143
1144
0
    static const char *error_description[2] = {
1145
0
        "Unknown DPIF implementation",
1146
0
        "CPU doesn't support the required instruction for",
1147
0
    };
1148
1149
0
    ovs_mutex_lock(&dp_netdev_mutex);
1150
0
    int32_t err = dp_netdev_impl_set_default_by_name(dpif_name);
1151
1152
0
    if (err) {
1153
0
        struct ds reply = DS_EMPTY_INITIALIZER;
1154
0
        ds_put_format(&reply, "DPIF implementation not available: %s %s.\n",
1155
0
                      error_description[ (err == -ENOTSUP) ], dpif_name);
1156
0
        const char *reply_str = ds_cstr(&reply);
1157
0
        unixctl_command_reply_error(conn, reply_str);
1158
0
        VLOG_ERR("%s", reply_str);
1159
0
        ds_destroy(&reply);
1160
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1161
0
        return;
1162
0
    }
1163
1164
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1165
0
        struct dp_netdev *dp = node->data;
1166
1167
        /* Get PMD threads list, required to get DPCLS instances. */
1168
0
        size_t n;
1169
0
        struct dp_netdev_pmd_thread **pmd_list;
1170
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1171
1172
0
        for (size_t i = 0; i < n; i++) {
1173
0
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1174
0
            if (pmd->core_id == NON_PMD_CORE_ID) {
1175
0
                continue;
1176
0
            }
1177
1178
            /* Initialize DPIF function pointer to the newly configured
1179
             * default. */
1180
0
            atomic_store_relaxed(&pmd->netdev_input_func,
1181
0
                                 dp_netdev_impl_get_default());
1182
0
        };
1183
1184
0
        free(pmd_list);
1185
0
    }
1186
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1187
1188
    /* Reply with success to command. */
1189
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1190
0
    ds_put_format(&reply, "DPIF implementation set to %s.\n", dpif_name);
1191
0
    const char *reply_str = ds_cstr(&reply);
1192
0
    unixctl_command_reply(conn, reply_str);
1193
0
    VLOG_INFO("%s", reply_str);
1194
0
    ds_destroy(&reply);
1195
0
}
1196
1197
static void
1198
dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1199
                               const char *argv[] OVS_UNUSED,
1200
                               void *aux OVS_UNUSED)
1201
0
{
1202
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1203
0
    struct shash_node *node;
1204
1205
0
    ovs_mutex_lock(&dp_netdev_mutex);
1206
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1207
0
        struct dp_netdev_pmd_thread **pmd_list;
1208
0
        struct dp_netdev *dp = node->data;
1209
0
        size_t n;
1210
1211
        /* Get PMD threads list, required to get the DPIF impl used by each PMD
1212
         * thread. */
1213
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1214
0
        dp_mfex_impl_get(&reply, pmd_list, n);
1215
0
        free(pmd_list);
1216
0
    }
1217
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1218
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1219
0
    ds_destroy(&reply);
1220
0
}
1221
1222
static void
1223
dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, int argc,
1224
                               const char *argv[], void *aux OVS_UNUSED)
1225
0
{
1226
    /* This command takes some optional and mandatory arguments. The function
1227
     * here first parses all of the options, saving results in local variables.
1228
     * Then the parsed values are acted on.
1229
     */
1230
0
    unsigned int pmd_thread_to_change = NON_PMD_CORE_ID;
1231
0
    unsigned int study_count = MFEX_MAX_PKT_COUNT;
1232
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1233
0
    bool pmd_thread_update_done = false;
1234
0
    bool mfex_name_is_study = false;
1235
0
    const char *mfex_name = NULL;
1236
0
    const char *reply_str = NULL;
1237
0
    struct shash_node *node;
1238
0
    int err;
1239
1240
0
    while (argc > 1) {
1241
        /* Optional argument "-pmd" limits the commands actions to just this
1242
         * PMD thread.
1243
         */
1244
0
        if ((!strcmp(argv[1], "-pmd") && !mfex_name)) {
1245
0
            if (argc < 3) {
1246
0
                ds_put_format(&reply,
1247
0
                              "Error: -pmd option requires a thread id"
1248
0
                              " argument.\n");
1249
0
                goto error;
1250
0
            }
1251
1252
            /* Ensure argument can be parsed to an integer. */
1253
0
            if (!str_to_uint(argv[2], 10, &pmd_thread_to_change) ||
1254
0
                (pmd_thread_to_change == NON_PMD_CORE_ID)) {
1255
0
                ds_put_format(&reply,
1256
0
                              "Error: miniflow extract parser not changed,"
1257
0
                              " PMD thread passed is not valid: '%s'."
1258
0
                              " Pass a valid pmd thread ID.\n",
1259
0
                              argv[2]);
1260
0
                goto error;
1261
0
            }
1262
1263
0
            argc -= 2;
1264
0
            argv += 2;
1265
1266
0
        } else if (!mfex_name) {
1267
            /* Name of MFEX impl requested by user. */
1268
0
            mfex_name = argv[1];
1269
0
            mfex_name_is_study = strcmp("study", mfex_name) == 0;
1270
0
            argc -= 1;
1271
0
            argv += 1;
1272
1273
        /* If name is study and more args exist, parse study_count value. */
1274
0
        } else if (mfex_name && mfex_name_is_study) {
1275
0
            if (!str_to_uint(argv[1], 10, &study_count) ||
1276
0
                (study_count == 0)) {
1277
0
                ds_put_format(&reply,
1278
0
                              "Error: invalid study_pkt_cnt value: %s.\n",
1279
0
                              argv[1]);
1280
0
                goto error;
1281
0
            }
1282
1283
0
            argc -= 1;
1284
0
            argv += 1;
1285
0
        } else {
1286
0
            ds_put_format(&reply, "Error: unknown argument %s.\n", argv[1]);
1287
0
            goto error;
1288
0
        }
1289
0
    }
1290
1291
    /* Ensure user passed an MFEX name. */
1292
0
    if (!mfex_name) {
1293
0
        ds_put_format(&reply, "Error: no miniflow extract name provided."
1294
0
                      " Output of miniflow-parser-get shows implementation"
1295
0
                      " list.\n");
1296
0
        goto error;
1297
0
    }
1298
1299
    /* If the MFEX name is "study", set the study packet count. */
1300
0
    if (mfex_name_is_study) {
1301
0
        err = mfex_set_study_pkt_cnt(study_count, mfex_name);
1302
0
        if (err) {
1303
0
            ds_put_format(&reply, "Error: failed to set study count %d for"
1304
0
                          " miniflow extract implementation %s.\n",
1305
0
                          study_count, mfex_name);
1306
0
            goto error;
1307
0
        }
1308
0
    }
1309
1310
    /* Set the default MFEX impl only if the command was applied to all PMD
1311
     * threads. If a PMD thread was selected, do NOT update the default.
1312
     */
1313
0
    if (pmd_thread_to_change == NON_PMD_CORE_ID) {
1314
0
        err = dp_mfex_impl_set_default_by_name(mfex_name);
1315
0
        if (err == -ENODEV) {
1316
0
            ds_put_format(&reply,
1317
0
                          "Error: miniflow extract not available due to CPU"
1318
0
                          " ISA requirements: %s",
1319
0
                          mfex_name);
1320
0
            goto error;
1321
0
        } else if (err) {
1322
0
            ds_put_format(&reply,
1323
0
                          "Error: unknown miniflow extract implementation %s.",
1324
0
                          mfex_name);
1325
0
            goto error;
1326
0
        }
1327
0
    }
1328
1329
    /* Get the desired MFEX function pointer and error check its usage. */
1330
0
    miniflow_extract_func mfex_func = NULL;
1331
0
    err = dp_mfex_impl_get_by_name(mfex_name, &mfex_func);
1332
0
    if (err) {
1333
0
        if (err == -ENODEV) {
1334
0
            ds_put_format(&reply,
1335
0
                          "Error: miniflow extract not available due to CPU"
1336
0
                          " ISA requirements: %s", mfex_name);
1337
0
        } else {
1338
0
            ds_put_format(&reply,
1339
0
                          "Error: unknown miniflow extract implementation %s.",
1340
0
                          mfex_name);
1341
0
        }
1342
0
        goto error;
1343
0
    }
1344
1345
    /* Apply the MFEX pointer to each pmd thread in each netdev, filtering
1346
     * by the user's "-pmd" argument if required.
1347
     */
1348
0
    ovs_mutex_lock(&dp_netdev_mutex);
1349
1350
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1351
0
        struct dp_netdev_pmd_thread **pmd_list;
1352
0
        struct dp_netdev *dp = node->data;
1353
0
        size_t n;
1354
1355
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1356
1357
0
        for (size_t i = 0; i < n; i++) {
1358
0
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1359
0
            if (pmd->core_id == NON_PMD_CORE_ID) {
1360
0
                continue;
1361
0
            }
1362
1363
            /* If -pmd specified, skip all other pmd threads. */
1364
0
            if ((pmd_thread_to_change != NON_PMD_CORE_ID) &&
1365
0
                (pmd->core_id != pmd_thread_to_change)) {
1366
0
                continue;
1367
0
            }
1368
1369
0
            pmd_thread_update_done = true;
1370
0
            atomic_store_relaxed(&pmd->miniflow_extract_opt, mfex_func);
1371
0
        };
1372
1373
0
        free(pmd_list);
1374
0
    }
1375
1376
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1377
1378
    /* If PMD thread was specified, but it wasn't found, return error. */
1379
0
    if (pmd_thread_to_change != NON_PMD_CORE_ID && !pmd_thread_update_done) {
1380
0
        ds_put_format(&reply,
1381
0
                      "Error: miniflow extract parser not changed, "
1382
0
                      "PMD thread %d not in use, pass a valid pmd"
1383
0
                      " thread ID.\n", pmd_thread_to_change);
1384
0
        goto error;
1385
0
    }
1386
1387
    /* Reply with success to command. */
1388
0
    ds_put_format(&reply, "Miniflow extract implementation set to %s",
1389
0
                  mfex_name);
1390
0
    if (pmd_thread_to_change != NON_PMD_CORE_ID) {
1391
0
        ds_put_format(&reply, ", on pmd thread %d", pmd_thread_to_change);
1392
0
    }
1393
0
    if (mfex_name_is_study) {
1394
0
        ds_put_format(&reply, ", studying %d packets", study_count);
1395
0
    }
1396
0
    ds_put_format(&reply, ".\n");
1397
1398
0
    reply_str = ds_cstr(&reply);
1399
0
    VLOG_INFO("%s", reply_str);
1400
0
    unixctl_command_reply(conn, reply_str);
1401
0
    ds_destroy(&reply);
1402
0
    return;
1403
1404
0
error:
1405
0
    reply_str = ds_cstr(&reply);
1406
0
    VLOG_ERR("%s", reply_str);
1407
0
    unixctl_command_reply_error(conn, reply_str);
1408
0
    ds_destroy(&reply);
1409
0
}
1410
1411
static void
1412
dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1413
                          const char *argv[], void *aux OVS_UNUSED)
1414
0
{
1415
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1416
0
    struct dp_netdev *dp = NULL;
1417
1418
0
    ovs_mutex_lock(&dp_netdev_mutex);
1419
1420
0
    if (argc == 2) {
1421
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
1422
0
    } else if (shash_count(&dp_netdevs) == 1) {
1423
        /* There's only one datapath */
1424
0
        dp = shash_first(&dp_netdevs)->data;
1425
0
    }
1426
1427
0
    if (!dp) {
1428
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1429
0
        unixctl_command_reply_error(conn,
1430
0
                                    "please specify an existing datapath");
1431
0
        return;
1432
0
    }
1433
1434
0
    dp_netdev_request_reconfigure(dp);
1435
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1436
0
    ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1437
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1438
0
    ds_destroy(&reply);
1439
0
}
1440
1441
static void
1442
pmd_info_show_sleep(struct ds *reply, unsigned core_id, int numa_id,
1443
                    uint64_t pmd_max_sleep)
1444
0
{
1445
0
    if (core_id == NON_PMD_CORE_ID) {
1446
0
        return;
1447
0
    }
1448
0
    ds_put_format(reply,
1449
0
                  "pmd thread numa_id %d core_id %d:\n"
1450
0
                  "  max sleep: %4"PRIu64" us\n",
1451
0
                  numa_id, core_id, pmd_max_sleep);
1452
0
}
1453
1454
static void
1455
dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1456
                     void *aux)
1457
0
{
1458
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1459
0
    struct dp_netdev_pmd_thread **pmd_list;
1460
0
    struct dp_netdev *dp = NULL;
1461
0
    enum pmd_info_type type = *(enum pmd_info_type *) aux;
1462
0
    unsigned int core_id;
1463
0
    bool filter_on_pmd = false;
1464
0
    size_t n;
1465
0
    unsigned int secs = 0;
1466
0
    unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX)
1467
0
                                      / INTERVAL_USEC_TO_SEC;
1468
0
    bool show_header = true;
1469
0
    uint64_t max_sleep;
1470
1471
0
    ovs_mutex_lock(&dp_netdev_mutex);
1472
1473
0
    while (argc > 1) {
1474
0
        if (!strcmp(argv[1], "-pmd") && argc > 2) {
1475
0
            if (str_to_uint(argv[2], 10, &core_id)) {
1476
0
                filter_on_pmd = true;
1477
0
            }
1478
0
            argc -= 2;
1479
0
            argv += 2;
1480
0
        } else if (type == PMD_INFO_SHOW_RXQ &&
1481
0
                       !strcmp(argv[1], "-secs") &&
1482
0
                       argc > 2) {
1483
0
            if (!str_to_uint(argv[2], 10, &secs)) {
1484
0
                secs = max_secs;
1485
0
            }
1486
0
            argc -= 2;
1487
0
            argv += 2;
1488
0
        } else {
1489
0
            dp = shash_find_data(&dp_netdevs, argv[1]);
1490
0
            argc -= 1;
1491
0
            argv += 1;
1492
0
        }
1493
0
    }
1494
1495
0
    if (!dp) {
1496
0
        if (shash_count(&dp_netdevs) == 1) {
1497
            /* There's only one datapath */
1498
0
            dp = shash_first(&dp_netdevs)->data;
1499
0
        } else {
1500
0
            ovs_mutex_unlock(&dp_netdev_mutex);
1501
0
            unixctl_command_reply_error(conn,
1502
0
                                        "please specify an existing datapath");
1503
0
            return;
1504
0
        }
1505
0
    }
1506
1507
0
    sorted_poll_thread_list(dp, &pmd_list, &n);
1508
0
    for (size_t i = 0; i < n; i++) {
1509
0
        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1510
0
        if (!pmd) {
1511
0
            break;
1512
0
        }
1513
0
        if (filter_on_pmd && pmd->core_id != core_id) {
1514
0
            continue;
1515
0
        }
1516
0
        if (type == PMD_INFO_SHOW_RXQ) {
1517
0
            if (show_header) {
1518
0
                if (!secs || secs > max_secs) {
1519
0
                    secs = max_secs;
1520
0
                } else {
1521
0
                    secs = ROUND_UP(secs,
1522
0
                                    PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
1523
0
                }
1524
0
                ds_put_format(&reply, "Displaying last %u seconds "
1525
0
                              "pmd usage %%\n", secs);
1526
0
                show_header = false;
1527
0
            }
1528
0
            pmd_info_show_rxq(&reply, pmd, secs);
1529
0
        } else if (type == PMD_INFO_CLEAR_STATS) {
1530
0
            pmd_perf_stats_clear(&pmd->perf_stats);
1531
0
        } else if (type == PMD_INFO_SHOW_STATS) {
1532
0
            pmd_info_show_stats(&reply, pmd);
1533
0
        } else if (type == PMD_INFO_PERF_SHOW) {
1534
0
            pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1535
0
        } else if (type == PMD_INFO_SLEEP_SHOW) {
1536
0
            if (show_header) {
1537
0
                ds_put_format(&reply, "Default max sleep: %4"PRIu64" us\n",
1538
0
                              dp->pmd_max_sleep_default);
1539
0
                show_header = false;
1540
0
            }
1541
0
            atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
1542
0
            pmd_info_show_sleep(&reply, pmd->core_id, pmd->numa_id,
1543
0
                                max_sleep);
1544
0
        }
1545
0
    }
1546
0
    free(pmd_list);
1547
1548
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1549
1550
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1551
0
    ds_destroy(&reply);
1552
0
}
1553
1554
static void
1555
pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1556
                          const char *argv[],
1557
                          void *aux OVS_UNUSED)
1558
0
{
1559
0
    struct pmd_perf_params par;
1560
0
    long int it_hist = 0, ms_hist = 0;
1561
0
    par.histograms = true;
1562
1563
0
    while (argc > 1) {
1564
0
        if (!strcmp(argv[1], "-nh")) {
1565
0
            par.histograms = false;
1566
0
            argc -= 1;
1567
0
            argv += 1;
1568
0
        } else if (!strcmp(argv[1], "-it") && argc > 2) {
1569
0
            it_hist = strtol(argv[2], NULL, 10);
1570
0
            if (it_hist < 0) {
1571
0
                it_hist = 0;
1572
0
            } else if (it_hist > HISTORY_LEN) {
1573
0
                it_hist = HISTORY_LEN;
1574
0
            }
1575
0
            argc -= 2;
1576
0
            argv += 2;
1577
0
        } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1578
0
            ms_hist = strtol(argv[2], NULL, 10);
1579
0
            if (ms_hist < 0) {
1580
0
                ms_hist = 0;
1581
0
            } else if (ms_hist > HISTORY_LEN) {
1582
0
                ms_hist = HISTORY_LEN;
1583
0
            }
1584
0
            argc -= 2;
1585
0
            argv += 2;
1586
0
        } else {
1587
0
            break;
1588
0
        }
1589
0
    }
1590
0
    par.iter_hist_len = it_hist;
1591
0
    par.ms_hist_len = ms_hist;
1592
0
    par.command_type = PMD_INFO_PERF_SHOW;
1593
0
    dpif_netdev_pmd_info(conn, argc, argv, &par);
1594
0
}
1595
1596
static void
1597
dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
1598
                      const char *argv[], void *aux OVS_UNUSED)
1599
0
{
1600
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1601
0
    struct dp_netdev *dp = NULL;
1602
1603
0
    ovs_mutex_lock(&dp_netdev_mutex);
1604
0
    if (argc == 2) {
1605
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
1606
0
    } else if (shash_count(&dp_netdevs) == 1) {
1607
        /* There's only one datapath. */
1608
0
        dp = shash_first(&dp_netdevs)->data;
1609
0
    }
1610
0
    if (!dp) {
1611
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1612
0
        unixctl_command_reply_error(conn,
1613
0
                                    "please specify an existing datapath");
1614
0
        return;
1615
0
    }
1616
1617
0
    if (cmap_count(&dp->tx_bonds) > 0) {
1618
0
        struct tx_bond *dp_bond_entry;
1619
1620
0
        ds_put_cstr(&reply, "Bonds:\n");
1621
0
        CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
1622
0
            ds_put_format(&reply, "  bond-id %"PRIu32":\n",
1623
0
                          dp_bond_entry->bond_id);
1624
0
            for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
1625
0
                uint32_t member_id = odp_to_u32(
1626
0
                    dp_bond_entry->member_buckets[bucket].member_id);
1627
0
                ds_put_format(&reply,
1628
0
                              "    bucket %d - member %"PRIu32"\n",
1629
0
                              bucket, member_id);
1630
0
            }
1631
0
        }
1632
0
    }
1633
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1634
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1635
0
    ds_destroy(&reply);
1636
0
}
1637
1638

1639
static int
1640
dpif_netdev_init(void)
1641
0
{
1642
0
    static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1643
0
                              clear_aux = PMD_INFO_CLEAR_STATS,
1644
0
                              poll_aux = PMD_INFO_SHOW_RXQ,
1645
0
                              sleep_aux = PMD_INFO_SLEEP_SHOW;
1646
1647
0
    unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1648
0
                             0, 3, dpif_netdev_pmd_info,
1649
0
                             (void *)&show_aux);
1650
0
    unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1651
0
                             0, 3, dpif_netdev_pmd_info,
1652
0
                             (void *)&clear_aux);
1653
0
    unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] "
1654
0
                             "[-secs secs] [dp]",
1655
0
                             0, 5, dpif_netdev_pmd_info,
1656
0
                             (void *)&poll_aux);
1657
0
    unixctl_command_register("dpif-netdev/pmd-sleep-show", "[dp]",
1658
0
                             0, 1, dpif_netdev_pmd_info,
1659
0
                             (void *)&sleep_aux);
1660
0
    unixctl_command_register("dpif-netdev/pmd-perf-show",
1661
0
                             "[-nh] [-it iter-history-len]"
1662
0
                             " [-ms ms-history-len]"
1663
0
                             " [-pmd core] [dp]",
1664
0
                             0, 8, pmd_perf_show_cmd,
1665
0
                             NULL);
1666
0
    unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1667
0
                             0, 1, dpif_netdev_pmd_rebalance,
1668
0
                             NULL);
1669
0
    unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1670
0
                             "on|off [-b before] [-a after] [-e|-ne] "
1671
0
                             "[-us usec] [-q qlen]",
1672
0
                             0, 10, pmd_perf_log_set_cmd,
1673
0
                             NULL);
1674
0
    unixctl_command_register("dpif-netdev/bond-show", "[dp]",
1675
0
                             0, 1, dpif_netdev_bond_show,
1676
0
                             NULL);
1677
0
    unixctl_command_register("dpif-netdev/subtable-lookup-prio-set",
1678
0
                             "[lookup_func] [prio]",
1679
0
                             2, 2, dpif_netdev_subtable_lookup_set,
1680
0
                             NULL);
1681
0
    unixctl_command_register("dpif-netdev/subtable-lookup-info-get", "",
1682
0
                             0, 0, dpif_netdev_subtable_lookup_get,
1683
0
                             NULL);
1684
0
    unixctl_command_register("dpif-netdev/subtable-lookup-prio-get", NULL,
1685
0
                             0, 0, dpif_netdev_subtable_lookup_get,
1686
0
                             NULL);
1687
0
    unixctl_command_register("dpif-netdev/dpif-impl-set",
1688
0
                             "dpif_implementation_name",
1689
0
                             1, 1, dpif_netdev_impl_set,
1690
0
                             NULL);
1691
0
    unixctl_command_register("dpif-netdev/dpif-impl-get", "",
1692
0
                             0, 0, dpif_netdev_impl_get,
1693
0
                             NULL);
1694
0
    unixctl_command_register("dpif-netdev/miniflow-parser-set",
1695
0
                             "[-pmd core] miniflow_implementation_name"
1696
0
                             " [study_pkt_cnt]",
1697
0
                             1, 5, dpif_miniflow_extract_impl_set,
1698
0
                             NULL);
1699
0
    unixctl_command_register("dpif-netdev/miniflow-parser-get", "",
1700
0
                             0, 0, dpif_miniflow_extract_impl_get,
1701
0
                             NULL);
1702
0
    return 0;
1703
0
}
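
All of the commands registered above funnel into a small set of callbacks and use the 'aux' pointer to select the requested operation. The standalone sketch below (not part of dpif-netdev.c; the command names and callback are hypothetical) illustrates that same registration pattern, assuming the unixctl and dynamic-string APIs from the OVS tree:

/* Hypothetical module; assumes OVS's lib/unixctl.h and util.h. */
#include "unixctl.h"
#include "util.h"
#include "openvswitch/dynamic-string.h"

enum demo_info_type { DEMO_SHOW_STATS, DEMO_CLEAR_STATS };

static void
demo_info_cb(struct unixctl_conn *conn, int argc OVS_UNUSED,
             const char *argv[] OVS_UNUSED, void *aux)
{
    enum demo_info_type type = *(enum demo_info_type *) aux;
    struct ds reply = DS_EMPTY_INITIALIZER;

    ds_put_format(&reply, "requested operation: %s\n",
                  type == DEMO_SHOW_STATS ? "show" : "clear");
    unixctl_command_reply(conn, ds_cstr(&reply));
    ds_destroy(&reply);
}

static void
demo_commands_init(void)
{
    static enum demo_info_type show_aux = DEMO_SHOW_STATS,
                               clear_aux = DEMO_CLEAR_STATS;

    unixctl_command_register("demo/stats-show", "", 0, 0,
                             demo_info_cb, (void *) &show_aux);
    unixctl_command_register("demo/stats-clear", "", 0, 0,
                             demo_info_cb, (void *) &clear_aux);
}
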
1704
1705
static int
1706
dpif_netdev_enumerate(struct sset *all_dps,
1707
                      const struct dpif_class *dpif_class)
1708
0
{
1709
0
    struct shash_node *node;
1710
1711
0
    ovs_mutex_lock(&dp_netdev_mutex);
1712
0
    SHASH_FOR_EACH(node, &dp_netdevs) {
1713
0
        struct dp_netdev *dp = node->data;
1714
0
        if (dpif_class != dp->class) {
1715
            /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1716
             * If the class doesn't match, skip this dpif. */
1717
0
             continue;
1718
0
        }
1719
0
        sset_add(all_dps, node->name);
1720
0
    }
1721
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1722
1723
0
    return 0;
1724
0
}
1725
1726
static bool
1727
dpif_netdev_class_is_dummy(const struct dpif_class *class)
1728
0
{
1729
0
    return class != &dpif_netdev_class;
1730
0
}
1731
1732
static const char *
1733
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1734
0
{
1735
0
    return strcmp(type, "internal") ? type
1736
0
                  : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1737
0
                  : "tap";
1738
0
}
1739
1740
static struct dpif *
1741
create_dpif_netdev(struct dp_netdev *dp)
1742
0
{
1743
0
    uint16_t netflow_id = hash_string(dp->name, 0);
1744
0
    struct dpif_netdev *dpif;
1745
1746
0
    ovs_refcount_ref(&dp->ref_cnt);
1747
1748
0
    dpif = xmalloc(sizeof *dpif);
1749
0
    dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1750
0
    dpif->dp = dp;
1751
0
    dpif->last_port_seq = seq_read(dp->port_seq);
1752
1753
0
    return &dpif->dpif;
1754
0
}
1755
1756
/* Choose an unused, non-zero port number and return it on success.
1757
 * Return ODPP_NONE on failure. */
1758
static odp_port_t
1759
choose_port(struct dp_netdev *dp, const char *name)
1760
    OVS_REQ_RDLOCK(dp->port_rwlock)
1761
0
{
1762
0
    uint32_t port_no;
1763
1764
0
    if (dp->class != &dpif_netdev_class) {
1765
0
        const char *p;
1766
0
        int start_no = 0;
1767
1768
        /* If the port name begins with "br", start the number search at
1769
         * 100 to make writing tests easier. */
1770
0
        if (!strncmp(name, "br", 2)) {
1771
0
            start_no = 100;
1772
0
        }
1773
1774
        /* If the port name contains a number, try to assign that port number.
1775
         * This can make writing unit tests easier because port numbers are
1776
         * predictable. */
1777
0
        for (p = name; *p != '\0'; p++) {
1778
0
            if (isdigit((unsigned char) *p)) {
1779
0
                port_no = start_no + strtol(p, NULL, 10);
1780
0
                if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1781
0
                    && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1782
0
                    return u32_to_odp(port_no);
1783
0
                }
1784
0
                break;
1785
0
            }
1786
0
        }
1787
0
    }
1788
1789
0
    for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1790
0
        if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1791
0
            return u32_to_odp(port_no);
1792
0
        }
1793
0
    }
1794
1795
0
    return ODPP_NONE;
1796
0
}
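
As a self-contained illustration of the heuristic described in the comments above, the sketch below (not OVS code; all names are made up) derives a preferred port number from a device name in the same way: "br"-prefixed names start at 100, and the first run of digits in the name is taken as the preferred number.

/* Standalone sketch of the name-based port-number heuristic. */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned int
preferred_port_no(const char *name)
{
    unsigned int start_no = !strncmp(name, "br", 2) ? 100 : 0;
    const char *p;

    for (p = name; *p != '\0'; p++) {
        if (isdigit((unsigned char) *p)) {
            return start_no + (unsigned int) strtol(p, NULL, 10);
        }
    }
    return 0;   /* No digits: the caller falls back to a linear scan. */
}

int
main(void)
{
    printf("%u\n", preferred_port_no("br0"));    /* 100 */
    printf("%u\n", preferred_port_no("dpdk7"));  /* 7 */
    printf("%u\n", preferred_port_no("eth"));    /* 0 */
    return 0;
}
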
1797
1798
static uint32_t
1799
dp_meter_hash(uint32_t meter_id)
1800
0
{
1801
    /* In the ofproto-dpif layer, the id-pool allocates meter ids
1802
     * sequentially (e.g. 1, 2, ..., N), which already provides a good
1803
     * hash distribution.  Use the ids directly instead of a hash_xxx()
1804
     * function to avoid the extra hashing cost. */
1805
0
    return meter_id;
1806
0
}
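
A minimal sketch of why the identity hash above works (not OVS code; N_BUCKETS is an arbitrary example value): because meter ids are handed out sequentially, using the id itself as the hash already spreads entries evenly across power-of-two hash buckets without any mixing step.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define N_BUCKETS 8     /* Must be a power of two for the mask below. */

static uint32_t
meter_bucket(uint32_t meter_id)
{
    return meter_id & (N_BUCKETS - 1);   /* Identity "hash", masked. */
}

int
main(void)
{
    for (uint32_t id = 1; id <= 16; id++) {
        printf("meter %2" PRIu32 " -> bucket %" PRIu32 "\n",
               id, meter_bucket(id));
    }
    return 0;
}
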
1807
1808
static void
1809
dp_netdev_meter_destroy(struct dp_netdev *dp)
1810
0
{
1811
0
    struct dp_meter *m;
1812
1813
0
    ovs_mutex_lock(&dp->meters_lock);
1814
0
    CMAP_FOR_EACH (m, node, &dp->meters) {
1815
0
        cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id));
1816
0
        ovsrcu_postpone(free, m);
1817
0
    }
1818
1819
0
    cmap_destroy(&dp->meters);
1820
0
    ovs_mutex_unlock(&dp->meters_lock);
1821
0
    ovs_mutex_destroy(&dp->meters_lock);
1822
0
}
1823
1824
static struct dp_meter *
1825
dp_meter_lookup(struct cmap *meters, uint32_t meter_id)
1826
0
{
1827
0
    uint32_t hash = dp_meter_hash(meter_id);
1828
0
    struct dp_meter *m;
1829
1830
0
    CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) {
1831
0
        if (m->id == meter_id) {
1832
0
            return m;
1833
0
        }
1834
0
    }
1835
1836
0
    return NULL;
1837
0
}
1838
1839
static void
1840
dp_meter_detach_free(struct cmap *meters, uint32_t meter_id)
1841
0
{
1842
0
    struct dp_meter *m = dp_meter_lookup(meters, meter_id);
1843
1844
0
    if (m) {
1845
0
        cmap_remove(meters, &m->node, dp_meter_hash(meter_id));
1846
0
        ovsrcu_postpone(free, m);
1847
0
    }
1848
0
}
1849
1850
static void
1851
dp_meter_attach(struct cmap *meters, struct dp_meter *meter)
1852
0
{
1853
0
    cmap_insert(meters, &meter->node, dp_meter_hash(meter->id));
1854
0
}
1855
1856
static int
1857
create_dp_netdev(const char *name, const struct dpif_class *class,
1858
                 struct dp_netdev **dpp)
1859
    OVS_REQUIRES(dp_netdev_mutex)
1860
0
{
1861
0
    static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
1862
0
    struct dp_netdev *dp;
1863
0
    int error;
1864
1865
    /* Avoid estimating TSC frequency for dummy datapath to not slow down
1866
     * unit tests. */
1867
0
    if (!dpif_netdev_class_is_dummy(class)
1868
0
        && ovsthread_once_start(&tsc_freq_check)) {
1869
0
        pmd_perf_estimate_tsc_frequency();
1870
0
        ovsthread_once_done(&tsc_freq_check);
1871
0
    }
1872
1873
0
    dp = xzalloc(sizeof *dp);
1874
0
    shash_add(&dp_netdevs, name, dp);
1875
1876
0
    *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1877
0
    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1878
0
    ovs_refcount_init(&dp->ref_cnt);
1879
0
    atomic_flag_clear(&dp->destroyed);
1880
1881
0
    ovs_rwlock_init(&dp->port_rwlock);
1882
0
    hmap_init(&dp->ports);
1883
0
    dp->port_seq = seq_create();
1884
0
    ovs_mutex_init(&dp->bond_mutex);
1885
0
    cmap_init(&dp->tx_bonds);
1886
1887
0
    fat_rwlock_init(&dp->upcall_rwlock);
1888
1889
0
    dp->reconfigure_seq = seq_create();
1890
0
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1891
1892
    /* Init meter resources. */
1893
0
    cmap_init(&dp->meters);
1894
0
    ovs_mutex_init(&dp->meters_lock);
1895
1896
    /* Disable upcalls by default. */
1897
0
    dp_netdev_disable_upcall(dp);
1898
0
    dp->upcall_aux = NULL;
1899
0
    dp->upcall_cb = NULL;
1900
1901
0
    dp->conntrack = conntrack_init();
1902
1903
0
    dpif_miniflow_extract_init();
1904
1905
0
    atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1906
0
    atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1907
1908
0
    cmap_init(&dp->poll_threads);
1909
0
    dp->pmd_rxq_assign_type = SCHED_CYCLES;
1910
1911
0
    ovs_mutex_init(&dp->tx_qid_pool_mutex);
1912
    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1913
0
    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1914
1915
0
    ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1916
0
    ovsthread_key_create(&dp->per_pmd_key, NULL);
1917
1918
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1919
    /* non-PMD will be created before all other threads and will
1920
     * allocate static_tx_qid = 0. */
1921
0
    dp_netdev_set_nonpmd(dp);
1922
1923
0
    error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1924
0
                                                             "internal"),
1925
0
                        ODPP_LOCAL);
1926
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1927
0
    if (error) {
1928
0
        dp_netdev_free(dp);
1929
0
        return error;
1930
0
    }
1931
1932
0
    dp->max_sleep_list = NULL;
1933
1934
0
    dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1935
0
    *dpp = dp;
1936
0
    return 0;
1937
0
}
1938
1939
static void
1940
dp_netdev_request_reconfigure(struct dp_netdev *dp)
1941
0
{
1942
0
    seq_change(dp->reconfigure_seq);
1943
0
}
1944
1945
static bool
1946
dp_netdev_is_reconf_required(struct dp_netdev *dp)
1947
0
{
1948
0
    return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1949
0
}
1950
1951
static int
1952
dpif_netdev_open(const struct dpif_class *class, const char *name,
1953
                 bool create, struct dpif **dpifp)
1954
0
{
1955
0
    struct dp_netdev *dp;
1956
0
    int error;
1957
1958
0
    ovs_mutex_lock(&dp_netdev_mutex);
1959
0
    dp = shash_find_data(&dp_netdevs, name);
1960
0
    if (!dp) {
1961
0
        error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1962
0
    } else {
1963
0
        error = (dp->class != class ? EINVAL
1964
0
                 : create ? EEXIST
1965
0
                 : 0);
1966
0
    }
1967
0
    if (!error) {
1968
0
        *dpifp = create_dpif_netdev(dp);
1969
0
    }
1970
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1971
1972
0
    return error;
1973
0
}
1974
1975
static void
1976
dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1977
    OVS_NO_THREAD_SAFETY_ANALYSIS
1978
0
{
1979
    /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1980
0
    ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1981
1982
    /* Before freeing a lock we should release it */
1983
0
    fat_rwlock_unlock(&dp->upcall_rwlock);
1984
0
    fat_rwlock_destroy(&dp->upcall_rwlock);
1985
0
}
1986
1987
static uint32_t
1988
hash_bond_id(uint32_t bond_id)
1989
0
{
1990
0
    return hash_int(bond_id, 0);
1991
0
}
1992
1993
/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1994
 * through the 'dp_netdevs' shash while freeing 'dp'. */
1995
static void
1996
dp_netdev_free(struct dp_netdev *dp)
1997
    OVS_REQUIRES(dp_netdev_mutex)
1998
0
{
1999
0
    struct dp_netdev_port *port;
2000
0
    struct tx_bond *bond;
2001
2002
0
    shash_find_and_delete(&dp_netdevs, dp->name);
2003
2004
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
2005
0
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
2006
0
        do_del_port(dp, port);
2007
0
    }
2008
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2009
2010
0
    ovs_mutex_lock(&dp->bond_mutex);
2011
0
    CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
2012
0
        cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
2013
0
        ovsrcu_postpone(free, bond);
2014
0
    }
2015
0
    ovs_mutex_unlock(&dp->bond_mutex);
2016
2017
0
    dp_netdev_destroy_all_pmds(dp, true);
2018
0
    cmap_destroy(&dp->poll_threads);
2019
2020
0
    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
2021
0
    id_pool_destroy(dp->tx_qid_pool);
2022
2023
0
    ovs_mutex_destroy(&dp->non_pmd_mutex);
2024
0
    ovsthread_key_delete(dp->per_pmd_key);
2025
2026
0
    conntrack_destroy(dp->conntrack);
2027
2028
2029
0
    seq_destroy(dp->reconfigure_seq);
2030
2031
0
    seq_destroy(dp->port_seq);
2032
0
    hmap_destroy(&dp->ports);
2033
0
    ovs_rwlock_destroy(&dp->port_rwlock);
2034
2035
0
    cmap_destroy(&dp->tx_bonds);
2036
0
    ovs_mutex_destroy(&dp->bond_mutex);
2037
2038
    /* Upcalls must be disabled at this point */
2039
0
    dp_netdev_destroy_upcall_lock(dp);
2040
2041
0
    dp_netdev_meter_destroy(dp);
2042
2043
0
    free(dp->max_sleep_list);
2044
0
    free(dp->pmd_cmask);
2045
0
    free(CONST_CAST(char *, dp->name));
2046
0
    free(dp);
2047
0
}
2048
2049
static void
2050
dp_netdev_unref(struct dp_netdev *dp)
2051
0
{
2052
0
    if (dp) {
2053
        /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
2054
         * get a new reference to 'dp' through the 'dp_netdevs' shash. */
2055
0
        ovs_mutex_lock(&dp_netdev_mutex);
2056
0
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
2057
0
            dp_netdev_free(dp);
2058
0
        }
2059
0
        ovs_mutex_unlock(&dp_netdev_mutex);
2060
0
    }
2061
0
}
2062
2063
static void
2064
dpif_netdev_close(struct dpif *dpif)
2065
0
{
2066
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2067
2068
0
    dp_netdev_unref(dp);
2069
0
    free(dpif);
2070
0
}
2071
2072
static int
2073
dpif_netdev_destroy(struct dpif *dpif)
2074
0
{
2075
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2076
2077
0
    if (!atomic_flag_test_and_set(&dp->destroyed)) {
2078
0
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
2079
            /* Can't happen: 'dpif' still owns a reference to 'dp'. */
2080
0
            OVS_NOT_REACHED();
2081
0
        }
2082
0
    }
2083
2084
0
    return 0;
2085
0
}
2086
2087
/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
2088
 * load/store semantics.  While the increment is not atomic, the load and
2089
 * store operations are, making it impossible to read inconsistent values.
2090
 *
2091
 * This is used to update thread local stats counters. */
2092
static void
2093
non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
2094
0
{
2095
0
    unsigned long long tmp;
2096
2097
0
    atomic_read_relaxed(var, &tmp);
2098
0
    tmp += n;
2099
0
    atomic_store_relaxed(var, tmp);
2100
0
}
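
The following minimal C11 sketch (not OVS code) shows the pattern described in the comment above: the owning thread bumps its counter with a relaxed read-modify-write that is not atomic as a whole, while readers use relaxed atomic loads and therefore never observe a torn value.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long long packets;

static void
counter_add(_Atomic unsigned long long *var, unsigned long long n)
{
    unsigned long long tmp = atomic_load_explicit(var, memory_order_relaxed);

    tmp += n;   /* Only the owning thread may perform this increment. */
    atomic_store_explicit(var, tmp, memory_order_relaxed);
}

int
main(void)
{
    counter_add(&packets, 32);
    counter_add(&packets, 10);
    printf("%llu\n", atomic_load_explicit(&packets, memory_order_relaxed));
    return 0;
}
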
2101
2102
static int
2103
dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
2104
0
{
2105
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2106
0
    struct dp_netdev_pmd_thread *pmd;
2107
0
    uint64_t pmd_stats[PMD_N_STATS];
2108
2109
0
    stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
2110
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2111
0
        stats->n_flows += cmap_count(&pmd->flow_table);
2112
0
        pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
2113
0
        stats->n_hit += pmd_stats[PMD_STAT_PHWOL_HIT];
2114
0
        stats->n_hit += pmd_stats[PMD_STAT_SIMPLE_HIT];
2115
0
        stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
2116
0
        stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
2117
0
        stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
2118
0
        stats->n_missed += pmd_stats[PMD_STAT_MISS];
2119
0
        stats->n_lost += pmd_stats[PMD_STAT_LOST];
2120
0
    }
2121
0
    stats->n_masks = UINT32_MAX;
2122
0
    stats->n_mask_hit = UINT64_MAX;
2123
0
    stats->n_cache_hit = UINT64_MAX;
2124
2125
0
    return 0;
2126
0
}
2127
2128
static void
2129
dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
2130
0
{
2131
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
2132
0
        ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
2133
0
        ovs_mutex_lock(&pmd->port_mutex);
2134
0
        pmd_load_cached_ports(pmd);
2135
0
        ovs_mutex_unlock(&pmd->port_mutex);
2136
0
        ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
2137
0
        return;
2138
0
    }
2139
2140
0
    seq_change(pmd->reload_seq);
2141
0
    atomic_store_explicit(&pmd->reload, true, memory_order_release);
2142
0
}
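
A minimal C11 sketch (not OVS code) of the release/acquire handshake used above: the control thread publishes new state, then sets the reload flag with a release store; the worker's acquire load of the flag guarantees that it also sees the published state.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static int new_config;           /* Plain data published before the flag. */
static atomic_bool reload;

static void
request_reload(int cfg)
{
    new_config = cfg;                                             /* 1. publish */
    atomic_store_explicit(&reload, true, memory_order_release);   /* 2. signal  */
}

static void
worker_poll(void)
{
    if (atomic_load_explicit(&reload, memory_order_acquire)) {
        printf("reloading with config %d\n", new_config);  /* Safe to read. */
        atomic_store_explicit(&reload, false, memory_order_relaxed);
    }
}

int
main(void)
{
    request_reload(42);
    worker_poll();
    return 0;
}
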
2143
2144
static uint32_t
2145
hash_port_no(odp_port_t port_no)
2146
0
{
2147
0
    return hash_int(odp_to_u32(port_no), 0);
2148
0
}
2149
2150
static int
2151
port_create(const char *devname, const char *type,
2152
            odp_port_t port_no, struct dp_netdev_port **portp)
2153
0
{
2154
0
    struct dp_netdev_port *port;
2155
0
    enum netdev_flags flags;
2156
0
    struct netdev *netdev;
2157
0
    int error;
2158
2159
0
    *portp = NULL;
2160
2161
    /* Open and validate network device. */
2162
0
    error = netdev_open(devname, type, &netdev);
2163
0
    if (error) {
2164
0
        return error;
2165
0
    }
2166
    /* XXX reject non-Ethernet devices */
2167
2168
0
    netdev_get_flags(netdev, &flags);
2169
0
    if (flags & NETDEV_LOOPBACK) {
2170
0
        VLOG_ERR("%s: cannot add a loopback device", devname);
2171
0
        error = EINVAL;
2172
0
        goto out;
2173
0
    }
2174
2175
0
    port = xzalloc(sizeof *port);
2176
0
    port->port_no = port_no;
2177
0
    port->netdev = netdev;
2178
0
    port->type = xstrdup(type);
2179
0
    port->sf = NULL;
2180
0
    port->emc_enabled = true;
2181
0
    port->need_reconfigure = true;
2182
0
    ovs_mutex_init(&port->txq_used_mutex);
2183
2184
0
    *portp = port;
2185
2186
0
    return 0;
2187
2188
0
out:
2189
0
    netdev_close(netdev);
2190
0
    return error;
2191
0
}
2192
2193
static int
2194
do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
2195
            odp_port_t port_no)
2196
    OVS_REQ_WRLOCK(dp->port_rwlock)
2197
0
{
2198
0
    struct netdev_saved_flags *sf;
2199
0
    struct dp_netdev_port *port;
2200
0
    int error;
2201
2202
    /* Reject devices already in 'dp'. */
2203
0
    if (!get_port_by_name(dp, devname, &port)) {
2204
0
        return EEXIST;
2205
0
    }
2206
2207
0
    error = port_create(devname, type, port_no, &port);
2208
0
    if (error) {
2209
0
        return error;
2210
0
    }
2211
2212
0
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
2213
0
    seq_change(dp->port_seq);
2214
2215
0
    reconfigure_datapath(dp);
2216
2217
    /* Check that port was successfully configured. */
2218
0
    if (!dp_netdev_lookup_port(dp, port_no)) {
2219
0
        return EINVAL;
2220
0
    }
2221
2222
    /* Updating device flags triggers an if_notifier, which triggers a bridge
2223
     * reconfiguration and another attempt to add this port, leading to an
2224
     * infinite loop if the device is configured incorrectly and cannot be
2225
     * added.  Set the promisc mode only after a successful reconfiguration,
2226
     * at which point we already know the device is properly configured. */
2227
0
    error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
2228
0
    if (error) {
2229
0
        VLOG_ERR("%s: cannot set promisc flag", devname);
2230
0
        do_del_port(dp, port);
2231
0
        return error;
2232
0
    }
2233
0
    port->sf = sf;
2234
2235
0
    return 0;
2236
0
}
2237
2238
static int
2239
dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
2240
                     odp_port_t *port_nop)
2241
0
{
2242
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2243
0
    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
2244
0
    const char *dpif_port;
2245
0
    odp_port_t port_no;
2246
0
    int error;
2247
2248
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
2249
0
    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
2250
0
    if (*port_nop != ODPP_NONE) {
2251
0
        port_no = *port_nop;
2252
0
        error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
2253
0
    } else {
2254
0
        port_no = choose_port(dp, dpif_port);
2255
0
        error = port_no == ODPP_NONE ? EFBIG : 0;
2256
0
    }
2257
0
    if (!error) {
2258
0
        *port_nop = port_no;
2259
0
        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
2260
0
    }
2261
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2262
2263
0
    return error;
2264
0
}
2265
2266
static int
2267
dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
2268
0
{
2269
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2270
0
    int error;
2271
2272
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
2273
0
    if (port_no == ODPP_LOCAL) {
2274
0
        error = EINVAL;
2275
0
    } else {
2276
0
        struct dp_netdev_port *port;
2277
2278
0
        error = get_port_by_number(dp, port_no, &port);
2279
0
        if (!error) {
2280
0
            do_del_port(dp, port);
2281
0
        }
2282
0
    }
2283
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2284
2285
0
    return error;
2286
0
}
2287
2288
static bool
2289
is_valid_port_number(odp_port_t port_no)
2290
0
{
2291
0
    return port_no != ODPP_NONE;
2292
0
}
2293
2294
static struct dp_netdev_port *
2295
dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
2296
    OVS_REQ_RDLOCK(dp->port_rwlock)
2297
0
{
2298
0
    struct dp_netdev_port *port;
2299
2300
0
    HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
2301
0
        if (port->port_no == port_no) {
2302
0
            return port;
2303
0
        }
2304
0
    }
2305
0
    return NULL;
2306
0
}
2307
2308
static int
2309
get_port_by_number(struct dp_netdev *dp,
2310
                   odp_port_t port_no, struct dp_netdev_port **portp)
2311
    OVS_REQ_RDLOCK(dp->port_rwlock)
2312
0
{
2313
0
    if (!is_valid_port_number(port_no)) {
2314
0
        *portp = NULL;
2315
0
        return EINVAL;
2316
0
    } else {
2317
0
        *portp = dp_netdev_lookup_port(dp, port_no);
2318
0
        return *portp ? 0 : ENODEV;
2319
0
    }
2320
0
}
2321
2322
static void
2323
port_destroy(struct dp_netdev_port *port)
2324
0
{
2325
0
    if (!port) {
2326
0
        return;
2327
0
    }
2328
2329
0
    netdev_close(port->netdev);
2330
0
    netdev_restore_flags(port->sf);
2331
2332
0
    for (unsigned i = 0; i < port->n_rxq; i++) {
2333
0
        netdev_rxq_close(port->rxqs[i].rx);
2334
0
    }
2335
0
    ovs_mutex_destroy(&port->txq_used_mutex);
2336
0
    free(port->rxq_affinity_list);
2337
0
    free(port->txq_used);
2338
0
    free(port->rxqs);
2339
0
    free(port->type);
2340
0
    free(port);
2341
0
}
2342
2343
static int
2344
get_port_by_name(struct dp_netdev *dp,
2345
                 const char *devname, struct dp_netdev_port **portp)
2346
    OVS_REQ_RDLOCK(dp->port_rwlock)
2347
0
{
2348
0
    struct dp_netdev_port *port;
2349
2350
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
2351
0
        if (!strcmp(netdev_get_name(port->netdev), devname)) {
2352
0
            *portp = port;
2353
0
            return 0;
2354
0
        }
2355
0
    }
2356
2357
    /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2358
     * non-existent port. */
2359
0
    return ENODEV;
2360
0
}
2361
2362
/* Returns 'true' if there is a port with pmd netdev. */
2363
static bool
2364
has_pmd_port(struct dp_netdev *dp)
2365
    OVS_REQ_RDLOCK(dp->port_rwlock)
2366
0
{
2367
0
    struct dp_netdev_port *port;
2368
2369
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
2370
0
        if (netdev_is_pmd(port->netdev)) {
2371
0
            return true;
2372
0
        }
2373
0
    }
2374
2375
0
    return false;
2376
0
}
2377
2378
static void
2379
do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
2380
    OVS_REQ_WRLOCK(dp->port_rwlock)
2381
0
{
2382
0
    hmap_remove(&dp->ports, &port->node);
2383
0
    seq_change(dp->port_seq);
2384
2385
0
    reconfigure_datapath(dp);
2386
2387
    /* Flush and disable offloads only after 'port' has been made
2388
     * inaccessible through datapath reconfiguration.
2389
     * This prevents having PMDs enqueuing offload requests after
2390
     * the flush.
2391
     * When only this port is deleted instead of the whole datapath,
2392
     * revalidator threads are still active and can still enqueue
2393
     * offload modification or deletion. Managing those stray requests
2394
     * is done in the offload threads. */
2395
0
    dp_netdev_offload_flush(dp, port);
2396
0
    netdev_uninit_flow_api(port->netdev);
2397
2398
0
    port_destroy(port);
2399
0
}
2400
2401
static void
2402
answer_port_query(const struct dp_netdev_port *port,
2403
                  struct dpif_port *dpif_port)
2404
0
{
2405
0
    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
2406
0
    dpif_port->type = xstrdup(port->type);
2407
0
    dpif_port->port_no = port->port_no;
2408
0
}
2409
2410
static int
2411
dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
2412
                                 struct dpif_port *dpif_port)
2413
0
{
2414
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2415
0
    struct dp_netdev_port *port;
2416
0
    int error;
2417
2418
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
2419
0
    error = get_port_by_number(dp, port_no, &port);
2420
0
    if (!error && dpif_port) {
2421
0
        answer_port_query(port, dpif_port);
2422
0
    }
2423
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2424
2425
0
    return error;
2426
0
}
2427
2428
static int
2429
dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
2430
                               struct dpif_port *dpif_port)
2431
0
{
2432
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2433
0
    struct dp_netdev_port *port;
2434
0
    int error;
2435
2436
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
2437
0
    error = get_port_by_name(dp, devname, &port);
2438
0
    if (!error && dpif_port) {
2439
0
        answer_port_query(port, dpif_port);
2440
0
    }
2441
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2442
2443
0
    return error;
2444
0
}
2445
2446
static void
2447
dp_netdev_flow_free(struct dp_netdev_flow *flow)
2448
0
{
2449
0
    dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
2450
0
    free(flow->dp_extra_info);
2451
0
    free(flow);
2452
0
}
2453
2454
void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2455
0
{
2456
0
    if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2457
0
        ovsrcu_postpone(dp_netdev_flow_free, flow);
2458
0
    }
2459
0
}
2460
2461
inline struct dpcls *
2462
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2463
                           odp_port_t in_port)
2464
0
{
2465
0
    struct dpcls *cls;
2466
0
    uint32_t hash = hash_port_no(in_port);
2467
0
    CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2468
0
        if (cls->in_port == in_port) {
2469
            /* Port classifier exists already */
2470
0
            return cls;
2471
0
        }
2472
0
    }
2473
0
    return NULL;
2474
0
}
2475
2476
static inline struct dpcls *
2477
dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2478
                         odp_port_t in_port)
2479
    OVS_REQUIRES(pmd->flow_mutex)
2480
0
{
2481
0
    struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2482
2483
0
    if (!cls) {
2484
0
        uint32_t hash = hash_port_no(in_port);
2485
2486
        /* Create new classifier for in_port */
2487
0
        cls = xmalloc(sizeof(*cls));
2488
0
        dpcls_init(cls);
2489
0
        cls->in_port = in_port;
2490
0
        cmap_insert(&pmd->classifiers, &cls->node, hash);
2491
0
        VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2492
0
    }
2493
0
    return cls;
2494
0
}
2495
2496
0
#define MAX_FLOW_MARK       (UINT32_MAX - 1)
2497
0
#define INVALID_FLOW_MARK   0
2498
/* A zero flow mark tells the HW to remove the mark.  A packet marked
2499
 * with a zero mark is received in SW without any mark at all, so zero
2500
 * cannot be used as a valid mark.
2501
 */
2502
2503
struct megaflow_to_mark_data {
2504
    const struct cmap_node node;
2505
    ovs_u128 mega_ufid;
2506
    uint32_t mark;
2507
};
2508
2509
static struct id_fpool *flow_mark_pool;
2510
2511
static uint32_t
2512
flow_mark_alloc(void)
2513
0
{
2514
0
    static struct ovsthread_once init_once = OVSTHREAD_ONCE_INITIALIZER;
2515
0
    unsigned int tid = netdev_offload_thread_id();
2516
0
    uint32_t mark;
2517
2518
0
    if (ovsthread_once_start(&init_once)) {
2519
        /* Not initialized yet; do it here. */
2520
0
        flow_mark_pool = id_fpool_create(netdev_offload_thread_nb(),
2521
0
                                         1, MAX_FLOW_MARK);
2522
0
        ovsthread_once_done(&init_once);
2523
0
    }
2524
2525
0
    if (id_fpool_new_id(flow_mark_pool, tid, &mark)) {
2526
0
        return mark;
2527
0
    }
2528
2529
0
    return INVALID_FLOW_MARK;
2530
0
}
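
An illustrative sketch (not OVS code) of the mark numbering scheme above: 0 is reserved as "no mark", so this toy single-threaded allocator hands out ids in [1, MAX] and returns 0 on exhaustion, mirroring INVALID_FLOW_MARK; the real code uses a per-thread id_fpool instead.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_INVALID_MARK 0
#define DEMO_MAX_MARK     (UINT32_MAX - 1)

static uint32_t next_mark = 1;   /* Toy allocator, not thread safe. */

static uint32_t
demo_mark_alloc(void)
{
    if (next_mark > DEMO_MAX_MARK) {
        return DEMO_INVALID_MARK;    /* Pool exhausted. */
    }
    return next_mark++;
}

int
main(void)
{
    printf("%" PRIu32 " %" PRIu32 "\n", demo_mark_alloc(), demo_mark_alloc());
    return 0;
}
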
2531
2532
static void
2533
flow_mark_free(uint32_t mark)
2534
0
{
2535
0
    unsigned int tid = netdev_offload_thread_id();
2536
2537
0
    id_fpool_free_id(flow_mark_pool, tid, mark);
2538
0
}
2539
2540
/* Associate a megaflow with a mark; this is a 1:1 mapping. */
2541
static void
2542
megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2543
0
{
2544
0
    size_t hash = dp_netdev_flow_hash(mega_ufid);
2545
0
    struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2546
0
    unsigned int tid = netdev_offload_thread_id();
2547
2548
0
    data->mega_ufid = *mega_ufid;
2549
0
    data->mark = mark;
2550
2551
0
    cmap_insert(&dp_offload_threads[tid].megaflow_to_mark,
2552
0
                CONST_CAST(struct cmap_node *, &data->node), hash);
2553
0
}
2554
2555
/* Disassociate a megaflow from its mark. */
2556
static void
2557
megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2558
0
{
2559
0
    size_t hash = dp_netdev_flow_hash(mega_ufid);
2560
0
    struct megaflow_to_mark_data *data;
2561
0
    unsigned int tid = netdev_offload_thread_id();
2562
2563
0
    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
2564
0
                             &dp_offload_threads[tid].megaflow_to_mark) {
2565
0
        if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2566
0
            cmap_remove(&dp_offload_threads[tid].megaflow_to_mark,
2567
0
                        CONST_CAST(struct cmap_node *, &data->node), hash);
2568
0
            ovsrcu_postpone(free, data);
2569
0
            return;
2570
0
        }
2571
0
    }
2572
2573
0
    VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2574
0
              UUID_ARGS((struct uuid *)mega_ufid));
2575
0
}
2576
2577
static inline uint32_t
2578
megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2579
0
{
2580
0
    size_t hash = dp_netdev_flow_hash(mega_ufid);
2581
0
    struct megaflow_to_mark_data *data;
2582
0
    unsigned int tid = netdev_offload_thread_id();
2583
2584
0
    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
2585
0
                             &dp_offload_threads[tid].megaflow_to_mark) {
2586
0
        if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2587
0
            return data->mark;
2588
0
        }
2589
0
    }
2590
2591
0
    VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2592
0
             UUID_ARGS((struct uuid *)mega_ufid));
2593
0
    return INVALID_FLOW_MARK;
2594
0
}
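
The two functions above maintain a 1:1 mapping from a 128-bit megaflow ufid to a flow mark. The sketch below (not OVS code; it uses a toy fixed-size chained hash table where OVS uses a concurrent cmap) shows the same associate/find operations in self-contained form.

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_ufid { uint64_t hi, lo; };

struct demo_entry {
    struct demo_ufid ufid;
    uint32_t mark;
    struct demo_entry *next;
};

#define DEMO_BUCKETS 64

static struct demo_entry *table[DEMO_BUCKETS];

static size_t
demo_hash(const struct demo_ufid *u)
{
    return (size_t) (u->hi ^ u->lo) & (DEMO_BUCKETS - 1);
}

static void
demo_associate(const struct demo_ufid *u, uint32_t mark)
{
    struct demo_entry *e = calloc(1, sizeof *e);

    e->ufid = *u;
    e->mark = mark;
    e->next = table[demo_hash(u)];
    table[demo_hash(u)] = e;
}

static bool
demo_find(const struct demo_ufid *u, uint32_t *mark)
{
    for (struct demo_entry *e = table[demo_hash(u)]; e; e = e->next) {
        if (e->ufid.hi == u->hi && e->ufid.lo == u->lo) {
            *mark = e->mark;
            return true;
        }
    }
    return false;
}

int
main(void)
{
    struct demo_ufid u = { 0x1234, 0x5678 };
    uint32_t mark;

    demo_associate(&u, 7);
    if (demo_find(&u, &mark)) {
        printf("mark %" PRIu32 "\n", mark);
    }
    return 0;
}
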
2595
2596
/* Associate a mark with a flow; this is a 1:N mapping. */
2597
static void
2598
mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2599
0
{
2600
0
    unsigned int tid = netdev_offload_thread_id();
2601
0
    dp_netdev_flow_ref(flow);
2602
2603
0
    cmap_insert(&dp_offload_threads[tid].mark_to_flow,
2604
0
                CONST_CAST(struct cmap_node *, &flow->mark_node),
2605
0
                hash_int(mark, 0));
2606
0
    flow->mark = mark;
2607
2608
0
    VLOG_DBG("Associated dp_netdev flow %p with mark %u mega_ufid "UUID_FMT,
2609
0
             flow, mark, UUID_ARGS((struct uuid *) &flow->mega_ufid));
2610
0
}
2611
2612
static bool
2613
flow_mark_has_no_ref(uint32_t mark)
2614
0
{
2615
0
    unsigned int tid = netdev_offload_thread_id();
2616
0
    struct dp_netdev_flow *flow;
2617
2618
0
    CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2619
0
                             &dp_offload_threads[tid].mark_to_flow) {
2620
0
        if (flow->mark == mark) {
2621
0
            return false;
2622
0
        }
2623
0
    }
2624
2625
0
    return true;
2626
0
}
2627
2628
static int
2629
mark_to_flow_disassociate(struct dp_netdev *dp,
2630
                          struct dp_netdev_flow *flow)
2631
0
{
2632
0
    const char *dpif_type_str = dpif_normalize_type(dp->class->type);
2633
0
    struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2634
0
                                             &flow->mark_node);
2635
0
    unsigned int tid = netdev_offload_thread_id();
2636
0
    uint32_t mark = flow->mark;
2637
0
    int ret = 0;
2638
2639
    /* INVALID_FLOW_MARK may mean that the flow has been disassociated or
2640
     * never associated. */
2641
0
    if (OVS_UNLIKELY(mark == INVALID_FLOW_MARK)) {
2642
0
        return EINVAL;
2643
0
    }
2644
2645
0
    cmap_remove(&dp_offload_threads[tid].mark_to_flow,
2646
0
                mark_node, hash_int(mark, 0));
2647
0
    flow->mark = INVALID_FLOW_MARK;
2648
2649
    /*
2650
     * If no flow references the mark any more, remove the flow from
2651
     * hardware and free the mark.
2652
     */
2653
0
    if (flow_mark_has_no_ref(mark)) {
2654
0
        struct netdev *port;
2655
0
        odp_port_t in_port = flow->flow.in_port.odp_port;
2656
2657
0
        port = netdev_ports_get(in_port, dpif_type_str);
2658
0
        if (port) {
2659
            /* Taking a global 'port_rwlock' to fulfill thread safety
2660
             * restrictions regarding netdev port mapping. */
2661
0
            ovs_rwlock_rdlock(&dp->port_rwlock);
2662
0
            ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
2663
0
            ovs_rwlock_unlock(&dp->port_rwlock);
2664
0
            netdev_close(port);
2665
0
        }
2666
2667
0
        flow_mark_free(mark);
2668
0
        VLOG_DBG("Freed flow mark %u mega_ufid "UUID_FMT, mark,
2669
0
                 UUID_ARGS((struct uuid *) &flow->mega_ufid));
2670
2671
0
        megaflow_to_mark_disassociate(&flow->mega_ufid);
2672
0
    }
2673
0
    dp_netdev_flow_unref(flow);
2674
2675
0
    return ret;
2676
0
}
2677
2678
static struct dp_netdev_flow *
2679
mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2680
                  const uint32_t mark)
2681
0
{
2682
0
    struct dp_netdev_flow *flow;
2683
0
    unsigned int tid;
2684
0
    size_t hash;
2685
2686
0
    if (dp_offload_threads == NULL) {
2687
0
        return NULL;
2688
0
    }
2689
2690
0
    hash = hash_int(mark, 0);
2691
0
    for (tid = 0; tid < netdev_offload_thread_nb(); tid++) {
2692
0
        CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash,
2693
0
                                 &dp_offload_threads[tid].mark_to_flow) {
2694
0
            if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2695
0
                flow->dead == false) {
2696
0
                return flow;
2697
0
            }
2698
0
        }
2699
0
    }
2700
2701
0
    return NULL;
2702
0
}
2703
2704
static struct dp_offload_thread_item *
2705
dp_netdev_alloc_flow_offload(struct dp_netdev *dp,
2706
                             struct dp_netdev_flow *flow,
2707
                             int op)
2708
0
{
2709
0
    struct dp_offload_thread_item *item;
2710
0
    struct dp_offload_flow_item *flow_offload;
2711
2712
0
    item = xzalloc(sizeof *item + sizeof *flow_offload);
2713
0
    flow_offload = &item->data->flow;
2714
2715
0
    item->type = DP_OFFLOAD_FLOW;
2716
0
    item->dp = dp;
2717
2718
0
    flow_offload->flow = flow;
2719
0
    flow_offload->op = op;
2720
2721
0
    dp_netdev_flow_ref(flow);
2722
2723
0
    return item;
2724
0
}
2725
2726
static void
2727
dp_netdev_free_flow_offload__(struct dp_offload_thread_item *offload)
2728
0
{
2729
0
    struct dp_offload_flow_item *flow_offload = &offload->data->flow;
2730
2731
0
    free(flow_offload->actions);
2732
0
    free(offload);
2733
0
}
2734
2735
static void
2736
dp_netdev_free_flow_offload(struct dp_offload_thread_item *offload)
2737
0
{
2738
0
    struct dp_offload_flow_item *flow_offload = &offload->data->flow;
2739
2740
0
    dp_netdev_flow_unref(flow_offload->flow);
2741
0
    ovsrcu_postpone(dp_netdev_free_flow_offload__, offload);
2742
0
}
2743
2744
static void
2745
dp_netdev_free_offload(struct dp_offload_thread_item *offload)
2746
0
{
2747
0
    switch (offload->type) {
2748
0
    case DP_OFFLOAD_FLOW:
2749
0
        dp_netdev_free_flow_offload(offload);
2750
0
        break;
2751
0
    case DP_OFFLOAD_FLUSH:
2752
0
        free(offload);
2753
0
        break;
2754
0
    default:
2755
0
        OVS_NOT_REACHED();
2756
0
    };
2757
0
}
2758
2759
static void
2760
dp_netdev_append_offload(struct dp_offload_thread_item *offload,
2761
                         unsigned int tid)
2762
0
{
2763
0
    dp_netdev_offload_init();
2764
2765
0
    mpsc_queue_insert(&dp_offload_threads[tid].queue, &offload->node);
2766
0
    atomic_count_inc64(&dp_offload_threads[tid].enqueued_item);
2767
0
}
2768
2769
static void
2770
dp_netdev_offload_flow_enqueue(struct dp_offload_thread_item *item)
2771
0
{
2772
0
    struct dp_offload_flow_item *flow_offload = &item->data->flow;
2773
0
    unsigned int tid;
2774
2775
0
    ovs_assert(item->type == DP_OFFLOAD_FLOW);
2776
2777
0
    tid = netdev_offload_ufid_to_thread_id(flow_offload->flow->mega_ufid);
2778
0
    dp_netdev_append_offload(item, tid);
2779
0
}
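
An illustrative sketch (not OVS code; the hash mix and thread count are toy values) of the sharding idea above: hashing the 128-bit mega-ufid and reducing it modulo the number of offload threads keeps every request for the same megaflow on the same worker, so per-megaflow operations stay ordered.

#include <stdint.h>
#include <stdio.h>

#define DEMO_N_OFFLOAD_THREADS 4

struct demo_ufid { uint64_t hi, lo; };

static unsigned int
demo_ufid_to_tid(const struct demo_ufid *ufid)
{
    uint64_t h = ufid->hi ^ ufid->lo;   /* Toy mix; OVS uses a real hash. */

    return (unsigned int) (h % DEMO_N_OFFLOAD_THREADS);
}

int
main(void)
{
    struct demo_ufid u = { 0xdeadbeef, 0x42 };

    printf("ufid -> offload thread %u\n", demo_ufid_to_tid(&u));
    return 0;
}
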
2780
2781
static int
2782
dp_netdev_flow_offload_del(struct dp_offload_thread_item *item)
2783
0
{
2784
0
    return mark_to_flow_disassociate(item->dp, item->data->flow.flow);
2785
0
}
2786
2787
/*
2788
 * There are two flow offload operations here: addition and modification.
2789
 *
2790
 * For flow addition, this function does:
2791
 * - allocate a new flow mark id
2792
 * - perform the hardware flow offload
2793
 * - associate the flow mark with the flow and the megaflow
2794
 *
2795
 * For flow modification, both the flow mark and the associations are
2796
 * still valid, so only the second item is needed.
2797
 */
2798
static int
2799
dp_netdev_flow_offload_put(struct dp_offload_thread_item *item)
2800
0
{
2801
0
    struct dp_offload_flow_item *offload = &item->data->flow;
2802
0
    struct dp_netdev *dp = item->dp;
2803
0
    struct dp_netdev_flow *flow = offload->flow;
2804
0
    odp_port_t in_port = flow->flow.in_port.odp_port;
2805
0
    const char *dpif_type_str = dpif_normalize_type(dp->class->type);
2806
0
    bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD
2807
0
                        && flow->mark != INVALID_FLOW_MARK;
2808
0
    struct offload_info info;
2809
0
    struct netdev *port;
2810
0
    uint32_t mark;
2811
0
    int ret;
2812
2813
0
    if (flow->dead) {
2814
0
        return -1;
2815
0
    }
2816
2817
0
    if (modification) {
2818
0
        mark = flow->mark;
2819
0
    } else {
2820
        /*
2821
         * If a mega flow has already been offloaded (from other PMD
2822
         * instances), do not offload it again.
2823
         */
2824
0
        mark = megaflow_to_mark_find(&flow->mega_ufid);
2825
0
        if (mark != INVALID_FLOW_MARK) {
2826
0
            VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2827
0
            if (flow->mark != INVALID_FLOW_MARK) {
2828
0
                ovs_assert(flow->mark == mark);
2829
0
            } else {
2830
0
                mark_to_flow_associate(mark, flow);
2831
0
            }
2832
0
            return 0;
2833
0
        }
2834
2835
0
        mark = flow_mark_alloc();
2836
0
        if (mark == INVALID_FLOW_MARK) {
2837
0
            VLOG_ERR("Failed to allocate flow mark!\n");
2838
0
            return -1;
2839
0
        }
2840
0
    }
2841
0
    info.flow_mark = mark;
2842
0
    info.orig_in_port = offload->orig_in_port;
2843
2844
0
    port = netdev_ports_get(in_port, dpif_type_str);
2845
0
    if (!port) {
2846
0
        goto err_free;
2847
0
    }
2848
2849
    /* Taking a global 'port_rwlock' to fulfill thread safety
2850
     * restrictions regarding the netdev port mapping. */
2851
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
2852
0
    ret = netdev_flow_put(port, &offload->match,
2853
0
                          CONST_CAST(struct nlattr *, offload->actions),
2854
0
                          offload->actions_len, &flow->mega_ufid, &info,
2855
0
                          NULL);
2856
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2857
0
    netdev_close(port);
2858
2859
0
    if (ret) {
2860
0
        goto err_free;
2861
0
    }
2862
2863
0
    if (!modification) {
2864
0
        megaflow_to_mark_associate(&flow->mega_ufid, mark);
2865
0
        mark_to_flow_associate(mark, flow);
2866
0
    }
2867
0
    return 0;
2868
2869
0
err_free:
2870
0
    if (!modification) {
2871
0
        flow_mark_free(mark);
2872
0
    } else {
2873
0
        mark_to_flow_disassociate(item->dp, flow);
2874
0
    }
2875
0
    return -1;
2876
0
}
2877
2878
static void
2879
dp_offload_flow(struct dp_offload_thread_item *item)
2880
0
{
2881
0
    struct dp_offload_flow_item *flow_offload = &item->data->flow;
2882
0
    const char *op;
2883
0
    int ret;
2884
2885
0
    switch (flow_offload->op) {
2886
0
    case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2887
0
        op = "add";
2888
0
        ret = dp_netdev_flow_offload_put(item);
2889
0
        break;
2890
0
    case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2891
0
        op = "modify";
2892
0
        ret = dp_netdev_flow_offload_put(item);
2893
0
        break;
2894
0
    case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2895
0
        op = "delete";
2896
0
        ret = dp_netdev_flow_offload_del(item);
2897
0
        break;
2898
0
    default:
2899
0
        OVS_NOT_REACHED();
2900
0
    }
2901
2902
0
    VLOG_DBG("%s to %s netdev flow "UUID_FMT,
2903
0
             ret == 0 ? "succeed" : "failed", op,
2904
0
             UUID_ARGS((struct uuid *) &flow_offload->flow->mega_ufid));
2905
0
}
2906
2907
static void
2908
dp_offload_flush(struct dp_offload_thread_item *item)
2909
0
{
2910
0
    struct dp_offload_flush_item *flush = &item->data->flush;
2911
2912
0
    ovs_rwlock_rdlock(&item->dp->port_rwlock);
2913
0
    netdev_flow_flush(flush->netdev);
2914
0
    ovs_rwlock_unlock(&item->dp->port_rwlock);
2915
2916
0
    ovs_barrier_block(flush->barrier);
2917
2918
    /* Allow the initiator thread to take the port lock again
2919
     * before offload operations continue in this thread.
2920
     */
2921
0
    ovs_barrier_block(flush->barrier);
2922
0
}
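
A minimal pthreads sketch (not OVS code; OVS uses its own ovs_barrier) of the two-phase barrier used above: the worker hits the barrier once to tell the initiator the flush is done, and a second time so the initiator can retake its lock before the worker resumes processing.

#include <pthread.h>
#include <stdio.h>

static pthread_barrier_t barrier;
static pthread_mutex_t resource_lock = PTHREAD_MUTEX_INITIALIZER;

static void *
worker(void *arg)
{
    (void) arg;
    printf("worker: flushing\n");
    pthread_barrier_wait(&barrier);   /* 1. Flush complete. */
    pthread_barrier_wait(&barrier);   /* 2. Initiator holds the lock again. */
    printf("worker: resuming other work\n");
    return NULL;
}

int
main(void)
{
    pthread_t tid;

    pthread_barrier_init(&barrier, NULL, 2);
    pthread_create(&tid, NULL, worker, NULL);

    /* The initiator has already dropped 'resource_lock' before waiting,
     * so the worker can flush without deadlocking. */
    pthread_barrier_wait(&barrier);          /* Wait for the flush... */
    pthread_mutex_lock(&resource_lock);      /* ...retake the lock... */
    pthread_barrier_wait(&barrier);          /* ...then release the worker. */
    pthread_mutex_unlock(&resource_lock);

    pthread_join(tid, NULL);
    pthread_barrier_destroy(&barrier);
    return 0;
}
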
2923
2924
0
#define DP_NETDEV_OFFLOAD_BACKOFF_MIN 1
2925
0
#define DP_NETDEV_OFFLOAD_BACKOFF_MAX 64
2926
0
#define DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US (10 * 1000) /* 10 ms */
2927
2928
static void *
2929
dp_netdev_flow_offload_main(void *arg)
2930
0
{
2931
0
    struct dp_offload_thread *ofl_thread = arg;
2932
0
    struct dp_offload_thread_item *offload;
2933
0
    struct mpsc_queue_node *node;
2934
0
    struct mpsc_queue *queue;
2935
0
    long long int latency_us;
2936
0
    long long int next_rcu;
2937
0
    long long int now;
2938
0
    uint64_t backoff;
2939
2940
0
    queue = &ofl_thread->queue;
2941
0
    mpsc_queue_acquire(queue);
2942
2943
0
    while (true) {
2944
0
        backoff = DP_NETDEV_OFFLOAD_BACKOFF_MIN;
2945
0
        while (mpsc_queue_tail(queue) == NULL) {
2946
0
            xnanosleep(backoff * 1E6);
2947
0
            if (backoff < DP_NETDEV_OFFLOAD_BACKOFF_MAX) {
2948
0
                backoff <<= 1;
2949
0
            }
2950
0
        }
2951
2952
0
        next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
2953
0
        MPSC_QUEUE_FOR_EACH_POP (node, queue) {
2954
0
            offload = CONTAINER_OF(node, struct dp_offload_thread_item, node);
2955
0
            atomic_count_dec64(&ofl_thread->enqueued_item);
2956
2957
0
            switch (offload->type) {
2958
0
            case DP_OFFLOAD_FLOW:
2959
0
                dp_offload_flow(offload);
2960
0
                break;
2961
0
            case DP_OFFLOAD_FLUSH:
2962
0
                dp_offload_flush(offload);
2963
0
                break;
2964
0
            default:
2965
0
                OVS_NOT_REACHED();
2966
0
            }
2967
2968
0
            now = time_usec();
2969
2970
0
            latency_us = now - offload->timestamp;
2971
0
            mov_avg_cma_update(&ofl_thread->cma, latency_us);
2972
0
            mov_avg_ema_update(&ofl_thread->ema, latency_us);
2973
2974
0
            dp_netdev_free_offload(offload);
2975
2976
            /* Do RCU synchronization at fixed interval. */
2977
0
            if (now > next_rcu) {
2978
0
                ovsrcu_quiesce();
2979
0
                next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
2980
0
            }
2981
0
        }
2982
0
    }
2983
2984
0
    OVS_NOT_REACHED();
2985
0
    mpsc_queue_release(queue);
2986
2987
0
    return NULL;
2988
0
}
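
A minimal sketch (not OVS code; the queue check is a stub) of the idle backoff used by the offload thread above: sleep 1 ms when the queue is empty, double the sleep up to 64 ms, and stop backing off as soon as work arrives.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define BACKOFF_MIN_MS 1
#define BACKOFF_MAX_MS 64

static int polls;

static bool
queue_has_work(void)
{
    return ++polls >= 4;   /* Stub: pretend work arrives on the 4th poll. */
}

static void
sleep_ms(uint64_t ms)
{
    struct timespec ts = { (time_t) (ms / 1000), (long) ((ms % 1000) * 1000000) };

    nanosleep(&ts, NULL);
}

static void
wait_for_work(void)
{
    uint64_t backoff = BACKOFF_MIN_MS;

    while (!queue_has_work()) {
        sleep_ms(backoff);
        if (backoff < BACKOFF_MAX_MS) {
            backoff <<= 1;      /* Exponential backoff while idle. */
        }
    }
}

int
main(void)
{
    wait_for_work();
    printf("work available after %d polls\n", polls);
    return 0;
}
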
2989
2990
static void
2991
queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2992
                      struct dp_netdev_flow *flow)
2993
0
{
2994
0
    struct dp_offload_thread_item *offload;
2995
2996
0
    if (!netdev_is_flow_api_enabled()) {
2997
0
        return;
2998
0
    }
2999
3000
0
    offload = dp_netdev_alloc_flow_offload(pmd->dp, flow,
3001
0
                                           DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
3002
0
    offload->timestamp = pmd->ctx.now;
3003
0
    dp_netdev_offload_flow_enqueue(offload);
3004
0
}
3005
3006
static void
3007
log_netdev_flow_change(const struct dp_netdev_flow *flow,
3008
                       const struct match *match,
3009
                       const struct dp_netdev_actions *old_actions,
3010
                       const struct nlattr *actions,
3011
                       size_t actions_len)
3012
0
{
3013
0
    struct ds ds = DS_EMPTY_INITIALIZER;
3014
0
    struct ofpbuf key_buf, mask_buf;
3015
0
    struct odp_flow_key_parms odp_parms = {
3016
0
        .flow = &match->flow,
3017
0
        .mask = &match->wc.masks,
3018
0
        .support = dp_netdev_support,
3019
0
    };
3020
3021
0
    if (OVS_LIKELY(VLOG_DROP_DBG((&upcall_rl)))) {
3022
0
        return;
3023
0
    }
3024
3025
0
    ofpbuf_init(&key_buf, 0);
3026
0
    ofpbuf_init(&mask_buf, 0);
3027
3028
0
    odp_flow_key_from_flow(&odp_parms, &key_buf);
3029
0
    odp_parms.key_buf = &key_buf;
3030
0
    odp_flow_key_from_mask(&odp_parms, &mask_buf);
3031
3032
0
    if (old_actions) {
3033
0
        ds_put_cstr(&ds, "flow_mod: ");
3034
0
    } else {
3035
0
        ds_put_cstr(&ds, "flow_add: ");
3036
0
    }
3037
0
    odp_format_ufid(&flow->ufid, &ds);
3038
0
    ds_put_cstr(&ds, " mega_");
3039
0
    odp_format_ufid(&flow->mega_ufid, &ds);
3040
0
    ds_put_cstr(&ds, " ");
3041
0
    odp_flow_format(key_buf.data, key_buf.size,
3042
0
                    mask_buf.data, mask_buf.size,
3043
0
                    NULL, &ds, false, true);
3044
0
    if (old_actions) {
3045
0
        ds_put_cstr(&ds, ", old_actions:");
3046
0
        format_odp_actions(&ds, old_actions->actions, old_actions->size,
3047
0
                           NULL);
3048
0
    }
3049
0
    ds_put_cstr(&ds, ", actions:");
3050
0
    format_odp_actions(&ds, actions, actions_len, NULL);
3051
3052
0
    VLOG_DBG("%s", ds_cstr(&ds));
3053
3054
0
    ofpbuf_uninit(&key_buf);
3055
0
    ofpbuf_uninit(&mask_buf);
3056
3057
    /* Add a printout of the actual match installed. */
3058
0
    struct match m;
3059
0
    ds_clear(&ds);
3060
0
    ds_put_cstr(&ds, "flow match: ");
3061
0
    miniflow_expand(&flow->cr.flow.mf, &m.flow);
3062
0
    miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
3063
0
    memset(&m.tun_md, 0, sizeof m.tun_md);
3064
0
    match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3065
3066
0
    VLOG_DBG("%s", ds_cstr(&ds));
3067
3068
0
    ds_destroy(&ds);
3069
0
}
3070
3071
static void
3072
queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
3073
                      struct dp_netdev_flow *flow, struct match *match,
3074
                      const struct nlattr *actions, size_t actions_len,
3075
                      int op)
3076
0
{
3077
0
    struct dp_offload_thread_item *item;
3078
0
    struct dp_offload_flow_item *flow_offload;
3079
3080
0
    if (!netdev_is_flow_api_enabled()) {
3081
0
        return;
3082
0
    }
3083
3084
0
    item = dp_netdev_alloc_flow_offload(pmd->dp, flow, op);
3085
0
    flow_offload = &item->data->flow;
3086
0
    flow_offload->match = *match;
3087
0
    flow_offload->actions = xmalloc(actions_len);
3088
0
    memcpy(flow_offload->actions, actions, actions_len);
3089
0
    flow_offload->actions_len = actions_len;
3090
0
    flow_offload->orig_in_port = flow->orig_in_port;
3091
3092
0
    item->timestamp = pmd->ctx.now;
3093
0
    dp_netdev_offload_flow_enqueue(item);
3094
0
}
3095
3096
static void
3097
dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
3098
                          struct dp_netdev_flow *flow)
3099
    OVS_REQUIRES(pmd->flow_mutex)
3100
0
{
3101
0
    struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
3102
0
    struct dpcls *cls;
3103
0
    odp_port_t in_port = flow->flow.in_port.odp_port;
3104
3105
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
3106
0
    ovs_assert(cls != NULL);
3107
0
    dpcls_remove(cls, &flow->cr);
3108
0
    dp_netdev_simple_match_remove(pmd, flow);
3109
0
    cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
3110
0
    ccmap_dec(&pmd->n_flows, odp_to_u32(in_port));
3111
0
    queue_netdev_flow_del(pmd, flow);
3112
0
    flow->dead = true;
3113
3114
0
    dp_netdev_flow_unref(flow);
3115
0
}
3116
3117
static void
3118
dp_netdev_offload_flush_enqueue(struct dp_netdev *dp,
3119
                                struct netdev *netdev,
3120
                                struct ovs_barrier *barrier)
3121
0
{
3122
0
    unsigned int tid;
3123
0
    long long int now_us = time_usec();
3124
3125
0
    for (tid = 0; tid < netdev_offload_thread_nb(); tid++) {
3126
0
        struct dp_offload_thread_item *item;
3127
0
        struct dp_offload_flush_item *flush;
3128
3129
0
        item = xmalloc(sizeof *item + sizeof *flush);
3130
0
        item->type = DP_OFFLOAD_FLUSH;
3131
0
        item->dp = dp;
3132
0
        item->timestamp = now_us;
3133
3134
0
        flush = &item->data->flush;
3135
0
        flush->netdev = netdev;
3136
0
        flush->barrier = barrier;
3137
3138
0
        dp_netdev_append_offload(item, tid);
3139
0
    }
3140
0
}
3141
3142
/* Blocking call that will wait on the offload thread to
3143
 * complete its work.  As the flush order will only be
3144
 * enqueued after existing offload requests, those previous
3145
 * offload requests must be processed, which requires being
3146
 * able to lock the 'port_rwlock' from the offload thread.
3147
 *
3148
 * Flow offload flush is done when a port is being deleted.
3149
 * Right after this call executes, the offload API is disabled
3150
 * for the port. This call must be made blocking until the
3151
 * offload provider has completed its job.
3152
 */
3153
static void
3154
dp_netdev_offload_flush(struct dp_netdev *dp,
3155
                        struct dp_netdev_port *port)
3156
    OVS_REQ_WRLOCK(dp->port_rwlock)
3157
0
{
3158
    /* The flush mutex guarantees exclusive access to the static barrier,
3159
     * and prevents several flush orders from being issued at the same time.
3160
     *
3161
     * The barrier's memory needs to outlive the function scope, as the
3162
     * other threads can resume from blocking after this function has
3163
     * already finished.
3164
     *
3165
     * Additionally, because the flush operation is blocking, it would
3166
     * deadlock if multiple offload threads were blocking on several
3167
     * different barriers. Only allow a single flush order in the offload
3168
     * queue at a time.
3169
     */
3170
0
    static struct ovs_mutex flush_mutex = OVS_MUTEX_INITIALIZER;
3171
0
    static struct ovs_barrier barrier OVS_GUARDED_BY(flush_mutex);
3172
0
    struct netdev *netdev;
3173
3174
0
    if (!netdev_is_flow_api_enabled()) {
3175
0
        return;
3176
0
    }
3177
3178
0
    ovs_rwlock_unlock(&dp->port_rwlock);
3179
0
    ovs_mutex_lock(&flush_mutex);
3180
3181
    /* This thread and the offload threads. */
3182
0
    ovs_barrier_init(&barrier, 1 + netdev_offload_thread_nb());
3183
3184
0
    netdev = netdev_ref(port->netdev);
3185
0
    dp_netdev_offload_flush_enqueue(dp, netdev, &barrier);
3186
0
    ovs_barrier_block(&barrier);
3187
0
    netdev_close(netdev);
3188
3189
    /* Take back the datapath port lock before allowing the offload
3190
     * threads to proceed further. The port deletion must complete first,
3191
     * to ensure no further offloads are inserted after the flush.
3192
     *
3193
     * Some offload provider (e.g. DPDK) keeps a netdev reference with
3194
     * the offload data. If this reference is not closed, the netdev is
3195
     * kept indefinitely. */
3196
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
3197
3198
0
    ovs_barrier_block(&barrier);
3199
0
    ovs_barrier_destroy(&barrier);
3200
3201
0
    ovs_mutex_unlock(&flush_mutex);
3202
0
}
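
/* Editorial sketch, not part of the original source: the offload-thread side
 * of the flush handshake implemented above.  The real handler is
 * dp_offload_flush(), dispatched from the offload thread loop; this outline
 * only illustrates why the barrier is sized for 1 + netdev_offload_thread_nb()
 * participants and crossed twice. */
static void
offload_flush_handshake_sketch(struct dp_offload_flush_item *flush)
{
    /* 1. Flush this thread's offloaded flows on 'flush->netdev'. */

    /* 2. First barrier: signal the main thread that flushing is done, so it
     *    can re-take 'port_rwlock' and finish deleting the port. */
    ovs_barrier_block(flush->barrier);

    /* 3. Second barrier: do not proceed to further offload requests until
     *    the main thread holds 'port_rwlock' again and is done with the
     *    barrier. */
    ovs_barrier_block(flush->barrier);
}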
3203
3204
static void
3205
dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
3206
0
{
3207
0
    struct dp_netdev_flow *netdev_flow;
3208
3209
0
    ovs_mutex_lock(&pmd->flow_mutex);
3210
0
    CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
3211
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3212
0
    }
3213
0
    ovs_mutex_unlock(&pmd->flow_mutex);
3214
0
}
3215
3216
static int
3217
dpif_netdev_flow_flush(struct dpif *dpif)
3218
0
{
3219
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3220
0
    struct dp_netdev_pmd_thread *pmd;
3221
3222
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3223
0
        dp_netdev_pmd_flow_flush(pmd);
3224
0
    }
3225
3226
0
    return 0;
3227
0
}
3228
3229
struct dp_netdev_port_state {
3230
    struct hmap_position position;
3231
    char *name;
3232
};
3233
3234
static int
3235
dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
3236
0
{
3237
0
    *statep = xzalloc(sizeof(struct dp_netdev_port_state));
3238
0
    return 0;
3239
0
}
3240
3241
static int
3242
dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
3243
                           struct dpif_port *dpif_port)
3244
0
{
3245
0
    struct dp_netdev_port_state *state = state_;
3246
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3247
0
    struct hmap_node *node;
3248
0
    int retval;
3249
3250
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
3251
0
    node = hmap_at_position(&dp->ports, &state->position);
3252
0
    if (node) {
3253
0
        struct dp_netdev_port *port;
3254
3255
0
        port = CONTAINER_OF(node, struct dp_netdev_port, node);
3256
3257
0
        free(state->name);
3258
0
        state->name = xstrdup(netdev_get_name(port->netdev));
3259
0
        dpif_port->name = state->name;
3260
0
        dpif_port->type = port->type;
3261
0
        dpif_port->port_no = port->port_no;
3262
3263
0
        retval = 0;
3264
0
    } else {
3265
0
        retval = EOF;
3266
0
    }
3267
0
    ovs_rwlock_unlock(&dp->port_rwlock);
3268
3269
0
    return retval;
3270
0
}
3271
3272
static int
3273
dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
3274
0
{
3275
0
    struct dp_netdev_port_state *state = state_;
3276
0
    free(state->name);
3277
0
    free(state);
3278
0
    return 0;
3279
0
}
3280
3281
static int
3282
dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
3283
0
{
3284
0
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
3285
0
    uint64_t new_port_seq;
3286
0
    int error;
3287
3288
0
    new_port_seq = seq_read(dpif->dp->port_seq);
3289
0
    if (dpif->last_port_seq != new_port_seq) {
3290
0
        dpif->last_port_seq = new_port_seq;
3291
0
        error = ENOBUFS;
3292
0
    } else {
3293
0
        error = EAGAIN;
3294
0
    }
3295
3296
0
    return error;
3297
0
}
3298
3299
static void
3300
dpif_netdev_port_poll_wait(const struct dpif *dpif_)
3301
0
{
3302
0
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
3303
3304
0
    seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
3305
0
}
3306
3307
static struct dp_netdev_flow *
3308
dp_netdev_flow_cast(const struct dpcls_rule *cr)
3309
0
{
3310
0
    return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
3311
0
}
3312
3313
static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
3314
0
{
3315
0
    return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
3316
0
}
3317
3318
/* netdev_flow_key utilities.
3319
 *
3320
 * netdev_flow_key is basically a miniflow.  We use these functions
3321
 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
3322
 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
3323
 *
3324
 * - Since we are dealing exclusively with miniflows created by
3325
 *   miniflow_extract(), if the map is different the miniflow is different.
3326
 *   Therefore we can be faster by comparing the map and the miniflow in a
3327
 *   single memcmp().
3328
 * - These functions can be inlined by the compiler. */
3329
3330
static inline bool
3331
netdev_flow_key_equal(const struct netdev_flow_key *a,
3332
                      const struct netdev_flow_key *b)
3333
0
{
3334
    /* 'b->len' may not be set yet. */
3335
0
    return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
3336
0
}
3337
3338
static inline void
3339
netdev_flow_key_clone(struct netdev_flow_key *dst,
3340
                      const struct netdev_flow_key *src)
3341
0
{
3342
0
    memcpy(dst, src,
3343
0
           offsetof(struct netdev_flow_key, mf) + src->len);
3344
0
}
3345
3346
/* Initialize a netdev_flow_key 'mask' from 'match'. */
3347
static inline void
3348
netdev_flow_mask_init(struct netdev_flow_key *mask,
3349
                      const struct match *match)
3350
0
{
3351
0
    uint64_t *dst = miniflow_values(&mask->mf);
3352
0
    struct flowmap fmap;
3353
0
    uint32_t hash = 0;
3354
0
    size_t idx;
3355
3356
    /* Only check masks that make sense for the flow. */
3357
0
    flow_wc_map(&match->flow, &fmap);
3358
0
    flowmap_init(&mask->mf.map);
3359
3360
0
    FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
3361
0
        uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
3362
3363
0
        if (mask_u64) {
3364
0
            flowmap_set(&mask->mf.map, idx, 1);
3365
0
            *dst++ = mask_u64;
3366
0
            hash = hash_add64(hash, mask_u64);
3367
0
        }
3368
0
    }
3369
3370
0
    map_t map;
3371
3372
0
    FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
3373
0
        hash = hash_add64(hash, map);
3374
0
    }
3375
3376
0
    size_t n = dst - miniflow_get_values(&mask->mf);
3377
3378
0
    mask->hash = hash_finish(hash, n * 8);
3379
0
    mask->len = netdev_flow_key_size(n);
3380
0
}
3381
3382
/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
3383
static inline void
3384
netdev_flow_key_init_masked(struct netdev_flow_key *dst,
3385
                            const struct flow *flow,
3386
                            const struct netdev_flow_key *mask)
3387
0
{
3388
0
    uint64_t *dst_u64 = miniflow_values(&dst->mf);
3389
0
    const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
3390
0
    uint32_t hash = 0;
3391
0
    uint64_t value;
3392
3393
0
    dst->len = mask->len;
3394
0
    dst->mf = mask->mf;   /* Copy maps. */
3395
3396
0
    FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
3397
0
        *dst_u64 = value & *mask_u64++;
3398
0
        hash = hash_add64(hash, *dst_u64++);
3399
0
    }
3400
0
    dst->hash = hash_finish(hash,
3401
0
                            (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
3402
0
}
3403
3404
/* Initializes 'key' as a copy of 'flow'. */
3405
static inline void
3406
netdev_flow_key_init(struct netdev_flow_key *key,
3407
                     const struct flow *flow)
3408
0
{
3409
0
    uint32_t hash = 0;
3410
0
    uint64_t value;
3411
3412
0
    miniflow_map_init(&key->mf, flow);
3413
0
    miniflow_init(&key->mf, flow);
3414
3415
0
    size_t n = miniflow_n_values(&key->mf);
3416
3417
0
    FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) {
3418
0
        hash = hash_add64(hash, value);
3419
0
    }
3420
3421
0
    key->hash = hash_finish(hash, n * 8);
3422
0
    key->len = netdev_flow_key_size(n);
3423
0
}
3424
3425
static inline void
3426
emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
3427
                 const struct netdev_flow_key *key)
3428
0
{
3429
0
    if (ce->flow != flow) {
3430
0
        if (ce->flow) {
3431
0
            dp_netdev_flow_unref(ce->flow);
3432
0
        }
3433
3434
0
        if (dp_netdev_flow_ref(flow)) {
3435
0
            ce->flow = flow;
3436
0
        } else {
3437
0
            ce->flow = NULL;
3438
0
        }
3439
0
    }
3440
0
    if (key) {
3441
0
        netdev_flow_key_clone(&ce->key, key);
3442
0
    }
3443
0
}
3444
3445
static inline void
3446
emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
3447
           struct dp_netdev_flow *flow)
3448
0
{
3449
0
    struct emc_entry *to_be_replaced = NULL;
3450
0
    struct emc_entry *current_entry;
3451
3452
0
    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
3453
0
        if (netdev_flow_key_equal(&current_entry->key, key)) {
3454
            /* We found the entry with the 'mf' miniflow */
3455
0
            emc_change_entry(current_entry, flow, NULL);
3456
0
            return;
3457
0
        }
3458
3459
        /* Replacement policy: put the flow in an empty (not alive) entry, or
3460
         * else in the probed entry with the lowest key hash. */
3461
0
        if (!to_be_replaced
3462
0
            || (emc_entry_alive(to_be_replaced)
3463
0
                && !emc_entry_alive(current_entry))
3464
0
            || current_entry->key.hash < to_be_replaced->key.hash) {
3465
0
            to_be_replaced = current_entry;
3466
0
        }
3467
0
    }
3468
    /* We didn't find the miniflow in the cache.
3469
     * The 'to_be_replaced' entry is where the new flow will be stored */
3470
3471
0
    emc_change_entry(to_be_replaced, flow, key);
3472
0
}
3473
3474
static inline void
3475
emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
3476
                         const struct netdev_flow_key *key,
3477
                         struct dp_netdev_flow *flow)
3478
0
{
3479
    /* Insert an entry into the EMC based on probability value 'min'. By
3480
     * default the value is UINT32_MAX / 100 which yields an insertion
3481
     * probability of 1/100, i.e. 1%. */
3482
3483
0
    uint32_t min = pmd->ctx.emc_insert_min;
3484
3485
0
    if (min && random_uint32() <= min) {
3486
0
        emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
3487
0
    }
3488
0
}
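
/* Editorial sketch, not part of the original source: how the 'min' threshold
 * compared against random_uint32() above relates to a configured inverse
 * probability.  'emc_insert_inv_prob_to_min' is a hypothetical helper; with
 * inv_prob == 100 it reproduces the default UINT32_MAX / 100, i.e. ~1%. */
static uint32_t
emc_insert_inv_prob_to_min(uint32_t inv_prob)
{
    /* inv_prob == 0 disables EMC insertion: emc_probabilistic_insert()
     * skips the insert when 'min' is zero. */
    return inv_prob ? UINT32_MAX / inv_prob : 0;
}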
3489
3490
static inline const struct cmap_node *
3491
smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
3492
0
{
3493
0
    struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
3494
0
    struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
3495
0
    uint16_t sig = hash >> 16;
3496
0
    uint16_t index = UINT16_MAX;
3497
3498
0
    for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3499
0
        if (bucket->sig[i] == sig) {
3500
0
            index = bucket->flow_idx[i];
3501
0
            break;
3502
0
        }
3503
0
    }
3504
0
    if (index != UINT16_MAX) {
3505
0
        return cmap_find_by_index(&pmd->flow_table, index);
3506
0
    }
3507
0
    return NULL;
3508
0
}
3509
3510
/* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
3511
 * turned off, 2) the flow_table index is larger than uint16_t can handle.
3512
 * If there is already an SMC entry with the same signature, the index is
3513
 * updated. If there is no existing entry but an empty one is available, the
3514
 * empty entry will be taken. If there is neither an empty entry nor a
3515
 * matching signature, a random entry from the hashed bucket is picked. */
3516
static inline void
3517
smc_insert(struct dp_netdev_pmd_thread *pmd,
3518
           const struct netdev_flow_key *key,
3519
           uint32_t hash)
3520
0
{
3521
0
    struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
3522
0
    struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
3523
0
    uint16_t index;
3524
0
    uint32_t cmap_index;
3525
0
    int i;
3526
3527
0
    if (!pmd->ctx.smc_enable_db) {
3528
0
        return;
3529
0
    }
3530
3531
0
    cmap_index = cmap_find_index(&pmd->flow_table, hash);
3532
0
    index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
3533
3534
    /* If the index is larger than SMC can handle (uint16_t), we don't
3535
     * insert */
3536
0
    if (index == UINT16_MAX) {
3537
0
        return;
3538
0
    }
3539
3540
    /* If an entry with the same signature already exists, update the index. */
3541
0
    uint16_t sig = key->hash >> 16;
3542
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3543
0
        if (bucket->sig[i] == sig) {
3544
0
            bucket->flow_idx[i] = index;
3545
0
            return;
3546
0
        }
3547
0
    }
3548
    /* If there is an empty entry, occupy it. */
3549
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3550
0
        if (bucket->flow_idx[i] == UINT16_MAX) {
3551
0
            bucket->sig[i] = sig;
3552
0
            bucket->flow_idx[i] = index;
3553
0
            return;
3554
0
        }
3555
0
    }
3556
    /* Otherwise, pick a random entry. */
3557
0
    i = random_uint32() % SMC_ENTRY_PER_BUCKET;
3558
0
    bucket->sig[i] = sig;
3559
0
    bucket->flow_idx[i] = index;
3560
0
}
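
/* Editorial sketch, not part of the original source: how a 32-bit packet hash
 * is split into the SMC bucket index and 16-bit signature used by
 * smc_entry_get() and smc_insert() above.  SMC_MASK and SMC_ENTRY_PER_BUCKET
 * come from "dpif-netdev-private-dfc.h". */
static inline void
smc_slot_of_hash(uint32_t hash, uint32_t *bucket_idx, uint16_t *sig)
{
    *bucket_idx = hash & SMC_MASK;  /* Low bits select the bucket. */
    *sig = hash >> 16;              /* High 16 bits form the signature. */
    /* Each bucket stores SMC_ENTRY_PER_BUCKET (sig, flow_idx) pairs, where
     * 'flow_idx' indexes 'pmd->flow_table' and UINT16_MAX marks an empty
     * slot. */
}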
3561
3562
inline void
3563
emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
3564
                               const struct netdev_flow_key *keys,
3565
                               struct dpcls_rule **rules,
3566
                               uint32_t emc_insert_mask)
3567
0
{
3568
0
    while (emc_insert_mask) {
3569
0
        uint32_t i = raw_ctz(emc_insert_mask);
3570
0
        emc_insert_mask &= emc_insert_mask - 1;
3571
        /* Get the required parameters for EMC/SMC from the rule. */
3572
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
3573
        /* Insert the key into EMC/SMC. */
3574
0
        emc_probabilistic_insert(pmd, &keys[i], flow);
3575
0
    }
3576
0
}
3577
3578
inline void
3579
smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
3580
                 const struct netdev_flow_key *keys,
3581
                 struct dpcls_rule **rules,
3582
                 uint32_t smc_insert_mask)
3583
0
{
3584
0
    while (smc_insert_mask) {
3585
0
        uint32_t i = raw_ctz(smc_insert_mask);
3586
0
        smc_insert_mask &= smc_insert_mask - 1;
3587
        /* Get the required parameters for EMC/SMC from the rule. */
3588
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
3589
0
        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
3590
        /* Insert the key into EMC/SMC. */
3591
0
        smc_insert(pmd, &keys[i], hash);
3592
0
    }
3593
0
}
3594
3595
static struct dp_netdev_flow *
3596
dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
3597
                          const struct netdev_flow_key *key,
3598
                          int *lookup_num_p)
3599
0
{
3600
0
    struct dpcls *cls;
3601
0
    struct dpcls_rule *rule = NULL;
3602
0
    odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
3603
0
                                                     in_port.odp_port));
3604
0
    struct dp_netdev_flow *netdev_flow = NULL;
3605
3606
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
3607
0
    if (OVS_LIKELY(cls)) {
3608
0
        dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3609
0
        netdev_flow = dp_netdev_flow_cast(rule);
3610
0
    }
3611
0
    return netdev_flow;
3612
0
}
3613
3614
static struct dp_netdev_flow *
3615
dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3616
                        const ovs_u128 *ufidp, const struct nlattr *key,
3617
                        size_t key_len)
3618
0
{
3619
0
    struct dp_netdev_flow *netdev_flow;
3620
0
    struct flow flow;
3621
0
    ovs_u128 ufid;
3622
3623
    /* If a UFID is not provided, determine one based on the key. */
3624
0
    if (!ufidp && key && key_len
3625
0
        && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
3626
0
        odp_flow_key_hash(&flow, sizeof flow, &ufid);
3627
0
        ufidp = &ufid;
3628
0
    }
3629
3630
0
    if (ufidp) {
3631
0
        CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3632
0
                                 &pmd->flow_table) {
3633
0
            if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3634
0
                return netdev_flow;
3635
0
            }
3636
0
        }
3637
0
    }
3638
3639
0
    return NULL;
3640
0
}
3641
3642
static void
3643
dp_netdev_flow_set_last_stats_attrs(struct dp_netdev_flow *netdev_flow,
3644
                                    const struct dpif_flow_stats *stats,
3645
                                    const struct dpif_flow_attrs *attrs,
3646
                                    int result)
3647
0
{
3648
0
    struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats;
3649
0
    struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs;
3650
3651
0
    atomic_store_relaxed(&netdev_flow->netdev_flow_get_result, result);
3652
0
    if (result) {
3653
0
        return;
3654
0
    }
3655
3656
0
    atomic_store_relaxed(&last_stats->used,         stats->used);
3657
0
    atomic_store_relaxed(&last_stats->packet_count, stats->n_packets);
3658
0
    atomic_store_relaxed(&last_stats->byte_count,   stats->n_bytes);
3659
0
    atomic_store_relaxed(&last_stats->tcp_flags,    stats->tcp_flags);
3660
3661
0
    atomic_store_relaxed(&last_attrs->offloaded,    attrs->offloaded);
3662
0
    atomic_store_relaxed(&last_attrs->dp_layer,     attrs->dp_layer);
3663
3664
0
}
3665
3666
static void
3667
dp_netdev_flow_get_last_stats_attrs(struct dp_netdev_flow *netdev_flow,
3668
                                    struct dpif_flow_stats *stats,
3669
                                    struct dpif_flow_attrs *attrs,
3670
                                    int *result)
3671
0
{
3672
0
    struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats;
3673
0
    struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs;
3674
3675
0
    atomic_read_relaxed(&netdev_flow->netdev_flow_get_result, result);
3676
0
    if (*result) {
3677
0
        return;
3678
0
    }
3679
3680
0
    atomic_read_relaxed(&last_stats->used,         &stats->used);
3681
0
    atomic_read_relaxed(&last_stats->packet_count, &stats->n_packets);
3682
0
    atomic_read_relaxed(&last_stats->byte_count,   &stats->n_bytes);
3683
0
    atomic_read_relaxed(&last_stats->tcp_flags,    &stats->tcp_flags);
3684
3685
0
    atomic_read_relaxed(&last_attrs->offloaded,    &attrs->offloaded);
3686
0
    atomic_read_relaxed(&last_attrs->dp_layer,     &attrs->dp_layer);
3687
0
}
3688
3689
static bool
3690
dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp,
3691
                                    struct dp_netdev_flow *netdev_flow,
3692
                                    struct dpif_flow_stats *stats,
3693
                                    struct dpif_flow_attrs *attrs)
3694
0
{
3695
0
    uint64_t act_buf[1024 / 8];
3696
0
    struct nlattr *actions;
3697
0
    struct netdev *netdev;
3698
0
    struct match match;
3699
0
    struct ofpbuf buf;
3700
3701
0
    int ret = 0;
3702
3703
0
    if (!netdev_is_flow_api_enabled()) {
3704
0
        return false;
3705
0
    }
3706
3707
0
    netdev = netdev_ports_get(netdev_flow->flow.in_port.odp_port,
3708
0
                              dpif_normalize_type(dp->class->type));
3709
0
    if (!netdev) {
3710
0
        return false;
3711
0
    }
3712
0
    ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf);
3713
    /* Taking a global 'port_rwlock' to fulfill thread safety
3714
     * restrictions regarding netdev port mapping.
3715
     *
3716
     * XXX: Main thread will try to pause/stop all revalidators during datapath
3717
     *      reconfiguration via datapath purge callback (dp_purge_cb) while
3718
     *      rw-holding 'dp->port_rwlock'.  So we're not waiting for the lock here.
3719
     *      Otherwise, deadlock is possible, because revalidators might sleep
3720
     *      waiting for the main thread to release the lock and main thread
3721
     *      will wait for them to stop processing.
3722
     *      This workaround might make statistics less accurate. Especially
3723
     *      for flow deletion case, since there will be no other attempt.  */
3724
0
    if (!ovs_rwlock_tryrdlock(&dp->port_rwlock)) {
3725
0
        ret = netdev_flow_get(netdev, &match, &actions,
3726
0
                              &netdev_flow->mega_ufid, stats, attrs, &buf);
3727
        /* Storing statistics and attributes from the last request for
3728
         * later use when the lock is contended. */
3729
0
        dp_netdev_flow_set_last_stats_attrs(netdev_flow, stats, attrs, ret);
3730
0
        ovs_rwlock_unlock(&dp->port_rwlock);
3731
0
    } else {
3732
0
        dp_netdev_flow_get_last_stats_attrs(netdev_flow, stats, attrs, &ret);
3733
0
        if (!ret && !attrs->dp_layer) {
3734
            /* Flow was never reported as 'offloaded' so it's harmless
3735
             * to continue to think so. */
3736
0
            ret = EAGAIN;
3737
0
        }
3738
0
    }
3739
0
    netdev_close(netdev);
3740
0
    if (ret) {
3741
0
        return false;
3742
0
    }
3743
3744
0
    return true;
3745
0
}
3746
3747
static void
3748
get_dpif_flow_status(const struct dp_netdev *dp,
3749
                     const struct dp_netdev_flow *netdev_flow_,
3750
                     struct dpif_flow_stats *stats,
3751
                     struct dpif_flow_attrs *attrs)
3752
0
{
3753
0
    struct dpif_flow_stats offload_stats;
3754
0
    struct dpif_flow_attrs offload_attrs;
3755
0
    struct dp_netdev_flow *netdev_flow;
3756
0
    unsigned long long n;
3757
0
    long long used;
3758
0
    uint16_t flags;
3759
3760
0
    netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3761
3762
0
    atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3763
0
    stats->n_packets = n;
3764
0
    atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3765
0
    stats->n_bytes = n;
3766
0
    atomic_read_relaxed(&netdev_flow->stats.used, &used);
3767
0
    stats->used = used;
3768
0
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3769
0
    stats->tcp_flags = flags;
3770
3771
0
    if (dpif_netdev_get_flow_offload_status(dp, netdev_flow,
3772
0
                                            &offload_stats, &offload_attrs)) {
3773
0
        stats->n_packets += offload_stats.n_packets;
3774
0
        stats->n_bytes += offload_stats.n_bytes;
3775
0
        stats->used = MAX(stats->used, offload_stats.used);
3776
0
        stats->tcp_flags |= offload_stats.tcp_flags;
3777
0
        if (attrs) {
3778
0
            attrs->offloaded = offload_attrs.offloaded;
3779
0
            attrs->dp_layer = offload_attrs.dp_layer;
3780
0
        }
3781
0
    } else if (attrs) {
3782
0
        attrs->offloaded = false;
3783
0
        attrs->dp_layer = "ovs";
3784
0
    }
3785
0
}
3786
3787
/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3788
 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3789
 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3790
 * protect them. */
3791
static void
3792
dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
3793
                            const struct dp_netdev_flow *netdev_flow,
3794
                            struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
3795
                            struct dpif_flow *flow, bool terse)
3796
0
{
3797
0
    if (terse) {
3798
0
        memset(flow, 0, sizeof *flow);
3799
0
    } else {
3800
0
        struct flow_wildcards wc;
3801
0
        struct dp_netdev_actions *actions;
3802
0
        size_t offset;
3803
0
        struct odp_flow_key_parms odp_parms = {
3804
0
            .flow = &netdev_flow->flow,
3805
0
            .mask = &wc.masks,
3806
0
            .support = dp_netdev_support,
3807
0
        };
3808
3809
0
        miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3810
        /* in_port is exact-matched, but we have left it out of the mask for
3811
         * optimization reasons. Add in_port back to the mask. */
3812
0
        wc.masks.in_port.odp_port = ODPP_NONE;
3813
3814
        /* Key */
3815
0
        offset = key_buf->size;
3816
0
        flow->key = ofpbuf_tail(key_buf);
3817
0
        odp_flow_key_from_flow(&odp_parms, key_buf);
3818
0
        flow->key_len = key_buf->size - offset;
3819
3820
        /* Mask */
3821
0
        offset = mask_buf->size;
3822
0
        flow->mask = ofpbuf_tail(mask_buf);
3823
0
        odp_parms.key_buf = key_buf;
3824
0
        odp_flow_key_from_mask(&odp_parms, mask_buf);
3825
0
        flow->mask_len = mask_buf->size - offset;
3826
3827
        /* Actions */
3828
0
        actions = dp_netdev_flow_get_actions(netdev_flow);
3829
0
        flow->actions = actions->actions;
3830
0
        flow->actions_len = actions->size;
3831
0
    }
3832
3833
0
    flow->ufid = netdev_flow->ufid;
3834
0
    flow->ufid_present = true;
3835
0
    flow->pmd_id = netdev_flow->pmd_id;
3836
3837
0
    get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
3838
0
    flow->attrs.dp_extra_info = netdev_flow->dp_extra_info;
3839
0
}
3840
3841
static int
3842
dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3843
                              const struct nlattr *mask_key,
3844
                              uint32_t mask_key_len, const struct flow *flow,
3845
                              struct flow_wildcards *wc, bool probe)
3846
0
{
3847
0
    enum odp_key_fitness fitness;
3848
3849
0
    fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3850
0
    if (fitness) {
3851
0
        if (!probe) {
3852
            /* This should not happen: it indicates that
3853
             * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3854
             * disagree on the acceptable form of a mask.  Log the problem
3855
             * as an error, with enough details to enable debugging. */
3856
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3857
3858
0
            if (!VLOG_DROP_ERR(&rl)) {
3859
0
                struct ds s;
3860
3861
0
                ds_init(&s);
3862
0
                odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3863
0
                                true, true);
3864
0
                VLOG_ERR("internal error parsing flow mask %s (%s)",
3865
0
                ds_cstr(&s), odp_key_fitness_to_string(fitness));
3866
0
                ds_destroy(&s);
3867
0
            }
3868
0
        }
3869
3870
0
        return EINVAL;
3871
0
    }
3872
3873
0
    return 0;
3874
0
}
3875
3876
static int
3877
dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3878
                              struct flow *flow, bool probe)
3879
0
{
3880
0
    if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3881
0
        if (!probe) {
3882
            /* This should not happen: it indicates that
3883
             * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3884
             * the acceptable form of a flow.  Log the problem as an error,
3885
             * with enough details to enable debugging. */
3886
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3887
3888
0
            if (!VLOG_DROP_ERR(&rl)) {
3889
0
                struct ds s;
3890
3891
0
                ds_init(&s);
3892
0
                odp_flow_format(key, key_len, NULL, 0, NULL, &s, true, false);
3893
0
                VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3894
0
                ds_destroy(&s);
3895
0
            }
3896
0
        }
3897
3898
0
        return EINVAL;
3899
0
    }
3900
3901
0
    if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3902
0
        return EINVAL;
3903
0
    }
3904
3905
0
    return 0;
3906
0
}
3907
3908
static int
3909
dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3910
0
{
3911
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3912
0
    struct dp_netdev_flow *netdev_flow;
3913
0
    struct dp_netdev_pmd_thread *pmd;
3914
0
    struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3915
0
    struct hmapx_node *node;
3916
0
    int error = EINVAL;
3917
3918
0
    if (get->pmd_id == PMD_ID_NULL) {
3919
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3920
0
            if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3921
0
                dp_netdev_pmd_unref(pmd);
3922
0
            }
3923
0
        }
3924
0
    } else {
3925
0
        pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3926
0
        if (!pmd) {
3927
0
            goto out;
3928
0
        }
3929
0
        hmapx_add(&to_find, pmd);
3930
0
    }
3931
3932
0
    if (!hmapx_count(&to_find)) {
3933
0
        goto out;
3934
0
    }
3935
3936
0
    HMAPX_FOR_EACH (node, &to_find) {
3937
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
3938
0
        netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3939
0
                                              get->key_len);
3940
0
        if (netdev_flow) {
3941
0
            dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
3942
0
                                        get->buffer, get->flow, false);
3943
0
            error = 0;
3944
0
            break;
3945
0
        } else {
3946
0
            error = ENOENT;
3947
0
        }
3948
0
    }
3949
3950
0
    HMAPX_FOR_EACH (node, &to_find) {
3951
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
3952
0
        dp_netdev_pmd_unref(pmd);
3953
0
    }
3954
0
out:
3955
0
    hmapx_destroy(&to_find);
3956
0
    return error;
3957
0
}
3958
3959
static void
3960
dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3961
0
{
3962
0
    struct flow masked_flow;
3963
0
    size_t i;
3964
3965
0
    for (i = 0; i < sizeof(struct flow); i++) {
3966
0
        ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3967
0
                                       ((uint8_t *)&match->wc)[i];
3968
0
    }
3969
0
    odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
3970
0
}
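
/* Editorial sketch, not part of the original source: because the flow is
 * ANDed with its mask before hashing, two matches that differ only in
 * wildcarded bits yield the same mega UFID, i.e. the mega UFID identifies
 * the megaflow rather than an individual exact-match flow. */
static bool
mega_ufid_equal_sketch(const struct match *a, const struct match *b)
{
    ovs_u128 ufid_a, ufid_b;

    dp_netdev_get_mega_ufid(a, &ufid_a);
    dp_netdev_get_mega_ufid(b, &ufid_b);
    return ovs_u128_equals(ufid_a, ufid_b);
}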
3971
3972
uint64_t
3973
dp_netdev_simple_match_mark(odp_port_t in_port, ovs_be16 dl_type,
3974
                            uint8_t nw_frag, ovs_be16 vlan_tci)
3975
0
{
3976
    /* Simple Match Mark:
3977
     *
3978
     * BE:
3979
     * +-----------------+-------------++---------+---+-----------+
3980
     * |     in_port     |   dl_type   || nw_frag |CFI|  VID(12)  |
3981
     * +-----------------+-------------++---------+---+-----------+
3982
     * 0                 32          47 49         51  52     63
3983
     *
3984
     * LE:
3985
     * +-----------------+-------------+------++-------+---+------+
3986
     * |     in_port     |   dl_type   |VID(8)||nw_frag|CFI|VID(4)|
3987
     * +-----------------+-------------+------++-------+---+------+
3988
     * 0                 32          47 48  55  57   59 60  61   63
3989
     *
3990
     *         Big Endian              Little Endian
3991
     * in_port : 32 bits [ 0..31]  in_port : 32 bits [ 0..31]
3992
     * dl_type : 16 bits [32..47]  dl_type : 16 bits [32..47]
3993
     * <empty> :  1 bit  [48..48]  vlan VID:  8 bits [48..55]
3994
     * nw_frag :  2 bits [49..50]  <empty> :  1 bit  [56..56]
3995
     * vlan CFI:  1 bit  [51..51]  nw_frag :  2 bits [57..59]
3996
     * vlan VID: 12 bits [52..63]  vlan CFI:  1 bit  [60..60]
3997
     *                             vlan VID:  4 bits [61..63]
3998
     *
3999
     * Layout is different for LE and BE in order to save a couple of
4000
     * network to host translations.
4001
     * */
4002
0
    return ((uint64_t) odp_to_u32(in_port) << 32)
4003
0
           | ((OVS_FORCE uint32_t) dl_type << 16)
4004
#if WORDS_BIGENDIAN
4005
           | (((uint16_t) nw_frag & FLOW_NW_FRAG_MASK) << VLAN_PCP_SHIFT)
4006
#else
4007
0
           | ((nw_frag & FLOW_NW_FRAG_MASK) << (VLAN_PCP_SHIFT - 8))
4008
0
#endif
4009
0
           | (OVS_FORCE uint16_t) (vlan_tci & htons(VLAN_VID_MASK | VLAN_CFI));
4010
0
}
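
/* Editorial sketch, not part of the original source: building a mark from
 * concrete, made-up values.  Packets received on the same port with the same
 * dl_type, fragmentation bits and VLAN CFI/VID map to the same 64-bit mark,
 * and therefore to the same simple-match table entry. */
static uint64_t
simple_match_mark_example(void)
{
    odp_port_t in_port = u32_to_odp(3);         /* Made-up port number. */
    ovs_be16 dl_type = htons(ETH_TYPE_IP);      /* IPv4. */
    ovs_be16 vlan_tci = htons(VLAN_CFI | 100);  /* CFI set, VID 100. */

    return dp_netdev_simple_match_mark(in_port, dl_type, 0 /* nw_frag */,
                                       vlan_tci);
}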
4011
4012
struct dp_netdev_flow *
4013
dp_netdev_simple_match_lookup(const struct dp_netdev_pmd_thread *pmd,
4014
                              odp_port_t in_port, ovs_be16 dl_type,
4015
                              uint8_t nw_frag, ovs_be16 vlan_tci)
4016
0
{
4017
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
4018
0
                                                nw_frag, vlan_tci);
4019
0
    uint32_t hash = hash_uint64(mark);
4020
0
    struct dp_netdev_flow *flow;
4021
0
    bool found = false;
4022
4023
0
    CMAP_FOR_EACH_WITH_HASH (flow, simple_match_node,
4024
0
                             hash, &pmd->simple_match_table) {
4025
0
        if (flow->simple_match_mark == mark) {
4026
0
            found = true;
4027
0
            break;
4028
0
        }
4029
0
    }
4030
0
    return found ? flow : NULL;
4031
0
}
4032
4033
bool
4034
dp_netdev_simple_match_enabled(const struct dp_netdev_pmd_thread *pmd,
4035
                               odp_port_t in_port)
4036
0
{
4037
0
    return ccmap_find(&pmd->n_flows, odp_to_u32(in_port))
4038
0
           == ccmap_find(&pmd->n_simple_flows, odp_to_u32(in_port));
4039
0
}
4040
4041
static void
4042
dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
4043
                              struct dp_netdev_flow *dp_flow)
4044
    OVS_REQUIRES(pmd->flow_mutex)
4045
0
{
4046
0
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
4047
0
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
4048
0
    ovs_be16 dl_type = dp_flow->flow.dl_type;
4049
0
    uint8_t nw_frag = dp_flow->flow.nw_frag;
4050
4051
0
    if (!dp_netdev_flow_ref(dp_flow)) {
4052
0
        return;
4053
0
    }
4054
4055
    /* Avoid double insertion.  Should not happen in practice. */
4056
0
    dp_netdev_simple_match_remove(pmd, dp_flow);
4057
4058
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
4059
0
                                                nw_frag, vlan_tci);
4060
0
    uint32_t hash = hash_uint64(mark);
4061
4062
0
    dp_flow->simple_match_mark = mark;
4063
0
    cmap_insert(&pmd->simple_match_table,
4064
0
                CONST_CAST(struct cmap_node *, &dp_flow->simple_match_node),
4065
0
                hash);
4066
0
    ccmap_inc(&pmd->n_simple_flows, odp_to_u32(in_port));
4067
4068
0
    VLOG_DBG("Simple match insert: "
4069
0
             "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
4070
0
             pmd->core_id, in_port, mark);
4071
0
}
4072
4073
static void
4074
dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
4075
                               struct dp_netdev_flow *dp_flow)
4076
    OVS_REQUIRES(pmd->flow_mutex)
4077
0
{
4078
0
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
4079
0
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
4080
0
    ovs_be16 dl_type = dp_flow->flow.dl_type;
4081
0
    uint8_t nw_frag = dp_flow->flow.nw_frag;
4082
0
    struct dp_netdev_flow *flow;
4083
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
4084
0
                                                nw_frag, vlan_tci);
4085
0
    uint32_t hash = hash_uint64(mark);
4086
4087
0
    flow = dp_netdev_simple_match_lookup(pmd, in_port, dl_type,
4088
0
                                         nw_frag, vlan_tci);
4089
0
    if (flow == dp_flow) {
4090
0
        VLOG_DBG("Simple match remove: "
4091
0
                 "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
4092
0
                 pmd->core_id, in_port, mark);
4093
0
        cmap_remove(&pmd->simple_match_table,
4094
0
                    CONST_CAST(struct cmap_node *, &flow->simple_match_node),
4095
0
                    hash);
4096
0
        ccmap_dec(&pmd->n_simple_flows, odp_to_u32(in_port));
4097
0
        dp_netdev_flow_unref(flow);
4098
0
    }
4099
0
}
4100
4101
static bool
4102
dp_netdev_flow_is_simple_match(const struct match *match)
4103
0
{
4104
0
    const struct flow *flow = &match->flow;
4105
0
    const struct flow_wildcards *wc = &match->wc;
4106
4107
0
    if (flow->recirc_id || flow->packet_type != htonl(PT_ETH)) {
4108
0
        return false;
4109
0
    }
4110
4111
    /* Check that the flow matches only the minimal set of fields that are
4112
     * always set.  Also check that VLAN VID+CFI is an exact match, because
4113
     * these fields are not mandatory and could be masked. */
4114
0
    struct flow_wildcards *minimal = xmalloc(sizeof *minimal);
4115
0
    ovs_be16 vlan_tci_mask = htons(VLAN_VID_MASK | VLAN_CFI);
4116
4117
0
    flow_wildcards_init_catchall(minimal);
4118
    /* 'dpif-netdev' always has the following in exact match:
4119
     *   - recirc_id                   <-- recirc_id == 0 checked on input.
4120
     *   - in_port                     <-- Will be checked on input.
4121
     *   - packet_type                 <-- Assuming all packets are PT_ETH.
4122
     *   - dl_type                     <-- Need to match with.
4123
     *   - vlan_tci                    <-- Need to match with.
4124
     *   - and nw_frag for ip packets. <-- Need to match with.
4125
     */
4126
0
    WC_MASK_FIELD(minimal, recirc_id);
4127
0
    WC_MASK_FIELD(minimal, in_port);
4128
0
    WC_MASK_FIELD(minimal, packet_type);
4129
0
    WC_MASK_FIELD(minimal, dl_type);
4130
0
    WC_MASK_FIELD_MASK(minimal, vlans[0].tci, vlan_tci_mask);
4131
0
    WC_MASK_FIELD_MASK(minimal, nw_frag, FLOW_NW_FRAG_MASK);
4132
4133
0
    if (flow_wildcards_has_extra(minimal, wc)
4134
0
        || wc->masks.vlans[0].tci != vlan_tci_mask) {
4135
0
        free(minimal);
4136
0
        return false;
4137
0
    }
4138
0
    free(minimal);
4139
4140
0
    return true;
4141
0
}
4142
4143
static struct dp_netdev_flow *
4144
dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
4145
                   struct match *match, const ovs_u128 *ufid,
4146
                   const struct nlattr *actions, size_t actions_len,
4147
                   odp_port_t orig_in_port)
4148
    OVS_REQUIRES(pmd->flow_mutex)
4149
0
{
4150
0
    struct ds extra_info = DS_EMPTY_INITIALIZER;
4151
0
    struct dp_netdev_flow *flow;
4152
0
    struct netdev_flow_key mask;
4153
0
    struct dpcls *cls;
4154
0
    size_t unit;
4155
4156
    /* Make sure in_port is exact matched before we read it. */
4157
0
    ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
4158
0
    odp_port_t in_port = match->flow.in_port.odp_port;
4159
4160
    /* As we select the dpcls based on the port number, each netdev flow
4161
     * belonging to the same dpcls will have the same odp_port value.
4162
     * For performance reasons we wildcard odp_port here in the mask.  In the
4163
     * typical case dp_hash is also wildcarded, and the resulting 8-byte
4164
     * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
4165
     * will not be part of the subtable mask.
4166
     * This will speed up the hash computation during dpcls_lookup() because
4167
     * there is one less call to hash_add64() in this case. */
4168
0
    match->wc.masks.in_port.odp_port = 0;
4169
0
    netdev_flow_mask_init(&mask, match);
4170
0
    match->wc.masks.in_port.odp_port = ODPP_NONE;
4171
4172
    /* Make sure wc does not have metadata. */
4173
0
    ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
4174
0
               && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
4175
4176
    /* Do not allocate extra space. */
4177
0
    flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
4178
0
    memset(&flow->stats, 0, sizeof flow->stats);
4179
0
    atomic_init(&flow->netdev_flow_get_result, 0);
4180
0
    memset(&flow->last_stats, 0, sizeof flow->last_stats);
4181
0
    memset(&flow->last_attrs, 0, sizeof flow->last_attrs);
4182
0
    flow->dead = false;
4183
0
    flow->batch = NULL;
4184
0
    flow->mark = INVALID_FLOW_MARK;
4185
0
    flow->orig_in_port = orig_in_port;
4186
0
    *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
4187
0
    *CONST_CAST(struct flow *, &flow->flow) = match->flow;
4188
0
    *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
4189
0
    ovs_refcount_init(&flow->ref_cnt);
4190
0
    ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
4191
4192
0
    dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
4193
0
    netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
4194
4195
    /* Select dpcls for in_port. Relies on in_port being an exact match. */
4196
0
    cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
4197
0
    dpcls_insert(cls, &flow->cr, &mask);
4198
4199
0
    ds_put_cstr(&extra_info, "miniflow_bits(");
4200
0
    FLOWMAP_FOR_EACH_UNIT (unit) {
4201
0
        if (unit) {
4202
0
            ds_put_char(&extra_info, ',');
4203
0
        }
4204
0
        ds_put_format(&extra_info, "%d",
4205
0
                      count_1bits(flow->cr.mask->mf.map.bits[unit]));
4206
0
    }
4207
0
    ds_put_char(&extra_info, ')');
4208
0
    flow->dp_extra_info = ds_steal_cstr(&extra_info);
4209
0
    ds_destroy(&extra_info);
4210
4211
0
    cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
4212
0
                dp_netdev_flow_hash(&flow->ufid));
4213
0
    ccmap_inc(&pmd->n_flows, odp_to_u32(in_port));
4214
4215
0
    if (dp_netdev_flow_is_simple_match(match)) {
4216
0
        dp_netdev_simple_match_insert(pmd, flow);
4217
0
    }
4218
4219
0
    queue_netdev_flow_put(pmd, flow, match, actions, actions_len,
4220
0
                          DP_NETDEV_FLOW_OFFLOAD_OP_ADD);
4221
0
    log_netdev_flow_change(flow, match, NULL, actions, actions_len);
4222
4223
0
    return flow;
4224
0
}
4225
4226
static int
4227
flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
4228
                struct netdev_flow_key *key,
4229
                struct match *match,
4230
                ovs_u128 *ufid,
4231
                const struct dpif_flow_put *put,
4232
                struct dpif_flow_stats *stats)
4233
0
{
4234
0
    struct dp_netdev_flow *netdev_flow = NULL;
4235
0
    int error = 0;
4236
4237
0
    if (stats) {
4238
0
        memset(stats, 0, sizeof *stats);
4239
0
    }
4240
4241
0
    ovs_mutex_lock(&pmd->flow_mutex);
4242
0
    if (put->ufid) {
4243
0
        netdev_flow = dp_netdev_pmd_find_flow(pmd, put->ufid,
4244
0
                                              put->key, put->key_len);
4245
0
    } else {
4246
        /* Use key instead of the locally generated ufid
4247
         * to search netdev_flow. */
4248
0
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
4249
0
    }
4250
4251
0
    if (put->flags & DPIF_FP_CREATE) {
4252
0
        if (!netdev_flow) {
4253
0
            dp_netdev_flow_add(pmd, match, ufid,
4254
0
                               put->actions, put->actions_len, ODPP_NONE);
4255
0
        } else {
4256
0
            error = EEXIST;
4257
0
        }
4258
0
        goto exit;
4259
0
    }
4260
4261
0
    if (put->flags & DPIF_FP_MODIFY) {
4262
0
        if (!netdev_flow) {
4263
0
            error = ENOENT;
4264
0
        } else {
4265
0
            if (!put->ufid && !flow_equal(&match->flow, &netdev_flow->flow)) {
4266
                /* Overlapping flow. */
4267
0
                error = EINVAL;
4268
0
                goto exit;
4269
0
            }
4270
4271
0
            struct dp_netdev_actions *new_actions;
4272
0
            struct dp_netdev_actions *old_actions;
4273
4274
0
            new_actions = dp_netdev_actions_create(put->actions,
4275
0
                                                   put->actions_len);
4276
4277
0
            old_actions = dp_netdev_flow_get_actions(netdev_flow);
4278
0
            ovsrcu_set(&netdev_flow->actions, new_actions);
4279
4280
0
            queue_netdev_flow_put(pmd, netdev_flow, match,
4281
0
                                  put->actions, put->actions_len,
4282
0
                                  DP_NETDEV_FLOW_OFFLOAD_OP_MOD);
4283
0
            log_netdev_flow_change(netdev_flow, match, old_actions,
4284
0
                                   put->actions, put->actions_len);
4285
4286
0
            if (stats) {
4287
0
                get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
4288
0
            }
4289
0
            if (put->flags & DPIF_FP_ZERO_STATS) {
4290
                /* XXX: The userspace datapath uses thread local statistics
4291
                 * (for flows), which should be updated only by the owning
4292
                 * thread.  Since we cannot write on stats memory here,
4293
                 * we choose not to support this flag.  Please note:
4294
                 * - This feature is currently used only by dpctl commands with
4295
                 *   option --clear.
4296
                 * - Should the need arise, this operation can be implemented
4297
                 *   by keeping a base value (to be updated here) for each
4298
                 *   counter, and subtracting it before outputting the stats. */
4299
0
                error = EOPNOTSUPP;
4300
0
            }
4301
0
            ovsrcu_postpone(dp_netdev_actions_free, old_actions);
4302
0
        }
4303
0
    }
4304
4305
0
exit:
4306
0
    ovs_mutex_unlock(&pmd->flow_mutex);
4307
0
    return error;
4308
0
}
4309
4310
static int
4311
dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
4312
0
{
4313
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4314
0
    struct netdev_flow_key key;
4315
0
    struct dp_netdev_pmd_thread *pmd;
4316
0
    struct match match;
4317
0
    ovs_u128 ufid;
4318
0
    int error;
4319
0
    bool probe = put->flags & DPIF_FP_PROBE;
4320
4321
0
    if (put->stats) {
4322
0
        memset(put->stats, 0, sizeof *put->stats);
4323
0
    }
4324
0
    error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
4325
0
                                          probe);
4326
0
    if (error) {
4327
0
        return error;
4328
0
    }
4329
0
    error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
4330
0
                                          put->mask, put->mask_len,
4331
0
                                          &match.flow, &match.wc, probe);
4332
0
    if (error) {
4333
0
        return error;
4334
0
    }
4335
4336
0
    if (match.wc.masks.in_port.odp_port != ODPP_NONE) {
4337
0
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4338
4339
0
        VLOG_ERR_RL(&rl, "failed to put%s flow: in_port is not an exact match",
4340
0
                    (put->flags & DPIF_FP_CREATE) ? "[create]"
4341
0
                    : (put->flags & DPIF_FP_MODIFY) ? "[modify]" : "[zero]");
4342
0
        return EINVAL;
4343
0
    }
4344
4345
0
    if (put->ufid) {
4346
0
        ufid = *put->ufid;
4347
0
    } else {
4348
0
        odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
4349
0
    }
4350
4351
    /* The Netlink encoding of datapath flow keys cannot express
4352
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
4353
     * tag is interpreted as exact match on the fact that there is no
4354
     * VLAN.  Unless we refactor a lot of code that translates between
4355
     * Netlink and struct flow representations, we have to do the same
4356
     * here.  This must be in sync with 'match' in handle_packet_upcall(). */
4357
0
    if (!match.wc.masks.vlans[0].tci) {
4358
0
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
4359
0
    }
4360
4361
    /* Must produce a netdev_flow_key for lookup.
4362
     * Use the same method as employed to create the key when adding
4363
     * the flow to the dpcls to make sure they match.
4364
     * We need to put in the unmasked key as flow_put_on_pmd() will first try
4365
     * to see if an entry exists doing a packet type lookup. As masked-out
4366
     * fields are interpreted as zeros, they could falsely match a wider IP
4367
     * address mask. Installation of the flow will use the match variable. */
4368
0
    netdev_flow_key_init(&key, &match.flow);
4369
4370
0
    if (put->pmd_id == PMD_ID_NULL) {
4371
0
        if (cmap_count(&dp->poll_threads) == 0) {
4372
0
            return EINVAL;
4373
0
        }
4374
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4375
0
            struct dpif_flow_stats pmd_stats;
4376
0
            int pmd_error;
4377
4378
0
            pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
4379
0
                                        &pmd_stats);
4380
0
            if (pmd_error) {
4381
0
                error = pmd_error;
4382
0
            } else if (put->stats) {
4383
0
                put->stats->n_packets += pmd_stats.n_packets;
4384
0
                put->stats->n_bytes += pmd_stats.n_bytes;
4385
0
                put->stats->used = MAX(put->stats->used, pmd_stats.used);
4386
0
                put->stats->tcp_flags |= pmd_stats.tcp_flags;
4387
0
            }
4388
0
        }
4389
0
    } else {
4390
0
        pmd = dp_netdev_get_pmd(dp, put->pmd_id);
4391
0
        if (!pmd) {
4392
0
            return EINVAL;
4393
0
        }
4394
0
        error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
4395
0
        dp_netdev_pmd_unref(pmd);
4396
0
    }
4397
4398
0
    return error;
4399
0
}
4400
4401
static int
4402
flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
4403
                struct dpif_flow_stats *stats,
4404
                const struct dpif_flow_del *del)
4405
0
{
4406
0
    struct dp_netdev_flow *netdev_flow;
4407
0
    int error = 0;
4408
4409
0
    ovs_mutex_lock(&pmd->flow_mutex);
4410
0
    netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
4411
0
                                          del->key_len);
4412
0
    if (netdev_flow) {
4413
0
        if (stats) {
4414
0
            get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
4415
0
        }
4416
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
4417
0
    } else {
4418
0
        error = ENOENT;
4419
0
    }
4420
0
    ovs_mutex_unlock(&pmd->flow_mutex);
4421
4422
0
    return error;
4423
0
}
4424
4425
static int
4426
dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
4427
0
{
4428
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4429
0
    struct dp_netdev_pmd_thread *pmd;
4430
0
    int error = 0;
4431
4432
0
    if (del->stats) {
4433
0
        memset(del->stats, 0, sizeof *del->stats);
4434
0
    }
4435
4436
0
    if (del->pmd_id == PMD_ID_NULL) {
4437
0
        if (cmap_count(&dp->poll_threads) == 0) {
4438
0
            return EINVAL;
4439
0
        }
4440
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4441
0
            struct dpif_flow_stats pmd_stats;
4442
0
            int pmd_error;
4443
4444
0
            pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
4445
0
            if (pmd_error) {
4446
0
                error = pmd_error;
4447
0
            } else if (del->stats) {
4448
0
                del->stats->n_packets += pmd_stats.n_packets;
4449
0
                del->stats->n_bytes += pmd_stats.n_bytes;
4450
0
                del->stats->used = MAX(del->stats->used, pmd_stats.used);
4451
0
                del->stats->tcp_flags |= pmd_stats.tcp_flags;
4452
0
            }
4453
0
        }
4454
0
    } else {
4455
0
        pmd = dp_netdev_get_pmd(dp, del->pmd_id);
4456
0
        if (!pmd) {
4457
0
            return EINVAL;
4458
0
        }
4459
0
        error = flow_del_on_pmd(pmd, del->stats, del);
4460
0
        dp_netdev_pmd_unref(pmd);
4461
0
    }
4462
4463
4464
0
    return error;
4465
0
}
4466
4467
struct dpif_netdev_flow_dump {
4468
    struct dpif_flow_dump up;
4469
    struct cmap_position poll_thread_pos;
4470
    struct cmap_position flow_pos;
4471
    struct dp_netdev_pmd_thread *cur_pmd;
4472
    int status;
4473
    struct ovs_mutex mutex;
4474
};
4475
4476
static struct dpif_netdev_flow_dump *
4477
dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
4478
0
{
4479
0
    return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
4480
0
}
4481
4482
static struct dpif_flow_dump *
4483
dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
4484
                             struct dpif_flow_dump_types *types OVS_UNUSED)
4485
0
{
4486
0
    struct dpif_netdev_flow_dump *dump;
4487
4488
0
    dump = xzalloc(sizeof *dump);
4489
0
    dpif_flow_dump_init(&dump->up, dpif_);
4490
0
    dump->up.terse = terse;
4491
0
    ovs_mutex_init(&dump->mutex);
4492
4493
0
    return &dump->up;
4494
0
}
4495
4496
static int
4497
dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
4498
0
{
4499
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
4500
4501
0
    ovs_mutex_destroy(&dump->mutex);
4502
0
    free(dump);
4503
0
    return 0;
4504
0
}
4505
4506
struct dpif_netdev_flow_dump_thread {
4507
    struct dpif_flow_dump_thread up;
4508
    struct dpif_netdev_flow_dump *dump;
4509
    struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
4510
    struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
4511
};
4512
4513
static struct dpif_netdev_flow_dump_thread *
4514
dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
4515
0
{
4516
0
    return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
4517
0
}
4518
4519
static struct dpif_flow_dump_thread *
4520
dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
4521
0
{
4522
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
4523
0
    struct dpif_netdev_flow_dump_thread *thread;
4524
4525
0
    thread = xmalloc(sizeof *thread);
4526
0
    dpif_flow_dump_thread_init(&thread->up, &dump->up);
4527
0
    thread->dump = dump;
4528
0
    return &thread->up;
4529
0
}
4530
4531
static void
4532
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
4533
0
{
4534
0
    struct dpif_netdev_flow_dump_thread *thread
4535
0
        = dpif_netdev_flow_dump_thread_cast(thread_);
4536
4537
0
    free(thread);
4538
0
}
4539
4540
static int
4541
dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
4542
                           struct dpif_flow *flows, int max_flows)
4543
0
{
4544
0
    struct dpif_netdev_flow_dump_thread *thread
4545
0
        = dpif_netdev_flow_dump_thread_cast(thread_);
4546
0
    struct dpif_netdev_flow_dump *dump = thread->dump;
4547
0
    struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
4548
0
    struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
4549
0
    struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
4550
0
    int n_flows = 0;
4551
0
    int i;
4552
4553
0
    ovs_mutex_lock(&dump->mutex);
4554
0
    if (!dump->status) {
4555
0
        struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
4556
0
        int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
4557
4558
        /* The first call to dump_next() extracts the first pmd thread.
4559
         * If there is no pmd thread, returns immediately. */
4560
0
        if (!pmd) {
4561
0
            pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
4562
0
            if (!pmd) {
4563
0
                ovs_mutex_unlock(&dump->mutex);
4564
0
                return n_flows;
4565
4566
0
            }
4567
0
        }
4568
4569
0
        do {
4570
0
            for (n_flows = 0; n_flows < flow_limit; n_flows++) {
4571
0
                struct cmap_node *node;
4572
4573
0
                node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
4574
0
                if (!node) {
4575
0
                    break;
4576
0
                }
4577
0
                netdev_flows[n_flows] = CONTAINER_OF(node,
4578
0
                                                     struct dp_netdev_flow,
4579
0
                                                     node);
4580
0
            }
4581
            /* When dumping of the current pmd thread is finished, moves to
4582
             * the next. */
4583
0
            if (n_flows < flow_limit) {
4584
0
                memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
4585
0
                dp_netdev_pmd_unref(pmd);
4586
0
                pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
4587
0
                if (!pmd) {
4588
0
                    dump->status = EOF;
4589
0
                    break;
4590
0
                }
4591
0
            }
4592
            /* Keeps the reference for the next caller. */
4593
0
            dump->cur_pmd = pmd;
4594
4595
            /* If the current dump is empty, do not exit the loop, since the
4596
             * remaining pmds could have flows to be dumped.  Just dumps again
4597
             * on the new 'pmd'. */
4598
0
        } while (!n_flows);
4599
0
    }
4600
0
    ovs_mutex_unlock(&dump->mutex);
4601
4602
0
    for (i = 0; i < n_flows; i++) {
4603
0
        struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
4604
0
        struct odputil_keybuf *keybuf = &thread->keybuf[i];
4605
0
        struct dp_netdev_flow *netdev_flow = netdev_flows[i];
4606
0
        struct dpif_flow *f = &flows[i];
4607
0
        struct ofpbuf key, mask;
4608
4609
0
        ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
4610
0
        ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
4611
0
        dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
4612
0
                                    dump->up.terse);
4613
0
    }
4614
4615
0
    return n_flows;
4616
0
}
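
The dump keeps two cursors, one over pmd threads ('poll_thread_pos') and one over the current thread's flow table ('flow_pos'), so that each call resumes where the previous batch stopped.  A simplified, self-contained sketch of that resumable two-level iteration, with plain arrays standing in for the cmaps:

#include <stddef.h>

struct example_dump_pos {
    size_t thread_idx;  /* Which pmd thread is currently being dumped. */
    size_t flow_idx;    /* Next flow within that thread's table. */
};

/* Fills 'out' with up to 'max' flow indices; returns how many were produced. */
static size_t
example_dump_next(struct example_dump_pos *pos,
                  const size_t *flows_per_thread, size_t n_threads,
                  size_t *out, size_t max)
{
    size_t n = 0;

    while (n < max && pos->thread_idx < n_threads) {
        if (pos->flow_idx < flows_per_thread[pos->thread_idx]) {
            out[n++] = pos->flow_idx++;
        } else {
            /* Finished this thread: reset the inner cursor, move to the next. */
            pos->flow_idx = 0;
            pos->thread_idx++;
        }
    }
    return n;
}
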
4617
4618
static int
4619
dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
4620
    OVS_NO_THREAD_SAFETY_ANALYSIS
4621
0
{
4622
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4623
0
    struct dp_netdev_pmd_thread *pmd;
4624
0
    struct dp_packet_batch pp;
4625
4626
0
    if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
4627
0
        dp_packet_size(execute->packet) > UINT16_MAX) {
4628
0
        return EINVAL;
4629
0
    }
4630
4631
    /* Tries finding the 'pmd'.  If NULL is returned, that means
4632
     * the current thread is a non-pmd thread and should use
4633
     * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
4634
0
    pmd = ovsthread_getspecific(dp->per_pmd_key);
4635
0
    if (!pmd) {
4636
0
        pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
4637
0
        if (!pmd) {
4638
0
            return EBUSY;
4639
0
        }
4640
0
    }
4641
4642
0
    if (execute->probe) {
4643
        /* If this is part of a probe, drop the packet, since executing
4644
         * the action may actually cause spurious packets to be sent into
4645
         * the network. */
4646
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
4647
0
            dp_netdev_pmd_unref(pmd);
4648
0
        }
4649
0
        return 0;
4650
0
    }
4651
4652
    /* If the current thread is a non-pmd thread, acquires
4653
     * the 'non_pmd_mutex'. */
4654
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
4655
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
4656
0
    }
4657
4658
    /* Update current time in PMD context. We don't care about EMC insertion
4659
     * probability, because we are on a slow path. */
4660
0
    pmd_thread_ctx_time_update(pmd);
4661
4662
    /* The action processing expects the RSS hash to be valid, because
4663
     * it's always initialized at the beginning of datapath processing.
4664
     * In this case, though, 'execute->packet' may not have gone through
4665
     * the datapath at all, it may have been generated by the upper layer
4666
     * (OpenFlow packet-out, BFD frame, ...). */
4667
0
    if (!dp_packet_rss_valid(execute->packet)) {
4668
0
        dp_packet_set_rss_hash(execute->packet,
4669
0
                               flow_hash_5tuple(execute->flow, 0));
4670
0
    }
4671
4672
    /* Making a copy because the packet might be stolen during the execution
4673
     * and the caller might still need it. */
4674
0
    struct dp_packet *packet_clone = dp_packet_clone(execute->packet);
4675
0
    dp_packet_batch_init_packet(&pp, packet_clone);
4676
0
    dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
4677
0
                              execute->actions, execute->actions_len);
4678
0
    dp_netdev_pmd_flush_output_packets(pmd, true);
4679
4680
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
4681
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
4682
0
        dp_netdev_pmd_unref(pmd);
4683
0
    }
4684
4685
0
    if (dp_packet_batch_size(&pp) == 1) {
4686
        /* Packet wasn't dropped during the execution.  Swapping content with
4687
         * the original packet, because the caller might expect actions to
4688
         * modify it.  Using the packet from a batch instead of 'packet_clone'
4689
         * because it may be stolen and replaced by another packet, e.g. by
4690
         * the fragmentation engine. */
4691
0
        dp_packet_swap(execute->packet, pp.packets[0]);
4692
0
        dp_packet_delete_batch(&pp, true);
4693
0
    } else if (dp_packet_batch_size(&pp)) {
4694
        /* FIXME: We have more packets than expected.  Likely, we got IP
4695
         * fragments of the reassembled packet.  Dropping them here as we have
4696
         * no way to get them to the caller.  It might be that all the required
4697
         * actions with them are already executed, but it also might not be the
4698
         * case, e.g. if dpif_netdev_execute() is called to execute a single
4699
         * tunnel push. */
4700
0
        dp_packet_delete_batch(&pp, true);
4701
0
    }
4702
4703
0
    return 0;
4704
0
}
4705
4706
static void
4707
dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
4708
                    enum dpif_offload_type offload_type OVS_UNUSED)
4709
0
{
4710
0
    size_t i;
4711
4712
0
    for (i = 0; i < n_ops; i++) {
4713
0
        struct dpif_op *op = ops[i];
4714
4715
0
        switch (op->type) {
4716
0
        case DPIF_OP_FLOW_PUT:
4717
0
            op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
4718
0
            break;
4719
4720
0
        case DPIF_OP_FLOW_DEL:
4721
0
            op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
4722
0
            break;
4723
4724
0
        case DPIF_OP_EXECUTE:
4725
0
            op->error = dpif_netdev_execute(dpif, &op->execute);
4726
0
            break;
4727
4728
0
        case DPIF_OP_FLOW_GET:
4729
0
            op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
4730
0
            break;
4731
0
        }
4732
0
    }
4733
0
}
4734
4735
static int
4736
dpif_netdev_offload_stats_get(struct dpif *dpif,
4737
                              struct netdev_custom_stats *stats)
4738
0
{
4739
0
    enum {
4740
0
        DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED,
4741
0
        DP_NETDEV_HW_OFFLOADS_STATS_INSERTED,
4742
0
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN,
4743
0
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV,
4744
0
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN,
4745
0
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV,
4746
0
    };
4747
0
    struct {
4748
0
        const char *name;
4749
0
        uint64_t total;
4750
0
    } hwol_stats[] = {
4751
0
        [DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED] =
4752
0
            { "                Enqueued offloads", 0 },
4753
0
        [DP_NETDEV_HW_OFFLOADS_STATS_INSERTED] =
4754
0
            { "                Inserted offloads", 0 },
4755
0
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN] =
4756
0
            { "  Cumulative Average latency (us)", 0 },
4757
0
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV] =
4758
0
            { "   Cumulative Latency stddev (us)", 0 },
4759
0
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN] =
4760
0
            { " Exponential Average latency (us)", 0 },
4761
0
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV] =
4762
0
            { "  Exponential Latency stddev (us)", 0 },
4763
0
    };
4764
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4765
0
    struct dp_netdev_port *port;
4766
0
    unsigned int nb_thread;
4767
0
    uint64_t *port_nb_offloads;
4768
0
    uint64_t *nb_offloads;
4769
0
    unsigned int tid;
4770
0
    size_t i;
4771
4772
0
    if (!netdev_is_flow_api_enabled()) {
4773
0
        return EINVAL;
4774
0
    }
4775
4776
0
    nb_thread = netdev_offload_thread_nb();
4777
0
    if (!nb_thread) {
4778
0
        return EINVAL;
4779
0
    }
4780
4781
    /* One counter block per thread, plus one for the overall total. */
4782
0
    stats->size = ARRAY_SIZE(hwol_stats) * (nb_thread + 1);
4783
0
    stats->counters = xcalloc(stats->size, sizeof *stats->counters);
4784
4785
0
    nb_offloads = xcalloc(nb_thread, sizeof *nb_offloads);
4786
0
    port_nb_offloads = xcalloc(nb_thread, sizeof *port_nb_offloads);
4787
4788
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
4789
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
4790
0
        memset(port_nb_offloads, 0, nb_thread * sizeof *port_nb_offloads);
4791
        /* Do not abort on read error from a port, just report 0. */
4792
0
        if (!netdev_flow_get_n_flows(port->netdev, port_nb_offloads)) {
4793
0
            for (i = 0; i < nb_thread; i++) {
4794
0
                nb_offloads[i] += port_nb_offloads[i];
4795
0
            }
4796
0
        }
4797
0
    }
4798
0
    ovs_rwlock_unlock(&dp->port_rwlock);
4799
4800
0
    free(port_nb_offloads);
4801
4802
0
    for (tid = 0; tid < nb_thread; tid++) {
4803
0
        uint64_t counts[ARRAY_SIZE(hwol_stats)];
4804
0
        size_t idx = ((tid + 1) * ARRAY_SIZE(hwol_stats));
4805
4806
0
        memset(counts, 0, sizeof counts);
4807
0
        counts[DP_NETDEV_HW_OFFLOADS_STATS_INSERTED] = nb_offloads[tid];
4808
0
        if (dp_offload_threads != NULL) {
4809
0
            atomic_read_relaxed(&dp_offload_threads[tid].enqueued_item,
4810
0
                                &counts[DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED]);
4811
4812
0
            counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN] =
4813
0
                mov_avg_cma(&dp_offload_threads[tid].cma);
4814
0
            counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV] =
4815
0
                mov_avg_cma_std_dev(&dp_offload_threads[tid].cma);
4816
4817
0
            counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN] =
4818
0
                mov_avg_ema(&dp_offload_threads[tid].ema);
4819
0
            counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV] =
4820
0
                mov_avg_ema_std_dev(&dp_offload_threads[tid].ema);
4821
0
        }
4822
4823
0
        for (i = 0; i < ARRAY_SIZE(hwol_stats); i++) {
4824
0
            snprintf(stats->counters[idx + i].name,
4825
0
                     sizeof(stats->counters[idx + i].name),
4826
0
                     "  [%3u] %s", tid, hwol_stats[i].name);
4827
0
            stats->counters[idx + i].value = counts[i];
4828
0
            hwol_stats[i].total += counts[i];
4829
0
        }
4830
0
    }
4831
4832
0
    free(nb_offloads);
4833
4834
    /* Do an average of the average for the aggregate. */
4835
0
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN].total /= nb_thread;
4836
0
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV].total /= nb_thread;
4837
0
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN].total /= nb_thread;
4838
0
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV].total /= nb_thread;
4839
4840
0
    for (i = 0; i < ARRAY_SIZE(hwol_stats); i++) {
4841
0
        snprintf(stats->counters[i].name, sizeof(stats->counters[i].name),
4842
0
                 "  Total %s", hwol_stats[i].name);
4843
0
        stats->counters[i].value = hwol_stats[i].total;
4844
0
    }
4845
4846
0
    return 0;
4847
0
}
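
The counters array above is laid out as one block of ARRAY_SIZE(hwol_stats) entries for the "Total" rows at the start, followed by one block per offload thread.  A small sketch of the index arithmetic used above; the helper name is illustrative only:

#include <stddef.h>

/* Index of statistic 'stat' for offload thread 'tid', given 'n_stats'
 * statistics per block.  Block 0 (indices 0..n_stats-1) holds the totals;
 * thread 'tid' occupies the block starting at (tid + 1) * n_stats. */
static inline size_t
example_offload_counter_index(unsigned tid, size_t stat, size_t n_stats)
{
    return (tid + 1) * n_stats + stat;
}
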
4848
4849
/* Enable or disable PMD auto load balancing. */
4850
static void
4851
set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log)
4852
0
{
4853
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4854
4855
0
    if (pmd_alb->is_enabled != state || always_log) {
4856
0
        pmd_alb->is_enabled = state;
4857
0
        if (pmd_alb->is_enabled) {
4858
0
            uint8_t rebalance_load_thresh;
4859
4860
0
            atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
4861
0
                                &rebalance_load_thresh);
4862
0
            VLOG_INFO("PMD auto load balance is enabled, "
4863
0
                      "interval %"PRIu64" mins, "
4864
0
                      "pmd load threshold %"PRIu8"%%, "
4865
0
                      "improvement threshold %"PRIu8"%%.",
4866
0
                       pmd_alb->rebalance_intvl / MIN_TO_MSEC,
4867
0
                       rebalance_load_thresh,
4868
0
                       pmd_alb->rebalance_improve_thresh);
4869
0
        } else {
4870
0
            pmd_alb->rebalance_poll_timer = 0;
4871
0
            VLOG_INFO("PMD auto load balance is disabled.");
4872
0
        }
4873
0
    }
4874
0
}
4875
4876
static int
4877
parse_pmd_sleep_list(const char *max_sleep_list,
4878
                     struct pmd_sleep **pmd_sleeps)
4879
0
{
4880
0
    char *list, *copy, *key, *value;
4881
0
    int num_vals = 0;
4882
4883
0
    if (!max_sleep_list) {
4884
0
        return num_vals;
4885
0
    }
4886
4887
0
    list = copy = xstrdup(max_sleep_list);
4888
4889
0
    while (ofputil_parse_key_value(&list, &key, &value)) {
4890
0
        uint64_t temp, pmd_max_sleep;
4891
0
        char *error = NULL;
4892
0
        unsigned core;
4893
0
        int i;
4894
4895
0
        error = str_to_u64(key, &temp);
4896
0
        if (error) {
4897
0
            free(error);
4898
0
            continue;
4899
0
        }
4900
4901
0
        if (value[0] == '\0') {
4902
            /* No value specified; the key is the datapath-wide default. */
4903
0
            core = UINT_MAX;
4904
0
            pmd_max_sleep = temp;
4905
0
        } else {
4906
0
            error = str_to_u64(value, &pmd_max_sleep);
4907
0
            if (!error && temp < UINT_MAX) {
4908
                /* Key is pmd core id. */
4909
0
                core = (unsigned) temp;
4910
0
            } else {
4911
0
                free(error);
4912
0
                continue;
4913
0
            }
4914
0
        }
4915
4916
        /* Detect duplicate entries for a core; the last value wins. */
4917
0
        for (i = 0; i < num_vals; i++) {
4918
0
            if ((*pmd_sleeps)[i].core_id == core) {
4919
0
                break;
4920
0
            }
4921
0
        }
4922
0
        if (i == num_vals) {
4923
            /* Not duplicate, add a new entry. */
4924
0
            *pmd_sleeps = xrealloc(*pmd_sleeps,
4925
0
                                   (num_vals + 1) * sizeof **pmd_sleeps);
4926
0
            num_vals++;
4927
0
        }
4928
4929
0
        pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep);
4930
4931
0
        (*pmd_sleeps)[i].core_id = core;
4932
0
        (*pmd_sleeps)[i].max_sleep = pmd_max_sleep;
4933
0
    }
4934
4935
0
    free(copy);
4936
0
    return num_vals;
4937
0
}
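
Based on the parsing above, a bare number in the list sets the datapath-wide default (stored with core_id UINT_MAX), while "core:value" entries override it for a single PMD core, and every value is capped at PMD_RCU_QUIESCE_INTERVAL.  A sketch of the entries a hypothetical list such as "50,7:100,8:0" would produce, assuming all values are below that cap; the struct is a stand-in for 'struct pmd_sleep':

#include <limits.h>
#include <stdint.h>

struct example_pmd_sleep {
    unsigned core_id;      /* UINT_MAX marks the datapath default. */
    uint64_t max_sleep;    /* Requested max sleep, in microseconds. */
};

static const struct example_pmd_sleep example_sleeps[] = {
    { UINT_MAX, 50  },     /* "50"    -> default max sleep of 50 us.          */
    { 7,        100 },     /* "7:100" -> PMD on core 7 may sleep up to 100 us. */
    { 8,        0   },     /* "8:0"   -> PMD on core 8 never sleeps.           */
};
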
4938
4939
static void
4940
log_pmd_sleep(unsigned core_id, int numa_id, uint64_t pmd_max_sleep)
4941
0
{
4942
0
    if (core_id == NON_PMD_CORE_ID) {
4943
0
        return;
4944
0
    }
4945
0
    VLOG_INFO("PMD thread on numa_id: %d, core id: %2d, "
4946
0
              "max sleep: %4"PRIu64" us.", numa_id, core_id, pmd_max_sleep);
4947
0
}
4948
4949
static void
4950
pmd_init_max_sleep(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
4951
0
{
4952
0
    uint64_t max_sleep = dp->pmd_max_sleep_default;
4953
0
    struct pmd_sleep *pmd_sleeps = NULL;
4954
0
    int num_vals;
4955
4956
0
    num_vals = parse_pmd_sleep_list(dp->max_sleep_list, &pmd_sleeps);
4957
4958
    /* Check if the user has set a specific value for this pmd. */
4959
0
    for (int i = 0; i < num_vals; i++) {
4960
0
        if (pmd_sleeps[i].core_id == pmd->core_id) {
4961
0
            max_sleep = pmd_sleeps[i].max_sleep;
4962
0
            break;
4963
0
        }
4964
0
    }
4965
0
    atomic_init(&pmd->max_sleep, max_sleep);
4966
0
    log_pmd_sleep(pmd->core_id, pmd->numa_id, max_sleep);
4967
0
    free(pmd_sleeps);
4968
0
}
4969
4970
static bool
4971
assign_sleep_values_to_pmds(struct dp_netdev *dp, int num_vals,
4972
                            struct pmd_sleep *pmd_sleeps)
4973
0
{
4974
0
    struct dp_netdev_pmd_thread *pmd;
4975
0
    bool value_changed = false;
4976
4977
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4978
0
        uint64_t new_max_sleep, cur_pmd_max_sleep;
4979
4980
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
4981
0
            continue;
4982
0
        }
4983
4984
        /* Default to global value. */
4985
0
        new_max_sleep = dp->pmd_max_sleep_default;
4986
4987
        /* Check for pmd specific value. */
4988
0
        for (int i = 0;  i < num_vals; i++) {
4989
0
            if (pmd->core_id == pmd_sleeps[i].core_id) {
4990
0
                new_max_sleep = pmd_sleeps[i].max_sleep;
4991
0
                break;
4992
0
            }
4993
0
        }
4994
0
        atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
4995
0
        if (new_max_sleep != cur_pmd_max_sleep) {
4996
0
            atomic_store_relaxed(&pmd->max_sleep, new_max_sleep);
4997
0
            value_changed = true;
4998
0
        }
4999
0
    }
5000
0
    return value_changed;
5001
0
}
5002
5003
static void
5004
log_all_pmd_sleeps(struct dp_netdev *dp)
5005
0
{
5006
0
    struct dp_netdev_pmd_thread **pmd_list = NULL;
5007
0
    struct dp_netdev_pmd_thread *pmd;
5008
0
    size_t n;
5009
5010
0
    VLOG_INFO("Default PMD thread max sleep: %4"PRIu64" us.",
5011
0
              dp->pmd_max_sleep_default);
5012
5013
0
    sorted_poll_thread_list(dp, &pmd_list, &n);
5014
5015
0
    for (size_t i = 0; i < n; i++) {
5016
0
        uint64_t cur_pmd_max_sleep;
5017
5018
0
        pmd = pmd_list[i];
5019
0
        atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
5020
0
        log_pmd_sleep(pmd->core_id, pmd->numa_id, cur_pmd_max_sleep);
5021
0
    }
5022
0
    free(pmd_list);
5023
0
}
5024
5025
static bool
5026
set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config)
5027
0
{
5028
0
    const char *max_sleep_list = smap_get(config, "pmd-sleep-max");
5029
0
    struct pmd_sleep *pmd_sleeps = NULL;
5030
0
    uint64_t default_max_sleep = 0;
5031
0
    bool default_changed = false;
5032
0
    bool pmd_changed = false;
5033
0
    uint64_t pmd_maxsleep;
5034
0
    int num_vals = 0;
5035
5036
    /* Check for deprecated 'pmd-maxsleep' value. */
5037
0
    pmd_maxsleep = smap_get_ullong(config, "pmd-maxsleep", UINT64_MAX);
5038
0
    if (pmd_maxsleep != UINT64_MAX && !max_sleep_list) {
5039
0
        VLOG_WARN_ONCE("pmd-maxsleep is deprecated. "
5040
0
                       "Please use pmd-sleep-max instead.");
5041
0
        default_max_sleep = pmd_maxsleep;
5042
0
    }
5043
5044
    /* Check if there is no change in string or value. */
5045
0
    if (!!dp->max_sleep_list == !!max_sleep_list) {
5046
0
        if (max_sleep_list
5047
0
            ? nullable_string_is_equal(max_sleep_list, dp->max_sleep_list)
5048
0
            : default_max_sleep == dp->pmd_max_sleep_default) {
5049
0
            return false;
5050
0
        }
5051
0
    }
5052
5053
    /* Free existing string and copy new one (if any). */
5054
0
    free(dp->max_sleep_list);
5055
0
    dp->max_sleep_list = nullable_xstrdup(max_sleep_list);
5056
5057
0
    if (max_sleep_list) {
5058
0
        num_vals = parse_pmd_sleep_list(max_sleep_list, &pmd_sleeps);
5059
5060
        /* Check if the user has set a global value. */
5061
0
        for (int i = 0; i < num_vals; i++) {
5062
0
            if (pmd_sleeps[i].core_id == UINT_MAX) {
5063
0
                default_max_sleep = pmd_sleeps[i].max_sleep;
5064
0
                break;
5065
0
            }
5066
0
        }
5067
0
    }
5068
5069
0
    if (dp->pmd_max_sleep_default != default_max_sleep) {
5070
0
        dp->pmd_max_sleep_default = default_max_sleep;
5071
0
        default_changed = true;
5072
0
    }
5073
0
    pmd_changed = assign_sleep_values_to_pmds(dp, num_vals, pmd_sleeps);
5074
5075
0
    free(pmd_sleeps);
5076
0
    return default_changed || pmd_changed;
5077
0
}
5078
5079
/* Applies datapath configuration from the database. Some of the changes are
5080
 * actually applied in dpif_netdev_run(). */
5081
static int
5082
dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
5083
0
{
5084
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
5085
0
    const char *cmask = smap_get(other_config, "pmd-cpu-mask");
5086
0
    const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
5087
0
                                             "cycles");
5088
0
    unsigned long long insert_prob =
5089
0
        smap_get_ullong(other_config, "emc-insert-inv-prob",
5090
0
                        DEFAULT_EM_FLOW_INSERT_INV_PROB);
5091
0
    uint32_t insert_min, cur_min;
5092
0
    uint32_t tx_flush_interval, cur_tx_flush_interval;
5093
0
    uint64_t rebalance_intvl;
5094
0
    uint8_t cur_rebalance_load;
5095
0
    uint32_t rebalance_load, rebalance_improve;
5096
0
    bool log_autolb = false;
5097
0
    enum sched_assignment_type pmd_rxq_assign_type;
5098
0
    static bool first_set_config = true;
5099
5100
0
    tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
5101
0
                                     DEFAULT_TX_FLUSH_INTERVAL);
5102
0
    atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
5103
0
    if (tx_flush_interval != cur_tx_flush_interval) {
5104
0
        atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
5105
0
        VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
5106
0
                  tx_flush_interval);
5107
0
    }
5108
5109
0
    if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
5110
0
        free(dp->pmd_cmask);
5111
0
        dp->pmd_cmask = nullable_xstrdup(cmask);
5112
0
        dp_netdev_request_reconfigure(dp);
5113
0
    }
5114
5115
0
    atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
5116
0
    if (insert_prob <= UINT32_MAX) {
5117
0
        insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
5118
0
    } else {
5119
0
        insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
5120
0
        insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
5121
0
    }
5122
5123
0
    if (insert_min != cur_min) {
5124
0
        atomic_store_relaxed(&dp->emc_insert_min, insert_min);
5125
0
        if (insert_min == 0) {
5126
0
            VLOG_INFO("EMC insertion probability changed to zero");
5127
0
        } else {
5128
0
            VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
5129
0
                      insert_prob, (100 / (float)insert_prob));
5130
0
        }
5131
0
    }
5132
5133
0
    bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
5134
0
    bool cur_perf_enabled;
5135
0
    atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
5136
0
    if (perf_enabled != cur_perf_enabled) {
5137
0
        atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
5138
0
        if (perf_enabled) {
5139
0
            VLOG_INFO("PMD performance metrics collection enabled");
5140
0
        } else {
5141
0
            VLOG_INFO("PMD performance metrics collection disabled");
5142
0
        }
5143
0
    }
5144
5145
0
    bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
5146
0
    bool cur_smc;
5147
0
    atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
5148
0
    if (smc_enable != cur_smc) {
5149
0
        atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
5150
0
        if (smc_enable) {
5151
0
            VLOG_INFO("SMC cache is enabled");
5152
0
        } else {
5153
0
            VLOG_INFO("SMC cache is disabled");
5154
0
        }
5155
0
    }
5156
5157
0
    if (!strcmp(pmd_rxq_assign, "roundrobin")) {
5158
0
        pmd_rxq_assign_type = SCHED_ROUNDROBIN;
5159
0
    } else if (!strcmp(pmd_rxq_assign, "cycles")) {
5160
0
        pmd_rxq_assign_type = SCHED_CYCLES;
5161
0
    } else if (!strcmp(pmd_rxq_assign, "group")) {
5162
0
        pmd_rxq_assign_type = SCHED_GROUP;
5163
0
    } else {
5164
        /* Default. */
5165
0
        VLOG_WARN("Unsupported rx queue to PMD assignment mode in "
5166
0
                  "pmd-rxq-assign. Defaulting to 'cycles'.");
5167
0
        pmd_rxq_assign_type = SCHED_CYCLES;
5168
0
        pmd_rxq_assign = "cycles";
5169
0
    }
5170
0
    if (dp->pmd_rxq_assign_type != pmd_rxq_assign_type) {
5171
0
        dp->pmd_rxq_assign_type = pmd_rxq_assign_type;
5172
0
        VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
5173
0
                  pmd_rxq_assign);
5174
0
        dp_netdev_request_reconfigure(dp);
5175
0
    }
5176
5177
0
    bool pmd_iso = smap_get_bool(other_config, "pmd-rxq-isolate", true);
5178
5179
0
    if (pmd_rxq_assign_type != SCHED_GROUP && pmd_iso == false) {
5180
        /* Invalid combination. */
5181
0
        VLOG_WARN("pmd-rxq-isolate can only be set false "
5182
0
                  "when using pmd-rxq-assign=group");
5183
0
        pmd_iso = true;
5184
0
    }
5185
0
    if (dp->pmd_iso != pmd_iso) {
5186
0
        dp->pmd_iso = pmd_iso;
5187
0
        if (pmd_iso) {
5188
0
            VLOG_INFO("pmd-rxq-affinity isolates PMD core");
5189
0
        } else {
5190
0
            VLOG_INFO("pmd-rxq-affinity does not isolate PMD core");
5191
0
        }
5192
0
        dp_netdev_request_reconfigure(dp);
5193
0
    }
5194
5195
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5196
5197
0
    rebalance_intvl = smap_get_ullong(other_config,
5198
0
                                      "pmd-auto-lb-rebal-interval",
5199
0
                                      ALB_REBALANCE_INTERVAL);
5200
0
    if (rebalance_intvl > MAX_ALB_REBALANCE_INTERVAL) {
5201
0
        rebalance_intvl = ALB_REBALANCE_INTERVAL;
5202
0
    }
5203
5204
    /* Input is in minutes; convert it to msec. */
5205
0
    rebalance_intvl =
5206
0
        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
5207
5208
0
    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
5209
0
        pmd_alb->rebalance_intvl = rebalance_intvl;
5210
0
        VLOG_INFO("PMD auto load balance interval set to "
5211
0
                  "%"PRIu64" mins\n", rebalance_intvl / MIN_TO_MSEC);
5212
0
        log_autolb = true;
5213
0
    }
5214
5215
0
    rebalance_improve = smap_get_uint(other_config,
5216
0
                                      "pmd-auto-lb-improvement-threshold",
5217
0
                                      ALB_IMPROVEMENT_THRESHOLD);
5218
0
    if (rebalance_improve > 100) {
5219
0
        rebalance_improve = ALB_IMPROVEMENT_THRESHOLD;
5220
0
    }
5221
0
    if (rebalance_improve != pmd_alb->rebalance_improve_thresh) {
5222
0
        pmd_alb->rebalance_improve_thresh = rebalance_improve;
5223
0
        VLOG_INFO("PMD auto load balance improvement threshold set to "
5224
0
                  "%"PRIu32"%%", rebalance_improve);
5225
0
        log_autolb = true;
5226
0
    }
5227
5228
0
    rebalance_load = smap_get_uint(other_config, "pmd-auto-lb-load-threshold",
5229
0
                                   ALB_LOAD_THRESHOLD);
5230
0
    if (rebalance_load > 100) {
5231
0
        rebalance_load = ALB_LOAD_THRESHOLD;
5232
0
    }
5233
0
    atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, &cur_rebalance_load);
5234
0
    if (rebalance_load != cur_rebalance_load) {
5235
0
        atomic_store_relaxed(&pmd_alb->rebalance_load_thresh,
5236
0
                             rebalance_load);
5237
0
        VLOG_INFO("PMD auto load balance load threshold set to %"PRIu32"%%",
5238
0
                  rebalance_load);
5239
0
        log_autolb = true;
5240
0
    }
5241
5242
0
    bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false);
5243
5244
0
    set_pmd_auto_lb(dp, autolb_state, log_autolb);
5245
5246
0
    bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config);
5247
0
    if (first_set_config || sleep_changed) {
5248
0
        log_all_pmd_sleeps(dp);
5249
0
    }
5250
5251
0
    first_set_config = false;
5252
0
    return 0;
5253
0
}
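
For the emc-insert-inv-prob handling above, the configured inverse probability P is turned into a 32-bit threshold UINT32_MAX / P, so roughly one packet in P triggers an EMC insertion, and a threshold of zero disables insertion.  A minimal sketch of that mapping; the helper name is illustrative, not a datapath function:

#include <stdint.h>

/* Maps the configured inverse probability to the threshold stored in
 * dp->emc_insert_min above: 0 disables insertion, otherwise a fraction
 * UINT32_MAX / inv_prob of the 32-bit range, i.e. roughly a 1/inv_prob
 * chance per packet.  example_emc_insert_min(100) is ~1%. */
static inline uint32_t
example_emc_insert_min(unsigned long long inv_prob)
{
    return inv_prob == 0 ? 0 : (uint32_t) (UINT32_MAX / inv_prob);
}
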
5254
5255
static bool
5256
dpif_netdev_number_handlers_required(struct dpif *dpif_ OVS_UNUSED,
5257
                                     uint32_t *n_handlers)
5258
0
{
5259
0
    *n_handlers = 0;
5260
0
    return true;
5261
0
}
5262
5263
/* Parses affinity list and returns result in 'core_ids'. */
5264
static int
5265
parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
5266
0
{
5267
0
    unsigned i;
5268
0
    char *list, *copy, *key, *value;
5269
0
    int error = 0;
5270
5271
0
    for (i = 0; i < n_rxq; i++) {
5272
0
        core_ids[i] = OVS_CORE_UNSPEC;
5273
0
    }
5274
5275
0
    if (!affinity_list) {
5276
0
        return 0;
5277
0
    }
5278
5279
0
    list = copy = xstrdup(affinity_list);
5280
5281
0
    while (ofputil_parse_key_value(&list, &key, &value)) {
5282
0
        int rxq_id, core_id;
5283
5284
0
        if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
5285
0
            || !str_to_int(value, 0, &core_id) || core_id < 0) {
5286
0
            error = EINVAL;
5287
0
            break;
5288
0
        }
5289
5290
0
        if (rxq_id < n_rxq) {
5291
0
            core_ids[rxq_id] = core_id;
5292
0
        }
5293
0
    }
5294
5295
0
    free(copy);
5296
0
    return error;
5297
0
}
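
Based on the parser above, "pmd-rxq-affinity" is a list of rxq:core pairs; queues that are not listed keep OVS_CORE_UNSPEC and are left for the scheduler to place.  A sketch of the result for a hypothetical list "0:3,1:7" on a port with four rx queues; the UNSPEC stand-in is illustrative, not the real macro value:

#include <limits.h>

#define EXAMPLE_CORE_UNSPEC UINT_MAX       /* Stand-in for OVS_CORE_UNSPEC. */

static const unsigned example_core_ids[4] = {
    3,                     /* rxq 0 pinned to core 3 ("0:3"). */
    7,                     /* rxq 1 pinned to core 7 ("1:7"). */
    EXAMPLE_CORE_UNSPEC,   /* rxq 2 not listed: left to the scheduler. */
    EXAMPLE_CORE_UNSPEC,   /* rxq 3 not listed: left to the scheduler. */
};
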
5298
5299
/* Parses 'affinity_list' and applies configuration if it is valid. */
5300
static int
5301
dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
5302
                                  const char *affinity_list)
5303
0
{
5304
0
    unsigned *core_ids, i;
5305
0
    int error = 0;
5306
5307
0
    core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
5308
0
    if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
5309
0
        error = EINVAL;
5310
0
        goto exit;
5311
0
    }
5312
5313
0
    for (i = 0; i < port->n_rxq; i++) {
5314
0
        port->rxqs[i].core_id = core_ids[i];
5315
0
    }
5316
5317
0
exit:
5318
0
    free(core_ids);
5319
0
    return error;
5320
0
}
5321
5322
/* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
5323
 * of the given PMD thread. */
5324
static bool
5325
dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
5326
                           struct dp_netdev_port *port)
5327
    OVS_EXCLUDED(pmd->port_mutex)
5328
0
{
5329
0
    struct rxq_poll *poll;
5330
0
    bool found = false;
5331
5332
0
    ovs_mutex_lock(&pmd->port_mutex);
5333
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5334
0
        if (port == poll->rxq->port) {
5335
0
            found = true;
5336
0
            break;
5337
0
        }
5338
0
    }
5339
0
    ovs_mutex_unlock(&pmd->port_mutex);
5340
0
    return found;
5341
0
}
5342
5343
/* Updates port configuration from the database.  The changes are actually
5344
 * applied in dpif_netdev_run(). */
5345
static int
5346
dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
5347
                            const struct smap *cfg)
5348
0
{
5349
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
5350
0
    struct dp_netdev_port *port;
5351
0
    int error = 0;
5352
0
    const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
5353
0
    bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
5354
0
    const char *tx_steering_mode = smap_get(cfg, "tx-steering");
5355
0
    enum txq_req_mode txq_mode;
5356
5357
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
5358
0
    error = get_port_by_number(dp, port_no, &port);
5359
0
    if (error) {
5360
0
        goto unlock;
5361
0
    }
5362
5363
0
    if (emc_enabled != port->emc_enabled) {
5364
0
        struct dp_netdev_pmd_thread *pmd;
5365
0
        struct ds ds = DS_EMPTY_INITIALIZER;
5366
0
        uint32_t cur_min, insert_prob;
5367
5368
0
        port->emc_enabled = emc_enabled;
5369
        /* Mark for reload all the threads that poll this port and request
5370
         * a reconfiguration for the actual reloading of threads. */
5371
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5372
0
            if (dpif_netdev_pmd_polls_port(pmd, port)) {
5373
0
                pmd->need_reload = true;
5374
0
            }
5375
0
        }
5376
0
        dp_netdev_request_reconfigure(dp);
5377
5378
0
        ds_put_format(&ds, "%s: EMC has been %s.",
5379
0
                      netdev_get_name(port->netdev),
5380
0
                      (emc_enabled) ? "enabled" : "disabled");
5381
0
        if (emc_enabled) {
5382
0
            ds_put_cstr(&ds, " Current insertion probability is ");
5383
0
            atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
5384
0
            if (!cur_min) {
5385
0
                ds_put_cstr(&ds, "zero.");
5386
0
            } else {
5387
0
                insert_prob = UINT32_MAX / cur_min;
5388
0
                ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
5389
0
                              insert_prob, 100 / (float) insert_prob);
5390
0
            }
5391
0
        }
5392
0
        VLOG_INFO("%s", ds_cstr(&ds));
5393
0
        ds_destroy(&ds);
5394
0
    }
5395
5396
    /* Checking for RXq affinity changes. */
5397
0
    if (netdev_is_pmd(port->netdev)
5398
0
        && !nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
5399
5400
0
        error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
5401
0
        if (error) {
5402
0
            goto unlock;
5403
0
        }
5404
0
        free(port->rxq_affinity_list);
5405
0
        port->rxq_affinity_list = nullable_xstrdup(affinity_list);
5406
5407
0
        dp_netdev_request_reconfigure(dp);
5408
0
    }
5409
5410
0
    if (nullable_string_is_equal(tx_steering_mode, "hash")) {
5411
0
        txq_mode = TXQ_REQ_MODE_HASH;
5412
0
    } else {
5413
0
        txq_mode = TXQ_REQ_MODE_THREAD;
5414
0
    }
5415
5416
0
    if (txq_mode != port->txq_requested_mode) {
5417
0
        port->txq_requested_mode = txq_mode;
5418
0
        VLOG_INFO("%s: Tx packet steering mode has been set to '%s'.",
5419
0
                  netdev_get_name(port->netdev),
5420
0
                  (txq_mode == TXQ_REQ_MODE_THREAD) ? "thread" : "hash");
5421
0
        dp_netdev_request_reconfigure(dp);
5422
0
    }
5423
5424
0
unlock:
5425
0
    ovs_rwlock_unlock(&dp->port_rwlock);
5426
0
    return error;
5427
0
}
5428
5429
static int
5430
dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
5431
                              uint32_t queue_id, uint32_t *priority)
5432
0
{
5433
0
    *priority = queue_id;
5434
0
    return 0;
5435
0
}
5436
5437

5438
/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
5439
 * a copy of the 'size' bytes of the 'actions' input parameter. */
5440
struct dp_netdev_actions *
5441
dp_netdev_actions_create(const struct nlattr *actions, size_t size)
5442
0
{
5443
0
    struct dp_netdev_actions *netdev_actions;
5444
5445
0
    netdev_actions = xmalloc(sizeof *netdev_actions + size);
5446
0
    netdev_actions->size = size;
5447
0
    if (size) {
5448
0
        memcpy(netdev_actions->actions, actions, size);
5449
0
    }
5450
5451
0
    return netdev_actions;
5452
0
}
5453
5454
struct dp_netdev_actions *
5455
dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
5456
0
{
5457
0
    return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
5458
0
}
5459
5460
static void
5461
dp_netdev_actions_free(struct dp_netdev_actions *actions)
5462
0
{
5463
0
    free(actions);
5464
0
}
5465

5466
static void
5467
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
5468
                         enum rxq_cycles_counter_type type,
5469
                         unsigned long long cycles)
5470
0
{
5471
0
   atomic_store_relaxed(&rx->cycles[type], cycles);
5472
0
}
5473
5474
static void
5475
dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
5476
                         enum rxq_cycles_counter_type type,
5477
                         unsigned long long cycles)
5478
0
{
5479
0
    non_atomic_ullong_add(&rx->cycles[type], cycles);
5480
0
}
5481
5482
static uint64_t
5483
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
5484
                         enum rxq_cycles_counter_type type)
5485
0
{
5486
0
    unsigned long long processing_cycles;
5487
0
    atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
5488
0
    return processing_cycles;
5489
0
}
5490
5491
static void
5492
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
5493
                                unsigned long long cycles)
5494
0
{
5495
0
    unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX;
5496
0
    atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
5497
0
}
5498
5499
static uint64_t
5500
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
5501
0
{
5502
0
    unsigned long long processing_cycles;
5503
0
    atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
5504
0
    return processing_cycles;
5505
0
}
5506
5507
#if ATOMIC_ALWAYS_LOCK_FREE_8B
5508
static inline bool
5509
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
5510
0
{
5511
0
    bool pmd_perf_enabled;
5512
0
    atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
5513
0
    return pmd_perf_enabled;
5514
0
}
5515
#else
5516
/* If stores and reads of 64-bit integers are not atomic, the full PMD
5517
 * performance metrics are not available as locked access to 64 bit
5518
 * integers would be prohibitively expensive. */
5519
static inline bool
5520
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
5521
{
5522
    return false;
5523
}
5524
#endif
5525
5526
static int
5527
dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
5528
                                   struct tx_port *p)
5529
0
{
5530
0
    int i;
5531
0
    int tx_qid;
5532
0
    int output_cnt;
5533
0
    bool concurrent_txqs;
5534
0
    struct cycle_timer timer;
5535
0
    uint64_t cycles;
5536
0
    uint32_t tx_flush_interval;
5537
5538
0
    cycle_timer_start(&pmd->perf_stats, &timer);
5539
5540
0
    output_cnt = dp_packet_batch_size(&p->output_pkts);
5541
0
    ovs_assert(output_cnt > 0);
5542
5543
0
    if (p->port->txq_mode == TXQ_MODE_XPS_HASH) {
5544
0
        int n_txq = netdev_n_txq(p->port->netdev);
5545
5546
        /* Re-batch per txq based on packet hash. */
5547
0
        struct dp_packet *packet;
5548
0
        DP_PACKET_BATCH_FOR_EACH (j, packet, &p->output_pkts) {
5549
0
            uint32_t hash;
5550
5551
0
            if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
5552
0
                hash = dp_packet_get_rss_hash(packet);
5553
0
            } else {
5554
0
                struct flow flow;
5555
5556
0
                flow_extract(packet, &flow);
5557
0
                hash = flow_hash_5tuple(&flow, 0);
5558
0
            }
5559
0
            dp_packet_batch_add(&p->txq_pkts[hash % n_txq], packet);
5560
0
        }
5561
5562
        /* Flush batches of each Tx queues. */
5563
0
        for (i = 0; i < n_txq; i++) {
5564
0
            if (dp_packet_batch_is_empty(&p->txq_pkts[i])) {
5565
0
                continue;
5566
0
            }
5567
0
            netdev_send(p->port->netdev, i, &p->txq_pkts[i], true);
5568
0
            dp_packet_batch_init(&p->txq_pkts[i]);
5569
0
        }
5570
0
    } else {
5571
0
        if (p->port->txq_mode == TXQ_MODE_XPS) {
5572
0
            tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
5573
0
            concurrent_txqs = true;
5574
0
        } else {
5575
0
            tx_qid = pmd->static_tx_qid;
5576
0
            concurrent_txqs = false;
5577
0
        }
5578
0
        netdev_send(p->port->netdev, tx_qid, &p->output_pkts, concurrent_txqs);
5579
0
    }
5580
0
    dp_packet_batch_init(&p->output_pkts);
5581
5582
    /* Update time of the next flush. */
5583
0
    atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
5584
0
    p->flush_time = pmd->ctx.now + tx_flush_interval;
5585
5586
0
    ovs_assert(pmd->n_output_batches > 0);
5587
0
    pmd->n_output_batches--;
5588
5589
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
5590
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
5591
5592
    /* Distribute send cycles evenly among transmitted packets and assign to
5593
     * their respective rx queues. */
5594
0
    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
5595
0
    for (i = 0; i < output_cnt; i++) {
5596
0
        if (p->output_pkts_rxqs[i]) {
5597
0
            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
5598
0
                                     RXQ_CYCLES_PROC_CURR, cycles);
5599
0
        }
5600
0
    }
5601
5602
0
    return output_cnt;
5603
0
}
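
In hash mode above, each packet's RSS (or freshly computed 5-tuple) hash picks the Tx queue, so all packets of one flow land on the same queue.  A minimal sketch of that selection; names are illustrative:

#include <stdint.h>

static inline int
example_pick_txq(uint32_t packet_hash, int n_txq)
{
    /* Same hash -> same queue, keeping a flow's packets on one Tx queue. */
    return (int) (packet_hash % (uint32_t) n_txq);
}
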
5604
5605
static int
5606
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
5607
                                   bool force)
5608
0
{
5609
0
    struct tx_port *p;
5610
0
    int output_cnt = 0;
5611
5612
0
    if (!pmd->n_output_batches) {
5613
0
        return 0;
5614
0
    }
5615
5616
0
    HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
5617
0
        if (!dp_packet_batch_is_empty(&p->output_pkts)
5618
0
            && (force || pmd->ctx.now >= p->flush_time)) {
5619
0
            output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
5620
0
        }
5621
0
    }
5622
0
    return output_cnt;
5623
0
}
5624
5625
static int
5626
dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
5627
                           struct dp_netdev_rxq *rxq,
5628
                           odp_port_t port_no)
5629
0
{
5630
0
    struct pmd_perf_stats *s = &pmd->perf_stats;
5631
0
    struct dp_packet_batch batch;
5632
0
    struct cycle_timer timer;
5633
0
    int error;
5634
0
    int batch_cnt = 0;
5635
0
    int rem_qlen = 0, *qlen_p = NULL;
5636
0
    uint64_t cycles;
5637
5638
    /* Measure duration for polling and processing rx burst. */
5639
0
    cycle_timer_start(&pmd->perf_stats, &timer);
5640
5641
0
    pmd->ctx.last_rxq = rxq;
5642
0
    dp_packet_batch_init(&batch);
5643
5644
    /* Fetch the rx queue length only for vhostuser ports. */
5645
0
    if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
5646
0
        qlen_p = &rem_qlen;
5647
0
    }
5648
5649
0
    error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
5650
0
    if (!error) {
5651
        /* At least one packet received. */
5652
0
        *recirc_depth_get() = 0;
5653
0
        pmd_thread_ctx_time_update(pmd);
5654
0
        batch_cnt = dp_packet_batch_size(&batch);
5655
0
        if (pmd_perf_metrics_enabled(pmd)) {
5656
            /* Update batch histogram. */
5657
0
            s->current.batches++;
5658
0
            histogram_add_sample(&s->pkts_per_batch, batch_cnt);
5659
            /* Update the maximum vhost rx queue fill level. */
5660
0
            if (rxq->is_vhost && rem_qlen >= 0) {
5661
0
                uint32_t qfill = batch_cnt + rem_qlen;
5662
0
                if (qfill > s->current.max_vhost_qfill) {
5663
0
                    s->current.max_vhost_qfill = qfill;
5664
0
                }
5665
0
            }
5666
0
        }
5667
5668
        /* Process packet batch. */
5669
0
        int ret = pmd->netdev_input_func(pmd, &batch, port_no);
5670
0
        if (ret) {
5671
0
            dp_netdev_input(pmd, &batch, port_no);
5672
0
        }
5673
5674
        /* Assign processing cycles to rx queue. */
5675
0
        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
5676
0
        dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
5677
5678
0
        dp_netdev_pmd_flush_output_packets(pmd, false);
5679
0
    } else {
5680
        /* Discard cycles. */
5681
0
        cycle_timer_stop(&pmd->perf_stats, &timer);
5682
0
        if (error != EAGAIN && error != EOPNOTSUPP) {
5683
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
5684
5685
0
            VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
5686
0
                    netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
5687
0
        }
5688
0
    }
5689
5690
0
    pmd->ctx.last_rxq = NULL;
5691
5692
0
    return batch_cnt;
5693
0
}
5694
5695
static struct tx_port *
5696
tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
5697
0
{
5698
0
    struct tx_port *tx;
5699
5700
0
    HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
5701
0
        if (tx->port->port_no == port_no) {
5702
0
            return tx;
5703
0
        }
5704
0
    }
5705
5706
0
    return NULL;
5707
0
}
5708
5709
static struct tx_bond *
5710
tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
5711
0
{
5712
0
    uint32_t hash = hash_bond_id(bond_id);
5713
0
    struct tx_bond *tx;
5714
5715
0
    CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
5716
0
        if (tx->bond_id == bond_id) {
5717
0
            return tx;
5718
0
        }
5719
0
    }
5720
0
    return NULL;
5721
0
}
5722
5723
static int
5724
port_reconfigure(struct dp_netdev_port *port)
5725
0
{
5726
0
    struct netdev *netdev = port->netdev;
5727
0
    int i, err;
5728
5729
    /* Closes the existing 'rxq's. */
5730
0
    for (i = 0; i < port->n_rxq; i++) {
5731
0
        netdev_rxq_close(port->rxqs[i].rx);
5732
0
        port->rxqs[i].rx = NULL;
5733
0
    }
5734
0
    unsigned last_nrxq = port->n_rxq;
5735
0
    port->n_rxq = 0;
5736
5737
    /* Allows 'netdev' to apply the pending configuration changes. */
5738
0
    if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
5739
0
        err = netdev_reconfigure(netdev);
5740
0
        if (err && (err != EOPNOTSUPP)) {
5741
0
            VLOG_ERR("Failed to set interface %s new configuration",
5742
0
                     netdev_get_name(netdev));
5743
0
            return err;
5744
0
        }
5745
0
    }
5746
    /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
5747
0
    port->rxqs = xrealloc(port->rxqs,
5748
0
                          sizeof *port->rxqs * netdev_n_rxq(netdev));
5749
    /* Realloc 'used' counters for tx queues. */
5750
0
    free(port->txq_used);
5751
0
    port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
5752
5753
0
    for (i = 0; i < netdev_n_rxq(netdev); i++) {
5754
0
        bool new_queue = i >= last_nrxq;
5755
0
        if (new_queue) {
5756
0
            memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
5757
0
        }
5758
5759
0
        port->rxqs[i].port = port;
5760
0
        port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
5761
5762
0
        err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
5763
0
        if (err) {
5764
0
            return err;
5765
0
        }
5766
0
        port->n_rxq++;
5767
0
    }
5768
5769
    /* Parse affinity list to apply configuration for new queues. */
5770
0
    dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
5771
5772
    /* If reconfiguration was successful, mark it as such so we can use it. */
5773
0
    port->need_reconfigure = false;
5774
5775
0
    return 0;
5776
0
}
5777
5778
struct sched_numa_list {
5779
    struct hmap numas;  /* Contains 'struct sched_numa'. */
5780
};
5781
5782
/* Meta data for out-of-place pmd rxq assignments. */
5783
struct sched_pmd {
5784
    struct sched_numa *numa;
5785
    /* Associated PMD thread. */
5786
    struct dp_netdev_pmd_thread *pmd;
5787
    uint64_t pmd_proc_cycles;
5788
    struct dp_netdev_rxq **rxqs;
5789
    unsigned n_rxq;
5790
    bool isolated;
5791
};
5792
5793
struct sched_numa {
5794
    struct hmap_node node;
5795
    int numa_id;
5796
    /* PMDs on numa node. */
5797
    struct sched_pmd *pmds;
5798
    /* Num of PMDs on numa node. */
5799
    unsigned n_pmds;
5800
    /* Num of isolated PMDs on numa node. */
5801
    unsigned n_isolated;
5802
    int rr_cur_index;
5803
    bool rr_idx_inc;
5804
};
5805
5806
static size_t
5807
sched_numa_list_count(struct sched_numa_list *numa_list)
5808
0
{
5809
0
    return hmap_count(&numa_list->numas);
5810
0
}
5811
5812
static struct sched_numa *
5813
sched_numa_list_next(struct sched_numa_list *numa_list,
5814
                     const struct sched_numa *numa)
5815
0
{
5816
0
    struct hmap_node *node = NULL;
5817
5818
0
    if (numa) {
5819
0
        node = hmap_next(&numa_list->numas, &numa->node);
5820
0
    }
5821
0
    if (!node) {
5822
0
        node = hmap_first(&numa_list->numas);
5823
0
    }
5824
5825
0
    return (node) ? CONTAINER_OF(node, struct sched_numa, node) : NULL;
5826
0
}
5827
5828
static struct sched_numa *
5829
sched_numa_list_lookup(struct sched_numa_list *numa_list, int numa_id)
5830
0
{
5831
0
    struct sched_numa *numa;
5832
5833
0
    HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0),
5834
0
                             &numa_list->numas) {
5835
0
        if (numa->numa_id == numa_id) {
5836
0
            return numa;
5837
0
        }
5838
0
    }
5839
0
    return NULL;
5840
0
}
5841
5842
static int
5843
compare_sched_pmd_list(const void *a_, const void *b_)
5844
0
{
5845
0
    struct sched_pmd *a, *b;
5846
5847
0
    a = (struct sched_pmd *) a_;
5848
0
    b = (struct sched_pmd *) b_;
5849
5850
0
    return compare_poll_thread_list(&a->pmd, &b->pmd);
5851
0
}
5852
5853
static void
5854
sort_numa_list_pmds(struct sched_numa_list *numa_list)
5855
0
{
5856
0
    struct sched_numa *numa;
5857
5858
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5859
0
        if (numa->n_pmds > 1) {
5860
0
            qsort(numa->pmds, numa->n_pmds, sizeof *numa->pmds,
5861
0
                  compare_sched_pmd_list);
5862
0
        }
5863
0
    }
5864
0
}
5865
5866
/* Populate numas and pmds on those numas. */
5867
static void
5868
sched_numa_list_populate(struct sched_numa_list *numa_list,
5869
                         struct dp_netdev *dp)
5870
0
{
5871
0
    struct dp_netdev_pmd_thread *pmd;
5872
5873
0
    hmap_init(&numa_list->numas);
5874
5875
    /* For each pmd on this datapath. */
5876
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5877
0
        struct sched_numa *numa;
5878
0
        struct sched_pmd *sched_pmd;
5879
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
5880
0
            continue;
5881
0
        }
5882
5883
        /* Get the numa of the PMD. */
5884
0
        numa = sched_numa_list_lookup(numa_list, pmd->numa_id);
5885
        /* Create a new numa node for it if not already created. */
5886
0
        if (!numa) {
5887
0
            numa = xzalloc(sizeof *numa);
5888
0
            numa->numa_id = pmd->numa_id;
5889
0
            hmap_insert(&numa_list->numas, &numa->node,
5890
0
                        hash_int(pmd->numa_id, 0));
5891
0
        }
5892
5893
        /* Create a sched_pmd on this numa for the pmd. */
5894
0
        numa->n_pmds++;
5895
0
        numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
5896
0
        sched_pmd = &numa->pmds[numa->n_pmds - 1];
5897
0
        memset(sched_pmd, 0, sizeof *sched_pmd);
5898
0
        sched_pmd->numa = numa;
5899
0
        sched_pmd->pmd = pmd;
5900
        /* At least one pmd is present; initialize rr_cur_index and rr_idx_inc. */
5901
0
        numa->rr_cur_index = 0;
5902
0
        numa->rr_idx_inc = true;
5903
0
    }
5904
0
    sort_numa_list_pmds(numa_list);
5905
0
}
5906
5907
static void
5908
sched_numa_list_free_entries(struct sched_numa_list *numa_list)
5909
0
{
5910
0
    struct sched_numa *numa;
5911
5912
0
    HMAP_FOR_EACH_POP (numa, node, &numa_list->numas) {
5913
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
5914
0
            struct sched_pmd *sched_pmd;
5915
5916
0
            sched_pmd = &numa->pmds[i];
5917
0
            sched_pmd->n_rxq = 0;
5918
0
            free(sched_pmd->rxqs);
5919
0
        }
5920
0
        numa->n_pmds = 0;
5921
0
        free(numa->pmds);
5922
0
        free(numa);
5923
0
    }
5924
0
    hmap_destroy(&numa_list->numas);
5925
0
}
5926
5927
static struct sched_pmd *
5928
sched_pmd_find_by_pmd(struct sched_numa_list *numa_list,
5929
                      struct dp_netdev_pmd_thread *pmd)
5930
0
{
5931
0
    struct sched_numa *numa;
5932
5933
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5934
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
5935
0
            struct sched_pmd *sched_pmd;
5936
5937
0
            sched_pmd = &numa->pmds[i];
5938
0
            if (pmd == sched_pmd->pmd) {
5939
0
                return sched_pmd;
5940
0
            }
5941
0
        }
5942
0
    }
5943
0
    return NULL;
5944
0
}
5945
5946
static void
5947
sched_pmd_add_rxq(struct sched_pmd *sched_pmd, struct dp_netdev_rxq *rxq,
5948
                  uint64_t cycles)
5949
0
{
5950
    /* As sched_pmd is allocated outside this fn. better to not assume
5951
     * rxqs is initialized to NULL. */
5952
0
    if (sched_pmd->n_rxq == 0) {
5953
0
        sched_pmd->rxqs = xmalloc(sizeof *sched_pmd->rxqs);
5954
0
    } else {
5955
0
        sched_pmd->rxqs = xrealloc(sched_pmd->rxqs, (sched_pmd->n_rxq + 1) *
5956
0
                                                    sizeof *sched_pmd->rxqs);
5957
0
    }
5958
5959
0
    sched_pmd->rxqs[sched_pmd->n_rxq++] = rxq;
5960
0
    sched_pmd->pmd_proc_cycles += cycles;
5961
0
}
5962
5963
static void
5964
sched_numa_list_assignments(struct sched_numa_list *numa_list,
5965
                            struct dp_netdev *dp)
5966
    OVS_REQ_RDLOCK(dp->port_rwlock)
5967
0
{
5968
0
    struct dp_netdev_port *port;
5969
5970
    /* For each port. */
5971
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5972
0
        if (!netdev_is_pmd(port->netdev)) {
5973
0
            continue;
5974
0
        }
5975
        /* For each rxq on the port. */
5976
0
        for (unsigned qid = 0; qid < port->n_rxq; qid++) {
5977
0
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
5978
0
            struct sched_pmd *sched_pmd;
5979
0
            uint64_t proc_cycles = 0;
5980
5981
0
            for (int i = 0; i < PMD_INTERVAL_MAX; i++) {
5982
0
                proc_cycles  += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
5983
0
            }
5984
5985
0
            sched_pmd = sched_pmd_find_by_pmd(numa_list, rxq->pmd);
5986
0
            if (sched_pmd) {
5987
0
                if (rxq->core_id != OVS_CORE_UNSPEC && dp->pmd_iso) {
5988
0
                    sched_pmd->isolated = true;
5989
0
                }
5990
0
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
5991
0
            }
5992
0
        }
5993
0
    }
5994
0
}
5995
5996
static void
5997
sched_numa_list_put_in_place(struct sched_numa_list *numa_list)
5998
0
{
5999
0
    struct sched_numa *numa;
6000
6001
    /* For each numa. */
6002
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
6003
        /* For each pmd. */
6004
0
        for (int i = 0; i < numa->n_pmds; i++) {
6005
0
            struct sched_pmd *sched_pmd;
6006
6007
0
            sched_pmd = &numa->pmds[i];
6008
0
            sched_pmd->pmd->isolated = sched_pmd->isolated;
6009
            /* For each rxq. */
6010
0
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
6011
                /* Store the new pmd from the out of place sched_numa_list
6012
                 * struct to the dp_netdev_rxq struct */
6013
0
                sched_pmd->rxqs[k]->pmd = sched_pmd->pmd;
6014
0
            }
6015
0
        }
6016
0
    }
6017
0
}
6018
6019
/* Returns 'true' if OVS rxq scheduling algorithm assigned any unpinned rxq to
6020
 * a PMD thread core on a non-local numa node. */
6021
static bool
6022
sched_numa_list_cross_numa_polling(struct sched_numa_list *numa_list)
6023
0
{
6024
0
    struct sched_numa *numa;
6025
6026
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
6027
0
        for (int i = 0; i < numa->n_pmds; i++) {
6028
0
            struct sched_pmd *sched_pmd;
6029
6030
0
            sched_pmd = &numa->pmds[i];
6031
0
            if (sched_pmd->isolated) {
6032
                /* All rxqs on this PMD thread core are pinned. */
6033
0
                continue;
6034
0
            }
6035
0
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
6036
0
                struct dp_netdev_rxq *rxq = sched_pmd->rxqs[k];
6037
                /* Check if the rxq is not pinned to a specific PMD thread core
6038
                 * by the user AND the PMD thread core that OVS assigned is
6039
                 * non-local to the rxq port. */
6040
0
                if (rxq->core_id == OVS_CORE_UNSPEC &&
6041
0
                    rxq->pmd->numa_id !=
6042
0
                        netdev_get_numa_id(rxq->port->netdev)) {
6043
0
                    return true;
6044
0
                }
6045
0
            }
6046
0
        }
6047
0
    }
6048
0
    return false;
6049
0
}
6050
6051
static unsigned
6052
sched_numa_noniso_pmd_count(struct sched_numa *numa)
6053
0
{
6054
0
    if (numa->n_pmds > numa->n_isolated) {
6055
0
        return numa->n_pmds - numa->n_isolated;
6056
0
    }
6057
0
    return 0;
6058
0
}
6059
6060
/* Sort Rx Queues by the processing cycles they are consuming. */
6061
static int
6062
compare_rxq_cycles(const void *a, const void *b)
6063
0
{
6064
0
    struct dp_netdev_rxq *qa;
6065
0
    struct dp_netdev_rxq *qb;
6066
0
    uint64_t cycles_qa, cycles_qb;
6067
6068
0
    qa = *(struct dp_netdev_rxq **) a;
6069
0
    qb = *(struct dp_netdev_rxq **) b;
6070
6071
0
    cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
6072
0
    cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
6073
6074
0
    if (cycles_qa != cycles_qb) {
6075
0
        return (cycles_qa < cycles_qb) ? 1 : -1;
6076
0
    } else {
6077
        /* Cycles are the same so tiebreak on port/queue id.
6078
         * Tiebreaking (as opposed to return 0) ensures consistent
6079
         * sort results across multiple OS's. */
6080
0
        uint32_t port_qa = odp_to_u32(qa->port->port_no);
6081
0
        uint32_t port_qb = odp_to_u32(qb->port->port_no);
6082
0
        if (port_qa != port_qb) {
6083
0
            return port_qa > port_qb ? 1 : -1;
6084
0
        } else {
6085
0
            return netdev_rxq_get_queue_id(qa->rx)
6086
0
                    - netdev_rxq_get_queue_id(qb->rx);
6087
0
        }
6088
0
    }
6089
0
}
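/* Illustrative sketch (not part of dpif-netdev.c): a standalone comparator
 * in the same style as compare_rxq_cycles() above.  It sorts descending by a
 * primary 64-bit key and tiebreaks on a secondary id so that qsort() gives
 * the same order on every platform.  The struct and values are hypothetical. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct sample_rxq {
    uint64_t cycles;   /* Primary key, sorted high to low. */
    int id;            /* Secondary key, sorted low to high. */
};

static int
compare_sample_rxq(const void *a_, const void *b_)
{
    const struct sample_rxq *a = a_;
    const struct sample_rxq *b = b_;

    if (a->cycles != b->cycles) {
        return a->cycles < b->cycles ? 1 : -1;   /* Busiest first. */
    }
    return (a->id > b->id) - (a->id < b->id);    /* Deterministic tiebreak. */
}

int
main(void)
{
    struct sample_rxq q[] = { {100, 2}, {300, 1}, {100, 0} };

    qsort(q, 3, sizeof q[0], compare_sample_rxq);
    for (int i = 0; i < 3; i++) {
        printf("cycles=%llu id=%d\n",
               (unsigned long long) q[i].cycles, q[i].id);
    }
    return 0;   /* Prints 300/1, then 100/0, then 100/2. */
}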
6090
6091
static bool
6092
sched_pmd_new_lowest(struct sched_pmd *current_lowest, struct sched_pmd *pmd,
6093
                     bool has_proc)
6094
0
{
6095
0
    uint64_t current_num, pmd_num;
6096
6097
0
    if (current_lowest == NULL) {
6098
0
        return true;
6099
0
    }
6100
6101
0
    if (has_proc) {
6102
0
        current_num = current_lowest->pmd_proc_cycles;
6103
0
        pmd_num = pmd->pmd_proc_cycles;
6104
0
    } else {
6105
0
        current_num = current_lowest->n_rxq;
6106
0
        pmd_num = pmd->n_rxq;
6107
0
    }
6108
6109
0
    if (pmd_num < current_num) {
6110
0
        return true;
6111
0
    }
6112
0
    return false;
6113
0
}
6114
6115
static struct sched_pmd *
6116
sched_pmd_get_lowest(struct sched_numa *numa, bool has_cyc)
6117
0
{
6118
0
    struct sched_pmd *lowest_sched_pmd = NULL;
6119
6120
0
    for (unsigned i = 0; i < numa->n_pmds; i++) {
6121
0
        struct sched_pmd *sched_pmd;
6122
6123
0
        sched_pmd = &numa->pmds[i];
6124
0
        if (sched_pmd->isolated) {
6125
0
            continue;
6126
0
        }
6127
0
        if (sched_pmd_new_lowest(lowest_sched_pmd, sched_pmd, has_cyc)) {
6128
0
            lowest_sched_pmd = sched_pmd;
6129
0
        }
6130
0
    }
6131
0
    return lowest_sched_pmd;
6132
0
}
6133
6134
/*
6135
 * Returns the next pmd from the numa node.
6136
 *
6137
 * If 'updown' is 'true' it will alternate between selecting the next pmd in
6138
 * either an up or down walk, switching between up/down when the first or last
6139
 * core is reached. e.g. 1,2,3,3,2,1,1,2...
6140
 *
6141
 * If 'updown' is 'false' it will select the next pmd wrapping around when
6142
 * last core reached. e.g. 1,2,3,1,2,3,1,2...
6143
 */
6144
static struct sched_pmd *
6145
sched_pmd_next_rr(struct sched_numa *numa, bool updown)
6146
0
{
6147
0
    int numa_idx = numa->rr_cur_index;
6148
6149
0
    if (numa->rr_idx_inc == true) {
6150
        /* Incrementing through list of pmds. */
6151
0
        if (numa->rr_cur_index == numa->n_pmds - 1) {
6152
            /* Reached the last pmd. */
6153
0
            if (updown) {
6154
0
                numa->rr_idx_inc = false;
6155
0
            } else {
6156
0
                numa->rr_cur_index = 0;
6157
0
            }
6158
0
        } else {
6159
0
            numa->rr_cur_index++;
6160
0
        }
6161
0
    } else {
6162
        /* Decrementing through list of pmds. */
6163
0
        if (numa->rr_cur_index == 0) {
6164
            /* Reached the first pmd. */
6165
0
            numa->rr_idx_inc = true;
6166
0
        } else {
6167
0
            numa->rr_cur_index--;
6168
0
        }
6169
0
    }
6170
0
    return &numa->pmds[numa_idx];
6171
0
}
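/* Illustrative sketch (not part of dpif-netdev.c): the round-robin index walk
 * used by sched_pmd_next_rr() above, reduced to plain integers.  With
 * updown=true the walk bounces off both ends (0,1,2,2,1,0,0,1,...); with
 * updown=false it wraps (0,1,2,0,1,2,...).  The names here are hypothetical. */
#include <stdbool.h>
#include <stdio.h>

struct rr_state {
    int n;        /* Number of pmds. */
    int cur;      /* Index returned on the next call. */
    bool inc;     /* True while walking upwards. */
};

static int
rr_next(struct rr_state *s, bool updown)
{
    int idx = s->cur;

    if (s->inc) {
        if (s->cur == s->n - 1) {
            if (updown) {
                s->inc = false;       /* Bounce: start walking down. */
            } else {
                s->cur = 0;           /* Wrap around to the first pmd. */
            }
        } else {
            s->cur++;
        }
    } else {
        if (s->cur == 0) {
            s->inc = true;            /* Bounce off the first pmd. */
        } else {
            s->cur--;
        }
    }
    return idx;
}

int
main(void)
{
    struct rr_state updown = { .n = 3, .cur = 0, .inc = true };
    struct rr_state wrap = { .n = 3, .cur = 0, .inc = true };

    for (int i = 0; i < 8; i++) {
        printf("%d ", rr_next(&updown, true));    /* 0 1 2 2 1 0 0 1 */
    }
    printf("\n");
    for (int i = 0; i < 8; i++) {
        printf("%d ", rr_next(&wrap, false));     /* 0 1 2 0 1 2 0 1 */
    }
    printf("\n");
    return 0;
}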
6172
6173
static struct sched_pmd *
6174
sched_pmd_next_noniso_rr(struct sched_numa *numa, bool updown)
6175
0
{
6176
0
    struct sched_pmd *sched_pmd = NULL;
6177
6178
    /* sched_pmd_next_rr() may return duplicate PMDs before all PMDs have been
6179
     * returned depending on updown. Call it more than n_pmds times to ensure
6180
     * all PMDs can be searched for the next non-isolated PMD. */
6181
0
    for (unsigned i = 0; i < numa->n_pmds * 2; i++) {
6182
0
        sched_pmd = sched_pmd_next_rr(numa, updown);
6183
0
        if (!sched_pmd->isolated) {
6184
0
            break;
6185
0
        }
6186
0
        sched_pmd = NULL;
6187
0
    }
6188
0
    return sched_pmd;
6189
0
}
6190
6191
static struct sched_pmd *
6192
sched_pmd_next(struct sched_numa *numa, enum sched_assignment_type algo,
6193
               bool has_proc)
6194
0
{
6195
0
    if (algo == SCHED_GROUP) {
6196
0
        return sched_pmd_get_lowest(numa, has_proc);
6197
0
    }
6198
6199
    /* By default RR the PMDs. */
6200
0
    return sched_pmd_next_noniso_rr(numa, algo == SCHED_CYCLES ? true : false);
6201
0
}
6202
6203
static const char *
6204
get_assignment_type_string(enum sched_assignment_type algo)
6205
0
{
6206
0
    switch (algo) {
6207
0
    case SCHED_ROUNDROBIN: return "roundrobin";
6208
0
    case SCHED_CYCLES: return "cycles";
6209
0
    case SCHED_GROUP: return "group";
6210
0
    default: return "Unknown";
6211
0
    }
6212
0
}
6213
6214
0
#define MAX_RXQ_CYC_TEXT 40
6215
0
#define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + MAX_RXQ_CYC_TEXT)
6216
6217
static char *
6218
get_rxq_cyc_log(char *a, enum sched_assignment_type algo, uint64_t cycles)
6219
0
{
6220
0
    int ret = 0;
6221
6222
0
    if (algo != SCHED_ROUNDROBIN) {
6223
0
        ret = snprintf(a, MAX_RXQ_CYC_STRLEN,
6224
0
                       " (measured processing cycles %"PRIu64")", cycles);
6225
0
    }
6226
6227
0
    if (algo == SCHED_ROUNDROBIN || ret <= 0) {
6228
0
        a[0] = '\0';
6229
0
    }
6230
0
    return a;
6231
0
}
6232
6233
static void
6234
sched_numa_list_schedule(struct sched_numa_list *numa_list,
6235
                         struct dp_netdev *dp,
6236
                         enum sched_assignment_type algo,
6237
                         enum vlog_level level)
6238
    OVS_REQ_RDLOCK(dp->port_rwlock)
6239
0
{
6240
0
    struct dp_netdev_port *port;
6241
0
    struct dp_netdev_rxq **rxqs = NULL;
6242
0
    struct sched_numa *last_cross_numa;
6243
0
    unsigned n_rxqs = 0;
6244
0
    bool start_logged = false;
6245
0
    size_t n_numa;
6246
6247
    /* For each port. */
6248
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6249
0
        if (!netdev_is_pmd(port->netdev)) {
6250
0
            continue;
6251
0
        }
6252
6253
        /* For each rxq on the port. */
6254
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
6255
0
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
6256
6257
0
            if (algo != SCHED_ROUNDROBIN) {
6258
0
                uint64_t cycle_hist = 0;
6259
6260
                /* Sum the queue intervals and store the cycle history. */
6261
0
                for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
6262
0
                    cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
6263
0
                }
6264
0
                dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST,
6265
0
                                         cycle_hist);
6266
0
            }
6267
6268
            /* Check if this rxq is pinned. */
6269
0
            if (rxq->core_id != OVS_CORE_UNSPEC) {
6270
0
                struct sched_pmd *sched_pmd;
6271
0
                struct dp_netdev_pmd_thread *pmd;
6272
0
                struct sched_numa *numa;
6273
0
                bool iso = dp->pmd_iso;
6274
0
                uint64_t proc_cycles;
6275
0
                char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
6276
6277
                /* This rxq should be pinned, pin it now. */
6278
0
                pmd = dp_netdev_get_pmd(dp, rxq->core_id);
6279
0
                sched_pmd = sched_pmd_find_by_pmd(numa_list, pmd);
6280
0
                dp_netdev_pmd_unref(pmd);
6281
0
                if (!sched_pmd) {
6282
                    /* Cannot find the PMD.  Cannot pin this rxq. */
6283
0
                    VLOG(level == VLL_DBG ? VLL_DBG : VLL_WARN,
6284
0
                            "Core %2u cannot be pinned with "
6285
0
                            "port \'%s\' rx queue %d. Use pmd-cpu-mask to "
6286
0
                            "enable a pmd on core %u. An alternative core "
6287
0
                            "will be assigned.",
6288
0
                            rxq->core_id,
6289
0
                            netdev_rxq_get_name(rxq->rx),
6290
0
                            netdev_rxq_get_queue_id(rxq->rx),
6291
0
                            rxq->core_id);
6292
0
                    rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
6293
0
                    rxqs[n_rxqs++] = rxq;
6294
0
                    continue;
6295
0
                }
6296
0
                if (iso) {
6297
                    /* Mark PMD as isolated if not done already. */
6298
0
                    if (sched_pmd->isolated == false) {
6299
0
                        sched_pmd->isolated = true;
6300
0
                        numa = sched_pmd->numa;
6301
0
                        numa->n_isolated++;
6302
0
                    }
6303
0
                }
6304
0
                proc_cycles = dp_netdev_rxq_get_cycles(rxq,
6305
0
                                                       RXQ_CYCLES_PROC_HIST);
6306
0
                VLOG(level, "Core %2u on numa node %d is pinned with "
6307
0
                            "port \'%s\' rx queue %d%s",
6308
0
                            sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
6309
0
                            netdev_rxq_get_name(rxq->rx),
6310
0
                            netdev_rxq_get_queue_id(rxq->rx),
6311
0
                            get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
6312
0
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
6313
0
            } else {
6314
0
                rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
6315
0
                rxqs[n_rxqs++] = rxq;
6316
0
            }
6317
0
        }
6318
0
    }
6319
6320
0
    if (n_rxqs > 1 && algo != SCHED_ROUNDROBIN) {
6321
        /* Sort the queues in order of the processing cycles
6322
         * they consumed during their last pmd interval. */
6323
0
        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
6324
0
    }
6325
6326
0
    last_cross_numa = NULL;
6327
0
    n_numa = sched_numa_list_count(numa_list);
6328
0
    for (unsigned i = 0; i < n_rxqs; i++) {
6329
0
        struct dp_netdev_rxq *rxq = rxqs[i];
6330
0
        struct sched_pmd *sched_pmd = NULL;
6331
0
        struct sched_numa *numa;
6332
0
        int port_numa_id;
6333
0
        uint64_t proc_cycles;
6334
0
        char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
6335
6336
0
        if (start_logged == false && level != VLL_DBG) {
6337
0
            VLOG(level, "Performing pmd to rx queue assignment using %s "
6338
0
                        "algorithm.", get_assignment_type_string(algo));
6339
0
            start_logged = true;
6340
0
        }
6341
6342
        /* Store the cycles for this rxq as we will log these later. */
6343
0
        proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST);
6344
6345
0
        port_numa_id = netdev_get_numa_id(rxq->port->netdev);
6346
6347
        /* Select numa. */
6348
0
        numa = sched_numa_list_lookup(numa_list, port_numa_id);
6349
6350
        /* Check if numa has no PMDs or no non-isolated PMDs. */
6351
0
        if (!numa || !sched_numa_noniso_pmd_count(numa)) {
6352
            /* Unable to use this numa to find a PMD. */
6353
0
            numa = NULL;
6354
            /* Find any numa with available PMDs. */
6355
0
            for (int j = 0; j < n_numa; j++) {
6356
0
                numa = sched_numa_list_next(numa_list, last_cross_numa);
6357
0
                last_cross_numa = numa;
6358
0
                if (sched_numa_noniso_pmd_count(numa)) {
6359
0
                    break;
6360
0
                }
6361
0
                numa = NULL;
6362
0
            }
6363
0
        }
6364
6365
0
        if (numa) {
6366
            /* Select the PMD that should be used for this rxq. */
6367
0
            sched_pmd = sched_pmd_next(numa, algo,
6368
0
                                       proc_cycles ? true : false);
6369
0
        }
6370
6371
        /* Check that a pmd has been selected. */
6372
0
        if (sched_pmd) {
6373
0
            int pmd_numa_id;
6374
6375
0
            pmd_numa_id = sched_pmd->numa->numa_id;
6376
            /* Check if selected pmd numa matches port numa. */
6377
0
            if (pmd_numa_id != port_numa_id) {
6378
0
                VLOG(level, "There's no available (non-isolated) pmd thread "
6379
0
                            "on numa node %d. Port \'%s\' rx queue %d will "
6380
0
                            "be assigned to a pmd on numa node %d. "
6381
0
                            "This may lead to reduced performance.",
6382
0
                            port_numa_id, netdev_rxq_get_name(rxq->rx),
6383
0
                            netdev_rxq_get_queue_id(rxq->rx), pmd_numa_id);
6384
0
            }
6385
0
            VLOG(level, "Core %2u on numa node %d assigned port \'%s\' "
6386
0
                        "rx queue %d%s.",
6387
0
                        sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
6388
0
                        netdev_rxq_get_name(rxq->rx),
6389
0
                        netdev_rxq_get_queue_id(rxq->rx),
6390
0
                        get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
6391
0
            sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
6392
0
        } else {
6393
0
            VLOG(level == VLL_DBG ? level : VLL_WARN,
6394
0
                 "No non-isolated pmd on any numa available for "
6395
0
                 "port \'%s\' rx queue %d%s. "
6396
0
                 "This rx queue will not be polled.",
6397
0
                 netdev_rxq_get_name(rxq->rx),
6398
0
                 netdev_rxq_get_queue_id(rxq->rx),
6399
0
                 get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
6400
0
        }
6401
0
    }
6402
0
    free(rxqs);
6403
0
}
6404
6405
static void
6406
rxq_scheduling(struct dp_netdev *dp)
6407
    OVS_REQ_RDLOCK(dp->port_rwlock)
6408
0
{
6409
0
    struct sched_numa_list numa_list;
6410
0
    enum sched_assignment_type algo = dp->pmd_rxq_assign_type;
6411
6412
0
    sched_numa_list_populate(&numa_list, dp);
6413
0
    sched_numa_list_schedule(&numa_list, dp, algo, VLL_INFO);
6414
0
    sched_numa_list_put_in_place(&numa_list);
6415
6416
0
    sched_numa_list_free_entries(&numa_list);
6417
0
}
6418
6419
static uint64_t variance(uint64_t a[], int n);
6420
6421
static uint64_t
6422
sched_numa_variance(struct sched_numa *numa)
6423
0
{
6424
0
    uint64_t *percent_busy = NULL;
6425
0
    int n_proc = 0;
6426
0
    uint64_t var;
6427
6428
0
    percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy);
6429
6430
0
    for (unsigned i = 0; i < numa->n_pmds; i++) {
6431
0
        struct sched_pmd *sched_pmd;
6432
0
        uint64_t total_cycles = 0;
6433
6434
0
        sched_pmd = &numa->pmds[i];
6435
        /* Exclude isolated PMDs from variance calculations. */
6436
0
        if (sched_pmd->isolated == true) {
6437
0
            continue;
6438
0
        }
6439
        /* Get the total pmd cycles for an interval. */
6440
0
        atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles);
6441
6442
0
        if (total_cycles) {
6443
            /* Estimate the cycles to cover all intervals. */
6444
0
            total_cycles *= PMD_INTERVAL_MAX;
6445
0
            percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
6446
0
                                            / total_cycles;
6447
0
        } else {
6448
0
            percent_busy[n_proc++] = 0;
6449
0
        }
6450
0
    }
6451
0
    var = variance(percent_busy, n_proc);
6452
0
    free(percent_busy);
6453
0
    return var;
6454
0
}
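/* Illustrative sketch (not part of dpif-netdev.c): the per-pmd "percent busy"
 * arithmetic that sched_numa_variance() feeds into variance().  The cycle
 * counts and the interval count of 12 (a stand-in for PMD_INTERVAL_MAX) are
 * hypothetical example values. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    const uint64_t intervals = 12;      /* Stand-in for PMD_INTERVAL_MAX. */
    uint64_t intrvl_cycles = 1000000;   /* Total cycles in one interval. */
    uint64_t proc_cycles = 9000000;     /* Rxq processing cycles summed over
                                         * all intervals. */
    uint64_t total = intrvl_cycles * intervals;
    uint64_t percent_busy = proc_cycles * 100 / total;

    printf("pmd busy: %llu%%\n", (unsigned long long) percent_busy);  /* 75% */
    return 0;
}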
6455
6456
/*
6457
 * This function checks that some basic conditions needed for a rebalance to be
6458
 * effective are met, such as the Rxq scheduling assignment type, at least two
6459
 * non-isolated PMDs, and a PMD polling more than one Rxq. If there was no
6460
 * reconfiguration change since the last check, it reuses the last result.
6461
 *
6462
 * It is not intended to be an inclusive check of every condition that may make
6463
 * a rebalance ineffective. It is done as a quick check so a full
6464
 * pmd_rebalance_dry_run() can be avoided when it is not needed.
6465
 */
6466
static bool
6467
pmd_rebalance_dry_run_needed(struct dp_netdev *dp)
6468
    OVS_REQ_RDLOCK(dp->port_rwlock)
6469
0
{
6470
0
    struct dp_netdev_pmd_thread *pmd;
6471
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
6472
0
    unsigned int cnt = 0;
6473
0
    bool multi_rxq = false;
6474
6475
    /* Check if there was no reconfiguration since last check. */
6476
0
    if (!pmd_alb->recheck_config) {
6477
0
        if (!pmd_alb->do_dry_run) {
6478
0
            VLOG_DBG("PMD auto load balance nothing to do, "
6479
0
                     "no configuration changes since last check.");
6480
0
            return false;
6481
0
        }
6482
0
        return true;
6483
0
    }
6484
0
    pmd_alb->recheck_config = false;
6485
6486
    /* Check for incompatible assignment type. */
6487
0
    if (dp->pmd_rxq_assign_type == SCHED_ROUNDROBIN) {
6488
0
        VLOG_DBG("PMD auto load balance nothing to do, "
6489
0
                 "pmd-rxq-assign=roundrobin assignment type configured.");
6490
0
        return pmd_alb->do_dry_run = false;
6491
0
    }
6492
6493
    /* Check that there are at least 2 non-isolated PMDs and
6494
     * one of them is polling more than one rxq. */
6495
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6496
0
        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
6497
0
            continue;
6498
0
        }
6499
6500
0
        if (hmap_count(&pmd->poll_list) > 1) {
6501
0
            multi_rxq = true;
6502
0
        }
6503
0
        if (cnt && multi_rxq) {
6504
0
            return pmd_alb->do_dry_run = true;
6505
0
        }
6506
0
        cnt++;
6507
0
    }
6508
6509
0
    VLOG_DBG("PMD auto load balance nothing to do, "
6510
0
             "not enough non-isolated PMDs or RxQs.");
6511
0
    return pmd_alb->do_dry_run = false;
6512
0
}
6513
6514
static bool
6515
pmd_rebalance_dry_run(struct dp_netdev *dp)
6516
    OVS_REQ_RDLOCK(dp->port_rwlock)
6517
0
{
6518
0
    struct sched_numa_list numa_list_cur;
6519
0
    struct sched_numa_list numa_list_est;
6520
0
    bool thresh_met = false;
6521
6522
0
    VLOG_DBG("PMD auto load balance performing dry run.");
6523
6524
    /* Populate current assignments. */
6525
0
    sched_numa_list_populate(&numa_list_cur, dp);
6526
0
    sched_numa_list_assignments(&numa_list_cur, dp);
6527
6528
    /* Populate estimated assignments. */
6529
0
    sched_numa_list_populate(&numa_list_est, dp);
6530
0
    sched_numa_list_schedule(&numa_list_est, dp,
6531
0
                             dp->pmd_rxq_assign_type, VLL_DBG);
6532
6533
    /* Check if there is no cross-numa polling or only one numa with PMDs. */
6534
0
    if (!sched_numa_list_cross_numa_polling(&numa_list_est) ||
6535
0
            sched_numa_list_count(&numa_list_est) == 1) {
6536
0
        struct sched_numa *numa_cur;
6537
6538
        /* Calculate variances. */
6539
0
        HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) {
6540
0
            uint64_t current_var, estimate_var;
6541
0
            struct sched_numa *numa_est;
6542
0
            uint64_t improvement = 0;
6543
6544
0
            numa_est = sched_numa_list_lookup(&numa_list_est,
6545
0
                                              numa_cur->numa_id);
6546
0
            if (!numa_est) {
6547
0
                continue;
6548
0
            }
6549
0
            current_var = sched_numa_variance(numa_cur);
6550
0
            estimate_var = sched_numa_variance(numa_est);
6551
0
            if (estimate_var < current_var) {
6552
0
                improvement = ((current_var - estimate_var) * 100)
6553
0
                              / current_var;
6554
0
            }
6555
0
            VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated "
6556
0
                     "variance %"PRIu64". Variance improvement %"PRIu64"%%.",
6557
0
                     numa_cur->numa_id, current_var,
6558
0
                     estimate_var, improvement);
6559
0
            if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
6560
0
                thresh_met = true;
6561
0
            }
6562
0
        }
6563
0
        VLOG_DBG("PMD load variance improvement threshold %u%% is %s.",
6564
0
                 dp->pmd_alb.rebalance_improve_thresh,
6565
0
                 thresh_met ? "met" : "not met");
6566
0
    } else {
6567
0
        VLOG_DBG("PMD auto load balance detected cross-numa polling with "
6568
0
                 "multiple numa nodes. Unable to accurately estimate.");
6569
0
    }
6570
6571
0
    sched_numa_list_free_entries(&numa_list_cur);
6572
0
    sched_numa_list_free_entries(&numa_list_est);
6573
6574
0
    return thresh_met;
6575
0
}
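/* Illustrative sketch (not part of dpif-netdev.c): the variance improvement
 * check used by pmd_rebalance_dry_run() above.  The variances and the 25
 * percent threshold (a stand-in for pmd_alb.rebalance_improve_thresh) are
 * hypothetical example values. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t current_var = 400;     /* Variance of current assignments. */
    uint64_t estimate_var = 280;    /* Variance of estimated assignments. */
    uint64_t improve_thresh = 25;   /* Stand-in for the configured threshold. */
    uint64_t improvement = 0;
    bool thresh_met;

    if (estimate_var < current_var) {
        improvement = (current_var - estimate_var) * 100 / current_var;
    }
    thresh_met = improvement >= improve_thresh;

    printf("improvement %llu%%, threshold %s\n",
           (unsigned long long) improvement,
           thresh_met ? "met" : "not met");   /* 30%, met. */
    return 0;
}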
6576
6577
static void
6578
reload_affected_pmds(struct dp_netdev *dp)
6579
0
{
6580
0
    struct dp_netdev_pmd_thread *pmd;
6581
6582
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6583
0
        if (pmd->need_reload) {
6584
0
            dp_netdev_reload_pmd__(pmd);
6585
0
        }
6586
0
    }
6587
6588
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6589
0
        if (pmd->need_reload) {
6590
0
            if (pmd->core_id != NON_PMD_CORE_ID) {
6591
0
                bool reload;
6592
6593
0
                do {
6594
0
                    atomic_read_explicit(&pmd->reload, &reload,
6595
0
                                         memory_order_acquire);
6596
0
                } while (reload);
6597
0
            }
6598
0
            pmd->need_reload = false;
6599
0
        }
6600
0
    }
6601
0
}
6602
6603
static void
6604
reconfigure_pmd_threads(struct dp_netdev *dp)
6605
    OVS_REQ_RDLOCK(dp->port_rwlock)
6606
0
{
6607
0
    struct dp_netdev_pmd_thread *pmd;
6608
0
    struct ovs_numa_dump *pmd_cores;
6609
0
    struct ovs_numa_info_core *core;
6610
0
    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
6611
0
    struct hmapx_node *node;
6612
0
    bool changed = false;
6613
0
    bool need_to_adjust_static_tx_qids = false;
6614
6615
    /* The pmd threads should be started only if there's a pmd port in the
6616
     * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
6617
     * NR_PMD_THREADS per numa node. */
6618
0
    if (!has_pmd_port(dp)) {
6619
0
        pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
6620
0
    } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
6621
0
        pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
6622
0
    } else {
6623
0
        pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
6624
0
    }
6625
6626
    /* We need to adjust 'static_tx_qid's only if we're reducing the number of
6627
     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
6628
0
    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
6629
        /* Adjustment is required to keep 'static_tx_qid's sequential and
6630
         * avoid possible issues, for example, imbalanced tx queue usage
6631
         * and unnecessary locking caused by remapping on netdev level. */
6632
0
        need_to_adjust_static_tx_qids = true;
6633
0
    }
6634
6635
    /* Check for unwanted pmd threads */
6636
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6637
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
6638
0
            continue;
6639
0
        }
6640
0
        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
6641
0
                                                    pmd->core_id)) {
6642
0
            hmapx_add(&to_delete, pmd);
6643
0
        } else if (need_to_adjust_static_tx_qids) {
6644
0
            atomic_store_relaxed(&pmd->reload_tx_qid, true);
6645
0
            pmd->need_reload = true;
6646
0
        }
6647
0
    }
6648
6649
0
    HMAPX_FOR_EACH (node, &to_delete) {
6650
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
6651
0
        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
6652
0
                  pmd->numa_id, pmd->core_id);
6653
0
        dp_netdev_del_pmd(dp, pmd);
6654
0
    }
6655
0
    changed = !hmapx_is_empty(&to_delete);
6656
0
    hmapx_destroy(&to_delete);
6657
6658
0
    if (need_to_adjust_static_tx_qids) {
6659
        /* 'static_tx_qid's are not sequential now.
6660
         * Reload remaining threads to fix this. */
6661
0
        reload_affected_pmds(dp);
6662
0
    }
6663
6664
    /* Check for required new pmd threads */
6665
0
    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
6666
0
        pmd = dp_netdev_get_pmd(dp, core->core_id);
6667
0
        if (!pmd) {
6668
0
            struct ds name = DS_EMPTY_INITIALIZER;
6669
6670
0
            pmd = xzalloc(sizeof *pmd);
6671
0
            dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
6672
6673
0
            ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
6674
0
            pmd->thread = ovs_thread_create(ds_cstr(&name),
6675
0
                                            pmd_thread_main, pmd);
6676
0
            ds_destroy(&name);
6677
6678
0
            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
6679
0
                      pmd->numa_id, pmd->core_id);
6680
0
            changed = true;
6681
0
        } else {
6682
0
            dp_netdev_pmd_unref(pmd);
6683
0
        }
6684
0
    }
6685
6686
0
    if (changed) {
6687
0
        struct ovs_numa_info_numa *numa;
6688
6689
        /* Log the number of pmd threads per numa node. */
6690
0
        FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
6691
0
            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
6692
0
                      numa->n_cores, numa->numa_id);
6693
0
        }
6694
0
    }
6695
6696
0
    ovs_numa_dump_destroy(pmd_cores);
6697
0
}
6698
6699
static void
6700
pmd_remove_stale_ports(struct dp_netdev *dp,
6701
                       struct dp_netdev_pmd_thread *pmd)
6702
    OVS_EXCLUDED(pmd->port_mutex)
6703
    OVS_REQ_RDLOCK(dp->port_rwlock)
6704
0
{
6705
0
    struct rxq_poll *poll;
6706
0
    struct tx_port *tx;
6707
6708
0
    ovs_mutex_lock(&pmd->port_mutex);
6709
0
    HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
6710
0
        struct dp_netdev_port *port = poll->rxq->port;
6711
6712
0
        if (port->need_reconfigure
6713
0
            || !hmap_contains(&dp->ports, &port->node)) {
6714
0
            dp_netdev_del_rxq_from_pmd(pmd, poll);
6715
0
        }
6716
0
    }
6717
0
    HMAP_FOR_EACH_SAFE (tx, node, &pmd->tx_ports) {
6718
0
        struct dp_netdev_port *port = tx->port;
6719
6720
0
        if (port->need_reconfigure
6721
0
            || !hmap_contains(&dp->ports, &port->node)) {
6722
0
            dp_netdev_del_port_tx_from_pmd(pmd, tx);
6723
0
        }
6724
0
    }
6725
0
    ovs_mutex_unlock(&pmd->port_mutex);
6726
0
}
6727
6728
/* Must be called each time a port is added/removed or the cmask changes.
6729
 * This creates and destroys pmd threads, reconfigures ports, opens their
6730
 * rxqs and assigns all rxqs/txqs to pmd threads. */
6731
static void
6732
reconfigure_datapath(struct dp_netdev *dp)
6733
    OVS_REQ_RDLOCK(dp->port_rwlock)
6734
0
{
6735
0
    struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
6736
0
    struct dp_netdev_pmd_thread *pmd;
6737
0
    struct dp_netdev_port *port;
6738
0
    int wanted_txqs;
6739
6740
0
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
6741
6742
    /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
6743
     * on the system and the user configuration. */
6744
0
    reconfigure_pmd_threads(dp);
6745
6746
0
    wanted_txqs = cmap_count(&dp->poll_threads);
6747
6748
    /* The number of pmd threads might have changed, or a port can be new:
6749
     * adjust the txqs. */
6750
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6751
0
        netdev_set_tx_multiq(port->netdev, wanted_txqs);
6752
0
    }
6753
6754
    /* Step 2: Remove from the pmd threads ports that have been removed or
6755
     * need reconfiguration. */
6756
6757
    /* Check for all the ports that need reconfiguration.  We cache this in
6758
     * 'port->need_reconfigure', because netdev_is_reconf_required() can
6759
     * change at any time.
6760
     * Also mark for reconfiguration all ports which will likely change their
6761
     * 'txq_mode' parameter.  It's required to stop using them before
6762
     * changing this setting and it's simpler to mark ports here and allow
6763
     * 'pmd_remove_stale_ports' to remove them from threads.  There will be
6764
     * no actual reconfiguration in 'port_reconfigure' because it's
6765
     * unnecessary.  */
6766
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6767
0
        if (netdev_is_reconf_required(port->netdev)
6768
0
            || ((port->txq_mode == TXQ_MODE_XPS)
6769
0
                != (netdev_n_txq(port->netdev) < wanted_txqs))
6770
0
            || ((port->txq_mode == TXQ_MODE_XPS_HASH)
6771
0
                != (port->txq_requested_mode == TXQ_REQ_MODE_HASH
6772
0
                    && netdev_n_txq(port->netdev) > 1))) {
6773
0
            port->need_reconfigure = true;
6774
0
        }
6775
0
    }
6776
6777
    /* Remove from the pmd threads all the ports that have been deleted or
6778
     * need reconfiguration. */
6779
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6780
0
        pmd_remove_stale_ports(dp, pmd);
6781
0
    }
6782
6783
    /* Reload affected pmd threads.  We must wait for the pmd threads before
6784
     * reconfiguring the ports, because a port cannot be reconfigured while
6785
     * it's being used. */
6786
0
    reload_affected_pmds(dp);
6787
6788
    /* Step 3: Reconfigure ports. */
6789
6790
    /* We only reconfigure the ports that we determined above, because they're
6791
     * not being used by any pmd thread at the moment.  If a port fails to
6792
     * reconfigure we remove it from the datapath. */
6793
0
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
6794
0
        int err;
6795
6796
0
        if (!port->need_reconfigure) {
6797
0
            continue;
6798
0
        }
6799
6800
0
        err = port_reconfigure(port);
6801
0
        if (err) {
6802
0
            hmap_remove(&dp->ports, &port->node);
6803
0
            seq_change(dp->port_seq);
6804
0
            port_destroy(port);
6805
0
        } else {
6806
            /* With a single queue, there is no point in using hash mode. */
6807
0
            if (port->txq_requested_mode == TXQ_REQ_MODE_HASH &&
6808
0
                netdev_n_txq(port->netdev) > 1) {
6809
0
                port->txq_mode = TXQ_MODE_XPS_HASH;
6810
0
            } else if (netdev_n_txq(port->netdev) < wanted_txqs) {
6811
0
                port->txq_mode = TXQ_MODE_XPS;
6812
0
            } else {
6813
0
                port->txq_mode = TXQ_MODE_STATIC;
6814
0
            }
6815
0
        }
6816
0
    }
6817
6818
    /* Step 4: Compute new rxq scheduling.  We don't touch the pmd threads
6819
     * for now, we just update the 'pmd' pointer in each rxq to point to the
6820
     * wanted thread according to the scheduling policy. */
6821
6822
    /* Reset all the pmd threads to non isolated. */
6823
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6824
0
        pmd->isolated = false;
6825
0
    }
6826
6827
    /* Reset all the queues to unassigned */
6828
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6829
0
        for (int i = 0; i < port->n_rxq; i++) {
6830
0
            port->rxqs[i].pmd = NULL;
6831
0
        }
6832
0
    }
6833
0
    rxq_scheduling(dp);
6834
6835
    /* Step 5: Remove queues not compliant with new scheduling. */
6836
6837
    /* Count all the threads that will have at least one queue to poll. */
6838
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6839
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
6840
0
            struct dp_netdev_rxq *q = &port->rxqs[qid];
6841
6842
0
            if (q->pmd) {
6843
0
                hmapx_add(&busy_threads, q->pmd);
6844
0
            }
6845
0
        }
6846
0
    }
6847
6848
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6849
0
        struct rxq_poll *poll;
6850
6851
0
        ovs_mutex_lock(&pmd->port_mutex);
6852
0
        HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
6853
0
            if (poll->rxq->pmd != pmd) {
6854
0
                dp_netdev_del_rxq_from_pmd(pmd, poll);
6855
6856
                /* This pmd might sleep after this step if it has no rxq
6857
                 * remaining. Tell it to busy wait for new assignment if it
6858
                 * has at least one scheduled queue. */
6859
0
                if (hmap_count(&pmd->poll_list) == 0 &&
6860
0
                    hmapx_contains(&busy_threads, pmd)) {
6861
0
                    atomic_store_relaxed(&pmd->wait_for_reload, true);
6862
0
                }
6863
0
            }
6864
0
        }
6865
0
        ovs_mutex_unlock(&pmd->port_mutex);
6866
0
    }
6867
6868
0
    hmapx_destroy(&busy_threads);
6869
6870
    /* Reload affected pmd threads.  We must wait for the pmd threads to remove
6871
     * the old queues before re-adding them, otherwise a queue can be polled by
6872
     * two threads at the same time. */
6873
0
    reload_affected_pmds(dp);
6874
6875
    /* Step 6: Add queues from scheduling, if they're not there already. */
6876
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6877
0
        if (!netdev_is_pmd(port->netdev)) {
6878
0
            continue;
6879
0
        }
6880
6881
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
6882
0
            struct dp_netdev_rxq *q = &port->rxqs[qid];
6883
6884
0
            if (q->pmd) {
6885
0
                ovs_mutex_lock(&q->pmd->port_mutex);
6886
0
                dp_netdev_add_rxq_to_pmd(q->pmd, q);
6887
0
                ovs_mutex_unlock(&q->pmd->port_mutex);
6888
0
            }
6889
0
        }
6890
0
    }
6891
6892
    /* Add every port and bond to the tx port and bond caches of
6893
     * every pmd thread, if it's not there already and if this pmd
6894
     * has at least one rxq to poll.
6895
     */
6896
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6897
0
        ovs_mutex_lock(&pmd->port_mutex);
6898
0
        if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
6899
0
            struct tx_bond *bond;
6900
6901
0
            HMAP_FOR_EACH (port, node, &dp->ports) {
6902
0
                dp_netdev_add_port_tx_to_pmd(pmd, port);
6903
0
            }
6904
6905
0
            CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
6906
0
                dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
6907
0
            }
6908
0
        }
6909
0
        ovs_mutex_unlock(&pmd->port_mutex);
6910
0
    }
6911
6912
    /* Reload affected pmd threads. */
6913
0
    reload_affected_pmds(dp);
6914
6915
    /* PMD ALB will need to recheck if dry run needed. */
6916
0
    dp->pmd_alb.recheck_config = true;
6917
0
}
6918
6919
/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
6920
static bool
6921
ports_require_restart(const struct dp_netdev *dp)
6922
    OVS_REQ_RDLOCK(dp->port_rwlock)
6923
0
{
6924
0
    struct dp_netdev_port *port;
6925
6926
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6927
0
        if (netdev_is_reconf_required(port->netdev)) {
6928
0
            return true;
6929
0
        }
6930
0
    }
6931
6932
0
    return false;
6933
0
}
6934
6935
/* Calculates variance in the values stored in array 'a'. 'n' is the number
6936
 * of elements in the array to be considered for calculating variance.
6937
 * Usage example: data array 'a' contains the processing load of each pmd and
6938
 * 'n' is the number of PMDs. It returns the variance in processing load of
6939
 * PMDs. */
6940
static uint64_t
6941
variance(uint64_t a[], int n)
6942
0
{
6943
    /* Compute mean (average of elements). */
6944
0
    uint64_t sum = 0;
6945
0
    uint64_t mean = 0;
6946
0
    uint64_t sqDiff = 0;
6947
6948
0
    if (!n) {
6949
0
        return 0;
6950
0
    }
6951
6952
0
    for (int i = 0; i < n; i++) {
6953
0
        sum += a[i];
6954
0
    }
6955
6956
0
    if (sum) {
6957
0
        mean = sum / n;
6958
6959
        /* Compute sum squared differences with mean. */
6960
0
        for (int i = 0; i < n; i++) {
6961
0
            sqDiff += (a[i] - mean)*(a[i] - mean);
6962
0
        }
6963
0
    }
6964
0
    return (sqDiff ? (sqDiff / n) : 0);
6965
0
}
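/* Illustrative sketch (not part of dpif-netdev.c): the same integer variance
 * computation as variance() above, applied to a small example load array.
 * The values are hypothetical. */
#include <stdint.h>
#include <stdio.h>

static uint64_t
variance_example(const uint64_t a[], int n)
{
    uint64_t sum = 0, mean, sq_diff = 0;

    if (!n) {
        return 0;
    }
    for (int i = 0; i < n; i++) {
        sum += a[i];
    }
    mean = sum / n;
    for (int i = 0; i < n; i++) {
        sq_diff += (a[i] - mean) * (a[i] - mean);
    }
    return sq_diff / n;
}

int
main(void)
{
    uint64_t load[] = { 90, 10, 50, 50 };   /* Percent busy per pmd. */

    /* mean = 50, squared diffs = 1600 + 1600 + 0 + 0, variance = 800. */
    printf("variance = %llu\n",
           (unsigned long long) variance_example(load, 4));
    return 0;
}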
6966
6967
/* Return true if needs to revalidate datapath flows. */
6968
static bool
6969
dpif_netdev_run(struct dpif *dpif)
6970
0
{
6971
0
    struct dp_netdev_port *port;
6972
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6973
0
    struct dp_netdev_pmd_thread *non_pmd;
6974
0
    uint64_t new_tnl_seq;
6975
0
    bool need_to_flush = true;
6976
0
    bool pmd_rebalance = false;
6977
0
    long long int now = time_msec();
6978
0
    struct dp_netdev_pmd_thread *pmd;
6979
6980
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
6981
0
    non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
6982
0
    if (non_pmd) {
6983
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
6984
6985
0
        atomic_read_relaxed(&dp->smc_enable_db, &non_pmd->ctx.smc_enable_db);
6986
6987
0
        HMAP_FOR_EACH (port, node, &dp->ports) {
6988
0
            if (!netdev_is_pmd(port->netdev)) {
6989
0
                int i;
6990
6991
0
                if (port->emc_enabled) {
6992
0
                    atomic_read_relaxed(&dp->emc_insert_min,
6993
0
                                        &non_pmd->ctx.emc_insert_min);
6994
0
                } else {
6995
0
                    non_pmd->ctx.emc_insert_min = 0;
6996
0
                }
6997
6998
0
                for (i = 0; i < port->n_rxq; i++) {
6999
7000
0
                    if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
7001
0
                        continue;
7002
0
                    }
7003
7004
0
                    if (dp_netdev_process_rxq_port(non_pmd,
7005
0
                                                   &port->rxqs[i],
7006
0
                                                   port->port_no)) {
7007
0
                        need_to_flush = false;
7008
0
                    }
7009
0
                }
7010
0
            }
7011
0
        }
7012
0
        if (need_to_flush) {
7013
            /* We didn't receive anything in the process loop.
7014
             * Check if we need to send something.
7015
             * There were no time updates on the current iteration. */
7016
0
            pmd_thread_ctx_time_update(non_pmd);
7017
0
            dp_netdev_pmd_flush_output_packets(non_pmd, false);
7018
0
        }
7019
7020
0
        dpif_netdev_xps_revalidate_pmd(non_pmd, false);
7021
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
7022
7023
0
        dp_netdev_pmd_unref(non_pmd);
7024
0
    }
7025
7026
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
7027
0
    if (pmd_alb->is_enabled) {
7028
0
        if (!pmd_alb->rebalance_poll_timer) {
7029
0
            pmd_alb->rebalance_poll_timer = now;
7030
0
        } else if ((pmd_alb->rebalance_poll_timer +
7031
0
                   pmd_alb->rebalance_intvl) < now) {
7032
0
            pmd_alb->rebalance_poll_timer = now;
7033
0
            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
7034
0
                if (atomic_count_get(&pmd->pmd_overloaded) >=
7035
0
                                    PMD_INTERVAL_MAX) {
7036
0
                    pmd_rebalance = true;
7037
0
                    break;
7038
0
                }
7039
0
            }
7040
7041
0
            if (pmd_rebalance &&
7042
0
                !dp_netdev_is_reconf_required(dp) &&
7043
0
                !ports_require_restart(dp) &&
7044
0
                pmd_rebalance_dry_run_needed(dp) &&
7045
0
                pmd_rebalance_dry_run(dp)) {
7046
0
                VLOG_INFO("PMD auto load balance dry run. "
7047
0
                          "Requesting datapath reconfigure.");
7048
0
                dp_netdev_request_reconfigure(dp);
7049
0
            }
7050
0
        }
7051
0
    }
7052
7053
0
    if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
7054
0
        reconfigure_datapath(dp);
7055
0
    }
7056
0
    ovs_rwlock_unlock(&dp->port_rwlock);
7057
7058
0
    tnl_neigh_cache_run();
7059
0
    tnl_port_map_run();
7060
0
    new_tnl_seq = seq_read(tnl_conf_seq);
7061
7062
0
    if (dp->last_tnl_conf_seq != new_tnl_seq) {
7063
0
        dp->last_tnl_conf_seq = new_tnl_seq;
7064
0
        return true;
7065
0
    }
7066
0
    return false;
7067
0
}
7068
7069
static void
7070
dpif_netdev_wait(struct dpif *dpif)
7071
0
{
7072
0
    struct dp_netdev_port *port;
7073
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7074
7075
0
    ovs_mutex_lock(&dp_netdev_mutex);
7076
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
7077
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
7078
0
        netdev_wait_reconf_required(port->netdev);
7079
0
        if (!netdev_is_pmd(port->netdev)) {
7080
0
            int i;
7081
7082
0
            for (i = 0; i < port->n_rxq; i++) {
7083
0
                netdev_rxq_wait(port->rxqs[i].rx);
7084
0
            }
7085
0
        }
7086
0
    }
7087
0
    ovs_rwlock_unlock(&dp->port_rwlock);
7088
0
    ovs_mutex_unlock(&dp_netdev_mutex);
7089
0
    seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
7090
0
}
7091
7092
static void
7093
pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
7094
0
{
7095
0
    struct tx_port *tx_port_cached;
7096
7097
    /* Flush all the queued packets. */
7098
0
    dp_netdev_pmd_flush_output_packets(pmd, true);
7099
    /* Free all used tx queue ids. */
7100
0
    dpif_netdev_xps_revalidate_pmd(pmd, true);
7101
7102
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
7103
0
        free(tx_port_cached->txq_pkts);
7104
0
        free(tx_port_cached);
7105
0
    }
7106
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
7107
0
        free(tx_port_cached->txq_pkts);
7108
0
        free(tx_port_cached);
7109
0
    }
7110
0
}
7111
7112
/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
7113
 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
7114
 * device, otherwise to 'pmd->send_port_cache' if the port has at least
7115
 * one txq. */
7116
static void
7117
pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
7118
    OVS_REQUIRES(pmd->port_mutex)
7119
0
{
7120
0
    struct tx_port *tx_port, *tx_port_cached;
7121
7122
0
    pmd_free_cached_ports(pmd);
7123
0
    hmap_shrink(&pmd->send_port_cache);
7124
0
    hmap_shrink(&pmd->tnl_port_cache);
7125
7126
0
    HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
7127
0
        int n_txq = netdev_n_txq(tx_port->port->netdev);
7128
0
        struct dp_packet_batch *txq_pkts_cached;
7129
7130
0
        if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
7131
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
7132
0
            if (tx_port->txq_pkts) {
7133
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
7134
0
                                          n_txq * sizeof *tx_port->txq_pkts);
7135
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
7136
0
            }
7137
0
            hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
7138
0
                        hash_port_no(tx_port_cached->port->port_no));
7139
0
        }
7140
7141
0
        if (n_txq) {
7142
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
7143
0
            if (tx_port->txq_pkts) {
7144
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
7145
0
                                          n_txq * sizeof *tx_port->txq_pkts);
7146
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
7147
0
            }
7148
0
            hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
7149
0
                        hash_port_no(tx_port_cached->port->port_no));
7150
0
        }
7151
0
    }
7152
0
}
7153
7154
static void
7155
pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
7156
0
{
7157
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
7158
0
    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
7159
0
        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
7160
0
                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
7161
0
    }
7162
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
7163
7164
0
    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
7165
0
             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
7166
0
}
7167
7168
static void
7169
pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
7170
0
{
7171
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
7172
0
    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
7173
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
7174
0
}
7175
7176
static int
7177
pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
7178
                          struct polled_queue **ppoll_list)
7179
0
{
7180
0
    struct polled_queue *poll_list = *ppoll_list;
7181
0
    struct rxq_poll *poll;
7182
0
    int i;
7183
7184
0
    ovs_mutex_lock(&pmd->port_mutex);
7185
0
    poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
7186
0
                                    * sizeof *poll_list);
7187
7188
0
    i = 0;
7189
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
7190
0
        poll_list[i].rxq = poll->rxq;
7191
0
        poll_list[i].port_no = poll->rxq->port->port_no;
7192
0
        poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
7193
0
        poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
7194
0
        poll_list[i].change_seq =
7195
0
                     netdev_get_change_seq(poll->rxq->port->netdev);
7196
0
        i++;
7197
0
    }
7198
7199
0
    pmd_load_cached_ports(pmd);
7200
7201
0
    ovs_mutex_unlock(&pmd->port_mutex);
7202
7203
0
    *ppoll_list = poll_list;
7204
0
    return i;
7205
0
}
7206
7207
static void *
7208
pmd_thread_main(void *f_)
7209
0
{
7210
0
    struct dp_netdev_pmd_thread *pmd = f_;
7211
0
    struct pmd_perf_stats *s = &pmd->perf_stats;
7212
0
    unsigned int lc = 0;
7213
0
    struct polled_queue *poll_list;
7214
0
    bool wait_for_reload = false;
7215
0
    bool dpdk_attached;
7216
0
    bool reload_tx_qid;
7217
0
    bool exiting;
7218
0
    bool reload;
7219
0
    int poll_cnt;
7220
0
    int i;
7221
0
    int process_packets = 0;
7222
0
    uint64_t sleep_time = 0;
7223
7224
0
    poll_list = NULL;
7225
7226
    /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
7227
0
    ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
7228
0
    ovs_numa_thread_setaffinity_core(pmd->core_id);
7229
0
    dpdk_attached = dpdk_attach_thread(pmd->core_id);
7230
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
7231
0
    dfc_cache_init(&pmd->flow_cache);
7232
0
    pmd_alloc_static_tx_qid(pmd);
7233
0
    set_timer_resolution(PMD_TIMER_RES_NS);
7234
7235
0
reload:
7236
0
    atomic_count_init(&pmd->pmd_overloaded, 0);
7237
7238
0
    pmd->intrvl_tsc_prev = 0;
7239
0
    atomic_store_relaxed(&pmd->intrvl_cycles, 0);
7240
7241
0
    if (!dpdk_attached) {
7242
0
        dpdk_attached = dpdk_attach_thread(pmd->core_id);
7243
0
    }
7244
7245
    /* List port/core affinity */
7246
0
    for (i = 0; i < poll_cnt; i++) {
7247
0
       VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
7248
0
                pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
7249
0
                netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
7250
       /* Reset the rxq current cycles counter. */
7251
0
       dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
7252
0
       for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
7253
0
           dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0);
7254
0
       }
7255
0
    }
7256
7257
0
    if (!poll_cnt) {
7258
0
        if (wait_for_reload) {
7259
            /* Don't sleep, control thread will ask for a reload shortly. */
7260
0
            do {
7261
0
                atomic_read_explicit(&pmd->reload, &reload,
7262
0
                                     memory_order_acquire);
7263
0
            } while (!reload);
7264
0
        } else {
7265
0
            while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
7266
0
                seq_wait(pmd->reload_seq, pmd->last_reload_seq);
7267
0
                poll_block();
7268
0
            }
7269
0
        }
7270
0
    }
7271
7272
0
    for (i = 0; i < PMD_INTERVAL_MAX; i++) {
7273
0
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0);
7274
0
    }
7275
0
    atomic_count_set(&pmd->intrvl_idx, 0);
7276
0
    cycles_counter_update(s);
7277
7278
0
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
7279
7280
    /* Protect pmd stats from external clearing while polling. */
7281
0
    ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
7282
0
    for (;;) {
7283
0
        uint64_t rx_packets = 0, tx_packets = 0;
7284
0
        uint64_t time_slept = 0;
7285
0
        uint64_t max_sleep;
7286
7287
0
        pmd_perf_start_iteration(s);
7288
7289
0
        atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db);
7290
0
        atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
7291
7292
0
        for (i = 0; i < poll_cnt; i++) {
7293
7294
0
            if (!poll_list[i].rxq_enabled) {
7295
0
                continue;
7296
0
            }
7297
7298
0
            if (poll_list[i].emc_enabled) {
7299
0
                atomic_read_relaxed(&pmd->dp->emc_insert_min,
7300
0
                                    &pmd->ctx.emc_insert_min);
7301
0
            } else {
7302
0
                pmd->ctx.emc_insert_min = 0;
7303
0
            }
7304
7305
0
            process_packets =
7306
0
                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
7307
0
                                           poll_list[i].port_no);
7308
0
            rx_packets += process_packets;
7309
0
            if (process_packets >= PMD_SLEEP_THRESH) {
7310
0
                sleep_time = 0;
7311
0
            }
7312
0
        }
7313
7314
0
        if (!rx_packets) {
7315
            /* We didn't receive anything in the process loop.
7316
             * Check if we need to send something.
7317
             * There were no time updates on the current iteration. */
7318
0
            pmd_thread_ctx_time_update(pmd);
7319
0
            tx_packets = dp_netdev_pmd_flush_output_packets(pmd,
7320
0
                                                   max_sleep && sleep_time
7321
0
                                                   ? true : false);
7322
0
        }
7323
7324
0
        if (max_sleep) {
7325
            /* Check if a sleep should happen on this iteration. */
7326
0
            if (sleep_time) {
7327
0
                struct cycle_timer sleep_timer;
7328
7329
0
                cycle_timer_start(&pmd->perf_stats, &sleep_timer);
7330
0
                xnanosleep_no_quiesce(sleep_time * 1000);
7331
0
                time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer);
7332
0
                pmd_thread_ctx_time_update(pmd);
7333
0
            }
7334
0
            if (sleep_time < max_sleep) {
7335
                /* Increase sleep time for next iteration. */
7336
0
                sleep_time += PMD_SLEEP_INC_US;
7337
0
            } else {
7338
0
                sleep_time = max_sleep;
7339
0
            }
7340
0
        } else {
7341
            /* Reset sleep time as max sleep policy may have been changed. */
7342
0
            sleep_time = 0;
7343
0
        }
7344
7345
        /* Do RCU synchronization at fixed interval.  This ensures that
7346
         * synchronization would not be delayed long even at high load of
7347
         * packet processing. */
7348
0
        if (pmd->ctx.now > pmd->next_rcu_quiesce) {
7349
0
            if (!ovsrcu_try_quiesce()) {
7350
0
                pmd->next_rcu_quiesce =
7351
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
7352
0
            }
7353
0
        }
7354
7355
0
        if (lc++ > 1024) {
7356
0
            lc = 0;
7357
7358
0
            coverage_try_clear();
7359
0
            dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
7360
0
            if (!ovsrcu_try_quiesce()) {
7361
0
                emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
7362
0
                pmd->next_rcu_quiesce =
7363
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
7364
0
            }
7365
7366
0
            for (i = 0; i < poll_cnt; i++) {
7367
0
                uint64_t current_seq =
7368
0
                         netdev_get_change_seq(poll_list[i].rxq->port->netdev);
7369
0
                if (poll_list[i].change_seq != current_seq) {
7370
0
                    poll_list[i].change_seq = current_seq;
7371
0
                    poll_list[i].rxq_enabled =
7372
0
                                 netdev_rxq_enabled(poll_list[i].rxq->rx);
7373
0
                }
7374
0
            }
7375
0
        }
7376
7377
0
        atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
7378
0
        if (OVS_UNLIKELY(reload)) {
7379
0
            break;
7380
0
        }
7381
7382
0
        pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept,
7383
0
                               pmd_perf_metrics_enabled(pmd));
7384
0
    }
7385
0
    ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
7386
7387
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
7388
0
    atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
7389
0
    atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
7390
0
    atomic_read_relaxed(&pmd->exit, &exiting);
7391
    /* Signal here to make sure the pmd finishes
7392
     * reloading the updated configuration. */
7393
0
    dp_netdev_pmd_reload_done(pmd);
7394
7395
0
    if (reload_tx_qid) {
7396
0
        pmd_free_static_tx_qid(pmd);
7397
0
        pmd_alloc_static_tx_qid(pmd);
7398
0
    }
7399
7400
0
    if (!exiting) {
7401
0
        goto reload;
7402
0
    }
7403
7404
0
    pmd_free_static_tx_qid(pmd);
7405
0
    dfc_cache_uninit(&pmd->flow_cache);
7406
0
    free(poll_list);
7407
0
    pmd_free_cached_ports(pmd);
7408
0
    if (dpdk_attached) {
7409
0
        dpdk_detach_thread();
7410
0
    }
7411
0
    return NULL;
7412
0
}
7413
7414
static void
7415
dp_netdev_disable_upcall(struct dp_netdev *dp)
7416
    OVS_ACQUIRES(dp->upcall_rwlock)
7417
0
{
7418
0
    fat_rwlock_wrlock(&dp->upcall_rwlock);
7419
0
}
7420
7421

7422
/* Meters */
7423
static void
7424
dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
7425
                               struct ofputil_meter_features *features)
7426
0
{
7427
0
    features->max_meters = MAX_METERS;
7428
0
    features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
7429
0
    features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
7430
0
    features->max_bands = MAX_BANDS;
7431
0
    features->max_color = 0;
7432
0
}
7433
7434
/* Tries to atomically add 'n' to 'value' in terms of saturation arithmetic,
7435
 * i.e., if the result will be larger than 'max_value', will store 'max_value'
7436
 * instead. */
7437
static void
7438
atomic_sat_add(atomic_uint64_t *value, uint64_t n, uint64_t max_value)
7439
0
{
7440
0
    uint64_t current, new_value;
7441
7442
0
    atomic_read_relaxed(value, &current);
7443
0
    do {
7444
0
        new_value = current + n;
7445
0
        new_value = MIN(new_value, max_value);
7446
0
    } while (!atomic_compare_exchange_weak_relaxed(value, &current,
7447
0
                                                   new_value));
7448
0
}
7449
7450
/* Tries to atomically subtract 'n' from 'value'.  Does not perform the
7451
 * operation and returns 'false' if the result will be less than 'min_value'.
7452
 * Otherwise, stores the result and returns 'true'. */
7453
static bool
7454
atomic_bound_sub(atomic_uint64_t *value, uint64_t n, uint64_t min_value)
7455
0
{
7456
0
    uint64_t current;
7457
7458
0
    atomic_read_relaxed(value, &current);
7459
0
    do {
7460
0
        if (current < min_value + n) {
7461
0
            return false;
7462
0
        }
7463
0
    } while (!atomic_compare_exchange_weak_relaxed(value, &current,
7464
0
                                                   current - n));
7465
0
    return true;
7466
0
}
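Taken together, atomic_sat_add() and atomic_bound_sub() implement a lock-free token bucket: refills saturate at the bucket capacity, and a drain succeeds only while enough tokens remain.  The following is a minimal standalone sketch of the same compare-and-swap pattern, written with plain C11 <stdatomic.h> instead of OVS's atomic wrappers; bucket_refill() and bucket_try_drain() are hypothetical names and assume a minimum of zero.

/* Standalone sketch of the saturating-add / bounded-subtract pattern
 * above, using C11 atomics.  Hypothetical helpers, not part of OVS. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Add 'n' tokens, saturating at 'capacity'. */
static void
bucket_refill(_Atomic uint64_t *bucket, uint64_t n, uint64_t capacity)
{
    uint64_t cur = atomic_load_explicit(bucket, memory_order_relaxed);
    uint64_t next;

    do {
        next = cur + n;
        next = next > capacity ? capacity : next;
    } while (!atomic_compare_exchange_weak_explicit(bucket, &cur, next,
                                                    memory_order_relaxed,
                                                    memory_order_relaxed));
}

/* Take 'n' tokens only if that many are available. */
static bool
bucket_try_drain(_Atomic uint64_t *bucket, uint64_t n)
{
    uint64_t cur = atomic_load_explicit(bucket, memory_order_relaxed);

    do {
        if (cur < n) {
            return false;       /* Not enough tokens; the caller drops. */
        }
    } while (!atomic_compare_exchange_weak_explicit(bucket, &cur, cur - n,
                                                    memory_order_relaxed,
                                                    memory_order_relaxed));
    return true;
}

int
main(void)
{
    _Atomic uint64_t bucket = 1000;

    bucket_refill(&bucket, 5000, 2000);      /* Saturates at 2000. */
    printf("drain 1500: %d\n", bucket_try_drain(&bucket, 1500));   /* 1 */
    printf("drain 1500: %d\n", bucket_try_drain(&bucket, 1500));   /* 0 */
    return 0;
}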
7467
7468
/* Applies the meter identified by 'meter_id' to 'packets_'.  Packets
7469
 * that exceed a band are dropped in-place. */
7470
static void
7471
dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
7472
                    uint32_t meter_id, long long int now_ms)
7473
0
{
7474
0
    const size_t cnt = dp_packet_batch_size(packets_);
7475
0
    uint32_t exceeded_rate[NETDEV_MAX_BURST];
7476
0
    uint32_t exceeded_band[NETDEV_MAX_BURST];
7477
0
    uint64_t bytes, volume, meter_used, old;
7478
0
    uint64_t band_packets[MAX_BANDS];
7479
0
    uint64_t band_bytes[MAX_BANDS];
7480
0
    struct dp_meter_band *band;
7481
0
    struct dp_packet *packet;
7482
0
    struct dp_meter *meter;
7483
0
    bool exceeded = false;
7484
7485
0
    if (meter_id >= MAX_METERS) {
7486
0
        return;
7487
0
    }
7488
7489
0
    meter = dp_meter_lookup(&dp->meters, meter_id);
7490
0
    if (!meter) {
7491
0
        return;
7492
0
    }
7493
7494
    /* Initialize to all-ones (UINT32_MAX), meaning "no band exceeded". */
7495
0
    memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
7496
    /* Initialize as zeroes. */
7497
0
    memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
7498
7499
0
    atomic_read_relaxed(&meter->used, &meter_used);
7500
0
    do {
7501
0
        if (meter_used >= now_ms) {
7502
            /* The '>' condition means that we have several threads hitting the
7503
             * same meter, and the other one already advanced the time. */
7504
0
            meter_used = now_ms;
7505
0
            break;
7506
0
        }
7507
0
    } while (!atomic_compare_exchange_weak_relaxed(&meter->used,
7508
0
                                                   &meter_used, now_ms));
7509
7510
    /* Refill all buckets right away, since other threads may use them. */
7511
0
    if (meter_used < now_ms) {
7512
        /* All packets will hit the meter at the same time. */
7513
0
        uint64_t delta_t = now_ms - meter_used;
7514
7515
        /* Make sure delta_t will not be too large, so that the bucket will not
7516
         * wrap around below. */
7517
0
        delta_t = MIN(delta_t, meter->max_delta_t);
7518
7519
0
        for (int m = 0; m < meter->n_bands; m++) {
7520
0
            band = &meter->bands[m];
7521
            /* Update band's bucket.  We can't just use atomic add here,
7522
             * because we should never add above the max capacity. */
7523
0
            atomic_sat_add(&band->bucket, delta_t * band->rate,
7524
0
                           band->burst_size * 1000ULL);
7525
0
        }
7526
0
    }
7527
7528
    /* Update meter stats. */
7529
0
    atomic_add_relaxed(&meter->packet_count, cnt, &old);
7530
0
    bytes = 0;
7531
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7532
0
        bytes += dp_packet_size(packet);
7533
0
    }
7534
0
    atomic_add_relaxed(&meter->byte_count, bytes, &old);
7535
7536
    /* Meters can operate in terms of packets per second or kilobits per
7537
     * second. */
7538
0
    if (meter->flags & OFPMF13_PKTPS) {
7539
        /* Rate in packets/second, bucket 1/1000 packets.
7540
         * msec * packets/sec = 1/1000 packets. */
7541
0
        volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
7542
0
    } else {
7543
        /* Rate in kbps, bucket in bits.
7544
         * msec * kbps = bits */
7545
0
        volume = bytes * 8;
7546
0
    }
7547
7548
    /* Find the band hit with the highest rate for each packet (if any). */
7549
0
    for (int m = 0; m < meter->n_bands; m++) {
7550
0
        band = &meter->bands[m];
7551
7552
        /* Drain the bucket for all the packets, if possible. */
7553
0
        if (atomic_bound_sub(&band->bucket, volume, 0)) {
7554
0
            continue;
7555
0
        }
7556
7557
        /* Band limit hit, must process packet-by-packet. */
7558
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7559
0
            uint64_t packet_volume = (meter->flags & OFPMF13_PKTPS)
7560
0
                                     ? 1000 : (dp_packet_size(packet) * 8);
7561
7562
0
            if (!atomic_bound_sub(&band->bucket, packet_volume, 0)) {
7563
                /* Update the exceeding band for the exceeding packet.
7564
                 * Only one band will be fired by a packet, and that can
7565
                 * be different for each packet. */
7566
0
                if (band->rate > exceeded_rate[i]) {
7567
0
                    exceeded_rate[i] = band->rate;
7568
0
                    exceeded_band[i] = m;
7569
0
                    exceeded = true;
7570
0
                }
7571
0
            }
7572
0
        }
7573
0
    }
7574
7575
    /* No need to iterate over packets if there are no drops. */
7576
0
    if (!exceeded) {
7577
0
        return;
7578
0
    }
7579
7580
    /* Fire the highest rate band exceeded by each packet, and drop
7581
     * packets if needed. */
7582
7583
0
    memset(band_packets, 0, sizeof band_packets);
7584
0
    memset(band_bytes,   0, sizeof band_bytes);
7585
7586
0
    size_t j;
7587
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
7588
0
        uint32_t m = exceeded_band[j];
7589
7590
0
        if (m != UINT32_MAX) {
7591
            /* Meter drop packet. */
7592
0
            band_packets[m]++;
7593
0
            band_bytes[m] += dp_packet_size(packet);
7594
0
            dp_packet_delete(packet);
7595
0
        } else {
7596
            /* Meter accepts packet. */
7597
0
            dp_packet_batch_refill(packets_, packet, j);
7598
0
        }
7599
0
    }
7600
7601
0
    for (int m = 0; m < meter->n_bands; m++) {
7602
0
        if (!band_packets[m]) {
7603
0
            continue;
7604
0
        }
7605
0
        band = &meter->bands[m];
7606
0
        atomic_add_relaxed(&band->packet_count, band_packets[m], &old);
7607
0
        atomic_add_relaxed(&band->byte_count,   band_bytes[m],   &old);
7608
0
        COVERAGE_ADD(datapath_drop_meter, band_packets[m]);
7609
0
    }
7610
0
}
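A quick worked example of the unit conventions used above (made-up numbers, not part of the datapath): for an OFPMF13_PKTPS band the bucket is kept in 1/1000-packet units, so a refill of delta_t (ms) * rate (packets/s) and a batch drain of cnt * 1000 use the same unit; for a kbps band the bucket is kept in bits, so delta_t (ms) * rate (kbps) and dp_packet_size() * 8 also match.

/* Illustrative only: reproduces the unit arithmetic of
 * dp_netdev_run_meter() with example numbers. */
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    uint64_t delta_t_ms = 50;                          /* Time since 'meter->used'. */

    /* Packets-per-second band: bucket in 1/1000 packets. */
    uint64_t pktps_rate = 100;                         /* packets/s. */
    uint64_t pktps_refill = delta_t_ms * pktps_rate;   /* 5000 = 5 packets. */
    uint64_t pktps_drain = 8 * 1000;                   /* Batch of 8 packets. */

    /* kbps band: bucket in bits (1 kbps == 1 bit/ms). */
    uint64_t kbps_rate = 10000;                        /* 10 Mbit/s. */
    uint64_t kbps_refill = delta_t_ms * kbps_rate;     /* 500000 bits. */
    uint64_t kbps_drain = 1500 * 8;                    /* One 1500-byte packet. */

    printf("pktps: refill %"PRIu64", drain %"PRIu64" (1/1000 packets)\n",
           pktps_refill, pktps_drain);
    printf("kbps:  refill %"PRIu64", drain %"PRIu64" (bits)\n",
           kbps_refill, kbps_drain);
    return 0;
}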
7611
7612
/* Meter set/get/del processing is still single-threaded. */
7613
static int
7614
dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
7615
                      struct ofputil_meter_config *config)
7616
0
{
7617
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7618
0
    uint32_t mid = meter_id.uint32;
7619
0
    struct dp_meter *meter;
7620
0
    int i;
7621
7622
0
    if (mid >= MAX_METERS) {
7623
0
        return EFBIG; /* Meter_id out of range. */
7624
0
    }
7625
7626
0
    if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
7627
0
        return EBADF; /* Unsupported flags set */
7628
0
    }
7629
7630
0
    if (config->n_bands > MAX_BANDS) {
7631
0
        return EINVAL;
7632
0
    }
7633
7634
0
    for (i = 0; i < config->n_bands; ++i) {
7635
0
        switch (config->bands[i].type) {
7636
0
        case OFPMBT13_DROP:
7637
0
            break;
7638
0
        default:
7639
0
            return ENODEV; /* Unsupported band type */
7640
0
        }
7641
0
    }
7642
7643
    /* Allocate meter */
7644
0
    meter = xzalloc(sizeof *meter
7645
0
                    + config->n_bands * sizeof(struct dp_meter_band));
7646
7647
0
    meter->flags = config->flags;
7648
0
    meter->n_bands = config->n_bands;
7649
0
    meter->max_delta_t = 0;
7650
0
    meter->id = mid;
7651
0
    atomic_init(&meter->used, time_msec());
7652
7653
    /* Set up the bands. */
7654
0
    for (i = 0; i < config->n_bands; ++i) {
7655
0
        uint32_t band_max_delta_t;
7656
0
        uint64_t bucket_size;
7657
7658
        /* Set burst size to a workable value if none specified. */
7659
0
        if (config->bands[i].burst_size == 0) {
7660
0
            config->bands[i].burst_size = config->bands[i].rate;
7661
0
        }
7662
7663
0
        meter->bands[i].rate = config->bands[i].rate;
7664
0
        meter->bands[i].burst_size = config->bands[i].burst_size;
7665
        /* Start with a full bucket. */
7666
0
        bucket_size = meter->bands[i].burst_size * 1000ULL;
7667
0
        atomic_init(&meter->bands[i].bucket, bucket_size);
7668
7669
        /* Figure out max delta_t that is enough to fill any bucket. */
7670
0
        band_max_delta_t = bucket_size / meter->bands[i].rate;
7671
0
        if (band_max_delta_t > meter->max_delta_t) {
7672
0
            meter->max_delta_t = band_max_delta_t;
7673
0
        }
7674
0
    }
7675
7676
0
    ovs_mutex_lock(&dp->meters_lock);
7677
7678
0
    dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any. */
7679
0
    dp_meter_attach(&dp->meters, meter);
7680
7681
0
    ovs_mutex_unlock(&dp->meters_lock);
7682
7683
0
    return 0;
7684
0
}
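As a concrete illustration of the bucket sizing above (made-up numbers, assuming a kbps band): with rate = 1000 kbps and burst_size = 500 kbits the initial bucket is 500 * 1000 = 500000 bits, and band_max_delta_t = 500000 / 1000 = 500 ms, i.e. at most half a second of idle time refills the bucket completely.  Because meter->max_delta_t keeps the maximum across bands, the clamp applied in dp_netdev_run_meter() can always refill the largest bucket.

/* Illustrative numbers only: the per-band sizing computed above. */
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    uint64_t rate = 1000;                            /* kbps. */
    uint64_t burst_size = 500;                       /* kbits. */
    uint64_t bucket_size = burst_size * 1000;        /* 500000 bits. */
    uint64_t band_max_delta_t = bucket_size / rate;  /* 500 ms. */

    printf("bucket %"PRIu64" bits, full after at most %"PRIu64" ms idle\n",
           bucket_size, band_max_delta_t);
    return 0;
}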
7685
7686
static int
7687
dpif_netdev_meter_get(const struct dpif *dpif,
7688
                      ofproto_meter_id meter_id_,
7689
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
7690
0
{
7691
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7692
0
    uint32_t meter_id = meter_id_.uint32;
7693
0
    struct dp_meter *meter;
7694
7695
0
    if (meter_id >= MAX_METERS) {
7696
0
        return EFBIG;
7697
0
    }
7698
7699
0
    meter = dp_meter_lookup(&dp->meters, meter_id);
7700
0
    if (!meter) {
7701
0
        return ENOENT;
7702
0
    }
7703
7704
0
    if (stats) {
7705
0
        int i = 0;
7706
7707
0
        atomic_read_relaxed(&meter->packet_count, &stats->packet_in_count);
7708
0
        atomic_read_relaxed(&meter->byte_count, &stats->byte_in_count);
7709
7710
0
        for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
7711
0
            atomic_read_relaxed(&meter->bands[i].packet_count,
7712
0
                                &stats->bands[i].packet_count);
7713
0
            atomic_read_relaxed(&meter->bands[i].byte_count,
7714
0
                                &stats->bands[i].byte_count);
7715
0
        }
7716
0
        stats->n_bands = i;
7717
0
    }
7718
7719
0
    return 0;
7720
0
}
7721
7722
static int
7723
dpif_netdev_meter_del(struct dpif *dpif,
7724
                      ofproto_meter_id meter_id_,
7725
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
7726
0
{
7727
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7728
0
    int error;
7729
7730
0
    error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
7731
0
    if (!error) {
7732
0
        uint32_t meter_id = meter_id_.uint32;
7733
7734
0
        ovs_mutex_lock(&dp->meters_lock);
7735
0
        dp_meter_detach_free(&dp->meters, meter_id);
7736
0
        ovs_mutex_unlock(&dp->meters_lock);
7737
0
    }
7738
0
    return error;
7739
0
}
7740
7741

7742
static void
7743
dpif_netdev_disable_upcall(struct dpif *dpif)
7744
    OVS_NO_THREAD_SAFETY_ANALYSIS
7745
0
{
7746
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7747
0
    dp_netdev_disable_upcall(dp);
7748
0
}
7749
7750
static void
7751
dp_netdev_enable_upcall(struct dp_netdev *dp)
7752
    OVS_RELEASES(dp->upcall_rwlock)
7753
0
{
7754
0
    fat_rwlock_unlock(&dp->upcall_rwlock);
7755
0
}
7756
7757
static void
7758
dpif_netdev_enable_upcall(struct dpif *dpif)
7759
    OVS_NO_THREAD_SAFETY_ANALYSIS
7760
0
{
7761
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7762
0
    dp_netdev_enable_upcall(dp);
7763
0
}
7764
7765
static void
7766
dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
7767
0
{
7768
0
    atomic_store_relaxed(&pmd->wait_for_reload, false);
7769
0
    atomic_store_relaxed(&pmd->reload_tx_qid, false);
7770
0
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
7771
0
    atomic_store_explicit(&pmd->reload, false, memory_order_release);
7772
0
}
7773
7774
/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
7775
 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
7776
 * 'core_id' is NON_PMD_CORE_ID).
7777
 *
7778
 * Caller must unref the returned reference.  */
7779
static struct dp_netdev_pmd_thread *
7780
dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
7781
0
{
7782
0
    struct dp_netdev_pmd_thread *pmd;
7783
7784
0
    CMAP_FOR_EACH_WITH_HASH (pmd, node, hash_int(core_id, 0),
7785
0
                             &dp->poll_threads) {
7786
0
        if (pmd->core_id == core_id) {
7787
0
            return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
7788
0
        }
7789
0
    }
7790
7791
0
    return NULL;
7792
0
}
7793
7794
/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
7795
static void
7796
dp_netdev_set_nonpmd(struct dp_netdev *dp)
7797
    OVS_REQ_WRLOCK(dp->port_rwlock)
7798
0
{
7799
0
    struct dp_netdev_pmd_thread *non_pmd;
7800
7801
0
    non_pmd = xzalloc(sizeof *non_pmd);
7802
0
    dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
7803
0
}
7804
7805
/* Caller must have valid pointer to 'pmd'. */
7806
static bool
7807
dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
7808
0
{
7809
0
    return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
7810
0
}
7811
7812
static void
7813
dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
7814
0
{
7815
0
    if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
7816
0
        ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
7817
0
    }
7818
0
}
7819
7820
/* Given cmap position 'pos', tries to ref the next node.  If try_ref()
7821
 * fails, keeps checking for next node until reaching the end of cmap.
7822
 *
7823
 * Caller must unref the returned reference. */
7824
static struct dp_netdev_pmd_thread *
7825
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
7826
0
{
7827
0
    struct dp_netdev_pmd_thread *next;
7828
7829
0
    do {
7830
0
        struct cmap_node *node;
7831
7832
0
        node = cmap_next_position(&dp->poll_threads, pos);
7833
0
        next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
7834
0
            : NULL;
7835
0
    } while (next && !dp_netdev_pmd_try_ref(next));
7836
7837
0
    return next;
7838
0
}
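dp_netdev_pmd_try_ref() relies on a reference count whose "try ref" fails once the count has already dropped to zero, so a thread walking 'dp->poll_threads' under RCU can never resurrect a PMD that is being destroyed.  Below is a standalone sketch of that pattern using C11 atomics; the type and function names are hypothetical stand-ins for ovs_refcount_try_ref_rcu() and ovs_refcount_unref().

/* Standalone sketch of the try-ref pattern; hypothetical names,
 * C11 atomics instead of ovs_refcount/RCU. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct refcounted {
    _Atomic unsigned int ref_cnt;   /* 0 means destruction has started. */
};

/* Take a reference only if the object is still alive. */
static bool
try_ref(struct refcounted *obj)
{
    unsigned int cnt = atomic_load_explicit(&obj->ref_cnt,
                                            memory_order_relaxed);
    do {
        if (cnt == 0) {
            return false;           /* Too late: skip this object. */
        }
    } while (!atomic_compare_exchange_weak_explicit(&obj->ref_cnt, &cnt,
                                                    cnt + 1,
                                                    memory_order_relaxed,
                                                    memory_order_relaxed));
    return true;
}

/* Drop a reference; returns true if this was the last one. */
static bool
unref(struct refcounted *obj)
{
    return atomic_fetch_sub_explicit(&obj->ref_cnt, 1,
                                     memory_order_release) == 1;
}

int
main(void)
{
    struct refcounted obj = { 1 };

    if (try_ref(&obj)) {            /* 1 -> 2. */
        unref(&obj);                /* 2 -> 1, not the last reference. */
    }
    if (unref(&obj)) {              /* 1 -> 0, last reference dropped. */
        printf("destroy object (OVS defers this via ovsrcu_postpone)\n");
    }
    return 0;
}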
7839
7840
/* Configures the 'pmd' based on the input argument. */
7841
static void
7842
dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
7843
                        unsigned core_id, int numa_id)
7844
0
{
7845
0
    pmd->dp = dp;
7846
0
    pmd->core_id = core_id;
7847
0
    pmd->numa_id = numa_id;
7848
0
    pmd->need_reload = false;
7849
0
    pmd->n_output_batches = 0;
7850
7851
0
    ovs_refcount_init(&pmd->ref_cnt);
7852
0
    atomic_init(&pmd->exit, false);
7853
0
    pmd->reload_seq = seq_create();
7854
0
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
7855
0
    atomic_init(&pmd->reload, false);
7856
0
    ovs_mutex_init(&pmd->flow_mutex);
7857
0
    ovs_mutex_init(&pmd->port_mutex);
7858
0
    ovs_mutex_init(&pmd->bond_mutex);
7859
0
    cmap_init(&pmd->flow_table);
7860
0
    cmap_init(&pmd->classifiers);
7861
0
    cmap_init(&pmd->simple_match_table);
7862
0
    ccmap_init(&pmd->n_flows);
7863
0
    ccmap_init(&pmd->n_simple_flows);
7864
0
    pmd->ctx.last_rxq = NULL;
7865
0
    pmd_thread_ctx_time_update(pmd);
7866
0
    pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
7867
0
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
7868
0
    pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
7869
0
    pmd->busy_cycles_intrvl = xzalloc(PMD_INTERVAL_MAX *
7870
0
                                      sizeof *pmd->busy_cycles_intrvl);
7871
0
    hmap_init(&pmd->poll_list);
7872
0
    hmap_init(&pmd->tx_ports);
7873
0
    hmap_init(&pmd->tnl_port_cache);
7874
0
    hmap_init(&pmd->send_port_cache);
7875
0
    cmap_init(&pmd->tx_bonds);
7876
7877
0
    pmd_init_max_sleep(dp, pmd);
7878
7879
    /* Initialize DPIF function pointer to the default configured version. */
7880
0
    atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default());
7881
7882
    /* Init default miniflow_extract function */
7883
0
    atomic_init(&pmd->miniflow_extract_opt, dp_mfex_impl_get_default());
7884
7885
    /* init the 'flow_cache' since there is no
7886
     * actual thread created for NON_PMD_CORE_ID. */
7887
0
    if (core_id == NON_PMD_CORE_ID) {
7888
0
        dfc_cache_init(&pmd->flow_cache);
7889
0
        pmd_alloc_static_tx_qid(pmd);
7890
0
    }
7891
0
    pmd_perf_stats_init(&pmd->perf_stats);
7892
0
    cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
7893
0
                hash_int(core_id, 0));
7894
0
}
7895
7896
static void
7897
dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
7898
0
{
7899
0
    struct dpcls *cls;
7900
7901
0
    dp_netdev_pmd_flow_flush(pmd);
7902
0
    hmap_destroy(&pmd->send_port_cache);
7903
0
    hmap_destroy(&pmd->tnl_port_cache);
7904
0
    hmap_destroy(&pmd->tx_ports);
7905
0
    cmap_destroy(&pmd->tx_bonds);
7906
0
    hmap_destroy(&pmd->poll_list);
7907
0
    free(pmd->busy_cycles_intrvl);
7908
    /* All flows (including their dpcls_rules) have been deleted already */
7909
0
    CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
7910
0
        dpcls_destroy(cls);
7911
0
        ovsrcu_postpone(free, cls);
7912
0
    }
7913
0
    cmap_destroy(&pmd->classifiers);
7914
0
    cmap_destroy(&pmd->flow_table);
7915
0
    cmap_destroy(&pmd->simple_match_table);
7916
0
    ccmap_destroy(&pmd->n_flows);
7917
0
    ccmap_destroy(&pmd->n_simple_flows);
7918
0
    ovs_mutex_destroy(&pmd->flow_mutex);
7919
0
    seq_destroy(pmd->reload_seq);
7920
0
    ovs_mutex_destroy(&pmd->port_mutex);
7921
0
    ovs_mutex_destroy(&pmd->bond_mutex);
7922
0
    free(pmd->netdev_input_func_userdata);
7923
0
    free(pmd);
7924
0
}
7925
7926
/* Stops the pmd thread, removes it from the 'dp->poll_threads',
7927
 * and unrefs the struct. */
7928
static void
7929
dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
7930
0
{
7931
    /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
7932
     * but extra cleanup is necessary */
7933
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
7934
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
7935
0
        dfc_cache_uninit(&pmd->flow_cache);
7936
0
        pmd_free_cached_ports(pmd);
7937
0
        pmd_free_static_tx_qid(pmd);
7938
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
7939
0
    } else {
7940
0
        atomic_store_relaxed(&pmd->exit, true);
7941
0
        dp_netdev_reload_pmd__(pmd);
7942
0
        xpthread_join(pmd->thread, NULL);
7943
0
    }
7944
7945
0
    dp_netdev_pmd_clear_ports(pmd);
7946
7947
    /* Purges the 'pmd''s flows after stopping the thread, but before
7948
     * destroying the flows, so that the flow stats can be collected. */
7949
0
    if (dp->dp_purge_cb) {
7950
0
        dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
7951
0
    }
7952
0
    cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
7953
0
    dp_netdev_pmd_unref(pmd);
7954
0
}
7955
7956
/* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
7957
 * thread. */
7958
static void
7959
dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
7960
0
{
7961
0
    struct dp_netdev_pmd_thread *pmd;
7962
0
    struct dp_netdev_pmd_thread **pmd_list;
7963
0
    size_t k = 0, n_pmds;
7964
7965
0
    n_pmds = cmap_count(&dp->poll_threads);
7966
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
7967
7968
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
7969
0
        if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
7970
0
            continue;
7971
0
        }
7972
        /* We cannot call dp_netdev_del_pmd(), since it alters
7973
         * 'dp->poll_threads' (while we're iterating it) and it
7974
         * might quiesce. */
7975
0
        ovs_assert(k < n_pmds);
7976
0
        pmd_list[k++] = pmd;
7977
0
    }
7978
7979
0
    for (size_t i = 0; i < k; i++) {
7980
0
        dp_netdev_del_pmd(dp, pmd_list[i]);
7981
0
    }
7982
0
    free(pmd_list);
7983
0
}
7984
7985
/* Deletes all rx queues from pmd->poll_list and all the ports from
7986
 * pmd->tx_ports. */
7987
static void
7988
dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
7989
0
{
7990
0
    struct rxq_poll *poll;
7991
0
    struct tx_port *port;
7992
0
    struct tx_bond *tx;
7993
7994
0
    ovs_mutex_lock(&pmd->port_mutex);
7995
0
    HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
7996
0
        free(poll);
7997
0
    }
7998
0
    HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
7999
0
        free(port->txq_pkts);
8000
0
        free(port);
8001
0
    }
8002
0
    ovs_mutex_unlock(&pmd->port_mutex);
8003
8004
0
    ovs_mutex_lock(&pmd->bond_mutex);
8005
0
    CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
8006
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
8007
0
        ovsrcu_postpone(free, tx);
8008
0
    }
8009
0
    ovs_mutex_unlock(&pmd->bond_mutex);
8010
0
}
8011
8012
/* Adds rx queue to poll_list of PMD thread, if it's not there already. */
8013
static void
8014
dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
8015
                         struct dp_netdev_rxq *rxq)
8016
    OVS_REQUIRES(pmd->port_mutex)
8017
0
{
8018
0
    int qid = netdev_rxq_get_queue_id(rxq->rx);
8019
0
    uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
8020
0
    struct rxq_poll *poll;
8021
8022
0
    HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
8023
0
        if (poll->rxq == rxq) {
8024
            /* 'rxq' is already polled by this thread. Do nothing. */
8025
0
            return;
8026
0
        }
8027
0
    }
8028
8029
0
    poll = xmalloc(sizeof *poll);
8030
0
    poll->rxq = rxq;
8031
0
    hmap_insert(&pmd->poll_list, &poll->node, hash);
8032
8033
0
    pmd->need_reload = true;
8034
0
}
8035
8036
/* Delete 'poll' from poll_list of PMD thread. */
8037
static void
8038
dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
8039
                           struct rxq_poll *poll)
8040
    OVS_REQUIRES(pmd->port_mutex)
8041
0
{
8042
0
    hmap_remove(&pmd->poll_list, &poll->node);
8043
0
    free(poll);
8044
8045
0
    pmd->need_reload = true;
8046
0
}
8047
8048
/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
8049
 * changes to take effect. */
8050
static void
8051
dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
8052
                             struct dp_netdev_port *port)
8053
    OVS_REQUIRES(pmd->port_mutex)
8054
0
{
8055
0
    struct tx_port *tx;
8056
8057
0
    tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
8058
0
    if (tx) {
8059
        /* 'port' is already in this thread's tx cache. Do nothing. */
8060
0
        return;
8061
0
    }
8062
8063
0
    tx = xzalloc(sizeof *tx);
8064
8065
0
    tx->port = port;
8066
0
    tx->qid = -1;
8067
0
    tx->flush_time = 0LL;
8068
0
    dp_packet_batch_init(&tx->output_pkts);
8069
8070
0
    if (tx->port->txq_mode == TXQ_MODE_XPS_HASH) {
8071
0
        int i, n_txq = netdev_n_txq(tx->port->netdev);
8072
8073
0
        tx->txq_pkts = xzalloc(n_txq * sizeof *tx->txq_pkts);
8074
0
        for (i = 0; i < n_txq; i++) {
8075
0
            dp_packet_batch_init(&tx->txq_pkts[i]);
8076
0
        }
8077
0
    }
8078
8079
0
    hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
8080
0
    pmd->need_reload = true;
8081
0
}
8082
8083
/* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
8084
 * changes to take effect. */
8085
static void
8086
dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
8087
                               struct tx_port *tx)
8088
    OVS_REQUIRES(pmd->port_mutex)
8089
0
{
8090
0
    hmap_remove(&pmd->tx_ports, &tx->node);
8091
0
    free(tx->txq_pkts);
8092
0
    free(tx);
8093
0
    pmd->need_reload = true;
8094
0
}
8095
8096
/* Add bond to the tx bond cmap of 'pmd'. */
8097
static void
8098
dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
8099
                             struct tx_bond *bond, bool update)
8100
    OVS_EXCLUDED(pmd->bond_mutex)
8101
0
{
8102
0
    struct tx_bond *tx;
8103
8104
0
    ovs_mutex_lock(&pmd->bond_mutex);
8105
0
    tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
8106
8107
0
    if (tx && !update) {
8108
        /* It's not an update and the entry already exists.  Do nothing. */
8109
0
        goto unlock;
8110
0
    }
8111
8112
0
    if (tx) {
8113
0
        struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);
8114
8115
        /* Copy the stats for each bucket. */
8116
0
        for (int i = 0; i < BOND_BUCKETS; i++) {
8117
0
            uint64_t n_packets, n_bytes;
8118
8119
0
            atomic_read_relaxed(&tx->member_buckets[i].n_packets, &n_packets);
8120
0
            atomic_read_relaxed(&tx->member_buckets[i].n_bytes, &n_bytes);
8121
0
            atomic_init(&new_tx->member_buckets[i].n_packets, n_packets);
8122
0
            atomic_init(&new_tx->member_buckets[i].n_bytes, n_bytes);
8123
0
        }
8124
0
        cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
8125
0
                     hash_bond_id(bond->bond_id));
8126
0
        ovsrcu_postpone(free, tx);
8127
0
    } else {
8128
0
        tx = xmemdup(bond, sizeof *bond);
8129
0
        cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
8130
0
    }
8131
0
unlock:
8132
0
    ovs_mutex_unlock(&pmd->bond_mutex);
8133
0
}
8134
8135
/* Delete bond from the tx bond cmap of 'pmd'. */
8136
static void
8137
dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
8138
                               uint32_t bond_id)
8139
    OVS_EXCLUDED(pmd->bond_mutex)
8140
0
{
8141
0
    struct tx_bond *tx;
8142
8143
0
    ovs_mutex_lock(&pmd->bond_mutex);
8144
0
    tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
8145
0
    if (tx) {
8146
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
8147
0
        ovsrcu_postpone(free, tx);
8148
0
    }
8149
0
    ovs_mutex_unlock(&pmd->bond_mutex);
8150
0
}
8151

8152
static char *
8153
dpif_netdev_get_datapath_version(void)
8154
0
{
8155
0
     return xstrdup("<built-in>");
8156
0
}
8157
8158
static void
8159
dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
8160
                    uint16_t tcp_flags, long long now)
8161
0
{
8162
0
    uint16_t flags;
8163
8164
0
    atomic_store_relaxed(&netdev_flow->stats.used, now);
8165
0
    non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
8166
0
    non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
8167
0
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
8168
0
    flags |= tcp_flags;
8169
0
    atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
8170
0
}
8171
8172
static int
8173
dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
8174
                 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
8175
                 enum dpif_upcall_type type, const struct nlattr *userdata,
8176
                 struct ofpbuf *actions, struct ofpbuf *put_actions)
8177
0
{
8178
0
    struct dp_netdev *dp = pmd->dp;
8179
8180
0
    if (OVS_UNLIKELY(!dp->upcall_cb)) {
8181
0
        return ENODEV;
8182
0
    }
8183
8184
0
    if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
8185
0
        struct ds ds = DS_EMPTY_INITIALIZER;
8186
0
        char *packet_str;
8187
0
        struct ofpbuf key;
8188
0
        struct odp_flow_key_parms odp_parms = {
8189
0
            .flow = flow,
8190
0
            .mask = wc ? &wc->masks : NULL,
8191
0
            .support = dp_netdev_support,
8192
0
        };
8193
8194
0
        ofpbuf_init(&key, 0);
8195
0
        odp_flow_key_from_flow(&odp_parms, &key);
8196
0
        packet_str = ofp_dp_packet_to_string(packet_);
8197
8198
0
        odp_flow_key_format(key.data, key.size, &ds);
8199
8200
0
        VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
8201
0
                 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
8202
8203
0
        ofpbuf_uninit(&key);
8204
0
        free(packet_str);
8205
8206
0
        ds_destroy(&ds);
8207
0
    }
8208
8209
0
    if (type != DPIF_UC_MISS) {
8210
0
        dp_packet_ol_send_prepare(packet_, 0);
8211
0
    }
8212
8213
0
    return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
8214
0
                         actions, wc, put_actions, dp->upcall_aux);
8215
0
}
8216
8217
static inline uint32_t
8218
dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
8219
                                const struct miniflow *mf)
8220
0
{
8221
0
    uint32_t hash, recirc_depth;
8222
8223
0
    if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
8224
0
        hash = dp_packet_get_rss_hash(packet);
8225
0
    } else {
8226
0
        hash = miniflow_hash_5tuple(mf, 0);
8227
0
        dp_packet_set_rss_hash(packet, hash);
8228
0
    }
8229
8230
    /* The RSS hash must account for the recirculation depth to avoid
8231
     * collisions in the exact match cache */
8232
0
    recirc_depth = *recirc_depth_get_unsafe();
8233
0
    if (OVS_UNLIKELY(recirc_depth)) {
8234
0
        hash = hash_finish(hash, recirc_depth);
8235
0
    }
8236
0
    return hash;
8237
0
}
8238
8239
struct packet_batch_per_flow {
8240
    unsigned int byte_count;
8241
    uint16_t tcp_flags;
8242
    struct dp_netdev_flow *flow;
8243
8244
    struct dp_packet_batch array;
8245
};
8246
8247
static inline void
8248
packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
8249
                             struct dp_packet *packet,
8250
                             uint16_t tcp_flags)
8251
0
{
8252
0
    batch->byte_count += dp_packet_size(packet);
8253
0
    batch->tcp_flags |= tcp_flags;
8254
0
    dp_packet_batch_add(&batch->array, packet);
8255
0
}
8256
8257
static inline void
8258
packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
8259
                           struct dp_netdev_flow *flow)
8260
0
{
8261
0
    flow->batch = batch;
8262
8263
0
    batch->flow = flow;
8264
0
    dp_packet_batch_init(&batch->array);
8265
0
    batch->byte_count = 0;
8266
0
    batch->tcp_flags = 0;
8267
0
}
8268
8269
static inline void
8270
packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
8271
                              struct dp_netdev_pmd_thread *pmd)
8272
0
{
8273
0
    struct dp_netdev_actions *actions;
8274
0
    struct dp_netdev_flow *flow = batch->flow;
8275
8276
0
    dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
8277
0
                        batch->byte_count,
8278
0
                        batch->tcp_flags, pmd->ctx.now / 1000);
8279
8280
0
    actions = dp_netdev_flow_get_actions(flow);
8281
8282
0
    dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
8283
0
                              actions->actions, actions->size);
8284
0
}
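The helpers above implement per-flow batching: packets that hit the same dp_netdev_flow accumulate in one packet_batch_per_flow, so the flow's statistics are updated and its actions executed once per batch instead of once per packet.  The sketch below shows the grouping idea in isolation; the types are hypothetical and a linear search stands in for the 'flow->batch' back-pointer used by dp_netdev_queue_batches().

/* Standalone sketch of per-flow batching; hypothetical types. */
#include <stddef.h>
#include <stdio.h>

#define MAX_BATCH 32

struct pkt {
    int flow_id;        /* Stand-in for the flow the packet matched. */
    int size;
};

struct flow_batch {
    int flow_id;
    int byte_count;
    size_t count;
    const struct pkt *pkts[MAX_BATCH];
};

int
main(void)
{
    const struct pkt pkts[] = {
        { 7, 64 }, { 7, 128 }, { 9, 1500 }, { 7, 64 }, { 9, 60 },
    };
    struct flow_batch batches[MAX_BATCH];
    size_t n_batches = 0;

    for (size_t i = 0; i < sizeof pkts / sizeof pkts[0]; i++) {
        struct flow_batch *b = NULL;

        /* Find (or lazily create) this flow's batch.  The datapath keeps
         * a 'flow->batch' pointer instead of searching. */
        for (size_t j = 0; j < n_batches; j++) {
            if (batches[j].flow_id == pkts[i].flow_id) {
                b = &batches[j];
                break;
            }
        }
        if (!b) {
            b = &batches[n_batches++];
            b->flow_id = pkts[i].flow_id;
            b->byte_count = 0;
            b->count = 0;
        }
        b->pkts[b->count++] = &pkts[i];
        b->byte_count += pkts[i].size;
    }

    /* "Execute" each batch once, like packet_batch_per_flow_execute(). */
    for (size_t j = 0; j < n_batches; j++) {
        printf("flow %d: %zu packets, %d bytes -> run actions once\n",
               batches[j].flow_id, batches[j].count, batches[j].byte_count);
    }
    return 0;
}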
8285
8286
void
8287
dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
8288
                        struct dp_packet_batch *packets,
8289
                        struct dpcls_rule *rule,
8290
                        uint32_t bytes,
8291
                        uint16_t tcp_flags)
8292
0
{
8293
    /* Gets action* from the rule. */
8294
0
    struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule);
8295
0
    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
8296
8297
0
    dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes,
8298
0
                        tcp_flags, pmd->ctx.now / 1000);
8299
0
    const uint32_t steal = 1;
8300
0
    dp_netdev_execute_actions(pmd, packets, steal, &flow->flow,
8301
0
                              actions->actions, actions->size);
8302
0
}
8303
8304
static inline void
8305
dp_netdev_queue_batches(struct dp_packet *pkt,
8306
                        struct dp_netdev_flow *flow, uint16_t tcp_flags,
8307
                        struct packet_batch_per_flow *batches,
8308
                        size_t *n_batches)
8309
0
{
8310
0
    struct packet_batch_per_flow *batch = flow->batch;
8311
8312
0
    if (OVS_UNLIKELY(!batch)) {
8313
0
        batch = &batches[(*n_batches)++];
8314
0
        packet_batch_per_flow_init(batch, flow);
8315
0
    }
8316
8317
0
    packet_batch_per_flow_update(batch, pkt, tcp_flags);
8318
0
}
8319
8320
static inline void
8321
packet_enqueue_to_flow_map(struct dp_packet *packet,
8322
                           struct dp_netdev_flow *flow,
8323
                           uint16_t tcp_flags,
8324
                           struct dp_packet_flow_map *flow_map,
8325
                           size_t index)
8326
0
{
8327
0
    struct dp_packet_flow_map *map = &flow_map[index];
8328
0
    map->flow = flow;
8329
0
    map->packet = packet;
8330
0
    map->tcp_flags = tcp_flags;
8331
0
}
8332
8333
/* SMC lookup function for a batch of packets.
8334
 * By batching the SMC lookups, we can use prefetching
8335
 * to hide memory access latency.
8336
 */
8337
static inline void
8338
smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
8339
            struct netdev_flow_key *keys,
8340
            struct netdev_flow_key **missed_keys,
8341
            struct dp_packet_batch *packets_,
8342
            const int cnt,
8343
            struct dp_packet_flow_map *flow_map,
8344
            uint8_t *index_map)
8345
0
{
8346
0
    int i;
8347
0
    struct dp_packet *packet;
8348
0
    size_t n_smc_hit = 0, n_missed = 0;
8349
0
    struct dfc_cache *cache = &pmd->flow_cache;
8350
0
    struct smc_cache *smc_cache = &cache->smc_cache;
8351
0
    const struct cmap_node *flow_node;
8352
0
    int recv_idx;
8353
0
    uint16_t tcp_flags;
8354
8355
    /* Prefetch buckets for all packets */
8356
0
    for (i = 0; i < cnt; i++) {
8357
0
        OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
8358
0
    }
8359
8360
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
8361
0
        struct dp_netdev_flow *flow = NULL;
8362
0
        flow_node = smc_entry_get(pmd, keys[i].hash);
8363
0
        bool hit = false;
8364
        /* Get the original order of this packet in received batch. */
8365
0
        recv_idx = index_map[i];
8366
8367
0
        if (OVS_LIKELY(flow_node != NULL)) {
8368
0
            CMAP_NODE_FOR_EACH (flow, node, flow_node) {
8369
                /* Since we don't have a per-port megaflow to check the port
8370
                 * number, we need to verify that the input ports match. */
8371
0
                if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
8372
0
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
8373
0
                    tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
8374
8375
                    /* SMC hit and emc miss, we insert into EMC */
8376
0
                    keys[i].len =
8377
0
                        netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
8378
0
                    emc_probabilistic_insert(pmd, &keys[i], flow);
8379
                    /* Add these packets into the flow map in the same order
8380
                     * as received.
8381
                     */
8382
0
                    packet_enqueue_to_flow_map(packet, flow, tcp_flags,
8383
0
                                               flow_map, recv_idx);
8384
0
                    n_smc_hit++;
8385
0
                    hit = true;
8386
0
                    break;
8387
0
                }
8388
0
            }
8389
0
            if (hit) {
8390
0
                continue;
8391
0
            }
8392
0
        }
8393
8394
        /* SMC missed. Group missed packets together at
8395
         * the beginning of the 'packets' array. */
8396
0
        dp_packet_batch_refill(packets_, packet, i);
8397
8398
        /* Preserve the order of packet for flow batching. */
8399
0
        index_map[n_missed] = recv_idx;
8400
8401
        /* Put the missed keys into the pointer array returned to the caller. */
8402
0
        missed_keys[n_missed++] = &keys[i];
8403
0
    }
8404
8405
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
8406
0
}
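The batched SMC lookup above is a two-pass prefetch pattern: first issue OVS_PREFETCH for every bucket the burst will touch, then perform the lookups, so the memory latency of one bucket overlaps with the work done on another.  Below is a minimal standalone sketch of that pattern; the table layout is hypothetical, and __builtin_prefetch is the GCC/Clang builtin rather than an OVS API.

/* Standalone sketch of prefetch-then-process over a hash table burst. */
#include <stdint.h>
#include <stdio.h>

#define TABLE_SIZE 1024                  /* Power of two, like the SMC. */
#define TABLE_MASK (TABLE_SIZE - 1)
#define BURST 8

struct bucket {
    uint32_t sig;                        /* Stored hash signature. */
    uint32_t value;
};

static struct bucket table[TABLE_SIZE];

int
main(void)
{
    uint32_t hashes[BURST] = { 3, 99, 3, 1023, 7, 512, 42, 99 };
    int hits = 0;

    table[3 & TABLE_MASK] = (struct bucket) { 3, 111 };   /* Seed one entry. */

    /* Pass 1: prefetch every bucket this burst will touch. */
    for (int i = 0; i < BURST; i++) {
        __builtin_prefetch(&table[hashes[i] & TABLE_MASK]);
    }

    /* Pass 2: the real lookups; the cache lines are (hopefully) warm now. */
    for (int i = 0; i < BURST; i++) {
        struct bucket *b = &table[hashes[i] & TABLE_MASK];

        if (b->sig == hashes[i]) {
            hits++;
        }
    }

    printf("%d hits out of %d lookups\n", hits, BURST);
    return 0;
}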
8407
8408
struct dp_netdev_flow *
8409
smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
8410
                  struct dp_packet *packet,
8411
                  struct netdev_flow_key *key)
8412
0
{
8413
0
    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
8414
8415
0
    if (OVS_LIKELY(flow_node != NULL)) {
8416
0
        struct dp_netdev_flow *flow = NULL;
8417
8418
0
        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
8419
            /* Since we don't have a per-port megaflow to check the port
8420
             * number, we need to verify that the input ports match. */
8421
0
            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
8422
0
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
8423
8424
0
                return (void *) flow;
8425
0
            }
8426
0
        }
8427
0
    }
8428
8429
0
    return NULL;
8430
0
}
8431
8432
inline int
8433
dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
8434
                  struct dp_packet *packet,
8435
                  struct dp_netdev_flow **flow)
8436
0
{
8437
0
    uint32_t mark;
8438
8439
#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
8440
    /* Restore the packet if HW processing was terminated before completion. */
8441
    struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq;
8442
    bool miss_api_supported;
8443
8444
    atomic_read_relaxed(&rxq->port->netdev->hw_info.miss_api_supported,
8445
                        &miss_api_supported);
8446
    if (miss_api_supported) {
8447
        int err = netdev_hw_miss_packet_recover(rxq->port->netdev, packet);
8448
        if (err && err != EOPNOTSUPP) {
8449
            COVERAGE_INC(datapath_drop_hw_miss_recover);
8450
            return -1;
8451
        }
8452
    }
8453
#endif
8454
8455
    /* If no mark, no flow to find. */
8456
0
    if (!dp_packet_has_flow_mark(packet, &mark)) {
8457
0
        *flow = NULL;
8458
0
        return 0;
8459
0
    }
8460
8461
0
    *flow = mark_to_flow_find(pmd, mark);
8462
0
    return 0;
8463
0
}
8464
8465
/* Enqueues an already classified packet into per-flow batches or the flow map,
8466
 * depending on whether batching is enabled. */
8467
static inline void
8468
dfc_processing_enqueue_classified_packet(struct dp_packet *packet,
8469
                                         struct dp_netdev_flow *flow,
8470
                                         uint16_t tcp_flags,
8471
                                         bool batch_enable,
8472
                                         struct packet_batch_per_flow *batches,
8473
                                         size_t *n_batches,
8474
                                         struct dp_packet_flow_map *flow_map,
8475
                                         size_t *map_cnt)
8476
8477
0
{
8478
0
    if (OVS_LIKELY(batch_enable)) {
8479
0
        dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
8480
0
                                n_batches);
8481
0
    } else {
8482
        /* Flow batching should be performed only after fast-path
8483
         * processing has also completed for the packets that missed
8484
         * the EMC; otherwise packets belonging to the same datapath
8485
         * flow could be reordered. */
8486
0
        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
8487
0
                                   flow_map, (*map_cnt)++);
8488
0
    }
8489
8490
0
}
8491
8492
/* Try to process all ('cnt') the 'packets' using only the datapath flow cache
8493
 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
8494
 * miniflow is copied into 'keys' and the packet pointer is moved at the
8495
 * beginning of the 'packets' array. The pointers of missed keys are put in the
8496
 * missed_keys pointer array for future processing.
8497
 *
8498
 * The function returns the number of packets that needs to be processed in the
8499
 * 'packets' array (they have been moved to the beginning of the vector).
8500
 *
8501
 * For performance reasons a caller may choose not to initialize the metadata
8502
 * in 'packets_'.  If 'md_is_valid' is false, the metadata in 'packets'
8503
 * is not valid and must be initialized by this function using 'port_no'.
8504
 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
8505
 * will be ignored.
8506
 */
8507
static inline size_t
8508
dfc_processing(struct dp_netdev_pmd_thread *pmd,
8509
               struct dp_packet_batch *packets_,
8510
               struct netdev_flow_key *keys,
8511
               struct netdev_flow_key **missed_keys,
8512
               struct packet_batch_per_flow batches[], size_t *n_batches,
8513
               struct dp_packet_flow_map *flow_map,
8514
               size_t *n_flows, uint8_t *index_map,
8515
               bool md_is_valid, odp_port_t port_no)
8516
0
{
8517
0
    const bool netdev_flow_api = netdev_is_flow_api_enabled();
8518
0
    const uint32_t recirc_depth = *recirc_depth_get();
8519
0
    const size_t cnt = dp_packet_batch_size(packets_);
8520
0
    size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0;
8521
0
    size_t n_mfex_opt_hit = 0, n_simple_hit = 0;
8522
0
    struct dfc_cache *cache = &pmd->flow_cache;
8523
0
    struct netdev_flow_key *key = &keys[0];
8524
0
    struct dp_packet *packet;
8525
0
    size_t map_cnt = 0;
8526
0
    bool batch_enable = true;
8527
8528
0
    const bool simple_match_enabled =
8529
0
        !md_is_valid && dp_netdev_simple_match_enabled(pmd, port_no);
8530
    /* 'simple_match_table' is a full flow table.  If the flow is not there,
8531
     * upcall is required, and there is no chance to find a match in caches. */
8532
0
    const bool smc_enable_db = !simple_match_enabled && pmd->ctx.smc_enable_db;
8533
0
    const uint32_t cur_min = simple_match_enabled
8534
0
                             ? 0 : pmd->ctx.emc_insert_min;
8535
8536
0
    pmd_perf_update_counter(&pmd->perf_stats,
8537
0
                            md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
8538
0
                            cnt);
8539
0
    int i;
8540
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
8541
0
        struct dp_netdev_flow *flow = NULL;
8542
0
        uint16_t tcp_flags;
8543
8544
0
        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
8545
0
            dp_packet_delete(packet);
8546
0
            COVERAGE_INC(datapath_drop_rx_invalid_packet);
8547
0
            continue;
8548
0
        }
8549
8550
0
        if (i != cnt - 1) {
8551
0
            struct dp_packet **packets = packets_->packets;
8552
            /* Prefetch next packet data and metadata. */
8553
0
            OVS_PREFETCH(dp_packet_data(packets[i+1]));
8554
0
            pkt_metadata_prefetch_init(&packets[i+1]->md);
8555
0
        }
8556
8557
0
        if (!md_is_valid) {
8558
0
            pkt_metadata_init(&packet->md, port_no);
8559
0
        }
8560
8561
0
        if (netdev_flow_api && recirc_depth == 0) {
8562
0
            if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &flow))) {
8563
                /* Packet restoration failed and it was dropped, do not
8564
                 * continue processing.
8565
                 */
8566
0
                continue;
8567
0
            }
8568
0
            if (OVS_LIKELY(flow)) {
8569
0
                tcp_flags = parse_tcp_flags(packet, NULL, NULL, NULL);
8570
0
                n_phwol_hit++;
8571
0
                dfc_processing_enqueue_classified_packet(
8572
0
                        packet, flow, tcp_flags, batch_enable,
8573
0
                        batches, n_batches, flow_map, &map_cnt);
8574
0
                continue;
8575
0
            }
8576
0
        }
8577
8578
0
        if (!flow && simple_match_enabled) {
8579
0
            ovs_be16 dl_type = 0, vlan_tci = 0;
8580
0
            uint8_t nw_frag = 0;
8581
8582
0
            tcp_flags = parse_tcp_flags(packet, &dl_type, &nw_frag, &vlan_tci);
8583
0
            flow = dp_netdev_simple_match_lookup(pmd, port_no, dl_type,
8584
0
                                                 nw_frag, vlan_tci);
8585
0
            if (OVS_LIKELY(flow)) {
8586
0
                n_simple_hit++;
8587
0
                dfc_processing_enqueue_classified_packet(
8588
0
                        packet, flow, tcp_flags, batch_enable,
8589
0
                        batches, n_batches, flow_map, &map_cnt);
8590
0
                continue;
8591
0
            }
8592
0
        }
8593
8594
0
        miniflow_extract(packet, &key->mf);
8595
0
        key->len = 0; /* Not computed yet. */
8596
0
        key->hash =
8597
0
                (md_is_valid == false)
8598
0
                ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
8599
0
                : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
8600
8601
        /* If the EMC is disabled, skip emc_lookup. */
8602
0
        flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
8603
0
        if (OVS_LIKELY(flow)) {
8604
0
            tcp_flags = miniflow_get_tcp_flags(&key->mf);
8605
0
            n_emc_hit++;
8606
0
            dfc_processing_enqueue_classified_packet(
8607
0
                    packet, flow, tcp_flags, batch_enable,
8608
0
                    batches, n_batches, flow_map, &map_cnt);
8609
0
        } else {
8610
            /* Exact match cache missed. Group missed packets together at
8611
             * the beginning of the 'packets' array. */
8612
0
            dp_packet_batch_refill(packets_, packet, i);
8613
8614
            /* Preserve the order of packet for flow batching. */
8615
0
            index_map[n_missed] = map_cnt;
8616
0
            flow_map[map_cnt++].flow = NULL;
8617
8618
            /* 'key[n_missed]' contains the key of the current packet and it
8619
             * will be passed to SMC lookup. The next key should be extracted
8620
             * to 'keys[n_missed + 1]'.
8621
             * We also maintain a pointer array to the keys that missed both SMC and EMC
8622
             * which will be returned to the caller for future processing. */
8623
0
            missed_keys[n_missed] = key;
8624
0
            key = &keys[++n_missed];
8625
8626
            /* Skip batching for subsequent packets to avoid reordering. */
8627
0
            batch_enable = false;
8628
0
        }
8629
0
    }
8630
    /* Count of packets which are not flow batched. */
8631
0
    *n_flows = map_cnt;
8632
8633
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, n_phwol_hit);
8634
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MFEX_OPT_HIT,
8635
0
                            n_mfex_opt_hit);
8636
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_HIT,
8637
0
                            n_simple_hit);
8638
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
8639
8640
0
    if (!smc_enable_db) {
8641
0
        return dp_packet_batch_size(packets_);
8642
0
    }
8643
8644
    /* Packets that missed the EMC will do a batch lookup in the SMC, if enabled. */
8645
0
    smc_lookup_batch(pmd, keys, missed_keys, packets_,
8646
0
                     n_missed, flow_map, index_map);
8647
8648
0
    return dp_packet_batch_size(packets_);
8649
0
}
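The mechanical core of dfc_processing() is the compaction done with DP_PACKET_BATCH_REFILL_FOR_EACH: packets that hit a cache are queued to their flow immediately, misses are moved to the front of the batch with 'index_map' remembering their original positions, and the function returns how many packets are left for fast_path_processing().  The sketch below shows just that compaction step, with hypothetical data.

/* Standalone sketch of "move the misses to the front, remember their
 * original order"; hypothetical data, not the OVS structures. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define BURST 8

int
main(void)
{
    int packets[BURST] = { 10, 11, 12, 13, 14, 15, 16, 17 };
    bool hit[BURST] = { true, false, true, false, false, true, true, false };
    unsigned index_map[BURST];          /* Original position of each miss. */
    size_t n_missed = 0;

    for (size_t i = 0; i < BURST; i++) {
        if (hit[i]) {
            /* Hit: already queued to its flow's batch, so its slot may
             * be overwritten below. */
            continue;
        }
        /* Miss: compact towards the front, preserving relative order. */
        index_map[n_missed] = i;
        packets[n_missed++] = packets[i];
    }

    printf("%zu packets continue to the fast path:", n_missed);
    for (size_t i = 0; i < n_missed; i++) {
        printf(" pkt %d (was #%u)", packets[i], index_map[i]);
    }
    printf("\n");
    return 0;
}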
8650
8651
static inline int
8652
handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
8653
                     struct dp_packet *packet,
8654
                     const struct netdev_flow_key *key,
8655
                     struct ofpbuf *actions, struct ofpbuf *put_actions)
8656
0
{
8657
0
    struct ofpbuf *add_actions;
8658
0
    struct dp_packet_batch b;
8659
0
    struct match match;
8660
0
    ovs_u128 ufid;
8661
0
    int error;
8662
0
    uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
8663
0
    odp_port_t orig_in_port = packet->md.orig_in_port;
8664
8665
0
    match.tun_md.valid = false;
8666
0
    miniflow_expand(&key->mf, &match.flow);
8667
0
    memset(&match.wc, 0, sizeof match.wc);
8668
8669
0
    ofpbuf_clear(actions);
8670
0
    ofpbuf_clear(put_actions);
8671
8672
0
    odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
8673
0
    error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
8674
0
                             &ufid, DPIF_UC_MISS, NULL, actions,
8675
0
                             put_actions);
8676
0
    if (OVS_UNLIKELY(error && error != ENOSPC)) {
8677
0
        dp_packet_delete(packet);
8678
0
        COVERAGE_INC(datapath_drop_upcall_error);
8679
0
        return error;
8680
0
    }
8681
8682
    /* The Netlink encoding of datapath flow keys cannot express
8683
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
8684
     * tag is interpreted as exact match on the fact that there is no
8685
     * VLAN.  Unless we refactor a lot of code that translates between
8686
     * Netlink and struct flow representations, we have to do the same
8687
     * here.  This must be in sync with 'match' in dpif_netdev_flow_put(). */
8688
0
    if (!match.wc.masks.vlans[0].tci) {
8689
0
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
8690
0
    }
8691
8692
    /* We can't allow the packet batching in the next loop to execute
8693
     * the actions.  Otherwise, if there are any slow path actions,
8694
     * we'll send the packet up twice. */
8695
0
    dp_packet_batch_init_packet(&b, packet);
8696
0
    dp_netdev_execute_actions(pmd, &b, true, &match.flow,
8697
0
                              actions->data, actions->size);
8698
8699
0
    add_actions = put_actions->size ? put_actions : actions;
8700
0
    if (OVS_LIKELY(error != ENOSPC)) {
8701
0
        struct dp_netdev_flow *netdev_flow;
8702
8703
        /* XXX: There's a race window where a flow covering this packet
8704
         * could have already been installed since we last did the flow
8705
         * lookup before upcall.  This could be solved by moving the
8706
         * mutex lock outside the loop, but that's an awful long time
8707
         * to be locking revalidators out of making flow modifications. */
8708
0
        ovs_mutex_lock(&pmd->flow_mutex);
8709
0
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
8710
0
        if (OVS_LIKELY(!netdev_flow)) {
8711
0
            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
8712
0
                                             add_actions->data,
8713
0
                                             add_actions->size, orig_in_port);
8714
0
        }
8715
0
        ovs_mutex_unlock(&pmd->flow_mutex);
8716
0
        uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
8717
0
        smc_insert(pmd, key, hash);
8718
0
        emc_probabilistic_insert(pmd, key, netdev_flow);
8719
0
    }
8720
0
    if (pmd_perf_metrics_enabled(pmd)) {
8721
        /* Update upcall stats. */
8722
0
        cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
8723
0
        struct pmd_perf_stats *s = &pmd->perf_stats;
8724
0
        s->current.upcalls++;
8725
0
        s->current.upcall_cycles += cycles;
8726
0
        histogram_add_sample(&s->cycles_per_upcall, cycles);
8727
0
    }
8728
0
    return error;
8729
0
}
8730
8731
static inline void
8732
fast_path_processing(struct dp_netdev_pmd_thread *pmd,
8733
                     struct dp_packet_batch *packets_,
8734
                     struct netdev_flow_key **keys,
8735
                     struct dp_packet_flow_map *flow_map,
8736
                     uint8_t *index_map,
8737
                     odp_port_t in_port)
8738
0
{
8739
0
    const size_t cnt = dp_packet_batch_size(packets_);
8740
0
#if !defined(__CHECKER__) && !defined(_WIN32)
8741
0
    const size_t PKT_ARRAY_SIZE = cnt;
8742
#else
8743
    /* Sparse or MSVC doesn't like variable length array. */
8744
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
8745
#endif
8746
0
    struct dp_packet *packet;
8747
0
    struct dpcls *cls;
8748
0
    struct dpcls_rule *rules[PKT_ARRAY_SIZE];
8749
0
    struct dp_netdev *dp = pmd->dp;
8750
0
    int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
8751
0
    int lookup_cnt = 0, add_lookup_cnt;
8752
0
    bool any_miss;
8753
8754
0
    for (size_t i = 0; i < cnt; i++) {
8755
        /* Key length is needed in all cases; the hash is computed on demand. */
8756
0
        keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
8757
0
    }
8758
    /* Get the classifier for the in_port */
8759
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
8760
0
    if (OVS_LIKELY(cls)) {
8761
0
        any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
8762
0
                                rules, cnt, &lookup_cnt);
8763
0
    } else {
8764
0
        any_miss = true;
8765
0
        memset(rules, 0, sizeof(rules));
8766
0
    }
8767
0
    if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
8768
0
        uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
8769
0
        struct ofpbuf actions, put_actions;
8770
8771
0
        ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
8772
0
        ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
8773
8774
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8775
0
            struct dp_netdev_flow *netdev_flow;
8776
8777
0
            if (OVS_LIKELY(rules[i])) {
8778
0
                continue;
8779
0
            }
8780
8781
            /* It's possible that an earlier slow path execution installed
8782
             * a rule covering this flow.  In this case, it's a lot cheaper
8783
             * to catch it here than execute a miss. */
8784
0
            netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
8785
0
                                                    &add_lookup_cnt);
8786
0
            if (netdev_flow) {
8787
0
                lookup_cnt += add_lookup_cnt;
8788
0
                rules[i] = &netdev_flow->cr;
8789
0
                continue;
8790
0
            }
8791
8792
0
            int error = handle_packet_upcall(pmd, packet, keys[i],
8793
0
                                             &actions, &put_actions);
8794
8795
0
            if (OVS_UNLIKELY(error)) {
8796
0
                upcall_fail_cnt++;
8797
0
            } else {
8798
0
                upcall_ok_cnt++;
8799
0
            }
8800
0
        }
8801
8802
0
        ofpbuf_uninit(&actions);
8803
0
        ofpbuf_uninit(&put_actions);
8804
0
        fat_rwlock_unlock(&dp->upcall_rwlock);
8805
0
    } else if (OVS_UNLIKELY(any_miss)) {
8806
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8807
0
            if (OVS_UNLIKELY(!rules[i])) {
8808
0
                dp_packet_delete(packet);
8809
0
                COVERAGE_INC(datapath_drop_lock_error);
8810
0
                upcall_fail_cnt++;
8811
0
            }
8812
0
        }
8813
0
    }
8814
8815
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8816
0
        struct dp_netdev_flow *flow;
8817
        /* Get the original order of this packet in received batch. */
8818
0
        int recv_idx = index_map[i];
8819
0
        uint16_t tcp_flags;
8820
8821
0
        if (OVS_UNLIKELY(!rules[i])) {
8822
0
            continue;
8823
0
        }
8824
8825
0
        flow = dp_netdev_flow_cast(rules[i]);
8826
0
        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
8827
0
        smc_insert(pmd, keys[i], hash);
8828
8829
0
        emc_probabilistic_insert(pmd, keys[i], flow);
8830
        /* Add these packets into the flow map in the same order
8831
         * as received.
8832
         */
8833
0
        tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
8834
0
        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
8835
0
                                   flow_map, recv_idx);
8836
0
    }
8837
8838
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
8839
0
                            cnt - upcall_ok_cnt - upcall_fail_cnt);
8840
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
8841
0
                            lookup_cnt);
8842
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
8843
0
                            upcall_ok_cnt);
8844
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
8845
0
                            upcall_fail_cnt);
8846
0
}
8847
8848
/* Packets enter the datapath from a port (or from recirculation) here.
8849
 *
8850
 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
8851
 * When false the metadata in 'packets' need to be initialized. */
8852
static void
8853
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
8854
                  struct dp_packet_batch *packets,
8855
                  bool md_is_valid, odp_port_t port_no)
8856
0
{
8857
0
#if !defined(__CHECKER__) && !defined(_WIN32)
8858
0
    const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
8859
#else
8860
    /* Sparse or MSVC doesn't like variable length array. */
8861
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
8862
#endif
8863
0
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
8864
0
        struct netdev_flow_key keys[PKT_ARRAY_SIZE];
8865
0
    struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
8866
0
    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
8867
0
    size_t n_batches;
8868
0
    struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
8869
0
    uint8_t index_map[PKT_ARRAY_SIZE];
8870
0
    size_t n_flows, i;
8871
8872
0
    odp_port_t in_port;
8873
8874
0
    n_batches = 0;
8875
0
    dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
8876
0
                   flow_map, &n_flows, index_map, md_is_valid, port_no);
8877
8878
0
    if (!dp_packet_batch_is_empty(packets)) {
8879
        /* Get ingress port from first packet's metadata. */
8880
0
        in_port = packets->packets[0]->md.in_port.odp_port;
8881
0
        fast_path_processing(pmd, packets, missed_keys,
8882
0
                             flow_map, index_map, in_port);
8883
0
    }
8884
8885
    /* Batch the rest of the packets that are in the flow map. */
8886
0
    for (i = 0; i < n_flows; i++) {
8887
0
        struct dp_packet_flow_map *map = &flow_map[i];
8888
8889
0
        if (OVS_UNLIKELY(!map->flow)) {
8890
0
            continue;
8891
0
        }
8892
0
        dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
8893
0
                                batches, &n_batches);
8894
0
     }
8895
8896
    /* All the flow batches need to be reset before any call to
8897
     * packet_batch_per_flow_execute() as it could potentially trigger
8898
     * recirculation. When a packet matching flow 'j' happens to be
8899
     * recirculated, the nested call to dp_netdev_input__() could potentially
8900
     * classify the packet as matching another flow - say 'k'. It could happen
8901
     * that in the previous call to dp_netdev_input__() that same flow 'k' had
8902
     * already its own batches[k] still waiting to be served.  So if its
8903
     * 'batch' member is not reset, the recirculated packet would be wrongly
8904
     * appended to batches[k] of the 1st call to dp_netdev_input__(). */
8905
0
    for (i = 0; i < n_batches; i++) {
8906
0
        batches[i].flow->batch = NULL;
8907
0
    }
8908
8909
0
    for (i = 0; i < n_batches; i++) {
8910
0
        packet_batch_per_flow_execute(&batches[i], pmd);
8911
0
    }
8912
0
}
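The comment inside dp_netdev_input__() explains why every flow's cached 'batch' pointer must be cleared before any batch is executed: recirculation can re-enter this function and would otherwise append packets to a stale batch left over from the previous pass. The sketch below illustrates that back-pointer reset with hypothetical toy_flow/toy_batch structures; it is not the OVS data model, just the aliasing hazard and its fix in isolation.

#include <stddef.h>
#include <stdio.h>

struct toy_batch;

struct toy_flow {
    const char *name;
    struct toy_batch *batch;   /* Batch currently collecting this flow. */
};

struct toy_batch {
    struct toy_flow *flow;
    int n_pkts;
};

/* Append one packet for 'flow', reusing its cached batch when present. */
static void
toy_queue(struct toy_flow *flow, struct toy_batch *batches, size_t *n_batches)
{
    if (!flow->batch) {
        flow->batch = &batches[(*n_batches)++];
        flow->batch->flow = flow;
        flow->batch->n_pkts = 0;
    }
    flow->batch->n_pkts++;
}

int
main(void)
{
    struct toy_flow f = { "flow-k", NULL };
    struct toy_batch pass1[4], pass2[4];
    size_t n1 = 0, n2 = 0;

    toy_queue(&f, pass1, &n1);     /* First pass batches one packet. */

    /* Without this reset, the "recirculated" packet below would be appended
     * to pass1[] even though we are filling pass2[], which is exactly the
     * aliasing the comment in dp_netdev_input__() warns about. */
    for (size_t i = 0; i < n1; i++) {
        pass1[i].flow->batch = NULL;
    }

    toy_queue(&f, pass2, &n2);     /* Second (recirculated) pass. */
    printf("pass1 batches: %zu, pass2 batches: %zu\n", n1, n2);
    return 0;
}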
8913
8914
int32_t
8915
dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
8916
                struct dp_packet_batch *packets,
8917
                odp_port_t port_no)
8918
0
{
8919
0
    dp_netdev_input__(pmd, packets, false, port_no);
8920
0
    return 0;
8921
0
}
8922
8923
static void
8924
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
8925
                      struct dp_packet_batch *packets)
8926
0
{
8927
0
    dp_netdev_input__(pmd, packets, true, 0);
8928
0
}
8929
8930
struct dp_netdev_execute_aux {
8931
    struct dp_netdev_pmd_thread *pmd;
8932
    const struct flow *flow;
8933
};
8934
8935
static void
8936
dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
8937
                                 void *aux)
8938
0
{
8939
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8940
0
    dp->dp_purge_aux = aux;
8941
0
    dp->dp_purge_cb = cb;
8942
0
}
8943
8944
static void
8945
dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
8946
                               void *aux)
8947
0
{
8948
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8949
0
    dp->upcall_aux = aux;
8950
0
    dp->upcall_cb = cb;
8951
0
}
8952
8953
static void
8954
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
8955
                               bool purge)
8956
0
{
8957
0
    struct tx_port *tx;
8958
0
    struct dp_netdev_port *port;
8959
0
    long long interval;
8960
8961
0
    HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
8962
0
        if (tx->port->txq_mode != TXQ_MODE_XPS) {
8963
0
            continue;
8964
0
        }
8965
0
        interval = pmd->ctx.now - tx->last_used;
8966
0
        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
8967
0
            port = tx->port;
8968
0
            ovs_mutex_lock(&port->txq_used_mutex);
8969
0
            port->txq_used[tx->qid]--;
8970
0
            ovs_mutex_unlock(&port->txq_used_mutex);
8971
0
            tx->qid = -1;
8972
0
        }
8973
0
    }
8974
0
}
8975
8976
static int
8977
dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
8978
                           struct tx_port *tx)
8979
0
{
8980
0
    struct dp_netdev_port *port;
8981
0
    long long interval;
8982
0
    int i, min_cnt, min_qid;
8983
8984
0
    interval = pmd->ctx.now - tx->last_used;
8985
0
    tx->last_used = pmd->ctx.now;
8986
8987
0
    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
8988
0
        return tx->qid;
8989
0
    }
8990
8991
0
    port = tx->port;
8992
8993
0
    ovs_mutex_lock(&port->txq_used_mutex);
8994
0
    if (tx->qid >= 0) {
8995
0
        port->txq_used[tx->qid]--;
8996
0
        tx->qid = -1;
8997
0
    }
8998
8999
0
    min_cnt = -1;
9000
0
    min_qid = 0;
9001
0
    for (i = 0; i < netdev_n_txq(port->netdev); i++) {
9002
0
        if (port->txq_used[i] < min_cnt || min_cnt == -1) {
9003
0
            min_cnt = port->txq_used[i];
9004
0
            min_qid = i;
9005
0
        }
9006
0
    }
9007
9008
0
    port->txq_used[min_qid]++;
9009
0
    tx->qid = min_qid;
9010
9011
0
    ovs_mutex_unlock(&port->txq_used_mutex);
9012
9013
0
    dpif_netdev_xps_revalidate_pmd(pmd, false);
9014
9015
0
    VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
9016
0
             pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
9017
0
    return min_qid;
9018
0
}
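dpif_netdev_xps_get_tx_qid() above assigns the transmit queue with the smallest usage count and then bumps that count. A self-contained sketch of just that selection step follows; pick_least_used_txq() and its arguments are illustrative names, not OVS functions.

#include <stdio.h>

/* Pick the index of the least-used queue and account for the new user.
 * Ties go to the lowest index, matching the strict '<' comparison above. */
static int
pick_least_used_txq(int *txq_used, int n_txq)
{
    int min_cnt = -1;
    int min_qid = 0;

    for (int i = 0; i < n_txq; i++) {
        if (min_cnt == -1 || txq_used[i] < min_cnt) {
            min_cnt = txq_used[i];
            min_qid = i;
        }
    }
    txq_used[min_qid]++;
    return min_qid;
}

int
main(void)
{
    int used[4] = { 2, 0, 1, 3 };

    printf("assigned qid %d\n", pick_least_used_txq(used, 4));  /* 1 */
    printf("assigned qid %d\n", pick_least_used_txq(used, 4));  /* 1 again:
                                                                 * queues 1 and
                                                                 * 2 tie at 1. */
    return 0;
}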
9019
9020
static struct tx_port *
9021
pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
9022
                          odp_port_t port_no)
9023
0
{
9024
0
    return tx_port_lookup(&pmd->tnl_port_cache, port_no);
9025
0
}
9026
9027
static struct tx_port *
9028
pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
9029
                           odp_port_t port_no)
9030
0
{
9031
0
    return tx_port_lookup(&pmd->send_port_cache, port_no);
9032
0
}
9033
9034
static int
9035
push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
9036
                const struct nlattr *attr,
9037
                struct dp_packet_batch *batch)
9038
0
{
9039
0
    struct tx_port *tun_port;
9040
0
    const struct ovs_action_push_tnl *data;
9041
0
    int err;
9042
9043
0
    data = nl_attr_get(attr);
9044
9045
0
    tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
9046
0
    if (!tun_port) {
9047
0
        err = -EINVAL;
9048
0
        goto error;
9049
0
    }
9050
0
    err = netdev_push_header(tun_port->port->netdev, batch, data);
9051
0
    if (!err) {
9052
0
        return 0;
9053
0
    }
9054
0
error:
9055
0
    dp_packet_delete_batch(batch, true);
9056
0
    return err;
9057
0
}
9058
9059
static void
9060
dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
9061
                            struct dp_packet *packet, bool should_steal,
9062
                            struct flow *flow, ovs_u128 *ufid,
9063
                            struct ofpbuf *actions,
9064
                            const struct nlattr *userdata)
9065
0
{
9066
0
    struct dp_packet_batch b;
9067
0
    int error;
9068
9069
0
    ofpbuf_clear(actions);
9070
9071
0
    error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
9072
0
                             DPIF_UC_ACTION, userdata, actions,
9073
0
                             NULL);
9074
0
    if (!error || error == ENOSPC) {
9075
0
        dp_packet_batch_init_packet(&b, packet);
9076
0
        dp_netdev_execute_actions(pmd, &b, should_steal, flow,
9077
0
                                  actions->data, actions->size);
9078
0
    } else if (should_steal) {
9079
0
        dp_packet_delete(packet);
9080
0
        COVERAGE_INC(datapath_drop_userspace_action_error);
9081
0
    }
9082
0
}
9083
9084
static bool
9085
dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
9086
                         struct dp_packet_batch *packets_,
9087
                         bool should_steal, odp_port_t port_no)
9088
0
{
9089
0
    struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
9090
0
    struct dp_packet_batch out;
9091
9092
0
    if (!OVS_LIKELY(p)) {
9093
0
        COVERAGE_ADD(datapath_drop_invalid_port,
9094
0
                     dp_packet_batch_size(packets_));
9095
0
        dp_packet_delete_batch(packets_, should_steal);
9096
0
        return false;
9097
0
    }
9098
0
    if (!should_steal) {
9099
0
        dp_packet_batch_clone(&out, packets_);
9100
0
        dp_packet_batch_reset_cutlen(packets_);
9101
0
        packets_ = &out;
9102
0
    }
9103
0
    dp_packet_batch_apply_cutlen(packets_);
9104
#ifdef DPDK_NETDEV
9105
    if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
9106
                     && packets_->packets[0]->source
9107
                        != p->output_pkts.packets[0]->source)) {
9108
        /* XXX: netdev-dpdk assumes that all packets in a single
9109
         *      output batch have the same source. Flush here to
9110
         *      avoid memory access issues. */
9111
        dp_netdev_pmd_flush_output_on_port(pmd, p);
9112
    }
9113
#endif
9114
0
    if (dp_packet_batch_size(&p->output_pkts)
9115
0
        + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
9116
        /* Flush here to avoid overflow. */
9117
0
        dp_netdev_pmd_flush_output_on_port(pmd, p);
9118
0
    }
9119
0
    if (dp_packet_batch_is_empty(&p->output_pkts)) {
9120
0
        pmd->n_output_batches++;
9121
0
    }
9122
9123
0
    struct dp_packet *packet;
9124
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
9125
0
        p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
9126
0
            pmd->ctx.last_rxq;
9127
0
        dp_packet_batch_add(&p->output_pkts, packet);
9128
0
    }
9129
0
    return true;
9130
0
}
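dp_execute_output_action() buffers packets on the destination port and flushes the pending batch first whenever adding more would exceed NETDEV_MAX_BURST. The toy below shows only that overflow check; TOY_MAX_BURST, toy_txq and the helpers are made-up stand-ins with a deliberately tiny burst size.

#include <stdio.h>

#define TOY_MAX_BURST 4   /* Stand-in for NETDEV_MAX_BURST. */

struct toy_txq {
    int pending;           /* Packets buffered but not yet sent. */
    int flushes;
};

static void
toy_flush(struct toy_txq *q)
{
    if (q->pending) {
        q->flushes++;
        q->pending = 0;
    }
}

/* Buffer 'n' more packets, flushing first if they would not fit; this is
 * the same overflow check performed before appending to p->output_pkts. */
static void
toy_enqueue(struct toy_txq *q, int n)
{
    if (q->pending + n > TOY_MAX_BURST) {
        toy_flush(q);
    }
    q->pending += n;
}

int
main(void)
{
    struct toy_txq q = { 0, 0 };

    toy_enqueue(&q, 3);
    toy_enqueue(&q, 3);    /* Triggers one flush before buffering. */
    printf("pending=%d flushes=%d\n", q.pending, q.flushes);
    /* Prints "pending=3 flushes=1". */
    return 0;
}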
9131
9132
static void
9133
dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
9134
                            struct dp_packet_batch *packets_,
9135
                            bool should_steal, uint32_t bond)
9136
0
{
9137
0
    struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
9138
0
    struct dp_packet_batch out;
9139
0
    struct dp_packet *packet;
9140
9141
0
    if (!p_bond) {
9142
0
        COVERAGE_ADD(datapath_drop_invalid_bond,
9143
0
                     dp_packet_batch_size(packets_));
9144
0
        dp_packet_delete_batch(packets_, should_steal);
9145
0
        return;
9146
0
    }
9147
0
    if (!should_steal) {
9148
0
        dp_packet_batch_clone(&out, packets_);
9149
0
        dp_packet_batch_reset_cutlen(packets_);
9150
0
        packets_ = &out;
9151
0
    }
9152
0
    dp_packet_batch_apply_cutlen(packets_);
9153
9154
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
9155
        /*
9156
         * Look up the bond-hash table using the hash to get the member.
9157
         */
9158
0
        uint32_t hash = dp_packet_get_rss_hash(packet);
9159
0
        struct member_entry *s_entry
9160
0
            = &p_bond->member_buckets[hash & BOND_MASK];
9161
0
        odp_port_t bond_member = s_entry->member_id;
9162
0
        uint32_t size = dp_packet_size(packet);
9163
0
        struct dp_packet_batch output_pkt;
9164
9165
0
        dp_packet_batch_init_packet(&output_pkt, packet);
9166
0
        if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
9167
0
                                                bond_member))) {
9168
            /* Update member stats. */
9169
0
            non_atomic_ullong_add(&s_entry->n_packets, 1);
9170
0
            non_atomic_ullong_add(&s_entry->n_bytes, size);
9171
0
        }
9172
0
    }
9173
0
}
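dp_execute_lb_output_action() above indexes the bond's bucket table with the packet's RSS hash masked by BOND_MASK and outputs to the member stored in that bucket. The sketch below isolates that indexing step; TOY_BOND_BUCKETS is an arbitrary illustrative size, not necessarily the real BOND_BUCKETS value, and the structures are not the OVS ones.

#include <stdint.h>
#include <stdio.h>

#define TOY_BOND_BUCKETS 256
#define TOY_BOND_MASK    (TOY_BOND_BUCKETS - 1)

struct toy_bucket {
    uint32_t member_port;      /* Port chosen for flows hashing here. */
    uint64_t n_packets;        /* Per-bucket stats, as in member_buckets[]. */
};

/* Map a packet's RSS hash to a bond member via the same
 * 'hash & mask' indexing used in the loop above. */
static uint32_t
toy_select_member(struct toy_bucket *buckets, uint32_t rss_hash)
{
    struct toy_bucket *b = &buckets[rss_hash & TOY_BOND_MASK];

    b->n_packets++;
    return b->member_port;
}

int
main(void)
{
    struct toy_bucket buckets[TOY_BOND_BUCKETS];

    for (int i = 0; i < TOY_BOND_BUCKETS; i++) {
        buckets[i].member_port = (i % 2) ? 10 : 20;  /* Two members. */
        buckets[i].n_packets = 0;
    }
    /* 0x1234 & 0xff == 0x34 (even bucket), so member port 20 is chosen. */
    printf("hash 0x1234 -> port %u\n", toy_select_member(buckets, 0x1234));
    return 0;
}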
9174
9175
static void
9176
dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
9177
              const struct nlattr *a, bool should_steal)
9178
    OVS_NO_THREAD_SAFETY_ANALYSIS
9179
0
{
9180
0
    struct dp_netdev_execute_aux *aux = aux_;
9181
0
    uint32_t *depth = recirc_depth_get();
9182
0
    struct dp_netdev_pmd_thread *pmd = aux->pmd;
9183
0
    struct dp_netdev *dp = pmd->dp;
9184
0
    int type = nl_attr_type(a);
9185
0
    struct tx_port *p;
9186
0
    uint32_t packet_count, packets_dropped;
9187
9188
0
    switch ((enum ovs_action_attr)type) {
9189
0
    case OVS_ACTION_ATTR_OUTPUT:
9190
0
        dp_execute_output_action(pmd, packets_, should_steal,
9191
0
                                 nl_attr_get_odp_port(a));
9192
0
        return;
9193
9194
0
    case OVS_ACTION_ATTR_LB_OUTPUT:
9195
0
        dp_execute_lb_output_action(pmd, packets_, should_steal,
9196
0
                                    nl_attr_get_u32(a));
9197
0
        return;
9198
9199
0
    case OVS_ACTION_ATTR_TUNNEL_PUSH:
9200
0
        if (should_steal) {
9201
            /* We're requested to push a tunnel header, but we also need to take
9202
             * ownership of these packets. Thus, we can avoid performing
9203
             * the action, because the caller will not use the result anyway.
9204
             * Just break to free the batch. */
9205
0
            break;
9206
0
        }
9207
0
        dp_packet_batch_apply_cutlen(packets_);
9208
0
        packet_count = dp_packet_batch_size(packets_);
9209
0
        if (push_tnl_action(pmd, a, packets_)) {
9210
0
            COVERAGE_ADD(datapath_drop_tunnel_push_error,
9211
0
                         packet_count);
9212
0
        }
9213
0
        return;
9214
9215
0
    case OVS_ACTION_ATTR_TUNNEL_POP:
9216
0
        if (*depth < MAX_RECIRC_DEPTH) {
9217
0
            struct dp_packet_batch *orig_packets_ = packets_;
9218
0
            odp_port_t portno = nl_attr_get_odp_port(a);
9219
9220
0
            p = pmd_tnl_port_cache_lookup(pmd, portno);
9221
0
            if (p) {
9222
0
                struct dp_packet_batch tnl_pkt;
9223
9224
0
                if (!should_steal) {
9225
0
                    dp_packet_batch_clone(&tnl_pkt, packets_);
9226
0
                    packets_ = &tnl_pkt;
9227
0
                    dp_packet_batch_reset_cutlen(orig_packets_);
9228
0
                }
9229
9230
0
                dp_packet_batch_apply_cutlen(packets_);
9231
9232
0
                packet_count = dp_packet_batch_size(packets_);
9233
0
                netdev_pop_header(p->port->netdev, packets_);
9234
0
                packets_dropped =
9235
0
                   packet_count - dp_packet_batch_size(packets_);
9236
0
                if (packets_dropped) {
9237
0
                    COVERAGE_ADD(datapath_drop_tunnel_pop_error,
9238
0
                                 packets_dropped);
9239
0
                }
9240
0
                if (dp_packet_batch_is_empty(packets_)) {
9241
0
                    return;
9242
0
                }
9243
9244
0
                struct dp_packet *packet;
9245
0
                DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
9246
0
                    packet->md.in_port.odp_port = portno;
9247
0
                }
9248
9249
0
                (*depth)++;
9250
0
                dp_netdev_recirculate(pmd, packets_);
9251
0
                (*depth)--;
9252
0
                return;
9253
0
            }
9254
0
            COVERAGE_ADD(datapath_drop_invalid_tnl_port,
9255
0
                         dp_packet_batch_size(packets_));
9256
0
        } else {
9257
0
            COVERAGE_ADD(datapath_drop_recirc_error,
9258
0
                         dp_packet_batch_size(packets_));
9259
0
        }
9260
0
        break;
9261
9262
0
    case OVS_ACTION_ATTR_USERSPACE:
9263
0
        if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
9264
0
            struct dp_packet_batch *orig_packets_ = packets_;
9265
0
            const struct nlattr *userdata;
9266
0
            struct dp_packet_batch usr_pkt;
9267
0
            struct ofpbuf actions;
9268
0
            struct flow flow;
9269
0
            ovs_u128 ufid;
9270
0
            bool clone = false;
9271
9272
0
            userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
9273
0
            ofpbuf_init(&actions, 0);
9274
9275
0
            if (packets_->trunc) {
9276
0
                if (!should_steal) {
9277
0
                    dp_packet_batch_clone(&usr_pkt, packets_);
9278
0
                    packets_ = &usr_pkt;
9279
0
                    clone = true;
9280
0
                    dp_packet_batch_reset_cutlen(orig_packets_);
9281
0
                }
9282
9283
0
                dp_packet_batch_apply_cutlen(packets_);
9284
0
            }
9285
9286
0
            struct dp_packet *packet;
9287
0
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
9288
0
                flow_extract(packet, &flow);
9289
0
                odp_flow_key_hash(&flow, sizeof flow, &ufid);
9290
0
                dp_execute_userspace_action(pmd, packet, should_steal, &flow,
9291
0
                                            &ufid, &actions, userdata);
9292
0
            }
9293
9294
0
            if (clone) {
9295
0
                dp_packet_delete_batch(packets_, true);
9296
0
            }
9297
9298
0
            ofpbuf_uninit(&actions);
9299
0
            fat_rwlock_unlock(&dp->upcall_rwlock);
9300
9301
0
            return;
9302
0
        }
9303
0
        COVERAGE_ADD(datapath_drop_lock_error,
9304
0
                     dp_packet_batch_size(packets_));
9305
0
        break;
9306
9307
0
    case OVS_ACTION_ATTR_RECIRC:
9308
0
        if (*depth < MAX_RECIRC_DEPTH) {
9309
0
            struct dp_packet_batch recirc_pkts;
9310
9311
0
            if (!should_steal) {
9312
0
               dp_packet_batch_clone(&recirc_pkts, packets_);
9313
0
               packets_ = &recirc_pkts;
9314
0
            }
9315
9316
0
            struct dp_packet *packet;
9317
0
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
9318
0
                packet->md.recirc_id = nl_attr_get_u32(a);
9319
0
            }
9320
9321
0
            (*depth)++;
9322
0
            dp_netdev_recirculate(pmd, packets_);
9323
0
            (*depth)--;
9324
9325
0
            return;
9326
0
        }
9327
9328
0
        COVERAGE_ADD(datapath_drop_recirc_error,
9329
0
                     dp_packet_batch_size(packets_));
9330
0
        VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
9331
0
        break;
9332
9333
0
    case OVS_ACTION_ATTR_CT: {
9334
0
        const struct nlattr *b;
9335
0
        bool force = false;
9336
0
        bool commit = false;
9337
0
        unsigned int left;
9338
0
        uint16_t zone = 0;
9339
0
        uint32_t tp_id = 0;
9340
0
        const char *helper = NULL;
9341
0
        const uint32_t *setmark = NULL;
9342
0
        const struct ovs_key_ct_labels *setlabel = NULL;
9343
0
        struct nat_action_info_t nat_action_info;
9344
0
        struct nat_action_info_t *nat_action_info_ref = NULL;
9345
0
        bool nat_config = false;
9346
9347
0
        NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
9348
0
                                 nl_attr_get_size(a)) {
9349
0
            enum ovs_ct_attr sub_type = nl_attr_type(b);
9350
9351
0
            switch(sub_type) {
9352
0
            case OVS_CT_ATTR_FORCE_COMMIT:
9353
0
                force = true;
9354
                /* fall through. */
9355
0
            case OVS_CT_ATTR_COMMIT:
9356
0
                commit = true;
9357
0
                break;
9358
0
            case OVS_CT_ATTR_ZONE:
9359
0
                zone = nl_attr_get_u16(b);
9360
0
                break;
9361
0
            case OVS_CT_ATTR_HELPER:
9362
0
                helper = nl_attr_get_string(b);
9363
0
                break;
9364
0
            case OVS_CT_ATTR_MARK:
9365
0
                setmark = nl_attr_get(b);
9366
0
                break;
9367
0
            case OVS_CT_ATTR_LABELS:
9368
0
                setlabel = nl_attr_get(b);
9369
0
                break;
9370
0
            case OVS_CT_ATTR_EVENTMASK:
9371
                /* Silently ignored, as userspace datapath does not generate
9372
                 * netlink events. */
9373
0
                break;
9374
0
            case OVS_CT_ATTR_TIMEOUT:
9375
0
                if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) {
9376
0
                    VLOG_WARN("Invalid Timeout Policy ID: %s.",
9377
0
                              nl_attr_get_string(b));
9378
0
                    tp_id = DEFAULT_TP_ID;
9379
0
                }
9380
0
                break;
9381
0
            case OVS_CT_ATTR_NAT: {
9382
0
                const struct nlattr *b_nest;
9383
0
                unsigned int left_nest;
9384
0
                bool ip_min_specified = false;
9385
0
                bool proto_num_min_specified = false;
9386
0
                bool ip_max_specified = false;
9387
0
                bool proto_num_max_specified = false;
9388
0
                memset(&nat_action_info, 0, sizeof nat_action_info);
9389
0
                nat_action_info_ref = &nat_action_info;
9390
9391
0
                NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
9392
0
                    enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
9393
9394
0
                    switch (sub_type_nest) {
9395
0
                    case OVS_NAT_ATTR_SRC:
9396
0
                    case OVS_NAT_ATTR_DST:
9397
0
                        nat_config = true;
9398
0
                        nat_action_info.nat_action |=
9399
0
                            ((sub_type_nest == OVS_NAT_ATTR_SRC)
9400
0
                                ? NAT_ACTION_SRC : NAT_ACTION_DST);
9401
0
                        break;
9402
0
                    case OVS_NAT_ATTR_IP_MIN:
9403
0
                        memcpy(&nat_action_info.min_addr,
9404
0
                               nl_attr_get(b_nest),
9405
0
                               nl_attr_get_size(b_nest));
9406
0
                        ip_min_specified = true;
9407
0
                        break;
9408
0
                    case OVS_NAT_ATTR_IP_MAX:
9409
0
                        memcpy(&nat_action_info.max_addr,
9410
0
                               nl_attr_get(b_nest),
9411
0
                               nl_attr_get_size(b_nest));
9412
0
                        ip_max_specified = true;
9413
0
                        break;
9414
0
                    case OVS_NAT_ATTR_PROTO_MIN:
9415
0
                        nat_action_info.min_port =
9416
0
                            nl_attr_get_u16(b_nest);
9417
0
                        proto_num_min_specified = true;
9418
0
                        break;
9419
0
                    case OVS_NAT_ATTR_PROTO_MAX:
9420
0
                        nat_action_info.max_port =
9421
0
                            nl_attr_get_u16(b_nest);
9422
0
                        proto_num_max_specified = true;
9423
0
                        break;
9424
0
                    case OVS_NAT_ATTR_PROTO_RANDOM:
9425
0
                        nat_action_info.nat_flags |= NAT_RANGE_RANDOM;
9426
0
                        break;
9427
0
                    case OVS_NAT_ATTR_PERSISTENT:
9428
0
                        nat_action_info.nat_flags |= NAT_PERSISTENT;
9429
0
                        break;
9430
0
                    case OVS_NAT_ATTR_PROTO_HASH:
9431
0
                        break;
9432
0
                    case OVS_NAT_ATTR_UNSPEC:
9433
0
                    case __OVS_NAT_ATTR_MAX:
9434
0
                        OVS_NOT_REACHED();
9435
0
                    }
9436
0
                }
9437
9438
0
                if (ip_min_specified && !ip_max_specified) {
9439
0
                    nat_action_info.max_addr = nat_action_info.min_addr;
9440
0
                }
9441
0
                if (proto_num_min_specified && !proto_num_max_specified) {
9442
0
                    nat_action_info.max_port = nat_action_info.min_port;
9443
0
                }
9444
0
                if (proto_num_min_specified || proto_num_max_specified) {
9445
0
                    if (nat_action_info.nat_action & NAT_ACTION_SRC) {
9446
0
                        nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
9447
0
                    } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
9448
0
                        nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
9449
0
                    }
9450
0
                }
9451
0
                break;
9452
0
            }
9453
0
            case OVS_CT_ATTR_UNSPEC:
9454
0
            case __OVS_CT_ATTR_MAX:
9455
0
                OVS_NOT_REACHED();
9456
0
            }
9457
0
        }
9458
9459
        /* We won't be able to function properly in this case, hence
9460
         * complain loudly. */
9461
0
        if (nat_config && !commit) {
9462
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
9463
0
            VLOG_WARN_RL(&rl, "NAT specified without commit.");
9464
0
        }
9465
9466
0
        conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
9467
0
                          commit, zone, setmark, setlabel, helper,
9468
0
                          nat_action_info_ref, pmd->ctx.now / 1000, tp_id);
9469
0
        break;
9470
0
    }
9471
9472
0
    case OVS_ACTION_ATTR_METER:
9473
0
        dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
9474
0
                            pmd->ctx.now / 1000);
9475
0
        break;
9476
9477
0
    case OVS_ACTION_ATTR_PUSH_VLAN:
9478
0
    case OVS_ACTION_ATTR_POP_VLAN:
9479
0
    case OVS_ACTION_ATTR_PUSH_MPLS:
9480
0
    case OVS_ACTION_ATTR_POP_MPLS:
9481
0
    case OVS_ACTION_ATTR_SET:
9482
0
    case OVS_ACTION_ATTR_SET_MASKED:
9483
0
    case OVS_ACTION_ATTR_SAMPLE:
9484
0
    case OVS_ACTION_ATTR_HASH:
9485
0
    case OVS_ACTION_ATTR_UNSPEC:
9486
0
    case OVS_ACTION_ATTR_TRUNC:
9487
0
    case OVS_ACTION_ATTR_PUSH_ETH:
9488
0
    case OVS_ACTION_ATTR_POP_ETH:
9489
0
    case OVS_ACTION_ATTR_CLONE:
9490
0
    case OVS_ACTION_ATTR_PUSH_NSH:
9491
0
    case OVS_ACTION_ATTR_POP_NSH:
9492
0
    case OVS_ACTION_ATTR_CT_CLEAR:
9493
0
    case OVS_ACTION_ATTR_CHECK_PKT_LEN:
9494
0
    case OVS_ACTION_ATTR_DROP:
9495
0
    case OVS_ACTION_ATTR_ADD_MPLS:
9496
0
    case OVS_ACTION_ATTR_DEC_TTL:
9497
0
    case OVS_ACTION_ATTR_PSAMPLE:
9498
0
    case __OVS_ACTION_ATTR_MAX:
9499
0
        OVS_NOT_REACHED();
9500
0
    }
9501
9502
0
    dp_packet_delete_batch(packets_, should_steal);
9503
0
}
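Both the TUNNEL_POP and RECIRC cases above guard re-entry with a per-thread depth counter: they recurse only while the counter is below MAX_RECIRC_DEPTH, incrementing it around the nested call and dropping the packets otherwise. The standalone sketch below shows that bound with a single static counter (the real code obtains a per-thread value via recirc_depth_get()); TOY_MAX_DEPTH and toy_recirculate() are hypothetical names.

#include <stdio.h>

#define TOY_MAX_DEPTH 8   /* Mirrors the MAX_RECIRC_DEPTH bound used above. */

static unsigned int toy_depth;   /* Per thread in the real code. */

/* Re-enter processing only while the depth bound holds, incrementing the
 * counter around the recursive call and "dropping" once it is exceeded. */
static void
toy_recirculate(int packet_id)
{
    if (toy_depth >= TOY_MAX_DEPTH) {
        printf("packet %d dropped: max recirculation depth exceeded\n",
               packet_id);
        return;
    }
    toy_depth++;
    toy_recirculate(packet_id);   /* Always recirculates, to show the bound. */
    toy_depth--;
}

int
main(void)
{
    toy_recirculate(1);           /* Prints the drop message exactly once. */
    return 0;
}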
9504
9505
static void
9506
dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
9507
                          struct dp_packet_batch *packets,
9508
                          bool should_steal, const struct flow *flow,
9509
                          const struct nlattr *actions, size_t actions_len)
9510
0
{
9511
0
    struct dp_netdev_execute_aux aux = { pmd, flow };
9512
9513
0
    odp_execute_actions(&aux, packets, should_steal, actions,
9514
0
                        actions_len, dp_execute_cb);
9515
0
}
9516
9517
struct dp_netdev_ct_dump {
9518
    struct ct_dpif_dump_state up;
9519
    struct conntrack_dump dump;
9520
    struct conntrack *ct;
9521
    struct dp_netdev *dp;
9522
};
9523
9524
static int
9525
dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
9526
                          const uint16_t *pzone, int *ptot_bkts)
9527
0
{
9528
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9529
0
    struct dp_netdev_ct_dump *dump;
9530
9531
0
    dump = xzalloc(sizeof *dump);
9532
0
    dump->dp = dp;
9533
0
    dump->ct = dp->conntrack;
9534
9535
0
    conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
9536
9537
0
    *dump_ = &dump->up;
9538
9539
0
    return 0;
9540
0
}
9541
9542
static int
9543
dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
9544
                         struct ct_dpif_dump_state *dump_,
9545
                         struct ct_dpif_entry *entry)
9546
0
{
9547
0
    struct dp_netdev_ct_dump *dump;
9548
9549
0
    INIT_CONTAINER(dump, dump_, up);
9550
9551
0
    return conntrack_dump_next(&dump->dump, entry);
9552
0
}
9553
9554
static int
9555
dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
9556
                         struct ct_dpif_dump_state *dump_)
9557
0
{
9558
0
    struct dp_netdev_ct_dump *dump;
9559
0
    int err;
9560
9561
0
    INIT_CONTAINER(dump, dump_, up);
9562
9563
0
    err = conntrack_dump_done(&dump->dump);
9564
9565
0
    free(dump);
9566
9567
0
    return err;
9568
0
}
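The conntrack dump callbacks above hand callers a pointer to the embedded 'up' member and later recover the enclosing dp_netdev_ct_dump with INIT_CONTAINER(). The sketch below shows the underlying container-of idiom with hypothetical toy_* structures; TOY_CONTAINER_OF is a local macro written for this example, not the OVS one.

#include <stddef.h>
#include <stdio.h>

/* Generic base state handed out to callers. */
struct toy_dump_state {
    int generation;
};

/* Private state embedding the base, as dp_netdev_ct_dump embeds
 * ct_dpif_dump_state in its 'up' member. */
struct toy_ct_dump {
    struct toy_dump_state up;
    int bucket;
};

/* Recover the enclosing structure from a pointer to one of its members. */
#define TOY_CONTAINER_OF(ptr, type, member) \
    ((type *) ((char *) (ptr) - offsetof(type, member)))

int
main(void)
{
    struct toy_ct_dump dump = { .up = { .generation = 7 }, .bucket = 3 };
    struct toy_dump_state *base = &dump.up;       /* What callers hold. */
    struct toy_ct_dump *full =
        TOY_CONTAINER_OF(base, struct toy_ct_dump, up);

    printf("bucket %d, generation %d\n", full->bucket, full->up.generation);
    return 0;
}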
9569
9570
static int
9571
dpif_netdev_ct_exp_dump_start(struct dpif *dpif,
9572
                              struct ct_dpif_dump_state **dump_,
9573
                              const uint16_t *pzone)
9574
0
{
9575
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9576
0
    struct dp_netdev_ct_dump *dump;
9577
9578
0
    dump = xzalloc(sizeof *dump);
9579
0
    dump->dp = dp;
9580
0
    dump->ct = dp->conntrack;
9581
9582
0
    conntrack_exp_dump_start(dp->conntrack, &dump->dump, pzone);
9583
9584
0
    *dump_ = &dump->up;
9585
9586
0
    return 0;
9587
0
}
9588
9589
static int
9590
dpif_netdev_ct_exp_dump_next(struct dpif *dpif OVS_UNUSED,
9591
                             struct ct_dpif_dump_state *dump_,
9592
                             struct ct_dpif_exp *entry)
9593
0
{
9594
0
    struct dp_netdev_ct_dump *dump;
9595
9596
0
    INIT_CONTAINER(dump, dump_, up);
9597
9598
0
    return conntrack_exp_dump_next(&dump->dump, entry);
9599
0
}
9600
9601
static int
9602
dpif_netdev_ct_exp_dump_done(struct dpif *dpif OVS_UNUSED,
9603
                             struct ct_dpif_dump_state *dump_)
9604
0
{
9605
0
    struct dp_netdev_ct_dump *dump;
9606
0
    int err;
9607
9608
0
    INIT_CONTAINER(dump, dump_, up);
9609
9610
0
    err = conntrack_exp_dump_done(&dump->dump);
9611
9612
0
    free(dump);
9613
9614
0
    return err;
9615
0
}
9616
9617
static int
9618
dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
9619
                     const struct ct_dpif_tuple *tuple)
9620
0
{
9621
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9622
9623
0
    if (tuple) {
9624
0
        return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
9625
0
    }
9626
0
    return conntrack_flush(dp->conntrack, zone);
9627
0
}
9628
9629
static int
9630
dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
9631
0
{
9632
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9633
9634
0
    return conntrack_set_maxconns(dp->conntrack, maxconns);
9635
0
}
9636
9637
static int
9638
dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
9639
0
{
9640
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9641
9642
0
    return conntrack_get_maxconns(dp->conntrack, maxconns);
9643
0
}
9644
9645
static int
9646
dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
9647
0
{
9648
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9649
9650
0
    return conntrack_get_nconns(dp->conntrack, nconns);
9651
0
}
9652
9653
static int
9654
dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
9655
0
{
9656
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9657
9658
0
    return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
9659
0
}
9660
9661
static int
9662
dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
9663
0
{
9664
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9665
0
    *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
9666
0
    return 0;
9667
0
}
9668
9669
static int
9670
dpif_netdev_ct_set_sweep_interval(struct dpif *dpif, uint32_t ms)
9671
0
{
9672
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9673
0
    return conntrack_set_sweep_interval(dp->conntrack, ms);
9674
0
}
9675
9676
static int
9677
dpif_netdev_ct_get_sweep_interval(struct dpif *dpif, uint32_t *ms)
9678
0
{
9679
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9680
0
    *ms = conntrack_get_sweep_interval(dp->conntrack);
9681
0
    return 0;
9682
0
}
9683
9684
static int
9685
dpif_netdev_ct_set_limits(struct dpif *dpif,
9686
                           const struct ovs_list *zone_limits)
9687
0
{
9688
0
    int err = 0;
9689
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9690
9691
0
    struct ct_dpif_zone_limit *zone_limit;
9692
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
9693
0
        err = zone_limit_update(dp->conntrack, zone_limit->zone,
9694
0
                                zone_limit->limit);
9695
0
        if (err != 0) {
9696
0
            break;
9697
0
        }
9698
0
    }
9699
0
    return err;
9700
0
}
9701
9702
static int
9703
dpif_netdev_ct_get_limits(struct dpif *dpif,
9704
                           const struct ovs_list *zone_limits_request,
9705
                           struct ovs_list *zone_limits_reply)
9706
0
{
9707
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9708
0
    struct conntrack_zone_info czl;
9709
9710
0
    if (!ovs_list_is_empty(zone_limits_request)) {
9711
0
        struct ct_dpif_zone_limit *zone_limit;
9712
0
        LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
9713
0
            czl = zone_limit_get(dp->conntrack, zone_limit->zone);
9714
0
            if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
9715
0
                ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
9716
0
                                        czl.limit,
9717
0
                                        czl.count);
9718
0
            } else {
9719
0
                return EINVAL;
9720
0
            }
9721
0
        }
9722
0
    } else {
9723
0
        czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
9724
0
        if (czl.zone == DEFAULT_ZONE) {
9725
0
            ct_dpif_push_zone_limit(zone_limits_reply, DEFAULT_ZONE,
9726
0
                                    czl.limit, 0);
9727
0
        }
9728
9729
0
        for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
9730
0
            czl = zone_limit_get(dp->conntrack, z);
9731
0
            if (czl.zone == z) {
9732
0
                ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
9733
0
                                        czl.count);
9734
0
            }
9735
0
        }
9736
0
    }
9737
9738
0
    return 0;
9739
0
}
9740
9741
static int
9742
dpif_netdev_ct_del_limits(struct dpif *dpif,
9743
                           const struct ovs_list *zone_limits)
9744
0
{
9745
0
    int err = 0;
9746
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9747
0
    struct ct_dpif_zone_limit *zone_limit;
9748
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
9749
0
        err = zone_limit_delete(dp->conntrack, zone_limit->zone);
9750
0
        if (err != 0) {
9751
0
            break;
9752
0
        }
9753
0
    }
9754
9755
0
    return err;
9756
0
}
9757
9758
static int
9759
dpif_netdev_ct_get_features(struct dpif *dpif OVS_UNUSED,
9760
                            enum ct_features *features)
9761
0
{
9762
0
    if (features != NULL) {
9763
0
        *features = CONNTRACK_F_ZERO_SNAT;
9764
0
    }
9765
0
    return 0;
9766
0
}
9767
9768
static int
9769
dpif_netdev_ct_set_timeout_policy(struct dpif *dpif,
9770
                                  const struct ct_dpif_timeout_policy *dpif_tp)
9771
0
{
9772
0
    struct timeout_policy tp;
9773
0
    struct dp_netdev *dp;
9774
9775
0
    dp = get_dp_netdev(dpif);
9776
0
    memcpy(&tp.policy, dpif_tp, sizeof tp.policy);
9777
0
    return timeout_policy_update(dp->conntrack, &tp);
9778
0
}
9779
9780
static int
9781
dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id,
9782
                                  struct ct_dpif_timeout_policy *dpif_tp)
9783
0
{
9784
0
    struct timeout_policy *tp;
9785
0
    struct dp_netdev *dp;
9786
0
    int err = 0;
9787
9788
0
    dp = get_dp_netdev(dpif);
9789
0
    tp = timeout_policy_get(dp->conntrack, tp_id);
9790
0
    if (!tp) {
9791
0
        return ENOENT;
9792
0
    }
9793
0
    memcpy(dpif_tp, &tp->policy, sizeof tp->policy);
9794
0
    return err;
9795
0
}
9796
9797
static int
9798
dpif_netdev_ct_del_timeout_policy(struct dpif *dpif,
9799
                                  uint32_t tp_id)
9800
0
{
9801
0
    struct dp_netdev *dp;
9802
0
    int err = 0;
9803
9804
0
    dp = get_dp_netdev(dpif);
9805
0
    err = timeout_policy_delete(dp->conntrack, tp_id);
9806
0
    return err;
9807
0
}
9808
9809
static int
9810
dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED,
9811
                                       uint32_t tp_id,
9812
                                       uint16_t dl_type OVS_UNUSED,
9813
                                       uint8_t nw_proto OVS_UNUSED,
9814
                                       char **tp_name, bool *is_generic)
9815
0
{
9816
0
    struct ds ds = DS_EMPTY_INITIALIZER;
9817
9818
0
    ds_put_format(&ds, "%"PRIu32, tp_id);
9819
0
    *tp_name = ds_steal_cstr(&ds);
9820
0
    *is_generic = true;
9821
0
    return 0;
9822
0
}
9823
9824
static int
9825
dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
9826
0
{
9827
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9828
0
    return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
9829
0
}
9830
9831
static int
9832
dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
9833
0
{
9834
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9835
0
    return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
9836
0
}
9837
9838
static int
9839
dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
9840
0
{
9841
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9842
0
    return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
9843
0
}
9844
9845
/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
9846
 * diverge. */
9847
static int
9848
dpif_netdev_ipf_get_status(struct dpif *dpif,
9849
                           struct dpif_ipf_status *dpif_ipf_status)
9850
0
{
9851
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9852
0
    ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
9853
0
                   (struct ipf_status *) dpif_ipf_status);
9854
0
    return 0;
9855
0
}
9856
9857
static int
9858
dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
9859
                           struct ipf_dump_ctx **ipf_dump_ctx)
9860
0
{
9861
0
    return ipf_dump_start(ipf_dump_ctx);
9862
0
}
9863
9864
static int
9865
dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
9866
0
{
9867
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9868
0
    return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
9869
0
                         dump);
9870
0
}
9871
9872
static int
9873
dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
9874
0
{
9875
0
    return ipf_dump_done(ipf_dump_ctx);
9876
9877
0
}
9878
9879
static int
9880
dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
9881
                     odp_port_t *member_map)
9882
0
{
9883
0
    struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
9884
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9885
0
    struct dp_netdev_pmd_thread *pmd;
9886
9887
    /* Prepare new bond mapping. */
9888
0
    new_tx->bond_id = bond_id;
9889
0
    for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
9890
0
        new_tx->member_buckets[bucket].member_id = member_map[bucket];
9891
0
    }
9892
9893
0
    ovs_mutex_lock(&dp->bond_mutex);
9894
    /* Check if the bond already exists. */
9895
0
    struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
9896
0
    if (old_tx) {
9897
0
        cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
9898
0
                     hash_bond_id(bond_id));
9899
0
        ovsrcu_postpone(free, old_tx);
9900
0
    } else {
9901
0
        cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
9902
0
    }
9903
0
    ovs_mutex_unlock(&dp->bond_mutex);
9904
9905
    /* Update all PMDs with new bond mapping. */
9906
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
9907
0
        dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
9908
0
    }
9909
0
    return 0;
9910
0
}
9911
9912
static int
9913
dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
9914
0
{
9915
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9916
0
    struct dp_netdev_pmd_thread *pmd;
9917
0
    struct tx_bond *tx;
9918
9919
0
    ovs_mutex_lock(&dp->bond_mutex);
9920
    /* Check if the bond exists. */
9921
0
    tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
9922
0
    if (tx) {
9923
0
        cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
9924
0
        ovsrcu_postpone(free, tx);
9925
0
    } else {
9926
        /* Bond is not present. */
9927
0
        ovs_mutex_unlock(&dp->bond_mutex);
9928
0
        return ENOENT;
9929
0
    }
9930
0
    ovs_mutex_unlock(&dp->bond_mutex);
9931
9932
    /* Remove the bond map from all PMDs. */
9933
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
9934
0
        dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
9935
0
    }
9936
0
    return 0;
9937
0
}
9938
9939
static int
9940
dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
9941
                           uint64_t *n_bytes)
9942
0
{
9943
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9944
0
    struct dp_netdev_pmd_thread *pmd;
9945
9946
0
    if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
9947
0
        return ENOENT;
9948
0
    }
9949
9950
    /* Search for the bond in all PMDs. */
9951
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
9952
0
        struct tx_bond *pmd_bond_entry
9953
0
            = tx_bond_lookup(&pmd->tx_bonds, bond_id);
9954
9955
0
        if (!pmd_bond_entry) {
9956
0
            continue;
9957
0
        }
9958
9959
        /* Read bond stats. */
9960
0
        for (int i = 0; i < BOND_BUCKETS; i++) {
9961
0
            uint64_t pmd_n_bytes;
9962
9963
0
            atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes,
9964
0
                                &pmd_n_bytes);
9965
0
            n_bytes[i] += pmd_n_bytes;
9966
0
        }
9967
0
    }
9968
0
    return 0;
9969
0
}
9970
9971
const struct dpif_class dpif_netdev_class = {
9972
    "netdev",
9973
    true,                       /* cleanup_required */
9974
    true,                       /* synced_dp_layers */
9975
    dpif_netdev_init,
9976
    dpif_netdev_enumerate,
9977
    dpif_netdev_port_open_type,
9978
    dpif_netdev_open,
9979
    dpif_netdev_close,
9980
    dpif_netdev_destroy,
9981
    dpif_netdev_run,
9982
    dpif_netdev_wait,
9983
    dpif_netdev_get_stats,
9984
    NULL,                      /* set_features */
9985
    dpif_netdev_port_add,
9986
    dpif_netdev_port_del,
9987
    dpif_netdev_port_set_config,
9988
    dpif_netdev_port_query_by_number,
9989
    dpif_netdev_port_query_by_name,
9990
    NULL,                       /* port_get_pid */
9991
    dpif_netdev_port_dump_start,
9992
    dpif_netdev_port_dump_next,
9993
    dpif_netdev_port_dump_done,
9994
    dpif_netdev_port_poll,
9995
    dpif_netdev_port_poll_wait,
9996
    dpif_netdev_flow_flush,
9997
    dpif_netdev_flow_dump_create,
9998
    dpif_netdev_flow_dump_destroy,
9999
    dpif_netdev_flow_dump_thread_create,
10000
    dpif_netdev_flow_dump_thread_destroy,
10001
    dpif_netdev_flow_dump_next,
10002
    dpif_netdev_operate,
10003
    dpif_netdev_offload_stats_get,
10004
    NULL,                       /* recv_set */
10005
    NULL,                       /* handlers_set */
10006
    dpif_netdev_number_handlers_required,
10007
    dpif_netdev_set_config,
10008
    dpif_netdev_queue_to_priority,
10009
    NULL,                       /* recv */
10010
    NULL,                       /* recv_wait */
10011
    NULL,                       /* recv_purge */
10012
    dpif_netdev_register_dp_purge_cb,
10013
    dpif_netdev_register_upcall_cb,
10014
    dpif_netdev_enable_upcall,
10015
    dpif_netdev_disable_upcall,
10016
    dpif_netdev_get_datapath_version,
10017
    dpif_netdev_ct_dump_start,
10018
    dpif_netdev_ct_dump_next,
10019
    dpif_netdev_ct_dump_done,
10020
    dpif_netdev_ct_exp_dump_start,
10021
    dpif_netdev_ct_exp_dump_next,
10022
    dpif_netdev_ct_exp_dump_done,
10023
    dpif_netdev_ct_flush,
10024
    dpif_netdev_ct_set_maxconns,
10025
    dpif_netdev_ct_get_maxconns,
10026
    dpif_netdev_ct_get_nconns,
10027
    dpif_netdev_ct_set_tcp_seq_chk,
10028
    dpif_netdev_ct_get_tcp_seq_chk,
10029
    dpif_netdev_ct_set_sweep_interval,
10030
    dpif_netdev_ct_get_sweep_interval,
10031
    dpif_netdev_ct_set_limits,
10032
    dpif_netdev_ct_get_limits,
10033
    dpif_netdev_ct_del_limits,
10034
    dpif_netdev_ct_set_timeout_policy,
10035
    dpif_netdev_ct_get_timeout_policy,
10036
    dpif_netdev_ct_del_timeout_policy,
10037
    NULL,                       /* ct_timeout_policy_dump_start */
10038
    NULL,                       /* ct_timeout_policy_dump_next */
10039
    NULL,                       /* ct_timeout_policy_dump_done */
10040
    dpif_netdev_ct_get_timeout_policy_name,
10041
    dpif_netdev_ct_get_features,
10042
    dpif_netdev_ipf_set_enabled,
10043
    dpif_netdev_ipf_set_min_frag,
10044
    dpif_netdev_ipf_set_max_nfrags,
10045
    dpif_netdev_ipf_get_status,
10046
    dpif_netdev_ipf_dump_start,
10047
    dpif_netdev_ipf_dump_next,
10048
    dpif_netdev_ipf_dump_done,
10049
    dpif_netdev_meter_get_features,
10050
    dpif_netdev_meter_set,
10051
    dpif_netdev_meter_get,
10052
    dpif_netdev_meter_del,
10053
    dpif_netdev_bond_add,
10054
    dpif_netdev_bond_del,
10055
    dpif_netdev_bond_stats_get,
10056
    NULL,                       /* cache_get_supported_levels */
10057
    NULL,                       /* cache_get_name */
10058
    NULL,                       /* cache_get_size */
10059
    NULL,                       /* cache_set_size */
10060
};
10061
10062
static void
10063
dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
10064
                              const char *argv[], void *aux OVS_UNUSED)
10065
0
{
10066
0
    struct dp_netdev_port *port;
10067
0
    struct dp_netdev *dp;
10068
0
    odp_port_t port_no;
10069
10070
0
    ovs_mutex_lock(&dp_netdev_mutex);
10071
0
    dp = shash_find_data(&dp_netdevs, argv[1]);
10072
0
    if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
10073
0
        ovs_mutex_unlock(&dp_netdev_mutex);
10074
0
        unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
10075
0
        return;
10076
0
    }
10077
0
    ovs_refcount_ref(&dp->ref_cnt);
10078
0
    ovs_mutex_unlock(&dp_netdev_mutex);
10079
10080
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
10081
0
    if (get_port_by_name(dp, argv[2], &port)) {
10082
0
        unixctl_command_reply_error(conn, "unknown port");
10083
0
        goto exit;
10084
0
    }
10085
10086
0
    port_no = u32_to_odp(atoi(argv[3]));
10087
0
    if (!port_no || port_no == ODPP_NONE) {
10088
0
        unixctl_command_reply_error(conn, "bad port number");
10089
0
        goto exit;
10090
0
    }
10091
0
    if (dp_netdev_lookup_port(dp, port_no)) {
10092
0
        unixctl_command_reply_error(conn, "port number already in use");
10093
0
        goto exit;
10094
0
    }
10095
10096
    /* Remove port. */
10097
0
    hmap_remove(&dp->ports, &port->node);
10098
0
    reconfigure_datapath(dp);
10099
10100
    /* Reinsert with new port number. */
10101
0
    port->port_no = port_no;
10102
0
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
10103
0
    reconfigure_datapath(dp);
10104
10105
0
    seq_change(dp->port_seq);
10106
0
    unixctl_command_reply(conn, NULL);
10107
10108
0
exit:
10109
0
    ovs_rwlock_unlock(&dp->port_rwlock);
10110
0
    dp_netdev_unref(dp);
10111
0
}
10112
10113
static void
10114
dpif_dummy_register__(const char *type)
10115
0
{
10116
0
    struct dpif_class *class;
10117
10118
0
    class = xmalloc(sizeof *class);
10119
0
    *class = dpif_netdev_class;
10120
0
    class->type = xstrdup(type);
10121
0
    dp_register_provider(class);
10122
0
}
10123
10124
static void
10125
dpif_dummy_override(const char *type)
10126
0
{
10127
0
    int error;
10128
10129
    /*
10130
     * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
10131
     * a userland-only build.  It's useful for the testsuite.
10132
     */
10133
0
    error = dp_unregister_provider(type);
10134
0
    if (error == 0 || error == EAFNOSUPPORT) {
10135
0
        dpif_dummy_register__(type);
10136
0
    }
10137
0
}
10138
10139
void
10140
dpif_dummy_register(enum dummy_level level)
10141
0
{
10142
0
    if (level == DUMMY_OVERRIDE_ALL) {
10143
0
        struct sset types;
10144
0
        const char *type;
10145
10146
0
        sset_init(&types);
10147
0
        dp_enumerate_types(&types);
10148
0
        SSET_FOR_EACH (type, &types) {
10149
0
            dpif_dummy_override(type);
10150
0
        }
10151
0
        sset_destroy(&types);
10152
0
    } else if (level == DUMMY_OVERRIDE_SYSTEM) {
10153
0
        dpif_dummy_override("system");
10154
0
    }
10155
10156
0
    dpif_dummy_register__("dummy");
10157
10158
0
    unixctl_command_register("dpif-dummy/change-port-number",
10159
0
                             "dp port new-number",
10160
0
                             3, 3, dpif_dummy_change_port_number, NULL);
10161
0
}
10162

10163
/* Datapath Classifier. */
10164
10165
static void
10166
dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
10167
0
{
10168
0
    cmap_destroy(&subtable->rules);
10169
0
    ovsrcu_postpone(free, subtable->mf_masks);
10170
0
    ovsrcu_postpone(free, subtable);
10171
0
}
10172
10173
/* Initializes 'cls' as a classifier that initially contains no classification
10174
 * rules. */
10175
static void
10176
dpcls_init(struct dpcls *cls)
10177
0
{
10178
0
    cmap_init(&cls->subtables_map);
10179
0
    pvector_init(&cls->subtables);
10180
0
}
10181
10182
static void
10183
dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
10184
0
{
10185
0
    VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
10186
0
    pvector_remove(&cls->subtables, subtable);
10187
0
    cmap_remove(&cls->subtables_map, &subtable->cmap_node,
10188
0
                subtable->mask.hash);
10189
0
    dpcls_info_dec_usage(subtable->lookup_func_info);
10190
0
    ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
10191
0
}
10192
10193
/* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
10194
 * caller's responsibility.
10195
 * May only be called after all the readers have been terminated. */
10196
static void
10197
dpcls_destroy(struct dpcls *cls)
10198
0
{
10199
0
    if (cls) {
10200
0
        struct dpcls_subtable *subtable;
10201
10202
0
        CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
10203
0
            ovs_assert(cmap_count(&subtable->rules) == 0);
10204
0
            dpcls_destroy_subtable(cls, subtable);
10205
0
        }
10206
0
        cmap_destroy(&cls->subtables_map);
10207
0
        pvector_destroy(&cls->subtables);
10208
0
    }
10209
0
}
10210
10211
static struct dpcls_subtable *
10212
dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
10213
0
{
10214
0
    struct dpcls_subtable *subtable;
10215
10216
    /* Need to add one. */
10217
0
    subtable = xmalloc(sizeof *subtable
10218
0
                       - sizeof subtable->mask.mf + mask->len);
10219
0
    cmap_init(&subtable->rules);
10220
0
    subtable->hit_cnt = 0;
10221
0
    netdev_flow_key_clone(&subtable->mask, mask);
10222
10223
    /* The count of bits in the mask defines the space required for masks.
10224
     * Then call gen_masks() to create the appropriate masks, avoiding the cost
10225
     * of doing runtime calculations. */
10226
0
    uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
10227
0
    uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
10228
0
    subtable->mf_bits_set_unit0 = unit0;
10229
0
    subtable->mf_bits_set_unit1 = unit1;
10230
0
    subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
10231
0
    dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
10232
10233
    /* Get the preferred subtable search function for this (u0,u1) subtable.
10234
     * The function is guaranteed to always return a valid implementation, and
10235
     * possibly an ISA-optimized and/or specialized implementation. Initialize
10236
     * the subtable search function atomically to avoid garbage data being read
10237
     * by the PMD thread.
10238
     */
10239
0
    atomic_init(&subtable->lookup_func,
10240
0
                dpcls_subtable_get_best_impl(unit0, unit1,
10241
0
                                             &subtable->lookup_func_info));
10242
0
    dpcls_info_inc_usage(subtable->lookup_func_info);
10243
10244
0
    cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
10245
    /* Add the new subtable at the end of the pvector (with no hits yet) */
10246
0
    pvector_insert(&cls->subtables, subtable, 0);
10247
0
    VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
10248
0
             cmap_count(&cls->subtables_map), subtable, cls->in_port);
10249
0
    pvector_publish(&cls->subtables);
10250
10251
0
    return subtable;
10252
0
}
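dpcls_create_subtable() sizes the per-subtable mf_masks array from the number of bits set in the two miniflow map units (unit0 and unit1). The sketch below reproduces only that sizing arithmetic, using the GCC/Clang __builtin_popcountll() builtin in place of count_1bits(); the toy_ names are illustrative.

#include <inttypes.h>
#include <stdio.h>

/* Count the bits set in the two 64-bit miniflow map units; the sum is the
 * number of mask words the subtable needs to allocate. */
static unsigned int
toy_mask_words(uint64_t map_unit0, uint64_t map_unit1)
{
    return __builtin_popcountll(map_unit0) + __builtin_popcountll(map_unit1);
}

int
main(void)
{
    uint64_t unit0 = 0x0000000000000f01ULL;   /* 5 bits set. */
    uint64_t unit1 = 0x8000000000000001ULL;   /* 2 bits set. */

    printf("mask words needed: %u\n", toy_mask_words(unit0, unit1)); /* 7 */
    return 0;
}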
10253
10254
static inline struct dpcls_subtable *
10255
dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
10256
0
{
10257
0
    struct dpcls_subtable *subtable;
10258
10259
0
    CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
10260
0
                             &cls->subtables_map) {
10261
0
        if (netdev_flow_key_equal(&subtable->mask, mask)) {
10262
0
            return subtable;
10263
0
        }
10264
0
    }
10265
0
    return dpcls_create_subtable(cls, mask);
10266
0
}
10267
10268
/* Checks for the best available implementation for each subtable lookup
10269
 * function, and assigns it as the lookup function pointer for each subtable.
10270
 * Returns the number of subtables whose lookup implementation changed.
10271
 * This function must be called with the pmd's flow_mutex held, to make
10272
 * sure that the modifications it makes are not overwritten.  That could
10273
 * happen if dpcls_sort_subtable_vector() ran at the same time as this
10274
 * function.
10275
 */
10276
static uint32_t
10277
dpcls_subtable_lookup_reprobe(struct dpcls *cls)
10278
0
{
10279
0
    struct pvector *pvec = &cls->subtables;
10280
0
    uint32_t subtables_changed = 0;
10281
0
    struct dpcls_subtable *subtable = NULL;
10282
10283
0
    PVECTOR_FOR_EACH (subtable, pvec) {
10284
0
        uint32_t u0_bits = subtable->mf_bits_set_unit0;
10285
0
        uint32_t u1_bits = subtable->mf_bits_set_unit1;
10286
0
        void *old_func = subtable->lookup_func;
10287
0
        struct dpcls_subtable_lookup_info_t *old_info;
10288
0
        old_info = subtable->lookup_func_info;
10289
        /* Set the subtable lookup function atomically to avoid garbage data
10290
         * being read by the PMD thread. */
10291
0
        atomic_store_relaxed(&subtable->lookup_func,
10292
0
                dpcls_subtable_get_best_impl(u0_bits, u1_bits,
10293
0
                                             &subtable->lookup_func_info));
10294
0
        if (old_func != subtable->lookup_func) {
10295
0
            subtables_changed += 1;
10296
0
        }
10297
10298
0
        if (old_info != subtable->lookup_func_info) {
10299
            /* In theory, functions can be shared between implementations, so
10300
             * do an explicit check on the function info structures. */
10301
0
            dpcls_info_dec_usage(old_info);
10302
0
            dpcls_info_inc_usage(subtable->lookup_func_info);
10303
0
        }
10304
0
    }
10305
10306
0
    return subtables_changed;
10307
0
}
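
/* Illustrative aside: a standalone sketch of the usage-count handoff done in
 * dpcls_subtable_lookup_reprobe() when a subtable switches lookup
 * implementations: the counters move only when the old and new info actually
 * differ.  'toy_impl_info' and swap_impl() are illustrative stand-ins, not
 * the dpif-netdev-lookup API. */
#include <assert.h>
#include <stdio.h>

struct toy_impl_info {
    const char *name;
    unsigned int usage;                 /* Number of subtables using this impl. */
};

static void
swap_impl(struct toy_impl_info **slot, struct toy_impl_info *new_info)
{
    struct toy_impl_info *old_info = *slot;

    *slot = new_info;
    if (old_info != new_info) {
        new_info->usage++;
        assert(old_info->usage > 0);
        old_info->usage--;
    }
}

int
main(void)
{
    struct toy_impl_info generic = { "generic", 1 };
    struct toy_impl_info avx512 = { "avx512", 0 };
    struct toy_impl_info *subtable_info = &generic;

    swap_impl(&subtable_info, &avx512); /* Reprobe picked a better impl. */
    swap_impl(&subtable_info, &avx512); /* No change: counts stay put. */
    printf("%s=%u %s=%u\n", generic.name, generic.usage,
           avx512.name, avx512.usage);  /* Prints "generic=0 avx512=1". */
    return 0;
}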
10308
10309
/* Periodically sort the dpcls subtable vectors according to hit counts */
10310
static void
10311
dpcls_sort_subtable_vector(struct dpcls *cls)
10312
0
{
10313
0
    struct pvector *pvec = &cls->subtables;
10314
0
    struct dpcls_subtable *subtable;
10315
10316
0
    PVECTOR_FOR_EACH (subtable, pvec) {
10317
0
        pvector_change_priority(pvec, subtable, subtable->hit_cnt);
10318
0
        subtable->hit_cnt = 0;
10319
0
    }
10320
0
    pvector_publish(pvec);
10321
0
}
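
/* Illustrative aside: a standalone sketch of the periodic re-ranking above.
 * Subtables with more hits in the last interval move to the front so that
 * dpcls_lookup() tries the most likely subtable first, and the hit counters
 * are reset for the next interval.  Plain qsort() stands in for the priority
 * vector (pvector); the types and names here are illustrative. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_subtable {
    const char *name;
    uint64_t hit_cnt;
};

static int
compare_hits_desc(const void *a_, const void *b_)
{
    const struct toy_subtable *a = a_;
    const struct toy_subtable *b = b_;

    return (a->hit_cnt < b->hit_cnt) - (a->hit_cnt > b->hit_cnt);
}

int
main(void)
{
    struct toy_subtable subtables[] = {
        { "eth+ipv4", 10 }, { "eth+ipv6", 900 }, { "eth only", 40 },
    };
    size_t n = sizeof subtables / sizeof subtables[0];

    qsort(subtables, n, sizeof subtables[0], compare_hits_desc);
    for (size_t i = 0; i < n; i++) {
        printf("rank %zu: %s\n", i, subtables[i].name);
        subtables[i].hit_cnt = 0;       /* Start a fresh interval. */
    }
    return 0;
}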
10322
10323
static inline void
10324
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
10325
                           struct polled_queue *poll_list, int poll_cnt)
10326
0
{
10327
0
    struct dpcls *cls;
10328
0
    uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0;
10329
0
    unsigned int pmd_load = 0;
10330
10331
0
    if (pmd->ctx.now > pmd->next_cycle_store) {
10332
0
        uint64_t curr_tsc;
10333
0
        uint8_t rebalance_load_trigger;
10334
0
        struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
10335
0
        unsigned int idx;
10336
10337
0
        if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
10338
0
                pmd->prev_stats[PMD_CYCLES_ITER_IDLE] &&
10339
0
            pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
10340
0
                pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) {
10341
0
            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
10342
0
                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
10343
0
            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
10344
0
                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
10345
0
            tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] -
10346
0
                        pmd->prev_stats[PMD_CYCLES_SLEEP];
10347
10348
0
            if (pmd_alb->is_enabled && !pmd->isolated) {
10349
0
                if (tot_proc) {
10350
0
                    pmd_load = ((tot_proc * 100) /
10351
0
                                    (tot_idle + tot_proc + tot_sleep));
10352
0
                }
10353
10354
0
                atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
10355
0
                                    &rebalance_load_trigger);
10356
0
                if (pmd_load >= rebalance_load_trigger) {
10357
0
                    atomic_count_inc(&pmd->pmd_overloaded);
10358
0
                } else {
10359
0
                    atomic_count_set(&pmd->pmd_overloaded, 0);
10360
0
                }
10361
0
            }
10362
0
        }
10363
10364
0
        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
10365
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
10366
0
        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
10367
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
10368
0
        pmd->prev_stats[PMD_CYCLES_SLEEP] =
10369
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP];
10370
10371
        /* Get the cycles that were used to process each queue and store. */
10372
0
        for (unsigned i = 0; i < poll_cnt; i++) {
10373
0
            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
10374
0
                                                        RXQ_CYCLES_PROC_CURR);
10375
0
            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
10376
0
            dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
10377
0
                                     0);
10378
0
        }
10379
0
        curr_tsc = cycles_counter_update(&pmd->perf_stats);
10380
0
        if (pmd->intrvl_tsc_prev) {
10381
            /* There is a prev timestamp, store a new intrvl cycle count. */
10382
0
            atomic_store_relaxed(&pmd->intrvl_cycles,
10383
0
                                 curr_tsc - pmd->intrvl_tsc_prev);
10384
0
        }
10385
0
        idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX;
10386
0
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc);
10387
0
        pmd->intrvl_tsc_prev = curr_tsc;
10388
        /* Start new measuring interval */
10389
0
        pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
10390
0
    }
10391
10392
0
    if (pmd->ctx.now > pmd->next_optimization) {
10393
        /* Try to obtain the flow lock to block out revalidator threads.
10394
         * If not possible, just try next time. */
10395
0
        if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
10396
            /* Optimize each classifier */
10397
0
            CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
10398
0
                dpcls_sort_subtable_vector(cls);
10399
0
            }
10400
0
            ovs_mutex_unlock(&pmd->flow_mutex);
10401
            /* Start new measuring interval */
10402
0
            pmd->next_optimization = pmd->ctx.now
10403
0
                                     + DPCLS_OPTIMIZATION_INTERVAL;
10404
0
        }
10405
0
    }
10406
0
}
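
/* Illustrative aside: the auto load balance trigger above reduces to one
 * ratio, pmd_load = busy * 100 / (idle + busy + sleep), compared against the
 * configured rebalance load threshold.  A minimal standalone sketch with
 * made-up cycle counts follows; pmd_load_percent() is illustrative, not an
 * OVS function. */
#include <stdint.h>
#include <stdio.h>

static unsigned int
pmd_load_percent(uint64_t tot_idle, uint64_t tot_proc, uint64_t tot_sleep)
{
    uint64_t total = tot_idle + tot_proc + tot_sleep;

    /* Mirror the guard above: no busy cycles (or an empty interval) means
     * zero load. */
    return total && tot_proc ? (unsigned int) (tot_proc * 100 / total) : 0;
}

int
main(void)
{
    /* E.g. 60M busy cycles out of 100M total in the interval -> 60% load. */
    printf("pmd load: %u%%\n",
           pmd_load_percent(30000000, 60000000, 10000000));
    return 0;
}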
10407
10408
/* Returns the sum of the 'num_to_read' most recent interval values, newest
10409
 * to oldest.  'cur_idx' is where the next write will land, so the walk
10410
 * steps backwards and wraps around the circular buffer.
10411
 */
10412
static uint64_t
10413
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
10414
0
                    int num_to_read) {
10415
0
    unsigned int i;
10416
0
    uint64_t total = 0;
10417
10418
0
    i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX;
10419
0
    for (int read = 0; read < num_to_read; read++) {
10420
0
        uint64_t interval_value;
10421
10422
0
        i = i ? i - 1 : PMD_INTERVAL_MAX - 1;
10423
0
        atomic_read_relaxed(&source[i], &interval_value);
10424
0
        total += interval_value;
10425
0
    }
10426
0
    return total;
10427
0
}
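
/* Illustrative aside: a standalone sketch of the wrap-around walk in
 * get_interval_values(), using a plain array instead of atomics.  'cur_idx'
 * is where the *next* write would land, so the newest stored value sits one
 * slot behind it, and stepping backwards wraps from slot 0 to the last slot.
 * TOY_INTERVAL_MAX and the sample values are illustrative. */
#include <stdint.h>
#include <stdio.h>

#define TOY_INTERVAL_MAX 6

static uint64_t
sum_newest(const uint64_t *vals, unsigned int cur_idx, int num_to_read)
{
    unsigned int i = cur_idx % TOY_INTERVAL_MAX;
    uint64_t total = 0;

    for (int read = 0; read < num_to_read; read++) {
        i = i ? i - 1 : TOY_INTERVAL_MAX - 1;   /* Step back, wrapping. */
        total += vals[i];
    }
    return total;
}

int
main(void)
{
    /* Writes so far landed in slots 0..3; the next write goes to slot 4. */
    uint64_t vals[TOY_INTERVAL_MAX] = { 10, 20, 30, 40, 0, 0 };

    /* Sum of the two newest intervals: slots 3 and 2 -> 40 + 30 = 70. */
    printf("%llu\n", (unsigned long long) sum_newest(vals, 4, 2));
    return 0;
}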
10428
10429
/* Insert 'rule' into 'cls'. */
10430
static void
10431
dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
10432
             const struct netdev_flow_key *mask)
10433
0
{
10434
0
    struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
10435
10436
    /* Keep a reference to the subtable's mask; it is also used on removal. */
10437
0
    rule->mask = &subtable->mask;
10438
0
    cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
10439
0
}
10440
10441
/* Removes 'rule' from 'cls', also destructing the 'rule'. */
10442
static void
10443
dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
10444
0
{
10445
0
    struct dpcls_subtable *subtable;
10446
10447
0
    ovs_assert(rule->mask);
10448
10449
    /* Get subtable from reference in rule->mask. */
10450
0
    INIT_CONTAINER(subtable, rule->mask, mask);
10451
0
    if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
10452
0
        == 0) {
10453
        /* Delete empty subtable. */
10454
0
        dpcls_destroy_subtable(cls, subtable);
10455
0
        pvector_publish(&cls->subtables);
10456
0
    }
10457
0
}
10458
10459
/* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks. */
10460
static inline void
10461
dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count,
10462
                             uint64_t *mf_masks)
10463
0
{
10464
0
    int i;
10465
0
    for (i = 0; i < count; i++) {
10466
0
        uint64_t lowest_bit = (iter & -iter);
10467
0
        iter &= ~lowest_bit;
10468
0
        mf_masks[i] = (lowest_bit - 1);
10469
0
    }
10470
    /* Checks that count has covered all bits in the iter bitmap. */
10471
0
    ovs_assert(iter == 0);
10472
0
}
10473
10474
/* Generate a mask for each block in the miniflow, based on the bits set. This
10475
 * allows packets to be masked with the pre-generated array, replacing the
10476
 * equivalent calculation that would otherwise be done at lookup time.
10477
 * @param tbl The netdev_flow_key to generate the mf_masks for
10478
 * @param mf_masks Array of at least (mf_bits_u0 + mf_bits_u1) u64 elements
10479
 * @param mf_bits_u0 Number of bits set in unit0 of the miniflow map
10480
 * @param mf_bits_u1 Number of bits set in unit1 of the miniflow map
10481
 */
10482
void
10483
dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
10484
                         uint64_t *mf_masks,
10485
                         const uint32_t mf_bits_u0,
10486
                         const uint32_t mf_bits_u1)
10487
0
{
10488
0
    uint64_t iter_u0 = tbl->mf.map.bits[0];
10489
0
    uint64_t iter_u1 = tbl->mf.map.bits[1];
10490
10491
0
    dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
10492
0
    dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
10493
0
}
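
/* Illustrative aside: a standalone worked example of the bit trick used in
 * dpcls_flow_key_gen_mask_unit().  Each generated mask has all bits below
 * one set bit of the map, which (roughly speaking) lets a lookup compute,
 * with one AND and one popcount, how many miniflow blocks precede that
 * field.  For map = 0b10100 the generated masks are 0b00011 and 0b01111. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t map = 0x14;                    /* 0b10100: two blocks present. */
    uint64_t masks[2];
    int n = 0;

    while (map) {
        uint64_t lowest_bit = map & -map;   /* Isolate the lowest set bit. */

        map &= ~lowest_bit;                 /* Clear it. */
        masks[n++] = lowest_bit - 1;        /* All lower positions set. */
    }
    for (int i = 0; i < n; i++) {
        printf("mask[%d] = %#" PRIx64 "\n", i, masks[i]);
    }
    return 0;                               /* Prints 0x3 and 0xf. */
}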
10494
10495
/* Returns true if 'target' satisfies 'key' in 'mask', that is, if for each
10496
 * 1-bit in 'mask' the values in 'key' and 'target' are the same. */
10497
inline bool
10498
dpcls_rule_matches_key(const struct dpcls_rule *rule,
10499
                       const struct netdev_flow_key *target)
10500
0
{
10501
0
    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
10502
0
    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
10503
0
    uint64_t value;
10504
10505
0
    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
10506
0
        if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
10507
0
            return false;
10508
0
        }
10509
0
    }
10510
0
    return true;
10511
0
}
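
/* Illustrative aside: a standalone sketch of the masked comparison above.
 * A rule's key blocks are stored already ANDed with the rule's mask when the
 * flow is inserted, so matching reduces to one "(packet & mask) == rule"
 * test per 64-bit block.  The values below are made up. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
blocks_match(const uint64_t *rule, const uint64_t *mask,
             const uint64_t *packet, int n_blocks)
{
    for (int i = 0; i < n_blocks; i++) {
        if ((packet[i] & mask[i]) != rule[i]) {
            return false;
        }
    }
    return true;
}

int
main(void)
{
    /* One block, matching on the low 32 bits only. */
    uint64_t mask[]   = { 0x00000000ffffffffULL };
    uint64_t rule[]   = { 0x000000000a000001ULL }; /* Pre-masked value. */
    uint64_t packet[] = { 0xdeadbeef0a000001ULL }; /* High bits ignored. */

    printf("%s\n", blocks_match(rule, mask, packet, 1) ? "match" : "miss");
    return 0;
}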
10512
10513
/* For each miniflow in 'keys' performs a classifier lookup writing the result
10514
 * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
10515
 * NULL it is skipped.
10516
 *
10517
 * This function is optimized for use in the userspace datapath and therefore
10518
 * does not implement a lot of features available in the standard
10519
 * classifier_lookup() function.  Specifically, it does not implement
10520
 * priorities, instead returning any rule which matches the flow.
10521
 *
10522
 * Returns true if all miniflows found a corresponding rule. */
10523
bool
10524
dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
10525
             struct dpcls_rule **rules, const size_t cnt,
10526
             int *num_lookups_p)
10527
0
{
10528
    /* The received 'cnt' miniflows are the search-keys that will be matched
10529
     * against the available subtables.
10530
     * MAP_BITS, the width of 'keys_map', must be at least NETDEV_MAX_BURST. */
10531
0
#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
10532
0
    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
10533
10534
0
    struct dpcls_subtable *subtable;
10535
0
    uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
10536
10537
0
    if (cnt != MAP_BITS) {
10538
0
        keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
10539
0
    }
10540
0
    memset(rules, 0, cnt * sizeof *rules);
10541
10542
0
    int lookups_match = 0, subtable_pos = 1;
10543
0
    uint32_t found_map;
10544
10545
    /* The Datapath classifier - aka dpcls - is composed of subtables.
10546
     * Subtables are dynamically created as needed when new rules are inserted.
10547
     * Each subtable collects rules with matches on a specific subset of packet
10548
     * fields as defined by the subtable's mask.  We proceed to process every
10549
     * search-key against each subtable, but when a match is found for a
10550
     * search-key, the search for that key can stop because the rules are
10551
     * non-overlapping. */
10552
0
    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
10553
        /* Call the subtable specific lookup function. */
10554
0
        found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
10555
10556
        /* Count the number of subtables searched for this packet match. This
10557
         * estimates the "spread" of subtables looked at per matched packet. */
10558
0
        uint32_t pkts_matched = count_1bits(found_map);
10559
0
        lookups_match += pkts_matched * subtable_pos;
10560
10561
        /* Clear the keys that found a rule; return early if all are matched. */
10562
0
        keys_map &= ~found_map;
10563
0
        if (!keys_map) {
10564
0
            if (num_lookups_p) {
10565
0
                *num_lookups_p = lookups_match;
10566
0
            }
10567
0
            return true;
10568
0
        }
10569
0
        subtable_pos++;
10570
0
    }
10571
10572
0
    if (num_lookups_p) {
10573
0
        *num_lookups_p = lookups_match;
10574
0
    }
10575
0
    return false;
10576
0
}
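
/* Illustrative aside: a standalone sketch of the batching bitmap used by
 * dpcls_lookup().  Bit i of 'keys_map' means "key i still needs a match";
 * each subtable returns a bitmap of the keys it matched, those bits are
 * cleared, and the loop stops as soon as nothing is pending.  The toy
 * subtable here matches fixed key indexes and is purely illustrative. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_subtable {
    uint32_t matches;           /* Bitmap of key indexes this table matches. */
};

static bool
toy_lookup(const struct toy_subtable *subtables, int n_subtables,
           uint32_t keys_map)
{
    for (int i = 0; i < n_subtables; i++) {
        uint32_t found_map = subtables[i].matches & keys_map;

        keys_map &= ~found_map;         /* Those keys are done. */
        if (!keys_map) {
            return true;                /* Every key found a rule. */
        }
    }
    return false;                       /* Some keys missed entirely. */
}

int
main(void)
{
    struct toy_subtable subtables[] = {
        { 0x5 },                        /* Matches keys 0 and 2. */
        { 0xa },                        /* Matches keys 1 and 3. */
    };

    /* Four pending keys: 0b1111.  The first subtable clears bits 0 and 2,
     * the second clears bits 1 and 3, so the batch resolves after two
     * subtables. */
    printf("%s\n", toy_lookup(subtables, 2, 0xf) ? "all matched" : "miss");
    return 0;
}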